Cuda Template Kernels using atomicAdd

Posted in software by Christopher R. Wirz on Sun May 27 2018



CUDA is NVIDIA's GPU computing library for C/C++. GPU computing uses the Graphics Processing Unit to handle massively parallel runs of small function kernels. Both the GPU memory and GPU processor are separate from the system memory and system processor - but through managed allocation, CUDA makes the memory access patterns similar to those of ordinary system memory.

The CUDA API (Application Programming Interface) is written in C, but has added some C++ features. One of these features is templates.

Note: This post assumes CUDA 9 or higher.

Templates allow you to define a method or object whose functionality can be adapted to more than one type or class without repeating the entire code for each type. Look at the following example (a CUDA kernel written in C++):


#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

template<typename T>
__global__ void cuda_array_sum_kernel(
    const unsigned int size, // The length of the array
    const T* values, // The values of the array
    T* out // The resulting sum
    )
{
    // Grid-stride loop: each thread accumulates a private partial sum of
    // values[index], values[index + stride], ... and then folds it into
    // *out with a single atomic operation per thread.
    // Use unsigned arithmetic so the comparison against the unsigned
    // size parameter is well-defined (no signed/unsigned mismatch).
    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int stride = blockDim.x * gridDim.x;
    T val = 0;
    for (unsigned int i = index; i < size; i += stride) {
        val += values[i];
    }

    // atomicAdd is an overloaded intrinsic, not a template, so this line
    // only compiles for types T that have a matching overload (e.g. the
    // double overload requires compute capability 6.0+).
    atomicAdd(out, val);
}

Since the body of the function is the same for types like int, float or double (any numeric type), it makes more sense to create a template method than define an implementation for each. (You can always add a specific implementation later if you like.) The problem is that atomicAdd is not a templated method - it is an overloaded intrinsic that supports many different numeric types (double is only supported on compute capability 6.0 and higher, but that will be addressed). Since atomicAdd is probably used in many more kernels, we desire some generic approach to handling atomic addition such as the following:

    
    // don't forget to #include "CudaAddAtomicTemplate.h"
    // replace  atomicAdd(out, val); with the following
    CudaAtomicAdd<T> caa;    
    caa.AtomicAdd(out, val);
}

Now it's time to define the CudaAtomicAdd template. Template type definition only works for structs and classes (you can always template a generic method), and since structs are more light-weight, it makes sense to create the method on a C++ struct.


#pragma once
#include "stdafx.h"
#ifndef CUDA_ATOMIC_ADD_TEMPLATE_H
#define CUDA_ATOMIC_ADD_TEMPLATE_H
#include "cuda.h"
#include "cuda_runtime.h"
#include "cuda_runtime_api.h"
#include "device_double_functions.h"
#include "device_functions.h"
#include "device_launch_parameters.h"

// Generic fallback: referencing the undeclared device function error()
// forces a link-time failure, so only the explicit specializations below
// can actually be instantiated and used.
template <typename T>
struct CudaAtomicAdd
{
	__device__ T AtomicAdd(T* ref, T value)
	{
		extern __device__ void error(void);
		error(); // Ensure that we won't compile any un-specialized types
		// Return a value-initialized T rather than NULL: NULL is a null
		// pointer constant and is not convertible to arbitrary (e.g.
		// class) types T.
		return T();
	}
};
// Specialization for int; wraps the int overload of atomicAdd.
template <>
struct CudaAtomicAdd <int>
{
	// Returns the value stored at *ref before the addition.
	// Fixed: the return type must be int to match atomicAdd(int*, int);
	// it was previously declared unsigned int.
	__device__ int AtomicAdd(int* ref, int value)
	{
		return atomicAdd(ref, value);
	}
};
// Specialization for unsigned int; forwards directly to the hardware
// atomicAdd overload.
template <>
struct CudaAtomicAdd <unsigned int>
{
	// Returns the value stored at *ref before the addition.
	__device__ unsigned int AtomicAdd(unsigned int* ref, unsigned int value)
	{
		unsigned int previous = atomicAdd(ref, value);
		return previous;
	}
};
// Specialization for unsigned long long int; wraps the 64-bit overload
// of atomicAdd.
template <>
struct CudaAtomicAdd <unsigned long long int>
{
	// Returns the value stored at *ref before the addition.
	// Fixed: the return type must be unsigned long long int; returning
	// unsigned int (as previously declared) would truncate the 64-bit
	// previous value.
	__device__ unsigned long long int AtomicAdd(unsigned long long int* ref, unsigned long long int value)
	{
		return atomicAdd(ref, value);
	}
};
// Specialization for float; forwards directly to the hardware atomicAdd
// overload.
template <>
struct CudaAtomicAdd <float>
{
	// Returns the value stored at *ref before the addition.
	__device__ float AtomicAdd(float* ref, float value)
	{
		float previous = atomicAdd(ref, value);
		return previous;
	}
};
// Specialization for double. The hardware atomicAdd(double*, double)
// overload only exists on compute capability 6.0+, so on older
// architectures it is emulated with an atomicCAS loop over the 64-bit
// integer representation of the double.
template <>
struct CudaAtomicAdd <double>
{
	// Returns the value stored at *ref before the addition.
	__device__ double AtomicAdd(double* ref, double value)
	{
	// double is different because it is only supported in later architectures
	#if __CUDA_ARCH__ < 600
		unsigned long long int* address_as_ull = (unsigned long long int*)ref;
		unsigned long long int old = *address_as_ull, assumed;
		do {
			assumed = old;
			// Try to swap in (assumed + value). atomicCAS returns the
			// value it actually observed, so a mismatch means another
			// thread updated *ref first and we retry with the new value.
			old = atomicCAS(address_as_ull,
				assumed,
				__double_as_longlong(
					value + __longlong_as_double(assumed)
				)
			);
			// Comparing the integer bit patterns (not the doubles) means
			// the loop still terminates when *ref holds NaN.
		} while (assumed != old);
		return __longlong_as_double(old);
	#else
		return atomicAdd(ref, value);
	#endif
	}
};
#endif // !CUDA_ATOMIC_ADD_TEMPLATE_H

The final method would be written as follows:


#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "CudaAddAtomicTemplate.h"

template<typename T>
__global__ void cuda_array_sum_kernel(
    const unsigned int size, // The length of the array
    const T* values, // The values of the array
    T* out // The resulting sum
    )
{
    // Grid-stride loop: each thread accumulates a private partial sum,
    // then contributes it to *out with one atomic operation per thread.
    // Use unsigned arithmetic so the comparison against the unsigned
    // size parameter is well-defined (no signed/unsigned mismatch).
    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int stride = blockDim.x * gridDim.x;
    T val = 0;
    for (unsigned int i = index; i < size; i += stride) {
        val += values[i];
    }

    // The templated wrapper dispatches to the correct atomicAdd overload
    // (or the atomicCAS emulation for double on pre-6.0 architectures).
    CudaAtomicAdd<T> caa;
    caa.AtomicAdd(out, val);
}

Looking for a job?