This is a brief post on how to allocate device memory statically using the __device__ keyword. We explain how to statically allocate memory on the device and initialize it as in standard C code. This memory is then available to the device but cannot be directly referenced from the host (unless cudaGetSymbolAddress is used, as discussed in a previous post).

Device-side static allocation
To get straight to the point, let us go through the code and make some remarks (a complete sketch of the program is given at the end of the post). First, device memory for the variable dev_static is allocated statically: no cudaMemcpy is used to transfer the data and no cudaMalloc is used to allocate space on the device. This of course assumes that we know beforehand the dimension of the array we need to allocate.
__device__ float dev_static[NS] = {10.0, -20.0, 235.0};
Now let's look at the kernel:
__global__ void kernel(float *dx) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    dx[tid] += dev_static[tid];
}
The __device__ variable dev_static is accessible from inside the kernel (but not from the host). It would not be possible to pass the address of dev_static to the kernel directly from host code, although the host can obtain that address if necessary using cudaGetSymbolAddress.
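For example, a minimal sketch of how cudaGetSymbolAddress could be used on the host to retrieve the device address of dev_static (error checking omitted; the pointer name dev_static_ptr is just an illustration):

float *dev_static_ptr = NULL;
// Ask the runtime for the device address of the statically allocated symbol.
cudaGetSymbolAddress((void **)&dev_static_ptr, dev_static);
// dev_static_ptr can now be passed to a kernel or used with cudaMemcpy.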
Then, we have the kernel invocation:
kernel<<<1, NS>>>(dev_p);
where we pass the address of a variable that has been allocated in device linear memory using cudaMalloc and initialized using cudaMemcpy (the very standard way). As you may have guessed, the program prints:
host_p[0] = 11
host_p[1] = -18
host_p[2] = 238
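As promised, here is a minimal sketch of a complete program along these lines. The host-side initial values {1.0, 2.0, 3.0} are inferred from the printed output, and error checking is omitted; NS, dev_static, kernel, dev_p and host_p follow the snippets above.

#include <cstdio>
#include <cuda_runtime.h>

#define NS 3

// Statically allocated and initialized device array.
__device__ float dev_static[NS] = {10.0, -20.0, 235.0};

__global__ void kernel(float *dx) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    dx[tid] += dev_static[tid];
}

int main(void) {
    // Host data (initial values inferred from the output above).
    float host_p[NS] = {1.0, 2.0, 3.0};
    float *dev_p = NULL;

    // Allocate linear device memory and copy the host data to it.
    cudaMalloc((void **)&dev_p, NS * sizeof(float));
    cudaMemcpy(dev_p, host_p, NS * sizeof(float), cudaMemcpyHostToDevice);

    // One block of NS threads; dev_static is referenced inside the kernel.
    kernel<<<1, NS>>>(dev_p);

    // Copy the result back to the host and print it.
    cudaMemcpy(host_p, dev_p, NS * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < NS; i++) {
        printf("host_p[%d] = %g\n", i, host_p[i]);
    }

    cudaFree(dev_p);
    return 0;
}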

