
The fine print: Note that the code presented here is intended to demonstrate the use of shared memory in CUDA and is, under no circumstances, optimised for high throughput and efficiency!
The code - A first approach using statically allocated shared memory
Proper use of shared memory can lead to a considerable acceleration of your application. Shared memory is an on-chip memory that is shared among the threads of a block. As mentioned in the CUDA C Programming Guide, Section 3.2.3, "Any opportunity to replace global memory accesses by shared memory accesses should therefore be exploited." In this post we illustrate how to allocate and use shared memory through a simple example, matrix-vector multiplication, and we discuss the static and dynamic allocation of shared memory.

In this post we assume that the values of our matrix are stored column-by-column, i.e., in column-major order. We will first assume that the dimensions of our matrix are multiples of the block size we use, and then we will see how this assumption can be lifted. Let us first take a look at the kernel that multiplies a matrix dA with a vector dx:
#include <stdio.h>
#include <stdlib.h>
#include "helper_cuda.h"
#include "rand_data.h"

#define BLOCK_SIZE 16

/**
 * Performs a matrix-vector multiplication on the device.
 *
 * @param dA         Matrix A
 * @param dx         Vector x
 * @param dev_ptr_y  Result y = A*x
 */
template<typename T>
__global__ void matvec_kernel(
        T * dA,
        T * dx,
        T * dev_ptr_y)
{
    unsigned int bid = blockIdx.x;
    T * y_sub = dev_ptr_y + bid * BLOCK_SIZE;
    T y_val = 0.0;
    T * Asub;
    T * xsub;
    unsigned int row = threadIdx.x;
    unsigned int col = threadIdx.y;
    for (unsigned int m = 0; m < (NS / BLOCK_SIZE); ++m) {
        __shared__ T A_shared[BLOCK_SIZE * BLOCK_SIZE];
        __shared__ T x_shared[BLOCK_SIZE];
        Asub = dA + BLOCK_SIZE * (bid + m * NS);
        xsub = dx + m * BLOCK_SIZE;
        /* x_shared <--- xsub */
        if (row < BLOCK_SIZE)
            x_shared[row] = xsub[row];
        /* A_shared <--- Asub */
        if (row < BLOCK_SIZE && col < BLOCK_SIZE)
            A_shared[row + col * BLOCK_SIZE] = Asub[row + col * NS];
        __syncthreads();
        for (unsigned int e = 0; e < BLOCK_SIZE; ++e) {
            if (row < BLOCK_SIZE)
                y_val += A_shared[row + e * BLOCK_SIZE] * x_shared[e];
        }
        __syncthreads();
    }
    if (row < BLOCK_SIZE)
        y_sub[row] = y_val;
}
The data are all contained in rand_data.h as __device__ variables, statically allocated (well, not too important).
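The exact contents of that header are not important; structurally, it could look something like the sketch below. The sizes chosen here (16) and the idea that y_correct holds a reference result computed offline are my own assumptions for illustration; in the real header the arrays are filled with random values.

/* rand_data.h -- illustrative sketch only. NS is the number of rows of A,
 * NX the number of columns; the arrays hold randomly generated data. */
#define NS 16   /* hypothetical value, a multiple of BLOCK_SIZE          */
#define NX 16   /* hypothetical value                                    */
__device__ float dev_A[NS * NX];   /* matrix A, column-major             */
__device__ float dev_x[NX];        /* vector x                           */
static float y_correct[NS];        /* reference result y = A*x (host)    */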
In this example we have loaded the values of matrix A into shared memory, which is not necessary; we did so just to illustrate how this can be achieved. In fact, each value of A is used only once, so staging it through shared memory is redundant. The kernel above can be launched as follows:
int main(void) {
    checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
    float * dev_ptr_A = NULL;
    float * dev_ptr_x = NULL;
    float * hst_y = NULL;
    float * address_y = NULL;
    checkCudaErrors(
        cudaHostAlloc((void**)&hst_y, NS * sizeof(float), cudaHostAllocMapped));
    checkCudaErrors(cudaGetSymbolAddress((void**)&dev_ptr_A, dev_A));
    checkCudaErrors(cudaGetSymbolAddress((void**)&dev_ptr_x, dev_x));
    checkCudaErrors(cudaHostGetDevicePointer(&address_y, hst_y, 0));
    dim3 dim_grid(NS / BLOCK_SIZE);
    dim3 dim_block(BLOCK_SIZE, BLOCK_SIZE);
    matvec_kernel<float><<<dim_grid, dim_block>>>(dev_ptr_A, dev_ptr_x, address_y);
    getLastCudaError("matvec_kernel error");
    checkCudaErrors(cudaDeviceSynchronize());
    printf("Result :\n");
    for (unsigned int j = 0; j < NS; j++) {
        printf("%2.4f\n", hst_y[j]);
    }
    if (hst_y != NULL)
        checkCudaErrors(cudaFreeHost(hst_y));
    return EXIT_SUCCESS;
}
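To convince yourself that the kernel computes the right thing, you can compare its output against a plain CPU implementation of y = A*x in column-major order. Here is a minimal sketch of such a reference routine; it is not part of the original code, and it assumes host copies of A and x are available:

/* Reference matrix-vector product on the host, column-major storage. */
void matvec_reference(const float *A, const float *x, float *y,
                      unsigned int nRows, unsigned int nCols) {
    for (unsigned int i = 0; i < nRows; i++)
        y[i] = 0.0f;
    for (unsigned int j = 0; j < nCols; j++)
        for (unsigned int i = 0; i < nRows; i++)
            y[i] += A[i + j * nRows] * x[j]; /* A(i,j) lives at A[i + j*nRows] */
}

Comparing hst_y against the output of such a routine (up to a small relative tolerance) is essentially the check performed against y_correct in the last listing of this post.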
Externally allocated shared memory
In the previous example we used statically allocated shared memory, assuming that the size of A_shared and x_shared is known at compile time. This has a few clear disadvantages, especially if we need to run the same code with different block sizes. For this reason we replace the static allocation (the __shared__ declarations of A_shared and x_shared above) with dynamic allocation as follows:

extern __shared__ T smem[]; // Externally allocated shared memory
T *x_shared = (T *) smem;
T *A_shared = (T *) &smem[block_size];
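In other words, x_shared occupies the first block_size elements of smem and A_shared the following block_size * block_size elements, which is why the launch below reserves block_size * (block_size + 1) elements in total.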
and then we launch the kernel, specifying at launch time how much shared memory to allocate for smem, like this:
unsigned int shared_mem_bytes =
BLOCK_SIZE * (BLOCK_SIZE + 1) * sizeof(real_t);
matvec_kernel<real_t><<<dim_grid, dim_block, shared_mem_bytes>>>(
dev_ptr_A, dev_ptr_x, address_y);
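Since the amount of shared memory is now a run-time parameter, it is a good idea to check that the requested size fits within the per-block limit of the device before launching. A small sketch of such a check (not in the original code; it uses the runtime API and checkCudaErrors from helper_cuda.h, and assumes device 0):

int max_shmem = 0; /* maximum shared memory per block, in bytes */
checkCudaErrors(cudaDeviceGetAttribute(&max_shmem,
        cudaDevAttrMaxSharedMemoryPerBlock, 0));
if (shared_mem_bytes > (unsigned int) max_shmem) {
    fprintf(stderr, "Requested %u bytes of shared memory, device allows %d\n",
            shared_mem_bytes, max_shmem);
    return EXIT_FAILURE;
}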
Inexact tiling
The assumption of exact tiling is convenient, but too strong. Having understood the algorithm presented in the previous sections, we can now describe the method of inexact tiling, where we use square blocks of size BLOCK_SIZE-by-BLOCK_SIZE to cover the whole matrix A, regardless of its dimensions. The concept is illustrated in the following figure.

To do so, we need to decide the number of horizontal and vertical blocks needed to cover the matrix. Integer division in C does the trick! Let nCols and nRows be the number of columns and rows of A respectively. Then the number of horizontal blocks is (nCols + BLOCK_SIZE - 1)/BLOCK_SIZE and the number of vertical blocks is (nRows + BLOCK_SIZE - 1)/BLOCK_SIZE; for example, with nRows = 100 and BLOCK_SIZE = 16 we need (100 + 15)/16 = 7 vertical blocks. Additionally, one needs to take into account that certain blocks don't lie fully inside matrix A, so some of their threads need to remain idle (practically, they do absolutely nothing). For this purpose, we make use of guards, which are simply if blocks. For instance:
if (row < BLOCK_SIZE)
y_sub[row] = y_val;
becomes:
if (row < block_size && row + bid * block_size < nRows_)
y_sub[row] = y_val;
What is more, we now need to change the kernel's signature so as to pass it the dimensions of the matrix (as device variables, of course):
template<typename T>
__global__ void matvec_kernel(
const T * dA,
const T * dx,
T * dy,
const unsigned int * nRows,
const unsigned int * nx)
The new kernel reads as follows:
template<typename T>
__global__ void matvec_kernel(
        const T * dA,
        const T * dx,
        T * dy,
        const unsigned int * nRows,
        const unsigned int * nx)
{
    unsigned int bid = blockIdx.x;
    unsigned int row = threadIdx.x;
    unsigned int col = threadIdx.y;
    const unsigned int nRows_ = *nRows;
    const unsigned int nx_ = *nx;
    const unsigned int block_size = blockDim.x;
    const T * Asub;
    const T * xsub;
    extern __shared__ T smem[]; // Externally allocated shared memory
    T *x_shared = (T *) smem;
    T *A_shared = (T *) &smem[block_size];
    T * y_sub = dy + bid * block_size;
    T y_val = 0.0;
    for (unsigned int m = 0; m < ((nx_ + block_size - 1) / block_size); ++m)
    {
        Asub = dA + block_size * (bid + m * nRows_);
        xsub = dx + m * block_size;
        /* x_shared <--- xsub */
        if (row < block_size
                && m * block_size + row < nx_)  // GUARD 1
            x_shared[row] = xsub[row];
        /* A_shared <--- Asub */
        // GUARD 2
        if (row < block_size
                && col < block_size
                && row + col * nRows_ + block_size * (bid + m * nRows_) < (nRows_ * nx_))
        {
            A_shared[row + col * block_size] = Asub[row + col * nRows_];
        }
        __syncthreads();
        if (row < block_size)  // GUARD 3A
        {
            for (unsigned int e = 0; e < block_size; ++e)
            {
                if (e + m * block_size < nx_)  // GUARD 3B
                    y_val += A_shared[row + e * block_size] * x_shared[e];
            }
        }
        __syncthreads();
    } /* end for */
    if (row < block_size && row + bid * block_size < nRows_)  // GUARD 4
        y_sub[row] = y_val;
}
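The corresponding host code, including a simple verification of the result against the precomputed reference vector y_correct, is the following: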
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <assert.h>
#include "helper_cuda.h"
#include "rand_data.h"
#include <math.h>

#define BLOCK_SIZE 16

typedef float real_t;

int main(void) {
    checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
    real_t * dev_ptr_A = NULL;
    real_t * dev_ptr_x = NULL;
    real_t * hst_y = NULL;
    real_t * address_y = NULL;
    unsigned int host_nRows = NS;
    unsigned int host_nx = NX;
    unsigned int * dev_nRows;
    unsigned int * dev_nx;
    checkCudaErrors(cudaMalloc((void**)&dev_nRows, sizeof(*dev_nRows)));
    checkCudaErrors(cudaMalloc((void**)&dev_nx, sizeof(*dev_nx)));
    checkCudaErrors(cudaMemcpy(dev_nx, &host_nx, sizeof(*dev_nx), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(dev_nRows, &host_nRows, sizeof(*dev_nRows), cudaMemcpyHostToDevice));
    checkCudaErrors(
        cudaHostAlloc((void**)&hst_y, NS * sizeof(real_t), cudaHostAllocMapped));
    checkCudaErrors(cudaGetSymbolAddress((void**)&dev_ptr_A, dev_A));
    checkCudaErrors(cudaGetSymbolAddress((void**)&dev_ptr_x, dev_x));
    checkCudaErrors(cudaHostGetDevicePointer(&address_y, hst_y, 0));
    printf("GRID SIZE  :: [%d, 1, 1]\n", (host_nRows + BLOCK_SIZE - 1) / BLOCK_SIZE);
    printf("BLOCK SIZE :: [%d, %d, 1]\n", BLOCK_SIZE, BLOCK_SIZE);
    printf("THREADS    :: %d\n", BLOCK_SIZE * BLOCK_SIZE * ((host_nRows + BLOCK_SIZE - 1) / BLOCK_SIZE));
    dim3 dim_grid((host_nRows + BLOCK_SIZE - 1) / BLOCK_SIZE);
    dim3 dim_block(BLOCK_SIZE, BLOCK_SIZE);
    unsigned int shared_mem_bytes = BLOCK_SIZE * (BLOCK_SIZE + 1) * sizeof(real_t);
    matvec_kernel<real_t><<<dim_grid, dim_block, shared_mem_bytes>>>(
        dev_ptr_A, dev_ptr_x, address_y, dev_nRows, dev_nx);
    getLastCudaError("matvec_kernel error");
    checkCudaErrors(cudaDeviceSynchronize());
    float rel_err;
    float max_err = 0.0;
    for (unsigned int j = 0; j < host_nRows; j++) {
        rel_err = fabsf((hst_y[j] - y_correct[j]) / y_correct[j]);
        assert(rel_err < 1e-4);
        max_err = fmaxf(max_err, rel_err);
    }
    printf("MAX REL ERROR = %f\n", max_err);
    if (hst_y != NULL)
        checkCudaErrors(cudaFreeHost(hst_y));
    if (dev_nRows != NULL)
        checkCudaErrors(cudaFree(dev_nRows));
    if (dev_nx != NULL)
        checkCudaErrors(cudaFree(dev_nx));
    printf("Tests passed:: OK!\n");
    return EXIT_SUCCESS;
}
As I mentioned in the beginning, this code is not optimized. A lot of things can be done to improve its performance. I'll get back to you next week with a better version of the above implementation.