OpenCL Guide
OpenCL Guide
OpenCL Guide
Introduction
OpenCL (Open Computing Language) is an open, royalty-free standard for general-purpose parallel programming across CPUs, GPUs and other processors.
OpenCL lets programmers write a single portable program that uses all the resources of a heterogeneous platform.
OpenCL programs
OpenCL programs are divided into two parts:
- One that executes on the device (in our case, on the GPU).
This is where you write kernels. The device program is the part you will mostly be concerned with.
- One that executes on the host (in our case, the CPU).
The host program offers an API for managing execution on the device. It can be written in C or C++, and it controls the OpenCL environment (context, command queue, ...).
// Sequential (CPU) reference version of the vector addition:
// stores src_a[k] + src_b[k] into res[k] for every k in [0, num).
//   src_a, src_b : input vectors, read only
//   res          : output vector, written for indices 0 .. num-1
//   num          : number of elements to process; nothing is written
//                  when num is zero or negative
void vector_add_cpu (const float* src_a, const float* src_b, float* res, const int num)
{
    int idx = 0;

    // Walk the vectors front to back, one element per iteration.
    while (idx < num) {
        res[idx] = src_a[idx] + src_b[idx];
        ++idx;
    }
}
Sample..
// Some interesting data for the vectors int InitialData1[20] = {37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17}; int InitialData2[20] = {35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15}; // Number of elements in the vectors to be added #define SIZE 2048 // Main function // ********************************************************************* int main(int argc, char **argv) { // Two integer source vectors in Host memory int HostVector1[SIZE], HostVector2[SIZE]; // Initialize with some interesting repeating data for(int c = 0; c < SIZE; c++) { HostVector1[c] = InitialData1[c%20]; HostVector2[c] = InitialData2[c%20]; }
Sample..
// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL); // Get the list of GPU devices associated with this context size_t ParmDataBytes; clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes); cl_device_id* GPUDevices = (cl_device_id*)malloc(ParmDataBytes); clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL); // Create a command-queue on the first GPU device cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], 0, NULL); // Allocate GPU memory for source vectors AND initialize from CPU memory cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector1, NULL); cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector2, NULL);
// Allocate output memory on GPU cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(int) * SIZE, NULL, NULL);
Sample..
// Create OpenCL program with source code cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 7, OpenCLSource, NULL, NULL); // Build the program (OpenCL JIT compilation) clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL); // Create a handle to the compiled OpenCL function (Kernel) cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "VectorAdd", NULL); // In the next step we associate the GPU memory with the Kernel arguments clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&GPUOutputVector); clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1); clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2);
Sample..
// Launch the Kernel on the GPU size_t WorkSize[1] = {SIZE}; // one dimensional Range
// Copy the output in GPU memory back to CPU memory int HostOutputVector[SIZE]; clEnqueueReadBuffer(GPUCommandQueue, GPUOutputVector, CL_TRUE, 0, SIZE * sizeof(int), HostOutputVector, 0, NULL, NULL); // Cleanup free(GPUDevices); clReleaseKernel(OpenCLVectorAdd); clReleaseProgram(OpenCLProgram); clReleaseCommandQueue(GPUCommandQueue); clReleaseContext(GPUContext); clReleaseMemObject(GPUVector1); clReleaseMemObject(GPUVector2); clReleaseMemObject(GPUOutputVector);
Sample
// Print out the results for (int Rows = 0; Rows < (SIZE/20); Rows++, printf("\n")) { for(int c = 0; c <20; c++) { printf("%c",(char)HostOutputVector[Rows * 20 + c]); } }
Thanks! Thanks!