
Objective

The objective of this assignment is to become familiar with OpenCL.

Problem statement

1. Measuring the benefit of using OpenCL

2. Measuring the impact of work group size on performance (a sketch of how the work group size could be varied is shown after this list)
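
A minimal sketch of how the second measurement could be set up, assuming the same kernel, queue, and N as in Appendix A and reusing its event-profiling pattern (the local sizes 32 to 256 are illustrative, and N must be divisible by each of them):

size_t local_sizes[] = { 32, 64, 128, 256 };    // illustrative work group sizes
for (int s = 0; s < 4; s++) {
    size_t local = local_sizes[s];
    size_t global = N;                          // one work item per element
    cl_event ev;
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &ev);
    clWaitForEvents(1, &ev);
    cl_ulong t0 = 0, t1 = 0;
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(t0), &t0, NULL);
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(t1), &t1, NULL);
    cout << "local size " << local << ": " << (t1 - t0) / 1000.0 << " microsec" << endl;
    clReleaseEvent(ev);
}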

Methodology

By implementing the vector addition code given on the lecture slides, the following results were obtained.

To measure the time it takes to complete the memory copy from the device to the host, I timed the following blocking call:

clEnqueueReadBuffer(queue, c_buffer, CL_TRUE, 0, N * sizeof(cl_float), c, 0, NULL, NULL);
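
The timing harness itself is not shown above; a minimal sketch of how this blocking call could be timed with a host-side clock, using std::chrono and the same queue, c_buffer, N, and c as in Appendix A, is:

auto t_begin = std::chrono::high_resolution_clock::now();
clEnqueueReadBuffer(queue, c_buffer, CL_TRUE, 0, N * sizeof(cl_float), c, 0, NULL, NULL);
auto t_end = std::chrono::high_resolution_clock::now();
double usec = std::chrono::duration<double, std::micro>(t_end - t_begin).count();
cout << "device-to-host copy: " << usec << " microsec" << endl;

Because CL_TRUE makes the read blocking, the call does not return until the copy has completed, so the host-side clock brackets the whole transfer.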

Results and discussion

The following results were obtained by averaging the times from multiple runs of the code.

Task: time to complete the memory copy from device to host

Array length        Time (microsec)
1024                1.08
2048                1.1
4096                0.95
8192                1.016667
16384               1.016667
32768               2.55
65536               1.133333
131072              1.183333
262144              1.883333
67108864            2.53333
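
One way to interpret a row of this table, assuming the measured interval covers the full copy, is as an effective bandwidth (bytes moved divided by elapsed time); the sketch below uses time_microsec as a placeholder for a measured value from the table:

double bytes = (double)N * sizeof(cl_float);   // N elements of 4 bytes each
double seconds = time_microsec * 1e-6;         // placeholder: measured time in microseconds
double gb_per_s = bytes / seconds / 1e9;       // effective transfer bandwidth in GB/s
cout << "effective bandwidth: " << gb_per_s << " GB/s" << endl;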

Conclusion and suggestion

Appendix A
// Assignment4.cpp : This file contains the 'main' function. Program execution begins and ends there.
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <tchar.h>
#include <memory.h>
#include <windows.h>
#include "CL/cl_ext.h"
#include "utils.h"
#include <assert.h>
#include <iostream>
#include <chrono>
#include <ctime>
using namespace std::chrono;
using namespace std;
//====

const char* source =
    "__kernel void vec_add(__global const float *a,\n"
    "                      __global const float *b,\n"
    "                      __global float *c)\n"
    "{\n"
    "    int gid = get_global_id(0);\n"
    "    c[gid] = a[gid] + b[gid];\n"
    "}\n";

//=====

int main() {
    chrono::time_point<std::chrono::system_clock> start, end;

    int N = 67108864; // array length


    // Select a platform and a GPU device; create a context and a profiling-enabled queue.
    cl_platform_id platform;
    clGetPlatformIDs(1, &platform, NULL);
    cl_device_id device;
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    cl_context context = clCreateContext(0, 1, &device, NULL, NULL, NULL);
    cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);

    // Build the vector addition kernel from the source string above.
    cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
    clBuildProgram(program, 1, &device, NULL, NULL, NULL);


    start = std::chrono::system_clock::now();
    cl_kernel kernel = clCreateKernel(program, "vec_add", NULL);

    // Allocate and initialize the host input arrays.
    cl_float* a = (cl_float*)malloc(N * sizeof(cl_float));
    cl_float* b = (cl_float*)malloc(N * sizeof(cl_float));
    for (int i = 0; i < N; i++) {
        a[i] = (cl_float)i;
        b[i] = (cl_float)(N - i);
    }
    // Create device buffers; the input buffers copy the host data at creation time.
    cl_mem a_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        N * sizeof(cl_float), a, NULL);
    cl_mem b_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        N * sizeof(cl_float), b, NULL);
    // The output buffer is only written by the kernel, so no host pointer is copied here.
    cl_mem c_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
        N * sizeof(cl_float), NULL, NULL);
    size_t global_work_size = N;

    clSetKernelArg(kernel, 0, sizeof(a_buffer), (void*)&a_buffer);
    clSetKernelArg(kernel, 1, sizeof(b_buffer), (void*)&b_buffer);
    clSetKernelArg(kernel, 2, sizeof(c_buffer), (void*)&c_buffer);

    // Launch one work item per element; the local work size is left to the runtime (NULL).
    cl_event event;
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, NULL, 0, NULL, &event);
    clWaitForEvents(1, &event);
    clFinish(queue);

    // Kernel execution time from the event's profiling counters.
    cl_ulong time_start;
    cl_ulong time_end;
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
    double nanoSeconds = (double)(time_end - time_start);
    cout << "kernel execution time: " << nanoSeconds / 1000000.0 << " ms" << endl;

    // Read the result buffer back from device to host memory (blocking read).
    cl_float* c = (cl_float*)malloc(N * sizeof(cl_float));
    clEnqueueReadBuffer(queue, c_buffer, CL_TRUE, 0, N * sizeof(cl_float), c, 0, NULL, NULL);

    // Release OpenCL objects and host memory.
    free(a);
    free(b);
    free(c);
    clReleaseEvent(event);
    clReleaseMemObject(a_buffer);
    clReleaseMemObject(b_buffer);
    clReleaseMemObject(c_buffer);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    end = chrono::system_clock::now();
    chrono::duration<double> elapsed_seconds = end - start;
    cout << "elapsed time: " << elapsed_seconds.count() << " sec\n";

    system("pause");
    return 0;
}
