0% found this document useful (0 votes)
75 views8 pages

Ejercicio 2 Práctica 3: CUDA Desempeño en Función de La Homogeneidad para Acceder A Memoria y de La Regularidad Del Código

This document contains the code for an exercise on CUDA that analyzes performance based on memory access regularity and homogeneity. It defines a kernel that squares or adds array elements in parallel depending on a thread's group, implements timing using CUDA events, and checks for errors. Main allocates host and device memory, copies data to the device, runs the kernel concurrently on blocks of threads, returns results to the host, and prints timing results.

Uploaded by

Hecttor Juarez
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as RTF, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
75 views8 pages

Ejercicio 2 Práctica 3: CUDA Desempeño en Función de La Homogeneidad para Acceder A Memoria y de La Regularidad Del Código

This document contains the code for an exercise on CUDA that analyzes performance based on memory access regularity and homogeneity. It defines a kernel that squares or adds array elements in parallel depending on a thread's group, implements timing using CUDA events, and checks for errors. Main allocates host and device memory, copies data to the device, runs the kernel concurrently on blocks of threads, returns results to the host, and prints timing results.

Uploaded by

Hecttor Juarez
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as RTF, PDF, TXT or read online on Scribd
You are on page 1/ 8

/*
 * Ejercicio 2 Práctica 3: CUDA
 * Desempeño en función de la homogeneidad para acceder a memoria
 * y de la regularidad del código
 */

#include <stdio.h>
#include <stdlib.h>

//PP#include <cuda.h>

/* Step, in elements, between consecutive accesses made by one thread in
   square_array; also the divisor/modulus used when computing each thread's
   first index. */
#define STRIDE 32

/* Added to threadIdx.x before it is reduced modulo STRIDE when computing a
   thread's first index in square_array. */
#define OFFSET 0

/* square_array computes (threadIdx.x / GROUP_SIZE) & 1 to pick the operation:
   0 squares the element, 1 adds the element to itself. */
#define GROUP_SIZE 512

/* Utility for checking CUDA errors (defined further down in this file).


*/

void checkCUDAError(const char*);

// Kernel that executes on the CUDA device.
//
// Updates the array `a` in place. Each thread handles
// N / (gridDim.x * blockDim.x) elements (integer division), visiting them
// STRIDE elements apart, starting at an index derived from its block index,
// its thread index, STRIDE and OFFSET.
//
// The operation applied depends on (threadIdx.x / GROUP_SIZE) & 1:
//   0 -> a[idx] = a[idx] * a[idx]
//   1 -> a[idx] = a[idx] + a[idx]
//
// Parameters:
//   a - array of at least N floats, read and written by the kernel
//   N - number of elements in `a`; the per-thread end index is clamped to N
//
// The kernel only uses the x dimension of the grid and of the block.
__global__ void square_array(float *a, int N)
{
    // Number of elements each thread is responsible for.
    int n_elem_per_thread = N / (gridDim.x * blockDim.x);

    // First index of the region owned by this block.
    int block_start_idx = n_elem_per_thread * blockIdx.x * blockDim.x;

    // First index touched by this thread: threads are grouped STRIDE at a
    // time; each group owns n_elem_per_thread * STRIDE consecutive elements
    // and a thread's position inside its group is (threadIdx.x + OFFSET)
    // modulo STRIDE.
    int thread_start_idx = block_start_idx
        + (threadIdx.x / STRIDE) * n_elem_per_thread * STRIDE
        + ((threadIdx.x + OFFSET) % STRIDE);

    // One-past-the-last index for this thread, never beyond the array.
    int thread_end_idx = thread_start_idx + n_elem_per_thread * STRIDE;
    if (thread_end_idx > N)
        thread_end_idx = N;

    // Selects which of the two operations this thread performs.
    int group = (threadIdx.x / GROUP_SIZE) & 1;

    for (int idx = thread_start_idx; idx < thread_end_idx; idx += STRIDE)
    {
        if (!group)
            a[idx] = a[idx] * a[idx];
        else
            a[idx] = a[idx] + a[idx];
    }
}

// Main routine that executes on the host.
//
// Allocates an array of N floats on the host and on the device, fills the
// host copy with 0, 1, 2, ..., copies it to the device, launches
// square_array on it, copies the result back, prints a sample of the
// results and the elapsed time measured with CUDA events, then releases
// all resources.
//
// Returns 0 on success. Exits with EXIT_FAILURE if the host allocation
// fails or if checkCUDAError detects a CUDA error.
int main(void)
{
    float *a_h, *a_d;               // Pointers to host & device arrays
    const int N = 1 << 22;          // Make a big array with 2**22 elements
    size_t size = N * sizeof(float);

    /* Helpers for measuring time */
    cudaEvent_t start, stop;
    float elapsed_ms;

    // Allocate array on host
    a_h = (float *)malloc(size);
    if (a_h == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed.\n",
                (unsigned long)size);
        return EXIT_FAILURE;
    }

    // Allocate array on device
    cudaMalloc((void **) &a_d, size);
    checkCUDAError("cudaMalloc");

    // Initialize host array and copy it to CUDA device
    for (int i = 0; i < N; i++)
        a_h[i] = (float)i;

    //for (int i=0; i<N; i+=N/50) printf("%d %f\n", i, a_h[i]);

    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy");

    // Create events for timing the CUDA calculation
    //PPunsigned int timer = 0;
    //PPcutCreateTimer( &timer );
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Set number of threads and blocks
    int n_threads_per_block = 128;  // alternative: 1<<9 (512 threads per block)
    int n_blocks = 256;             // alternative: 1<<10 (1024 blocks)

    // Do calculation on device
    cudaEventRecord(start, 0);
    square_array <<< n_blocks, n_threads_per_block >>> (a_d, N);

    // Wait for square_array to finish on the device so that execution
    // errors are reported here rather than at a later call.
    cudaDeviceSynchronize();
    checkCUDAError("kernel invocation");

    // Retrieve result from device and store it in host array
    cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost);
    checkCUDAError("memcpy");

    // NOTE: the stop event is recorded after the device-to-host copy, so the
    // reported time covers the kernel, the synchronization and the copy back.
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_ms, start, stop);

    // Print some of the results
    for (int i = 0; i < N; i += N / 50)
        printf("%d %f\n", i, a_h[i]);

    // Print execution time
    printf("\n\nTIEMPO DE EJECUCIÓN: %f mSeg\n\n", elapsed_ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    free(a_h);
    cudaFree(a_d);

    return 0;
}

/* Utility function to check for and report CUDA errors.
 *
 * Fetches (and clears) the most recent CUDA runtime error with
 * cudaGetLastError(). If it is anything other than cudaSuccess, prints
 * "Cuda error: <msg>: <error string>." to stderr and terminates the
 * process with EXIT_FAILURE; otherwise returns normally.
 *
 * msg - label identifying the operation that was just attempted.
 */
void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();

    if (cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

You might also like