Moving To Parallel - Addition of 2 Matrices
Moving To Parallel - Addition of 2 Matrices
Addition of 2 Matrices
Matrix Addition
Block (0, 0)
rowIndex = threadIdx.y;
colIndex = threadIdx.x;
Thread (0, 0), Thread (1, 0), Thread (2, 0), Thread (3, 0), Thread (4, 0)
c[rowIndex][colIndex] = a[rowIndex][colIndex] +
b[rowIndex][colIndex];
}
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
Matrix Addition on the Device: add()
Device
rowIndex = blockIdx.y;
colIndex = blockIdx.x;
c[rowIndex][colIndex] = a[rowIndex][colIndex] +
b[rowIndex][colIndex];
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
Matrix Addition on the Device: add()
Device
Running the kernel with an (N * N) grid of blocks,
each with (N * N) threads.
Grid 1
c[rowIndex][colIndex] = a[rowIndex][colIndex] +
b[rowIndex][colIndex];
}
Matrix Addition on the Device: main()
#define N 1024
/*
 * Host entry point of the matrix-addition example.
 *
 * Only the declarations, the size computation and the cleanup are present
 * here; allocation, host/device copies and the kernel launch are not part
 * of this excerpt.
 *
 * Returns 0.
 */
int main(void) {
    // All pointers start out null so that the cleanup below is well-defined
    // even when nothing has been allocated: free(nullptr) and
    // cudaFree(nullptr) both perform no operation.
    int *a = nullptr, *b = nullptr, *c = nullptr;        // host copies of a, b, c
    int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;  // device copies of a, b, c

    // N^4 with N = 1024 is 2^40, which does not fit in an int. The product
    // is evaluated in size_t (the first operand is widened, the rest follow)
    // so neither the element count nor the byte count overflows a signed
    // 32-bit integer.
    size_t nb = (size_t)N * N * N * N;
    size_t size = nb * sizeof(int);

    // Cleanup
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}
Matrix Addition on the Device: add()
• Parallelized add() kernel
/*
 * add() kernel entry point.
 *
 * Parameters:
 *   a, b, c - int pointers; NOTE(review): presumably the two inputs and the
 *             output of the element-wise sum, living in device memory --
 *             confirm against the launch site.
 *   width   - int; NOTE(review): presumably the row length used to turn a
 *             (row, col) pair into a flat offset -- confirm.
 *
 * Only the index declarations are present in this block; the statements that
 * compute the indices and perform the addition are not shown here.
 */
__global__ void add(int *a, int *b, int *c, int width) {
    // `width` is already a parameter. Declaring a local of the same name in
    // the function's outermost block is a redeclaration error, so only the
    // two index variables are declared.
    int rowIndex, colIndex;
}
Matrix Addition on the Device: main()
#define N 16
/*
 * Host entry point of the matrix-addition example (N = 16 variant).
 *
 * Only the declarations, the size computation and the cleanup are present
 * here; allocation, host/device copies and the kernel launch are not part
 * of this excerpt.
 *
 * Returns 0.
 */
int main(void) {
    // All pointers start out null so that the cleanup below is well-defined
    // even when nothing has been allocated: free(nullptr) and
    // cudaFree(nullptr) both perform no operation.
    int *a = nullptr, *b = nullptr, *c = nullptr;  // host copies of a, b, c
    // device copies of a, b, c
    // NOTE(review): `w` is declared alongside the device pointers but is
    // never released below -- confirm whether it is allocated anywhere and,
    // if so, add the matching cudaFree.
    int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr, *w = nullptr;
    int width = 16;

    // Element and byte counts are kept in size_t: the first operand is
    // widened so the whole product is evaluated without signed-int overflow
    // should N or width be raised later.
    size_t nb = (size_t)N * N * N * N * width * width;
    size_t size = nb * sizeof(int);

    // Cleanup
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}