Assignment 3 GPU Course Lab
Key Points on Grids, Blocks, and Threads:
- Thread Index Calculation:
  `int idx = threadIdx.x + blockIdx.x * blockDim.x;`
  - `threadIdx.x`: The thread's index within a block.
  - `blockIdx.x`: The index of the block within the grid.
  - `blockDim.x`: The number of threads per block.
  - This formula ensures that each thread processes a unique index.
- Grid and Block Dimensions:
  `int threadsPerBlock = 256; int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;`
  - We choose `256` threads per block for efficiency.
  - The number of blocks is computed by rounding `size / threadsPerBlock` up, so that all elements are processed even when `size` is not a multiple of the block size.
- Parallel Execution:
  - Each thread computes `c[idx] = a[idx] + b[idx]` in parallel.
  - Multiple threads in a block work concurrently.
  - Multiple blocks together form a grid to handle large datasets.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
// Element-wise vector addition kernel: c[i] = a[i] + b[i] for 0 <= i < size.
//
// Each thread handles at most one element, selected by its flat index built
// from the x components of blockIdx, blockDim and threadIdx. Threads whose
// index is at or beyond `size` return without touching memory, so the launch
// may contain more threads than there are elements.
//
//   a, b  input vectors (read)
//   c     output vector (written)
//   size  number of elements to process
__global__ void vectorAdd(int *a, int *b, int *c, int size) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have nothing to do.
    if (gid >= size) {
        return;
    }

    const int sum = a[gid] + b[gid];
    c[gid] = sum;
}
// Prints a diagnostic to stderr and terminates the program if a CUDA runtime
// call did not return cudaSuccess. `what` names the operation for the message.
static void checkCudaOrExit(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Fills two host vectors with pseudo-random integers, adds them on the GPU
// with vectorAdd, and prints both inputs and the result to stdout.
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
// CUDA runtime call fails.
int main() {
    const int size = 1000;  // Number of elements
    // size_t avoids overflowing the byte count if `size` is ever made large.
    const size_t bytes = (size_t)size * sizeof(int);

    srand((unsigned int)time(NULL));

    // Host arrays
    int *h_a = (int *)malloc(bytes);
    int *h_b = (int *)malloc(bytes);
    int *h_c = (int *)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(h_a);  // free(NULL) is a no-op, so this is safe for any subset
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    // Initialize input vectors
    for (int i = 0; i < size; i++) {
        h_a[i] = rand() % 100 + 1;
        h_b[i] = (rand() % 100 + rand() % 100 + 1) % 100;
    }

    // Device arrays
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    checkCudaOrExit(cudaMalloc((void **)&d_a, bytes), "cudaMalloc(d_a)");
    checkCudaOrExit(cudaMalloc((void **)&d_b, bytes), "cudaMalloc(d_b)");
    checkCudaOrExit(cudaMalloc((void **)&d_c, bytes), "cudaMalloc(d_c)");

    // Copy data from host to device
    checkCudaOrExit(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice),
                    "copy of h_a to device");
    checkCudaOrExit(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice),
                    "copy of h_b to device");

    // Define grid and block dimensions; the block count is rounded up so the
    // last, partially filled block covers the remaining elements.
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    // Launch kernel. A launch does not return a status itself, so query the
    // runtime immediately afterwards to catch an invalid configuration.
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, size);
    checkCudaOrExit(cudaGetLastError(), "vectorAdd launch");

    // Copy result back to host. Checking this call also reports errors that
    // occurred while the kernel was executing.
    checkCudaOrExit(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost),
                    "copy of result to host");

    // Print both inputs and the result in full
    printf("Vector 1 : ");
    for (int i = 0; i < size; i++)
        printf("%d ", h_a[i]);
    printf("\n\nVector 2 : ");
    for (int i = 0; i < size; i++)
        printf("%d ", h_b[i]);
    printf("\n\nVector 3: ");
    for (int i = 0; i < size; i++) {
        printf("%d ", h_c[i]);
    }

    // Free memory
    free(h_a);
    free(h_b);
    free(h_c);
    checkCudaOrExit(cudaFree(d_a), "cudaFree(d_a)");
    checkCudaOrExit(cudaFree(d_b), "cudaFree(d_b)");
    checkCudaOrExit(cudaFree(d_c), "cudaFree(d_c)");
    return 0;
}