
Assignment 3: GPU Course Lab

Key Points on Grids, Blocks, and Threads:

  1. Thread Index Calculation:

    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    • threadIdx.x: the thread’s index within its block.
    • blockIdx.x: the index of the block within the grid.
    • blockDim.x: the number of threads per block.
    • This formula gives every thread a unique global index, so each element is processed exactly once (see the worked example after this list).

  2. Grid and Block Dimensions:

    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    • We choose 256 threads per block, a common size that keeps the GPU's multiprocessors well occupied.
    • The number of blocks is rounded up (ceiling division) so that all elements are covered even when size is not a multiple of the block size.

  3. Parallel Execution:

    • Each thread computes c[idx] = a[idx] + b[idx] independently and in parallel.
    • Threads within a block run concurrently on the same streaming multiprocessor.
    • Multiple blocks together form the grid, which lets the same kernel scale to large datasets.
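
Worked example: with size = 1000 and threadsPerBlock = 256, blocksPerGrid = (1000 + 255) / 256 = 4, so the launch creates 4 × 256 = 1024 threads. The thread with threadIdx.x = 10 in block blockIdx.x = 2 gets idx = 10 + 2 * 256 = 522 and adds element 522. Threads with idx 1000 through 1023 fall past the end of the arrays, which is exactly why the kernel guards with if (idx < size).

The complete program:
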
#include <stdio.h>
#include <stdlib.h>       // malloc, free, rand, srand
#include <time.h>         // time, used to seed the random generator
#include <cuda_runtime.h> // CUDA runtime API (cudaMalloc, cudaMemcpy, ...)

// CUDA Kernel for vector addition
__global__ void vectorAdd(int *a, int *b, int *c, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // Guard: the grid may contain more threads than elements,
    // so out-of-range threads must do nothing.
    if (idx < size) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    int size = 1000; // Number of elements
    int bytes = size * sizeof(int);
    srand(time(NULL)); // Seed the random number generator
    // Host arrays
    int *h_a, *h_b, *h_c;
    h_a = (int*)malloc(bytes);
    h_b = (int*)malloc(bytes);
    h_c = (int*)malloc(bytes);

    // Initialize input vectors with random values in [1, 100]
    for (int i = 0; i < size; i++) {
        h_a[i] = rand() % 100 + 1;
        h_b[i] = rand() % 100 + 1;
    }

    // Device arrays
    int *d_a, *d_b, *d_c;
    cudaMalloc((void**)&d_a, bytes);
    cudaMalloc((void**)&d_b, bytes);
    cudaMalloc((void**)&d_c, bytes);

    // Copy data from host to device
    cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);

    // Define grid and block dimensions
    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    // Launch the kernel (asynchronous; the cudaMemcpy below waits for it to finish)
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, size);

    // Copy result back to host
    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);

    // Print the input and result vectors
    printf("Vector 1 : ");
    for (int i = 0; i < size; i++)
        printf("%d ", h_a[i]);

    printf("\n\nVector 2 : ");
    for (int i = 0; i < size; i++)
        printf("%d ", h_b[i]);

    printf("\n\nVector 3 : ");
    for (int i = 0; i < size; i++)
        printf("%d ", h_c[i]);
    printf("\n");

    // Free memory
    free(h_a); free(h_b); free(h_c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

    return 0;
}
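
The listing above performs no error checking, which production CUDA code should not omit. The sketch below shows one common pattern, not part of the assignment's reference code: a macro (here called CHECK_CUDA, a name introduced only for illustration) wraps each runtime call, while cudaGetLastError and cudaDeviceSynchronize catch kernel launch and execution errors.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// CHECK_CUDA is a helper introduced here for illustration; it aborts with a
// readable message whenever a CUDA runtime call does not return cudaSuccess.
#define CHECK_CUDA(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",                   \
                    cudaGetErrorString(err_), __FILE__, __LINE__);         \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Usage, applied to the steps of the program above:
//   CHECK_CUDA(cudaMalloc((void**)&d_a, bytes));
//   CHECK_CUDA(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
//   vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, size);
//   CHECK_CUDA(cudaGetLastError());       // reports invalid launch configurations
//   CHECK_CUDA(cudaDeviceSynchronize());  // reports errors raised during execution

Assuming the source file is saved as vector_add.cu (the filename is not given in the assignment), it can be compiled and run with nvcc vector_add.cu -o vector_add followed by ./vector_add.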