diff --git a/testers/cuda_iteration_test.cu b/testers/cuda_iteration_test.cu new file mode 100644 index 0000000..980345c --- /dev/null +++ b/testers/cuda_iteration_test.cu @@ -0,0 +1,53 @@ +#include +#include + +#define N 11 +#define M 3 + +__global__ void kernel(float * d_matrix, size_t pitch) { + for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < N; j += blockDim.y * gridDim.y) { + float* row_d_matrix = (float*)((char*)d_matrix + j*pitch); + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { + row_d_matrix[i] = (j * M + i) + (j * M + i); + } + } +} + +void verify(float *h, float *d, int size) { + for (int i = 0; i < size; i++) { + assert(h[i] == d[i]); + } + printf("Results match\n"); +} + +int main() { + + float *h_matrix; + float *d_matrix; + float *dc_matrix; + + h_matrix = (float *) malloc(M * N * sizeof(float)); + dc_matrix = (float *) malloc(M * N * sizeof(float)); + + for (int j = 0; j < N; j++) { + for (int i = 0; i < M; i++) { + h_matrix[j * M + i] = (j * M + i) + (j * M + i); + } + } + + size_t pitch; + cudaMallocPitch(&d_matrix, &pitch, M * sizeof(float), N); + + dim3 grid(1, 1, 1); + dim3 block(3, 3, 1); + + kernel<<>>(d_matrix, pitch); + + cudaMemcpy2D(dc_matrix, M * sizeof(float), d_matrix, pitch, M * sizeof(float), N, cudaMemcpyDeviceToHost); + + verify(h_matrix, dc_matrix, M * N); + + free(h_matrix); + cudaFree(d_matrix); + free(dc_matrix); +}