#include <assert.h>
#include "jacobi_kernel.hu"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "mf_pragma.h"

/* Floor division of n by d for positive d (rounds toward negative infinity). */
#define floord(n,d) (((n)<0) ? -((-(n)+(d)-1)/(d)) : (n)/(d))

/* Evaluate a CUDA runtime call; on failure print the error string and assert. */
#define cudaCheckReturn(ret) \
  do { \
    cudaError_t cudaCheckReturn_e = (ret); \
    if (cudaCheckReturn_e != cudaSuccess) { \
      fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(cudaCheckReturn_e)); \
      fflush(stderr); \
    } \
    assert(cudaCheckReturn_e == cudaSuccess); \
  } while(0)

/* Check the error state left behind by the most recent kernel launch. */
#define cudaCheckKernel() \
  do { \
    cudaCheckReturn(cudaGetLastError()); \
  } while(0)

/*
 * Allocate `count` zero-initialised ints on the heap, terminating the program
 * with a diagnostic if the allocation fails.
 */
static int *allocInts(int count)
{
  int *p = (int *) calloc((size_t) count, sizeof(int));
  if (p == NULL) {
    fprintf(stderr, "error: cannot allocate %d ints\n", count);
    exit(EXIT_FAILURE);
  }
  return p;
}

/*
 * Driver for the 1-D three-point integer Jacobi benchmark.
 *
 * Usage: ./jacobi iterations input_size block_size
 *   iterations  number of time steps T           (default 4)
 *   input_size  number of array elements N       (default 1026)
 *   block_size  threads per block B              (default 128)
 *
 * Fills an array with random values, runs T steps on the GPU through
 * kernel0/kernel1 (declared in jacobi_kernel.hu), runs the same T steps
 * serially on the host, prints the accumulated kernel time and reports every
 * index where the two results differ.
 *
 * Returns 0 in all cases that reach the end of main; invalid numeric
 * arguments terminate with EXIT_FAILURE.
 */
int main (int argc, char *argv[])
{
  int T = 4, N = 1026, x = 11, B = 128;

  if (argc > 1 && argc != 4) {
    printf("usage: ./jacobi iterations input_size block_size\n");
    exit(0);  /* historical exit status, kept so existing scripts behave the same */
  }
  if (argc == 4) {
    T = atoi(argv[1]);
    N = atoi(argv[2]);
    B = atoi(argv[3]);
  }

  /* B is used as a divisor and as the block dimension, N sizes every buffer:
     reject values that would divide by zero or request an empty/negative
     allocation. */
  if (N < 1 || B < 1) {
    fprintf(stderr, "error: input_size and block_size must be positive\n");
    exit(EXIT_FAILURE);
  }

  const size_t bytes = (size_t) N * sizeof(int);

  /* a/b: GPU input and scratch; c/d: host reference input and scratch.
     Heap allocation avoids overflowing the stack for large N; calloc gives
     b a defined value before it is copied to the device. */
  int *a = allocInts(N);
  int *b = allocInts(N);
  int *c = allocInts(N);
  int *d = allocInts(N);

  srand (time(NULL));
  for (int i = 0; i < N; ++i) {
    a[i] = rand() % x;
    c[i] = a[i];
  }

  /* Number of full blocks of B threads that fit in the N-2 interior points. */
  int ub_v = (N - 2) / B;

  /* Accumulated and per-launch kernel time, in milliseconds. */
  float gpu_ms = 0, launch_ms = 0;
  cudaEvent_t start, stop;
  cudaCheckReturn(cudaEventCreate(&start));
  cudaCheckReturn(cudaEventCreate(&stop));

  /* NOTE(review): when this guard is false (e.g. N - 2 < B) the GPU phase is
     skipped entirely while the serial phase below still runs, so the final
     comparison may report mismatches — confirm this is intended. */
  if (T >= 1 && ub_v >= 1) {
    int *dev_a;
    int *dev_b;

    cudaCheckReturn(cudaMalloc((void **) &dev_a, bytes));
    cudaCheckReturn(cudaMalloc((void **) &dev_b, bytes));

    cudaCheckReturn(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    cudaCheckReturn(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    for (int c0 = 0; c0 < T; c0 += 1) {
      {
        dim3 k0_dimBlock(B);
        dim3 k0_dimGrid(ub_v <= 32767 ? ub_v : 32768);
        cudaCheckReturn(cudaEventRecord(start, 0));
        kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_a, dev_b, N, T, ub_v, B, c0);
        /* Query the launch status immediately so a bad configuration is
           reported as such rather than by a later API call. */
        cudaCheckKernel();
        cudaCheckReturn(cudaEventRecord(stop, 0));
        cudaCheckReturn(cudaEventSynchronize(stop));
        cudaCheckReturn(cudaEventElapsedTime(&launch_ms, start, stop));
        gpu_ms += launch_ms;
      }

      {
        dim3 k1_dimBlock(B);
        dim3 k1_dimGrid(ub_v <= 32767 ? ub_v : 32768);
        cudaCheckReturn(cudaEventRecord(start, 0));
        kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_a, dev_b, N, T, ub_v, B, c0);
        cudaCheckKernel();
        cudaCheckReturn(cudaEventRecord(stop, 0));
        cudaCheckReturn(cudaEventSynchronize(stop));
        cudaCheckReturn(cudaEventElapsedTime(&launch_ms, start, stop));
        gpu_ms += launch_ms;
      }
    }

    cudaCheckReturn(cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost));
    cudaCheckReturn(cudaMemcpy(b, dev_b, bytes, cudaMemcpyDeviceToHost));

    cudaCheckReturn(cudaFree(dev_a));
    cudaCheckReturn(cudaFree(dev_b));
  }

  // serial Jacobi reference: each interior point becomes the integer average
  // of itself and its two neighbours; d holds the new values until the sweep
  // is complete.
  for (int t = 0; t < T; ++t) {
    for (int i = 0; i < N-2; ++i)
      d[i+1] = (c[i] + c[i+1] + c[i+2]) / 3;
    for (int i = 0; i < N-2; ++i)
      c[i+1] = d[i+1];
  }

  cudaCheckReturn(cudaEventDestroy(start));
  cudaCheckReturn(cudaEventDestroy(stop));

  /* Elapsed times are reported by the runtime in milliseconds. */
  printf("parallel running time: %f secs\n", gpu_ms/1000);

  // compare the serial and parallel results and report every mismatch
  int isCorrect = 1;
  for (int i = 0; i < N; i++) {
    if (c[i] != a[i]) {
      printf("== [%d] ==\n", i);
      printf("%d :: %d\n", c[i], a[i]);
      isCorrect = 0;
    }
  }

  if (isCorrect) {
    printf("pass.\n");
  }

  free(a);
  free(b);
  free(c);
  free(d);

  return 0;
}