#include "jacobi_kernel.hu"

/*
 * Helper macros.
 *
 * floord(n, d): integer division of n by d, with a separate branch for
 *               negative n. Not used by the kernels in this file; kept for
 *               any code that follows.
 * min(x, y):    smaller of the two arguments. Being a function-like macro it
 *               evaluates its arguments more than once, and from this point
 *               on it replaces every later `min(...)` call in this
 *               translation unit.
 */
#define floord(n,d) (((n)<0) ? -((-(n)+(d)-1)/(d)) : (n)/(d))
#define min(x,y) ((x) < (y) ? (x) : (y))

/*
 * kernel0 -- three-point averaging step.
 *
 * For every tile c1 handled by this block (c1 = blockIdx.x, then advancing by
 * 32768 while c1 < ub_v) and for i = B * c1 + threadIdx.x:
 *
 *     b[i + 1] = (a[i] + a[i + 1] + a[i + 2]) / 3      (integer division)
 *
 * The B + 2 input values of a tile are first staged in shared memory.
 *
 * Parameters:
 *   a     input array, read at indices B*c1 .. min(B*c1 + B + 1, N - 1)
 *   b     output array, written at index B*c1 + threadIdx.x + 1
 *   N     number of elements used to clamp the accesses
 *   T     unused in this kernel
 *   ub_v  exclusive upper bound on the tile index c1
 *   B     tile width; also the stride of the cooperative load loop
 *   c0    unused in this kernel
 *
 * Launch assumptions (not checked here -- TODO confirm against the host code):
 *   - blockDim.x == B, since the load loop strides by B rather than blockDim.x
 *   - B <= BLOCK_0, since shared_a has BLOCK_0 + 2 elements
 *   - the tile stride 32768 presumably equals the largest grid size the host
 *     launches with
 */
__global__ void kernel0(int *a, int *b, int N, int T, int ub_v, int B, int c0)
{
    const int b0 = blockIdx.x;
    const int t0 = threadIdx.x;
    const int tile_stride = 32768;

    __shared__ int shared_a[BLOCK_0 + 2];

    /* b0 is uniform within a block, so every thread runs the same number of
     * iterations and reaches both barriers below. */
    for (int c1 = b0; c1 < ub_v; c1 += tile_stride) {
        const int base = B * c1;
        /* Last shared slot to fill: the full halo (B + 1) or, on the final
         * tile, whatever is left of `a`. */
        const int last = min(B + 1, N - base - 1);

        for (int c2 = t0; c2 <= last; c2 += B)
            shared_a[c2] = a[base + c2];
        __syncthreads();

        const int private_p = base + t0;
        /* Only compute when all three inputs were loaded above, i.e.
         * t0 + 2 <= N - base - 1 (for t0 < B the other load bound, B + 1, is
         * already satisfied). Without this guard a thread in a partial tail
         * tile would read shared slots that were never written and store the
         * result at or beyond b[N - 1]. */
        if (N >= private_p + 3)
            b[private_p + 1] =
                ((shared_a[t0] + shared_a[t0 + 1]) + shared_a[t0 + 2]) / 3;
        /* Keep the next iteration's loads from overwriting shared_a while
         * other threads are still reading it. */
        __syncthreads();
    }
}

/*
 * kernel1 -- copy-back step.
 *
 * For every tile c1 handled by this block (same tile loop as kernel0) and for
 * i = B * c1 + threadIdx.x, copies b[i + 1] into a[i + 1] through a
 * per-thread shared-memory slot, provided i + 1 <= N - 1.
 *
 * Parameters:
 *   a     output array, written at index B*c1 + threadIdx.x + 1
 *   b     input array, read at the same index
 *   N     number of elements used to clamp the accesses
 *   T     unused in this kernel
 *   ub_v  exclusive upper bound on the tile index c1
 *   B     tile width
 *   c0    unused in this kernel
 *
 * Launch assumption (not checked here -- TODO confirm against the host code):
 *   - blockDim.x <= BLOCK_0, since shared_b is indexed by threadIdx.x
 */
__global__ void kernel1(int *a, int *b, int N, int T, int ub_v, int B, int c0)
{
    const int b0 = blockIdx.x;
    const int t0 = threadIdx.x;
    const int tile_stride = 32768;

    __shared__ int shared_b[BLOCK_0];

    for (int c1 = b0; c1 < ub_v; c1 += tile_stride) {
        const int private_w = B * c1 + t0;
        /* True when element private_w + 1 lies inside the N-element range. */
        const bool in_range = (N >= private_w + 2);

        if (in_range)
            shared_b[t0] = b[private_w + 1];
        __syncthreads();

        /* The store uses the same predicate as the load. Otherwise a thread
         * that skipped the load would write an uninitialized shared value to
         * an index >= N. */
        if (in_range)
            a[private_w + 1] = shared_b[t0];
        __syncthreads();
    }
}