Dot Product - Shared cache, reduction and final partial sum on host - Giovanni Di Grezia

xgiovio

xgiovio

Join Date: Mar 2011

Posts: 190
- Share
- Tweet
#1

Dot Product - Shared cache, reduction and final partial sum on host

11-16-2013, 03:02 PM

CUDA C++ Code:

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <iostream>

// Minimum of two values. Fully parenthesised so it expands correctly inside
// larger expressions (e.g. `2 * minimo(x, y)`). Arguments are still evaluated
// more than once, so do not pass expressions with side effects.
#define minimo(a, b) (((a) < (b)) ? (a) : (b))

// Upper bound on the number of blocks launched by main().
#define MAXBLOCKS 32

// Threads per block. The tree reduction in dot() halves the active range on
// every step, so this must be a power of 2.
#define NTHREADS 256

#if (NTHREADS <= 0) || ((NTHREADS & (NTHREADS - 1)) != 0)
#error "NTHREADS must be a positive power of 2"
#endif

// Abort with a readable message if a CUDA runtime call fails. Without this,
// an early failure goes unnoticed and the program prints a meaningless result.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_check_err_ = (call);                                \
        if (cuda_check_err_ != cudaSuccess) {                                \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,         \
                         __LINE__, cudaGetErrorString(cuda_check_err_));     \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)

// Computes per-block partial sums of the dot product of `a` and `b`.
//
// Launch layout: 1-D grid, 1-D block with exactly NTHREADS threads (the
// shared cache and the reduction are both sized by NTHREADS).
// Shared memory: NTHREADS * sizeof(int) bytes, statically allocated.
//
//   a, b : device arrays holding at least `size` ints (read only)
//   c    : device array holding at least gridDim.x ints; c[blockIdx.x]
//          receives the partial sum produced by that block
//   size : number of elements in `a` and `b`
//
// The partial sums are accumulated in `int`, so large inputs can overflow.
// The caller adds up the gridDim.x entries of `c` to get the final result.
__global__ void dot(const int *a, const int *b, int *c, int size) {
    // The code below indexes shared_cache with threadIdx.x and starts the
    // reduction at NTHREADS / 2, so any other block size is a caller bug.
    assert(blockDim.x == NTHREADS);

    // One slot per thread for the block-local reduction.
    __shared__ int shared_cache[NTHREADS];

    // 64-bit index and stride: with an `int` index, `i += stride` could
    // overflow (undefined behaviour) when `size` is close to INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;

    // Each thread accumulates the products of the elements it owns, stepping
    // by the total number of threads in the grid.
    int sum = 0;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < size; i += stride) {
        sum += a[i] * b[i];
    }

    shared_cache[threadIdx.x] = sum;
    __syncthreads();  // every slot must be written before any is read

    // Tree reduction of the shared cache: each step folds the upper half of
    // the active range onto the lower half.
    for (int half = NTHREADS / 2; half > 0; half /= 2) {
        if (threadIdx.x < half) {
            shared_cache[threadIdx.x] += shared_cache[threadIdx.x + half];
        }
        // Outside the `if` so that every thread of the block reaches it.
        __syncthreads();
    }

    // Thread 0 publishes the block's partial sum.
    if (threadIdx.x == 0) {
        c[blockIdx.x] = shared_cache[0];
    }
}

// Fills two arrays of n ints with 1 and 2, computes their dot product on the
// GPU (partial sums per block, final sum on the host), prints the result and
// waits for a key press before exiting.
int main() {
    const int n = 100;  // size of the arrays to multiply

    // Enough blocks to cover n once, capped at MAXBLOCKS; the loop inside the
    // kernel picks up any elements beyond the first pass.
    const int n_blocks = minimo(MAXBLOCKS, ((n + NTHREADS - 1) / NTHREADS));

    int *a = new int[n];
    int *b = new int[n];
    int *c = new int[n_blocks];

    // Sample data allocated on the host.
    for (int i = 0; i < n; ++i) {
        a[i] = 1;
        b[i] = 2;
    }

    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;

    CUDA_CHECK(cudaMalloc((void **)&d_a, n * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void **)&d_b, n * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void **)&d_c, n_blocks * sizeof(int)));

    CUDA_CHECK(cudaMemcpy(d_a, a, n * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, n * sizeof(int), cudaMemcpyHostToDevice));

    dot<<<n_blocks, NTHREADS>>>(d_a, d_b, d_c, n);
    // A kernel launch returns nothing; launch errors are reported here.
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: also waits for the kernel and reports execution errors.
    CUDA_CHECK(cudaMemcpy(c, d_c, n_blocks * sizeof(int),
                          cudaMemcpyDeviceToHost));

    // Final sum of the per-block partial results on the host.
    int final_result = 0;
    for (int i = 0; i < n_blocks; ++i) {
        final_result += c[i];
    }

    std::cout << final_result << std::endl;

    // Release device and host memory.
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    delete[] a;
    delete[] b;
    delete[] c;

    std::cin.get();  // keep the console window open until Enter is pressed
    return 0;
}

| VFX Artist, C++ Programmer, HW Overclocker | Web: xgiovio.com Email: xgiovio@gmail.com Twitter: @xgiovio
Tags: None

Announcement

Dot Product - Shared cache, reduction and final partial sum on host

Dot Product - Shared cache, reduction and final partial sum on host

Google Profile