Dot Product - Shared cache, reduction and final partial sum on host - Giovanni Di Grezia

CUDA C++ Code:

#include <iostream>

// Smaller of two values. Arguments and result are fully parenthesized so the
// macro expands correctly inside larger expressions (e.g. minimo(x, y) * 2).
// Note: each argument is evaluated twice, so avoid side effects such as i++.
#define minimo(a,b) (((a) < (b)) ? (a) : (b))
// Upper bound on the number of blocks main() launches.
#define MAXBLOCKS 32
// Threads per block and size of the kernel's shared cache.
// Must be a power of 2: the kernel's reduction halves the range each step.
#define NTHREADS 256


/*
 * dot - per-block partial dot product of two int vectors.
 *
 * Each working thread accumulates a[i] * b[i] over the elements it owns
 * (starting at its global index and stepping by the total number of working
 * threads in the grid), stores its private sum in shared memory, and the
 * block then folds those sums pairwise until one value remains. Thread 0
 * writes that value to c[blockIdx.x]; the caller adds up the gridDim.x
 * entries of c to obtain the final dot product.
 *
 * Parameters:
 *   a, b  - input vectors with at least `size` elements each (only read)
 *   c     - output with at least gridDim.x elements; c[k] receives the
 *           partial sum of block k
 *   size  - number of elements to multiply; when size <= 0 every block
 *           writes 0
 *
 * Launch: 1D grid of 1D blocks. Uses NTHREADS * sizeof(int) bytes of static
 * shared memory. NTHREADS must be a power of 2. Any blockDim.x is accepted:
 * threads beyond NTHREADS stay idle, and slots not backed by a thread are
 * zero-filled, so the reduction never reads uninitialized shared memory or
 * writes past the cache.
 */
__global__ void dot (int * a, int * b, int *c, int size){

    const int tid = (int) threadIdx.x;

    // Number of threads in this block that own a slot of the shared cache.
    const int workers = ((int) blockDim.x < NTHREADS) ? (int) blockDim.x : NTHREADS;

    // 64-bit stride/index: first + stride could overflow a 32-bit int when
    // size is close to INT_MAX.
    const long long stride = (long long) gridDim.x * workers;

    // One partial sum per working thread.
    __shared__ int shared_cache [NTHREADS];

    // Zero the slots no worker will write (only happens when
    // blockDim.x < NTHREADS) so the reduction below can always span
    // NTHREADS entries.
    for (int slot = workers + tid; slot < NTHREADS; slot += (int) blockDim.x){
        shared_cache[slot] = 0;
    }

    if (tid < workers){
        int sum = 0;
        for (long long i = (long long) blockIdx.x * workers + tid; i < size; i += stride){
            sum += a[i] * b[i];
        }
        shared_cache[tid] = sum;
    }

    // Every slot must be written before any thread reads a neighbour's slot.
    __syncthreads();

    // Pairwise reduction of the cache: after each step the live range halves.
    for (int half = NTHREADS / 2; half > 0; half /= 2){
        if (tid < half){
            shared_cache[tid] += shared_cache[tid + half];
        }
        // Kept outside the branch so every thread of the block reaches it.
        __syncthreads();
    }

    // shared_cache[0] now holds the sum of the whole block.
    if (tid == 0){
        c[blockIdx.x] = shared_cache[0];
    }
}




/*
 * Reports a failed CUDA runtime call on stderr.
 * Returns true when `status` is cudaSuccess, false otherwise.
 */
static bool cuda_ok (cudaError_t status, const char * what){
    if (status == cudaSuccess){
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

/*
 * Host driver: fills two n-element vectors with 1 and 2, computes their dot
 * product on the device with the `dot` kernel, adds up the per-block partial
 * sums on the host and prints the result.
 *
 * Returns 0 on success and 1 when any CUDA call fails. All host and device
 * buffers are released on both paths.
 */
int main () {

    int n = 100; /// size of arrays to multiply
    // Enough blocks to give every element its own thread, capped at MAXBLOCKS.
    int n_blocks = minimo( MAXBLOCKS, ((n+NTHREADS-1)/NTHREADS));

    int * a = new int[n];
    int * b = new int[n];
    int * c = new int[n_blocks]; // one partial sum per block

    //// sample data allocated on host
    for (int i = 0; i < n; ++i){
        a[i] = 1;
        b[i] = 2;
    }

    // Initialised to null so the cudaFree calls below are safe even when an
    // allocation never happened.
    int * d_a = 0;
    int * d_b = 0;
    int * d_c = 0;

    // Short-circuit chain: the first failure skips every later step.
    bool ok = cuda_ok(cudaMalloc((void**) &d_a, n * sizeof(int)), "cudaMalloc(d_a)")
           && cuda_ok(cudaMalloc((void**) &d_b, n * sizeof(int)), "cudaMalloc(d_b)")
           && cuda_ok(cudaMalloc((void**) &d_c, n_blocks * sizeof(int)), "cudaMalloc(d_c)")
           && cuda_ok(cudaMemcpy(d_a, a, n * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)")
           && cuda_ok(cudaMemcpy(d_b, b, n * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

    if (ok){
        dot<<< n_blocks ,NTHREADS>>>(d_a,d_b,d_c,n);

        // A launch returns no status itself; cudaGetLastError reports a bad
        // configuration, and the blocking copy reports execution faults.
        ok = cuda_ok(cudaGetLastError(), "dot kernel launch")
          && cuda_ok(cudaMemcpy(c, d_c, n_blocks * sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> c)");
    }

    if (ok){
        //// final sum on host
        int final_result = 0;
        for (int i = 0; i < n_blocks; ++i){
            final_result += c[i];
        }

        std::cout << final_result << std::endl;
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    delete [] a;
    delete [] b;
    delete [] c;

    // Blocks until a character is read from stdin before exiting.
    std::cin.get();

    return ok ? 0 : 1;
}
Announcement

Dot Product - Shared cache, reduction and final partial sum on host

Dot Product - Shared cache, reduction and final partial sum on host

Google Profile