Announcement

Collapse
No announcement yet.

Dot Product - Shared cache, reduction and final partial sum on host

Collapse
X
  • Filter
  • Time
  • Show
Clear All
new posts

  • Dot Product - Shared cache, reduction and final partial sum on host

    PHP Code:
    #include <iostream>

    // Smaller of two values. Both arguments and the whole expansion are
    // parenthesized so the macro stays correct inside larger expressions
    // (e.g. 2 * minimo(x, y)). Each argument may be evaluated twice, so do not
    // pass expressions with side effects.
    #define minimo(a,b) (((a) < (b)) ? (a) : (b))
    // Upper bound on the number of blocks launched.
    #define MAXBLOCKS 32
    // Threads per block; also the size of the per-block shared cache.
    #define NTHREADS 256 // must be a power of 2


    /*
     * dot - computes one partial sum per block of the dot product of a and b.
     *
     * a, b : input arrays, each holding at least `size` ints; the kernel
     *        dereferences them, so they must be device-accessible pointers
     * c    : output array with one slot per block; c[blockIdx.x] receives the
     *        partial sum produced by that block
     * size : number of elements to multiply and accumulate
     *
     * Launch requirements: 1-D grid and 1-D block with blockDim.x == NTHREADS,
     * and NTHREADS a power of two (the halving reduction below relies on it).
     * Any grid size is valid because every thread strides over the input.
     * Uses NTHREADS * sizeof(int) bytes of static shared memory per block.
     * The caller finishes the job by adding up c[0 .. gridDim.x - 1].
     */
    __global__ void dot (int *a, int *b, int *c, int size){

        // First element owned by this thread, and the distance to its next one
        // (the total number of threads in the grid).
        int id = (blockIdx.x * blockDim.x) + threadIdx.x;
        int nextid = gridDim.x * blockDim.x;

        // One slot per thread, holding that thread's private partial sum.
        __shared__ int shared_cache [NTHREADS];

        // Grid-stride accumulation: threads whose first index is already past
        // `size` skip the loop and contribute 0.
        int sum = 0;
        for (; id < size; id += nextid){
            sum += a[id] * b[id];
        }

        shared_cache[threadIdx.x] = sum;

        // Every slot must be written before any thread reads a neighbour's slot.
        __syncthreads();

        // Tree reduction of the shared cache: each pass folds the upper half of
        // the active range onto the lower half, leaving the block total in slot 0.
        for (int i = NTHREADS / 2; i > 0; i /= 2){
            if (threadIdx.x < i){
                shared_cache[threadIdx.x] += shared_cache[threadIdx.x + i];
            }
            // Kept outside the branch so that all threads of the block reach it.
            __syncthreads();
        }

        // A single thread publishes the block's result.
        if (threadIdx.x == 0){
            c[blockIdx.x] = shared_cache[0];
        }
    }




    // Reports a failed CUDA runtime call on stderr.
    // Returns true when `status` is cudaSuccess, false otherwise.
    static bool cuda_ok (cudaError_t status, const char *what){
        if (status == cudaSuccess){
            return true;
        }
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        return false;
    }

    /*
     * Host driver: fills two n-element arrays with 1 and 2, launches `dot` to
     * get one partial sum per block, adds the partial sums on the host and
     * prints the result (2 * n, i.e. 200 for n = 100).
     *
     * Returns 0 on success and 1 if any CUDA call failed.
     */
    int main () {

        int n = 100; /// size of arrays to multiply

        // Enough blocks to give every element its own thread, capped at
        // MAXBLOCKS; the kernel's stride loop covers whatever is left over.
        int n_blocks = minimo(MAXBLOCKS, ((n+NTHREADS-1)/NTHREADS));

        int *a = new int[n];
        int *b = new int[n];
        int *c = new int[n_blocks];

        //////////////////////// sample data allocated on host
        for (int i = 0; i < n; ++i){
            a[i] = 1;
            b[i] = 2;
        }

        // Null-initialised so the cleanup below is safe even if an allocation fails.
        int *d_a = 0;
        int *d_b = 0;
        int *d_c = 0;

        // The && chain stops at the first failing call.
        bool ok =
            cuda_ok(cudaMalloc((void**) &d_a, n * sizeof(int)), "cudaMalloc d_a") &&
            cuda_ok(cudaMalloc((void**) &d_b, n * sizeof(int)), "cudaMalloc d_b") &&
            cuda_ok(cudaMalloc((void**) &d_c, n_blocks * sizeof(int)), "cudaMalloc d_c") &&
            cuda_ok(cudaMemcpy(d_a, a, n * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy a") &&
            cuda_ok(cudaMemcpy(d_b, b, n * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy b");

        if (ok){
            dot<<< n_blocks, NTHREADS >>>(d_a, d_b, d_c, n);

            // A launch does not return a status: configuration errors show up in
            // cudaGetLastError(), and faults raised while the kernel runs are
            // reported by the blocking copy that follows.
            ok = cuda_ok(cudaGetLastError(), "dot launch") &&
                 cuda_ok(cudaMemcpy(c, d_c, n_blocks * sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy c");
        }

        if (ok){
            //// final sum on host
            int final_result = 0;
            for (int i = 0; i < n_blocks; ++i){
                final_result += c[i];
            }

            std::cout << final_result << std::endl;
        }

        // Release device and host buffers on every path.
        ok = cuda_ok(cudaFree(d_a), "cudaFree d_a") && ok;
        ok = cuda_ok(cudaFree(d_b), "cudaFree d_b") && ok;
        ok = cuda_ok(cudaFree(d_c), "cudaFree d_c") && ok;
        delete[] a;
        delete[] b;
        delete[] c;

        // Keep the console window open until the user presses Enter.
        std::cin.get();

        return ok ? 0 : 1;
    }
    | VFX Artist, C++ Programmer, HW Overclocker | Web: xgiovio.com Email: xgiovio@gmail.com Twitter: @xgiovio
Working...
X

Google Profile


My name is Giovanni Di Grezia, but people call me xgiovio.

Here is my homepage: xgiovio.com.

I'm a VFX Artist and Software Developer.

Giovanni Di Grezia