// Forum code listing (labelled "PHP Code:" by the forum software); the source below is CUDA C++.
#include <cstdlib>
#include <iostream>
#include <vector>
// Smaller of two values. Both arguments and the whole expansion are
// parenthesized so the macro composes safely inside larger expressions
// (e.g. minimo(x, y) + 1). Each argument may still be evaluated twice, so do
// not pass expressions with side effects.
#define minimo(a,b) (((a)<(b))?(a):(b))
// Upper bound on the number of blocks used for the launch grid.
#define MAXBLOCKS 32
// Threads per block; also the size of the kernel's shared reduction buffer.
#define NTHREADS 256 // must be a power of 2
/*
 * Computes per-block partial sums of the integer dot product of a and b.
 *
 * a, b : device arrays holding at least `size` elements each.
 * c    : device array holding at least gridDim.x elements; block k stores its
 *        partial sum in c[k]. The caller adds the entries of c together to
 *        obtain the final dot product.
 * size : number of elements to multiply.
 *
 * Launch requirements: 1D grid and 1D block with blockDim.x <= NTHREADS, and
 * NTHREADS must be a power of two. Uses NTHREADS * sizeof(int) bytes of
 * static shared memory.
 */
__global__ void dot (int * a, int * b, int *c, int size){
    // One slot per thread for the block-wide reduction.
    __shared__ int shared_cache [NTHREADS];

    // Grid-stride loop. The index is kept in 64 bits so that adding the stride
    // to an index close to INT_MAX cannot overflow and wrap to a negative
    // (out-of-bounds) position.
    const long long stride = (long long)gridDim.x * blockDim.x;
    int sum = 0;
    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < size;
         idx += stride){
        sum += a[idx] * b[idx];
    }
    shared_cache[threadIdx.x] = sum;

    // The reduction below always folds all NTHREADS slots. When the block is
    // launched with fewer than NTHREADS threads, clear the slots that no
    // thread owns so the reduction never reads uninitialized shared memory.
    for (unsigned int slot = threadIdx.x + blockDim.x; slot < NTHREADS; slot += blockDim.x){
        shared_cache[slot] = 0;
    }
    __syncthreads();

    // Tree reduction: each step folds the upper half of the live range onto
    // the lower half. The barrier sits outside the branch so every thread in
    // the block reaches it.
    for (unsigned int half = NTHREADS / 2; half > 0; half /= 2){
        if (threadIdx.x < half){
            shared_cache[threadIdx.x] += shared_cache[threadIdx.x + half];
        }
        __syncthreads();
    }

    // Thread 0 publishes this block's partial result.
    if (threadIdx.x == 0){
        c[blockIdx.x] = shared_cache[0];
    }
}
// Prints a diagnostic and terminates the program when a CUDA runtime call
// reports an error; does nothing on cudaSuccess.
static void check_cuda (cudaError_t status, const char * what){
    if (status != cudaSuccess){
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

/*
 * Computes the dot product of two sample vectors on the GPU: the kernel
 * produces one partial sum per block and the host adds the partial sums.
 * Prints the result, waits for a key press, and returns 0.
 */
int main () {
    const int n = 100; /// size of arrays to multiply
    // Enough blocks to cover n elements, capped at MAXBLOCKS.
    const int n_blocks = minimo( MAXBLOCKS, ((n+NTHREADS-1)/NTHREADS));

    //////////////////////// sample data allocated on host
    // std::vector releases the host buffers automatically.
    std::vector<int> a(n, 1);
    std::vector<int> b(n, 2);
    std::vector<int> c(n_blocks, 0);

    const std::size_t input_bytes = n * sizeof(int);
    const std::size_t partial_bytes = n_blocks * sizeof(int);

    int * d_a = NULL;
    int * d_b = NULL;
    int * d_c = NULL;
    check_cuda(cudaMalloc((void**) &d_a, input_bytes), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void**) &d_b, input_bytes), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc((void**) &d_c, partial_bytes), "cudaMalloc(d_c)");

    check_cuda(cudaMemcpy(d_a, &a[0], input_bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(a -> d_a)");
    check_cuda(cudaMemcpy(d_b, &b[0], input_bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(b -> d_b)");

    dot<<< n_blocks ,NTHREADS>>>(d_a,d_b,d_c,n);
    // A kernel launch returns no status itself; query it explicitly.
    check_cuda(cudaGetLastError(), "dot kernel launch");

    // Blocking copy: it waits for the kernel to finish, so errors raised
    // while the kernel was running are reported here.
    check_cuda(cudaMemcpy(&c[0], d_c, partial_bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy(d_c -> c)");

    //// final sum on host
    int final_result = 0;
    for (int i = 0; i < n_blocks; ++i){
        final_result += c[i];
    }

    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_c), "cudaFree(d_c)");

    std::cout << final_result << std::endl;
    std::cin.get();
    return 0;
}