CUDA[4] sample program: matrix-vector multiplication

Use Compressed Sparse Row Format (CSR) to represent matrix
  1 #include "cuda_runtime.h"
  2 #include "device_launch_parameters.h"
  3 #include "gputimer.h"
  4 #include<stdio.h>
  5 #include<stdlib.h>
  6 #include<string.h>
  7 #define WARP_SIZE 32
  8 
  9 __global__ void
 10 spmv_csr_vector_kernel ( const int num_rows ,
 11                          const int * ptr ,
 12                          const int * indices ,
 13                          const double * data ,
 14                          const double * x ,
 15                          double * y)
 16 {
 17     __shared__ double vals [WARP_SIZE];
 18     int thread_id = blockDim.x * blockIdx.x + threadIdx.x ; // global thread index
 19     int warp_id = thread_id / WARP_SIZE; // global warp index
 20     int lane = thread_id & (WARP_SIZE - 1); // thread index within the warp
 21     // one warp per row
 22     int row = warp_id ;
 23     if ( row < num_rows )
 24     {
 25         int row_start = ptr [ row ];
 26         int row_end = ptr [ row +1];
 27         // compute running sum per thread
 28         vals [ threadIdx.x ] = 0;
 29         for ( int jj = row_start + lane ; jj < row_end ; jj += WARP_SIZE)
 30         vals [ threadIdx.x ] += data [ jj ] * x [ indices [ jj ]];
 31         // parallel reduction in shared memory
 32         if ( lane < 16) vals [ threadIdx.x ] += vals [ threadIdx.x + 16];
 33         if ( lane < 8) vals [ threadIdx.x ] += vals [ threadIdx.x + 8];
 34         if ( lane < 4) vals [ threadIdx.x ] += vals [ threadIdx.x + 4];
 35         if ( lane < 2) vals [ threadIdx.x ] += vals [ threadIdx.x + 2];
 36         if ( lane < 1) vals [ threadIdx.x ] += vals [ threadIdx.x + 1];
 37         // first thread writes the result
 38         if ( lane == 0)
 39         y[ row ] += vals [ threadIdx.x ];
 40     }
 41 }
 42 
 43 __global__ void
 44 spmv_csr_scalar_kernel ( const int num_rows ,
 45                          const int * ptr ,
 46                          const int * indices ,
 47                          const double * data ,
 48                          const double * x ,
 49                          double * y)
 50 {
 51     int row = blockDim.x * blockIdx.x + threadIdx.x ;
 52     if( row < num_rows )
 53     {
 54         double dot = 0;
 55         int row_start = ptr [ row ];
 56         int row_end = ptr [ row +1];
 57         for (int jj = row_start ; jj < row_end ; jj ++)
 58             dot += data [ jj ] * x[ indices [ jj ]];
 59         y[ row ] += dot ;
 60     }
 61 }
 62 
 63 int main(int argc,char **argv)
 64 {
 65     double h_data[32]={1,4,2,3,5,7,8,9,6};
 66     int h_col[32]={0,1,1,2,0,3,4,2,4};
 67     int h_ptr[32]={0,2,4,7,9};
 68     double h_x[32]={1,2,3,4,5};
 69     double h_y[32]={0,0,0,0};
 70     int num_rows=4;
 71 
 72     double *d_data;
 73     int *d_col;
 74     int *d_ptr;
 75     double *d_x;
 76     double *d_y;
 77 
 78     cudaMalloc((void**) &d_data,sizeof(double)*32);
 79     cudaMalloc((void**) &d_col,sizeof(int)*32);
 80     cudaMalloc((void**) &d_ptr,sizeof(int)*32);
 81     cudaMalloc((void**) &d_x,sizeof(double)*32);
 82     cudaMalloc((void**) &d_y,sizeof(double)*32);
 83     cudaMemcpy((void*)d_data, (void*)h_data, sizeof(double)*32, cudaMemcpyHostToDevice);
 84     cudaMemcpy((void*)d_col, (void*)h_col, sizeof(int)*32, cudaMemcpyHostToDevice);
 85     cudaMemcpy((void*)d_ptr, (void*)h_ptr, sizeof(int)*32, cudaMemcpyHostToDevice);
 86     cudaMemcpy((void*)d_x, (void*)h_x, sizeof(double)*32, cudaMemcpyHostToDevice);
 87     cudaMemcpy((void*)d_y, (void*)h_y, sizeof(double)*32, cudaMemcpyHostToDevice);
 88 
 89     GpuTimer timer;
 90     timer.Start();
 91     spmv_csr_vector_kernel<<<num_rows,32>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y);
 92     //spmv_csr_scalar_kernel<<<1,32>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y);
 93     timer.Stop();
 94     printf("Duration: %g ms\n",timer.Elapsed());
 95 
 96     cudaMemcpy((void*)h_y, (void*)d_y, sizeof(double)*32, cudaMemcpyDeviceToHost);
 97 
 98     for(int i=0;i<num_rows;i++)
 99         printf("%.5f ",h_y[i]);
100     printf("\n");
101 
102     return 0;
103 }
View Code
ref:
http://www.nvidia.com/docs/IO/66889/nvr-2008-004.pdf　　
ch4.3
posted on 2017-01-24 11:27 Pentium.Labs 阅读(322) 评论(0) 收藏举报
刷新页面返回顶部
Pentium.Labs

导航

公告

CUDA[4] sample program: matrix-vector multiplication