http://www.cnblogs.com/dubing/archive/2011/10/10/2085742.html
学习了"熬夜的虫子"这位同学的各种线程,各种block情况下的,速度比较。
在他的博文指导下,做了比较重要的几点修正,如下:
1.在运行一个kernel(即.cu文件中带有__global__标志的函数)后,加上了
cudaDeviceSynchronize();进行同步,否则6种方式计算得到的数据会不一致。
2.对6种计算方法的非kernel部分代码,全部进行了重写。各自代码,不共用。
3.对多block 、多线程、带线程同步的方法6,__share__ 共享内存的代码作了一点点修
改,内部定义共享存储大小。而非调用时定义。关于这一点,可能并不是特别重要。
4.对6种计算方式,采用公平的速度比较方法,即从cpu准备好需要计算的数据为
起始时刻,到cpu获得最终的计算结果为终止时刻,统计消耗时间。对6种计算方法,
不同的运算方法会有或多或少的不同的地方。需要各自优化到各自最好的状态,避免不公平的因素。
多次的运行结果如下。源码见链接。需要的可以自行测试。
运行环境,win7 64bit,vs2010,cuda5.0,nvidia geforce 310M。
总的看来,在本例程中,方法4是最好的。方法6其次。
*****Start Testing******
Single block with single thread Gpu calculating~
mode 1 calculate sum consumes:4508.920898 ms
the sum is 3210632
Single block with single thread Cpu calculating~
mode 2 calculate sum consumes:718.790649 ms
the sum is 3210632
Single block with multi threads Gpu calculating~
mode 3 calculate sum consumes:0.216789 ms
the sum is 3210632
Single block with multi threads Gpu Optimize calculating~
mode 4 calculate sum consumes:0.149929 ms
the sum is 3210632
Multi blocks with multi threads Gpu calculating~
mode 5 calculate sum consumes:0.281623 ms
the sum is 3210632
Multi blocks with multi threads Gpu Thread Sync calculating~
mode 6 calculate sum consumes:0.158843 ms
the sum is 3210632
*****Finish Testing ******
*****Start Testing******
Single block with single thread Gpu calculating~
mode 1 calculate sum consumes:4855.262207 ms
the sum is 1964008
Single block with single thread Cpu calculating~
mode 2 calculate sum consumes:734.781921 ms
the sum is 1964008
Single block with multi threads Gpu calculating~
mode 3 calculate sum consumes:0.215168 ms
the sum is 1964008
Single block with multi threads Gpu Optimize calculating~
mode 4 calculate sum consumes:0.132099 ms
the sum is 1964008
Multi blocks with multi threads Gpu calculating~
mode 5 calculate sum consumes:0.270682 ms
the sum is 1964008
Multi blocks with multi threads Gpu Thread Sync calculating~
mode 6 calculate sum consumes:0.266630 ms
the sum is 1964008
*****Finish Testing ******
*****Start Testing******
Single block with single thread Gpu calculating~
mode 1 calculate sum consumes:4430.995605 ms
the sum is 3275448
Single block with single thread Cpu calculating~
mode 2 calculate sum consumes:572.619568 ms
the sum is 3275448
Single block with multi threads Gpu calculating~
mode 3 calculate sum consumes:0.128452 ms
the sum is 3275448
Single block with multi threads Gpu Optimize calculating~
mode 4 calculate sum consumes:0.072128 ms
the sum is 3275448
Multi blocks with multi threads Gpu calculating~
mode 5 calculate sum consumes:0.158843 ms
the sum is 3275448
Multi blocks with multi threads Gpu Thread Sync calculating~
mode 6 calculate sum consumes:0.062403 ms
the sum is 3275448
*****Finish Testing ******
总结一点,代码还是得自己敲靠谱,心里有底,了如指掌为妙,否则会被坑的不轻!!!
引以为戒,与君共勉!
贴上,main.cpp的代码:
#ifndef WIN32
#define WIN32
#endif
// CUDA Runtime
#include <cuda_runtime.h>
// Utility and system includes
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared functions common to CUDA SDK samples
#include <stdio.h>
#include <stdlib.h>
#include "common.h"
// Benchmark driver: runs the six summation strategies (modes 1..6) over the
// same random data set, printing the elapsed time and the resulting sum for
// each mode to stdout and appending them to record.txt.
int main(){
    int *cpudata;
    unsigned char *cpudata_temp;
    int *cpuresult;
    float time = 0;
    unsigned int mode = 3;

    InitCUDA();

    // Host input buffer.  The buffer is allocated as bytes and aliased as
    // int (DATA_SIZE ints == DATA_SIZE * sizeof(int) bytes) because a direct
    // int allocation reportedly failed to transfer correctly in the original
    // setup.  calloc (instead of malloc) zeroes the buffer, so the three
    // bytes of each int that the fill loop below never touches are 0 rather
    // than uninitialized garbage — otherwise the sums are nondeterministic.
    // NOTE(review): the aliasing assumes sizeof(int) == 4 and little-endian
    // byte order (true on the stated win7/x86 target) — confirm if ported.
    cpudata_temp = (unsigned char *)calloc(DATA_SIZE, sizeof(int));
    if (cpudata_temp == NULL) {
        fprintf(stderr, "failed to allocate input buffer\n");
        return 1;
    }
    cpudata = (int *)cpudata_temp;
    cpuresult = (int *)malloc(sizeof(int));
    if (cpuresult == NULL) {
        fprintf(stderr, "failed to allocate result\n");
        free(cpudata_temp);
        return 1;
    }

    srand(2009);
    // Write a small random value (0..5) into the low byte of every int.
    // The buffer holds DATA_SIZE*4 bytes, so the loop must cover all of
    // them (the original stopped at DATA_SIZE and filled only a quarter).
    for (unsigned int i = 0; i < DATA_SIZE * 4; i += 4)
    {
        cpudata_temp[i] = rand() % 6;
    }

    void (*calc_sum)(const int *num, int *result, float *time);
    FILE *fp = fopen("record.txt", "a+");
    if (fp == NULL) {
        fprintf(stderr, "cannot open record.txt for appending\n");
        free(cpuresult);
        free(cpudata);
        return 1;
    }
    mode = 1;
    fprintf(fp, "*****Start Testing******\n");
    while (mode <= 6) {
        *cpuresult = 0;
        time = 0;
        switch (mode)
        {
        case 1: calc_sum = sum_basic_mode1;
            fprintf(fp, "Single block with single thread Gpu calculating~\n");
            break;
        case 2: calc_sum = sum_cpu_mode2;
            fprintf(fp, "Single block with single thread Cpu calculating~\n");
            break;
        case 3: calc_sum = sum_Thread_mode3;
            fprintf(fp, "Single block with multi threads Gpu calculating~\n");
            break;
        case 4: calc_sum = sum_ThreadOptimization_mode4;
            fprintf(fp, "Single block with multi threads Gpu Optimize calculating~\n");
            break;
        case 5: calc_sum = sum_Block_mode5;
            fprintf(fp, "Multi blocks with multi threads Gpu calculating~\n");
            break;
        case 6: calc_sum = sum_Block_sync_mode6;
            fprintf(fp, "Multi blocks with multi threads Gpu Thread Sync calculating~\n");
            break;
        default: calc_sum = NULL;
        }
        // Guard against the (unreachable with modes 1..6) NULL default.
        if (calc_sum != NULL) {
            calc_sum(cpudata, cpuresult, &time);
            printf("mode %d calculate sum consumes:%f ms\n", mode, time);
            // Print the value, not the address of the pointer variable
            // (the original passed &cpuresult to %d).
            printf("the sum is %d\n", *cpuresult);
            fprintf(fp, "mode %d calculate sum consumes:%f ms\n", mode, time);
            fprintf(fp, "the sum is %d\n", *cpuresult);
        }
        mode++;
    }
    fprintf(fp, "*****Finish Testing ******\n");
    fclose(fp);
    free(cpuresult);
    free(cpudata);
    system("pause");
    return 0;
}
还有kernel中方法4的代码:
// Mode-4 kernel: launched as <<<1, THREAD_NUM>>>.  Each thread accumulates a
// private partial sum over an interleaved slice of num[0..DATA_SIZE) and
// stores it in result[tid]; the host sums the THREAD_NUM partials afterwards.
__global__ static void sum_ThreadOptimization(int *num, int* result)
{
    const int tid = threadIdx.x;
    int partial = 0;

    // Stride-THREAD_NUM interleaving keeps adjacent threads on adjacent
    // elements, so the warp's global loads coalesce.
    for (int idx = tid; idx < DATA_SIZE; idx += THREAD_NUM) {
        // Each element is added 12 times on purpose: the benchmark inflates
        // per-element arithmetic so every mode has measurable work.
        const int v = num[idx];
        partial += v + v + v + v + v + v + v + v + v + v + v + v;
    }
    result[tid] = partial;
}
// Mode 4 host wrapper: single block, THREAD_NUM threads, per-thread partial
// sums reduced on the CPU.
//
// num    - host input array of DATA_SIZE ints (read only).
// result - receives the accumulated sum (caller zeroes it; this adds to it).
// time   - receives the elapsed wall time in ms, measured from "host data
//          ready" to "host has the final sum" (allocations and copies are
//          deliberately inside the timed region for a fair comparison).
//
// Unlike the original, every CUDA call is error-checked and the kernel launch
// error is retrieved with cudaGetLastError(); on failure a message is printed
// and the function cleans up and returns without touching *result.
extern "C" void sum_ThreadOptimization_mode4(const int *num, int* result, float *time){
    StopWatchInterface *timer = NULL;
    int *gpudata = NULL;
    int *gpuresult = NULL;
    int *cpuPartialResult = NULL;
    cudaError_t err;

    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);
    sdkStartTimer(&timer);

    // Device buffer for the input data.
    err = cudaMalloc((void**) &gpudata, sizeof(int) * DATA_SIZE);
    if (err != cudaSuccess) {
        fprintf(stderr, "mode4 cudaMalloc(gpudata): %s\n", cudaGetErrorString(err));
        goto cleanup;
    }
    // Copy the input from host memory into device memory.
    err = cudaMemcpy(gpudata, num, sizeof(int) * DATA_SIZE,
            cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "mode4 cudaMemcpy H2D: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }
    // Device buffer for the output: one partial sum per thread.
    err = cudaMalloc((void**) &gpuresult, sizeof(int) * THREAD_NUM);
    if (err != cudaSuccess) {
        fprintf(stderr, "mode4 cudaMalloc(gpuresult): %s\n", cudaGetErrorString(err));
        goto cleanup;
    }

    // Launch the kernel: one block, THREAD_NUM threads, no dynamic shared mem.
    sum_ThreadOptimization<<<1, THREAD_NUM, 0>>>(gpudata, gpuresult);
    // Launch-configuration errors only surface via cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "mode4 kernel launch: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }
    // Wait for the kernel to finish; also surfaces asynchronous execution errors.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "mode4 kernel execution: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }

    // Host buffer for the per-thread partial sums.
    cpuPartialResult = (int *)malloc(sizeof(int) * THREAD_NUM);
    if (cpuPartialResult == NULL) {
        fprintf(stderr, "mode4 malloc(cpuPartialResult) failed\n");
        goto cleanup;
    }
    // Copy the partial sums back to the host.
    err = cudaMemcpy(cpuPartialResult, gpuresult, sizeof(int) * THREAD_NUM,
            cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "mode4 cudaMemcpy D2H: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }
    // Final reduction of the THREAD_NUM partials on the CPU.
    for (int i = 0; i < THREAD_NUM; i++) {
        *result += cpuPartialResult[i];
    }

cleanup:
    // Stop the timer and report the elapsed time even on the error path.
    sdkStopTimer(&timer);
    *time = (sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);
    // Release device and host buffers (cudaFree/free of NULL are no-ops).
    cudaFree(gpudata);
    cudaFree(gpuresult);
    free(cpuPartialResult);
}
方法6的kernel部分,做了一点点的修改,也贴出来:
// Mode-6 kernel: launched as <<<BLOCK_NUM, THREAD_NUM>>>.  Every thread
// accumulates an interleaved slice of num[] into its own shared-memory slot;
// after a block-wide barrier, thread 0 folds the block's partials and writes
// one value per block to result[bid] for the host to combine.
__global__ void sum_Block_sync(int* num, int* result)
{
    // Per-block scratchpad: one slot per thread (sized inside the kernel,
    // so no dynamic shared memory is needed at launch).
    __shared__ int partial[THREAD_NUM];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    // Shared memory is uninitialized — each thread clears its own slot.
    partial[tid] = 0;
    for (int idx = bid * THREAD_NUM + tid; idx < DATA_SIZE;
         idx += BLOCK_NUM * THREAD_NUM) {
        // Each element is added 12 times on purpose (benchmark workload).
        const int v = num[idx];
        partial[tid] += v + v + v + v + v + v + v + v + v + v + v + v;
    }

    // All partials must be written before thread 0 reads them; the barrier
    // is outside any divergent branch, as required.
    __syncthreads();

    // Thread 0 serially reduces the block's partials and publishes the result.
    if (tid == 0) {
        int blockSum = partial[0];
        for (int k = 1; k < THREAD_NUM; k++) {
            blockSum += partial[k];
        }
        result[bid] = blockSum;
    }
}
2014.03.23 bwb@HUST