摘要: const int N = 33 * 1024;const int threadsPerBlock = 256;const int blocksPerGrid = imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );__global__ void dot( float *a, float *b, float *c ) { __shared__ float cache[threadsPerBlock]; int tid = threadIdx.x + blockIdx.x * blockDim.x; in... 阅读全文
posted @ 2013-09-07 14:55 celerychen 阅读(455) 评论(0) 推荐(0) 编辑