摘要:
const int N = 33 * 1024;const int threadsPerBlock = 256;const int blocksPerGrid = imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );__global__ void dot( float *a, float *b, float *c ) { __shared__ float cache[threadsPerBlock]; int tid = threadIdx.x + blockIdx.x * blockDim.x; in... 阅读全文