SM (Streaming Multiprocessor)

 

https://zhuanlan.zhihu.com/p/105775783

 

https://www.likecs.com/show-204740120.html

 

__shared__

 

存储体系

 

 

#define CLOCK_RATE 1683000  /* modify for different device; presumably the SM clock in kHz (same unit as cudaDeviceProp::clockRate) -- confirm */

/*
 * Busy-waits the calling device thread until roughly `t` seconds' worth of
 * clock ticks have elapsed, as measured by clock64().
 *
 * The tick delta is divided by (CLOCK_RATE * 1000), so `t` is in seconds
 * provided CLOCK_RATE is the clock frequency in kHz of the device the code
 * actually runs on. The constant is hard-coded, so the delay is only as
 * accurate as that value.
 *
 * t: requested delay; the loop exits as soon as the measured time reaches it.
 */
__device__ void sleeps(float t) {
    // clock64() returns long long. Keeping the samples in long long (rather
    // than clock_t, which is only 32 bits wide on some platforms) avoids
    // truncating the counter and breaking the elapsed-time comparison.
    const long long start = clock64();
    long long now = start;
    while ((now - start) / (CLOCK_RATE * 1000.0f) < t)
        now = clock64();
}

 

/*
 * Estimates the total number of CUDA cores (SP cores) of a device by
 * multiplying its multiprocessor count by the per-SM core count associated
 * with its compute capability (major.minor).
 *
 * devProp: device properties; only major, minor and multiProcessorCount are read.
 *
 * Returns the estimated core count, or 0 after printing
 * "Unknown device type" when the compute capability is not in the table.
 */
int getSPcores(cudaDeviceProp devProp)
{
    int cores = 0;
    const int mp = devProp.multiProcessorCount;
    const int minor = devProp.minor;

    switch (devProp.major) {
    case 2: // Fermi
        cores = mp * (minor == 1 ? 48 : 32);
        break;
    case 3: // Kepler
        cores = mp * 192;
        break;
    case 5: // Maxwell
        cores = mp * 128;
        break;
    case 6: // Pascal
        if (minor == 1 || minor == 2) cores = mp * 128;
        else if (minor == 0) cores = mp * 64;
        else printf("Unknown device type\n");
        break;
    case 7: // Volta (7.0), Xavier (7.2) and Turing (7.5)
        if (minor == 0 || minor == 2 || minor == 5) cores = mp * 64;
        else printf("Unknown device type\n");
        break;
    case 8: // Ampere (8.0, 8.6, 8.7) and Ada Lovelace (8.9)
        if (minor == 0) cores = mp * 64;
        else if (minor == 6 || minor == 7 || minor == 9) cores = mp * 128;
        else printf("Unknown device type\n");
        break;
    case 9: // Hopper
        if (minor == 0) cores = mp * 128;
        else printf("Unknown device type\n");
        break;
    default:
        printf("Unknown device type\n");
        break;
    }
    return cores;
}

 

// Query the properties of device 0 and dump the interesting fields to stdout.
// `r` and `devProp` are declared outside this excerpt (presumably a
// cudaError_t and a cudaDeviceProp -- confirm against the enclosing function).
r = cudaGetDeviceProperties(&devProp, 0);
if (r) return r;  // any non-zero status is propagated to the caller unchanged
{
    printf("%s\n", devProp.name);
    printf("Major revision number: %d\n", devProp.major);
    printf("Minor revision number: %d\n", devProp.minor);
    printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount);
    printf("maxThreadsPerMultiProcessor: %d\n", devProp.maxThreadsPerMultiProcessor);
    printf("l2CacheSize: %d\n", devProp.l2CacheSize);
    // Label fixed: this line prints the memory bus width, not the L2 cache size.
    printf("memoryBusWidth: %d\n", devProp.memoryBusWidth);
    // The fields below are size_t in cudaDeviceProp, so they are printed with
    // %zu; %u / %llu mismatch the argument type.
    printf("Total global memory: %zu\n", devProp.totalGlobalMem);
    printf("Total amount of shared memory per block: %zu\n", devProp.sharedMemPerBlock);
    printf("Total registers per block: %d\n", devProp.regsPerBlock);
    printf("Warp size: %d\n", devProp.warpSize);
    printf("Maximum memory pitch: %zu\n", devProp.memPitch);
    printf("Total amount of constant memory: %zu\n", devProp.totalConstMem);
    printf("core: %d\n", getSPcores(devProp));
}



posted @ 2022-07-29 17:24  zJanly  阅读(455)  评论(0编辑  收藏  举报