cuda并行计算的几种模式
1 #include "cuda_runtime.h" 2 #include "device_launch_parameters.h" 3 #include <stdio.h> 4 #include <time.h> 5 #include <stdlib.h> 6 7 #define MAX 120 8 #define MIN 0 9 10 cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size, 11 float* etime); 12 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size, 13 float* etime, int type); 14 __global__ void addKernel(int *c, const int *a, const int *b) { 15 int i = blockIdx.x; 16 c[i] = a[i] + b[i]; 17 } 18 19 __global__ void addKernelThread(int *c, const int *a, const int *b) { 20 int i = threadIdx.x; 21 c[i] = a[i] + b[i]; 22 } 23 int main() { 24 const int arraySize = 800; 25 srand((unsigned) time(NULL)); 26 int a[arraySize] = { 1, 2, 3, 4, 5 }; 27 int b[arraySize] = { 10, 20, 30, 40, 50 }; 28 29 for (int i = 0; i < arraySize; i++) { 30 a[i] = rand() % (MAX + 1 - MIN) + MIN; 31 b[i] = rand() % (MAX + 1 - MIN) + MIN; 32 } 33 int c[arraySize] = { 0 }; 34 // Add vectors in parallel. 35 cudaError_t cudaStatus; 36 int num = 0; 37 cudaDeviceProp prop; 38 cudaStatus = cudaGetDeviceCount(&num); 39 for (int i = 0; i < num; i++) { 40 cudaGetDeviceProperties(&prop, i); 41 } 42 43 float time; 44 cudaStatus = addWithCudaStream(c, a, b, arraySize, &time); 45 printf("Elasped time of stream is : %f \n", time); 46 printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n", 47 a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2], 48 a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0], 49 b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3], 50 b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1], 51 c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]); 52 if (cudaStatus != cudaSuccess) { 53 fprintf(stderr, "addWithCudaStream failed!"); 54 return 1; 55 } 56 cudaStatus = addWithCuda(c, a, b, arraySize, &time, 0); 57 printf("Elasped time of Block is : %f \n", time); 58 if (cudaStatus != cudaSuccess) { 59 fprintf(stderr, "addWithCudaStream failed!"); 60 return 1; 61 } 62 printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n", 63 a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2], 64 a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0], 65 b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3], 66 b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1], 67 c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]); 68 69 cudaStatus = addWithCuda(c, a, b, arraySize, &time, 1); 70 printf("Elasped time of thread is : %f \n", time); 71 if (cudaStatus != cudaSuccess) { 72 fprintf(stderr, "addWithCudaStream failed!"); 73 return 1; 74 } 75 printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n", 76 a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2], 77 a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0], 78 b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3], 79 b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1], 80 c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]); 81 82 cudaStatus = addWithCudaStream(c, a, b, arraySize, &time); 83 printf("Elasped time of stream is : %f \n", time); 84 printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n", 85 a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2], 86 a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0], 87 b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3], 88 b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1], 89 c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]); 90 if (cudaStatus != cudaSuccess) { 91 fprintf(stderr, "addWithCudaStream failed!"); 92 return 1; 93 } 94 // cudaThreadExit must be called before exiting in order for profiling and 95 // tracing tools such as Nsight and Visual Profiler to show complete traces. 96 cudaStatus = cudaThreadExit(); 97 if (cudaStatus != cudaSuccess) { 98 fprintf(stderr, "cudaThreadExit failed!"); 99 return 1; 100 } 101 return 0; 102 } 103 // Helper function for using CUDA to add vectors in parallel. 104 cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size, 105 float* etime) { 106 int *dev_a = 0; 107 int *dev_b = 0; 108 int *dev_c = 0; 109 clock_t start, stop; 110 float time; 111 cudaError_t cudaStatus; 112 113 // Choose which GPU to run on, change this on a multi-GPU system. 114 cudaStatus = cudaSetDevice(0); 115 if (cudaStatus != cudaSuccess) { 116 fprintf(stderr, 117 "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); 118 goto Error; 119 } 120 // Allocate GPU buffers for three vectors (two input, one output) . 121 cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int)); 122 if (cudaStatus != cudaSuccess) { 123 fprintf(stderr, "cudaMalloc failed!"); 124 goto Error; 125 } 126 cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int)); 127 if (cudaStatus != cudaSuccess) { 128 fprintf(stderr, "cudaMalloc failed!"); 129 goto Error; 130 } 131 cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int)); 132 if (cudaStatus != cudaSuccess) { 133 fprintf(stderr, "cudaMalloc failed!"); 134 goto Error; 135 } 136 // Copy input vectors from host memory to GPU buffers. 137 cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), 138 cudaMemcpyHostToDevice); 139 if (cudaStatus != cudaSuccess) { 140 fprintf(stderr, "cudaMemcpy failed!"); 141 goto Error; 142 } 143 cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), 144 cudaMemcpyHostToDevice); 145 if (cudaStatus != cudaSuccess) { 146 fprintf(stderr, "cudaMemcpy failed!"); 147 goto Error; 148 } 149 cudaStream_t stream[5]; 150 for (int i = 0; i < 5; i++) { 151 cudaStreamCreate(&stream[i]); //创建流 152 } 153 // Launch a kernel on the GPU with one thread for each element. 154 for (int i = 0; i < 5; i++) { 155 addKernel<<<1, 1, 0, stream[i]>>>(dev_c + i, dev_a + i, dev_b + i); //执行流 156 } 157 start = clock(); 158 cudaDeviceSynchronize(); 159 stop = clock(); 160 time = (float) (stop - start) / CLOCKS_PER_SEC; 161 *etime = time; 162 // cudaThreadSynchronize waits for the kernel to finish, and returns 163 // any errors encountered during the launch. 164 cudaStatus = cudaThreadSynchronize(); 165 if (cudaStatus != cudaSuccess) { 166 fprintf(stderr, 167 "cudaThreadSynchronize returned error code %d after launching addKernel!\n", 168 cudaStatus); 169 goto Error; 170 } 171 // Copy output vector from GPU buffer to host memory. 172 cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), 173 cudaMemcpyDeviceToHost); 174 if (cudaStatus != cudaSuccess) { 175 fprintf(stderr, "cudaMemcpy failed!"); 176 goto Error; 177 } 178 Error: for (int i = 0; i < 5; i++) { 179 cudaStreamDestroy(stream[i]); //销毁流 180 } 181 cudaFree(dev_c); 182 cudaFree(dev_a); 183 cudaFree(dev_b); 184 return cudaStatus; 185 } 186 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size, 187 float * etime, int type) { 188 int *dev_a = 0; 189 int *dev_b = 0; 190 int *dev_c = 0; 191 clock_t start, stop; 192 float time; 193 cudaError_t cudaStatus; 194 195 // Choose which GPU to run on, change this on a multi-GPU system. 196 cudaStatus = cudaSetDevice(0); 197 if (cudaStatus != cudaSuccess) { 198 fprintf(stderr, 199 "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); 200 goto Error; 201 } 202 // Allocate GPU buffers for three vectors (two input, one output) . 203 cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int)); 204 if (cudaStatus != cudaSuccess) { 205 fprintf(stderr, "cudaMalloc failed!"); 206 goto Error; 207 } 208 cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int)); 209 if (cudaStatus != cudaSuccess) { 210 fprintf(stderr, "cudaMalloc failed!"); 211 goto Error; 212 } 213 cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int)); 214 if (cudaStatus != cudaSuccess) { 215 fprintf(stderr, "cudaMalloc failed!"); 216 goto Error; 217 } 218 // Copy input vectors from host memory to GPU buffers. 219 cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), 220 cudaMemcpyHostToDevice); 221 if (cudaStatus != cudaSuccess) { 222 fprintf(stderr, "cudaMemcpy failed!"); 223 goto Error; 224 } 225 cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), 226 cudaMemcpyHostToDevice); 227 if (cudaStatus != cudaSuccess) { 228 fprintf(stderr, "cudaMemcpy failed!"); 229 goto Error; 230 } 231 232 if (type == 0) { 233 start = clock(); 234 addKernel<<<size, 1>>>(dev_c, dev_a, dev_b); 235 } else { 236 start = clock(); 237 addKernelThread<<<1, size>>>(dev_c, dev_a, dev_b); 238 } 239 stop = clock(); 240 time = (float) (stop - start) / CLOCKS_PER_SEC; 241 *etime = time; 242 // cudaThreadSynchronize waits for the kernel to finish, and returns 243 // any errors encountered during the launch. 244 cudaStatus = cudaThreadSynchronize(); 245 if (cudaStatus != cudaSuccess) { 246 fprintf(stderr, 247 "cudaThreadSynchronize returned error code %d after launching addKernel!\n", 248 cudaStatus); 249 goto Error; 250 } 251 // Copy output vector from GPU buffer to host memory. 252 cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), 253 cudaMemcpyDeviceToHost); 254 if (cudaStatus != cudaSuccess) { 255 fprintf(stderr, "cudaMemcpy failed!"); 256 goto Error; 257 } 258 Error: cudaFree(dev_c); 259 cudaFree(dev_a); 260 cudaFree(dev_b); 261 return cudaStatus; 262 }
如上文的实现程序,使用了thread并行,block并行,stream并行三种,使用三种方法法进行了五次计算,发现stream第一次计算时会出错,调用的子程序没有变化,没有搞懂?
Elasped time of stream is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {158,123,92,107,127}
Elasped time of Block is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of stream is : 0.000008
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of thread is : 0.000004
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of stream is : 0.000007
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
OPTIMISM, PASSION & HARDWORK