Using both OpenMP and pthreads, together with Unified Memory addressing, compute the basic matrix-vector product result = α * A * x + β * result.
▶ Source code
#include <cstdio>
#include <cstdlib>     // srand / rand
#include <ctime>       // time
#include <vector>
#include <algorithm>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>

//#define USE_PTHREADS // define USE_PTHREADS to build the pthreads version
#ifdef USE_PTHREADS
#include <pthread.h>
#pragma comment(lib, "pthreadVC2.lib")
#else
#include <omp.h>
#endif

// On Windows we need replacements equivalent to srand48 and drand48
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
void srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif

template <typename T> struct Task // a struct may also have constructors and a destructor, just like a class
{
    unsigned int size, id;
    T *data;
    T *result;
    T *vector;

    Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL) {};
    Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL), vector(NULL)
    {
        cudaMallocManaged(&data, sizeof(T)*size*size);
        cudaMallocManaged(&result, sizeof(T)*size);
        cudaMallocManaged(&vector, sizeof(T)*size);
        cudaDeviceSynchronize();
    }

    ~Task()
    {
        cudaDeviceSynchronize();
        cudaFree(data);
        cudaFree(result);
        cudaFree(vector);
    }

    void allocate(const unsigned int s, const unsigned int unique_id) // allocate managed memory and initialize the member arrays
    {
        id = unique_id;
        size = s;
        cudaMallocManaged(&data, sizeof(T)*size*size);
        cudaMallocManaged(&result, sizeof(T)*size);
        cudaMallocManaged(&vector, sizeof(T)*size);
        cudaDeviceSynchronize();

        for (unsigned int i = 0; i < size*size; i++)
            data[i] = drand48();
        for (unsigned int i = 0; i < size; i++)
        {
            result[i] = 0.;
            vector[i] = drand48();
        }
    }
};

#ifdef USE_PTHREADS // per-thread work descriptor for the pthreads version
struct threadData_t
{
    int tid;
    Task<double> *TaskListPtr;
    cudaStream_t *streams;
    cublasHandle_t *handles;
    int taskSize;
};

typedef struct threadData_t threadData;
#endif

template <typename T> void gemv(int m, int n, T *alpha, T *A, T *x, T *beta, T *result) // compute result = α * A * x + β * result
{
    for (int i = 0; i < m; i++) // the original sample code wrote n here and omitted the alpha factor below
    {
        result[i] *= *beta;
        for (int j = 0; j < n; j++)
            result[i] += *alpha * A[i*n + j] * x[j];
    }
}

// execute a single task on either host or device depending on size
#ifdef USE_PTHREADS
void * execute(void* inpArgs)
{
    threadData *dataPtr = (threadData *) inpArgs;
    cudaStream_t *stream = dataPtr->streams;
    cublasHandle_t *handle = dataPtr->handles;
    int tid = dataPtr->tid;

    for (int i = 0; i < dataPtr->taskSize; i++)
    {
        Task<double> &t = dataPtr->TaskListPtr[i];
        double alpha = 1.0;
        double beta = 0.0;
        if (t.size < 100) // small tasks run on the host, larger ones on the device
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
            cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
            cudaStreamSynchronize(stream[0]);
            gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
        }
        else
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on device", t.id, tid, t.size);
            cublasSetStream(handle[tid+1], stream[tid+1]);
            cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
            cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data,
                        t.size, t.vector, 1, &beta, t.result, 1);
        }
    }
    return NULL;
}
#else
template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
    double alpha = 1.0;
    double beta = 0.0;
    if (t.size < 100) // small tasks run on the host, larger ones on the device
    {
        printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
        cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
        cudaStreamSynchronize(stream[0]);
        gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
    }
    else
    {
        printf("\nTask [%2d], thread [%2d], size[%4d], on device", t.id, tid, t.size);
        cublasSetStream(handle[tid+1], stream[tid+1]);
        cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
        cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
    }
}
#endif

template <typename T> void initialise_tasks(std::vector< Task<T> > &TaskList)
{
    for (unsigned int i = 0; i < TaskList.size(); i++)
    {
        int size;
        size = std::max((int)(drand48()*1000.0), 64);
        TaskList[i].allocate(size, i);
    }
}

int main()
{
    printf("\n\tStart.\n");

    cudaDeviceProp device_prop;
    cudaGetDeviceProperties(&device_prop, 0);
    if (!device_prop.managedMemory)
    {
        printf("\n\tUnified Memory not supported\n");
        getchar();
        return 1;
    }
    if (device_prop.computeMode == cudaComputeModeProhibited) // the device must not be in prohibited compute mode
    {
        printf("\n\tComputeMode is cudaComputeModeProhibited\n");
        getchar();
        return 1;
    }

    srand48(time(NULL));
    const int nthreads = 4;
    cudaStream_t *streams = new cudaStream_t[nthreads+1];
    cublasHandle_t *handles = new cublasHandle_t[nthreads+1];
    for (int i = 0; i < nthreads+1; i++)
    {
        cudaStreamCreate(&streams[i]);
        cublasCreate(&handles[i]);
    }

    unsigned int N = 40;
    std::vector<Task<double> > TaskList(N);
    initialise_tasks(TaskList);
    cudaSetDevice(0);

#ifdef USE_PTHREADS
    pthread_t threads[nthreads];
    threadData *InputToThreads = new threadData[nthreads];
    int temp = TaskList.size() / nthreads;
    for (int i = 0; i < nthreads; i++)
    {
        InputToThreads[i].tid = i;
        InputToThreads[i].streams = streams;
        InputToThreads[i].handles = handles;

        if (temp == 0) // fewer tasks than threads
        {
            InputToThreads[i].taskSize = 0;
            InputToThreads[i].TaskListPtr = &TaskList[0];
        }
        else // at least as many tasks as threads: split evenly, the last thread takes the remainder
        {
            if (i == nthreads - 1)
            {
                InputToThreads[i].taskSize = temp + (TaskList.size() % nthreads);
                // start at i*temp; the original sample also added the remainder here,
                // which skips tasks when the task count is not divisible by nthreads
                InputToThreads[i].TaskListPtr = &TaskList[i*temp];
            }
            else
            {
                InputToThreads[i].taskSize = temp;
                InputToThreads[i].TaskListPtr = &TaskList[i*temp];
            }
        }
        pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
    }
    for (int i = 0; i < nthreads; i++)
        pthread_join(threads[i], NULL);
#else
    omp_set_num_threads(nthreads);
    #pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < (int)TaskList.size(); i++)
    {
        int tid = omp_get_thread_num();
        execute(TaskList[i], handles, streams, tid);
    }
#endif
    cudaDeviceSynchronize();

    // cleanup
    for (int i = 0; i < nthreads+1; i++)
    {
        cudaStreamDestroy(streams[i]);
        cublasDestroy(handles[i]);
    }
    std::vector< Task<double> >().swap(TaskList);
    printf("\n\tFinish.\n");
    getchar();
    return 0;
}
▶ Output: OpenMP
Start.
Task [ 0], thread [ 0], size[ 721], on device
Task [ 1], thread [ 2], size[ 925], on device
Task [ 2], thread [ 3], size[ 250], on device
Task [ 3], thread [ 1], size[ 832], on device
Task [ 4], thread [ 2], size[ 798], on device
Task [ 5], thread [ 3], size[ 379], on device
Task [ 6], thread [ 2], size[ 110], on device
Task [ 7], thread [ 1], size[ 147], on device
Task [ 8], thread [ 0], size[ 941], on device
Task [ 9], thread [ 3], size[ 494], on device
Task [12], thread [ 1], size[ 619], on device
Task [10], thread [ 0], size[ 144], on device
Task [11], thread [ 2], size[ 477], on device
Task [13], thread [ 3], size[ 282], on device
Task [14], thread [ 0], size[ 593], on device
Task [15], thread [ 1], size[ 960], on device
Task [16], thread [ 0], size[ 671], on device
Task [17], thread [ 2], size[ 585], on device
Task [18], thread [ 3], size[ 883], on device
Task [19], thread [ 0], size[ 102], on device
Task [20], thread [ 2], size[ 912], on device
Task [21], thread [ 1], size[ 527], on device
Task [22], thread [ 2], size[ 613], on device
Task [23], thread [ 3], size[ 553], on device
Task [24], thread [ 0], size[ 572], on device
Task [25], thread [ 1], size[ 792], on device
Task [26], thread [ 2], size[ 406], on device
Task [27], thread [ 3], size[ 782], on device
Task [28], thread [ 0], size[ 351], on device
Task [29], thread [ 1], size[ 595], on device
Task [30], thread [ 2], size[ 301], on device
Task [31], thread [ 3], size[ 537], on device
Task [32], thread [ 0], size[ 303], on device
Task [35], thread [ 3], size[ 177], on device
Task [34], thread [ 2], size[ 869], on device
Task [33], thread [ 1], size[ 415], on device
Task [36], thread [ 0], size[ 987], on device
Task [37], thread [ 3], size[ 772], on device
Task [38], thread [ 2], size[ 129], on device
Task [39], thread [ 1], size[ 506], on device
Finish.
▶ Output: pthreads
Start.
Task [ 0], thread [ 0], size [ 755], on device
Task [20], thread [ 2], size [ 754], on device
Task [10], thread [ 1], size [ 746], on device
Task [30], thread [ 3], size [ 958], on device
Task [31], thread [ 3], size [ 281], on device
Task [11], thread [ 1], size [ 946], on device
Task [ 1], thread [ 0], size [ 77], on host
Task [32], thread [ 3], size [ 144], on device
Task [21], thread [ 2], size [ 432], on device
Task [12], thread [ 1], size [ 105], on device
Task [22], thread [ 2], size [ 837], on device
Task [33], thread [ 3], size [ 402], on device
Task [ 2], thread [ 0], size [ 239], on device
Task [13], thread [ 1], size [ 149], on device
Task [23], thread [ 2], size [ 764], on device
Task [ 3], thread [ 0], size [ 775], on device
Task [34], thread [ 3], size [ 913], on device
Task [24], thread [ 2], size [ 182], on device
Task [35], thread [ 3], size [ 865], on device
Task [ 4], thread [ 0], size [ 729], on device
Task [14], thread [ 1], size [ 110], on device
Task [ 5], thread [ 0], size [ 604], on device
Task [36], thread [ 3], size [ 107], on device
Task [25], thread [ 2], size [ 403], on device
Task [15], thread [ 1], size [ 842], on device
Task [37], thread [ 3], size [ 858], on device
Task [26], thread [ 2], size [ 64], on host
Task [ 6], thread [ 0], size [ 64], on host
Task [16], thread [ 1], size [ 103], on device
Task [ 7], thread [ 0], size [ 852], on device
Task [27], thread [ 2], size [ 826], on device
Task [ 8], thread [ 0], size [ 673], on device
Task [17], thread [ 1], size [ 243], on device
Task [38], thread [ 3], size [ 178], on device
Task [39], thread [ 3], size [ 788], on device
Task [ 9], thread [ 0], size [ 67], on host
Task [18], thread [ 1], size [ 340], on device
Task [28], thread [ 2], size [ 249], on device
Task [19], thread [ 1], size [ 934], on device
Task [29], thread [ 2], size [ 793], on device
Finish.
▶ Takeaways:
● A C++ struct is used like a class: it defines a constructor, a destructor, and other member functions.
● The cuBLAS library is used; note how the handles are created, bound to streams, and passed to the library calls (a standalone sketch is given at the end of this section).
● The memory-attach API used on the managed allocations (a usage sketch follows the declarations below):
// driver_types.h
#define cudaMemAttachGlobal 0x01 // memory is accessible from any stream on any device
#define cudaMemAttachHost   0x02 // memory is accessible only from the host, not from any device stream
#define cudaMemAttachSingle 0x04 // memory is accessible only from the single stream it is attached to

// cuda_runtime.h
template<class T> static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(cudaStream_t stream, T *devPtr, size_t length = 0, unsigned int flags = cudaMemAttachSingle)
{
    return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
}

// cuda_runtime_api.h
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags __dv(cudaMemAttachSingle));
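Referring back to the cuBLAS bullet above, here is a minimal, self-contained sketch (my own addition, not part of the sample) of the handle lifecycle around a single cublasDgemv call on managed memory. The size n and the all-ones data are arbitrary illustration values; build with something like `nvcc example.cu -lcublas`.

// Sketch: one cublasDgemv on managed memory through a cuBLAS handle.
#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main()
{
    const int n = 4;
    double *A, *x, *y;
    cudaMallocManaged(&A, sizeof(double) * n * n);   // n x n matrix
    cudaMallocManaged(&x, sizeof(double) * n);
    cudaMallocManaged(&y, sizeof(double) * n);
    for (int i = 0; i < n * n; i++) A[i] = 1.0;
    for (int i = 0; i < n; i++) { x[i] = 1.0; y[i] = 0.0; }

    cublasHandle_t handle;
    cublasCreate(&handle);                           // every cuBLAS call goes through a handle
    double alpha = 1.0, beta = 0.0;
    // y = alpha * A * x + beta * y, leading dimension n, unit strides
    cublasDgemv(handle, CUBLAS_OP_N, n, n, &alpha, A, n, x, 1, &beta, y, 1);
    cudaDeviceSynchronize();                         // finish before reading managed memory on the host

    for (int i = 0; i < n; i++) printf("%.1f ", y[i]); // each element should be 4.0
    printf("\n");

    cublasDestroy(handle);
    cudaFree(A); cudaFree(x); cudaFree(y);
    return 0;
}

Note that cuBLAS stores matrices in column-major order; with the all-ones matrix used here the distinction does not matter, but a real application must lay A out accordingly.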
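To make the attach flags concrete, here is a small usage sketch (again my own addition; the names buf and stream are invented): the buffer is attached to a single stream for device work, then re-attached to the host before CPU access, which is what the sample above does per task. On devices without concurrent managed access, the host re-attach is what allows the CPU to touch the buffer while unrelated kernels are still running in other streams.

// Sketch: restrict a managed buffer to one stream, then hand it back to the host.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    double *buf;
    cudaMallocManaged(&buf, 100 * sizeof(double), cudaMemAttachGlobal);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Attach the buffer to this stream only; the driver then only has to
    // keep this one stream consistent with the buffer.
    cudaStreamAttachMemAsync(stream, buf, 0, cudaMemAttachSingle);
    cudaStreamSynchronize(stream);   // attachment is asynchronous; sync before relying on it

    // ... enqueue kernels or cuBLAS calls that use buf in `stream` here ...

    // Re-attach to the host before touching the data on the CPU.
    cudaStreamAttachMemAsync(stream, buf, 0, cudaMemAttachHost);
    cudaStreamSynchronize(stream);
    buf[0] = 1.0;                    // now safe to access from the host
    printf("buf[0] = %f\n", buf[0]);

    cudaStreamDestroy(stream);
    cudaFree(buf);
    return 0;
}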