Computing matrix multiplication with shared memory (it doesn't actually speed things up much)

The tiled kernel below has each block stage a TILE_WIDTH x TILE_WIDTH tile of A and of B in shared memory, synchronize, and then let every thread accumulate its partial dot product from the tiles, so each global-memory element is fetched once per block instead of once per thread. On this tiny workload (a 100 x 100 result) the tiled version nonetheless measured slower than the naive one (1069 ms vs. 486 ms for 10000 launches); with matrices this small, launch overhead and the extra synchronization likely swamp the saved global-memory traffic.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <stdio.h>
#include <windows.h>
#include <m_tools.h>   // provides the TIME_INIT / TIME_MARK / TIME_PRINT timing macros

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

#define TILE_WIDTH 16

// Tiled matrix multiplication: C (m x k) = A (m x n) * B (n x k).
__global__ void MatrixMulKernle(int m, int n, int k, int *A, int *B, int *C)
{
    // Shared-memory tiles; one pair lives in each block.
    __shared__ int ds_A[TILE_WIDTH][TILE_WIDTH];
    __shared__ int ds_B[TILE_WIDTH][TILE_WIDTH];

    // Shorthand for the block/thread coordinates; these are the values
    // that differ between the parallel threads.
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Row and column of this thread's element in the result matrix.
    int iy = by * TILE_WIDTH + ty;
    int ix = bx * TILE_WIDTH + tx;

    int Cvalue = 0;

    // Loop over the tiles of A and B along the shared dimension n,
    // accumulating the dot product stage by stage.
    for (int t = 0; t < (n + TILE_WIDTH - 1) / TILE_WIDTH; ++t)
    {
        // Load one tile of A and one of B. Out-of-range elements are
        // zero-padded so that every thread (including those outside the
        // result matrix) reaches the barriers below.
        ds_A[tx][ty] = (iy < m && t * TILE_WIDTH + tx < n)
                           ? A[iy * n + t * TILE_WIDTH + tx] : 0;
        ds_B[tx][ty] = (ix < k && t * TILE_WIDTH + ty < n)
                           ? B[(t * TILE_WIDTH + ty) * k + ix] : 0;
        __syncthreads();

        for (int i = 0; i < TILE_WIDTH; ++i)
            Cvalue += ds_A[i][ty] * ds_B[tx][i];   // read from shared memory
        __syncthreads();   // don't overwrite the tiles while others still read them
    }

    if (iy < m && ix < k)
        C[iy * k + ix] = Cvalue;
}

// Plain version without shared memory: every operand is read straight
// from global memory.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ix >= 100 || iy >= 100) { return; }

    int sum = 0;
    for (int i = 0; i != 200; ++i)
    {
        // A is 100 x 200 row-major, so its row stride is 200.
        sum += a[iy * 200 + i] * b[i * 100 + ix];
    }
    c[iy * 100 + ix] = sum;
}

int main()
{
    const int arow = 100;
    const int acol = 200;
    const int brow = 200;
    const int bcol = 100;
    const int arraySize = arow * acol;

    int *a = new int[arraySize];
    int *b = new int[arraySize];
    int *c = new int[arraySize / 2];   // the result is 100 x 100

    // A[j][i] = i and B[j][i] = i, so C[r][col] = col * (0 + 1 + ... + 199)
    //                                           = col * 19900.
    for (int j = 0; j != arow; ++j)
        for (int i = 0; i != acol; ++i)
            a[j * acol + i] = i;

    for (int j = 0; j != brow; ++j)
        for (int i = 0; i != bcol; ++i)
            b[j * bcol + i] = i;

    addWithCuda(c, a, b, arraySize);
    cudaDeviceReset();

    printf("c0=%d c1=%d c[3,50]=%d\n", c[0], c[1], c[3 * 100 + 50]);

    delete[] a;
    delete[] b;
    delete[] c;
    system("pause");
    return 0;
}

// Helper that allocates device buffers, runs and times both kernels,
// and copies the 100 x 100 result back to the host.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);

    cudaStatus = cudaMalloc((void **)&dev_c, size * sizeof(int));
    cudaStatus = cudaMalloc((void **)&dev_a, size * sizeof(int));
    cudaStatus = cudaMalloc((void **)&dev_b, size * sizeof(int));
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);

    int thread_x = 100;
    int thread_y = 100;
    dim3 block(TILE_WIDTH, TILE_WIDTH);
    int grid_w = (thread_x + block.x - 1) / block.x;
    int grid_h = (thread_y + block.y - 1) / block.y;
    dim3 grid(grid_w, grid_h);

    // Launch each kernel 10000 times and time the two variants.
    TIME_INIT;
    TIME_MARK("t1");
    for (int i = 0; i != 10000; ++i)
        addKernel<<<grid, block>>>(dev_c, dev_a, dev_b);                        // 486 ms
    TIME_MARK("t2");
    for (int i = 0; i != 10000; ++i)
        MatrixMulKernle<<<grid, block>>>(100, 200, 100, dev_a, dev_b, dev_c);   // 1069 ms
    TIME_MARK("t3");
    TIME_PRINT;

    cudaStatus = cudaGetLastError();
    cudaStatus = cudaDeviceSynchronize();
    cudaStatus = cudaMemcpy(c, dev_c, size / 2 * sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}
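One caveat about the timing: kernel launches are asynchronous, so host-side marks like TIME_MARK mostly measure how fast launches are enqueued, not how long the kernels actually run, unless something forces a synchronization between the marks. A minimal sketch of device-side timing with CUDA events, assuming the same grid, block, and dev_a/dev_b/dev_c buffers as in addWithCuda:

// Time each batch of launches on the device timeline with CUDA events.
cudaEvent_t start, mid, stop;
cudaEventCreate(&start);
cudaEventCreate(&mid);
cudaEventCreate(&stop);

cudaEventRecord(start);
for (int i = 0; i != 10000; ++i)
    addKernel<<<grid, block>>>(dev_c, dev_a, dev_b);
cudaEventRecord(mid);
for (int i = 0; i != 10000; ++i)
    MatrixMulKernle<<<grid, block>>>(100, 200, 100, dev_a, dev_b, dev_c);
cudaEventRecord(stop);
cudaEventSynchronize(stop);   // wait until all queued work has finished

float msPlain = 0.f, msTiled = 0.f;
cudaEventElapsedTime(&msPlain, start, mid);    // elapsed ms between two events
cudaEventElapsedTime(&msTiled, mid, stop);
printf("plain: %.1f ms, tiled: %.1f ms\n", msPlain, msTiled);

cudaEventDestroy(start);
cudaEventDestroy(mid);
cudaEventDestroy(stop);

Because the events are recorded into the same stream as the kernels, the two intervals reflect the device execution time of each batch rather than launch overhead on the host.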
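Since both kernels write into the same dev_c buffer, it is also worth checking the copied-back result against a CPU reference. A small sketch, using a hypothetical helper verify() that assumes the 100 x 200 by 200 x 100 shapes from main():

// Hypothetical host-side reference check for the 100 x 100 result.
bool verify(const int *a, const int *b, const int *c)
{
    for (int r = 0; r < 100; ++r)
        for (int col = 0; col < 100; ++col)
        {
            int ref = 0;
            for (int i = 0; i < 200; ++i)
                ref += a[r * 200 + i] * b[i * 100 + col];
            if (c[r * 100 + col] != ref)
            {
                printf("mismatch at (%d,%d): got %d, want %d\n",
                       r, col, c[r * 100 + col], ref);
                return false;
            }
        }
    return true;
}

// usage after addWithCuda(c, a, b, arraySize):
//     printf("result %s\n", verify(a, b, c) ? "OK" : "WRONG");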