CUDA学习3 Max pooling (python c++ cuda)
1.Python
在《CNN4 参数优化》一文的CNN模型中,性能瓶颈(限速步骤)是max pooling。
如下所示,在Python中用纯循环对一个形状为50×100×24×24的输入做一次2×2的max pooling,大约需要3秒。
import time

import numpy as np


def simple_pool(input, ds=(2, 2)):
    """Naive (pure-loop) max pooling with non-overlapping windows.

    Args:
        input: 4-D array indexed as [n, m, h, w]; pooling is applied to the
            last two axes of every [n, m] plane.
        ds: (d, s) window size along the height and width axes. The stride
            equals the window size, so windows never overlap.

    Returns:
        A float64 array of shape (n, m, ceil(h / d), ceil(w / s)). Windows
        that hang over the bottom/right border are clipped to the input.

    Raises:
        ValueError: if either window dimension is not positive.
    """
    n, m, h, w = input.shape
    d, s = ds
    if d <= 0 or s <= 0:
        raise ValueError("pooling window sizes must be positive, got %r" % (ds,))

    # Ceiling division. `//` keeps the result an int on both Python 2 and 3,
    # and unlike `h / d + h % d` it is correct for windows larger than 2.
    zh = (h + d - 1) // d
    zw = (w + s - 1) // s
    z = np.zeros((n, m, zh, zw))

    for k in range(n):
        for o in range(m):
            for i in range(zh):
                # Clip the window at the bottom border.
                rows = min(d, h - d * i)
                for j in range(zw):
                    # Clip the window at the right border (width uses s, not d).
                    cols = min(s, w - s * j)
                    # -inf is the identity of max, so inputs of any magnitude work.
                    maxd = float("-inf")
                    for u in range(rows):
                        for v in range(cols):
                            value = input[k, o, d * i + u, s * j + v]
                            if value > maxd:
                                maxd = value
                    z[k, o, i, j] = maxd
    return z


def main():
    """Time one 2x2 pooling pass over a 50x100x24x24 ramp and print a sample."""
    N, M, H, W = 50, 100, 24, 24
    a = np.reshape(np.arange(N * M * H * W), (N, M, H, W)) * 0.01
    start_time = time.time()
    out_data = simple_pool(a)
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("Cost: %s s" % (time.time() - start_time))
    print(out_data[0, 0, 0, :10])


# Only run the (slow) benchmark when executed as a script, not on import.
if __name__ == "__main__":
    main()

# Sample output (timing is machine-dependent):
# Cost: 3.08899998665 s
# [ 0.25  0.27  0.29  0.31  0.33  0.35  0.37  0.39  0.41  0.43]
2.C++
改用C++实现后,同样的计算仅需16~30ms。
#include<iostream> #include<windows.h> void MaxPool2d(const float* const bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height,float* top_data) { const int w = width; const int h = height; const int m = channels; const int n = num; const int d = pooled_height; const int zh = h / d + h % d; const int zw = w / d + w % d; int i,j,k,o,u,v,index,index2=0; float s; for (k = 0; k < n; ++k) for (o = 0; o < m; ++o) for (i = 0; i < zh; ++i) for (j = 0; j < zw; ++j) { index=k*m*h*w+o*h*w+d*i*w+d*j; s=-10000.0; for (u = 0; u < d&&(u+d*i)<h; ++u) for (v = 0; v < d&&(v+d*j)<w; ++v) if (*(bottom_data+index+u*w+v)>s) s=*(bottom_data+index+u*w+v); *(top_data+index2)=s; ++index2; } } int main() { const int N=50,M=100,H=24,W=24,P=(H+1)/2; float mul_min=0.01; float *input,*output; input=new float [N*M*H*W*sizeof(float)]; output=new float [N*M*P*P*sizeof(float)]; for(int i=0;i<N*M*H*W;i++) *(input+i)=i*mul_min; DWORD start_time=GetTickCount(); MaxPool2d(input,N,M,H,W,2,output); DWORD end_time=GetTickCount(); std::cout<<"Cost: "<<end_time-start_time<<"ms."<<std::endl; for(int i=0;i<10;i++) std::cout<<*(output+i)<<std::endl; delete []input; delete []output; } /* Cost: 16ms. 0.25 0.27 0.29 0.31 0.33 0.35 0.37 0.39 0.41 0.43 */
3.CUDA
CUDA版本在N=50时耗时16ms;N=500时耗时141ms(C++版本为218ms),仅略有提升。推测原因是:GPU上的计算快了一些,但主机与设备之间的数据交换慢了一些(计时区间包含了把结果从显存拷回主机的时间)。
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

// Abort with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            std::abort();                                                    \
        }                                                                    \
    } while (0)

/**
 * Max pooling with a square, non-overlapping window.
 *
 * Launch layout: a 2-D grid where blockIdx.x selects the image and blockIdx.y
 * the channel, so gridDim = (num, channels). Each block pools one
 * height x width plane. The block is expected to be 1-D; its threads stride
 * over the plane's outputs, so any blockDim.x >= 1 (including 1) is valid.
 * No shared memory is used.
 *
 * @param bottom_data    device pointer, gridDim.x * gridDim.y * height * width floats
 * @param height         plane height
 * @param width          plane width
 * @param pooled_height  window edge length (used for both axes)
 * @param out_height     number of output rows per plane, normally
 *                       ceil(height / pooled_height)
 * @param top_data       device pointer, gridDim.x * gridDim.y * out_height *
 *                       ceil(width / pooled_height) floats
 *
 * The output width is derived from width and pooled_height, so non-square
 * planes are handled. Windows that overhang the border are clipped; an output
 * row that lies entirely outside the input is filled with -infinity.
 */
__global__ void MaxPool2d(const float* __restrict__ bottom_data, const int height,
                          const int width, const int pooled_height,
                          const int out_height, float* __restrict__ top_data)
{
    // A non-positive window would divide by zero; empty shapes have no output.
    if (pooled_height <= 0 || height <= 0 || width <= 0 || out_height <= 0)
        return;

    const int out_width = width / pooled_height + (width % pooled_height != 0 ? 1 : 0);
    const int out_count = out_height * out_width;

    // size_t offsets so large batches cannot overflow 32-bit arithmetic.
    const size_t plane = (size_t)blockIdx.x * gridDim.y + blockIdx.y;
    const float* in = bottom_data + plane * height * width;
    float* out = top_data + plane * out_count;

    // Block-stride loop: thread t handles outputs t, t + blockDim.x, ...
    for (int o = threadIdx.x; o < out_count; o += blockDim.x) {
        const int row_begin = (o / out_width) * pooled_height;
        const int col_begin = (o % out_width) * pooled_height;

        // -inf is the identity of max, so inputs of any magnitude work.
        float best = -INFINITY;
        for (int u = 0; u < pooled_height && row_begin + u < height; ++u) {
            const float* row = in + (size_t)(row_begin + u) * width;
            for (int v = 0; v < pooled_height && col_begin + v < width; ++v) {
                const float value = row[col_begin + v];
                if (value > best)
                    best = value;
            }
        }
        out[o] = best;
    }
}

int main()
{
    const int N = 500, M = 100, H = 24, W = 24, D = 2;
    const int PH = H / D + (H % D != 0 ? 1 : 0);
    const int PW = W / D + (W % D != 0 ? 1 : 0);

    // Keep element counts and byte counts separate: the host buffers are sized
    // in elements, the CUDA calls in bytes.
    const size_t image_count = (size_t)N * M * H * W;
    const size_t out_count = (size_t)N * M * PH * PW;
    const size_t image_size = image_count * sizeof(float);
    const size_t out_size = out_count * sizeof(float);
    const float mul_by = 0.01f;

    std::vector<float> input(image_count);
    std::vector<float> output(out_count);
    for (size_t i = 0; i < image_count; ++i)
        input[i] = static_cast<float>(i) * mul_by;

    float* dev_input = NULL;
    float* dev_output = NULL;
    CUDA_CHECK(cudaMalloc((void**)&dev_output, out_size));
    CUDA_CHECK(cudaMalloc((void**)&dev_input, image_size));
    CUDA_CHECK(cudaMemcpy(dev_input, &input[0], image_size, cudaMemcpyHostToDevice));

    const dim3 grid(N, M);
    // One thread per output of a plane, rounded up to whole warps, capped at 256.
    int threads = ((PH * PW + 31) / 32) * 32;
    if (threads > 256)
        threads = 256;

    // CUDA events time the GPU work directly. As before, the measured region
    // is the kernel plus the device-to-host copy of the result.
    cudaEvent_t start_event, stop_event;
    CUDA_CHECK(cudaEventCreate(&start_event));
    CUDA_CHECK(cudaEventCreate(&stop_event));

    CUDA_CHECK(cudaEventRecord(start_event));
    MaxPool2d<<<grid, threads>>>(dev_input, H, W, D, PH, dev_output);
    CUDA_CHECK(cudaGetLastError());  // catches invalid launch configurations
    CUDA_CHECK(cudaMemcpy(&output[0], dev_output, out_size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(stop_event));
    CUDA_CHECK(cudaEventSynchronize(stop_event));

    float elapsed_ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start_event, stop_event));
    std::cout << "Cost: " << elapsed_ms << "ms." << std::endl;

    for (int i = 0; i < 10; i++)
        std::cout << output[i] << std::endl;

    CUDA_CHECK(cudaEventDestroy(start_event));
    CUDA_CHECK(cudaEventDestroy(stop_event));
    CUDA_CHECK(cudaFree(dev_input));
    CUDA_CHECK(cudaFree(dev_output));

#ifdef _WIN32
    // Keep the console window open when launched from the Windows IDE.
    system("pause");
#endif
    return 0;
}
/* Sample output (timing depends on hardware and launch configuration):
Cost: 141ms.
0.25 0.27 0.29 0.31 0.33 0.35 0.37 0.39 0.41 0.43
*/