darknet编译GPU、CUDNN
Error: ./src/convolutional_layer.c:153:13: error: 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' undeclared (first use in this function)
Fix: edit the failing file src/convolutional_layer.c and add a code path for CUDNN_MAJOR >= 8. cuDNN 8 removed the cudnnGetConvolution*Algorithm calls and their *_SPECIFY_WORKSPACE_LIMIT preference enums, so on cuDNN 8 the algorithms have to be selected with cudnnFindConvolution*Algorithm instead:
#ifdef GPU
#ifdef CUDNN
void cudnn_convolutional_setup(layer *l)
{
    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
    cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);

    cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size);
    cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size);

#if CUDNN_MAJOR >= 6
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
#else
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
#endif

#if CUDNN_MAJOR >= 7
    cudnnSetConvolutionGroupCount(l->convDesc, l->groups);
#else
    if(l->groups > 1){
        error("CUDNN < 7 doesn't support groups, please upgrade!");
    }
#endif

#if CUDNN_MAJOR >= 8
    int returnedAlgoCount;
    cudnnConvolutionFwdAlgoPerf_t       fw_results[2 * CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
    cudnnConvolutionBwdDataAlgoPerf_t   bd_results[2 * CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT];
    cudnnConvolutionBwdFilterAlgoPerf_t bf_results[2 * CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT];

    cudnnFindConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc, l->weightDesc, l->convDesc, l->dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_ALGO_COUNT, &returnedAlgoCount, fw_results);
    for(int algoIndex = 0; algoIndex < returnedAlgoCount; ++algoIndex){
        #if PRINT_CUDNN_ALGO > 0
        printf("^^^^ %s for Algo %d: %f time requiring %llu memory\n",
                cudnnGetErrorString(fw_results[algoIndex].status),
                fw_results[algoIndex].algo, fw_results[algoIndex].time,
                (unsigned long long)fw_results[algoIndex].memory);
        #endif
        if( fw_results[algoIndex].memory < MEMORY_LIMIT ){
            l->fw_algo = fw_results[algoIndex].algo;
            break;
        }
    }

    cudnnFindConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l->weightDesc, l->ddstTensorDesc, l->convDesc, l->dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT, &returnedAlgoCount, bd_results);
    for(int algoIndex = 0; algoIndex < returnedAlgoCount; ++algoIndex){
        #if PRINT_CUDNN_ALGO > 0
        printf("^^^^ %s for Algo %d: %f time requiring %llu memory\n",
                cudnnGetErrorString(bd_results[algoIndex].status),
                bd_results[algoIndex].algo, bd_results[algoIndex].time,
                (unsigned long long)bd_results[algoIndex].memory);
        #endif
        if( bd_results[algoIndex].memory < MEMORY_LIMIT ){
            l->bd_algo = bd_results[algoIndex].algo;
            break;
        }
    }

    cudnnFindConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l->srcTensorDesc, l->ddstTensorDesc, l->convDesc, l->dweightDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT, &returnedAlgoCount, bf_results);
    for(int algoIndex = 0; algoIndex < returnedAlgoCount; ++algoIndex){
        #if PRINT_CUDNN_ALGO > 0
        printf("^^^^ %s for Algo %d: %f time requiring %llu memory\n",
                cudnnGetErrorString(bf_results[algoIndex].status),
                bf_results[algoIndex].algo, bf_results[algoIndex].time,
                (unsigned long long)bf_results[algoIndex].memory);
        #endif
        if( bf_results[algoIndex].memory < MEMORY_LIMIT ){
            l->bf_algo = bf_results[algoIndex].algo;
            break;
        }
    }
#else
    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc, l->weightDesc, l->convDesc, l->dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, 2000000000, &l->fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l->weightDesc, l->ddstTensorDesc, l->convDesc, l->dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, 2000000000, &l->bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l->srcTensorDesc, l->ddstTensorDesc, l->convDesc, l->dweightDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, 2000000000, &l->bf_algo);
#endif
}
#endif
#endif

Also add the following declaration (the workspace-size cap used when picking an algorithm):
#define MEMORY_LIMIT 2000000000
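Before rebuilding, it is worth confirming which cuDNN version the compiler actually sees, since that decides which #if CUDNN_MAJOR branch of the patched function gets compiled. Below is a minimal standalone check of my own (not part of darknet); the file name check_cudnn.c and the include path /usr/local/cuda/include are assumptions, adjust them to your install:

/* check_cudnn.c -- print the cuDNN version visible to the compiler.
 * Build and run:
 *   gcc -I/usr/local/cuda/include check_cudnn.c -o check_cudnn && ./check_cudnn
 * Only the version macros are used, so no linking against libcudnn is needed. */
#include <stdio.h>
#include <cudnn.h>   /* provides CUDNN_MAJOR / CUDNN_MINOR / CUDNN_PATCHLEVEL */

int main(void)
{
    printf("cuDNN %d.%d.%d\n", CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
#if CUDNN_MAJOR >= 8
    printf("CUDNN_MAJOR >= 8: the cudnnFindConvolution*Algorithm path will be compiled\n");
#else
    printf("CUDNN_MAJOR < 8: the old cudnnGetConvolution*Algorithm path will be compiled\n");
#endif
    return 0;
}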
Error: nvcc fatal : Unsupported gpu architecture 'compute_30'
Fix: edit the ARCH setting in the Makefile. Remove the line "-gencode arch=compute_30,code=sm_30 \" (CUDA 11 and later no longer support compute capability 3.0) and change ARCH to the following:
ARCH= -gencode arch=compute_35,code=sm_35 \
      -gencode arch=compute_50,code=[sm_50,compute_50] \
      -gencode arch=compute_52,code=[sm_52,compute_52] \
      -gencode arch=compute_70,code=[sm_70,compute_70] \
      -gencode arch=compute_75,code=[sm_75,compute_75] \
      -gencode arch=compute_86,code=[sm_86,compute_86]
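Which -gencode lines you actually need depends on the compute capability of your GPU. The small helper below is my own sketch (not part of darknet; the file name check_arch.cu is an assumption) and simply queries each visible device through the CUDA runtime:

/* check_arch.cu -- list the compute capability of each visible GPU.
 * Build and run: nvcc check_arch.cu -o check_arch && ./check_arch */
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        printf("no CUDA device found\n");
        return 1;
    }
    for (int i = 0; i < count; ++i) {
        struct cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        /* e.g. major=8, minor=6 -> keep -gencode arch=compute_86,code=[sm_86,compute_86] */
        printf("GPU %d: %s, compute capability %d.%d\n", i, prop.name, prop.major, prop.minor);
    }
    return 0;
}

For example, an RTX 30-series card reports compute capability 8.6, so the compute_86 line must be present, while compute_30 (Kepler) simply cannot be targeted by CUDA 11+.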
Heaven rewards the diligent; progress comes step by step; let skill set you apart.