opencv 源码分析 CUDA可分离滤波器设计 ( 发现OpenCV的cuda真TM慢 )
1. 主函数
void SeparableLinearFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream) { GpuMat src = _src.getGpuMat(); CV_Assert( src.type() == srcType_ ); _dst.create(src.size(), dstType_); GpuMat dst = _dst.getGpuMat(); ensureSizeIsEnough(src.size(), bufType_, buf_); DeviceInfo devInfo; const int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion(); cudaStream_t stream = StreamAccessor::getStream(_stream); rowFilter_(src, buf_, rowKernel_.ptr<float>(), rowKernel_.cols, anchor_.x, rowBorderMode_, cc, stream); columnFilter_(buf_, dst, columnKernel_.ptr<float>(), columnKernel_.cols, anchor_.y, columnBorderMode_, cc, stream); }
the block of col is 16X16 , the block of row is 32X8
2. COL
namespace filter { template <typename T, typename D> void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) { typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream); static const caller_t callers[5][33] = { { 0, column_filter::caller< 1, T, D, BrdColConstant>, column_filter::caller< 2, T, D, BrdColConstant>, column_filter::caller< 3, T, D, BrdColConstant>, column_filter::caller< 4, T, D, BrdColConstant>, column_filter::caller< 5, T, D, BrdColConstant>, column_filter::caller< 6, T, D, BrdColConstant>, column_filter::caller< 7, T, D, BrdColConstant>, column_filter::caller< 8, T, D, BrdColConstant>, column_filter::caller< 9, T, D, BrdColConstant>, column_filter::caller<10, T, D, BrdColConstant>, column_filter::caller<11, T, D, BrdColConstant>, column_filter::caller<12, T, D, BrdColConstant>, column_filter::caller<13, T, D, BrdColConstant>, column_filter::caller<14, T, D, BrdColConstant>, column_filter::caller<15, T, D, BrdColConstant>, column_filter::caller<16, T, D, BrdColConstant>, column_filter::caller<17, T, D, BrdColConstant>, column_filter::caller<18, T, D, BrdColConstant>, column_filter::caller<19, T, D, BrdColConstant>, column_filter::caller<20, T, D, BrdColConstant>, column_filter::caller<21, T, D, BrdColConstant>, column_filter::caller<22, T, D, BrdColConstant>, column_filter::caller<23, T, D, BrdColConstant>, column_filter::caller<24, T, D, BrdColConstant>, column_filter::caller<25, T, D, BrdColConstant>, column_filter::caller<26, T, D, BrdColConstant>, column_filter::caller<27, T, D, BrdColConstant>, column_filter::caller<28, T, D, BrdColConstant>, column_filter::caller<29, T, D, BrdColConstant>, column_filter::caller<30, T, D, BrdColConstant>, column_filter::caller<31, T, D, BrdColConstant>, column_filter::caller<32, T, D, BrdColConstant> }, { 0, column_filter::caller< 1, T, D, BrdColReplicate>, column_filter::caller< 2, T, D, BrdColReplicate>, column_filter::caller< 3, T, D, BrdColReplicate>, column_filter::caller< 4, T, D, BrdColReplicate>, column_filter::caller< 5, T, D, BrdColReplicate>, column_filter::caller< 6, T, D, BrdColReplicate>, column_filter::caller< 7, T, D, BrdColReplicate>, column_filter::caller< 8, T, D, BrdColReplicate>, column_filter::caller< 9, T, D, BrdColReplicate>, column_filter::caller<10, T, D, BrdColReplicate>, column_filter::caller<11, T, D, BrdColReplicate>, column_filter::caller<12, T, D, BrdColReplicate>, column_filter::caller<13, T, D, BrdColReplicate>, column_filter::caller<14, T, D, BrdColReplicate>, column_filter::caller<15, T, D, BrdColReplicate>, column_filter::caller<16, T, D, BrdColReplicate>, column_filter::caller<17, T, D, BrdColReplicate>, column_filter::caller<18, T, D, BrdColReplicate>, column_filter::caller<19, T, D, BrdColReplicate>, column_filter::caller<20, T, D, BrdColReplicate>, column_filter::caller<21, T, D, BrdColReplicate>, column_filter::caller<22, T, D, BrdColReplicate>, column_filter::caller<23, T, D, BrdColReplicate>, column_filter::caller<24, T, D, BrdColReplicate>, column_filter::caller<25, T, D, BrdColReplicate>, column_filter::caller<26, T, D, BrdColReplicate>, column_filter::caller<27, T, D, BrdColReplicate>, column_filter::caller<28, T, D, BrdColReplicate>, column_filter::caller<29, T, D, BrdColReplicate>, column_filter::caller<30, T, D, BrdColReplicate>, column_filter::caller<31, T, D, BrdColReplicate>, column_filter::caller<32, T, D, BrdColReplicate> }, { 0, column_filter::caller< 1, T, D, BrdColReflect>, column_filter::caller< 2, T, D, BrdColReflect>, column_filter::caller< 3, T, D, BrdColReflect>, column_filter::caller< 4, T, D, BrdColReflect>, column_filter::caller< 5, T, D, BrdColReflect>, column_filter::caller< 6, T, D, BrdColReflect>, column_filter::caller< 7, T, D, BrdColReflect>, column_filter::caller< 8, T, D, BrdColReflect>, column_filter::caller< 9, T, D, BrdColReflect>, column_filter::caller<10, T, D, BrdColReflect>, column_filter::caller<11, T, D, BrdColReflect>, column_filter::caller<12, T, D, BrdColReflect>, column_filter::caller<13, T, D, BrdColReflect>, column_filter::caller<14, T, D, BrdColReflect>, column_filter::caller<15, T, D, BrdColReflect>, column_filter::caller<16, T, D, BrdColReflect>, column_filter::caller<17, T, D, BrdColReflect>, column_filter::caller<18, T, D, BrdColReflect>, column_filter::caller<19, T, D, BrdColReflect>, column_filter::caller<20, T, D, BrdColReflect>, column_filter::caller<21, T, D, BrdColReflect>, column_filter::caller<22, T, D, BrdColReflect>, column_filter::caller<23, T, D, BrdColReflect>, column_filter::caller<24, T, D, BrdColReflect>, column_filter::caller<25, T, D, BrdColReflect>, column_filter::caller<26, T, D, BrdColReflect>, column_filter::caller<27, T, D, BrdColReflect>, column_filter::caller<28, T, D, BrdColReflect>, column_filter::caller<29, T, D, BrdColReflect>, column_filter::caller<30, T, D, BrdColReflect>, column_filter::caller<31, T, D, BrdColReflect>, column_filter::caller<32, T, D, BrdColReflect> }, { 0, column_filter::caller< 1, T, D, BrdColWrap>, column_filter::caller< 2, T, D, BrdColWrap>, column_filter::caller< 3, T, D, BrdColWrap>, column_filter::caller< 4, T, D, BrdColWrap>, column_filter::caller< 5, T, D, BrdColWrap>, column_filter::caller< 6, T, D, BrdColWrap>, column_filter::caller< 7, T, D, BrdColWrap>, column_filter::caller< 8, T, D, BrdColWrap>, column_filter::caller< 9, T, D, BrdColWrap>, column_filter::caller<10, T, D, BrdColWrap>, column_filter::caller<11, T, D, BrdColWrap>, column_filter::caller<12, T, D, BrdColWrap>, column_filter::caller<13, T, D, BrdColWrap>, column_filter::caller<14, T, D, BrdColWrap>, column_filter::caller<15, T, D, BrdColWrap>, column_filter::caller<16, T, D, BrdColWrap>, column_filter::caller<17, T, D, BrdColWrap>, column_filter::caller<18, T, D, BrdColWrap>, column_filter::caller<19, T, D, BrdColWrap>, column_filter::caller<20, T, D, BrdColWrap>, column_filter::caller<21, T, D, BrdColWrap>, column_filter::caller<22, T, D, BrdColWrap>, column_filter::caller<23, T, D, BrdColWrap>, column_filter::caller<24, T, D, BrdColWrap>, column_filter::caller<25, T, D, BrdColWrap>, column_filter::caller<26, T, D, BrdColWrap>, column_filter::caller<27, T, D, BrdColWrap>, column_filter::caller<28, T, D, BrdColWrap>, column_filter::caller<29, T, D, BrdColWrap>, column_filter::caller<30, T, D, BrdColWrap>, column_filter::caller<31, T, D, BrdColWrap>, column_filter::caller<32, T, D, BrdColWrap> }, { 0, column_filter::caller< 1, T, D, BrdColReflect101>, column_filter::caller< 2, T, D, BrdColReflect101>, column_filter::caller< 3, T, D, BrdColReflect101>, column_filter::caller< 4, T, D, BrdColReflect101>, column_filter::caller< 5, T, D, BrdColReflect101>, column_filter::caller< 6, T, D, BrdColReflect101>, column_filter::caller< 7, T, D, BrdColReflect101>, column_filter::caller< 8, T, D, BrdColReflect101>, column_filter::caller< 9, T, D, BrdColReflect101>, column_filter::caller<10, T, D, BrdColReflect101>, column_filter::caller<11, T, D, BrdColReflect101>, column_filter::caller<12, T, D, BrdColReflect101>, column_filter::caller<13, T, D, BrdColReflect101>, column_filter::caller<14, T, D, BrdColReflect101>, column_filter::caller<15, T, D, BrdColReflect101>, column_filter::caller<16, T, D, BrdColReflect101>, column_filter::caller<17, T, D, BrdColReflect101>, column_filter::caller<18, T, D, BrdColReflect101>, column_filter::caller<19, T, D, BrdColReflect101>, column_filter::caller<20, T, D, BrdColReflect101>, column_filter::caller<21, T, D, BrdColReflect101>, column_filter::caller<22, T, D, BrdColReflect101>, column_filter::caller<23, T, D, BrdColReflect101>, column_filter::caller<24, T, D, BrdColReflect101>, column_filter::caller<25, T, D, BrdColReflect101>, column_filter::caller<26, T, D, BrdColReflect101>, column_filter::caller<27, T, D, BrdColReflect101>, column_filter::caller<28, T, D, BrdColReflect101>, column_filter::caller<29, T, D, BrdColReflect101>, column_filter::caller<30, T, D, BrdColReflect101>, column_filter::caller<31, T, D, BrdColReflect101>, column_filter::caller<32, T, D, BrdColReflect101> } }; callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, kernel, anchor, cc, stream); } }
template <int KSIZE, typename T, typename D, template<typename> class B> void caller(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream) { int BLOCK_DIM_X; int BLOCK_DIM_Y; int PATCH_PER_BLOCK; if (cc >= 20) { BLOCK_DIM_X = 16; BLOCK_DIM_Y = 16; PATCH_PER_BLOCK = 4; } else { BLOCK_DIM_X = 16; BLOCK_DIM_Y = 8; PATCH_PER_BLOCK = 2; } const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); B<T> brd(src.rows); linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, kernel, anchor, brd); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } }
#define MAX_KERNEL_SIZE 32 template <int KSIZE, typename T, typename D, typename B> __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const float* kernel, const int anchor, const B brd) { #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) const int BLOCK_DIM_X = 16; const int BLOCK_DIM_Y = 16; const int PATCH_PER_BLOCK = 4; const int HALO_SIZE = KSIZE <= 16 ? 1 : 2; #else const int BLOCK_DIM_X = 16; const int BLOCK_DIM_Y = 8; const int PATCH_PER_BLOCK = 2; const int HALO_SIZE = 2; #endif typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X]; const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; if (x >= src.cols) return; const T* src_col = src.ptr() + x; const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; if (blockIdx.y > 0) { //Upper halo #pragma unroll for (int j = 0; j < HALO_SIZE; ++j) smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x)); } else { //Upper halo #pragma unroll for (int j = 0; j < HALO_SIZE; ++j) smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); } if (blockIdx.y + 2 < gridDim.y) { //Main data #pragma unroll for (int j = 0; j < PATCH_PER_BLOCK; ++j) smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x)); //Lower halo #pragma unroll for (int j = 0; j < HALO_SIZE; ++j) smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x)); } else { //Main data #pragma unroll for (int j = 0; j < PATCH_PER_BLOCK; ++j) smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); //Lower halo #pragma unroll for (int j = 0; j < HALO_SIZE; ++j) smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); } __syncthreads(); #pragma unroll for (int j = 0; j < PATCH_PER_BLOCK; ++j) { const int y = yStart + j * BLOCK_DIM_Y; if (y < src.rows) { sum_t sum = VecTraits<sum_t>::all(0); #pragma unroll for (int k = 0; k < KSIZE; ++k) sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * kernel[k]; dst(y, x) = saturate_cast<D>(sum); } } }
3. ROW
namespace filter { template <typename T, typename D> void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) { typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream); static const caller_t callers[5][33] = { { 0, row_filter::caller< 1, T, D, BrdRowConstant>, row_filter::caller< 2, T, D, BrdRowConstant>, row_filter::caller< 3, T, D, BrdRowConstant>, row_filter::caller< 4, T, D, BrdRowConstant>, row_filter::caller< 5, T, D, BrdRowConstant>, row_filter::caller< 6, T, D, BrdRowConstant>, row_filter::caller< 7, T, D, BrdRowConstant>, row_filter::caller< 8, T, D, BrdRowConstant>, row_filter::caller< 9, T, D, BrdRowConstant>, row_filter::caller<10, T, D, BrdRowConstant>, row_filter::caller<11, T, D, BrdRowConstant>, row_filter::caller<12, T, D, BrdRowConstant>, row_filter::caller<13, T, D, BrdRowConstant>, row_filter::caller<14, T, D, BrdRowConstant>, row_filter::caller<15, T, D, BrdRowConstant>, row_filter::caller<16, T, D, BrdRowConstant>, row_filter::caller<17, T, D, BrdRowConstant>, row_filter::caller<18, T, D, BrdRowConstant>, row_filter::caller<19, T, D, BrdRowConstant>, row_filter::caller<20, T, D, BrdRowConstant>, row_filter::caller<21, T, D, BrdRowConstant>, row_filter::caller<22, T, D, BrdRowConstant>, row_filter::caller<23, T, D, BrdRowConstant>, row_filter::caller<24, T, D, BrdRowConstant>, row_filter::caller<25, T, D, BrdRowConstant>, row_filter::caller<26, T, D, BrdRowConstant>, row_filter::caller<27, T, D, BrdRowConstant>, row_filter::caller<28, T, D, BrdRowConstant>, row_filter::caller<29, T, D, BrdRowConstant>, row_filter::caller<30, T, D, BrdRowConstant>, row_filter::caller<31, T, D, BrdRowConstant>, row_filter::caller<32, T, D, BrdRowConstant> }, { 0, row_filter::caller< 1, T, D, BrdRowReplicate>, row_filter::caller< 2, T, D, BrdRowReplicate>, row_filter::caller< 3, T, D, BrdRowReplicate>, row_filter::caller< 4, T, D, BrdRowReplicate>, row_filter::caller< 5, T, D, BrdRowReplicate>, row_filter::caller< 6, T, D, BrdRowReplicate>, row_filter::caller< 7, T, D, BrdRowReplicate>, row_filter::caller< 8, T, D, BrdRowReplicate>, row_filter::caller< 9, T, D, BrdRowReplicate>, row_filter::caller<10, T, D, BrdRowReplicate>, row_filter::caller<11, T, D, BrdRowReplicate>, row_filter::caller<12, T, D, BrdRowReplicate>, row_filter::caller<13, T, D, BrdRowReplicate>, row_filter::caller<14, T, D, BrdRowReplicate>, row_filter::caller<15, T, D, BrdRowReplicate>, row_filter::caller<16, T, D, BrdRowReplicate>, row_filter::caller<17, T, D, BrdRowReplicate>, row_filter::caller<18, T, D, BrdRowReplicate>, row_filter::caller<19, T, D, BrdRowReplicate>, row_filter::caller<20, T, D, BrdRowReplicate>, row_filter::caller<21, T, D, BrdRowReplicate>, row_filter::caller<22, T, D, BrdRowReplicate>, row_filter::caller<23, T, D, BrdRowReplicate>, row_filter::caller<24, T, D, BrdRowReplicate>, row_filter::caller<25, T, D, BrdRowReplicate>, row_filter::caller<26, T, D, BrdRowReplicate>, row_filter::caller<27, T, D, BrdRowReplicate>, row_filter::caller<28, T, D, BrdRowReplicate>, row_filter::caller<29, T, D, BrdRowReplicate>, row_filter::caller<30, T, D, BrdRowReplicate>, row_filter::caller<31, T, D, BrdRowReplicate>, row_filter::caller<32, T, D, BrdRowReplicate> }, { 0, row_filter::caller< 1, T, D, BrdRowReflect>, row_filter::caller< 2, T, D, BrdRowReflect>, row_filter::caller< 3, T, D, BrdRowReflect>, row_filter::caller< 4, T, D, BrdRowReflect>, row_filter::caller< 5, T, D, BrdRowReflect>, row_filter::caller< 6, T, D, BrdRowReflect>, row_filter::caller< 7, T, D, BrdRowReflect>, row_filter::caller< 8, T, D, BrdRowReflect>, row_filter::caller< 9, T, D, BrdRowReflect>, row_filter::caller<10, T, D, BrdRowReflect>, row_filter::caller<11, T, D, BrdRowReflect>, row_filter::caller<12, T, D, BrdRowReflect>, row_filter::caller<13, T, D, BrdRowReflect>, row_filter::caller<14, T, D, BrdRowReflect>, row_filter::caller<15, T, D, BrdRowReflect>, row_filter::caller<16, T, D, BrdRowReflect>, row_filter::caller<17, T, D, BrdRowReflect>, row_filter::caller<18, T, D, BrdRowReflect>, row_filter::caller<19, T, D, BrdRowReflect>, row_filter::caller<20, T, D, BrdRowReflect>, row_filter::caller<21, T, D, BrdRowReflect>, row_filter::caller<22, T, D, BrdRowReflect>, row_filter::caller<23, T, D, BrdRowReflect>, row_filter::caller<24, T, D, BrdRowReflect>, row_filter::caller<25, T, D, BrdRowReflect>, row_filter::caller<26, T, D, BrdRowReflect>, row_filter::caller<27, T, D, BrdRowReflect>, row_filter::caller<28, T, D, BrdRowReflect>, row_filter::caller<29, T, D, BrdRowReflect>, row_filter::caller<30, T, D, BrdRowReflect>, row_filter::caller<31, T, D, BrdRowReflect>, row_filter::caller<32, T, D, BrdRowReflect> }, { 0, row_filter::caller< 1, T, D, BrdRowWrap>, row_filter::caller< 2, T, D, BrdRowWrap>, row_filter::caller< 3, T, D, BrdRowWrap>, row_filter::caller< 4, T, D, BrdRowWrap>, row_filter::caller< 5, T, D, BrdRowWrap>, row_filter::caller< 6, T, D, BrdRowWrap>, row_filter::caller< 7, T, D, BrdRowWrap>, row_filter::caller< 8, T, D, BrdRowWrap>, row_filter::caller< 9, T, D, BrdRowWrap>, row_filter::caller<10, T, D, BrdRowWrap>, row_filter::caller<11, T, D, BrdRowWrap>, row_filter::caller<12, T, D, BrdRowWrap>, row_filter::caller<13, T, D, BrdRowWrap>, row_filter::caller<14, T, D, BrdRowWrap>, row_filter::caller<15, T, D, BrdRowWrap>, row_filter::caller<16, T, D, BrdRowWrap>, row_filter::caller<17, T, D, BrdRowWrap>, row_filter::caller<18, T, D, BrdRowWrap>, row_filter::caller<19, T, D, BrdRowWrap>, row_filter::caller<20, T, D, BrdRowWrap>, row_filter::caller<21, T, D, BrdRowWrap>, row_filter::caller<22, T, D, BrdRowWrap>, row_filter::caller<23, T, D, BrdRowWrap>, row_filter::caller<24, T, D, BrdRowWrap>, row_filter::caller<25, T, D, BrdRowWrap>, row_filter::caller<26, T, D, BrdRowWrap>, row_filter::caller<27, T, D, BrdRowWrap>, row_filter::caller<28, T, D, BrdRowWrap>, row_filter::caller<29, T, D, BrdRowWrap>, row_filter::caller<30, T, D, BrdRowWrap>, row_filter::caller<31, T, D, BrdRowWrap>, row_filter::caller<32, T, D, BrdRowWrap> }, { 0, row_filter::caller< 1, T, D, BrdRowReflect101>, row_filter::caller< 2, T, D, BrdRowReflect101>, row_filter::caller< 3, T, D, BrdRowReflect101>, row_filter::caller< 4, T, D, BrdRowReflect101>, row_filter::caller< 5, T, D, BrdRowReflect101>, row_filter::caller< 6, T, D, BrdRowReflect101>, row_filter::caller< 7, T, D, BrdRowReflect101>, row_filter::caller< 8, T, D, BrdRowReflect101>, row_filter::caller< 9, T, D, BrdRowReflect101>, row_filter::caller<10, T, D, BrdRowReflect101>, row_filter::caller<11, T, D, BrdRowReflect101>, row_filter::caller<12, T, D, BrdRowReflect101>, row_filter::caller<13, T, D, BrdRowReflect101>, row_filter::caller<14, T, D, BrdRowReflect101>, row_filter::caller<15, T, D, BrdRowReflect101>, row_filter::caller<16, T, D, BrdRowReflect101>, row_filter::caller<17, T, D, BrdRowReflect101>, row_filter::caller<18, T, D, BrdRowReflect101>, row_filter::caller<19, T, D, BrdRowReflect101>, row_filter::caller<20, T, D, BrdRowReflect101>, row_filter::caller<21, T, D, BrdRowReflect101>, row_filter::caller<22, T, D, BrdRowReflect101>, row_filter::caller<23, T, D, BrdRowReflect101>, row_filter::caller<24, T, D, BrdRowReflect101>, row_filter::caller<25, T, D, BrdRowReflect101>, row_filter::caller<26, T, D, BrdRowReflect101>, row_filter::caller<27, T, D, BrdRowReflect101>, row_filter::caller<28, T, D, BrdRowReflect101>, row_filter::caller<29, T, D, BrdRowReflect101>, row_filter::caller<30, T, D, BrdRowReflect101>, row_filter::caller<31, T, D, BrdRowReflect101>, row_filter::caller<32, T, D, BrdRowReflect101> } }; callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, kernel, anchor, cc, stream); } }
template <int KSIZE, typename T, typename D, template<typename> class B> void caller(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream) { int BLOCK_DIM_X; int BLOCK_DIM_Y; int PATCH_PER_BLOCK; if (cc >= 20) { BLOCK_DIM_X = 32; BLOCK_DIM_Y = 8; PATCH_PER_BLOCK = 4; } else { BLOCK_DIM_X = 32; BLOCK_DIM_Y = 4; PATCH_PER_BLOCK = 4; } const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y)); B<T> brd(src.cols); linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, kernel, anchor, brd); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); }
#define MAX_KERNEL_SIZE 32 template <int KSIZE, typename T, typename D, typename B> __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const float* kernel, const int anchor, const B brd) { #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) const int BLOCK_DIM_X = 32; const int BLOCK_DIM_Y = 8; const int PATCH_PER_BLOCK = 4; const int HALO_SIZE = 1; #else const int BLOCK_DIM_X = 32; const int BLOCK_DIM_Y = 4; const int PATCH_PER_BLOCK = 4; const int HALO_SIZE = 1; #endif typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; __shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X]; const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; if (y >= src.rows) return; const T* src_row = src.ptr(y); const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x; if (blockIdx.x > 0) { //Load left halo #pragma unroll for (int j = 0; j < HALO_SIZE; ++j) smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]); } else { //Load left halo #pragma unroll for (int j = 0; j < HALO_SIZE; ++j) smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row)); } if (blockIdx.x + 2 < gridDim.x) { //Load main data #pragma unroll for (int j = 0; j < PATCH_PER_BLOCK; ++j) smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]); //Load right halo #pragma unroll for (int j = 0; j < HALO_SIZE; ++j) smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]); } else { //Load main data #pragma unroll for (int j = 0; j < PATCH_PER_BLOCK; ++j) smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row)); //Load right halo #pragma unroll for (int j = 0; j < HALO_SIZE; ++j) smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row)); } __syncthreads(); #pragma unroll for (int j = 0; j < PATCH_PER_BLOCK; ++j) { const int x = xStart + j * BLOCK_DIM_X; if (x < src.cols) { sum_t sum = VecTraits<sum_t>::all(0); #pragma unroll for (int k = 0; k < KSIZE; ++k) sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * kernel[k]; dst(y, x) = saturate_cast<D>(sum); } } }