libyuv 代码结构分析,借用其NEON/ARM64优化代码
I 入口
格式转换的入口函数都在convert_xx之类的文件中。我的Android程序主要用到的是把xx格式转成NV12,其入口在convert_from.cc中。
函数为:
// Declaration of the I420 -> NV12 conversion entry point.
//
// Takes three source planes (Y, U, V) and two destination planes (Y and a
// combined UV plane), each given as a base pointer plus a per-row stride,
// together with the image width and height.
// NOTE(review): the meaning of the int return value is not visible in this
// excerpt -- confirm against the definition in convert_from.cc.
int I420ToNV12(const uint8* src_y,
               int src_stride_y,
               const uint8* src_u,
               int src_stride_u,
               const uint8* src_v,
               int src_stride_v,
               uint8* dst_y,
               int dst_stride_y,
               uint8* dst_uv,
               int dst_stride_uv,
               int width,
               int height);
这里涉及到两个函数:
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
halfwidth, halfheight);
其入口都在planar_functions.cc中:
void CopyPlane(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, int width, int height) { int y; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; // Negative height means invert the image. if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; dst_stride_y = -dst_stride_y; } // Coalesce rows. if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; } // Nothing to do. if (src_y == dst_y && src_stride_y == dst_stride_y) { return; } #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } #endif #if defined(HAS_COPYROW_AVX) if (TestCpuFlag(kCpuHasAVX)) { CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; } #endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; } #endif #if defined(HAS_COPYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif #if defined(HAS_COPYROW_MIPS) if (TestCpuFlag(kCpuHasMIPS)) { CopyRow = CopyRow_MIPS; } #endif // Copy plane for (y = 0; y < height; ++y) { CopyRow(src_y, dst_y, width); src_y += src_stride_y; dst_y += dst_stride_y; } } void MergeUVPlane(const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, uint8* dst_uv, int dst_stride_uv, int width, int height) { int y; void (*MergeUVRow)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) = MergeUVRow_C; // Coalesce rows. // Negative height means invert the image. if (height < 0) { height = -height; dst_uv = dst_uv + (height - 1) * dst_stride_uv; dst_stride_uv = -dst_stride_uv; } // Coalesce rows. 
if (src_stride_u == width && src_stride_v == width && dst_stride_uv == width * 2) { width *= height; height = 1; src_stride_u = src_stride_v = dst_stride_uv = 0; } #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { MergeUVRow = MergeUVRow_SSE2; } } #endif #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { MergeUVRow = MergeUVRow_AVX2; } } #endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow = MergeUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { MergeUVRow = MergeUVRow_NEON; } } #endif #if defined(HAS_MERGEUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { MergeUVRow = MergeUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { MergeUVRow = MergeUVRow_MSA; } } #endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. MergeUVRow(src_u, src_v, dst_uv, width); src_u += src_stride_u; src_v += src_stride_v; dst_uv += dst_stride_uv; } }
到目前为止的代码都是平台无关的。也很好看懂。
II 平台相关代码
平台相关代码都在xx_neon.cc、xx_neon64.cc中。具体的CopyRow_NEON和MergeUVRow_NEON相关的代码,都在row_neon64.cc/row_neon.cc中。前者是arm64的代码,后者是armeabi-v7a的neon代码。下面贴出的是row_neon.cc中armeabi-v7a的版本:
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( "1: \n" MEMACCESS(0) "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop MEMACCESS(1) "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(count) // %2 // Output registers : // Input registers : "cc", "memory", "q0", "q1" // Clobber List ); }
CopyRow_Any_NEON和MergeUVRow_Any_NEON的代码都在row_any.cc中,Any版本的代码不区分64位和32位。它先用SIMD函数处理宽度中对齐的部分,剩余不足一个步长的部分则拷贝到临时缓冲区中处理,再把结果拷回目标:
// Any 1 to 1. #define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ SIMD_ALIGNED(uint8 temp[128 * 2]); \ memset(temp, 0, 128); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, n); \ } \ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ ANY_SIMD(temp, temp + 128, MASK + 1); \ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } #ifdef HAS_COPYROW_NEON ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31) #endif
代码中用到的声明都在libyuv/row.h中。
// Evaluates to non-zero when p (a pointer or an integer) is a multiple of a.
// The test masks with (a)-1, so a must be a power of two.
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))

// SIMD_ALIGNED(var) wraps a variable declaration with the compiler-specific
// alignment attribute. The vec* typedefs describe 16-byte vectors and the
// lvec* typedefs 32-byte vectors; the "u" prefix marks unsigned element types.
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
// Visual C++: 32-byte alignment when AVX2 is available, 16 bytes otherwise.
#if defined(VISUALC_HAS_AVX2)
#define SIMD_ALIGNED(var) __declspec(align(32)) var
#else
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#endif
typedef __declspec(align(16)) int16 vec16[8];
typedef __declspec(align(16)) int32 vec32[4];
typedef __declspec(align(16)) int8 vec8[16];
typedef __declspec(align(16)) uint16 uvec16[8];
typedef __declspec(align(16)) uint32 uvec32[4];
typedef __declspec(align(16)) uint8 uvec8[16];
typedef __declspec(align(32)) int16 lvec16[16];
typedef __declspec(align(32)) int32 lvec32[8];
typedef __declspec(align(32)) int8 lvec8[32];
typedef __declspec(align(32)) uint16 ulvec16[16];
typedef __declspec(align(32)) uint32 ulvec32[8];
typedef __declspec(align(32)) uint8 ulvec8[32];
#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__))
// GCC / Clang: same alignment choice, expressed with __attribute__, and the
// vector types use the vector_size extension.
// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)
#define SIMD_ALIGNED(var) var __attribute__((aligned(32)))
#else
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#endif
typedef int16 __attribute__((vector_size(16))) vec16;
typedef int32 __attribute__((vector_size(16))) vec32;
typedef int8 __attribute__((vector_size(16))) vec8;
typedef uint16 __attribute__((vector_size(16))) uvec16;
typedef uint32 __attribute__((vector_size(16))) uvec32;
typedef uint8 __attribute__((vector_size(16))) uvec8;
typedef int16 __attribute__((vector_size(32))) lvec16;
typedef int32 __attribute__((vector_size(32))) lvec32;
typedef int8 __attribute__((vector_size(32))) lvec8;
typedef uint16 __attribute__((vector_size(32))) ulvec16;
typedef uint32 __attribute__((vector_size(32))) ulvec32;
typedef uint8 __attribute__((vector_size(32))) ulvec8;
#else
// Any other compiler: no alignment is requested and plain arrays are used.
#define SIMD_ALIGNED(var) var
typedef int16 vec16[8];
typedef int32 vec32[4];
typedef int8 vec8[16];
typedef uint16 uvec16[8];
typedef uint32 uvec32[4];
typedef uint8 uvec8[16];
typedef int16 lvec16[16];
typedef int32 lvec32[8];
typedef int8 lvec8[32];
typedef uint16 ulvec16[16];
typedef uint32 ulvec32[8];
typedef uint8 ulvec8[32];
#endif
....
之后只要把这些代码抠出来便可以借用了。比起交叉编译两个库来说,方便许多。