最近要用到Intel的media SDK播放H264视频,发现intel的SDK转码出来的帧缓冲是NV12像素格式的,需要自己转换成RGB32格式。虽然这种代码网上找找一大堆,但还是自己动动手吧。
google了一下转换公式:
C = Y - 16
D = U - 128
E = V - 128
R = clip(round(1.164383 * C + 1.596027 * E))
G = clip(round(1.164383 * C - (0.391762 * D) - (0.812968 * E)))
B = clip(round(1.164383 * C + 2.017232 * D))
研究了一个小时用SSE2搞定,代码一次调试通过,心情大好,把核心代码贴上来。
// Constants shared by the conversion loop below. Each one is declared exactly once.
//
// The colour coefficients are 8.8 fixed point (coefficient * 256) and are packed as
// two signed 16-bit multipliers per 32-bit lane for _mm_madd_epi16: the low word
// multiplies the Y sample and the high word multiplies the U or V sample it is
// interleaved with.
__m128i c0 = _mm_setzero_si128();              // all zero: unpack filler and lower clamp bound
__m128i c128 = _mm_set1_epi16(128);            // bias subtracted from U and V
__m128i c128_32 = _mm_set1_epi32(128);         // 0.5 in 8.8 fixed point, added before >> 8 to round
__m128i c16 = _mm_set1_epi16(16);              // bias subtracted from Y
__m128i c255 = _mm_set1_epi16(255);            // upper clamp bound
__m128i c_1_1596 = _mm_set1_epi32(0x199012a);  // R: 298 * Y + 409 * V
__m128i c_1_2017 = _mm_set1_epi32(0x204012a);  // B: 298 * Y + 516 * U
__m128i c_0_392 = _mm_set1_epi32(0xff9c0000);  // G, U part:   0 * Y - 100 * U
__m128i c_1_813 = _mm_set1_epi32(0xff30012a);  // G, V part: 298 * Y - 208 * V
// Convert the NV12 frame in `src` to 32-bit pixels in `data`, four pixels per
// inner iteration. Each output pixel is the 32-bit value (R << 16) | (G << 8) | B;
// the top byte is left at 0.
//
// NOTE(review): every inner iteration reads 4 Y bytes and 4 UV bytes and writes
// 16 output bytes unconditionally, so a row is over-read / over-written when
// src.Width is not a multiple of 4 -- confirm the caller guarantees that.
for(int y = 0; y < src.Height; y++)
{
    BYTE* dest = (BYTE*)data.Scan0 + data.Stride * y;
    BYTE* srcY = src.Y + src.Pitch * y;
    // One interleaved UV row serves two consecutive luma rows.
    BYTE* srcUV = src.UV + src.Pitch * (y / 2);
    for(int x = 0; x < src.Width; x += 4)
    {
        // Y0 Y1 Y2 Y3 0 0 0 0, widened to 16 bit, minus 16
        __m128i Ymm = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(srcY + x)), c0), c16);
        // U0 V0 U2 V2 0 0 0 0, widened to 16 bit, minus 128
        __m128i UVmm = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(srcUV + x)), c0), c128);
        // U0 U0 U2 U2 0 0 0 0: each chroma sample is reused for two neighbouring pixels
        __m128i Umm = _mm_shufflelo_epi16(UVmm, _MM_SHUFFLE(2,2,0,0));
        // V0 V0 V2 V2 0 0 0 0
        __m128i Vmm = _mm_shufflelo_epi16(UVmm, _MM_SHUFFLE(3,3,1,1));
        // Y0 V0 Y1 V0 Y2 V2 Y3 V2
        __m128i YVmm = _mm_unpacklo_epi16(Ymm, Vmm);
        // Y0 U0 Y1 U0 Y2 U2 Y3 U2
        __m128i YUmm = _mm_unpacklo_epi16(Ymm, Umm);

        // Fixed-point multiply-accumulate per pixel, then round (+128) and drop the 8 fraction bits.
        __m128i Rmm = _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(YVmm, c_1_1596), c128_32), 8);
        __m128i Bmm = _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(YUmm, c_1_2017), c128_32), 8);
        __m128i Gsum = _mm_add_epi32(_mm_madd_epi16(YVmm, c_1_813), _mm_madd_epi16(YUmm, c_0_392));
        __m128i Gmm = _mm_srai_epi32(_mm_add_epi32(Gsum, c128_32), 8);

        // Lower clamp: the compare mask is all-ones only for lanes > 0, so negatives become 0.
        Rmm = _mm_and_si128(Rmm, _mm_cmpgt_epi32(Rmm, c0));
        Bmm = _mm_and_si128(Bmm, _mm_cmpgt_epi32(Bmm, c0));
        Gmm = _mm_and_si128(Gmm, _mm_cmpgt_epi32(Gmm, c0));

        // Upper clamp and packing: R goes to the high 16-bit half of each lane and B stays in
        // the low half, so a single 16-bit min against 255 clamps both at once.
        __m128i RBmm = _mm_min_epi16(_mm_or_si128(_mm_slli_epi32(Rmm, 16), Bmm), c255);
        __m128i Gshifted = _mm_slli_epi32(_mm_min_epi16(Gmm, c255), 8);

        // Unaligned store: nothing here guarantees that dest is 16-byte aligned.
        _mm_storeu_si128((__m128i*)dest, _mm_or_si128(RBmm, Gshifted));
        dest += 16;
    }
}