// YUV420-to-RGB conversion with an MMX variant, high efficiency (reposted) -- original title: yuv420到rgb转化(mmx)效率高(转)
#include <string.h>

#include <windows.h>

/*
 * Lookup tables used by the scalar converter YUV2RGB420().
 *
 * crv_tab / cbu_tab / cgu_tab / cgv_tab hold (sample - 128) multiplied by a
 * weight, tab_76309 holds 76309 * (sample - 16).  All products are scaled by
 * 65536; YUV2RGB420() shifts the sums right by 16 before using them.
 *
 * clp is a clamping table: entry (384 + v) is v limited to 0..255, for any
 * v in the range -384..639.
 *
 * init_dither_tab() must run once before the tables are read.
 */
long int crv_tab[256];
long int cbu_tab[256];
long int cgu_tab[256];

long int cgv_tab[256];
long int tab_76309[256];
unsigned char clp[1024];


/*
 * Fill every conversion table declared above.  Takes no input and returns
 * nothing; calling it again simply rewrites the same values.
 */
void init_dither_tab(void)
{
    /* Weights in 16.16 fixed point (value / 65536 is the real factor). */
    const long crv = 104597;    /* V contribution to red   */
    const long cbu = 132201;    /* U contribution to blue  */
    const long cgu = 25675;     /* U contribution to green */
    const long cgv = 53279;     /* V contribution to green */
    int i;

    for (i = 0; i < 256; i++) {
        crv_tab[i] = (i - 128) * crv;
        cbu_tab[i] = (i - 128) * cbu;
        cgu_tab[i] = (i - 128) * cgu;
        cgv_tab[i] = (i - 128) * cgv;
        tab_76309[i] = 76309L * (i - 16);
    }

    /* 384 zeros, then 0..255, then 384 entries of 255. */
    for (i = 0; i < (int)sizeof clp; i++) {
        int v = i - 384;

        clp[i] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}


39 void YUV2RGB420(unsigned char *src0,unsigned char *src1,unsigned char *src2,unsigned char *dst_ori,int width,int height)
40 {
41 int y1,y2,u,v;
42 unsigned char *py1,*py2;
43 int i,j, c1, c2, c3, c4;
44 unsigned char *d1, *d2;
45
46 //src0=src;
47 //src1=src+width*height;
48 //src2=src+width*height+width*height/4;
49
50 py1=src0;
51 py2=py1+width;
52 d1=dst_ori;
53 d2=d1+3*width;
54 for (j = 0; j < height; j += 2) {
55 for (i = 0; i < width; i += 2) {
56
57 u = *src1++;
58 v = *src2++;
59
60 c1 = crv_tab[v];
61 c2 = cgu_tab[u];
62 c3 = cgv_tab[v];
63 c4 = cbu_tab[u];
64
65 //up-left
66 y1 = tab_76309[*py1++];
67 *d1++ = clp[384+((y1 + c1)>>16)];
68 *d1++ = clp[384+((y1 - c2 - c3)>>16)];
69 *d1++ = clp[384+((y1 + c4)>>16)];
70
71 //down-left
72 y2 = tab_76309[*py2++];
73 *d2++ = clp[384+((y2 + c1)>>16)];
74 *d2++ = clp[384+((y2 - c2 - c3)>>16)];
75 *d2++ = clp[384+((y2 + c4)>>16)];
76
77 //up-right
78 y1 = tab_76309[*py1++];
79 *d1++ = clp[384+((y1 + c1)>>16)];
80 *d1++ = clp[384+((y1 - c2 - c3)>>16)];
81 *d1++ = clp[384+((y1 + c4)>>16)];
82
83 //down-right
84 y2 = tab_76309[*py2++];
85 *d2++ = clp[384+((y2 + c1)>>16)];
86 *d2++ = clp[384+((y2 - c2 - c3)>>16)];
87 *d2++ = clp[384+((y2 + c4)>>16)];
88 }
89 d1 += 3*width;
90 d2 += 3*width;
91 py1+= width;
92 py2+= width;
93 }
94 }
95
96
97
// How to use (passing a negative height flips the output vertically):
// YUV_TO_RGB24(pY, width, pU, pV, width>>1, pRGBBuf, width, (int)0-height, width*3);
100 typedef UCHAR uint8_t;
101 typedef ULONGLONG uint64_t;
102
103 #define MAXIMUM_Y_WIDTH 800
104 static uint64_t mmw_mult_Y = 0x2568256825682568;
105 static uint64_t mmw_mult_U_G = 0xf36ef36ef36ef36e;
106 static uint64_t mmw_mult_U_B = 0x40cf40cf40cf40cf;
107 static uint64_t mmw_mult_V_R = 0x3343334333433343;
108 static uint64_t mmw_mult_V_G = 0xe5e2e5e2e5e2e5e2;
109
110
111 static uint64_t mmb_0x10 = 0x1010101010101010;
112 static uint64_t mmw_0x0080 = 0x0080008000800080;
113 static uint64_t mmw_0x00ff = 0x00ff00ff00ff00ff;
114
115 static uint64_t mmw_cut_red = 0x7c007c007c007c00;
116 static uint64_t mmw_cut_green = 0x03e003e003e003e0;
117 static uint64_t mmw_cut_blue = 0x001f001f001f001f;
118
119
120 void YUV_TO_RGB24( uint8_t *puc_y, int stride_y,
121 uint8_t *puc_u, uint8_t *puc_v, int stride_uv,
122 uint8_t *puc_out, int width_y, int height_y,int stride_out)
123 {
124 int y, horiz_count;
125 uint8_t *puc_out_remembered;
126 //int stride_out = width_y * 3;
127
128 if (height_y < 0) {
129 //we are flipping our output upside-down
130 height_y = -height_y;
131 puc_y += (height_y - 1) * stride_y ;
132 puc_u += (height_y/2 - 1) * stride_uv;
133 puc_v += (height_y/2 - 1) * stride_uv;
134 stride_y = -stride_y;
135 stride_uv = -stride_uv;
136 }
137
138 horiz_count = -(width_y >> 3);
139
140 for (y=0; y<height_y; y++) {
141 if (y == height_y-1) {
142 //this is the last output line - we need to be careful not to overrun the end of this line
143 uint8_t temp_buff[3*MAXIMUM_Y_WIDTH+1];
144 puc_out_remembered = puc_out;
145 puc_out = temp_buff; //write the RGB to a temporary store
146 }
147 _asm {
148 push eax
149 push ebx
150 push ecx
151 push edx
152 push edi
153
154 mov eax, puc_out
155 mov ebx, puc_y
156 mov ecx, puc_u
157 mov edx, puc_v
158 mov edi, horiz_count
159
160 horiz_loop:
161
162 movd mm2, [ecx]
163 pxor mm7, mm7
164
165 movd mm3, [edx]
166 punpcklbw mm2, mm7
167
168 movq mm0, [ebx]
169 punpcklbw mm3, mm7
170
171 movq mm1, mmw_0x00ff
172
173 psubusb mm0, mmb_0x10
174
175 psubw mm2, mmw_0x0080
176 pand mm1, mm0
177
178 psubw mm3, mmw_0x0080
179 psllw mm1, 3
180
181 psrlw mm0, 8
182 psllw mm2, 3
183
184 pmulhw mm1, mmw_mult_Y
185 psllw mm0, 3
186
187 psllw mm3, 3
188 movq mm5, mm3
189
190 pmulhw mm5, mmw_mult_V_R
191 movq mm4, mm2
192
193 pmulhw mm0, mmw_mult_Y
194 movq mm7, mm1
195
196 pmulhw mm2, mmw_mult_U_G
197 paddsw mm7, mm5
198
199 pmulhw mm3, mmw_mult_V_G
200 packuswb mm7, mm7
201
202 pmulhw mm4, mmw_mult_U_B
203 paddsw mm5, mm0
204
205 packuswb mm5, mm5
206 paddsw mm2, mm3
207
208 movq mm3, mm1
209 movq mm6, mm1
210
211 paddsw mm3, mm4
212 paddsw mm6, mm2
213
214 punpcklbw mm7, mm5
215 paddsw mm2, mm0
216
217 packuswb mm6, mm6
218 packuswb mm2, mm2
219
220 packuswb mm3, mm3
221 paddsw mm4, mm0
222
223 packuswb mm4, mm4
224 punpcklbw mm6, mm2
225
226 punpcklbw mm3, mm4
227
228 // 32-bit shuffle.
229 pxor mm0, mm0
230
231 movq mm1, mm6
232 punpcklbw mm1, mm0
233
234 movq mm0, mm3
235 punpcklbw mm0, mm7
236
237 movq mm2, mm0
238
239 punpcklbw mm0, mm1
240 punpckhbw mm2, mm1
241
242 // 24-bit shuffle and sav
243 movd [eax], mm0
244 psrlq mm0, 32
245
246 movd 3[eax], mm0
247
248 movd 6[eax], mm2
249
250
251 psrlq mm2, 32
252
253 movd 9[eax], mm2
254
255 // 32-bit shuffle.
256 pxor mm0, mm0
257
258 movq mm1, mm6
259 punpckhbw mm1, mm0
260
261 movq mm0, mm3
262 punpckhbw mm0, mm7
263
264 movq mm2, mm0
265
266 punpcklbw mm0, mm1
267 punpckhbw mm2, mm1
268
269 // 24-bit shuffle and sav
270 movd 12[eax], mm0
271 psrlq mm0, 32
272
273 movd 15[eax], mm0
274 add ebx, 8
275
276 movd 18[eax], mm2
277 psrlq mm2, 32
278
279 add ecx, 4
280 add edx, 4
281
282 movd 21[eax], mm2
283 add eax, 24
284
285 inc edi
286 jne horiz_loop
287
288 pop edi
289 pop edx
290 pop ecx
291 pop ebx
292 pop eax
293
294 emms
295 }
296
297
298 if (y == height_y-1) {
299 //last line of output - we have used the temp_buff and need to copy
300 int x = 3 * width_y; //interation counter
301 uint8_t *ps = puc_out; // source pointer (temporary line store)
302 uint8_t *pd = puc_out_remembered; // dest pointer
303 while (x--) *(pd++) = *(ps++); // copy the line
304 }
305
306 puc_y += stride_y;
307 if (y%2) {
308 puc_u += stride_uv;
309 puc_v += stride_uv;
310 }
311 puc_out += stride_out;
312 }
313 }
314
2
/*
 * NOTE(review): these tables and init_dither_tab() already appear earlier in
 * this file with identical content; a translation unit can hold only one
 * definition of the function, so one of the two copies should be removed.
 *
 * Lookup tables used by the scalar converter YUV2RGB420().
 *
 * crv_tab / cbu_tab / cgu_tab / cgv_tab hold (sample - 128) multiplied by a
 * weight, tab_76309 holds 76309 * (sample - 16).  All products are scaled by
 * 65536; YUV2RGB420() shifts the sums right by 16 before using them.
 *
 * clp is a clamping table: entry (384 + v) is v limited to 0..255, for any
 * v in the range -384..639.
 *
 * init_dither_tab() must run once before the tables are read.
 */
long int crv_tab[256];
long int cbu_tab[256];
long int cgu_tab[256];

long int cgv_tab[256];
long int tab_76309[256];
unsigned char clp[1024];


/*
 * Fill every conversion table declared above.  Takes no input and returns
 * nothing; calling it again simply rewrites the same values.
 */
void init_dither_tab(void)
{
    /* Weights in 16.16 fixed point (value / 65536 is the real factor). */
    const long crv = 104597;    /* V contribution to red   */
    const long cbu = 132201;    /* U contribution to blue  */
    const long cgu = 25675;     /* U contribution to green */
    const long cgv = 53279;     /* V contribution to green */
    int i;

    for (i = 0; i < 256; i++) {
        crv_tab[i] = (i - 128) * crv;
        cbu_tab[i] = (i - 128) * cbu;
        cgu_tab[i] = (i - 128) * cgu;
        cgv_tab[i] = (i - 128) * cgv;
        tab_76309[i] = 76309L * (i - 16);
    }

    /* 384 zeros, then 0..255, then 384 entries of 255. */
    for (i = 0; i < (int)sizeof clp; i++) {
        int v = i - 384;

        clp[i] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}


39 void YUV2RGB420(unsigned char *src0,unsigned char *src1,unsigned char *src2,unsigned char *dst_ori,int width,int height)
40 {
41 int y1,y2,u,v;
42 unsigned char *py1,*py2;
43 int i,j, c1, c2, c3, c4;
44 unsigned char *d1, *d2;
45
46 //src0=src;
47 //src1=src+width*height;
48 //src2=src+width*height+width*height/4;
49
50 py1=src0;
51 py2=py1+width;
52 d1=dst_ori;
53 d2=d1+3*width;
54 for (j = 0; j < height; j += 2) {
55 for (i = 0; i < width; i += 2) {
56
57 u = *src1++;
58 v = *src2++;
59
60 c1 = crv_tab[v];
61 c2 = cgu_tab[u];
62 c3 = cgv_tab[v];
63 c4 = cbu_tab[u];
64
65 //up-left
66 y1 = tab_76309[*py1++];
67 *d1++ = clp[384+((y1 + c1)>>16)];
68 *d1++ = clp[384+((y1 - c2 - c3)>>16)];
69 *d1++ = clp[384+((y1 + c4)>>16)];
70
71 //down-left
72 y2 = tab_76309[*py2++];
73 *d2++ = clp[384+((y2 + c1)>>16)];
74 *d2++ = clp[384+((y2 - c2 - c3)>>16)];
75 *d2++ = clp[384+((y2 + c4)>>16)];
76
77 //up-right
78 y1 = tab_76309[*py1++];
79 *d1++ = clp[384+((y1 + c1)>>16)];
80 *d1++ = clp[384+((y1 - c2 - c3)>>16)];
81 *d1++ = clp[384+((y1 + c4)>>16)];
82
83 //down-right
84 y2 = tab_76309[*py2++];
85 *d2++ = clp[384+((y2 + c1)>>16)];
86 *d2++ = clp[384+((y2 - c2 - c3)>>16)];
87 *d2++ = clp[384+((y2 + c4)>>16)];
88 }
89 d1 += 3*width;
90 d2 += 3*width;
91 py1+= width;
92 py2+= width;
93 }
94 }
95
96
97
// How to use (passing a negative height flips the output vertically):
// YUV_TO_RGB24(pY, width, pU, pV, width>>1, pRGBBuf, width, (int)0-height, width*3);
100 typedef UCHAR uint8_t;
101 typedef ULONGLONG uint64_t;
102
103 #define MAXIMUM_Y_WIDTH 800
104 static uint64_t mmw_mult_Y = 0x2568256825682568;
105 static uint64_t mmw_mult_U_G = 0xf36ef36ef36ef36e;
106 static uint64_t mmw_mult_U_B = 0x40cf40cf40cf40cf;
107 static uint64_t mmw_mult_V_R = 0x3343334333433343;
108 static uint64_t mmw_mult_V_G = 0xe5e2e5e2e5e2e5e2;
109
110
111 static uint64_t mmb_0x10 = 0x1010101010101010;
112 static uint64_t mmw_0x0080 = 0x0080008000800080;
113 static uint64_t mmw_0x00ff = 0x00ff00ff00ff00ff;
114
115 static uint64_t mmw_cut_red = 0x7c007c007c007c00;
116 static uint64_t mmw_cut_green = 0x03e003e003e003e0;
117 static uint64_t mmw_cut_blue = 0x001f001f001f001f;
118
119
120 void YUV_TO_RGB24( uint8_t *puc_y, int stride_y,
121 uint8_t *puc_u, uint8_t *puc_v, int stride_uv,
122 uint8_t *puc_out, int width_y, int height_y,int stride_out)
123 {
124 int y, horiz_count;
125 uint8_t *puc_out_remembered;
126 //int stride_out = width_y * 3;
127
128 if (height_y < 0) {
129 //we are flipping our output upside-down
130 height_y = -height_y;
131 puc_y += (height_y - 1) * stride_y ;
132 puc_u += (height_y/2 - 1) * stride_uv;
133 puc_v += (height_y/2 - 1) * stride_uv;
134 stride_y = -stride_y;
135 stride_uv = -stride_uv;
136 }
137
138 horiz_count = -(width_y >> 3);
139
140 for (y=0; y<height_y; y++) {
141 if (y == height_y-1) {
142 //this is the last output line - we need to be careful not to overrun the end of this line
143 uint8_t temp_buff[3*MAXIMUM_Y_WIDTH+1];
144 puc_out_remembered = puc_out;
145 puc_out = temp_buff; //write the RGB to a temporary store
146 }
147 _asm {
148 push eax
149 push ebx
150 push ecx
151 push edx
152 push edi
153
154 mov eax, puc_out
155 mov ebx, puc_y
156 mov ecx, puc_u
157 mov edx, puc_v
158 mov edi, horiz_count
159
160 horiz_loop:
161
162 movd mm2, [ecx]
163 pxor mm7, mm7
164
165 movd mm3, [edx]
166 punpcklbw mm2, mm7
167
168 movq mm0, [ebx]
169 punpcklbw mm3, mm7
170
171 movq mm1, mmw_0x00ff
172
173 psubusb mm0, mmb_0x10
174
175 psubw mm2, mmw_0x0080
176 pand mm1, mm0
177
178 psubw mm3, mmw_0x0080
179 psllw mm1, 3
180
181 psrlw mm0, 8
182 psllw mm2, 3
183
184 pmulhw mm1, mmw_mult_Y
185 psllw mm0, 3
186
187 psllw mm3, 3
188 movq mm5, mm3
189
190 pmulhw mm5, mmw_mult_V_R
191 movq mm4, mm2
192
193 pmulhw mm0, mmw_mult_Y
194 movq mm7, mm1
195
196 pmulhw mm2, mmw_mult_U_G
197 paddsw mm7, mm5
198
199 pmulhw mm3, mmw_mult_V_G
200 packuswb mm7, mm7
201
202 pmulhw mm4, mmw_mult_U_B
203 paddsw mm5, mm0
204
205 packuswb mm5, mm5
206 paddsw mm2, mm3
207
208 movq mm3, mm1
209 movq mm6, mm1
210
211 paddsw mm3, mm4
212 paddsw mm6, mm2
213
214 punpcklbw mm7, mm5
215 paddsw mm2, mm0
216
217 packuswb mm6, mm6
218 packuswb mm2, mm2
219
220 packuswb mm3, mm3
221 paddsw mm4, mm0
222
223 packuswb mm4, mm4
224 punpcklbw mm6, mm2
225
226 punpcklbw mm3, mm4
227
228 // 32-bit shuffle.
229 pxor mm0, mm0
230
231 movq mm1, mm6
232 punpcklbw mm1, mm0
233
234 movq mm0, mm3
235 punpcklbw mm0, mm7
236
237 movq mm2, mm0
238
239 punpcklbw mm0, mm1
240 punpckhbw mm2, mm1
241
242 // 24-bit shuffle and sav
243 movd [eax], mm0
244 psrlq mm0, 32
245
246 movd 3[eax], mm0
247
248 movd 6[eax], mm2
249
250
251 psrlq mm2, 32
252
253 movd 9[eax], mm2
254
255 // 32-bit shuffle.
256 pxor mm0, mm0
257
258 movq mm1, mm6
259 punpckhbw mm1, mm0
260
261 movq mm0, mm3
262 punpckhbw mm0, mm7
263
264 movq mm2, mm0
265
266 punpcklbw mm0, mm1
267 punpckhbw mm2, mm1
268
269 // 24-bit shuffle and sav
270 movd 12[eax], mm0
271 psrlq mm0, 32
272
273 movd 15[eax], mm0
274 add ebx, 8
275
276 movd 18[eax], mm2
277 psrlq mm2, 32
278
279 add ecx, 4
280 add edx, 4
281
282 movd 21[eax], mm2
283 add eax, 24
284
285 inc edi
286 jne horiz_loop
287
288 pop edi
289 pop edx
290 pop ecx
291 pop ebx
292 pop eax
293
294 emms
295 }
296
297
298 if (y == height_y-1) {
299 //last line of output - we have used the temp_buff and need to copy
300 int x = 3 * width_y; //interation counter
301 uint8_t *ps = puc_out; // source pointer (temporary line store)
302 uint8_t *pd = puc_out_remembered; // dest pointer
303 while (x--) *(pd++) = *(ps++); // copy the line
304 }
305
306 puc_y += stride_y;
307 if (y%2) {
308 puc_u += stride_uv;
309 puc_v += stride_uv;
310 }
311 puc_out += stride_out;
312 }
313 }
314