【DM642学习笔记十】DSP优化记录

1. 待处理的数据先通过EDMA搬到片内存储器,再在片内处理,效率更高!
以YUV2RGB为例:
/*
 * Line buffers used by the YUV -> RGB conversion.
 *
 * Every buffer is placed in the ".INTPROCBUFF" section and aligned to
 * 128 bytes.  The section is expected to be mapped to internal (on-chip)
 * memory by the linker command file, which is not part of this listing.
 *
 * Each DATA_SECTION / DATA_ALIGN pragma must appear before the definition
 * of the symbol it names, so the pragmas are kept directly above each array.
 *
 * NOTE(review): only the onchipBuf0_* buffers are referenced by
 * yuv2rgb888() in this listing; the onchipBuf1_* set is presumably meant
 * for ping-pong (double) buffering -- confirm against the rest of the file.
 */

/* Input: one row of luma (Y). */
#pragma DATA_SECTION(onchipBuf0_y, ".INTPROCBUFF");
#pragma DATA_ALIGN(onchipBuf0_y, 128);
unsigned char onchipBuf0_y[PROC_WIDTH];
#pragma DATA_SECTION(onchipBuf1_y, ".INTPROCBUFF");
#pragma DATA_ALIGN(onchipBuf1_y, 128);
unsigned char onchipBuf1_y[PROC_WIDTH];

/* Input: one row of chroma (U), half as many samples as the luma row. */
#pragma DATA_SECTION(onchipBuf0_u, ".INTPROCBUFF");
#pragma DATA_ALIGN(onchipBuf0_u, 128);
unsigned char onchipBuf0_u[PROC_WIDTH_2];
#pragma DATA_SECTION(onchipBuf1_u, ".INTPROCBUFF");
#pragma DATA_ALIGN(onchipBuf1_u, 128);
unsigned char onchipBuf1_u[PROC_WIDTH_2];

/* Input: one row of chroma (V), half as many samples as the luma row. */
#pragma DATA_SECTION(onchipBuf0_v, ".INTPROCBUFF");
#pragma DATA_ALIGN(onchipBuf0_v, 128);
unsigned char onchipBuf0_v[PROC_WIDTH_2];
#pragma DATA_SECTION(onchipBuf1_v, ".INTPROCBUFF");
#pragma DATA_ALIGN(onchipBuf1_v, 128);
unsigned char onchipBuf1_v[PROC_WIDTH_2];

/* Output: one row of red. */
#pragma DATA_SECTION(onchipBuf0_r, ".INTPROCBUFF");
#pragma DATA_ALIGN(onchipBuf0_r, 128);
unsigned char onchipBuf0_r[PROC_WIDTH];
#pragma DATA_SECTION(onchipBuf1_r, ".INTPROCBUFF");
#pragma DATA_ALIGN(onchipBuf1_r, 128);
unsigned char onchipBuf1_r[PROC_WIDTH];

/* Output: one row of green. */
#pragma DATA_SECTION(onchipBuf0_g, ".INTPROCBUFF");
#pragma DATA_ALIGN(onchipBuf0_g, 128);
unsigned char onchipBuf0_g[PROC_WIDTH];
#pragma DATA_SECTION(onchipBuf1_g, ".INTPROCBUFF");
#pragma DATA_ALIGN(onchipBuf1_g, 128);
unsigned char onchipBuf1_g[PROC_WIDTH];

/* Output: one row of blue. */
#pragma DATA_SECTION(onchipBuf0_b, ".INTPROCBUFF");
#pragma DATA_ALIGN(onchipBuf0_b, 128);
unsigned char onchipBuf0_b[PROC_WIDTH];
#pragma DATA_SECTION(onchipBuf1_b, ".INTPROCBUFF");
#pragma DATA_ALIGN(onchipBuf1_b, 128);
unsigned char onchipBuf1_b[PROC_WIDTH];

/* Source image, planar YUV, defined elsewhere (original note: 720*576). */
extern unsigned char src_Y[IMGWIDTH*IMGHEIGHT];
extern unsigned char src_U[IMGWIDTH_2*IMGHEIGHT];
extern unsigned char src_V[IMGWIDTH_2*IMGHEIGHT];

/*
 * Result image, planar RGB, defined elsewhere (original note: 352*288).
 * The src_ prefix is historical; these planes are written by yuv2rgb888().
 */
extern unsigned char src_R[PROC_WIDTH*PROC_HEIGHT];
extern unsigned char src_G[PROC_WIDTH*PROC_HEIGHT];
extern unsigned char src_B[PROC_WIDTH*PROC_HEIGHT];

void yuv2rgb888()
{

int i=0,j=0;
int y,u,v,r,g,b;
int v359,v183,u88,u454;
   unsigned char *py,*pu,*pv,*pr,*pg,*pb;
// const int dif=0x8080;// 128128
for ( i = 0; i <288; i ++ ) 
   {
       //copy一行到片上  144-432行,180列~180+352列
      DAT_copy(src_Y+(i+144)*IMGWIDTH+180, onchipBuf0_y, PROC_WIDTH);
      DAT_copy(src_U+(i+144)*(IMGWIDTH>>1)+90, onchipBuf0_u,PROC_WIDTH_2);
      DAT_copy(src_V+(i+144)*(IMGWIDTH>>1)+90,   onchipBuf0_v, PROC_WIDTH_2);
py=onchipBuf0_y;
pu=onchipBuf0_u;
pv=onchipBuf0_v;
pr=onchipBuf0_r;
pg=onchipBuf0_g;
pb=onchipBuf0_b;
#pragmaMUST_ITERATE(0,176, 8);
for(j=0;j
{

y=(*py);//u-=128; v-=128;//y-=16;y不减16
u=(*pu)-128; 
v=(*pv)-128; 
v359=359*v>>8;
u88=88*u>>8;
v183=183*v>>8;
u454=454*u>>8;
r= y+v359;       // r=y+1.402*v;
r&=~(r>>31);  
r = (r |((255-r)>>31) ) & 0xFF;
g= y-u88-v183;    //g=y-0.34414*u-0.71414*v;
g&=~(g>>31);
g = (g |((255-g)>>31) ) & 0xFF;
b= y+u454;          //b=y+1.772*u;
b&=~(b>>31);
b = (b |((255-b)>>31) ) & 0xFF;
*pr++=r; 
*pg++=g;
*pb++=b;
//
py++; y=(*py);  //y-=16; y减了16之后比原来灰度暗了16左右。∴不减。
r= y+v359;           //r=y+1.402*v;
r&=~(r>>31);  
r = (r |((255-r)>>31) ) & 0xFF;
g= y-u88-v183;    //g=y-0.34414*u-0.71414*v;
g&=~(g>>31);
g = (g |((255-g)>>31) ) & 0xFF;
b= y+u454;       // b=y+1.772*u;
b&=~(b>>31);
b = (b |((255-b)>>31) ) & 0xFF;
*pr++=r;
*pg++=g;
*pb++=b;
py++; pu++;pv++;
}   
      //处理完后,copy给片外r、g、b[352*288]
       DAT_copy(onchipBuf0_r,src_R+i*PROC_WIDTH, PROC_WIDTH);
       DAT_copy(onchipBuf0_g,src_G+i*PROC_WIDTH, PROC_WIDTH); 
       DAT_copy(onchipBuf0_b,src_B+i*PROC_WIDTH, PROC_WIDTH); 
   }
}
View Code
耗时clock对比
数据放在片外直接处理:  18,883,080 clocks
先EDMA到片上再处理:    1,977,300 clocks!
约比在片外处理快10倍。
-------------------------------------------------
2. 另外,代码中把结果限定到0~255范围时,使用无分支的写法(0):
                /* Variant (0): branch-free clamp of r, g, b to 0..255.
                   x &= ~(x >> 31) turns a negative x into 0; then
                   (x | ((255 - x) >> 31)) & 0xFF turns x > 255 into 255.
                   Assumes 32-bit int and arithmetic right shift of
                   negative values. */
                r&=~(r>>31);  
                    r = (r | ((255-r)>>31) ) & 0xFF;
                    g &=~(g>>31);
                    g = (g | ((255-g)>>31) ) & 0xFF;
                    b &=~(b>>31);
                    b = (b | ((255-b)>>31) ) & 0xFF;  

代替(1),能实现更好的软件流水。

      /* Variant (1): the same 0..255 clamp written with if/else branches. */
      if ( r>255)    r=255;          else if ( r<0 )  r=0;
          if ( g>255 )   g=255;          else if ( g<0 )  g=0;
          if ( b>255 )   b=255;          else if ( b<0 )  b=0;

说明:如下图所示,后者(1)的循环不能被软件流水。在同样都在片内处理的情况下,(1)的耗时 t1=14634366 clocks ≈10*t0(t0为写法(0)的耗时)!!!

【DM642】DSP优化记录

=> 优化后

 

 

posted @ 2014-05-21 20:26  eaglediao  阅读(108)  评论(0)  编辑  收藏  举报