UTF8 Unicode 区别联系,UTF8 字符串Trim函数

最近项目中要将文档标题写入oracle数据库,编码是UTF8的,老报 字符串未正确结束这个错。   ORA-01756:括号内的字符串没有正确结束。

客户端,服务器的NLS_LANG都正确设置为了UTF8. 

将标题按照自己的顺序输出,我一个一个字符编码去查,发现标题后面多了一个'E5' 字节。这个是一个三字节UTF8编码的前缀,后面没有后续字节了,ORACLE当然报错。。

在公布解决方法前,复习下UTF8编码:

UTF8和Ascii 是兼容编码,UTF8和Unicode是一一对应的。 

一字节的为: 0*******

二字节的为: 110*****   10****** 

三字节的为: 1110****   10******  10******   

依次类推  4 5 6字节。其中*为有效字符编码

到这不禁有个疑问了?为什么会有 6个字节的utf8? unicode 不是两个字节么?这样表示的unicode不是超过16个bit了?

原来unicode有4字节编码的方式。只是比较少见。(不见得,我测试了20000篇文档,其中就有出现) 

原问题的解决方法:

1.将数据库字符集改为 AL32UTF8。原因是 AL32UTF8支持更高版本的unicode.容错性强一点

2.写一个utf8_trim函数,将字符串中的非utf8编码去掉,替换成空格。然后入库。

 

我采用了第二种解决方法,写了一个utf8_trim,没有用循环,而用了大量丑陋的if else

主要考虑,一是比较简单,直观。还有就是效率高一点。因为,大部分的utf8是3字节,这样效率会高一点。

代码我初步测试过了,欢迎报bug 

 贴代码如下: 

  void trim_utf8(  char* str)

{
    
if(str == NULL)
        
return ;
    
int length = strlen(str);
    
int i = 0;
    
while(i < length)
    {
        
if(str[i]>0)
        {
            i
++;
            
continue;
        }
        
if(((unsigned char) str[i]&0xE0== 192)//110 2byte
        {
            
if ((i+1)< length)
            {
                
if(((unsigned char)(str[i+1]&0xC0)) != 128)//10
                {str[i] = ' '; i = i +1;}//invalid
                else
                {i
= i +2;}
            }
            
else//invalid
            {
                str[i]
=' ';
            }
            
continue;
        }
        
if(((unsigned char) str[i]&0xC0== 128)//10
        {
            str[i] 
= ' ';
            i
++;
            
continue;
        }
        
        
if(((unsigned char)(str[i]&0xF0)) == 224)//1110 3byte
        {
            
if ((i+2)< length)
            {
                
if(((unsigned char)(str[i+1]&0xC0)) != 128 || ((unsigned char)(str[i+2]&0xC0)) != 128 )//10
                {
                    str[i] 
= ' ';
                    
if (((unsigned char)(str[i+1]&0xC0)) != 128)
                    {i 
= i+1;}
                    
else
                    {str[i
+1= ' ';i = i +2;}
                }
                
else
                {
                    i
= i +3;
                }
            }
            
else
            {
                
for ( ;i <length;i++)
                    str[i]
=' ';
            }
            
continue;
        }
        
if(((unsigned char)(str[i]&0xF8)) == 240)//11110 4byte
        {
            
if ((i+3)< length)
            {

                
if(((unsigned char)(str[i+1]&0xC0)) != 128 || ((unsigned char)(str[i+2]&0xC0)) != 128 || ((unsigned char)(str[i+3]&0xC0)) != 128)//10
                {
                    
int j  = 0;
                    
for ( j = i+1;j<= i+3;j++)
                    { 
                        
if(((unsigned char)(str[j]&0xC0)) == 128
                            str[j
-1= ' '
                        
else
                            
break;
                    }
                    i 
= j;
                }
                
else
                {
                    i
= i +4;
                }
            }
            
else
            {
                
for ( ;i <length;i++)
                    str[i]
=' ';
            }
            
continue;
        }
        
if(((unsigned char)(str[i]&0xFC)) == 248)//111110 5byte
        {
            
if ((i+4)< length)
            {

                
if(((unsigned char)(str[i+1]&0xC0)) != 128 || ((unsigned char)(str[i+2]&0xC0)) != 128 || ((unsigned char)(str[i+3]&0xC0)) != 128|| ((unsigned char)(str[i+4]&0xC0)) != 128)//10
                {
                    
int j  = 0;
                    
for ( j = i+1;j<= i+4;j++)
                    { 
                        
if(((unsigned char)(str[j]&0xC0)) == 128
                            str[j
-1= ' '
                        
else
                            
break;
                    }
                    i 
= j;
                }
                
else
                {
                    i
= i +5;
                }
            }
            
else
            {
                
for ( ;i <length;i++)
                    str[i]
=' ';
            }
            
continue;
        }
        
if(((unsigned char)(str[i]&0xFE)) == 252)//1111110 6byte
        {
            
if ((i+5)< length)
            {
                
if(((unsigned char)(str[i+1]&0xC0)) != 128 || ((unsigned char)(str[i+2]&0xC0)) != 128 || ((unsigned char)(str[i+3]&0xC0)) != 128|| ((unsigned char)(str[i+4]&0xC0)) != 128|| ((unsigned char)(str[i+5]&0xC0)) != 128)//10
                {
                    
int j  = 0;
                    
for ( j = i+1;j<= i+5;j++)
                    { 
                        
if(((unsigned char)(str[j]&0xC0)) == 128
                            str[j
-1= ' '
                        
else
                            
break;
                    }
                    i 
= j;
                }
                
else
                {
                    i
= i +6;
                }
            }
            
else
            {
                
for ( ;i <length;i++)
                    str[i]
=' ';
            }
            
continue;
        }
        str[i]
=' ';
        i
++;
    }

}

 

 

 

 

posted @ 2010-07-27 23:17  David Luo  阅读(1963)  评论(0编辑  收藏  举报