代码改变世界

ucs2-utf8 (Converts character sets between UTF-8 and UCS-2; ONLY for Simplified Chinese and ASCII text on little-endian computers.)

2021-09-07 19:11  dreamboy2000  阅读(143)  评论(0编辑  收藏  举报
#include <stdio.h>


typedef unsigned short ucs2; /* One UCS-2 code unit (expected to be 16 bits wide). */
typedef unsigned char utf8;  /* One byte (8-bit code unit) of UTF-8 text. */

/*
 * Example: the character "一" (U+4E00)
 *
 *    UTF-8 : e4         b8         80
 *            1110 0100  1011 1000  1000 0000
 *
 *    UCS-2 : 4e00
 *            0100  11 1000  00 0000   (payload bits of the three UTF-8 bytes)
 *
 *---------------------------------------------------
 * ucs2 value        : UTF-8 encoding
 *   0x0000 - 0x007F   1 byte  0xxxxxxx
 *   0x0080 - 0x07FF   2 bytes 110xxxxx 10xxxxxx
 *   0x0800 - 0xFFFF   3 bytes 1110xxxx 10xxxxxx 10xxxxxx
 */


/*
 * Convert a NUL-terminated UTF-8 string into a zero-terminated UCS-2 string.
 *
 * @Description:
 *     Decodes 1-, 2- and 3-byte UTF-8 sequences (U+0000 .. U+FFFF). The
 *     sequence length is taken from the lead byte, and every continuation
 *     byte is verified before it is consumed, so decoding never reads past
 *     the terminating NUL of src. A byte that does not start a well-formed
 *     sequence (stray continuation byte, truncated or overlong sequence,
 *     lead byte of a 4-byte sequence) is consumed on its own and stored as
 *     U+FFFD (REPLACEMENT CHARACTER).
 *
 * @Params
 *     dst : Buffer that receives the UCS-2 code units followed by a
 *           terminating 0. No bounds checking is done; because each source
 *           byte yields at most one code unit, room for strlen(src) + 1
 *           elements is always sufficient.
 *     src : NUL-terminated UTF-8 input. Must be a valid pointer; it is not
 *           modified.
 *
 * @Return value:
 *     Always dst.
 *
 * see also ucs2utf.
 */
ucs2 *utf2ucs (ucs2 *dst, utf8 *src)
{
    enum { REPLACEMENT_CHAR = 0xFFFD };
    unsigned int in = 0;
    unsigned int out = 0;

    while (src[in] != 0)
    {
        const unsigned int lead = src[in];
        unsigned int code = REPLACEMENT_CHAR;
        unsigned int used = 1;

        if (lead < 0x80)
        {
            /* 0xxxxxxx: plain ASCII. */
            code = lead;
        }
        else if ((lead & 0xE0) == 0xC0)
        {
            /* 110xxxxx 10xxxxxx */
            if ((src[in + 1] & 0xC0) == 0x80)
            {
                const unsigned int value =
                    ((lead & 0x1Fu) << 6) | (src[in + 1] & 0x3Fu);

                /* Values below 0x80 here are overlong encodings: reject. */
                if (value >= 0x80)
                {
                    code = value;
                    used = 2;
                }
            }
        }
        else if ((lead & 0xF0) == 0xE0)
        {
            /*
             * 1110xxxx 10xxxxxx 10xxxxxx
             * && short-circuits, so src[in + 2] is only read once src[in + 1]
             * is known to be a continuation byte (and therefore not the NUL).
             */
            if ((src[in + 1] & 0xC0) == 0x80 && (src[in + 2] & 0xC0) == 0x80)
            {
                const unsigned int value = ((lead & 0x0Fu) << 12)
                                         | ((src[in + 1] & 0x3Fu) << 6)
                                         | (src[in + 2] & 0x3Fu);

                /* Values below 0x800 here are overlong encodings: reject. */
                if (value >= 0x800)
                {
                    code = value;
                    used = 3;
                }
            }
        }
        /* Anything else keeps the defaults: U+FFFD, one byte consumed. */

        dst[out++] = (ucs2)code;
        in += used;
    }

    dst[out] = 0;
    return dst;
}


/*
 * Convert a zero-terminated UCS-2 string into a NUL-terminated UTF-8 string.
 *
 * @Description:
 *     Each code unit is encoded independently:
 *         0x0001 - 0x007F -> 1 byte  0xxxxxxx
 *         0x0080 - 0x07FF -> 2 bytes 110xxxxx 10xxxxxx
 *         0x0800 - 0xFFFF -> 3 bytes 1110xxxx 10xxxxxx 10xxxxxx
 *
 * @Params
 *     dst : Buffer that receives the UTF-8 bytes followed by a terminating
 *           NUL. No bounds checking is done; a code unit produces at most
 *           three bytes, so 3 * (number of units in src) + 1 bytes is always
 *           sufficient.
 *     src : Zero-terminated UCS-2 input. Must be a valid pointer; it is not
 *           modified.
 *
 * @Return value:
 *     Always dst.
 *
 * see also utf2ucs.
 */
utf8 *ucs2utf (utf8 *dst, ucs2 *src)
{
    unsigned int out = 0;

    for (unsigned int in = 0; src[in] != 0; in++)
    {
        const unsigned int code = src[in];

        if (code < 0x80)
        {
            dst[out++] = (utf8)code;
        }
        else if (code < 0x800)
        {
            /* Lead byte carries bits 10..6, the trailing byte bits 5..0. */
            dst[out++] = (utf8)(0xC0 | ((code >> 6) & 0x1F));
            dst[out++] = (utf8)(0x80 | (code & 0x3F));
        }
        else
        {
            /* Lead byte carries bits 15..12, then bits 11..6, then 5..0. */
            dst[out++] = (utf8)(0xE0 | ((code >> 12) & 0x0F));
            dst[out++] = (utf8)(0x80 | ((code >> 6) & 0x3F));
            dst[out++] = (utf8)(0x80 | (code & 0x3F));
        }
    }

    dst[out] = 0;
    return dst;
}

/****************************************************/
//test code.
/*
 * Round-trip test driver: converts argv[1] from UTF-8 to UCS-2 and back,
 * then prints the resulting UTF-8 text followed by a newline.
 *
 * Returns 0 on success, 1 when no argument was given or the argument does
 * not fit into the input buffer.
 */
int main(int argc,const char *argv[])
{
    /* Capacity of the input buffer in bytes, including the terminating NUL. */
    enum { INPUT_CAP = 1024 };

    utf8 input[INPUT_CAP];
    /* utf2ucs stores at most one code unit per input byte, plus the 0. */
    ucs2 wide[INPUT_CAP];
    /* ucs2utf stores at most three bytes per code unit, plus the NUL. */
    utf8 output[3 * (INPUT_CAP - 1) + 1];
    int len;

    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <utf-8 text>\n",
                argc > 0 ? argv[0] : "ucs2-utf8");
        return 1;
    }

    /* snprintf bounds the copy; its return value reveals truncation. */
    len = snprintf((char *)input, sizeof input, "%s", argv[1]);
    if (len < 0 || (size_t)len >= sizeof input)
    {
        fprintf(stderr, "input too long (at most %d bytes)\n", INPUT_CAP - 1);
        return 1;
    }

    utf2ucs(wide, input);
    ucs2utf(output, wide);

    printf("%s\n", (char *)output);

    return 0;
}