c语言判断是否是utf8字符串,计算字符个数
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/****************************************************************************
 * Check whether a C string is well-formed under the legacy (up to 6-byte)
 * UTF-8 scheme and count the characters it contains.
 *
 * Unicode code point range (hex) | UTF-8 byte layout (binary)
 * 0000 0000-0000 007F : 0xxxxxxx
 * 0000 0080-0000 07FF : 110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF : 1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-001F FFFF : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0020 0000-03FF FFFF : 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0400 0000-7FFF FFFF : 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Define UTF8_NO_MAIN to compile the helpers without the command-line
 * driver, e.g. to link them into a test harness.
 ***************************************************************************/

/*
 * Byte value -> number of bytes to advance, one entry per possible byte.
 * Unlike GetUtf8charByteNum(), this table maps continuation bytes
 * (0x80-0xBF) and 0xFE/0xFF to 1 rather than flagging them as invalid.
 */
unsigned char utf8_look_for_table[] = {
    /* 0x00-0xBF */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xC0-0xDF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/*
 * Table lookup of the advance length for byte x. The cast keeps the index in
 * 0..255 even when x is a plain (possibly signed) char holding a byte >= 0x80.
 */
#define UTFLEN(x) (utf8_look_for_table[(unsigned char)(x)])

/*
 * Return the total number of bytes of the UTF-8 sequence introduced by the
 * lead byte `ch` (1..6), or 0 when `ch` cannot start a sequence: a
 * continuation byte (0x80-0xBF) or one of 0xFE/0xFF.
 *
 * `static` is required here: a plain `inline` definition provides no
 * external definition in C99/C11, so a non-inlined call would fail to link.
 */
static inline int GetUtf8charByteNum(unsigned char ch)
{
    if (ch < 0x80) {
        return 1; /* 0xxxxxxx */
    }
    if (ch < 0xC0) {
        return 0; /* 10xxxxxx: continuation byte, not a lead byte */
    }
    if (ch < 0xE0) {
        return 2; /* 110xxxxx */
    }
    if (ch < 0xF0) {
        return 3; /* 1110xxxx */
    }
    if (ch < 0xF8) {
        return 4; /* 11110xxx */
    }
    if (ch < 0xFC) {
        return 5; /* 111110xx */
    }
    if (ch < 0xFE) {
        return 6; /* 1111110x */
    }
    return 0; /* 0xFE and 0xFF never start a sequence */
}

/*
 * Return 1 when `str` is a structurally valid UTF-8 string, 0 otherwise.
 *
 * Only the lead-byte / continuation-byte structure is checked; overlong
 * encodings and code point ranges are not validated. NULL yields 0 and the
 * empty string yields 1.
 */
int IsUtf8Format(const char *str)
{
    if (str == NULL) {
        return 0;
    }

    int pending = 0; /* continuation bytes still owed by the current character */
    for (const unsigned char *p = (const unsigned char *)str; *p != '\0'; p++) {
        if (pending == 0) {
            int byteNum = GetUtf8charByteNum(*p);
            if (byteNum == 0) {
                return 0;
            }
            pending = byteNum - 1;
        } else {
            if ((*p & 0xC0) != 0x80) {
                return 0; /* expected 10xxxxxx */
            }
            pending--;
        }
    }

    /* A string that ends in the middle of a character is malformed. */
    return pending == 0;
}

/*
 * Return the number of UTF-8 characters in `str`.
 *
 * Returns 0 for NULL, for the empty string, and for any malformed input
 * (invalid lead byte, bad continuation byte, or a sequence cut short by the
 * terminator). Continuation bytes are inspected one at a time, so the scan
 * never reads past the terminating NUL.
 */
int GetUtf8Length(const char *str)
{
    if (str == NULL) {
        return 0;
    }

    int count = 0;
    const unsigned char *p = (const unsigned char *)str;
    while (*p != '\0') {
        int byteNum = GetUtf8charByteNum(*p);
        if (byteNum == 0) {
            return 0;
        }
        /* A NUL fails this test too, which stops us before stepping over it. */
        for (int i = 1; i < byteNum; i++) {
            if ((p[i] & 0xC0) != 0x80) {
                return 0;
            }
        }
        p += byteNum;
        count++;
    }
    return count;
}

/* Thresholds used by GetChargeNum(), in characters. */
enum {
    CHARGE_SINGLE_MAX = 70,  /* up to this many characters: one unit */
    CHARGE_SEGMENT_LEN = 67, /* characters per unit once the text is split */
    CHARGE_SPLIT_MAX = 500   /* longest text that is split into units */
};

/*
 * Return the number of chargeable units for a text of `len` characters
 * (main() reports the result as the SMS charge number).
 *
 *   len <= 0         -> 0
 *   1..70            -> 1
 *   71..500          -> ceil(len / 67)
 *   len > 500        -> 1
 *
 * NOTE(review): lengths above 500 yield 1, exactly as in the original code;
 * that looks unintended (perhaps such texts should be rejected) - confirm.
 */
int GetChargeNum(int len)
{
    if (len <= 0) {
        return 0;
    }
    if (len > CHARGE_SINGLE_MAX && len <= CHARGE_SPLIT_MAX) {
        /* Round up: an exact multiple of 67 needs no extra unit. */
        return (len + CHARGE_SEGMENT_LEN - 1) / CHARGE_SEGMENT_LEN;
    }
    return 1;
}

#ifndef UTF8_NO_MAIN
/*
 * Command-line driver: echoes argv[1], then prints its character count and
 * charge number when it is valid, non-empty UTF-8.
 *
 * NOTE(review): the process returns 1 on success and 0 on every early exit,
 * the reverse of the usual exit-status convention. Kept unchanged because
 * callers may already depend on it.
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        return 0;
    }

    const char *str = argv[1];
    printf("%s\n", str);

    if (!IsUtf8Format(str)) {
        printf("the text is not the Format of utf8\n");
        return 0;
    }

    int len = GetUtf8Length(str);
    if (len == 0) {
        return 0;
    }
    printf("the length of text: %d\n", len);

    int num = GetChargeNum(len);
    if (num == 0) {
        return 0;
    }
    printf("the chargeNumber of sms: %d\n", num);

    return 1;
}
#endif /* UTF8_NO_MAIN */
参考:
http://blog.sina.com.cn/s/blog_62b2318d0101d7kb.html
http://www.cnblogs.com/jiu0821/p/6371544.html