C语言:判断字符串是否为UTF-8编码,并计算字符个数

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
 
/****************************************************************************
Unicode符号范围 | UTF-8编码方式
    (十六进制) | (二进制)
0000 0000-0000 007F:0xxxxxxx
0000 0080-0000 07FF:110xxxxx 10xxxxxx
0000 0800-0000 FFFF:1110xxxx 10xxxxxx 10xxxxxx
0001 0000-001F FFFF:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0020 0000-03FF FFFF:111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0400 0000-7FFF FFFF:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
**************************************************************************/
 
/*
 * Lookup table indexed by a byte value (0x00-0xFF) giving the length, in
 * bytes, of the UTF-8 sequence introduced by that byte, following the
 * 1-to-6 byte scheme described in the comment block above.
 *
 * Bytes that cannot start a sequence (continuation bytes 0x80-0xBF and the
 * values 0xFE/0xFF) are mapped to 1 rather than 0.
 * NOTE(review): presumably so that a scanner stepping by UTFLEN() always
 * advances by at least one byte -- confirm with the callers of UTFLEN.
 */
unsigned char utf8_look_for_table[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0x8F: continuation bytes */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90-0x9F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xA0-0xAF */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xB0-0xBF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF: 2-byte lead */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0-0xEF: 3-byte lead */
    4, 4, 4, 4, 4, 4, 4, 4,                         /* 0xF0-0xF7: 4-byte lead */
    5, 5, 5, 5,                                     /* 0xF8-0xFB: 5-byte lead */
    6, 6,                                           /* 0xFC-0xFD: 6-byte lead */
    1, 1                                            /* 0xFE-0xFF: not a lead byte */
};

/* Sequence length for byte value x, taken from utf8_look_for_table. */
#define UTFLEN(x) utf8_look_for_table[(x)]
 
/*
 * Return the total number of bytes in the UTF-8 sequence introduced by the
 * lead byte `ch`, using the 1-to-6 byte scheme described at the top of the
 * file.
 *
 * Returns 0 when `ch` cannot start a sequence: continuation bytes
 * (0x80-0xBF, pattern 10xxxxxx) and the values 0xFE/0xFF, which match none
 * of the lead-byte patterns.
 *
 * Declared as an ordinary function rather than plain `inline`: in C99/C11 a
 * non-static, non-extern `inline` definition does not provide an external
 * definition, so any call the compiler chooses not to inline fails to link.
 */
int GetUtf8charByteNum(unsigned char ch)
{
    if (ch < 0x80) /* 0xxxxxxx */
        return 1;
    if (ch < 0xC0) /* 10xxxxxx: continuation byte, not a lead byte */
        return 0;
    if (ch < 0xE0) /* 110xxxxx */
        return 2;
    if (ch < 0xF0) /* 1110xxxx */
        return 3;
    if (ch < 0xF8) /* 11110xxx */
        return 4;
    if (ch < 0xFC) /* 111110xx */
        return 5;
    if (ch < 0xFE) /* 1111110x */
        return 6;

    return 0; /* 0xFE and 0xFF never appear in UTF-8 */
}
 
/*
 * Check whether the NUL-terminated string `str` is structurally valid UTF-8.
 *
 * Each character must start with a byte that GetUtf8charByteNum() accepts
 * as a lead byte, followed by the matching number of continuation bytes of
 * the form 10xxxxxx. A string that ends in the middle of a multi-byte
 * character is rejected.
 *
 * Returns 1 when the string passes (the empty string passes), and 0 when it
 * does not or when `str` is NULL.
 */
int IsUtf8Format(const char *str)
{
    const unsigned char *cursor;
    int pending = 0; /* bytes of the current character not yet consumed */

    if (str == NULL)
        return 0;

    for (cursor = (const unsigned char *)str; *cursor != '\0'; ++cursor)
    {
        if (pending != 0)
        {
            /* Inside a multi-byte character: only 10xxxxxx is allowed. */
            if ((*cursor & 0xC0) != 0x80)
                return 0;
        }
        else
        {
            /* Start of a character: the lead byte fixes its length. */
            pending = GetUtf8charByteNum(*cursor);
            if (pending == 0)
                return 0;
        }
        --pending;
    }

    /* Leftover expected bytes mean the last character was cut short. */
    return (pending > 0) ? 0 : 1;
}
 
/*
 * Count the characters (not bytes) in the NUL-terminated UTF-8 string `str`.
 *
 * The string is walked lead byte by lead byte, using GetUtf8charByteNum()
 * to find how many bytes each character occupies. Continuation bytes are
 * skipped without being inspected; use IsUtf8Format() for full validation.
 *
 * Returns the character count, or 0 when `str` is NULL, empty, contains a
 * byte that is not a valid lead byte where one is expected, or ends with a
 * truncated multi-byte character.
 */
int GetUtf8Length(char *str)
{
    const unsigned char *ptr;
    size_t remaining; /* bytes left before the terminating NUL */
    int count = 0;

    if (NULL == str)
        return 0;

    ptr = (const unsigned char *)str;
    remaining = strlen(str);

    while (remaining > 0)
    {
        int byteNum = GetUtf8charByteNum(*ptr);

        /*
         * Reject invalid lead bytes, and also sequences that claim more
         * bytes than the string still holds: stepping over them would move
         * the pointer past the terminator and read out of bounds.
         */
        if (byteNum <= 0 || (size_t)byteNum > remaining)
            return 0;

        ptr += byteNum;
        remaining -= (size_t)byteNum;
        count++;
    }

    return count;
}
 
/*
 * Compute how many SMS messages are charged for a text of `len` characters.
 *
 *   len <= 0        -> 0
 *   1 .. 70         -> 1
 *   71 .. 500       -> len / 67 rounded up (the text is split into
 *                      segments of 67 characters)
 *   len > 500       -> 1
 *
 * NOTE(review): returning 1 for texts longer than 500 characters is kept
 * from the original code; it looks unintended -- confirm the desired
 * behavior for over-long texts.
 */
int GetChargeNum(int len)
{
    enum
    {
        SINGLE_MAX = 70,  /* longest text billed as a single message */
        SEGMENT_LEN = 67, /* characters per segment once the text is split */
        SPLIT_MAX = 500   /* longest text handled by the split rule */
    };

    if (len <= 0)
        return 0;

    /* Ceiling division: exact multiples of 67 must not gain an extra segment. */
    if (len > SINGLE_MAX && len <= SPLIT_MAX)
        return (len + SEGMENT_LEN - 1) / SEGMENT_LEN;

    return 1;
}
 
/*
 * Command-line driver: takes the text to examine as the first argument,
 * echoes it, checks that it is UTF-8, then prints its character count and
 * the number of SMS messages it is charged as.
 *
 * Exit status follows the usual convention: EXIT_SUCCESS when all three
 * steps complete, EXIT_FAILURE when the argument is missing, the text is
 * not UTF-8, or the length / charge count comes back as 0 (which includes
 * an empty text).
 */
int main(int argc, char **argv)
{
    char *str;
    int len;
    int num;

    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <text>\n", (argc > 0) ? argv[0] : "program");
        return EXIT_FAILURE;
    }

    str = argv[1];
    printf("%s\n", str);

    if (!IsUtf8Format(str))
    {
        printf("the text is not the Format of utf8\n");
        return EXIT_FAILURE;
    }

    len = GetUtf8Length(str);
    if (len == 0)
        return EXIT_FAILURE;
    printf("the length of text: %d\n", len);

    num = GetChargeNum(len);
    if (num == 0)
        return EXIT_FAILURE;
    printf("the chargeNumber of sms: %d\n", num);

    return EXIT_SUCCESS;
}

  

 

参考:

http://blog.sina.com.cn/s/blog_62b2318d0101d7kb.html

http://www.cnblogs.com/jiu0821/p/6371544.html

posted @   翻白眼的哈士奇  阅读(4297)  评论(0)  编辑  收藏  举报
编辑推荐:
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
阅读排行:
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· 【自荐】一款简洁、开源的在线白板工具 Drawnix
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧
· 园子的第一款AI主题卫衣上架——"HELLO! HOW CAN I ASSIST YOU TODAY
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
点击右上角即可分享
微信分享提示