如何分割一个utf8字符串(保证单个汉字的完整性)

std::list<std::string> split_utf8_string(const std::string& text)
{
    std::list<std::string> temp;

    do 
    {
        if (text.length() <= 0)
            break;

        std::string::const_iterator begin = text.begin();
        std::string::const_iterator end   = text.end();
        while (begin != end)
        {
            unsigned char c = *begin;
            int n = 0;

            if ((c & 0x80) == 0)    
                n = 1;
            else if ((c & 0xE0) == 0xC0) 
                n = 2;
            else if ((c & 0xF0) == 0xE0) 
                n = 3;
            else if ((c & 0xF8) == 0xF0) 
                n = 4;
            else if ((c & 0xFC) == 0xF8) 
                n = 5;
            else if ((c & 0xFE) == 0xFC) 
                n = 6;
            else 
                break;

            if (end - begin < n) 
                break;

            std::string substring;
            substring += *begin;

            bool isError = false;
            for (int i=1; i<n; ++i)
            {
                if ((begin[i] & 0xC0) != 0x80) 
                {
                    isError = true;
                    break;
                }

                substring += begin[i];
            }

            if (isError)
                break;

            temp.push_back(substring);
            begin += n;
        }
    } 
    while (false);

    return temp;
}

 

posted @ 2013-11-04 23:07  MrBlue  阅读(938)  评论(0编辑  收藏  举报