WTF String

1.KURL:

KURL exposes its URL as a WTF String;

we use

[html] view plaincopy
KURL.string().utf8().data();  

to get the char* held by the KURL.

How does this chain of calls work?

a. KURL is defined in WebCore/Platform/KURL.h; it has a member function string().

b. KURL.string() returns a String, a class defined in wtf/text/WTFString.h; it has a member function utf8().

c. String.utf8() returns a CString, a class defined in wtf/text/CString.h; it has a member function data().

d. CString.data() returns a char*, which can be printed directly.

to be continued

2. String in WTF
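a. StringImplBase
defined in "WTF/text/StringImplBase.h"
it has no concrete implementation of its own
b. StringImpl
defined in "WTF/text/StringImpl.h"
StringImpl derives from StringImplBase (StringImplBase <-- StringImpl)
it holds its characters as UChar, a 16-bit code unit, as the use of convertUTF16ToUTF8 below shows (wchar_t, by contrast, is 16 or 32 bits depending on the platform)
c. String
defined in "WTF/text/WTFString.h"
String holds a "StringImpl" member.
It has functions that return its data as a CString: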

[cpp] view plaincopy
// Member functions of String that return the string's data as a CString.
// NOTE(review): the names suggest the target encoding of each result —
// confirm against WTFString.h.
CString ascii() const;  
CString latin1() const;  
// utf8() is the conversion traced in the rest of this note; strict defaults to false here.
CString utf8(bool strict = false) const;  

d. CString
defined in "WTF/text/CString.h"
it has a function "data()" that returns a "char*".

How does UChar data become a char*?
The most important function is String.utf8() in "WTF/text/WTFString.h",
which uses "convertUTF16ToUTF8" (so UChar is 16 bits wide).
convertUTF16ToUTF8 is defined in "WTF/unicode/utf8.cpp":

[cpp] view plaincopy
ConversionResult convertUTF16ToUTF8(  
    const UChar** sourceStart, const UChar* sourceEnd,   
    char** targetStart, char* targetEnd, bool strict)  
{  
    ConversionResult result = conversionOK;  
    const UChar* source = *sourceStart;  
    char* target = *targetStart;  
    while (source < sourceEnd) {  
        UChar32 ch;  
        unsigned short bytesToWrite = 0;  
        const UChar32 byteMask = 0xBF;  
        const UChar32 byteMark = 0x80;   
        const UChar* oldSource = source; // In case we have to back up because of  
        //target overflow.  
        ch = static_cast<unsigned short>(*source++);  
        // If we have a surrogate pair, convert to UChar32 first.  
        if (ch >= 0xD800 && ch <= 0xDBFF) {  
            // If the 16 bits following the high surrogate are in the source buffer...  
            if (source < sourceEnd) {  
                UChar32 ch2 = static_cast<unsigned short>(*source);  
                // If it's a low surrogate, convert to UChar32.  
                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {  
                    ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;  
                    ++source;  
                } else if (strict) { // it's an unpaired high surrogate  
                    --source; // return to the illegal value itself  
                    result = sourceIllegal;  
                    break;  
                }  
            } else { // We don't have the 16 bits following the high surrogate.  
                --source; // return to the high surrogate  
                result = sourceExhausted;  
                break;  
            }  
        } else if (strict) {  
            // UTF-16 surrogate values are illegal in UTF-32  
            if (ch >= 0xDC00 && ch <= 0xDFFF) {  
                --source; // return to the illegal value itself  
                result = sourceIllegal;  
                break;  
            }  
        }  
        // Figure out how many bytes the result will require  
        if (ch < (UChar32)0x80) {  
            bytesToWrite = 1;  
        } else if (ch < (UChar32)0x800) {  
            bytesToWrite = 2;  
        } else if (ch < (UChar32)0x10000) {  
            bytesToWrite = 3;  
        } else if (ch < (UChar32)0x110000) {  
            bytesToWrite = 4;  
        } else {  
            bytesToWrite = 3;  
            ch = 0xFFFD;  
        }  
  
        target += bytesToWrite;  
        if (target > targetEnd) {  
            source = oldSource; // Back up source pointer!  
            target -= bytesToWrite;  
            result = targetExhausted;  
            break;  
        }  
        switch (bytesToWrite) { // note: everything falls through.  
            case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;  
            case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;  
            case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;  
            case 1: *--target =  (char)(ch | firstByteMark[bytesToWrite]);  
        }  
        target += bytesToWrite;  
    }  
    *sourceStart = source;  
    *targetStart = target;  
    return result;  
}  

So the most common character type used in WebCore is "UChar",
the most common type we use for printing is "char*",
and the most common conversion path is "UChar -> UTF-8 -> char*".

Declarations:

[cpp] view plaincopy
// UTF-8 -> UTF-16. Only the declaration is shown here.
// NOTE(review): the parallel signature suggests it mirrors convertUTF16ToUTF8
// (advances *sourceStart / *targetStart in place) — confirm against the definition.
ConversionResult convertUTF8ToUTF16(  
                const char** sourceStart, const char* sourceEnd,   
                UChar** targetStart, UChar* targetEnd, bool strict = true);  
  
// UTF-16 -> UTF-8. Reads code units from [*sourceStart, sourceEnd) and writes
// bytes into [*targetStart, targetEnd); both start pointers are advanced in
// place and the outcome is reported through the returned ConversionResult.
// strict defaults to true in both declarations.
ConversionResult convertUTF16ToUTF8(  
                const UChar** sourceStart, const UChar* sourceEnd,   
                char** targetStart, char* targetEnd, bool strict = true);  

example:

[cpp] view plaincopy
// Converts `length` UChars starting at `characters` into `buffer`, whose
// capacity is bufferVector.size() bytes. `characters` and `buffer` are passed
// by address, so after the call they point past the input consumed and the
// output written; check `result` to see whether the conversion completed.
ConversionResult result = convertUTF16ToUTF8(&characters, characters + length,   
    &buffer, buffer + bufferVector.size(), strict);  

posted @ 2012-05-30 23:51 cascais 阅读(843) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

cascais

WTF String

公告