WTF String

1.KURL:

its content is exposed as the String type in WTF;

we use 

  1. KURL.string().utf8().data();  


to get the char* from the KURL.

How does this work?

    a. KURL is defined in WebCore/Platform/KURL.h; it has a member function string().

    b. KURL.string() returns a String, which is defined in wtf/text/WTFString.h; it has a member function utf8().

    c. String.utf8() returns a CString, which is defined in wtf/text/CString.h; it has a member function data().

    d. CString.data() returns a char*, which can be printed directly.

 

to be continued

2. String in WTF
a. StringImplBase 
defined in "WTF/text/StringImplBase.h"
it has no concrete implementation
b.StringImpl
defined in "WTF/text/StringImpl.h"
StringImplBase<--StringImpl
it holds a UChar member (wchar_t) // 16 or 32 bit.
c.String
defined in "WTF/text/WTFString.h"
String holds a "StringImpl" member.
It has functions that return its data as a CString:
  1. CString ascii() const;  
  2. CString latin1() const;  
  3. CString utf8(bool strict = false) const;  
d. CString
is defined in "WTF/text/CString.h"
it has a function "data()" that returns a "char*".

How is UChar converted to char*?
The most important function is String.utf8() in "WTF/text/WTFString.h",
which uses "convertUTF16ToUTF8" (so UChar is 16 bit),
which is defined in "WTF/unicode/utf8.cpp":
  1. ConversionResult convertUTF16ToUTF8(  
  2.     const UChar** sourceStart, const UChar* sourceEnd,   
  3.     char** targetStart, char* targetEnd, bool strict)  
  4. {  
  5.     ConversionResult result = conversionOK;  
  6.     const UChar* source = *sourceStart;  
  7.     char* target = *targetStart;  
  8.     while (source < sourceEnd) {  
  9.         UChar32 ch;  
  10.         unsigned short bytesToWrite = 0;  
  11.         const UChar32 byteMask = 0xBF;  
  12.         const UChar32 byteMark = 0x80;   
  13.         const UChar* oldSource = source; // In case we have to back up because of  
  14.         //target overflow.  
  15.         ch = static_cast<unsigned short>(*source++);  
  16.         // If we have a surrogate pair, convert to UChar32 first.  
  17.         if (ch >= 0xD800 && ch <= 0xDBFF) {  
  18.             // If the 16 bits following the high surrogate are in the source buffer...  
  19.             if (source < sourceEnd) {  
  20.                 UChar32 ch2 = static_cast<unsigned short>(*source);  
  21.                 // If it's a low surrogate, convert to UChar32.  
  22.                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {  
  23.                     ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;  
  24.                     ++source;  
  25.                 } else if (strict) { // it's an unpaired high surrogate  
  26.                     --source; // return to the illegal value itself  
  27.                     result = sourceIllegal;  
  28.                     break;  
  29.                 }  
  30.             } else { // We don't have the 16 bits following the high surrogate.  
  31.                 --source; // return to the high surrogate  
  32.                 result = sourceExhausted;  
  33.                 break;  
  34.             }  
  35.         } else if (strict) {  
  36.             // UTF-16 surrogate values are illegal in UTF-32  
  37.             if (ch >= 0xDC00 && ch <= 0xDFFF) {  
  38.                 --source; // return to the illegal value itself  
  39.                 result = sourceIllegal;  
  40.                 break;  
  41.             }  
  42.         }  
  43.         // Figure out how many bytes the result will require  
  44.         if (ch < (UChar32)0x80) {  
  45.             bytesToWrite = 1;  
  46.         } else if (ch < (UChar32)0x800) {  
  47.             bytesToWrite = 2;  
  48.         } else if (ch < (UChar32)0x10000) {  
  49.             bytesToWrite = 3;  
  50.         } else if (ch < (UChar32)0x110000) {  
  51.             bytesToWrite = 4;  
  52.         } else {  
  53.             bytesToWrite = 3;  
  54.             ch = 0xFFFD;  
  55.         }  
  56.   
  57.         target += bytesToWrite;  
  58.         if (target > targetEnd) {  
  59.             source = oldSource; // Back up source pointer!  
  60.             target -= bytesToWrite;  
  61.             result = targetExhausted;  
  62.             break;  
  63.         }  
  64.         switch (bytesToWrite) { // note: everything falls through.  
  65.             case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;  
  66.             case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;  
  67.             case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;  
  68.             case 1: *--target =  (char)(ch | firstByteMark[bytesToWrite]);  
  69.         }  
  70.         target += bytesToWrite;  
  71.     }  
  72.     *sourceStart = source;  
  73.     *targetStart = target;  
  74.     return result;  
  75. }  

So the most common type used in WebCore is "UChar",
the most common type we use for printing is "char*", and
the most common conversion path is "UChar -> utf8 -> char*".


definition
  1. ConversionResult convertUTF8ToUTF16(  
  2.                 const char** sourceStart, const char* sourceEnd,   
  3.                 UChar** targetStart, UChar* targetEnd, bool strict = true);  
  4.   
  5. ConversionResult convertUTF16ToUTF8(  
  6.                 const UChar** sourceStart, const UChar* sourceEnd,   
  7.                 char** targetStart, char* targetEnd, bool strict = true);  
example:
  1. ConversionResult result = convertUTF16ToUTF8(&characters, characters + length,   
  2.     &buffer, buffer + bufferVector.size(), strict);  
posted @ 2012-05-30 23:51  cascais  阅读(843)  评论(0)  编辑  收藏  举报