又一个转换utf的程序

http://t1.minormatter.com/~ddunbar/clang-cov/ConvertUTF.c.html
/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
       2                 :  *
       3                 :  *                     The LLVM Compiler Infrastructure
       4                 :  *
       5                 :  * This file is distributed under the University of Illinois Open Source
       6                 :  * License. See LICENSE.TXT for details.
       7                 :  *
       8                 :  *===------------------------------------------------------------------------=*/
       9                 : /*
      10                 :  * Copyright 2001-2004 Unicode, Inc.
      11                 :  * 
      12                 :  * Disclaimer
      13                 :  * 
      14                 :  * This source code is provided as is by Unicode, Inc. No claims are
      15                 :  * made as to fitness for any particular purpose. No warranties of any
      16                 :  * kind are expressed or implied. The recipient agrees to determine
      17                 :  * applicability of information provided. If this file has been
      18                 :  * purchased on magnetic or optical media from Unicode, Inc., the
      19                 :  * sole remedy for any claim will be exchange of defective media
      20                 :  * within 90 days of receipt.
      21                 :  * 
      22                 :  * Limitations on Rights to Redistribute This Code
      23                 :  * 
      24                 :  * Unicode, Inc. hereby grants the right to freely use the information
      25                 :  * supplied in this file in the creation of products supporting the
      26                 :  * Unicode Standard, and to make copies of this file in any form
      27                 :  * for internal or external distribution as long as this notice
      28                 :  * remains attached.
      29                 :  */
      30                 : 
      31                 : /* ---------------------------------------------------------------------
      32                 : 
      33                 :     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
      34                 :     Author: Mark E. Davis, 1994.
      35                 :     Rev History: Rick McGowan, fixes & updates May 2001.
      36                 :     Sept 2001: fixed const & error conditions per
      37                 :         mods suggested by S. Parent & A. Lillich.
      38                 :     June 2002: Tim Dodd added detection and handling of incomplete
      39                 :         source sequences, enhanced error detection, added casts
      40                 :         to eliminate compiler warnings.
      41                 :     July 2003: slight mods to back out aggressive FFFE detection.
      42                 :     Jan 2004: updated switches in from-UTF8 conversions.
      43                 :     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
      44                 : 
      45                 :     See the header file "ConvertUTF.h" for complete documentation.
      46                 : 
      47                 : ------------------------------------------------------------------------ */
      48                 : 
      49                 : 
      50                 : #include "clang/Basic/ConvertUTF.h"
      51                 : #ifdef CVTUTF_DEBUG
      52                 : #include <stdio.h>
      53                 : #endif
      54                 : 
      55                 : static const int halfShift  = 10; /* used for shifting by 10 bits */
      56                 : 
      57                 : static const UTF32 halfBase = 0x0010000UL;
      58                 : static const UTF32 halfMask = 0x3FFUL;
      59                 : 
      60                 : #define UNI_SUR_HIGH_START  (UTF32)0xD800
      61                 : #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
      62                 : #define UNI_SUR_LOW_START   (UTF32)0xDC00
      63                 : #define UNI_SUR_LOW_END     (UTF32)0xDFFF
      64                 : #define false      0
      65                 : #define true        1
      66                 : 
      67                 : /* --------------------------------------------------------------------- */
      68                 : 
      69                 : /*
      70                 :  * Index into the table below with the first byte of a UTF-8 sequence to
      71                 :  * get the number of trailing bytes that are supposed to follow it.
      72                 :  * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
      73                 :  * left as-is for anyone who may want to do such conversion, which was
      74                 :  * allowed in earlier algorithms.
      75                 :  */
      76                 : static const char trailingBytesForUTF8[256] = {
      77                 :     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      78                 :     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      79                 :     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      80                 :     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      81                 :     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      82                 :     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      83                 :     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      84                 :     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
      85                 : };
      86                 : 
      87                 : /*
      88                 :  * Magic values subtracted from a buffer value during UTF8 conversion.
      89                 :  * This table contains as many values as there might be trailing bytes
      90                 :  * in a UTF-8 sequence.
      91                 :  */
      92                 : static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 
      93                 :                      0x03C82080UL, 0xFA082080UL, 0x82082080UL };
      94                 : 
      95                 : /*
      96                 :  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
      97                 :  * into the first byte, depending on how many bytes follow.  There are
      98                 :  * as many entries in this table as there are UTF-8 sequence types.
      99                 :  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
     100                 :  * for *legal* UTF-8 will be 4 or fewer bytes total.
     101                 :  */
     102                 : static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
     103                 : 
     104                 : /* --------------------------------------------------------------------- */
     105                 : 
     106                 : /* The interface converts a whole buffer to avoid function-call overhead.
     107                 :  * Constants have been gathered. Loops & conditionals have been removed as
     108                 :  * much as possible for efficiency, in favor of drop-through switches.
     109                 :  * (See "Note A" at the bottom of the file for equivalent code.)
     110                 :  * If your compiler supports it, the "isLegalUTF8" call can be turned
     111                 :  * into an inline function.
     112                 :  */
     113                 : 
     114                 : #ifdef CLANG_NEEDS_THESE_ONE_DAY
     115                 : 
     116                 : /* --------------------------------------------------------------------- */
     117                 : 
     118                 : ConversionResult ConvertUTF32toUTF16 (
     119                 :         const UTF32** sourceStart, const UTF32* sourceEnd, 
     120                 :         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
     121                 :     ConversionResult result = conversionOK;
     122                 :     const UTF32* source = *sourceStart;
     123                 :     UTF16* target = *targetStart;
     124                 :     while (source < sourceEnd) {
     125                 :         UTF32 ch;
     126                 :         if (target >= targetEnd) {
     127                 :             result = targetExhausted; break;
     128                 :         }
     129                 :         ch = *source++;
     130                 :         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
     131                 :             /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
     132                 :             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     133                 :                 if (flags == strictConversion) {
     134                 :                     --source; /* return to the illegal value itself */
     135                 :                     result = sourceIllegal;
     136                 :                     break;
     137                 :                 } else {
     138                 :                     *target++ = UNI_REPLACEMENT_CHAR;
     139                 :                 }
     140                 :             } else {
     141                 :                 *target++ = (UTF16)ch; /* normal case */
     142                 :             }
     143                 :         } else if (ch > UNI_MAX_LEGAL_UTF32) {
     144                 :             if (flags == strictConversion) {
     145                 :                 result = sourceIllegal;
     146                 :             } else {
     147                 :                 *target++ = UNI_REPLACEMENT_CHAR;
     148                 :             }
     149                 :         } else {
     150                 :             /* target is a character in range 0xFFFF - 0x10FFFF. */
     151                 :             if (target + 1 >= targetEnd) {
     152                 :                 --source; /* Back up source pointer! */
     153                 :                 result = targetExhausted; break;
     154                 :             }
     155                 :             ch -= halfBase;
     156                 :             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
     157                 :             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
     158                 :         }
     159                 :     }
     160                 :     *sourceStart = source;
     161                 :     *targetStart = target;
     162                 :     return result;
     163                 : }
     164                 : 
     165                 : /* --------------------------------------------------------------------- */
     166                 : 
     167                 : ConversionResult ConvertUTF16toUTF32 (
     168                 :         const UTF16** sourceStart, const UTF16* sourceEnd, 
     169                 :         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
     170                 :     ConversionResult result = conversionOK;
     171                 :     const UTF16* source = *sourceStart;
     172                 :     UTF32* target = *targetStart;
     173                 :     UTF32 ch, ch2;
     174                 :     while (source < sourceEnd) {
     175                 :         const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
     176                 :         ch = *source++;
     177                 :         /* If we have a surrogate pair, convert to UTF32 first. */
     178                 :         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
     179                 :             /* If the 16 bits following the high surrogate are in the source buffer... */
     180                 :             if (source < sourceEnd) {
     181                 :                 ch2 = *source;
     182                 :                 /* If it's a low surrogate, convert to UTF32. */
     183                 :                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
     184                 :                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
     185                 :                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
     186                 :                     ++source;
     187                 :                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
     188                 :                     --source; /* return to the illegal value itself */
     189                 :                     result = sourceIllegal;
     190                 :                     break;
     191                 :                 }
     192                 :             } else { /* We don't have the 16 bits following the high surrogate. */
     193                 :                 --source; /* return to the high surrogate */
     194                 :                 result = sourceExhausted;
     195                 :                 break;
     196                 :             }
     197                 :         } else if (flags == strictConversion) {
     198                 :             /* UTF-16 surrogate values are illegal in UTF-32 */
     199                 :             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
     200                 :                 --source; /* return to the illegal value itself */
     201                 :                 result = sourceIllegal;
     202                 :                 break;
     203                 :             }
     204                 :         }
     205                 :         if (target >= targetEnd) {
     206                 :             source = oldSource; /* Back up source pointer! */
     207                 :             result = targetExhausted; break;
     208                 :         }
     209                 :         *target++ = ch;
     210                 :     }
     211                 :     *sourceStart = source;
     212                 :     *targetStart = target;
     213                 : #ifdef CVTUTF_DEBUG
     214                 : if (result == sourceIllegal) {
     215                 :     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
     216                 :     fflush(stderr);
     217                 : }
     218                 : #endif
     219                 :     return result;
     220                 : }
     221                 : ConversionResult ConvertUTF16toUTF8 (
     222                 :         const UTF16** sourceStart, const UTF16* sourceEnd, 
     223                 :         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
     224                 :     ConversionResult result = conversionOK;
     225                 :     const UTF16* source = *sourceStart;
     226                 :     UTF8* target = *targetStart;
     227                 :     while (source < sourceEnd) {
     228                 :         UTF32 ch;
     229                 :         unsigned short bytesToWrite = 0;
     230                 :         const UTF32 byteMask = 0xBF;
     231                 :         const UTF32 byteMark = 0x80; 
     232                 :         const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
     233                 :         ch = *source++;
     234                 :         /* If we have a surrogate pair, convert to UTF32 first. */
     235                 :         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
     236                 :             /* If the 16 bits following the high surrogate are in the source buffer... */
     237                 :             if (source < sourceEnd) {
     238                 :                 UTF32 ch2 = *source;
     239                 :                 /* If it's a low surrogate, convert to UTF32. */
     240                 :                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
     241                 :                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
     242                 :                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
     243                 :                     ++source;
     244                 :                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
     245                 :                     --source; /* return to the illegal value itself */
     246                 :                     result = sourceIllegal;
     247                 :                     break;
     248                 :                 }
     249                 :             } else { /* We don't have the 16 bits following the high surrogate. */
     250                 :                 --source; /* return to the high surrogate */
     251                 :                 result = sourceExhausted;
     252                 :                 break;
     253                 :             }
     254                 :         } else if (flags == strictConversion) {
     255                 :             /* UTF-16 surrogate values are illegal in UTF-32 */
     256                 :             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
     257                 :                 --source; /* return to the illegal value itself */
     258                 :                 result = sourceIllegal;
     259                 :                 break;
     260                 :             }
     261                 :         }
     262                 :         /* Figure out how many bytes the result will require */
     263                 :         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
     264                 :         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
     265                 :         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
     266                 :         } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
     267                 :         } else {                            bytesToWrite = 3;
     268                 :                                             ch = UNI_REPLACEMENT_CHAR;
     269                 :         }
     270                 : 
     271                 :         target += bytesToWrite;
     272                 :         if (target > targetEnd) {
     273                 :             source = oldSource; /* Back up source pointer! */
     274                 :             target -= bytesToWrite; result = targetExhausted; break;
     275                 :         }
     276                 :         switch (bytesToWrite) { /* note: everything falls through. */
     277                 :             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     278                 :             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     279                 :             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     280                 :             case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
     281                 :         }
     282                 :         target += bytesToWrite;
     283                 :     }
     284                 :     *sourceStart = source;
     285                 :     *targetStart = target;
     286                 :     return result;
     287                 : }
     288                 : 
     289                 : /* --------------------------------------------------------------------- */
     290                 : 
     291                 : ConversionResult ConvertUTF32toUTF8 (
     292                 :         const UTF32** sourceStart, const UTF32* sourceEnd, 
     293                 :         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
     294                 :     ConversionResult result = conversionOK;
     295                 :     const UTF32* source = *sourceStart;
     296                 :     UTF8* target = *targetStart;
     297                 :     while (source < sourceEnd) {
     298                 :         UTF32 ch;
     299                 :         unsigned short bytesToWrite = 0;
     300                 :         const UTF32 byteMask = 0xBF;
     301                 :         const UTF32 byteMark = 0x80; 
     302                 :         ch = *source++;
     303                 :         if (flags == strictConversion ) {
     304                 :             /* UTF-16 surrogate values are illegal in UTF-32 */
     305                 :             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     306                 :                 --source; /* return to the illegal value itself */
     307                 :                 result = sourceIllegal;
     308                 :                 break;
     309                 :             }
     310                 :         }
     311                 :         /*
     312                 :          * Figure out how many bytes the result will require. Turn any
     313                 :          * illegally large UTF32 things (> Plane 17) into replacement chars.
     314                 :          */
     315                 :         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
     316                 :         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
     317                 :         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
     318                 :         } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
     319                 :         } else {                            bytesToWrite = 3;
     320                 :                                             ch = UNI_REPLACEMENT_CHAR;
     321                 :                                             result = sourceIllegal;
     322                 :         }
     323                 :         
     324                 :         target += bytesToWrite;
     325                 :         if (target > targetEnd) {
     326                 :             --source; /* Back up source pointer! */
     327                 :             target -= bytesToWrite; result = targetExhausted; break;
     328                 :         }
     329                 :         switch (bytesToWrite) { /* note: everything falls through. */
     330                 :             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     331                 :             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     332                 :             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     333                 :             case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
     334                 :         }
     335                 :         target += bytesToWrite;
     336                 :     }
     337                 :     *sourceStart = source;
     338                 :     *targetStart = target;
     339                 :     return result;
     340                 : }
     341                 : 
     342                 : /* --------------------------------------------------------------------- */
     343                 : 
     344                 : ConversionResult ConvertUTF8toUTF32 (
     345                 :         const UTF8** sourceStart, const UTF8* sourceEnd, 
     346                 :         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
     347                 :     ConversionResult result = conversionOK;
     348                 :     const UTF8* source = *sourceStart;
     349                 :     UTF32* target = *targetStart;
     350                 :     while (source < sourceEnd) {
     351                 :         UTF32 ch = 0;
     352                 :         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
     353                 :         if (source + extraBytesToRead >= sourceEnd) {
     354                 :             result = sourceExhausted; break;
     355                 :         }
     356                 :         /* Do this check whether lenient or strict */
     357                 :         if (!isLegalUTF8(source, extraBytesToRead+1)) {
     358                 :             result = sourceIllegal;
     359                 :             break;
     360                 :         }
     361                 :         /*
     362                 :          * The cases all fall through. See "Note A" below.
     363                 :          */
     364                 :         switch (extraBytesToRead) {
     365                 :             case 5: ch += *source++; ch <<= 6;
     366                 :             case 4: ch += *source++; ch <<= 6;
     367                 :             case 3: ch += *source++; ch <<= 6;
     368                 :             case 2: ch += *source++; ch <<= 6;
     369                 :             case 1: ch += *source++; ch <<= 6;
     370                 :             case 0: ch += *source++;
     371                 :         }
     372                 :         ch -= offsetsFromUTF8[extraBytesToRead];
     373                 : 
     374                 :         if (target >= targetEnd) {
     375                 :             source -= (extraBytesToRead+1); /* Back up the source pointer! */
     376                 :             result = targetExhausted; break;
     377                 :         }
     378                 :         if (ch <= UNI_MAX_LEGAL_UTF32) {
     379                 :             /*
     380                 :              * UTF-16 surrogate values are illegal in UTF-32, and anything
     381                 :              * over Plane 17 (> 0x10FFFF) is illegal.
     382                 :              */
     383                 :             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     384                 :                 if (flags == strictConversion) {
     385                 :                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
     386                 :                     result = sourceIllegal;
     387                 :                     break;
     388                 :                 } else {
     389                 :                     *target++ = UNI_REPLACEMENT_CHAR;
     390                 :                 }
     391                 :             } else {
     392                 :                 *target++ = ch;
     393                 :             }
     394                 :         } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
     395                 :             result = sourceIllegal;
     396                 :             *target++ = UNI_REPLACEMENT_CHAR;
     397                 :         }
     398                 :     }
     399                 :     *sourceStart = source;
     400                 :     *targetStart = target;
     401                 :     return result;
     402                 : }
     403                 : #endif
     404                 : 
     405                 : /* --------------------------------------------------------------------- */
     406                 : 
     407                 : /*
     408                 :  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
     409                 :  * This must be called with the length pre-determined by the first byte.
     410                 :  * If not calling this from ConvertUTF8to*, then the length can be set by:
     411                 :  *  length = trailingBytesForUTF8[*source]+1;
     412                 :  * and the sequence is illegal right away if there aren't that many bytes
     413                 :  * available.
     414                 :  * If presented with a length > 4, this returns false.  The Unicode
     415                 :  * definition of UTF-8 goes up to 4-byte sequences.
     416                 :  */
     417                 : 
     418               44: static Boolean isLegalUTF8(const UTF8 *source, int length) {
     419                 :     UTF8 a;
     420               44:     const UTF8 *srcptr = source+length;
                0: branch 0 not taken
                0: branch 1 not taken
                8: branch 2 taken
                0: branch 3 not taken
               36: branch 4 taken
     421               44:     switch (length) {
     422                0:     default: return false;
     423                 :         /* Everything else falls through when "true"... */
                0: branch 0 not taken
                0: branch 1 not taken
                0: branch 2 not taken
                0: branch 3 not taken
     424                0:     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
                8: branch 0 taken
                0: branch 1 not taken
                0: branch 2 not taken
                8: branch 3 taken
     425                8:     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
                0: branch 0 not taken
                8: branch 1 taken
     426                8:     case 2: if ((a = (*--srcptr)) > 0xBF) return false;
     427                 : 
                0: branch 0 not taken
                0: branch 1 not taken
                0: branch 2 not taken
                0: branch 3 not taken
                8: branch 4 taken
     428                8:         switch (*source) {
     429                 :             /* no fall-through in this inner switch */
                0: branch 0 not taken
                0: branch 1 not taken
     430                0:             case 0xE0: if (a < 0xA0) return false; break;
                0: branch 0 not taken
                0: branch 1 not taken
     431                0:             case 0xED: if (a > 0x9F) return false; break;
                0: branch 0 not taken
                0: branch 1 not taken
     432                0:             case 0xF0: if (a < 0x90) return false; break;
                0: branch 0 not taken
                0: branch 1 not taken
     433                0:             case 0xF4: if (a > 0x8F) return false; break;
                0: branch 0 not taken
                8: branch 1 taken
     434                8:             default:   if (a < 0x80) return false;
     435                 :         }
     436                 : 
                8: branch 0 taken
               36: branch 1 taken
                0: branch 2 not taken
                8: branch 3 taken
     437               44:     case 1: if (*source >= 0x80 && *source < 0xC2) return false;
     438                 :     }
                0: branch 0 not taken
               44: branch 1 taken
     439               44:     if (*source > 0xF4) return false;
     440               44:     return true;
     441                 : }
     442                 : 
     443                 : /* --------------------------------------------------------------------- */
     444                 : 
     445                 : /*
     446                 :  * Exported function to return whether a UTF-8 sequence is legal or not.
     447                 :  * This is not used here; it's just exported.
     448                 :  */
     449                0: Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
     450                0:     int length = trailingBytesForUTF8[*source]+1;
                0: branch 0 not taken
                0: branch 1 not taken
     451                0:     if (source+length > sourceEnd) {
     452                0:         return false;
     453                 :     }
     454                0:     return isLegalUTF8(source, length);
     455                 : }
     456                 : 
     457                 : /* --------------------------------------------------------------------- */
     458                 : 
     459                 : ConversionResult ConvertUTF8toUTF16 (
     460                 :         const UTF8** sourceStart, const UTF8* sourceEnd, 
     461                4:         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
     462                4:     ConversionResult result = conversionOK;
     463                4:     const UTF8* source = *sourceStart;
     464                4:     UTF16* target = *targetStart;
               44: branch 0 taken
                4: branch 1 taken
     465               52:     while (source < sourceEnd) {
     466               44:         UTF32 ch = 0;
     467               44:         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
                0: branch 0 not taken
               44: branch 1 taken
     468               44:         if (source + extraBytesToRead >= sourceEnd) {
     469                0:             result = sourceExhausted; break;
     470                 :         }
     471                 :         /* Do this check whether lenient or strict */
                0: branch 1 not taken
               44: branch 2 taken
     472               44:         if (!isLegalUTF8(source, extraBytesToRead+1)) {
     473                0:             result = sourceIllegal;
     474                0:             break;
     475                 :         }
     476                 :         /*
     477                 :          * The cases all fall through. See "Note A" below.
     478                 :          */
                0: branch 0 not taken
                0: branch 1 not taken
                0: branch 2 not taken
                8: branch 3 taken
                0: branch 4 not taken
               36: branch 5 taken
                0: branch 6 not taken
     479               44:         switch (extraBytesToRead) {
     480                0:             case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
     481                0:             case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
     482                0:             case 3: ch += *source++; ch <<= 6;
     483                8:             case 2: ch += *source++; ch <<= 6;
     484                8:             case 1: ch += *source++; ch <<= 6;
     485               44:             case 0: ch += *source++;
     486                 :         }
     487               44:         ch -= offsetsFromUTF8[extraBytesToRead];
     488                 : 
                0: branch 0 not taken
               44: branch 1 taken
     489               44:         if (target >= targetEnd) {
     490                0:             source -= (extraBytesToRead+1); /* Back up source pointer! */
     491                0:             result = targetExhausted; break;
     492                 :         }
               44: branch 0 taken
                0: branch 1 not taken
     493               44:         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
     494                 :             /* UTF-16 surrogate values are illegal in UTF-32 */
                0: branch 0 not taken
               44: branch 1 taken
               44: branch 2 taken
               44: branch 3 taken
     495               44:             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                0: branch 0 not taken
                0: branch 1 not taken
     496                0:                 if (flags == strictConversion) {
     497                0:                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
     498                0:                     result = sourceIllegal;
     499                0:                     break;
     500                 :                 } else {
     501                0:                     *target++ = UNI_REPLACEMENT_CHAR;
     502                 :                 }
     503                 :             } else {
     504               44:                 *target++ = (UTF16)ch; /* normal case */
     505                 :             }
                0: branch 0 not taken
                0: branch 1 not taken
     506                0:         } else if (ch > UNI_MAX_UTF16) {
                0: branch 0 not taken
                0: branch 1 not taken
     507                0:             if (flags == strictConversion) {
     508                0:                 result = sourceIllegal;
     509                0:                 source -= (extraBytesToRead+1); /* return to the start */
     510                0:                 break; /* Bail out; shouldn't continue */
     511                 :             } else {
     512                0:                 *target++ = UNI_REPLACEMENT_CHAR;
     513                 :             }
     514                 :         } else {
     515                 :             /* target is a character in range 0xFFFF - 0x10FFFF. */
                0: branch 0 not taken
                0: branch 1 not taken
     516                0:             if (target + 1 >= targetEnd) {
     517                0:                 source -= (extraBytesToRead+1); /* Back up source pointer! */
     518                0:                 result = targetExhausted; break;
     519                 :             }
     520                0:             ch -= halfBase;
     521                0:             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
     522                0:             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
     523                 :         }
     524                 :     }
     525                4:     *sourceStart = source;
     526                4:     *targetStart = target;
     527                4:     return result;
     528                 : }
     529                 : 
     530                 : /* ---------------------------------------------------------------------
     531                 : 
     532                 :     Note A.
     533                 :     The fall-through switches in UTF-8 reading code save a
     534                 :     temp variable, some decrements & conditionals.  The switches
     535                 :     are equivalent to the following loop:
     536                 :         {
     537                 :             int tmpBytesToRead = extraBytesToRead+1;
     538                 :             do {
     539                 :                 ch += *source++;
     540                 :                 --tmpBytesToRead;
     541                 :                 if (tmpBytesToRead) ch <<= 6;
     542                 :             } while (tmpBytesToRead > 0);
     543                 :         }
     544                 :     In UTF-8 writing code, the switches on "bytesToWrite" are
     545                 :     similarly unrolled loops.
     546                 : 
     547                 :    --------------------------------------------------------------------- */
posted on 2010-03-23 12:10 老K 阅读(379) 评论(0) 编辑收藏举报
会员力量，点亮园子希望
刷新页面返回顶部
老K

又一个转换utf的程序

导航

公告