Home | History | Annotate | Download | only in src
      1 /*
      2  * Copyright 2001-2004 Unicode, Inc.
      3  *
      4  * Disclaimer
      5  *
      6  * This source code is provided as is by Unicode, Inc. No claims are
      7  * made as to fitness for any particular purpose. No warranties of any
      8  * kind are expressed or implied. The recipient agrees to determine
      9  * applicability of information provided. If this file has been
     10  * purchased on magnetic or optical media from Unicode, Inc., the
     11  * sole remedy for any claim will be exchange of defective media
     12  * within 90 days of receipt.
     13  *
     14  * Limitations on Rights to Redistribute This Code
     15  *
     16  * Unicode, Inc. hereby grants the right to freely use the information
     17  * supplied in this file in the creation of products supporting the
     18  * Unicode Standard, and to make copies of this file in any form
     19  * for internal or external distribution as long as this notice
     20  * remains attached.
     21  */
     22 
     23 /* ---------------------------------------------------------------------
     24 
     25     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
     26     Author: Mark E. Davis, 1994.
     27     Rev History: Rick McGowan, fixes & updates May 2001.
     28     Sept 2001: fixed const & error conditions per
     29 	mods suggested by S. Parent & A. Lillich.
     30     June 2002: Tim Dodd added detection and handling of incomplete
     31 	source sequences, enhanced error detection, added casts
     32 	to eliminate compiler warnings.
     33     July 2003: slight mods to back out aggressive FFFE detection.
     34     Jan 2004: updated switches in from-UTF8 conversions.
     35     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
     36 
     37     See the header file "ConvertUTF.h" for complete documentation.
     38 
     39 ------------------------------------------------------------------------ */
     40 
     41 
     42 #include "antlr3convertutf.h"
     43 
     44 #ifdef CVTUTF_DEBUG
     45 #include <stdio.h>
     46 #endif
     47 
     48 
     49 
     50 /* --------------------------------------------------------------------- */
     51 
     52 ConversionResult ConvertUTF32toUTF16 (
     53 	const UTF32** sourceStart, const UTF32* sourceEnd,
     54 	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
     55     ConversionResult result = conversionOK;
     56     const UTF32* source = *sourceStart;
     57     UTF16* target = *targetStart;
     58     while (source < sourceEnd) {
     59 	UTF32 ch;
     60 	if (target >= targetEnd) {
     61 	    result = targetExhausted; break;
     62 	}
     63 	ch = *source++;
     64 	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
     65 	    /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
     66 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     67 		if (flags == strictConversion) {
     68 		    --source; /* return to the illegal value itself */
     69 		    result = sourceIllegal;
     70 		    break;
     71 		} else {
     72 		    *target++ = UNI_REPLACEMENT_CHAR;
     73 		}
     74 	    } else {
     75 		*target++ = (UTF16)ch; /* normal case */
     76 	    }
     77 	} else if (ch > UNI_MAX_LEGAL_UTF32) {
     78 	    if (flags == strictConversion) {
     79 		result = sourceIllegal;
     80 	    } else {
     81 		*target++ = UNI_REPLACEMENT_CHAR;
     82 	    }
     83 	} else {
     84 	    /* target is a character in range 0xFFFF - 0x10FFFF. */
     85 	    if (target + 1 >= targetEnd) {
     86 		--source; /* Back up source pointer! */
     87 		result = targetExhausted; break;
     88 	    }
     89 	    ch -= halfBase;
     90 	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
     91 	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
     92 	}
     93     }
     94     *sourceStart = source;
     95     *targetStart = target;
     96     return result;
     97 }
     98 
     99 /* --------------------------------------------------------------------- */
    100 
    101 ConversionResult ConvertUTF16toUTF32 (
    102 	const UTF16** sourceStart, const UTF16* sourceEnd,
    103 	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    104     ConversionResult result = conversionOK;
    105     const UTF16* source = *sourceStart;
    106     UTF32* target = *targetStart;
    107     UTF32 ch, ch2;
    108     while (source < sourceEnd) {
    109 	const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
    110 	ch = *source++;
    111 	/* If we have a surrogate pair, convert to UTF32 first. */
    112 	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
    113 	    /* If the 16 bits following the high surrogate are in the source buffer... */
    114 	    if (source < sourceEnd) {
    115 		ch2 = *source;
    116 		/* If it's a low surrogate, convert to UTF32. */
    117 		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
    118 		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
    119 			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
    120 		    ++source;
    121 		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
    122 		    --source; /* return to the illegal value itself */
    123 		    result = sourceIllegal;
    124 		    break;
    125 		}
    126 	    } else { /* We don't have the 16 bits following the high surrogate. */
    127 		--source; /* return to the high surrogate */
    128 		result = sourceExhausted;
    129 		break;
    130 	    }
    131 	} else if (flags == strictConversion) {
    132 	    /* UTF-16 surrogate values are illegal in UTF-32 */
    133 	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
    134 		--source; /* return to the illegal value itself */
    135 		result = sourceIllegal;
    136 		break;
    137 	    }
    138 	}
    139 	if (target >= targetEnd) {
    140 	    source = oldSource; /* Back up source pointer! */
    141 	    result = targetExhausted; break;
    142 	}
    143 	*target++ = ch;
    144     }
    145     *sourceStart = source;
    146     *targetStart = target;
    147 #ifdef CVTUTF_DEBUG
    148 if (result == sourceIllegal) {
    149     ANTLR3_FPRINTF(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
    150     fflush(stderr);
    151 }
    152 #endif
    153     return result;
    154 }
    155 
    156 /* --------------------------------------------------------------------- */
    157 
    158 /*
    159  * Index into the table below with the first byte of a UTF-8 sequence to
    160  * get the number of trailing bytes that are supposed to follow it.
    161  * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
    162  * left as-is for anyone who may want to do such conversion, which was
    163  * allowed in earlier algorithms.
    164  */
    165 static const char trailingBytesForUTF8[256] = {
    166     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    167     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    168     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    169     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    170     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    171     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    172     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    173     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
    174 };
    175 
    176 /*
    177  * Magic values subtracted from a buffer value during UTF8 conversion.
    178  * This table contains as many values as there might be trailing bytes
    179  * in a UTF-8 sequence.
    180  */
    181 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
    182 		     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
    183 
    184 /*
    185  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
    186  * into the first byte, depending on how many bytes follow.  There are
    187  * as many entries in this table as there are UTF-8 sequence types.
    188  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
    189  * for *legal* UTF-8 will be 4 or fewer bytes total.
    190  */
    191 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
    192 
    193 /* --------------------------------------------------------------------- */
    194 
    195 /* The interface converts a whole buffer to avoid function-call overhead.
    196  * Constants have been gathered. Loops & conditionals have been removed as
    197  * much as possible for efficiency, in favor of drop-through switches.
    198  * (See "Note A" at the bottom of the file for equivalent code.)
    199  * If your compiler supports it, the "isLegalUTF8" call can be turned
    200  * into an inline function.
    201  */
    202 
    203 /* --------------------------------------------------------------------- */
    204 
    205 ConversionResult ConvertUTF16toUTF8 (
    206 	const UTF16** sourceStart, const UTF16* sourceEnd,
    207 	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    208     ConversionResult result = conversionOK;
    209     const UTF16* source = *sourceStart;
    210     UTF8* target = *targetStart;
    211     while (source < sourceEnd) {
    212 	UTF32 ch;
    213 	unsigned short bytesToWrite = 0;
    214 	const UTF32 byteMask = 0xBF;
    215 	const UTF32 byteMark = 0x80;
    216 	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
    217 	ch = *source++;
    218 	/* If we have a surrogate pair, convert to UTF32 first. */
    219 	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
    220 	    /* If the 16 bits following the high surrogate are in the source buffer... */
    221 	    if (source < sourceEnd) {
    222 		UTF32 ch2 = *source;
    223 		/* If it's a low surrogate, convert to UTF32. */
    224 		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
    225 		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
    226 			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
    227 		    ++source;
    228 		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
    229 		    --source; /* return to the illegal value itself */
    230 		    result = sourceIllegal;
    231 		    break;
    232 		}
    233 	    } else { /* We don't have the 16 bits following the high surrogate. */
    234 		--source; /* return to the high surrogate */
    235 		result = sourceExhausted;
    236 		break;
    237 	    }
    238         } else if (flags == strictConversion) {
    239 	    /* UTF-16 surrogate values are illegal in UTF-32 */
    240 	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
    241 		--source; /* return to the illegal value itself */
    242 		result = sourceIllegal;
    243 		break;
    244 	    }
    245 	}
    246 	/* Figure out how many bytes the result will require */
    247 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
    248 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
    249 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
    250 	} else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
    251 	} else {			    bytesToWrite = 3;
    252 					    ch = UNI_REPLACEMENT_CHAR;
    253 	}
    254 
    255 	target += bytesToWrite;
    256 	if (target > targetEnd) {
    257 	    source = oldSource; /* Back up source pointer! */
    258 	    target -= bytesToWrite; result = targetExhausted; break;
    259 	}
    260 	switch (bytesToWrite) { /* note: everything falls through. */
    261 	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    262 	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    263 	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    264 	    case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
    265 	}
    266 	target += bytesToWrite;
    267     }
    268     *sourceStart = source;
    269     *targetStart = target;
    270     return result;
    271 }
    272 
    273 /* --------------------------------------------------------------------- */
    274 
    275 /*
    276  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
    277  * This must be called with the length pre-determined by the first byte.
    278  * If not calling this from ConvertUTF8to*, then the length can be set by:
    279  *  length = trailingBytesForUTF8[*source]+1;
    280  * and the sequence is illegal right away if there aren't that many bytes
    281  * available.
    282  * If presented with a length > 4, this returns false.  The Unicode
    283  * definition of UTF-8 goes up to 4-byte sequences.
    284  */
    285 
    286 static ANTLR3_BOOLEAN
    287 isLegalUTF8(const UTF8 *source, int length) {
    288     UTF8 a;
    289     const UTF8 *srcptr = source+length;
    290     switch (length) {
    291     default: return false;
    292 	/* Everything else falls through when "true"... */
    293     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    294     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    295     case 2: if ((a = (*--srcptr)) > 0xBF) return false;
    296 
    297 	switch (*source) {
    298 	    /* no fall-through in this inner switch */
    299 	    case 0xE0: if (a < 0xA0) return false; break;
    300 	    case 0xED: if (a > 0x9F) return false; break;
    301 	    case 0xF0: if (a < 0x90) return false; break;
    302 	    case 0xF4: if (a > 0x8F) return false; break;
    303 	    default:   if (a < 0x80) return false;
    304 	}
    305 
    306     case 1: if (*source >= 0x80 && *source < 0xC2) return false;
    307     }
    308     if (*source > 0xF4) return false;
    309     return true;
    310 }
    311 
    312 /* --------------------------------------------------------------------- */
    313 
    314 /*
    315  * Exported function to return whether a UTF-8 sequence is legal or not.
    316  * This is not used here; it's just exported.
    317  */
    318 ANTLR3_BOOLEAN
    319 isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
    320     int length = trailingBytesForUTF8[*source]+1;
    321     if (source+length > sourceEnd) {
    322 	return false;
    323     }
    324     return isLegalUTF8(source, length);
    325 }
    326 
    327 /* --------------------------------------------------------------------- */
    328 
    329 ConversionResult ConvertUTF8toUTF16 (
    330 	const UTF8** sourceStart, const UTF8* sourceEnd,
    331 	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    332     ConversionResult result = conversionOK;
    333     const UTF8* source = *sourceStart;
    334     UTF16* target = *targetStart;
    335     while (source < sourceEnd) {
    336 	UTF32 ch = 0;
    337 	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
    338 	if (source + extraBytesToRead >= sourceEnd) {
    339 	    result = sourceExhausted; break;
    340 	}
    341 	/* Do this check whether lenient or strict */
    342 	if (! isLegalUTF8(source, extraBytesToRead+1)) {
    343 	    result = sourceIllegal;
    344 	    break;
    345 	}
    346 	/*
    347 	 * The cases all fall through. See "Note A" below.
    348 	 */
    349 	switch (extraBytesToRead) {
    350 	    case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
    351 	    case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
    352 	    case 3: ch += *source++; ch <<= 6;
    353 	    case 2: ch += *source++; ch <<= 6;
    354 	    case 1: ch += *source++; ch <<= 6;
    355 	    case 0: ch += *source++;
    356 	}
    357 	ch -= offsetsFromUTF8[extraBytesToRead];
    358 
    359 	if (target >= targetEnd) {
    360 	    source -= (extraBytesToRead+1); /* Back up source pointer! */
    361 	    result = targetExhausted; break;
    362 	}
    363 	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
    364 	    /* UTF-16 surrogate values are illegal in UTF-32 */
    365 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
    366 		if (flags == strictConversion) {
    367 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
    368 		    result = sourceIllegal;
    369 		    break;
    370 		} else {
    371 		    *target++ = UNI_REPLACEMENT_CHAR;
    372 		}
    373 	    } else {
    374 		*target++ = (UTF16)ch; /* normal case */
    375 	    }
    376 	} else if (ch > UNI_MAX_UTF16) {
    377 	    if (flags == strictConversion) {
    378 		result = sourceIllegal;
    379 		source -= (extraBytesToRead+1); /* return to the start */
    380 		break; /* Bail out; shouldn't continue */
    381 	    } else {
    382 		*target++ = UNI_REPLACEMENT_CHAR;
    383 	    }
    384 	} else {
    385 	    /* target is a character in range 0xFFFF - 0x10FFFF. */
    386 	    if (target + 1 >= targetEnd) {
    387 		source -= (extraBytesToRead+1); /* Back up source pointer! */
    388 		result = targetExhausted; break;
    389 	    }
    390 	    ch -= halfBase;
    391 	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
    392 	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
    393 	}
    394     }
    395     *sourceStart = source;
    396     *targetStart = target;
    397     return result;
    398 }
    399 
    400 /* --------------------------------------------------------------------- */
    401 
    402 ConversionResult ConvertUTF32toUTF8 (
    403 	const UTF32** sourceStart, const UTF32* sourceEnd,
    404 	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    405     ConversionResult result = conversionOK;
    406     const UTF32* source = *sourceStart;
    407     UTF8* target = *targetStart;
    408     while (source < sourceEnd) {
    409 	UTF32 ch;
    410 	unsigned short bytesToWrite = 0;
    411 	const UTF32 byteMask = 0xBF;
    412 	const UTF32 byteMark = 0x80;
    413 	ch = *source++;
    414 	if (flags == strictConversion ) {
    415 	    /* UTF-16 surrogate values are illegal in UTF-32 */
    416 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
    417 		--source; /* return to the illegal value itself */
    418 		result = sourceIllegal;
    419 		break;
    420 	    }
    421 	}
    422 	/*
    423 	 * Figure out how many bytes the result will require. Turn any
    424 	 * illegally large UTF32 things (> Plane 17) into replacement chars.
    425 	 */
    426 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
    427 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
    428 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
    429 	} else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
    430 	} else {			    bytesToWrite = 3;
    431 					    ch = UNI_REPLACEMENT_CHAR;
    432 					    result = sourceIllegal;
    433 	}
    434 
    435 	target += bytesToWrite;
    436 	if (target > targetEnd) {
    437 	    --source; /* Back up source pointer! */
    438 	    target -= bytesToWrite; result = targetExhausted; break;
    439 	}
    440 	switch (bytesToWrite) { /* note: everything falls through. */
    441 	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    442 	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    443 	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    444 	    case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
    445 	}
    446 	target += bytesToWrite;
    447     }
    448     *sourceStart = source;
    449     *targetStart = target;
    450     return result;
    451 }
    452 
    453 /* --------------------------------------------------------------------- */
    454 
    455 ConversionResult ConvertUTF8toUTF32 (
    456 	const UTF8** sourceStart, const UTF8* sourceEnd,
    457 	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    458     ConversionResult result = conversionOK;
    459     const UTF8* source = *sourceStart;
    460     UTF32* target = *targetStart;
    461     while (source < sourceEnd) {
    462 	UTF32 ch = 0;
    463 	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
    464 	if (source + extraBytesToRead >= sourceEnd) {
    465 	    result = sourceExhausted; break;
    466 	}
    467 	/* Do this check whether lenient or strict */
    468 	if (! isLegalUTF8(source, extraBytesToRead+1)) {
    469 	    result = sourceIllegal;
    470 	    break;
    471 	}
    472 	/*
    473 	 * The cases all fall through. See "Note A" below.
    474 	 */
    475 	switch (extraBytesToRead) {
    476 	    case 5: ch += *source++; ch <<= 6;
    477 	    case 4: ch += *source++; ch <<= 6;
    478 	    case 3: ch += *source++; ch <<= 6;
    479 	    case 2: ch += *source++; ch <<= 6;
    480 	    case 1: ch += *source++; ch <<= 6;
    481 	    case 0: ch += *source++;
    482 	}
    483 	ch -= offsetsFromUTF8[extraBytesToRead];
    484 
    485 	if (target >= targetEnd) {
    486 	    source -= (extraBytesToRead+1); /* Back up the source pointer! */
    487 	    result = targetExhausted; break;
    488 	}
    489 	if (ch <= UNI_MAX_LEGAL_UTF32) {
    490 	    /*
    491 	     * UTF-16 surrogate values are illegal in UTF-32, and anything
    492 	     * over Plane 17 (> 0x10FFFF) is illegal.
    493 	     */
    494 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
    495 		if (flags == strictConversion) {
    496 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
    497 		    result = sourceIllegal;
    498 		    break;
    499 		} else {
    500 		    *target++ = UNI_REPLACEMENT_CHAR;
    501 		}
    502 	    } else {
    503 		*target++ = ch;
    504 	    }
    505 	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
    506 	    result = sourceIllegal;
    507 	    *target++ = UNI_REPLACEMENT_CHAR;
    508 	}
    509     }
    510     *sourceStart = source;
    511     *targetStart = target;
    512     return result;
    513 }
    514 
    515 /* ---------------------------------------------------------------------
    516 
    517     Note A.
    518     The fall-through switches in UTF-8 reading code save a
    519     temp variable, some decrements & conditionals.  The switches
    520     are equivalent to the following loop:
    521 	{
    522 	    int tmpBytesToRead = extraBytesToRead+1;
    523 	    do {
    524 		ch += *source++;
    525 		--tmpBytesToRead;
    526 		if (tmpBytesToRead) ch <<= 6;
    527 	    } while (tmpBytesToRead > 0);
    528 	}
    529     In UTF-8 writing code, the switches on "bytesToWrite" are
    530     similarly unrolled loops.
    531 
    532    --------------------------------------------------------------------- */
    533