Home | History | Annotate | Download | only in unicode
      1 /*
      2  * Copyright 2001-2004 Unicode, Inc.
      3  *
      4  * Disclaimer
      5  *
      6  * This source code is provided as is by Unicode, Inc. No claims are
      7  * made as to fitness for any particular purpose. No warranties of any
      8  * kind are expressed or implied. The recipient agrees to determine
      9  * applicability of information provided. If this file has been
     10  * purchased on magnetic or optical media from Unicode, Inc., the
     11  * sole remedy for any claim will be exchange of defective media
     12  * within 90 days of receipt.
     13  *
     14  * Limitations on Rights to Redistribute This Code
     15  *
     16  * Unicode, Inc. hereby grants the right to freely use the information
     17  * supplied in this file in the creation of products supporting the
     18  * Unicode Standard, and to make copies of this file in any form
     19  * for internal or external distribution as long as this notice
     20  * remains attached.
     21  */
     22 
     23 /* ---------------------------------------------------------------------
     24 
     25     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
     26     Author: Mark E. Davis, 1994.
     27     Rev History: Rick McGowan, fixes & updates May 2001.
     28     Sept 2001: fixed const & error conditions per
     29 	mods suggested by S. Parent & A. Lillich.
     30     June 2002: Tim Dodd added detection and handling of incomplete
     31 	source sequences, enhanced error detection, added casts
     32 	to eliminate compiler warnings.
     33     July 2003: slight mods to back out aggressive FFFE detection.
     34     Jan 2004: updated switches in from-UTF8 conversions.
     35     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
     36 
     37     See the header file "ConvertUTF.h" for complete documentation.
     38 
     39 ------------------------------------------------------------------------ */
     40 
     41 
     42 #include "ConvertUTF.h"
     43 #ifdef CVTUTF_DEBUG
     44 #include <stdio.h>
     45 #endif
     46 
     47 static const int halfShift  = 10; /* used for shifting by 10 bits */
     48 
     49 static const UTF32 halfBase = 0x0010000UL;
     50 static const UTF32 halfMask = 0x3FFUL;
     51 
     52 #define UNI_SUR_HIGH_START  (UTF32)0xD800
     53 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
     54 #define UNI_SUR_LOW_START   (UTF32)0xDC00
     55 #define UNI_SUR_LOW_END     (UTF32)0xDFFF
     56 #define false	   0
     57 #define true	    1
     58 
     59 /* --------------------------------------------------------------------- */
     60 
     61 ConversionResult ConvertUTF32toUTF16 (
     62 	const UTF32** sourceStart, const UTF32* sourceEnd,
     63 	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
     64     ConversionResult result = conversionOK;
     65     const UTF32* source = *sourceStart;
     66     UTF16* target = *targetStart;
     67     while (source < sourceEnd) {
     68 	UTF32 ch;
     69 	if (target >= targetEnd) {
     70 	    result = targetExhausted; break;
     71 	}
     72 	ch = *source++;
     73 	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
     74 	    /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
     75 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     76 		if (flags == strictConversion) {
     77 		    --source; /* return to the illegal value itself */
     78 		    result = sourceIllegal;
     79 		    break;
     80 		} else {
     81 		    *target++ = UNI_REPLACEMENT_CHAR;
     82 		}
     83 	    } else {
     84 		*target++ = (UTF16)ch; /* normal case */
     85 	    }
     86 	} else if (ch > UNI_MAX_LEGAL_UTF32) {
     87 	    if (flags == strictConversion) {
     88 		result = sourceIllegal;
     89 	    } else {
     90 		*target++ = UNI_REPLACEMENT_CHAR;
     91 	    }
     92 	} else {
     93 	    /* target is a character in range 0xFFFF - 0x10FFFF. */
     94 	    if (target + 1 >= targetEnd) {
     95 		--source; /* Back up source pointer! */
     96 		result = targetExhausted; break;
     97 	    }
     98 	    ch -= halfBase;
     99 	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
    100 	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
    101 	}
    102     }
    103     *sourceStart = source;
    104     *targetStart = target;
    105     return result;
    106 }
    107 
    108 /* --------------------------------------------------------------------- */
    109 
    110 ConversionResult ConvertUTF16toUTF32 (
    111 	const UTF16** sourceStart, const UTF16* sourceEnd,
    112 	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    113     ConversionResult result = conversionOK;
    114     const UTF16* source = *sourceStart;
    115     UTF32* target = *targetStart;
    116     UTF32 ch, ch2;
    117     while (source < sourceEnd) {
    118 	const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
    119 	ch = *source++;
    120 	/* If we have a surrogate pair, convert to UTF32 first. */
    121 	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
    122 	    /* If the 16 bits following the high surrogate are in the source buffer... */
    123 	    if (source < sourceEnd) {
    124 		ch2 = *source;
    125 		/* If it's a low surrogate, convert to UTF32. */
    126 		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
    127 		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
    128 			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
    129 		    ++source;
    130 		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
    131 		    --source; /* return to the illegal value itself */
    132 		    result = sourceIllegal;
    133 		    break;
    134 		}
    135 	    } else { /* We don't have the 16 bits following the high surrogate. */
    136 		--source; /* return to the high surrogate */
    137 		result = sourceExhausted;
    138 		break;
    139 	    }
    140 	} else if (flags == strictConversion) {
    141 	    /* UTF-16 surrogate values are illegal in UTF-32 */
    142 	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
    143 		--source; /* return to the illegal value itself */
    144 		result = sourceIllegal;
    145 		break;
    146 	    }
    147 	}
    148 	if (target >= targetEnd) {
    149 	    source = oldSource; /* Back up source pointer! */
    150 	    result = targetExhausted; break;
    151 	}
    152 	*target++ = ch;
    153     }
    154     *sourceStart = source;
    155     *targetStart = target;
    156 #ifdef CVTUTF_DEBUG
    157 if (result == sourceIllegal) {
    158     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
    159     fflush(stderr);
    160 }
    161 #endif
    162     return result;
    163 }
    164 
    165 /* --------------------------------------------------------------------- */
    166 
    167 /*
    168  * Index into the table below with the first byte of a UTF-8 sequence to
    169  * get the number of trailing bytes that are supposed to follow it.
    170  * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
    171  * left as-is for anyone who may want to do such conversion, which was
    172  * allowed in earlier algorithms.
    173  */
    174 static const char trailingBytesForUTF8[256] = {
    175     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    176     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    177     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    178     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    179     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    180     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    181     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    182     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
    183 };
    184 
    185 /*
    186  * Magic values subtracted from a buffer value during UTF8 conversion.
    187  * This table contains as many values as there might be trailing bytes
    188  * in a UTF-8 sequence.
    189  */
    190 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
    191 		     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
    192 
    193 /*
    194  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
    195  * into the first byte, depending on how many bytes follow.  There are
    196  * as many entries in this table as there are UTF-8 sequence types.
    197  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
    198  * for *legal* UTF-8 will be 4 or fewer bytes total.
    199  */
    200 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
    201 
    202 /* --------------------------------------------------------------------- */
    203 
    204 /* The interface converts a whole buffer to avoid function-call overhead.
    205  * Constants have been gathered. Loops & conditionals have been removed as
    206  * much as possible for efficiency, in favor of drop-through switches.
    207  * (See "Note A" at the bottom of the file for equivalent code.)
    208  * If your compiler supports it, the "isLegalUTF8" call can be turned
    209  * into an inline function.
    210  */
    211 
    212 /* --------------------------------------------------------------------- */
    213 
    214 ConversionResult ConvertUTF16toUTF8 (
    215 	const UTF16** sourceStart, const UTF16* sourceEnd,
    216 	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    217     ConversionResult result = conversionOK;
    218     const UTF16* source = *sourceStart;
    219     UTF8* target = *targetStart;
    220     while (source < sourceEnd) {
    221 	UTF32 ch;
    222 	unsigned short bytesToWrite = 0;
    223 	const UTF32 byteMask = 0xBF;
    224 	const UTF32 byteMark = 0x80;
    225 	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
    226 	ch = *source++;
    227 	/* If we have a surrogate pair, convert to UTF32 first. */
    228 	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
    229 	    /* If the 16 bits following the high surrogate are in the source buffer... */
    230 	    if (source < sourceEnd) {
    231 		UTF32 ch2 = *source;
    232 		/* If it's a low surrogate, convert to UTF32. */
    233 		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
    234 		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
    235 			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
    236 		    ++source;
    237 		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
    238 		    --source; /* return to the illegal value itself */
    239 		    result = sourceIllegal;
    240 		    break;
    241 		}
    242 	    } else { /* We don't have the 16 bits following the high surrogate. */
    243 		--source; /* return to the high surrogate */
    244 		result = sourceExhausted;
    245 		break;
    246 	    }
    247 	} else if (flags == strictConversion) {
    248 	    /* UTF-16 surrogate values are illegal in UTF-32 */
    249 	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
    250 		--source; /* return to the illegal value itself */
    251 		result = sourceIllegal;
    252 		break;
    253 	    }
    254 	}
    255 
    256 	// TPN: substitute all control characters except for NULL, TAB, LF or CR
    257 	if (ch && (ch != (UTF32)0x09)  && (ch != (UTF32)0x0a)  && (ch != (UTF32)0x0d)  && (ch < (UTF32)0x20) )  {
    258 		ch = (UTF32)0x3f;
    259 	}
    260 	// TPN: filter out byte order marks and invalid character 0xFFFF
    261 	if((ch == (UTF32)0xFEFF) || (ch == (UTF32)0xFFFE)|| (ch == (UTF32)0xFFFF)) {
    262 		continue;
    263 	}
    264 
    265 	/* Figure out how many bytes the result will require */
    266 	if (ch < (UTF32)0x80) {	    bytesToWrite = 1;
    267 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
    268 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
    269 	} else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
    270 	} else {			    bytesToWrite = 3;
    271 					    ch = UNI_REPLACEMENT_CHAR;
    272 	}
    273 
    274 	target += bytesToWrite;
    275 	if (target > targetEnd) {
    276 	    source = oldSource; /* Back up source pointer! */
    277 	    target -= bytesToWrite; result = targetExhausted; break;
    278 	}
    279 	switch (bytesToWrite) { /* note: everything falls through. */
    280 	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    281 	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    282 	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    283 	    case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
    284 	}
    285 	target += bytesToWrite;
    286     }
    287     *sourceStart = source;
    288     *targetStart = target;
    289     return result;
    290 }
    291 
    292 /* --------------------------------------------------------------------- */
    293 
    294 /*
    295  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
    296  * This must be called with the length pre-determined by the first byte.
    297  * If not calling this from ConvertUTF8to*, then the length can be set by:
    298  *  length = trailingBytesForUTF8[*source]+1;
    299  * and the sequence is illegal right away if there aren't that many bytes
    300  * available.
    301  * If presented with a length > 4, this returns false.  The Unicode
    302  * definition of UTF-8 goes up to 4-byte sequences.
    303  */
    304 
    305 inline Boolean isLegalUTF8(const UTF8 *source, int length) {
    306     UTF8 a;
    307     const UTF8 *srcptr = source+length;
    308     switch (length) {
    309     default: return false;
    310 	/* Everything else falls through when "true"... */
    311     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    312     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    313     case 2: if ((a = (*--srcptr)) > 0xBF) return false;
    314 
    315 	switch (*source) {
    316 	    /* no fall-through in this inner switch */
    317 	    case 0xE0: if (a < 0xA0) return false; break;
    318 	    case 0xED: if (a > 0x9F) return false; break;
    319 	    case 0xF0: if (a < 0x90) return false; break;
    320 	    case 0xF4: if (a > 0x8F) return false; break;
    321 	    default:   if (a < 0x80) return false;
    322 	}
    323 
    324     case 1: if (*source >= 0x80 && *source < 0xC2) return false;
    325     }
    326     if (*source > 0xF4) return false;
    327     return true;
    328 }
    329 
    330 /* --------------------------------------------------------------------- */
    331 
    332 /*
    333  * Exported function to return whether a UTF-8 sequence is legal or not.
    334  * This is not used here; it's just exported.
    335  */
    336 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
    337     int length = trailingBytesForUTF8[*source]+1;
    338     if (source+length > sourceEnd) {
    339 	return false;
    340     }
    341     return isLegalUTF8(source, length);
    342 }
    343 
    344 /* --------------------------------------------------------------------- */
    345 
    346 ConversionResult ConvertUTF8toUTF16 (
    347 	const UTF8** sourceStart, const UTF8* sourceEnd,
    348 	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    349     ConversionResult result = conversionOK;
    350     const UTF8* source = *sourceStart;
    351     UTF16* target = *targetStart;
    352     while (source < sourceEnd) {
    353 	UTF32 ch = 0;
    354 	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
    355 	if (source + extraBytesToRead >= sourceEnd) {
    356 	    result = sourceExhausted; break;
    357 	}
    358 	/* Do this check whether lenient or strict */
    359 	if (! isLegalUTF8(source, extraBytesToRead+1)) {
    360 	    result = sourceIllegal;
    361 	    break;
    362 	}
    363 	/*
    364 	 * The cases all fall through. See "Note A" below.
    365 	 */
    366 	switch (extraBytesToRead) {
    367 	    case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
    368 	    case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
    369 	    case 3: ch += *source++; ch <<= 6;
    370 	    case 2: ch += *source++; ch <<= 6;
    371 	    case 1: ch += *source++; ch <<= 6;
    372 	    case 0: ch += *source++;
    373 	}
    374 	ch -= offsetsFromUTF8[extraBytesToRead];
    375 
    376 	if (target >= targetEnd) {
    377 	    source -= (extraBytesToRead+1); /* Back up source pointer! */
    378 	    result = targetExhausted; break;
    379 	}
    380 	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
    381 	    /* UTF-16 surrogate values are illegal in UTF-32 */
    382 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
    383 		if (flags == strictConversion) {
    384 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
    385 		    result = sourceIllegal;
    386 		    break;
    387 		} else {
    388 		    *target++ = UNI_REPLACEMENT_CHAR;
    389 		}
    390 	    } else {
    391 		*target++ = (UTF16)ch; /* normal case */
    392 	    }
    393 	} else if (ch > UNI_MAX_UTF16) {
    394 	    if (flags == strictConversion) {
    395 		result = sourceIllegal;
    396 		source -= (extraBytesToRead+1); /* return to the start */
    397 		break; /* Bail out; shouldn't continue */
    398 	    } else {
    399 		*target++ = UNI_REPLACEMENT_CHAR;
    400 	    }
    401 	} else {
    402 	    /* target is a character in range 0xFFFF - 0x10FFFF. */
    403 	    if (target + 1 >= targetEnd) {
    404 		source -= (extraBytesToRead+1); /* Back up source pointer! */
    405 		result = targetExhausted; break;
    406 	    }
    407 	    ch -= halfBase;
    408 	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
    409 	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
    410 	}
    411     }
    412     *sourceStart = source;
    413     *targetStart = target;
    414     return result;
    415 }
    416 
    417 /* --------------------------------------------------------------------- */
    418 
    419 ConversionResult ConvertUTF32toUTF8 (
    420 	const UTF32** sourceStart, const UTF32* sourceEnd,
    421 	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    422     ConversionResult result = conversionOK;
    423     const UTF32* source = *sourceStart;
    424     UTF8* target = *targetStart;
    425     while (source < sourceEnd) {
    426 	UTF32 ch;
    427 	unsigned short bytesToWrite = 0;
    428 	const UTF32 byteMask = 0xBF;
    429 	const UTF32 byteMark = 0x80;
    430 	ch = *source++;
    431 	if (flags == strictConversion ) {
    432 	    /* UTF-16 surrogate values are illegal in UTF-32 */
    433 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
    434 		--source; /* return to the illegal value itself */
    435 		result = sourceIllegal;
    436 		break;
    437 	    }
    438 	}
    439 	/*
    440 	 * Figure out how many bytes the result will require. Turn any
    441 	 * illegally large UTF32 things (> Plane 17) into replacement chars.
    442 	 */
    443 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
    444 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
    445 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
    446 	} else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
    447 	} else {			    bytesToWrite = 3;
    448 					    ch = UNI_REPLACEMENT_CHAR;
    449 					    result = sourceIllegal;
    450 	}
    451 
    452 	target += bytesToWrite;
    453 	if (target > targetEnd) {
    454 	    --source; /* Back up source pointer! */
    455 	    target -= bytesToWrite; result = targetExhausted; break;
    456 	}
    457 	switch (bytesToWrite) { /* note: everything falls through. */
    458 	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    459 	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    460 	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    461 	    case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
    462 	}
    463 	target += bytesToWrite;
    464     }
    465     *sourceStart = source;
    466     *targetStart = target;
    467     return result;
    468 }
    469 
    470 /* --------------------------------------------------------------------- */
    471 
    472 ConversionResult ConvertUTF8toUTF32 (
    473 	const UTF8** sourceStart, const UTF8* sourceEnd,
    474 	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    475     ConversionResult result = conversionOK;
    476     const UTF8* source = *sourceStart;
    477     UTF32* target = *targetStart;
    478     while (source < sourceEnd) {
    479 	UTF32 ch = 0;
    480 	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
    481 	if (source + extraBytesToRead >= sourceEnd) {
    482 	    result = sourceExhausted; break;
    483 	}
    484 	/* Do this check whether lenient or strict */
    485 	if (! isLegalUTF8(source, extraBytesToRead+1)) {
    486 	    result = sourceIllegal;
    487 	    break;
    488 	}
    489 	/*
    490 	 * The cases all fall through. See "Note A" below.
    491 	 */
    492 	switch (extraBytesToRead) {
    493 	    case 5: ch += *source++; ch <<= 6;
    494 	    case 4: ch += *source++; ch <<= 6;
    495 	    case 3: ch += *source++; ch <<= 6;
    496 	    case 2: ch += *source++; ch <<= 6;
    497 	    case 1: ch += *source++; ch <<= 6;
    498 	    case 0: ch += *source++;
    499 	}
    500 	ch -= offsetsFromUTF8[extraBytesToRead];
    501 
    502 	if (target >= targetEnd) {
    503 	    source -= (extraBytesToRead+1); /* Back up the source pointer! */
    504 	    result = targetExhausted; break;
    505 	}
    506 	if (ch <= UNI_MAX_LEGAL_UTF32) {
    507 	    /*
    508 	     * UTF-16 surrogate values are illegal in UTF-32, and anything
    509 	     * over Plane 17 (> 0x10FFFF) is illegal.
    510 	     */
    511 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
    512 		if (flags == strictConversion) {
    513 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
    514 		    result = sourceIllegal;
    515 		    break;
    516 		} else {
    517 		    *target++ = UNI_REPLACEMENT_CHAR;
    518 		}
    519 	    } else {
    520 		*target++ = ch;
    521 	    }
    522 	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
    523 	    result = sourceIllegal;
    524 	    *target++ = UNI_REPLACEMENT_CHAR;
    525 	}
    526     }
    527     *sourceStart = source;
    528     *targetStart = target;
    529     return result;
    530 }
    531 
    532 /* ---------------------------------------------------------------------
    533 
    534     Note A.
    535     The fall-through switches in UTF-8 reading code save a
    536     temp variable, some decrements & conditionals.  The switches
    537     are equivalent to the following loop:
    538 	{
    539 	    int tmpBytesToRead = extraBytesToRead+1;
    540 	    do {
    541 		ch += *source++;
    542 		--tmpBytesToRead;
    543 		if (tmpBytesToRead) ch <<= 6;
    544 	    } while (tmpBytesToRead > 0);
    545 	}
    546     In UTF-8 writing code, the switches on "bytesToWrite" are
    547     similarly unrolled loops.
    548 
    549    --------------------------------------------------------------------- */
    550