Home | History | Annotate | Download | only in Tools
      1 /******************************************************************************
      2 
      3  @File         PVRTUnicode.cpp
      4 
      5  @Title        PVRTUnicode
      6 
      7  @Version       @Version
      8 
      9  @Copyright    Copyright (c) Imagination Technologies Limited.
     10 
     11  @Platform     All
     12 
     13  @Description  A small collection of functions used to decode Unicode formats to
     14                individual code points.
     15 
     16 ******************************************************************************/
     17 #include "PVRTUnicode.h"
     18 #include <string.h>
     19 
     20 /****************************************************************************
     21 ** Constants
     22 ****************************************************************************/
     23 const PVRTuint32 c_u32ReplChar = 0xFFFD;
     24 
     25 #define VALID_ASCII 0x80
     26 #define TAIL_MASK 0x3F
     27 #define BYTES_PER_TAIL 6
     28 
     29 #define UTF16_SURG_H_MARK 0xD800
     30 #define UTF16_SURG_H_END  0xDBFF
     31 #define UTF16_SURG_L_MARK 0xDC00
     32 #define UTF16_SURG_L_END  0xDFFF
     33 
     34 #define UNICODE_NONCHAR_MARK 0xFDD0
     35 #define UNICODE_NONCHAR_END  0xFDEF
     36 #define UNICODE_RESERVED	 0xFFFE
     37 #define UNICODE_MAX			 0x10FFFF
     38 
     39 #define MAX_LEN 0x8FFF
     40 
     41 /****************************************************************************
     42 ** A table which allows quick lookup to determine the number of bytes of a
     43 ** UTF8 code point.
     44 ****************************************************************************/
     45 const PVRTuint8 c_u8UTF8Lengths[256] =
     46 {
     47 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     48 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     49 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     50 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     51 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     52 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     53 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     54 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     55 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     56 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     57 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     58 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     59 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     60 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     61 	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
     62 	3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,
     63 };
     64 
     65 /****************************************************************************
     66 ** A table which allows quick lookup to determine whether a UTF8 sequence
     67 ** is 'overlong'.
     68 ****************************************************************************/
     69 const PVRTuint32 c_u32MinVals[4] =
     70 {
     71 	0x00000000,		// 0 tail bytes
     72 	0x00000080,		// 1 tail bytes
     73 	0x00000800,		// 2 tail bytes
     74 	0x00010000,		// 3 tail bytes
     75 };
     76 
     77 /*!***************************************************************************
     78  @Function			CheckGenericUnicode
     79  @Input				c32			A UTF32 character/Unicode code point
     80  @Returns			Success or failure.
     81  @Description		Checks that the decoded code point is valid.
     82 *****************************************************************************/
     83 static bool CheckGenericUnicode(PVRTuint32 c32)
     84 {
     85 	// Check that this value isn't a UTF16 surrogate mask.
     86 	if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_L_END)
     87 		return false;
     88 	// Check non-char values
     89 	if(c32 >= UNICODE_NONCHAR_MARK && c32 <= UNICODE_NONCHAR_END)
     90 		return false;
     91 	// Check reserved values
     92 	if((c32 & UNICODE_RESERVED) == UNICODE_RESERVED)
     93 		return false;
     94 	// Check max value.
     95 	if(c32 > UNICODE_MAX)
     96 		return false;
     97 
     98 	return true;
     99 }
    100 
    101 /*!***************************************************************************
    102  @Function			PVRTUnicodeUTF8ToUTF32
    103  @Input				pUTF8			A UTF8 string, which is null terminated.
    104  @Output			aUTF32			An array of Unicode code points.
    105  @Returns			Success or failure.
    106  @Description		Decodes a UTF8-encoded string in to Unicode code points
    107 					(UTF32). If pUTF8 is not null terminated, the results are
    108 					undefined.
    109 *****************************************************************************/
    110 EPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32)
    111 {
    112 	unsigned int uiTailLen, uiIndex;
    113 	unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
    114 	PVRTuint32 c32;
    115 
    116 	const PVRTuint8* pC = pUTF8;
    117 	while(*pC)
    118 	{
    119 		// Quick optimisation for ASCII characters
    120 		while(*pC && *pC < VALID_ASCII)
    121 		{
    122 			aUTF32.Append(*pC++);
    123 		}
    124 		// Done
    125 		if(!*pC)
    126 			break;
    127 
    128 		c32 = *pC++;
    129 		uiTailLen = c_u8UTF8Lengths[c32];
    130 
    131 		// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
    132 		// Also check to make sure the tail length is inside the provided buffer.
    133 		if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
    134 			return PVR_OVERFLOW;
    135 
    136 		c32 &= (TAIL_MASK >> uiTailLen);	// Get the data out of the first byte. This depends on the length of the tail.
    137 
    138 		// Get the data out of each tail byte
    139 		uiIndex = 0;
    140 		while(uiIndex < uiTailLen)
    141 		{
    142 			if((pC[uiIndex] & 0xC0) != 0x80)
    143 				return PVR_FAIL;		// Invalid tail byte!
    144 
    145 			c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
    146 			uiIndex++;
    147 		}
    148 
    149 		pC += uiIndex;
    150 
    151 		// Check overlong values.
    152 		if(c32 < c_u32MinVals[uiTailLen])
    153 			return PVR_FAIL;
    154 
    155 		if(!CheckGenericUnicode(c32))
    156 			return PVR_FAIL;
    157 
    158 		// OK
    159 		aUTF32.Append(c32);
    160 	}
    161 
    162 	return PVR_SUCCESS;
    163 }
    164 
    165 /*!***************************************************************************
    166  @Function			PVRTUnicodeUTF16ToUTF32
    167  @Input				pUTF16			A UTF16 string, which is null terminated.
    168  @Output			aUTF32			An array of Unicode code points.
    169  @Returns			Success or failure.
    170  @Description		Decodes a UTF16-encoded string in to Unicode code points
    171 					(UTF32). If pUTF16 is not null terminated, the results are
    172 					undefined.
    173 *****************************************************************************/
    174 EPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32)
    175 {
    176 	const PVRTuint16* pC = pUTF16;
    177 
    178 	// Determine the number of shorts
    179 	while(*++pC && (pC - pUTF16) < MAX_LEN);
    180 	unsigned int uiBufferLen = (unsigned int) (pC - pUTF16);
    181 
    182 	if(uiBufferLen == MAX_LEN)
    183 		return PVR_OVERFLOW;		// Probably not NULL terminated.
    184 
    185 	// Reset to start.
    186 	pC = pUTF16;
    187 
    188 	PVRTuint32 c32;
    189 	while(*pC)
    190 	{
    191 		// Straight copy. We'll check for surrogate pairs next...
    192 		c32 = *pC++;
    193 
    194 		// Check surrogate pair
    195 		if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END)
    196 		{
    197 			// Make sure the next 2 bytes are in range...
    198 			if(pC + 1 > pUTF16 + uiBufferLen || *pC == 0)
    199 				return PVR_OVERFLOW;
    200 
    201 			// Check that the next value is in the low surrogate range
    202 			if(*pC < UTF16_SURG_L_MARK || *pC > UTF16_SURG_L_END)
    203 				return PVR_FAIL;
    204 
    205 			// Decode
    206 			c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000;
    207 			pC++;
    208 		}
    209 
    210 		if(!CheckGenericUnicode(c32))
    211 			return PVR_FAIL;
    212 
    213 		// OK
    214 		aUTF32.Append(c32);
    215 	}
    216 
    217 	return PVR_SUCCESS;
    218 }
    219 
    220 /*!***************************************************************************
    221  @Function			PVRTUnicodeUTF8Length
    222  @Input				pUTF8			A UTF8 string, which is null terminated.
    223  @Returns			The length of the string, in Unicode code points.
    224  @Description		Calculates the length of a UTF8 string. If pUTF8 is
    225 					not null terminated, the results are undefined.
    226 *****************************************************************************/
    227 unsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8)
    228 {
    229 	const PVRTuint8* pC = pUTF8;
    230 
    231 	unsigned int charCount = 0;
    232 	unsigned int mask;
    233 	while(*pC)
    234 	{
    235 		// Quick optimisation for ASCII characters
    236 		const PVRTuint8* pStart = pC;
    237 		while(*pC && *pC < VALID_ASCII)
    238 			pC++;
    239 
    240 		charCount += (unsigned int) (pC - pStart);
    241 
    242 		// Done
    243 		if(!*pC)
    244 			break;
    245 
    246 		mask = *pC & 0xF0;
    247 		switch(mask)
    248 		{
    249 		case 0xF0: pC++;
    250 		case 0xE0: pC++;
    251 		case 0xC0: pC++;
    252 			break;
    253 		default:
    254 			_ASSERT(!"Invalid tail byte!");
    255 			return 0;
    256 		}
    257 
    258 		pC++;
    259 		charCount++;
    260 	}
    261 
    262 	return charCount;
    263 }
    264 
    265 /*!***************************************************************************
    266  @Function			PVRTUnicodeUTF16Length
    267  @Input				pUTF16			A UTF16 string, which is null terminated.
    268  @Returns			The length of the string, in Unicode code points.
    269  @Description		Calculates the length of a UTF16 string.
    270 					If pUTF16 is not null terminated, the results are
    271 					undefined.
    272 *****************************************************************************/
    273 unsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16)
    274 {
    275 	const PVRTuint16* pC = pUTF16;
    276 	unsigned int charCount = 0;
    277 	while(*pC && (pC - pUTF16) < MAX_LEN)
    278 	{
    279 		if(	pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END
    280 		 && pC[1] >= UTF16_SURG_L_MARK && pC[0] <= UTF16_SURG_L_END)
    281 		{
    282 			pC += 2;
    283 		}
    284 		else
    285 		{
    286 			pC += 1;
    287 		}
    288 
    289 		charCount++;
    290 	}
    291 
    292 	return charCount;
    293 }
    294 
    295 /*!***************************************************************************
    296  @Function			PVRTUnicodeValidUTF8
    297  @Input				pUTF8			A UTF8 string, which is null terminated.
    298  @Returns			true or false
    299  @Description		Checks whether the encoding of a UTF8 string is valid.
    300 					If pUTF8 is not null terminated, the results are undefined.
    301 *****************************************************************************/
    302 bool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8)
    303 {
    304 	unsigned int uiTailLen, uiIndex;
    305 	unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
    306 	const PVRTuint8* pC = pUTF8;
    307 	while(*pC)
    308 	{
    309 		// Quick optimisation for ASCII characters
    310 		while(*pC && *pC < VALID_ASCII)	pC++;
    311 		// Done?
    312 		if(!*pC)
    313 			break;
    314 
    315 		PVRTuint32 c32 = *pC++;
    316 		uiTailLen = c_u8UTF8Lengths[c32];
    317 
    318 		// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
    319 		// Also check to make sure the tail length is inside the provided buffer.
    320 		if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
    321 			return false;
    322 
    323 		// Get the data out of each tail byte
    324 		uiIndex = 0;
    325 		while(uiIndex < uiTailLen)
    326 		{
    327 			if((pC[uiIndex] & 0xC0) != 0x80)
    328 				return false;		// Invalid tail byte!
    329 
    330 			c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
    331 			uiIndex++;
    332 		}
    333 
    334 		pC += uiIndex;
    335 
    336 		// Check overlong values.
    337 		if(c32 < c_u32MinVals[uiTailLen])
    338 			return false;
    339 		if(!CheckGenericUnicode(c32))
    340 			return false;
    341 	}
    342 
    343 	return true;
    344 }
    345 
    346 /*****************************************************************************
    347  End of file (PVRTUnicode.cpp)
    348 *****************************************************************************/
    349 
    350