1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2000-2012, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: uparse.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2000apr18 14 * created by: Markus W. Scherer 15 * 16 * This file provides a parser for files that are delimited by one single 17 * character like ';' or TAB. Example: the Unicode Character Properties files 18 * like UnicodeData.txt are semicolon-delimited. 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicode/uchar.h" 23 #include "unicode/ustring.h" 24 #include "unicode/utf16.h" 25 #include "cstring.h" 26 #include "filestrm.h" 27 #include "uparse.h" 28 #include "ustr_imp.h" 29 30 #include <stdio.h> 31 32 U_CAPI const char * U_EXPORT2 33 u_skipWhitespace(const char *s) { 34 while(U_IS_INV_WHITESPACE(*s)) { 35 ++s; 36 } 37 return s; 38 } 39 40 U_CAPI char * U_EXPORT2 41 u_rtrim(char *s) { 42 char *end=uprv_strchr(s, 0); 43 while(s<end && U_IS_INV_WHITESPACE(*(end-1))) { 44 *--end = 0; 45 } 46 return end; 47 } 48 49 /* 50 * If the string starts with # @missing: then return the pointer to the 51 * following non-whitespace character. 52 * Otherwise return the original pointer. 53 * Unicode 5.0 adds such lines in some data files to document 54 * default property values. 55 * Poor man's regex for variable amounts of white space. 56 */ 57 static const char * 58 getMissingLimit(const char *s) { 59 const char *s0=s; 60 if( 61 *(s=u_skipWhitespace(s))=='#' && 62 *(s=u_skipWhitespace(s+1))=='@' && 63 0==strncmp((s=u_skipWhitespace(s+1)), "missing", 7) && 64 *(s=u_skipWhitespace(s+7))==':' 65 ) { 66 return u_skipWhitespace(s+1); 67 } else { 68 return s0; 69 } 70 } 71 72 U_CAPI void U_EXPORT2 73 u_parseDelimitedFile(const char *filename, char delimiter, 74 char *fields[][2], int32_t fieldCount, 75 UParseLineFn *lineFn, void *context, 76 UErrorCode *pErrorCode) { 77 FileStream *file; 78 char line[300]; 79 char *start, *limit; 80 int32_t i, length; 81 82 if(U_FAILURE(*pErrorCode)) { 83 return; 84 } 85 86 if(fields==NULL || lineFn==NULL || fieldCount<=0) { 87 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 88 return; 89 } 90 91 if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) { 92 filename=NULL; 93 file=T_FileStream_stdin(); 94 } else { 95 file=T_FileStream_open(filename, "r"); 96 } 97 if(file==NULL) { 98 *pErrorCode=U_FILE_ACCESS_ERROR; 99 return; 100 } 101 102 while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) { 103 /* remove trailing newline characters */ 104 length=(int32_t)(u_rtrim(line)-line); 105 106 /* 107 * detect a line with # @missing: 108 * start parsing after that, or else from the beginning of the line 109 * set the default warning for @missing lines 110 */ 111 start=(char *)getMissingLimit(line); 112 if(start==line) { 113 *pErrorCode=U_ZERO_ERROR; 114 } else { 115 *pErrorCode=U_USING_DEFAULT_WARNING; 116 } 117 118 /* skip this line if it is empty or a comment */ 119 if(*start==0 || *start=='#') { 120 continue; 121 } 122 123 /* remove in-line comments */ 124 limit=uprv_strchr(start, '#'); 125 if(limit!=NULL) { 126 /* get white space before the pound sign */ 127 while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) { 128 --limit; 129 } 130 131 /* truncate the line */ 132 *limit=0; 133 } 134 135 /* skip lines with only whitespace */ 136 if(u_skipWhitespace(start)[0]==0) { 137 continue; 138 } 139 140 /* for each field, call the corresponding field function */ 141 for(i=0; i<fieldCount; ++i) { 142 /* set the limit pointer of this field */ 143 limit=start; 144 while(*limit!=delimiter && *limit!=0) { 145 ++limit; 146 } 147 148 /* set the field start and limit in the fields array */ 149 fields[i][0]=start; 150 fields[i][1]=limit; 151 152 /* set start to the beginning of the next field, if any */ 153 start=limit; 154 if(*start!=0) { 155 ++start; 156 } else if(i+1<fieldCount) { 157 *pErrorCode=U_PARSE_ERROR; 158 limit=line+length; 159 i=fieldCount; 160 break; 161 } 162 } 163 164 /* error in a field function? */ 165 if(U_FAILURE(*pErrorCode)) { 166 break; 167 } 168 169 /* call the field function */ 170 lineFn(context, fields, fieldCount, pErrorCode); 171 if(U_FAILURE(*pErrorCode)) { 172 break; 173 } 174 } 175 176 if(filename!=NULL) { 177 T_FileStream_close(file); 178 } 179 } 180 181 /* 182 * parse a list of code points 183 * store them as a UTF-32 string in dest[destCapacity] 184 * return the number of code points 185 */ 186 U_CAPI int32_t U_EXPORT2 187 u_parseCodePoints(const char *s, 188 uint32_t *dest, int32_t destCapacity, 189 UErrorCode *pErrorCode) { 190 char *end; 191 uint32_t value; 192 int32_t count; 193 194 if(U_FAILURE(*pErrorCode)) { 195 return 0; 196 } 197 if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) { 198 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 199 return 0; 200 } 201 202 count=0; 203 for(;;) { 204 s=u_skipWhitespace(s); 205 if(*s==';' || *s==0) { 206 return count; 207 } 208 209 /* read one code point */ 210 value=(uint32_t)uprv_strtoul(s, &end, 16); 211 if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) { 212 *pErrorCode=U_PARSE_ERROR; 213 return 0; 214 } 215 216 /* append it to the destination array */ 217 if(count<destCapacity) { 218 dest[count++]=value; 219 } else { 220 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 221 } 222 223 /* go to the following characters */ 224 s=end; 225 } 226 } 227 228 /* 229 * parse a list of code points 230 * store them as a string in dest[destCapacity] 231 * set the first code point in *pFirst 232 * @return The length of the string in numbers of UChars. 233 */ 234 U_CAPI int32_t U_EXPORT2 235 u_parseString(const char *s, 236 UChar *dest, int32_t destCapacity, 237 uint32_t *pFirst, 238 UErrorCode *pErrorCode) { 239 char *end; 240 uint32_t value; 241 int32_t destLength; 242 243 if(U_FAILURE(*pErrorCode)) { 244 return 0; 245 } 246 if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) { 247 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 248 return 0; 249 } 250 251 if(pFirst!=NULL) { 252 *pFirst=0xffffffff; 253 } 254 255 destLength=0; 256 for(;;) { 257 s=u_skipWhitespace(s); 258 if(*s==';' || *s==0) { 259 if(destLength<destCapacity) { 260 dest[destLength]=0; 261 } else if(destLength==destCapacity) { 262 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; 263 } else { 264 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 265 } 266 return destLength; 267 } 268 269 /* read one code point */ 270 value=(uint32_t)uprv_strtoul(s, &end, 16); 271 if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) { 272 *pErrorCode=U_PARSE_ERROR; 273 return 0; 274 } 275 276 /* store the first code point */ 277 if(pFirst!=NULL) { 278 *pFirst=value; 279 pFirst=NULL; 280 } 281 282 /* append it to the destination array */ 283 if((destLength+U16_LENGTH(value))<=destCapacity) { 284 U16_APPEND_UNSAFE(dest, destLength, value); 285 } else { 286 destLength+=U16_LENGTH(value); 287 } 288 289 /* go to the following characters */ 290 s=end; 291 } 292 } 293 294 /* read a range like start or start..end */ 295 U_CAPI int32_t U_EXPORT2 296 u_parseCodePointRangeAnyTerminator(const char *s, 297 uint32_t *pStart, uint32_t *pEnd, 298 const char **terminator, 299 UErrorCode *pErrorCode) { 300 char *end; 301 uint32_t value; 302 303 if(U_FAILURE(*pErrorCode)) { 304 return 0; 305 } 306 if(s==NULL || pStart==NULL || pEnd==NULL) { 307 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 308 return 0; 309 } 310 311 /* read the start code point */ 312 s=u_skipWhitespace(s); 313 value=(uint32_t)uprv_strtoul(s, &end, 16); 314 if(end<=s || value>=0x110000) { 315 *pErrorCode=U_PARSE_ERROR; 316 return 0; 317 } 318 *pStart=*pEnd=value; 319 320 /* is there a "..end"? */ 321 s=u_skipWhitespace(end); 322 if(*s!='.' || s[1]!='.') { 323 *terminator=end; 324 return 1; 325 } 326 s=u_skipWhitespace(s+2); 327 328 /* read the end code point */ 329 value=(uint32_t)uprv_strtoul(s, &end, 16); 330 if(end<=s || value>=0x110000) { 331 *pErrorCode=U_PARSE_ERROR; 332 return 0; 333 } 334 *pEnd=value; 335 336 /* is this a valid range? */ 337 if(value<*pStart) { 338 *pErrorCode=U_PARSE_ERROR; 339 return 0; 340 } 341 342 *terminator=end; 343 return value-*pStart+1; 344 } 345 346 U_CAPI int32_t U_EXPORT2 347 u_parseCodePointRange(const char *s, 348 uint32_t *pStart, uint32_t *pEnd, 349 UErrorCode *pErrorCode) { 350 const char *terminator; 351 int32_t rangeLength= 352 u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode); 353 if(U_SUCCESS(*pErrorCode)) { 354 terminator=u_skipWhitespace(terminator); 355 if(*terminator!=';' && *terminator!=0) { 356 *pErrorCode=U_PARSE_ERROR; 357 return 0; 358 } 359 } 360 return rangeLength; 361 } 362 363 U_CAPI int32_t U_EXPORT2 364 u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) { 365 const char *read = source; 366 int32_t i = 0; 367 unsigned int value = 0; 368 if(sLen == -1) { 369 sLen = (int32_t)strlen(source); 370 } 371 372 while(read < source+sLen) { 373 sscanf(read, "%2x", &value); 374 if(i < destCapacity) { 375 dest[i] = (char)value; 376 } 377 i++; 378 read += 2; 379 } 380 return u_terminateChars(dest, destCapacity, i, status); 381 } 382