1 /* 2 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 /** 17 * @file picobase.c 18 * 19 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland 20 * All rights reserved. 21 * 22 * History: 23 * - 2009-04-20 -- initial version 24 * 25 */ 26 27 #include "picoos.h" 28 #include "picodbg.h" 29 #include "picodefs.h" 30 #include "picobase.h" 31 32 #ifdef __cplusplus 33 extern "C" { 34 #endif 35 #if 0 36 } 37 #endif 38 39 /** 40 * @addtogroup picobase 41 * 42 * @b Unicode_UTF8_functions 43 * 44 * UTF8 45 * scalar value 1st Byte 2nd Byte 3rd Byte 4th Byte 46 * 00000000 0xxxxxxx 0xxxxxxx 47 * 00000yyy yyxxxxxx 110yyyyy 10xxxxxx 48 * zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx 49 * 000uuuuu zzzzyyyy yyxxxxx 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 50 * 51 */ 52 picoos_int32 picobase_utf8_length(const picoos_uint8 *utf8str, 53 const picoos_uint16 maxlen) { 54 55 picoos_uint16 i; 56 picoos_uint16 len; 57 picoos_uint8 follow; 58 picoos_uint8 ok; 59 60 ok = TRUE; 61 i = 0; 62 len = 0; 63 follow = 0; 64 while (ok && (i < maxlen) && (utf8str[i] != '\000')) { 65 if (follow > 0) { 66 if ((utf8str[i] >= (picoos_uint8)'\200') && 67 (utf8str[i] < (picoos_uint8)'\300')) { 68 follow--; 69 } else { 70 ok = FALSE; 71 } 72 } else if (utf8str[i] < (picoos_uint8)'\200') { 73 len++; 74 } else if (utf8str[i] >= (picoos_uint8)'\370') { 75 ok = FALSE; 76 } else if (utf8str[i] >= (picoos_uint8)'\360') { 77 follow = 3; 78 len++; 79 } else if (utf8str[i] >= (picoos_uint8)'\340') { 80 follow = 2; 81 len++; 82 } else if (utf8str[i] >= (picoos_uint8)'\300') { 83 follow = 1; 84 len++; 85 } else { 86 ok = FALSE; 87 } 88 i++; 89 } 90 if (ok) { 91 return len; 92 } else { 93 return -1; 94 } 95 } 96 97 98 static picoos_uint32 base_utf32_lowercase (picoos_uint32 utf32) 99 { 100 101 picoos_uint32 lc; 102 103 lc = utf32; 104 if (((utf32 >= 65313) && (utf32 <= 65338))) { 105 lc = (utf32 + 32); 106 } else if (((utf32 >= 66560) && (utf32 <= 66599))) { 107 lc = (utf32 + 40); 108 } else if (((utf32 >= 7680) && (utf32 <= 9423))) { 109 switch (utf32) { 110 case 7680: case 7681: case 7682: case 7683: case 7684: case 7685: case 7686: case 7687: case 7688: case 7689: 111 case 7690: case 7691: case 7692: case 7693: case 7694: case 7695: case 7696: case 7697: case 7698: case 7699: case 7700: case 7701: 112 case 7702: case 7703: case 7704: case 7705: case 7706: case 7707: case 7708: case 7709: case 7710: case 7711: case 7712: case 7713: 113 case 7714: case 7715: case 7716: case 7717: case 7718: case 7719: case 7720: case 7721: case 7722: case 7723: case 7724: case 7725: 114 case 7726: case 7727: case 7728: case 7729: case 7730: case 7731: case 7732: case 7733: case 7734: case 7735: case 7736: case 7737: 115 case 7738: case 7739: case 7740: case 7741: case 7742: case 7743: case 7744: case 7745: case 7746: case 7747: case 7748: case 7749: 116 case 7750: case 7751: case 7752: case 7753: case 7754: case 7755: case 7756: case 7757: case 7758: case 7759: case 7760: case 7761: 117 case 7762: case 7763: case 7764: case 7765: case 7766: case 7767: case 7768: case 7769: case 7770: case 7771: case 7772: case 7773: 118 case 7774: case 7775: case 7776: case 7777: case 7778: case 7779: case 7780: case 7781: case 7782: case 7783: case 7784: case 7785: 119 case 7786: case 7787: case 7788: case 7789: case 7790: case 7791: case 7792: case 7793: case 7794: case 7795: case 7796: case 7797: 120 case 7798: case 7799: case 7800: case 7801: case 7802: case 7803: case 7804: case 7805: case 7806: case 7807: case 7808: case 7809: 121 case 7810: case 7811: case 7812: case 7813: case 7814: case 7815: case 7816: case 7817: case 7818: case 7819: case 7820: case 7821: 122 case 7822: case 7823: case 7824: case 7825: case 7826: case 7827: case 7828: case 7840: case 7841: case 7842: case 7843: 123 case 7844: case 7845: case 7846: case 7847: case 7848: case 7849: case 7850: case 7851: case 7852: case 7853: case 7854: case 7855: 124 case 7856: case 7857: case 7858: case 7859: case 7860: case 7861: case 7862: case 7863: case 7864: case 7865: case 7866: case 7867: 125 case 7868: case 7869: case 7870: case 7871: case 7872: case 7873: case 7874: case 7875: case 7876: case 7877: case 7878: case 7879: 126 case 7880: case 7881: case 7882: case 7883: case 7884: case 7885: case 7886: case 7887: case 7888: case 7889: case 7890: case 7891: 127 case 7892: case 7893: case 7894: case 7895: case 7896: case 7897: case 7898: case 7899: case 7900: case 7901: case 7902: case 7903: 128 case 7904: case 7905: case 7906: case 7907: case 7908: case 7909: case 7910: case 7911: case 7912: case 7913: case 7914: case 7915: 129 case 7916: case 7917: case 7918: case 7919: case 7920: case 7921: case 7922: case 7923: case 7924: case 7925: case 7926: case 7927: 130 case 7928: 131 if ( !(((utf32) % 2 == 1))) { 132 lc = (utf32 + 1); 133 } 134 break; 135 case 7944: case 7945: case 7946: case 7947: case 7948: case 7949: case 7950: case 7951: case 7960: 136 case 7961: case 7962: case 7963: case 7964: case 7965: case 7976: case 7977: case 7978: case 7979: case 7980: case 7981: 137 case 7982: case 7983: case 7992: case 7993: case 7994: case 7995: case 7996: case 7997: case 7998: case 7999: 138 case 8008: case 8009: case 8010: case 8011: case 8012: case 8013: case 8040: case 8041: case 8042: case 8043: case 8044: 139 case 8045: case 8046: case 8047: case 8072: case 8073: case 8074: case 8075: case 8076: case 8077: case 8078: case 8079: 140 case 8088: case 8089: case 8090: case 8091: case 8092: case 8093: case 8094: case 8095: case 8104: case 8105: 141 case 8106: case 8107: case 8108: case 8109: case 8110: case 8111: 142 lc = (utf32 - 8); 143 break; 144 case 8025: case 8026: case 8027: case 8028: case 8029: case 8030: case 8031: 145 if (((utf32) % 2 == 1)) { 146 lc = (utf32 - 8); 147 } 148 break; 149 case 8544: case 8545: case 8546: case 8547: case 8548: case 8549: case 8550: case 8551: case 8552: case 8553: 150 case 8554: case 8555: case 8556: case 8557: case 8558: case 8559: 151 lc = (utf32 + 16); 152 break; 153 case 9398: case 9399: case 9400: case 9401: case 9402: case 9403: case 9404: case 9405: case 9406: case 9407: 154 case 9408: case 9409: case 9410: case 9411: case 9412: case 9413: case 9414: case 9415: case 9416: case 9417: case 9418: case 9419: 155 case 9420: case 9421: case 9422: case 9423: 156 lc = (utf32 + 26); 157 break; 158 case 8120: 159 lc = 8112; 160 break; 161 case 8121: 162 lc = 8113; 163 break; 164 case 8122: 165 lc = 8048; 166 break; 167 case 8123: 168 lc = 8049; 169 break; 170 case 8124: 171 lc = 8115; 172 break; 173 case 8136: 174 lc = 8050; 175 break; 176 case 8137: 177 lc = 8051; 178 break; 179 case 8138: 180 lc = 8052; 181 break; 182 case 8139: 183 lc = 8053; 184 break; 185 case 8140: 186 lc = 8131; 187 break; 188 case 8152: 189 lc = 8144; 190 break; 191 case 8153: 192 lc = 8145; 193 break; 194 case 8154: 195 lc = 8054; 196 break; 197 case 8155: 198 lc = 8055; 199 break; 200 case 8168: 201 lc = 8160; 202 break; 203 case 8169: 204 lc = 8161; 205 break; 206 case 8170: 207 lc = 8058; 208 break; 209 case 8171: 210 lc = 8059; 211 break; 212 case 8172: 213 lc = 8165; 214 break; 215 case 8184: 216 lc = 8056; 217 break; 218 case 8185: 219 lc = 8057; 220 break; 221 case 8186: 222 lc = 8060; 223 break; 224 case 8187: 225 lc = 8061; 226 break; 227 case 8188: 228 lc = 8179; 229 break; 230 case 8486: 231 lc = 969; 232 break; 233 case 8490: 234 lc = 107; 235 break; 236 case 8491: 237 lc = 229; 238 break; 239 default: 240 break; 241 } 242 } else { 243 switch (utf32) { 244 case 65: case 66: case 67: case 68: case 69: case 70: case 71: case 72: case 73: case 74: 245 case 75: case 76: case 77: case 78: case 79: case 80: case 81: case 82: case 83: case 84: case 85: case 86: 246 case 87: case 88: case 89: case 90: case 192: case 193: case 194: case 195: case 196: case 197: case 198: 247 case 199: case 200: case 201: case 202: case 203: case 204: case 205: case 206: case 207: case 208: case 209: case 210: 248 case 211: case 212: case 213: case 214: case 216: case 217: case 218: case 219: case 220: case 221: case 222: 249 case 913: case 914: case 915: case 916: case 917: case 918: case 919: case 920: case 921: case 922: case 923: 250 case 924: case 925: case 926: case 927: case 928: case 929: case 931: case 932: case 933: case 934: case 935: 251 case 936: case 937: case 938: case 939: case 1040: case 1041: case 1042: case 1043: case 1044: case 1045: case 1046: 252 case 1047: case 1048: case 1049: case 1050: case 1051: case 1052: case 1053: case 1054: case 1055: case 1056: case 1057: case 1058: 253 case 1059: case 1060: case 1061: case 1062: case 1063: case 1064: case 1065: case 1066: case 1067: case 1068: case 1069: case 1070: 254 case 1071: 255 lc = (utf32 + 32); 256 break; 257 case 256: case 257: case 258: case 259: case 260: case 261: case 262: case 263: case 264: case 265: 258 case 266: case 267: case 268: case 269: case 270: case 271: case 272: case 273: case 274: case 275: case 276: case 277: 259 case 278: case 279: case 280: case 281: case 282: case 283: case 284: case 285: case 286: case 287: case 288: case 289: 260 case 290: case 291: case 292: case 293: case 294: case 295: case 296: case 297: case 298: case 299: case 300: case 301: 261 case 302: case 303: case 305: case 306: case 307: case 308: case 309: case 310: case 330: case 331: 262 case 332: case 333: case 334: case 335: case 336: case 337: case 338: case 339: case 340: case 341: case 342: case 343: 263 case 344: case 345: case 346: case 347: case 348: case 349: case 350: case 351: case 352: case 353: case 354: case 355: 264 case 356: case 357: case 358: case 359: case 360: case 361: case 362: case 363: case 364: case 365: case 366: case 367: 265 case 368: case 369: case 370: case 371: case 372: case 373: case 374: case 416: case 417: case 418: case 419: 266 case 420: case 478: case 479: case 480: case 481: case 482: case 483: case 484: case 485: case 486: case 487: 267 case 488: case 489: case 490: case 491: case 492: case 493: case 494: case 504: case 505: case 506: case 507: 268 case 508: case 509: case 510: case 511: case 512: case 513: case 514: case 515: case 516: case 517: case 518: case 519: 269 case 520: case 521: case 522: case 523: case 524: case 525: case 526: case 527: case 528: case 529: case 530: case 531: 270 case 532: case 533: case 534: case 535: case 536: case 537: case 538: case 539: case 540: case 541: case 542: 271 case 546: case 547: case 548: case 549: case 550: case 551: case 552: case 553: case 554: case 555: case 556: case 557: 272 case 558: case 559: case 560: case 561: case 562: case 984: case 985: case 986: case 987: case 988: case 989: 273 case 990: case 991: case 992: case 993: case 994: case 995: case 996: case 997: case 998: case 999: case 1000: case 1001: 274 case 1002: case 1003: case 1004: case 1005: case 1006: case 1120: case 1121: case 1122: case 1123: case 1124: case 1125: 275 case 1126: case 1127: case 1128: case 1129: case 1130: case 1131: case 1132: case 1133: case 1134: case 1135: case 1136: case 1137: 276 case 1138: case 1139: case 1140: case 1141: case 1142: case 1143: case 1144: case 1145: case 1146: case 1147: case 1148: case 1149: 277 case 1150: case 1151: case 1152: case 1162: case 1163: case 1164: case 1165: case 1166: case 1167: case 1168: case 1169: 278 case 1170: case 1171: case 1172: case 1173: case 1174: case 1175: case 1176: case 1177: case 1178: case 1179: case 1180: case 1181: 279 case 1182: case 1183: case 1184: case 1185: case 1186: case 1187: case 1188: case 1189: case 1190: case 1191: case 1192: case 1193: 280 case 1194: case 1195: case 1196: case 1197: case 1198: case 1199: case 1200: case 1201: case 1202: case 1203: case 1204: case 1205: 281 case 1206: case 1207: case 1208: case 1209: case 1210: case 1211: case 1212: case 1213: case 1214: case 1232: case 1233: 282 case 1234: case 1235: case 1236: case 1237: case 1238: case 1239: case 1240: case 1241: case 1242: case 1243: case 1244: case 1245: 283 case 1246: case 1247: case 1248: case 1249: case 1250: case 1251: case 1252: case 1253: case 1254: case 1255: case 1256: case 1257: 284 case 1258: case 1259: case 1260: case 1261: case 1262: case 1263: case 1264: case 1265: case 1266: case 1267: case 1268: 285 case 1280: case 1281: case 1282: case 1283: case 1284: case 1285: case 1286: case 1287: case 1288: case 1289: case 1290: case 1291: 286 case 1292: case 1293: case 1294: 287 if ( !(((utf32) % 2 == 1))) { 288 lc = (utf32 + 1); 289 } 290 break; 291 case 313: case 314: case 315: case 316: case 317: case 318: case 319: case 320: case 321: case 322: 292 case 323: case 324: case 325: case 326: case 327: case 377: case 378: case 379: case 380: case 381: 293 case 459: case 460: case 461: case 462: case 463: case 464: case 465: case 466: case 467: case 468: case 469: case 470: 294 case 471: case 472: case 473: case 474: case 475: case 1217: case 1218: case 1219: case 1220: case 1221: case 1222: 295 case 1223: case 1224: case 1225: case 1226: case 1227: case 1228: case 1229: 296 if (((utf32) % 2 == 1)) { 297 lc = (utf32 + 1); 298 } 299 break; 300 case 1024: case 1025: case 1026: case 1027: case 1028: case 1029: case 1030: case 1031: case 1032: case 1033: 301 case 1034: case 1035: case 1036: case 1037: case 1038: case 1039: 302 lc = (utf32 + 80); 303 break; 304 case 1329: case 1330: case 1331: case 1332: case 1333: case 1334: case 1335: case 1336: case 1337: case 1338: 305 case 1339: case 1340: case 1341: case 1342: case 1343: case 1344: case 1345: case 1346: case 1347: case 1348: case 1349: case 1350: 306 case 1351: case 1352: case 1353: case 1354: case 1355: case 1356: case 1357: case 1358: case 1359: case 1360: case 1361: case 1362: 307 case 1363: case 1364: case 1365: case 1366: 308 lc = (utf32 + 48); 309 break; 310 case 304: 311 lc = 105; 312 break; 313 case 376: 314 lc = 255; 315 break; 316 case 385: 317 lc = 595; 318 break; 319 case 386: 320 lc = 387; 321 break; 322 case 388: 323 lc = 389; 324 break; 325 case 390: 326 lc = 596; 327 break; 328 case 391: 329 lc = 392; 330 break; 331 case 393: 332 lc = 598; 333 break; 334 case 394: 335 lc = 599; 336 break; 337 case 395: 338 lc = 396; 339 break; 340 case 398: 341 lc = 477; 342 break; 343 case 399: 344 lc = 601; 345 break; 346 case 400: 347 lc = 603; 348 break; 349 case 401: 350 lc = 402; 351 break; 352 case 403: 353 lc = 608; 354 break; 355 case 404: 356 lc = 611; 357 break; 358 case 406: 359 lc = 617; 360 break; 361 case 407: 362 lc = 616; 363 break; 364 case 408: 365 lc = 409; 366 break; 367 case 412: 368 lc = 623; 369 break; 370 case 413: 371 lc = 626; 372 break; 373 case 415: 374 lc = 629; 375 break; 376 case 422: 377 lc = 640; 378 break; 379 case 423: 380 lc = 424; 381 break; 382 case 425: 383 lc = 643; 384 break; 385 case 428: 386 lc = 429; 387 break; 388 case 430: 389 lc = 648; 390 break; 391 case 431: 392 lc = 432; 393 break; 394 case 433: 395 lc = 650; 396 break; 397 case 434: 398 lc = 651; 399 break; 400 case 435: 401 lc = 436; 402 break; 403 case 437: 404 lc = 438; 405 break; 406 case 439: 407 lc = 658; 408 break; 409 case 440: 410 lc = 441; 411 break; 412 case 444: 413 lc = 445; 414 break; 415 case 452: 416 lc = 454; 417 break; 418 case 453: 419 lc = 454; 420 break; 421 case 455: 422 lc = 457; 423 break; 424 case 456: 425 lc = 457; 426 break; 427 case 458: 428 lc = 460; 429 break; 430 case 497: 431 lc = 499; 432 break; 433 case 498: 434 lc = 499; 435 break; 436 case 500: 437 lc = 501; 438 break; 439 case 502: 440 lc = 405; 441 break; 442 case 503: 443 lc = 447; 444 break; 445 case 544: 446 lc = 414; 447 break; 448 case 902: 449 lc = 940; 450 break; 451 case 904: 452 lc = 941; 453 break; 454 case 905: 455 lc = 942; 456 break; 457 case 906: 458 lc = 943; 459 break; 460 case 908: 461 lc = 972; 462 break; 463 case 910: 464 lc = 973; 465 break; 466 case 911: 467 lc = 974; 468 break; 469 case 1012: 470 lc = 952; 471 break; 472 case 1015: 473 lc = 1016; 474 break; 475 case 1017: 476 lc = 1010; 477 break; 478 case 1018: 479 lc = 1019; 480 break; 481 case 1272: 482 lc = 1273; 483 break; 484 default: 485 break; 486 } 487 } 488 return lc; 489 } 490 491 /** 492 * Converts utf32 input to uppercase 493 * @param utf32 : a single character encoded in UTF32 494 * @return a single uppercase character encoded in UTF32 495 */ 496 static picoos_uint32 base_utf32_uppercase (picoos_uint32 utf32) 497 { 498 picoos_uint32 lc; 499 500 lc = utf32; 501 if (((utf32 >= 65345) && (utf32 <= 65370))) { 502 lc = (utf32 - 32); 503 } else if (((utf32 >= 66600) && (utf32 <= 66639))) { 504 lc = (utf32 - 40); 505 } else if (((utf32 >= 7681) && (utf32 <= 9449))) { 506 switch (utf32) { 507 case 7681: case 7682: case 7683: case 7684: case 7685: case 7686: case 7687: case 7688: case 7689: case 7690: 508 case 7691: case 7692: case 7693: case 7694: case 7695: case 7696: case 7697: case 7698: case 7699: case 7700: case 7701: case 7702: 509 case 7703: case 7704: case 7705: case 7706: case 7707: case 7708: case 7709: case 7710: case 7711: case 7712: case 7713: case 7714: 510 case 7715: case 7716: case 7717: case 7718: case 7719: case 7720: case 7721: case 7722: case 7723: case 7724: case 7725: case 7726: 511 case 7727: case 7728: case 7729: case 7730: case 7731: case 7732: case 7733: case 7734: case 7735: case 7736: case 7737: case 7738: 512 case 7739: case 7740: case 7741: case 7742: case 7743: case 7744: case 7745: case 7746: case 7747: case 7748: case 7749: case 7750: 513 case 7751: case 7752: case 7753: case 7754: case 7755: case 7756: case 7757: case 7758: case 7759: case 7760: case 7761: case 7762: 514 case 7763: case 7764: case 7765: case 7766: case 7767: case 7768: case 7769: case 7770: case 7771: case 7772: case 7773: case 7774: 515 case 7775: case 7776: case 7777: case 7778: case 7779: case 7780: case 7781: case 7782: case 7783: case 7784: case 7785: case 7786: 516 case 7787: case 7788: case 7789: case 7790: case 7791: case 7792: case 7793: case 7794: case 7795: case 7796: case 7797: case 7798: 517 case 7799: case 7800: case 7801: case 7802: case 7803: case 7804: case 7805: case 7806: case 7807: case 7808: case 7809: case 7810: 518 case 7811: case 7812: case 7813: case 7814: case 7815: case 7816: case 7817: case 7818: case 7819: case 7820: case 7821: case 7822: 519 case 7823: case 7824: case 7825: case 7826: case 7827: case 7828: case 7829: case 7841: case 7842: case 7843: case 7844: 520 case 7845: case 7846: case 7847: case 7848: case 7849: case 7850: case 7851: case 7852: case 7853: case 7854: case 7855: case 7856: 521 case 7857: case 7858: case 7859: case 7860: case 7861: case 7862: case 7863: case 7864: case 7865: case 7866: case 7867: case 7868: 522 case 7869: case 7870: case 7871: case 7872: case 7873: case 7874: case 7875: case 7876: case 7877: case 7878: case 7879: case 7880: 523 case 7881: case 7882: case 7883: case 7884: case 7885: case 7886: case 7887: case 7888: case 7889: case 7890: case 7891: case 7892: 524 case 7893: case 7894: case 7895: case 7896: case 7897: case 7898: case 7899: case 7900: case 7901: case 7902: case 7903: case 7904: 525 case 7905: case 7906: case 7907: case 7908: case 7909: case 7910: case 7911: case 7912: case 7913: case 7914: case 7915: case 7916: 526 case 7917: case 7918: case 7919: case 7920: case 7921: case 7922: case 7923: case 7924: case 7925: case 7926: case 7927: case 7928: 527 case 7929: 528 if (((utf32) % 2 == 1)) { 529 lc = (utf32 - 1); 530 } 531 break; 532 case 7936: case 7937: case 7938: case 7939: case 7940: case 7941: case 7942: case 7943: case 7952: 533 case 7953: case 7954: case 7955: case 7956: case 7957: case 7968: case 7969: case 7970: case 7971: case 7972: case 7973: 534 case 7974: case 7975: case 7984: case 7985: case 7986: case 7987: case 7988: case 7989: case 7990: case 7991: 535 case 8000: case 8001: case 8002: case 8003: case 8004: case 8005: case 8032: case 8033: case 8034: case 8035: case 8036: 536 case 8037: case 8038: case 8039: case 8064: case 8065: case 8066: case 8067: case 8068: case 8069: case 8070: case 8071: 537 case 8080: case 8081: case 8082: case 8083: case 8084: case 8085: case 8086: case 8087: case 8096: case 8097: 538 case 8098: case 8099: case 8100: case 8101: case 8102: case 8103: 539 lc = (utf32 + 8); 540 break; 541 case 8017: case 8018: case 8019: case 8020: case 8021: case 8022: case 8023: 542 if (((utf32) % 2 == 1)) { 543 lc = (utf32 + 8); 544 } 545 break; 546 case 8560: case 8561: case 8562: case 8563: case 8564: case 8565: case 8566: case 8567: case 8568: case 8569: 547 case 8570: case 8571: case 8572: case 8573: case 8574: case 8575: 548 lc = (utf32 - 16); 549 break; 550 case 9424: case 9425: case 9426: case 9427: case 9428: case 9429: case 9430: case 9431: case 9432: case 9433: 551 case 9434: case 9435: case 9436: case 9437: case 9438: case 9439: case 9440: case 9441: case 9442: case 9443: case 9444: case 9445: 552 case 9446: case 9447: case 9448: case 9449: 553 lc = (utf32 - 26); 554 break; 555 case 7835: 556 lc = 7776; 557 break; 558 case 8048: 559 lc = 8122; 560 break; 561 case 8049: 562 lc = 8123; 563 break; 564 case 8050: 565 lc = 8136; 566 break; 567 case 8051: 568 lc = 8137; 569 break; 570 case 8052: 571 lc = 8138; 572 break; 573 case 8053: 574 lc = 8139; 575 break; 576 case 8054: 577 lc = 8154; 578 break; 579 case 8055: 580 lc = 8155; 581 break; 582 case 8056: 583 lc = 8184; 584 break; 585 case 8057: 586 lc = 8185; 587 break; 588 case 8058: 589 lc = 8170; 590 break; 591 case 8059: 592 lc = 8171; 593 break; 594 case 8060: 595 lc = 8186; 596 break; 597 case 8061: 598 lc = 8187; 599 break; 600 case 8112: 601 lc = 8120; 602 break; 603 case 8113: 604 lc = 8121; 605 break; 606 case 8115: 607 lc = 8124; 608 break; 609 case 8126: 610 lc = 921; 611 break; 612 case 8131: 613 lc = 8140; 614 break; 615 case 8144: 616 lc = 8152; 617 break; 618 case 8145: 619 lc = 8153; 620 break; 621 case 8160: 622 lc = 8168; 623 break; 624 case 8161: 625 lc = 8169; 626 break; 627 case 8165: 628 lc = 8172; 629 break; 630 case 8179: 631 lc = 8188; 632 break; 633 default: 634 break; 635 } 636 } else { 637 switch (utf32) { 638 case 97: case 98: case 99: case 100: case 101: case 102: case 103: case 104: case 105: case 106: 639 case 107: case 108: case 109: case 110: case 111: case 112: case 113: case 114: case 115: case 116: case 117: case 118: 640 case 119: case 120: case 121: case 122: case 224: case 225: case 226: case 227: case 228: case 229: case 230: 641 case 231: case 232: case 233: case 234: case 235: case 236: case 237: case 238: case 239: case 240: case 241: case 242: 642 case 243: case 244: case 245: case 246: case 247: case 248: case 249: case 250: case 251: case 252: case 253: case 254: 643 case 945: case 946: case 947: case 948: case 949: case 950: case 951: case 952: case 953: case 954: case 955: 644 case 956: case 957: case 958: case 959: case 960: case 961: case 963: case 964: case 965: case 966: case 967: 645 case 968: case 969: case 970: case 971: case 1072: case 1073: case 1074: case 1075: case 1076: case 1077: case 1078: 646 case 1079: case 1080: case 1081: case 1082: case 1083: case 1084: case 1085: case 1086: case 1087: case 1088: case 1089: case 1090: 647 case 1091: case 1092: case 1093: case 1094: case 1095: case 1096: case 1097: case 1098: case 1099: case 1100: case 1101: case 1102: 648 case 1103: 649 if ((utf32 != 247)) { 650 lc = (utf32 - 32); 651 } 652 break; 653 case 257: case 258: case 259: case 260: case 261: case 262: case 263: case 264: case 265: case 266: 654 case 267: case 268: case 269: case 270: case 271: case 272: case 273: case 274: case 275: case 276: case 277: case 278: 655 case 279: case 280: case 281: case 282: case 283: case 284: case 285: case 286: case 287: case 288: case 289: case 290: 656 case 291: case 292: case 293: case 294: case 295: case 296: case 297: case 298: case 299: case 300: case 301: case 302: 657 case 303: case 304: case 306: case 307: case 308: case 309: case 310: case 311: case 331: case 332: 658 case 333: case 334: case 335: case 336: case 337: case 338: case 339: case 340: case 341: case 342: case 343: case 344: 659 case 345: case 346: case 347: case 348: case 349: case 350: case 351: case 352: case 353: case 354: case 355: case 356: 660 case 357: case 358: case 359: case 360: case 361: case 362: case 363: case 364: case 365: case 366: case 367: case 368: 661 case 369: case 370: case 371: case 372: case 373: case 374: case 375: case 417: case 418: case 419: case 420: 662 case 421: case 481: case 482: case 483: case 484: case 485: case 486: case 487: case 488: case 489: case 490: 663 case 491: case 492: case 493: case 494: case 495: case 507: case 508: case 509: case 510: case 511: 664 case 513: case 514: case 515: case 516: case 517: case 518: case 519: case 520: case 521: case 522: case 523: case 524: 665 case 525: case 526: case 527: case 528: case 529: case 530: case 531: case 532: case 533: case 534: case 535: case 536: 666 case 537: case 538: case 539: case 540: case 541: case 542: case 543: case 544: case 546: case 547: case 548: 667 case 549: case 550: case 551: case 552: case 553: case 554: case 555: case 556: case 557: case 558: case 559: case 560: 668 case 561: case 562: case 563: case 985: case 986: case 987: case 988: case 989: case 990: case 991: case 992: 669 case 993: case 994: case 995: case 996: case 997: case 998: case 999: case 1000: case 1001: case 1002: case 1003: case 1004: 670 case 1005: case 1006: case 1007: case 1121: case 1122: case 1123: case 1124: case 1125: case 1126: case 1127: case 1128: 671 case 1129: case 1130: case 1131: case 1132: case 1133: case 1134: case 1135: case 1136: case 1137: case 1138: case 1139: case 1140: 672 case 1141: case 1142: case 1143: case 1144: case 1145: case 1146: case 1147: case 1148: case 1149: case 1150: case 1151: case 1152: 673 case 1153: case 1163: case 1164: case 1165: case 1166: case 1167: case 1168: case 1169: case 1170: case 1171: case 1172: 674 case 1173: case 1174: case 1175: case 1176: case 1177: case 1178: case 1179: case 1180: case 1181: case 1182: case 1183: case 1184: 675 case 1185: case 1186: case 1187: case 1188: case 1189: case 1190: case 1191: case 1192: case 1193: case 1194: case 1195: case 1196: 676 case 1197: case 1198: case 1199: case 1200: case 1201: case 1202: case 1203: case 1204: case 1205: case 1206: case 1207: case 1208: 677 case 1209: case 1210: case 1211: case 1212: case 1213: case 1214: case 1215: case 1233: case 1234: case 1235: case 1236: 678 case 1237: case 1238: case 1239: case 1240: case 1241: case 1242: case 1243: case 1244: case 1245: case 1246: case 1247: case 1248: 679 case 1249: case 1250: case 1251: case 1252: case 1253: case 1254: case 1255: case 1256: case 1257: case 1258: case 1259: case 1260: 680 case 1261: case 1262: case 1263: case 1264: case 1265: case 1266: case 1267: case 1268: case 1269: case 1281: case 1282: 681 case 1283: case 1284: case 1285: case 1286: case 1287: case 1288: case 1289: case 1290: case 1291: case 1292: case 1293: case 1294: 682 case 1295: 683 if (((utf32) % 2 == 1)) { 684 lc = (utf32 - 1); 685 } 686 break; 687 case 314: case 315: case 316: case 317: case 318: case 319: case 320: case 321: case 322: case 323: 688 case 324: case 325: case 326: case 327: case 328: case 378: case 379: case 380: case 381: case 382: 689 case 464: case 465: case 466: case 467: case 468: case 469: case 470: case 471: case 472: case 473: case 474: case 475: 690 case 476: case 1218: case 1219: case 1220: case 1221: case 1222: case 1223: case 1224: case 1225: case 1226: case 1227: 691 case 1228: case 1229: case 1230: 692 if ( !(((utf32) % 2 == 1))) { 693 lc = (utf32 - 1); 694 } 695 break; 696 case 1104: case 1105: case 1106: case 1107: case 1108: case 1109: case 1110: case 1111: case 1112: case 1113: 697 case 1114: case 1115: case 1116: case 1117: case 1118: case 1119: 698 lc = (utf32 - 80); 699 break; 700 case 1377: case 1378: case 1379: case 1380: case 1381: case 1382: case 1383: case 1384: case 1385: case 1386: 701 case 1387: case 1388: case 1389: case 1390: case 1391: case 1392: case 1393: case 1394: case 1395: case 1396: case 1397: case 1398: 702 case 1399: case 1400: case 1401: case 1402: case 1403: case 1404: case 1405: case 1406: case 1407: case 1408: case 1409: case 1410: 703 case 1411: case 1412: case 1413: case 1414: 704 lc = (utf32 - 48); 705 break; 706 case 181: 707 lc = 924; 708 break; 709 case 255: 710 lc = 376; 711 break; 712 case 305: 713 lc = 73; 714 break; 715 case 383: 716 lc = 83; 717 break; 718 case 387: 719 lc = 386; 720 break; 721 case 389: 722 lc = 388; 723 break; 724 case 392: 725 lc = 391; 726 break; 727 case 396: 728 lc = 395; 729 break; 730 case 402: 731 lc = 401; 732 break; 733 case 405: 734 lc = 502; 735 break; 736 case 409: 737 lc = 408; 738 break; 739 case 414: 740 lc = 544; 741 break; 742 case 424: 743 lc = 423; 744 break; 745 case 429: 746 lc = 428; 747 break; 748 case 432: 749 lc = 431; 750 break; 751 case 436: 752 lc = 435; 753 break; 754 case 438: 755 lc = 437; 756 break; 757 case 441: 758 lc = 440; 759 break; 760 case 445: 761 lc = 444; 762 break; 763 case 447: 764 lc = 503; 765 break; 766 case 453: 767 lc = 452; 768 break; 769 case 454: 770 lc = 452; 771 break; 772 case 456: 773 lc = 455; 774 break; 775 case 457: 776 lc = 455; 777 break; 778 case 459: 779 lc = 458; 780 break; 781 case 460: 782 lc = 458; 783 break; 784 case 462: 785 lc = 461; 786 break; 787 case 477: 788 lc = 398; 789 break; 790 case 479: 791 lc = 478; 792 break; 793 case 498: 794 lc = 497; 795 break; 796 case 499: 797 lc = 497; 798 break; 799 case 501: 800 lc = 500; 801 break; 802 case 505: 803 lc = 504; 804 break; 805 case 595: 806 lc = 385; 807 break; 808 case 596: 809 lc = 390; 810 break; 811 case 598: 812 lc = 393; 813 break; 814 case 599: 815 lc = 394; 816 break; 817 case 601: 818 lc = 399; 819 break; 820 case 603: 821 lc = 400; 822 break; 823 case 608: 824 lc = 403; 825 break; 826 case 611: 827 lc = 404; 828 break; 829 case 616: 830 lc = 407; 831 break; 832 case 617: 833 lc = 406; 834 break; 835 case 623: 836 lc = 412; 837 break; 838 case 626: 839 lc = 413; 840 break; 841 case 629: 842 lc = 415; 843 break; 844 case 640: 845 lc = 422; 846 break; 847 case 643: 848 lc = 425; 849 break; 850 case 648: 851 lc = 430; 852 break; 853 case 650: 854 lc = 433; 855 break; 856 case 651: 857 lc = 434; 858 break; 859 case 658: 860 lc = 439; 861 break; 862 case 837: 863 lc = 921; 864 break; 865 case 940: 866 lc = 902; 867 break; 868 case 941: 869 lc = 904; 870 break; 871 case 942: 872 lc = 905; 873 break; 874 case 943: 875 lc = 906; 876 break; 877 case 962: 878 lc = 931; 879 break; 880 case 972: 881 lc = 908; 882 break; 883 case 973: 884 lc = 910; 885 break; 886 case 974: 887 lc = 911; 888 break; 889 case 976: 890 lc = 914; 891 break; 892 case 977: 893 lc = 920; 894 break; 895 case 981: 896 lc = 934; 897 break; 898 case 982: 899 lc = 928; 900 break; 901 case 1008: 902 lc = 922; 903 break; 904 case 1009: 905 lc = 929; 906 break; 907 case 1010: 908 lc = 1017; 909 break; 910 case 1013: 911 lc = 917; 912 break; 913 case 1016: 914 lc = 1015; 915 break; 916 case 1019: 917 lc = 1018; 918 break; 919 case 1273: 920 lc = 1272; 921 break; 922 default: 923 break; 924 } 925 } 926 return lc; 927 } 928 929 /** 930 * Gets the UTF8 character 'utf8char' from the UTF8 string 'utf8str' from 931 * position 'pos' 932 * @param utf8str: utf8 string 933 * @param pos: position from where the utf8 character is copied 934 * (also output set as position of the next utf8 character in the utf8 string) 935 * @param utf8char: zero terminated utf8 character containing 1 to 4 bytes (output) 936 */ 937 static void picobase_get_utf8char (picoos_uint8 utf8[], picoos_int32 * pos, picobase_utf8char utf8char) 938 { 939 940 int i; 941 int l; 942 943 utf8char[0] = 0; 944 l = picobase_det_utf8_length(utf8[*pos]); 945 i = 0; 946 while ((((i < l) && (i < PICOBASE_UTF8_MAXLEN)) && (utf8[*pos] != 0))) { 947 utf8char[i] = utf8[*pos]; 948 (*pos)++; 949 i++; 950 } 951 utf8char[i] = 0; 952 } 953 954 955 picoos_uint8 picobase_get_next_utf8char(const picoos_uint8 *utf8s, 956 const picoos_uint32 utf8slenmax, 957 picoos_uint32 *pos, 958 picobase_utf8char utf8char) { 959 picoos_uint8 i; 960 picoos_uint8 len; 961 picoos_uint32 poscnt; 962 963 utf8char[0] = 0; 964 len = picobase_det_utf8_length(utf8s[*pos]); 965 if ((((*pos) + len) > utf8slenmax) || 966 (len > PICOBASE_UTF8_MAXLEN)) { 967 return FALSE; 968 } 969 970 poscnt = *pos; 971 i = 0; 972 while ((i < len) && (utf8s[poscnt] != 0)) { 973 utf8char[i] = utf8s[poscnt]; 974 poscnt++; 975 i++; 976 } 977 utf8char[i] = 0; 978 if ((i < len) && (utf8s[poscnt] == 0)) { 979 return FALSE; 980 } 981 *pos = poscnt; 982 return TRUE; 983 } 984 985 picoos_uint8 picobase_get_next_utf8charpos(const picoos_uint8 *utf8s, 986 const picoos_uint32 utf8slenmax, 987 picoos_uint32 *pos) { 988 picoos_uint8 i; 989 picoos_uint8 len; 990 picoos_uint32 poscnt; 991 992 len = picobase_det_utf8_length(utf8s[*pos]); 993 if ((((*pos) + len) > utf8slenmax) || 994 (len > PICOBASE_UTF8_MAXLEN)){ 995 return FALSE; 996 } 997 998 poscnt = *pos; 999 i = 0; 1000 while ((i < len) && (utf8s[poscnt] != 0)) { 1001 poscnt++; 1002 i++; 1003 } 1004 if ((i < len) && (utf8s[poscnt] == 0)) { 1005 return FALSE; 1006 } 1007 *pos = poscnt; 1008 return TRUE; 1009 } 1010 1011 picoos_uint8 picobase_get_prev_utf8char(const picoos_uint8 *utf8s, 1012 const picoos_uint32 utf8slenmin, 1013 picoos_uint32 *pos, 1014 picobase_utf8char utf8char) { 1015 picoos_uint8 i, j; 1016 picoos_uint8 len; 1017 picoos_uint32 poscnt; 1018 1019 utf8char[0] = 0; 1020 if ((*pos) == 0) { 1021 return FALSE; 1022 } 1023 poscnt = (*pos) - 1; 1024 i = 1; 1025 while ((i <= PICOBASE_UTF8_MAXLEN) && (poscnt >= utf8slenmin) && 1026 (utf8s[poscnt] != 0)) { 1027 len = picobase_det_utf8_length(utf8s[poscnt]); 1028 if (len == i) { 1029 for (j = 0; j < len; j++) { 1030 utf8char[j] = utf8s[poscnt + j]; 1031 } 1032 utf8char[j] = 0; 1033 *pos = poscnt; 1034 return TRUE; 1035 } 1036 i++; 1037 poscnt--; 1038 } 1039 return FALSE; 1040 } 1041 1042 picoos_uint8 picobase_get_prev_utf8charpos(const picoos_uint8 *utf8s, 1043 const picoos_uint32 utf8slenmin, 1044 picoos_uint32 *pos) { 1045 picoos_uint8 i; 1046 picoos_uint8 len; 1047 picoos_uint32 poscnt; 1048 1049 if ((*pos) == 0) { 1050 return FALSE; 1051 } 1052 poscnt = (*pos) - 1; 1053 i = 1; 1054 while ((i <= PICOBASE_UTF8_MAXLEN) && (poscnt >= utf8slenmin) && 1055 (utf8s[poscnt] != 0)) { 1056 len = picobase_det_utf8_length(utf8s[poscnt]); 1057 if (len == i) { 1058 *pos = poscnt; 1059 return TRUE; 1060 } 1061 i++; 1062 poscnt--; 1063 } 1064 return FALSE; 1065 } 1066 1067 /** 1068 * Converts utf8 input to utf32 1069 * @param utf8[] : character encoded in utf8 1070 * @param done : boolean indicating the completion of the operation (FALSE: conversion not done) 1071 * @return a single character encoded in UTF32 1072 */ 1073 static picobase_utf32 picobase_utf8_to_utf32 (picoos_uint8 utf8[], picoos_uint8 * done) 1074 { 1075 (*done) = TRUE; 1076 if ((utf8[0] < (picoos_uint8)'\200')) { 1077 return utf8[0]; 1078 } else if ((utf8[0] >= (picoos_uint8)'\370')) { 1079 return 0; 1080 } else if ((utf8[0] >= (picoos_uint8)'\360')) { 1081 return ((((262144 * (utf8[0] % 8)) + (4096 * (utf8[1] % 64))) + (64 * (utf8[2] % 64))) + (utf8[3] % 64)); 1082 } else if ((utf8[0] >= (picoos_uint8)'\340')) { 1083 return (((4096 * (utf8[0] % 16)) + (64 * (utf8[1] % 64))) + (utf8[2] % 64)); 1084 } else if ((utf8[(0)] >= (picoos_uint8)'\300')) { 1085 return ((64 * (utf8[0] % 32)) + (utf8[1] % 64)); 1086 } else { 1087 (*done) = FALSE; 1088 return 0; 1089 } 1090 } 1091 1092 static picoos_int32 picobase_utf32_to_utf8 (picobase_utf32 utf32, picobase_utf8 utf8[], picoos_int32 utf8MaxLen, picoos_uint8 * done) 1093 { 1094 picoos_int32 len; 1095 1096 (*done) = TRUE; 1097 if (utf8MaxLen >= 4) { 1098 if (utf32 < 128) { 1099 len = 1; 1100 utf8[0] = utf32; 1101 } else if (utf32 < 2048) { 1102 len = 2; 1103 utf8[1] = (128 + (utf32 % 64)); 1104 utf32 = (utf32 / 64); 1105 utf8[0] = (192 + (utf32 % 32)); 1106 } else if (utf32 < 65536) { 1107 len = 3; 1108 utf8[2] = (128 + (utf32 % 64)); 1109 utf32 = (utf32 / 64); 1110 utf8[1] = (128 + (utf32 % 64)); 1111 utf32 = (utf32 / 64); 1112 utf8[0] = (224 + utf32); 1113 } else if (utf32 < 1048576) { 1114 len = 4; 1115 utf8[3] = (128 + (utf32 % 64)); 1116 utf32 = (utf32 / 64); 1117 utf8[2] = (128 + (utf32 % 64)); 1118 utf32 = (utf32 / 64); 1119 utf8[1] = (128 + (utf32 % 64)); 1120 utf32 = (utf32 / 64); 1121 utf8[0] = (240 + utf32); 1122 } else { 1123 (*done) = FALSE; 1124 return 0; 1125 } 1126 if (len <= (utf8MaxLen-1)) { 1127 utf8[len] = 0; 1128 } 1129 return len; 1130 } else { 1131 (*done) = FALSE; 1132 return 0; 1133 } 1134 } 1135 1136 1137 extern picoos_int32 picobase_lowercase_utf8_str (picoos_uchar utf8str[], picoos_char lowercase[], int lowercaseMaxLen, picoos_uint8 * done) 1138 { 1139 picobase_utf8char utf8char; 1140 picoos_int32 i; 1141 picoos_int32 j; 1142 picoos_int32 k; 1143 picoos_int32 l; 1144 picobase_utf32 utf32; 1145 picoos_uint8 done1; 1146 1147 k = 0; 1148 i = 0; 1149 (*done) = TRUE; 1150 while (utf8str[i] != 0) { 1151 picobase_get_utf8char(utf8str,& i,utf8char); 1152 utf32 = picobase_utf8_to_utf32(utf8char, & done1); 1153 utf32 = base_utf32_lowercase(utf32); 1154 l = picobase_utf32_to_utf8(utf32, utf8char, PICOBASE_UTF8_MAXLEN, & done1); 1155 j = 0; 1156 while ((j < l) && (k < (lowercaseMaxLen-1))) { 1157 lowercase[k] = utf8char[j]; 1158 k++; 1159 j++; 1160 } 1161 *done = *done && (j == l); 1162 } 1163 lowercase[k] = 0; 1164 return k; 1165 } 1166 1167 1168 extern picoos_int32 picobase_uppercase_utf8_str (picoos_uchar utf8str[], picoos_char uppercase[], int uppercaseMaxLen, picoos_uint8 * done) 1169 { 1170 picobase_utf8char utf8char; 1171 picoos_int32 i; 1172 picoos_int32 j; 1173 picoos_int32 k; 1174 picoos_int32 l; 1175 picobase_utf32 utf32; 1176 picoos_uint8 done1; 1177 1178 k = 0; 1179 i = 0; 1180 (*done) = TRUE; 1181 while (utf8str[i] != 0) { 1182 picobase_get_utf8char(utf8str,& i,utf8char); 1183 utf32 = picobase_utf8_to_utf32(utf8char, & done1); 1184 utf32 = base_utf32_uppercase(utf32); 1185 l = picobase_utf32_to_utf8(utf32, utf8char, PICOBASE_UTF8_MAXLEN, & done1); 1186 j = 0; 1187 while ((j < l) && (k < (uppercaseMaxLen-1))) { 1188 uppercase[k] = utf8char[j]; 1189 k++; 1190 j++; 1191 } 1192 *done = *done && (j == l); 1193 } 1194 uppercase[k] = 0; 1195 return k; 1196 } 1197 1198 1199 extern picoos_bool picobase_is_utf8_uppercase (picoos_uchar utf8str[], picoos_int32 utf8strmaxlen) 1200 { 1201 picobase_utf8char utf8char; 1202 picoos_int32 i; 1203 picoos_uint32 utf32; 1204 picoos_bool done; 1205 picoos_bool isUpperCase; 1206 1207 isUpperCase = TRUE; 1208 i = 0; 1209 while (isUpperCase && (i <= utf8strmaxlen-1) && (utf8str[i] != 0)) { 1210 picobase_get_utf8char(utf8str,& i,utf8char); 1211 utf32 = picobase_utf8_to_utf32(utf8char,& done); 1212 isUpperCase = isUpperCase && (utf32 == base_utf32_uppercase(utf32)); 1213 } 1214 return isUpperCase; 1215 } 1216 1217 1218 extern picoos_bool picobase_is_utf8_lowercase (picoos_uchar utf8str[], picoos_int32 utf8strmaxlen) 1219 { 1220 picobase_utf8char utf8char; 1221 picoos_int32 i; 1222 picoos_uint32 utf32; 1223 picoos_bool done; 1224 picoos_bool isLowerCase; 1225 1226 isLowerCase = TRUE; 1227 i = 0; 1228 while (isLowerCase && (i <= utf8strmaxlen-1) && (utf8str[i] != 0)) { 1229 picobase_get_utf8char(utf8str,& i,utf8char); 1230 utf32 = picobase_utf8_to_utf32(utf8char,& done); 1231 isLowerCase = isLowerCase && (utf32 == base_utf32_lowercase(utf32)); 1232 } 1233 return isLowerCase; 1234 } 1235 1236 1237 #ifdef __cplusplus 1238 } 1239 #endif 1240 1241 1242 1243 /* end */ 1244