1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2016 and later: Unicode, Inc. and others. 5 * License & terms of use: http://www.unicode.org/copyright.html#License 6 * 7 ******************************************************************************* 8 ******************************************************************************* 9 * 10 * Copyright (C) 2003-2014, International Business Machines 11 * Corporation and others. All Rights Reserved. 12 * 13 ******************************************************************************* 14 * file name: uciter8.c 15 * encoding: US-ASCII 16 * tab size: 8 (not used) 17 * indentation:4 18 * 19 * created on: 2003jan10 20 * created by: Markus W. Scherer 21 * 22 * This file contains sample code that illustrates reading 23 * 8-bit Unicode text leniently, accepting a mix of UTF-8 and CESU-8 24 * and also accepting single surrogates. 25 */ 26 27 #include <stdio.h> 28 #include <string.h> 29 #include "unicode/utypes.h" 30 #include "unicode/uiter.h" 31 #include "uit_len8.h" 32 33 #define log_err printf 34 35 /* UCharIterator test ------------------------------------------------------- */ 36 37 /* 38 * The following code is a copy of the UCharIterator test code in 39 * source/test/cintltst/custrtst.c, 40 * testing the lenient-8 iterator instead of the UTF-8 one. 41 */ 42 43 /* 44 * Compare results from two iterators, should be same. 45 * Assume that the text is not empty and that 46 * iteration start==0 and iteration limit==length. 47 */ 48 static void 49 compareIterators(UCharIterator *iter1, const char *n1, 50 UCharIterator *iter2, const char *n2) { 51 int32_t i, pos1, pos2, middle, length; 52 UChar32 c1, c2; 53 54 /* compare lengths */ 55 length=iter1->getIndex(iter1, UITER_LENGTH); 56 pos2=iter2->getIndex(iter2, UITER_LENGTH); 57 if(length!=pos2) { 58 log_err("%s->getIndex(length)=%d != %d=%s->getIndex(length)\n", n1, length, pos2, n2); 59 return; 60 } 61 62 /* set into the middle */ 63 middle=length/2; 64 65 pos1=iter1->move(iter1, middle, UITER_ZERO); 66 if(pos1!=middle) { 67 log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n1, middle, pos1); 68 return; 69 } 70 71 pos2=iter2->move(iter2, middle, UITER_ZERO); 72 if(pos2!=middle) { 73 log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n2, middle, pos2); 74 return; 75 } 76 77 /* test current() */ 78 c1=iter1->current(iter1); 79 c2=iter2->current(iter2); 80 if(c1!=c2) { 81 log_err("%s->current()=U+%04x != U+%04x=%s->current() at middle=%d\n", n1, c1, c2, n2, middle); 82 return; 83 } 84 85 /* move forward 3 UChars */ 86 for(i=0; i<3; ++i) { 87 c1=iter1->next(iter1); 88 c2=iter2->next(iter2); 89 if(c1!=c2) { 90 log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT)); 91 return; 92 } 93 } 94 95 /* move backward 5 UChars */ 96 for(i=0; i<5; ++i) { 97 c1=iter1->previous(iter1); 98 c2=iter2->previous(iter2); 99 if(c1!=c2) { 100 log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT)); 101 return; 102 } 103 } 104 105 /* iterate forward from the beginning */ 106 pos1=iter1->move(iter1, 0, UITER_START); 107 if(pos1<0) { 108 log_err("%s->move(start) failed\n", n1); 109 return; 110 } 111 if(!iter1->hasNext(iter1)) { 112 log_err("%s->hasNext() at the start returns FALSE\n", n1); 113 return; 114 } 115 116 pos2=iter2->move(iter2, 0, UITER_START); 117 if(pos2<0) { 118 log_err("%s->move(start) failed\n", n2); 119 return; 120 } 121 if(!iter2->hasNext(iter2)) { 122 log_err("%s->hasNext() at the start returns FALSE\n", n2); 123 return; 124 } 125 126 do { 127 c1=iter1->next(iter1); 128 c2=iter2->next(iter2); 129 if(c1!=c2) { 130 log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT)); 131 return; 132 } 133 } while(c1>=0); 134 135 if(iter1->hasNext(iter1)) { 136 log_err("%s->hasNext() at the end returns TRUE\n", n1); 137 return; 138 } 139 if(iter2->hasNext(iter2)) { 140 log_err("%s->hasNext() at the end returns TRUE\n", n2); 141 return; 142 } 143 144 /* back to the middle */ 145 pos1=iter1->move(iter1, middle, UITER_ZERO); 146 if(pos1!=middle) { 147 log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n1, middle, pos1); 148 return; 149 } 150 151 pos2=iter2->move(iter2, middle, UITER_ZERO); 152 if(pos2!=middle) { 153 log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n2, middle, pos2); 154 return; 155 } 156 157 /* move to index 1 */ 158 pos1=iter1->move(iter1, 1, UITER_ZERO); 159 if(pos1!=1) { 160 log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n1, middle, pos1); 161 return; 162 } 163 164 pos2=iter2->move(iter2, 1, UITER_ZERO); 165 if(pos2!=1) { 166 log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n2, middle, pos2); 167 return; 168 } 169 170 /* iterate backward from the end */ 171 pos1=iter1->move(iter1, 0, UITER_LIMIT); 172 if(pos1<0) { 173 log_err("%s->move(limit) failed\n", n1); 174 return; 175 } 176 if(!iter1->hasPrevious(iter1)) { 177 log_err("%s->hasPrevious() at the end returns FALSE\n", n1); 178 return; 179 } 180 181 pos2=iter2->move(iter2, 0, UITER_LIMIT); 182 if(pos2<0) { 183 log_err("%s->move(limit) failed\n", n2); 184 return; 185 } 186 if(!iter2->hasPrevious(iter2)) { 187 log_err("%s->hasPrevious() at the end returns FALSE\n", n2); 188 return; 189 } 190 191 do { 192 c1=iter1->previous(iter1); 193 c2=iter2->previous(iter2); 194 if(c1!=c2) { 195 log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT)); 196 return; 197 } 198 } while(c1>=0); 199 200 if(iter1->hasPrevious(iter1)) { 201 log_err("%s->hasPrevious() at the start returns TRUE\n", n1); 202 return; 203 } 204 if(iter2->hasPrevious(iter2)) { 205 log_err("%s->hasPrevious() at the start returns TRUE\n", n2); 206 return; 207 } 208 } 209 210 /* 211 * Test the iterator's getState() and setState() functions. 212 * iter1 and iter2 must be set up for the same iterator type and the same string 213 * but may be physically different structs (different addresses). 214 * 215 * Assume that the text is not empty and that 216 * iteration start==0 and iteration limit==length. 217 * It must be 2<=middle<=length-2. 218 */ 219 static void 220 testIteratorState(UCharIterator *iter1, UCharIterator *iter2, const char *n, int32_t middle) { 221 UChar32 u[4]; 222 223 UErrorCode errorCode; 224 UChar32 c; 225 uint32_t state; 226 int32_t i, j; 227 228 /* get four UChars from the middle of the string */ 229 iter1->move(iter1, middle-2, UITER_ZERO); 230 for(i=0; i<4; ++i) { 231 c=iter1->next(iter1); 232 if(c<0) { 233 /* the test violates the assumptions, see comment above */ 234 log_err("test error: %s[%d]=%d\n", n, middle-2+i, c); 235 return; 236 } 237 u[i]=c; 238 } 239 240 /* move to the middle and get the state */ 241 iter1->move(iter1, -2, UITER_CURRENT); 242 state=uiter_getState(iter1); 243 244 /* set the state into the second iterator and compare the results */ 245 errorCode=U_ZERO_ERROR; 246 uiter_setState(iter2, state, &errorCode); 247 if(U_FAILURE(errorCode)) { 248 log_err("%s->setState(0x%x) failed: %s\n", n, state, u_errorName(errorCode)); 249 return; 250 } 251 252 c=iter2->current(iter2); 253 if(c!=u[2]) { 254 log_err("%s->current(at %d)=U+%04x!=U+%04x\n", n, middle, c, u[2]); 255 } 256 257 c=iter2->previous(iter2); 258 if(c!=u[1]) { 259 log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-1, c, u[1]); 260 } 261 262 iter2->move(iter2, 2, UITER_CURRENT); 263 c=iter2->next(iter2); 264 if(c!=u[3]) { 265 log_err("%s->next(at %d)=U+%04x!=U+%04x\n", n, middle+1, c, u[3]); 266 } 267 268 iter2->move(iter2, -3, UITER_CURRENT); 269 c=iter2->previous(iter2); 270 if(c!=u[0]) { 271 log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-2, c, u[0]); 272 } 273 274 /* move the second iterator back to the middle */ 275 iter2->move(iter2, 1, UITER_CURRENT); 276 iter2->next(iter2); 277 278 /* check that both are in the middle */ 279 i=iter1->getIndex(iter1, UITER_CURRENT); 280 j=iter2->getIndex(iter2, UITER_CURRENT); 281 if(i!=middle) { 282 log_err("%s->getIndex(current)=%d!=%d as expected\n", n, i, middle); 283 } 284 if(i!=j) { 285 log_err("%s->getIndex(current)=%d!=%d after setState()\n", n, j, i); 286 } 287 288 /* compare lengths */ 289 i=iter1->getIndex(iter1, UITER_LENGTH); 290 j=iter2->getIndex(iter2, UITER_LENGTH); 291 if(i!=j) { 292 log_err("%s->getIndex(length)=%d!=%d before/after setState()\n", n, i, j); 293 } 294 } 295 296 static void 297 TestLenient8Iterator() { 298 static const UChar text[]={ 299 0x61, 0x62, 0x63, 300 /* dffd 107fd d801 dffd - in UTF-16, U+107fd=<d801 dffd> */ 301 0xdffd, 0xd801, 0xdffd, 0xd801, 0xdffd, 302 0x78, 0x79, 0x7a, 0 303 }; 304 static const uint8_t bytes[]={ 305 0x61, 0x62, 0x63, 306 /* dffd 107fd d801 dffd - mixture */ 307 0xed, 0xbf, 0xbd, 0xf0, 0x90, 0x9f, 0xbd, 0xed, 0xa0, 0x81, 0xed, 0xbf, 0xbd, 308 0x78, 0x79, 0x7a, 0 309 }; 310 311 UCharIterator iter1, iter2; 312 UChar32 c1, c2; 313 int32_t length; 314 315 puts("test a UCharIterator for lenient 8-bit Unicode (accept single surrogates)"); 316 317 /* compare the same string between UTF-16 and lenient-8 UCharIterators */ 318 uiter_setString(&iter1, text, -1); 319 uiter_setLenient8(&iter2, (const char *)bytes, sizeof(bytes)-1); 320 compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator"); 321 322 /* try again with length=-1 */ 323 uiter_setLenient8(&iter2, (const char *)bytes, -1); 324 compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator_1"); 325 326 /* test get/set state */ 327 length=UPRV_LENGTHOF(text)-1; 328 uiter_setLenient8(&iter1, (const char*)bytes, -1); 329 testIteratorState(&iter1, &iter2, "Lenient8IteratorState", length/2); 330 testIteratorState(&iter1, &iter2, "Lenient8IteratorStatePlus1", length/2+1); 331 332 /* ---------------------------------------------------------------------- */ 333 334 puts("no output so far means that the lenient-8 iterator works fine"); 335 336 puts("iterate forward:\nUTF-16\tlenient-8"); 337 uiter_setString(&iter1, text, -1); 338 iter1.move(&iter1, 0, UITER_START); 339 iter2.move(&iter2, 0, UITER_START); 340 for(;;) { 341 c1=iter1.next(&iter1); 342 c2=iter2.next(&iter2); 343 if(c1<0 && c2<0) { 344 break; 345 } 346 if(c1<0) { 347 printf("\t%04x\n", c2); 348 } else if(c2<0) { 349 printf("%04x\n", c1); 350 } else { 351 printf("%04x\t%04x\n", c1, c2); 352 } 353 } 354 } 355 356 extern int 357 main(int argc, const char *argv[]) { 358 TestLenient8Iterator(); 359 return 0; 360 } 361