1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2003-2007, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: uciter8.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2003jan10 14 * created by: Markus W. Scherer 15 * 16 * This file contains sample code that illustrates reading 17 * 8-bit Unicode text leniently, accepting a mix of UTF-8 and CESU-8 18 * and also accepting single surrogates. 19 */ 20 21 #include <stdio.h> 22 #include <string.h> 23 #include "unicode/utypes.h" 24 #include "unicode/uiter.h" 25 #include "uit_len8.h" 26 27 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) 28 29 #define log_err printf 30 31 /* UCharIterator test ------------------------------------------------------- */ 32 33 /* 34 * The following code is a copy of the UCharIterator test code in 35 * source/test/cintltst/custrtst.c, 36 * testing the lenient-8 iterator instead of the UTF-8 one. 37 */ 38 39 /* 40 * Compare results from two iterators, should be same. 41 * Assume that the text is not empty and that 42 * iteration start==0 and iteration limit==length. 43 */ 44 static void 45 compareIterators(UCharIterator *iter1, const char *n1, 46 UCharIterator *iter2, const char *n2) { 47 int32_t i, pos1, pos2, middle, length; 48 UChar32 c1, c2; 49 50 /* compare lengths */ 51 length=iter1->getIndex(iter1, UITER_LENGTH); 52 pos2=iter2->getIndex(iter2, UITER_LENGTH); 53 if(length!=pos2) { 54 log_err("%s->getIndex(length)=%d != %d=%s->getIndex(length)\n", n1, length, pos2, n2); 55 return; 56 } 57 58 /* set into the middle */ 59 middle=length/2; 60 61 pos1=iter1->move(iter1, middle, UITER_ZERO); 62 if(pos1!=middle) { 63 log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n1, middle, pos1); 64 return; 65 } 66 67 pos2=iter2->move(iter2, middle, UITER_ZERO); 68 if(pos2!=middle) { 69 log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n2, middle, pos2); 70 return; 71 } 72 73 /* test current() */ 74 c1=iter1->current(iter1); 75 c2=iter2->current(iter2); 76 if(c1!=c2) { 77 log_err("%s->current()=U+%04x != U+%04x=%s->current() at middle=%d\n", n1, c1, c2, n2, middle); 78 return; 79 } 80 81 /* move forward 3 UChars */ 82 for(i=0; i<3; ++i) { 83 c1=iter1->next(iter1); 84 c2=iter2->next(iter2); 85 if(c1!=c2) { 86 log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT)); 87 return; 88 } 89 } 90 91 /* move backward 5 UChars */ 92 for(i=0; i<5; ++i) { 93 c1=iter1->previous(iter1); 94 c2=iter2->previous(iter2); 95 if(c1!=c2) { 96 log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT)); 97 return; 98 } 99 } 100 101 /* iterate forward from the beginning */ 102 pos1=iter1->move(iter1, 0, UITER_START); 103 if(pos1<0) { 104 log_err("%s->move(start) failed\n", n1); 105 return; 106 } 107 if(!iter1->hasNext(iter1)) { 108 log_err("%s->hasNext() at the start returns FALSE\n", n1); 109 return; 110 } 111 112 pos2=iter2->move(iter2, 0, UITER_START); 113 if(pos2<0) { 114 log_err("%s->move(start) failed\n", n2); 115 return; 116 } 117 if(!iter2->hasNext(iter2)) { 118 log_err("%s->hasNext() at the start returns FALSE\n", n2); 119 return; 120 } 121 122 do { 123 c1=iter1->next(iter1); 124 c2=iter2->next(iter2); 125 if(c1!=c2) { 126 log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT)); 127 return; 128 } 129 } while(c1>=0); 130 131 if(iter1->hasNext(iter1)) { 132 log_err("%s->hasNext() at the end returns TRUE\n", n1); 133 return; 134 } 135 if(iter2->hasNext(iter2)) { 136 log_err("%s->hasNext() at the end returns TRUE\n", n2); 137 return; 138 } 139 140 /* back to the middle */ 141 pos1=iter1->move(iter1, middle, UITER_ZERO); 142 if(pos1!=middle) { 143 log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n1, middle, pos1); 144 return; 145 } 146 147 pos2=iter2->move(iter2, middle, UITER_ZERO); 148 if(pos2!=middle) { 149 log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n2, middle, pos2); 150 return; 151 } 152 153 /* move to index 1 */ 154 pos1=iter1->move(iter1, 1, UITER_ZERO); 155 if(pos1!=1) { 156 log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n1, middle, pos1); 157 return; 158 } 159 160 pos2=iter2->move(iter2, 1, UITER_ZERO); 161 if(pos2!=1) { 162 log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n2, middle, pos2); 163 return; 164 } 165 166 /* iterate backward from the end */ 167 pos1=iter1->move(iter1, 0, UITER_LIMIT); 168 if(pos1<0) { 169 log_err("%s->move(limit) failed\n", n1); 170 return; 171 } 172 if(!iter1->hasPrevious(iter1)) { 173 log_err("%s->hasPrevious() at the end returns FALSE\n", n1); 174 return; 175 } 176 177 pos2=iter2->move(iter2, 0, UITER_LIMIT); 178 if(pos2<0) { 179 log_err("%s->move(limit) failed\n", n2); 180 return; 181 } 182 if(!iter2->hasPrevious(iter2)) { 183 log_err("%s->hasPrevious() at the end returns FALSE\n", n2); 184 return; 185 } 186 187 do { 188 c1=iter1->previous(iter1); 189 c2=iter2->previous(iter2); 190 if(c1!=c2) { 191 log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT)); 192 return; 193 } 194 } while(c1>=0); 195 196 if(iter1->hasPrevious(iter1)) { 197 log_err("%s->hasPrevious() at the start returns TRUE\n", n1); 198 return; 199 } 200 if(iter2->hasPrevious(iter2)) { 201 log_err("%s->hasPrevious() at the start returns TRUE\n", n2); 202 return; 203 } 204 } 205 206 /* 207 * Test the iterator's getState() and setState() functions. 208 * iter1 and iter2 must be set up for the same iterator type and the same string 209 * but may be physically different structs (different addresses). 210 * 211 * Assume that the text is not empty and that 212 * iteration start==0 and iteration limit==length. 213 * It must be 2<=middle<=length-2. 214 */ 215 static void 216 testIteratorState(UCharIterator *iter1, UCharIterator *iter2, const char *n, int32_t middle) { 217 UChar32 u[4]; 218 219 UErrorCode errorCode; 220 UChar32 c; 221 uint32_t state; 222 int32_t i, j; 223 224 /* get four UChars from the middle of the string */ 225 iter1->move(iter1, middle-2, UITER_ZERO); 226 for(i=0; i<4; ++i) { 227 c=iter1->next(iter1); 228 if(c<0) { 229 /* the test violates the assumptions, see comment above */ 230 log_err("test error: %s[%d]=%d\n", n, middle-2+i, c); 231 return; 232 } 233 u[i]=c; 234 } 235 236 /* move to the middle and get the state */ 237 iter1->move(iter1, -2, UITER_CURRENT); 238 state=uiter_getState(iter1); 239 240 /* set the state into the second iterator and compare the results */ 241 errorCode=U_ZERO_ERROR; 242 uiter_setState(iter2, state, &errorCode); 243 if(U_FAILURE(errorCode)) { 244 log_err("%s->setState(0x%x) failed: %s\n", n, state, u_errorName(errorCode)); 245 return; 246 } 247 248 c=iter2->current(iter2); 249 if(c!=u[2]) { 250 log_err("%s->current(at %d)=U+%04x!=U+%04x\n", n, middle, c, u[2]); 251 } 252 253 c=iter2->previous(iter2); 254 if(c!=u[1]) { 255 log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-1, c, u[1]); 256 } 257 258 iter2->move(iter2, 2, UITER_CURRENT); 259 c=iter2->next(iter2); 260 if(c!=u[3]) { 261 log_err("%s->next(at %d)=U+%04x!=U+%04x\n", n, middle+1, c, u[3]); 262 } 263 264 iter2->move(iter2, -3, UITER_CURRENT); 265 c=iter2->previous(iter2); 266 if(c!=u[0]) { 267 log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-2, c, u[0]); 268 } 269 270 /* move the second iterator back to the middle */ 271 iter2->move(iter2, 1, UITER_CURRENT); 272 iter2->next(iter2); 273 274 /* check that both are in the middle */ 275 i=iter1->getIndex(iter1, UITER_CURRENT); 276 j=iter2->getIndex(iter2, UITER_CURRENT); 277 if(i!=middle) { 278 log_err("%s->getIndex(current)=%d!=%d as expected\n", n, i, middle); 279 } 280 if(i!=j) { 281 log_err("%s->getIndex(current)=%d!=%d after setState()\n", n, j, i); 282 } 283 284 /* compare lengths */ 285 i=iter1->getIndex(iter1, UITER_LENGTH); 286 j=iter2->getIndex(iter2, UITER_LENGTH); 287 if(i!=j) { 288 log_err("%s->getIndex(length)=%d!=%d before/after setState()\n", n, i, j); 289 } 290 } 291 292 static void 293 TestLenient8Iterator() { 294 static const UChar text[]={ 295 0x61, 0x62, 0x63, 296 /* dffd 107fd d801 dffd - in UTF-16, U+107fd=<d801 dffd> */ 297 0xdffd, 0xd801, 0xdffd, 0xd801, 0xdffd, 298 0x78, 0x79, 0x7a, 0 299 }; 300 static const uint8_t bytes[]={ 301 0x61, 0x62, 0x63, 302 /* dffd 107fd d801 dffd - mixture */ 303 0xed, 0xbf, 0xbd, 0xf0, 0x90, 0x9f, 0xbd, 0xed, 0xa0, 0x81, 0xed, 0xbf, 0xbd, 304 0x78, 0x79, 0x7a, 0 305 }; 306 307 UCharIterator iter1, iter2; 308 UChar32 c1, c2; 309 int32_t length; 310 311 puts("test a UCharIterator for lenient 8-bit Unicode (accept single surrogates)"); 312 313 /* compare the same string between UTF-16 and lenient-8 UCharIterators */ 314 uiter_setString(&iter1, text, -1); 315 uiter_setLenient8(&iter2, (const char *)bytes, sizeof(bytes)-1); 316 compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator"); 317 318 /* try again with length=-1 */ 319 uiter_setLenient8(&iter2, (const char *)bytes, -1); 320 compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator_1"); 321 322 /* test get/set state */ 323 length=LENGTHOF(text)-1; 324 uiter_setLenient8(&iter1, (const char*)bytes, -1); 325 testIteratorState(&iter1, &iter2, "Lenient8IteratorState", length/2); 326 testIteratorState(&iter1, &iter2, "Lenient8IteratorStatePlus1", length/2+1); 327 328 /* ---------------------------------------------------------------------- */ 329 330 puts("no output so far means that the lenient-8 iterator works fine"); 331 332 puts("iterate forward:\nUTF-16\tlenient-8"); 333 uiter_setString(&iter1, text, -1); 334 iter1.move(&iter1, 0, UITER_START); 335 iter2.move(&iter2, 0, UITER_START); 336 for(;;) { 337 c1=iter1.next(&iter1); 338 c2=iter2.next(&iter2); 339 if(c1<0 && c2<0) { 340 break; 341 } 342 if(c1<0) { 343 printf("\t%04x\n", c2); 344 } else if(c2<0) { 345 printf("%04x\n", c1); 346 } else { 347 printf("%04x\t%04x\n", c1, c2); 348 } 349 } 350 } 351 352 extern int 353 main(int argc, const char *argv[]) { 354 TestLenient8Iterator(); 355 return 0; 356 } 357