1 /* 2 ********************************************************************** 3 * Copyright (C) 2002-2009, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: ucnv_u7.c 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2002jul01 12 * created by: Markus W. Scherer 13 * 14 * UTF-7 converter implementation. Used to be in ucnv_utf.c. 15 */ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_CONVERSION 20 21 #include "unicode/ucnv.h" 22 #include "ucnv_bld.h" 23 #include "ucnv_cnv.h" 24 25 /* UTF-7 -------------------------------------------------------------------- */ 26 27 /* 28 * UTF-7 is a stateful encoding of Unicode. 29 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt) 30 * It was intended for use in Internet email systems, using in its bytewise 31 * encoding only a subset of 7-bit US-ASCII. 32 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still 33 * occasionally used. 34 * 35 * For converting Unicode to UTF-7, the RFC allows to encode some US-ASCII 36 * characters directly or in base64. Especially, the characters in set O 37 * as defined in the RFC (see below) may be encoded directly but are not 38 * allowed in, e.g., email headers. 39 * By default, the ICU UTF-7 converter encodes set O directly. 40 * By choosing the option "version=1", set O will be escaped instead. 41 * For example: 42 * utf7Converter=ucnv_open("UTF-7,version=1"); 43 * 44 * For details about email headers see RFC 2047. 45 */ 46 47 /* 48 * Tests for US-ASCII characters belonging to character classes 49 * defined in UTF-7. 50 * 51 * Set D (directly encoded characters) consists of the following 52 * characters: the upper and lower case letters A through Z 53 * and a through z, the 10 digits 0-9, and the following nine special 54 * characters (note that "+" and "=" are omitted): 55 * '(),-./:? 
56 * 57 * Set O (optional direct characters) consists of the following 58 * characters (note that "\" and "~" are omitted): 59 * !"#$%&*;<=>@[]^_`{|} 60 * 61 * According to the rules in RFC 2152, the byte values for the following 62 * US-ASCII characters are not used in UTF-7 and are therefore illegal: 63 * - all C0 control codes except for CR LF TAB 64 * - BACKSLASH 65 * - TILDE 66 * - DEL 67 * - all codes beyond US-ASCII, i.e. all >127 68 */ 69 #define inSetD(c) \ 70 ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \ 71 (uint8_t)((c)-48)<10 || /* digits */ \ 72 (uint8_t)((c)-39)<3 || /* '() */ \ 73 (uint8_t)((c)-44)<4 || /* ,-./ */ \ 74 (c)==58 || (c)==63 /* :? */ \ 75 ) 76 77 #define inSetO(c) \ 78 ((uint8_t)((c)-33)<6 || /* !"#$%& */ \ 79 (uint8_t)((c)-59)<4 || /* ;<=> */ \ 80 (uint8_t)((c)-93)<4 || /* ]^_` */ \ 81 (uint8_t)((c)-123)<3 || /* {|} */ \ 82 (c)==42 || (c)==64 || (c)==91 /* *@[ */ \ 83 ) 84 85 #define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9) 86 #define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9) 87 88 #define PLUS 43 89 #define MINUS 45 90 #define BACKSLASH 92 91 #define TILDE 126 92 93 /* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */ 94 #define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c)) 95 96 /* encode directly sets D and O and CR LF SP TAB */ 97 static const UBool encodeDirectlyMaximum[128]={ 98 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 99 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 101 102 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 103 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 104 105 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 107 108 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 110 }; 111 112 /* encode directly set D and CR LF SP TAB but not set O */ 113 static 
const UBool encodeDirectlyRestricted[128]={ 114 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 115 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 117 118 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 119 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 120 121 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 122 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 123 124 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 125 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 126 }; 127 128 static const uint8_t 129 toBase64[64]={ 130 /* A-Z */ 131 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 132 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 133 /* a-z */ 134 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 135 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 136 /* 0-9 */ 137 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 138 /* +/ */ 139 43, 47 140 }; 141 142 static const int8_t 143 fromBase64[128]={ 144 /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */ 145 -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3, 146 -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, 147 148 /* general punctuation with + and / and a special value (-2) for - */ 149 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63, 150 /* digits */ 151 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, 152 153 /* A-Z */ 154 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 155 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1, 156 157 /* a-z */ 158 -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 159 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3 160 }; 161 162 /* 163 * converter status values: 164 * 165 * toUnicodeStatus: 166 * 24 inDirectMode (boolean) 167 * 23..16 base64Counter (-1..7) 168 * 15..0 bits (up to 14 bits incoming base64) 169 * 170 * fromUnicodeStatus: 171 * 31..28 version (0: set O direct 1: set O 
escaped) 172 * 24 inDirectMode (boolean) 173 * 23..16 base64Counter (0..2) 174 * 7..0 bits (6 bits outgoing base64) 175 * 176 */ 177 178 static void 179 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) { 180 if(choice<=UCNV_RESET_TO_UNICODE) { 181 /* reset toUnicode */ 182 cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */ 183 cnv->toULength=0; 184 } 185 if(choice!=UCNV_RESET_TO_UNICODE) { 186 /* reset fromUnicode */ 187 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */ 188 } 189 } 190 191 static void 192 _UTF7Open(UConverter *cnv, 193 UConverterLoadArgs *pArgs, 194 UErrorCode *pErrorCode) { 195 if(UCNV_GET_VERSION(cnv)<=1) { 196 /* TODO(markus): Should just use cnv->options rather than copying the version number. */ 197 cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28; 198 _UTF7Reset(cnv, UCNV_RESET_BOTH); 199 } else { 200 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 201 } 202 } 203 204 static void 205 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 206 UErrorCode *pErrorCode) { 207 UConverter *cnv; 208 const uint8_t *source, *sourceLimit; 209 UChar *target; 210 const UChar *targetLimit; 211 int32_t *offsets; 212 213 uint8_t *bytes; 214 uint8_t byteIndex; 215 216 int32_t length, targetCapacity; 217 218 /* UTF-7 state */ 219 uint16_t bits; 220 int8_t base64Counter; 221 UBool inDirectMode; 222 223 int8_t base64Value; 224 225 int32_t sourceIndex, nextSourceIndex; 226 227 uint8_t b; 228 /* set up the local pointers */ 229 cnv=pArgs->converter; 230 231 source=(const uint8_t *)pArgs->source; 232 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 233 target=pArgs->target; 234 targetLimit=pArgs->targetLimit; 235 offsets=pArgs->offsets; 236 /* get the state machine state */ 237 { 238 uint32_t status=cnv->toUnicodeStatus; 239 inDirectMode=(UBool)((status>>24)&1); 240 base64Counter=(int8_t)(status>>16); 241 bits=(uint16_t)status; 242 } 243 bytes=cnv->toUBytes; 244 byteIndex=cnv->toULength; 245 246 /* 
sourceIndex=-1 if the current character began in the previous buffer */ 247 sourceIndex=byteIndex==0 ? 0 : -1; 248 nextSourceIndex=0; 249 250 if(inDirectMode) { 251 directMode: 252 /* 253 * In Direct Mode, most US-ASCII characters are encoded directly, i.e., 254 * with their US-ASCII byte values. 255 * Backslash and Tilde and most control characters are not allowed in UTF-7. 256 * A plus sign starts Unicode (or "escape") Mode. 257 * 258 * In Direct Mode, only the sourceIndex is used. 259 */ 260 byteIndex=0; 261 length=(int32_t)(sourceLimit-source); 262 targetCapacity=(int32_t)(targetLimit-target); 263 if(length>targetCapacity) { 264 length=targetCapacity; 265 } 266 while(length>0) { 267 b=*source++; 268 if(!isLegalUTF7(b)) { 269 /* illegal */ 270 bytes[0]=b; 271 byteIndex=1; 272 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 273 break; 274 } else if(b!=PLUS) { 275 /* write directly encoded character */ 276 *target++=b; 277 if(offsets!=NULL) { 278 *offsets++=sourceIndex++; 279 } 280 } else /* PLUS */ { 281 /* switch to Unicode mode */ 282 nextSourceIndex=++sourceIndex; 283 inDirectMode=FALSE; 284 byteIndex=0; 285 bits=0; 286 base64Counter=-1; 287 goto unicodeMode; 288 } 289 --length; 290 } 291 if(source<sourceLimit && target>=targetLimit) { 292 /* target is full */ 293 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 294 } 295 } else { 296 unicodeMode: 297 /* 298 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded. 299 * The base64 sequence ends with any character that is not in the base64 alphabet. 300 * A terminating minus sign is consumed. 301 * 302 * In Unicode Mode, the sourceIndex has the index to the start of the current 303 * base64 bytes, while nextSourceIndex is precisely parallel to source, 304 * keeping the index to the following byte. 305 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte. 
306 */ 307 while(source<sourceLimit) { 308 if(target<targetLimit) { 309 bytes[byteIndex++]=b=*source++; 310 ++nextSourceIndex; 311 if(b>=126) { 312 /* illegal - test other illegal US-ASCII values by base64Value==-3 */ 313 inDirectMode=TRUE; 314 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 315 break; 316 } else if((base64Value=fromBase64[b])>=0) { 317 /* collect base64 bytes into UChars */ 318 switch(base64Counter) { 319 case -1: /* -1 is immediately after the + */ 320 case 0: 321 bits=base64Value; 322 base64Counter=1; 323 break; 324 case 1: 325 case 3: 326 case 4: 327 case 6: 328 bits=(uint16_t)((bits<<6)|base64Value); 329 ++base64Counter; 330 break; 331 case 2: 332 *target++=(UChar)((bits<<4)|(base64Value>>2)); 333 if(offsets!=NULL) { 334 *offsets++=sourceIndex; 335 sourceIndex=nextSourceIndex-1; 336 } 337 bytes[0]=b; /* keep this byte in case an error occurs */ 338 byteIndex=1; 339 bits=(uint16_t)(base64Value&3); 340 base64Counter=3; 341 break; 342 case 5: 343 *target++=(UChar)((bits<<2)|(base64Value>>4)); 344 if(offsets!=NULL) { 345 *offsets++=sourceIndex; 346 sourceIndex=nextSourceIndex-1; 347 } 348 bytes[0]=b; /* keep this byte in case an error occurs */ 349 byteIndex=1; 350 bits=(uint16_t)(base64Value&15); 351 base64Counter=6; 352 break; 353 case 7: 354 *target++=(UChar)((bits<<6)|base64Value); 355 if(offsets!=NULL) { 356 *offsets++=sourceIndex; 357 sourceIndex=nextSourceIndex; 358 } 359 byteIndex=0; 360 bits=0; 361 base64Counter=0; 362 break; 363 default: 364 /* will never occur */ 365 break; 366 } 367 } else if(base64Value==-2) { 368 /* minus sign terminates the base64 sequence */ 369 inDirectMode=TRUE; 370 if(base64Counter==-1) { 371 /* +- i.e. 
a minus immediately following a plus */ 372 *target++=PLUS; 373 if(offsets!=NULL) { 374 *offsets++=sourceIndex-1; 375 } 376 } else { 377 /* absorb the minus and leave the Unicode Mode */ 378 if(bits!=0) { 379 /* bits are illegally left over, a UChar is incomplete */ 380 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 381 break; 382 } 383 } 384 sourceIndex=nextSourceIndex; 385 goto directMode; 386 } else if(base64Value==-1) /* for any legal character except base64 and minus sign */ { 387 /* leave the Unicode Mode */ 388 inDirectMode=TRUE; 389 if(base64Counter==-1) { 390 /* illegal: + immediately followed by something other than base64 or minus sign */ 391 /* include the plus sign in the reported sequence */ 392 --sourceIndex; 393 bytes[0]=PLUS; 394 bytes[1]=b; 395 byteIndex=2; 396 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 397 break; 398 } else if(bits==0) { 399 /* un-read the character in case it is a plus sign */ 400 --source; 401 sourceIndex=nextSourceIndex-1; 402 goto directMode; 403 } else { 404 /* bits are illegally left over, a UChar is incomplete */ 405 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 406 break; 407 } 408 } else /* base64Value==-3 for illegal characters */ { 409 /* illegal */ 410 inDirectMode=TRUE; 411 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 412 break; 413 } 414 } else { 415 /* target is full */ 416 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 417 break; 418 } 419 } 420 } 421 422 if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) { 423 /* 424 * if we are in Unicode mode, then the byteIndex might not be 0, 425 * but that is ok if bits==0 426 * -> we set byteIndex=0 at the end of the stream to avoid a truncated error 427 * (not true for IMAP-mailbox-name where we must end in direct mode) 428 */ 429 byteIndex=0; 430 } 431 432 /* set the converter state back into UConverter */ 433 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits; 434 cnv->toULength=byteIndex; 435 436 /* write back the updated pointers */ 437 
pArgs->source=(const char *)source; 438 pArgs->target=target; 439 pArgs->offsets=offsets; 440 return; 441 } 442 443 static void 444 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 445 UErrorCode *pErrorCode) { 446 UConverter *cnv; 447 const UChar *source, *sourceLimit; 448 uint8_t *target, *targetLimit; 449 int32_t *offsets; 450 451 int32_t length, targetCapacity, sourceIndex; 452 UChar c; 453 454 /* UTF-7 state */ 455 const UBool *encodeDirectly; 456 uint8_t bits; 457 int8_t base64Counter; 458 UBool inDirectMode; 459 460 /* set up the local pointers */ 461 cnv=pArgs->converter; 462 463 /* set up the local pointers */ 464 source=pArgs->source; 465 sourceLimit=pArgs->sourceLimit; 466 target=(uint8_t *)pArgs->target; 467 targetLimit=(uint8_t *)pArgs->targetLimit; 468 offsets=pArgs->offsets; 469 470 /* get the state machine state */ 471 { 472 uint32_t status=cnv->fromUnicodeStatus; 473 encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted; 474 inDirectMode=(UBool)((status>>24)&1); 475 base64Counter=(int8_t)(status>>16); 476 bits=(uint8_t)status; 477 } 478 479 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */ 480 sourceIndex=0; 481 482 if(inDirectMode) { 483 directMode: 484 length=(int32_t)(sourceLimit-source); 485 targetCapacity=(int32_t)(targetLimit-target); 486 if(length>targetCapacity) { 487 length=targetCapacity; 488 } 489 while(length>0) { 490 c=*source++; 491 /* currently always encode CR LF SP TAB directly */ 492 if(c<=127 && encodeDirectly[c]) { 493 /* encode directly */ 494 *target++=(uint8_t)c; 495 if(offsets!=NULL) { 496 *offsets++=sourceIndex++; 497 } 498 } else if(c==PLUS) { 499 /* output +- for + */ 500 *target++=PLUS; 501 if(target<targetLimit) { 502 *target++=MINUS; 503 if(offsets!=NULL) { 504 *offsets++=sourceIndex; 505 *offsets++=sourceIndex++; 506 } 507 /* realign length and targetCapacity */ 508 goto directMode; 509 } else { 510 if(offsets!=NULL) { 511 
*offsets++=sourceIndex++; 512 } 513 cnv->charErrorBuffer[0]=MINUS; 514 cnv->charErrorBufferLength=1; 515 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 516 break; 517 } 518 } else { 519 /* un-read this character and switch to Unicode Mode */ 520 --source; 521 *target++=PLUS; 522 if(offsets!=NULL) { 523 *offsets++=sourceIndex; 524 } 525 inDirectMode=FALSE; 526 base64Counter=0; 527 goto unicodeMode; 528 } 529 --length; 530 } 531 if(source<sourceLimit && target>=targetLimit) { 532 /* target is full */ 533 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 534 } 535 } else { 536 unicodeMode: 537 while(source<sourceLimit) { 538 if(target<targetLimit) { 539 c=*source++; 540 if(c<=127 && encodeDirectly[c]) { 541 /* encode directly */ 542 inDirectMode=TRUE; 543 544 /* trick: back out this character to make this easier */ 545 --source; 546 547 /* terminate the base64 sequence */ 548 if(base64Counter!=0) { 549 /* write remaining bits for the previous character */ 550 *target++=toBase64[bits]; 551 if(offsets!=NULL) { 552 *offsets++=sourceIndex-1; 553 } 554 } 555 if(fromBase64[c]!=-1) { 556 /* need to terminate with a minus */ 557 if(target<targetLimit) { 558 *target++=MINUS; 559 if(offsets!=NULL) { 560 *offsets++=sourceIndex-1; 561 } 562 } else { 563 cnv->charErrorBuffer[0]=MINUS; 564 cnv->charErrorBufferLength=1; 565 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 566 break; 567 } 568 } 569 goto directMode; 570 } else { 571 /* 572 * base64 this character: 573 * Output 2 or 3 base64 bytes for the remaining bits of the previous character 574 * and the bits of this character, each implicitly in UTF-16BE. 575 * 576 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one 577 * character to the next. The actual 2 or 4 bits are shifted to the left edge 578 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier. 
579 */ 580 switch(base64Counter) { 581 case 0: 582 *target++=toBase64[c>>10]; 583 if(target<targetLimit) { 584 *target++=toBase64[(c>>4)&0x3f]; 585 if(offsets!=NULL) { 586 *offsets++=sourceIndex; 587 *offsets++=sourceIndex++; 588 } 589 } else { 590 if(offsets!=NULL) { 591 *offsets++=sourceIndex++; 592 } 593 cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f]; 594 cnv->charErrorBufferLength=1; 595 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 596 } 597 bits=(uint8_t)((c&15)<<2); 598 base64Counter=1; 599 break; 600 case 1: 601 *target++=toBase64[bits|(c>>14)]; 602 if(target<targetLimit) { 603 *target++=toBase64[(c>>8)&0x3f]; 604 if(target<targetLimit) { 605 *target++=toBase64[(c>>2)&0x3f]; 606 if(offsets!=NULL) { 607 *offsets++=sourceIndex; 608 *offsets++=sourceIndex; 609 *offsets++=sourceIndex++; 610 } 611 } else { 612 if(offsets!=NULL) { 613 *offsets++=sourceIndex; 614 *offsets++=sourceIndex++; 615 } 616 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f]; 617 cnv->charErrorBufferLength=1; 618 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 619 } 620 } else { 621 if(offsets!=NULL) { 622 *offsets++=sourceIndex++; 623 } 624 cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f]; 625 cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f]; 626 cnv->charErrorBufferLength=2; 627 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 628 } 629 bits=(uint8_t)((c&3)<<4); 630 base64Counter=2; 631 break; 632 case 2: 633 *target++=toBase64[bits|(c>>12)]; 634 if(target<targetLimit) { 635 *target++=toBase64[(c>>6)&0x3f]; 636 if(target<targetLimit) { 637 *target++=toBase64[c&0x3f]; 638 if(offsets!=NULL) { 639 *offsets++=sourceIndex; 640 *offsets++=sourceIndex; 641 *offsets++=sourceIndex++; 642 } 643 } else { 644 if(offsets!=NULL) { 645 *offsets++=sourceIndex; 646 *offsets++=sourceIndex++; 647 } 648 cnv->charErrorBuffer[0]=toBase64[c&0x3f]; 649 cnv->charErrorBufferLength=1; 650 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 651 } 652 } else { 653 if(offsets!=NULL) { 654 *offsets++=sourceIndex++; 655 } 656 cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f]; 657 
cnv->charErrorBuffer[1]=toBase64[c&0x3f]; 658 cnv->charErrorBufferLength=2; 659 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 660 } 661 bits=0; 662 base64Counter=0; 663 break; 664 default: 665 /* will never occur */ 666 break; 667 } 668 } 669 } else { 670 /* target is full */ 671 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 672 break; 673 } 674 } 675 } 676 677 if(pArgs->flush && source>=sourceLimit) { 678 /* flush remaining bits to the target */ 679 if(!inDirectMode && base64Counter!=0) { 680 if(target<targetLimit) { 681 *target++=toBase64[bits]; 682 if(offsets!=NULL) { 683 *offsets++=sourceIndex-1; 684 } 685 } else { 686 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits]; 687 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 688 } 689 } 690 /* reset the state for the next conversion */ 691 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */ 692 } else { 693 /* set the converter state back into UConverter */ 694 cnv->fromUnicodeStatus= 695 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/ 696 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits; 697 } 698 699 /* write back the updated pointers */ 700 pArgs->source=source; 701 pArgs->target=(char *)target; 702 pArgs->offsets=offsets; 703 return; 704 } 705 706 static const char * 707 _UTF7GetName(const UConverter *cnv) { 708 switch(cnv->fromUnicodeStatus>>28) { 709 case 1: 710 return "UTF-7,version=1"; 711 default: 712 return "UTF-7"; 713 } 714 } 715 716 static const UConverterImpl _UTF7Impl={ 717 UCNV_UTF7, 718 719 NULL, 720 NULL, 721 722 _UTF7Open, 723 NULL, 724 _UTF7Reset, 725 726 _UTF7ToUnicodeWithOffsets, 727 _UTF7ToUnicodeWithOffsets, 728 _UTF7FromUnicodeWithOffsets, 729 _UTF7FromUnicodeWithOffsets, 730 NULL, 731 732 NULL, 733 _UTF7GetName, 734 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */ 735 NULL, 736 ucnv_getCompleteUnicodeSet 737 }; 738 739 static const UConverterStaticData _UTF7StaticData={ 740 
sizeof(UConverterStaticData), 741 "UTF-7", 742 0, /* TODO CCSID for UTF-7 */ 743 UCNV_IBM, UCNV_UTF7, 744 1, 4, 745 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */ 746 FALSE, FALSE, 747 0, 748 0, 749 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 750 }; 751 752 const UConverterSharedData _UTF7Data={ 753 sizeof(UConverterSharedData), ~((uint32_t)0), 754 NULL, NULL, &_UTF7StaticData, FALSE, &_UTF7Impl, 755 0 756 }; 757 758 /* IMAP mailbox name encoding ----------------------------------------------- */ 759 760 /* 761 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1 762 * http://www.ietf.org/rfc/rfc2060.txt 763 * 764 * 5.1.3. Mailbox International Naming Convention 765 * 766 * By convention, international mailbox names are specified using a 767 * modified version of the UTF-7 encoding described in [UTF-7]. The 768 * purpose of these modifications is to correct the following problems 769 * with UTF-7: 770 * 771 * 1) UTF-7 uses the "+" character for shifting; this conflicts with 772 * the common use of "+" in mailbox names, in particular USENET 773 * newsgroup names. 774 * 775 * 2) UTF-7's encoding is BASE64 which uses the "/" character; this 776 * conflicts with the use of "/" as a popular hierarchy delimiter. 777 * 778 * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with 779 * the use of "\" as a popular hierarchy delimiter. 780 * 781 * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with 782 * the use of "~" in some servers as a home directory indicator. 783 * 784 * 5) UTF-7 permits multiple alternate forms to represent the same 785 * string; in particular, printable US-ASCII chararacters can be 786 * represented in encoded form. 787 * 788 * In modified UTF-7, printable US-ASCII characters except for "&" 789 * represent themselves; that is, characters with octet values 0x20-0x25 790 * and 0x27-0x7e. The character "&" (0x26) is represented by the two- 791 * octet sequence "&-". 
 *
 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
 * Unicode 16-bit octets) are represented in modified BASE64, with a
 * further modification from [UTF-7] that "," is used instead of "/".
 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
 * character which can represent itself.
 *
 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
 * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
 * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
 * ").
 *
 * For example, here is a mailbox name which mixes English, Japanese,
 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
 */

/*
 * For reference, the character classes of plain UTF-7 (RFC 2152).
 * The IMAP variant does not use them; its own rules are expressed by
 * isLegalIMAP() and inSetDIMAP() below.
 *
 * Tests for US-ASCII characters belonging to character classes
 * defined in UTF-7.
 *
 * Set D (directly encoded characters) consists of the following
 * characters: the upper and lower case letters A through Z
 * and a through z, the 10 digits 0-9, and the following nine special
 * characters (note that "+" and "=" are omitted):
 *     '(),-./:?
 *
 * Set O (optional direct characters) consists of the following
 * characters (note that "\" and "~" are omitted):
 *     !"#$%&*;<=>@[]^_`{|}
 *
 * According to the rules in RFC 2152, the byte values for the following
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 * - all C0 control codes except for CR LF TAB
 * - BACKSLASH
 * - TILDE
 * - DEL
 * - all codes beyond US-ASCII, i.e.
all >127 829 */ 830 831 /* uses '&' not '+' to start a base64 sequence */ 832 #define AMPERSAND 0x26 833 #define COMMA 0x2c 834 #define SLASH 0x2f 835 836 /* legal byte values: all US-ASCII graphic characters 0x20..0x7e */ 837 #define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e) 838 839 /* direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26 */ 840 #define inSetDIMAP(c) (isLegalIMAP(c) && c!=AMPERSAND) 841 842 #define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA) 843 #define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c]) 844 845 /* 846 * converter status values: 847 * 848 * toUnicodeStatus: 849 * 24 inDirectMode (boolean) 850 * 23..16 base64Counter (-1..7) 851 * 15..0 bits (up to 14 bits incoming base64) 852 * 853 * fromUnicodeStatus: 854 * 24 inDirectMode (boolean) 855 * 23..16 base64Counter (0..2) 856 * 7..0 bits (6 bits outgoing base64) 857 * 858 * ignore bits 31..25 859 */ 860 861 static void 862 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 863 UErrorCode *pErrorCode) { 864 UConverter *cnv; 865 const uint8_t *source, *sourceLimit; 866 UChar *target; 867 const UChar *targetLimit; 868 int32_t *offsets; 869 870 uint8_t *bytes; 871 uint8_t byteIndex; 872 873 int32_t length, targetCapacity; 874 875 /* UTF-7 state */ 876 uint16_t bits; 877 int8_t base64Counter; 878 UBool inDirectMode; 879 880 int8_t base64Value; 881 882 int32_t sourceIndex, nextSourceIndex; 883 884 UChar c; 885 uint8_t b; 886 887 /* set up the local pointers */ 888 cnv=pArgs->converter; 889 890 source=(const uint8_t *)pArgs->source; 891 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 892 target=pArgs->target; 893 targetLimit=pArgs->targetLimit; 894 offsets=pArgs->offsets; 895 /* get the state machine state */ 896 { 897 uint32_t status=cnv->toUnicodeStatus; 898 inDirectMode=(UBool)((status>>24)&1); 899 base64Counter=(int8_t)(status>>16); 900 bits=(uint16_t)status; 901 } 902 bytes=cnv->toUBytes; 903 byteIndex=cnv->toULength; 904 905 /* sourceIndex=-1 if 
the current character began in the previous buffer */ 906 sourceIndex=byteIndex==0 ? 0 : -1; 907 nextSourceIndex=0; 908 909 if(inDirectMode) { 910 directMode: 911 /* 912 * In Direct Mode, US-ASCII characters are encoded directly, i.e., 913 * with their US-ASCII byte values. 914 * An ampersand starts Unicode (or "escape") Mode. 915 * 916 * In Direct Mode, only the sourceIndex is used. 917 */ 918 byteIndex=0; 919 length=(int32_t)(sourceLimit-source); 920 targetCapacity=(int32_t)(targetLimit-target); 921 if(length>targetCapacity) { 922 length=targetCapacity; 923 } 924 while(length>0) { 925 b=*source++; 926 if(!isLegalIMAP(b)) { 927 /* illegal */ 928 bytes[0]=b; 929 byteIndex=1; 930 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 931 break; 932 } else if(b!=AMPERSAND) { 933 /* write directly encoded character */ 934 *target++=b; 935 if(offsets!=NULL) { 936 *offsets++=sourceIndex++; 937 } 938 } else /* AMPERSAND */ { 939 /* switch to Unicode mode */ 940 nextSourceIndex=++sourceIndex; 941 inDirectMode=FALSE; 942 byteIndex=0; 943 bits=0; 944 base64Counter=-1; 945 goto unicodeMode; 946 } 947 --length; 948 } 949 if(source<sourceLimit && target>=targetLimit) { 950 /* target is full */ 951 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 952 } 953 } else { 954 unicodeMode: 955 /* 956 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded. 957 * The base64 sequence ends with any character that is not in the base64 alphabet. 958 * A terminating minus sign is consumed. 959 * US-ASCII must not be base64-ed. 960 * 961 * In Unicode Mode, the sourceIndex has the index to the start of the current 962 * base64 bytes, while nextSourceIndex is precisely parallel to source, 963 * keeping the index to the following byte. 964 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte. 
965 */ 966 while(source<sourceLimit) { 967 if(target<targetLimit) { 968 bytes[byteIndex++]=b=*source++; 969 ++nextSourceIndex; 970 if(b>0x7e) { 971 /* illegal - test other illegal US-ASCII values by base64Value==-3 */ 972 inDirectMode=TRUE; 973 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 974 break; 975 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) { 976 /* collect base64 bytes into UChars */ 977 switch(base64Counter) { 978 case -1: /* -1 is immediately after the & */ 979 case 0: 980 bits=base64Value; 981 base64Counter=1; 982 break; 983 case 1: 984 case 3: 985 case 4: 986 case 6: 987 bits=(uint16_t)((bits<<6)|base64Value); 988 ++base64Counter; 989 break; 990 case 2: 991 c=(UChar)((bits<<4)|(base64Value>>2)); 992 if(isLegalIMAP(c)) { 993 /* illegal */ 994 inDirectMode=TRUE; 995 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 996 goto endloop; 997 } 998 *target++=c; 999 if(offsets!=NULL) { 1000 *offsets++=sourceIndex; 1001 sourceIndex=nextSourceIndex-1; 1002 } 1003 bytes[0]=b; /* keep this byte in case an error occurs */ 1004 byteIndex=1; 1005 bits=(uint16_t)(base64Value&3); 1006 base64Counter=3; 1007 break; 1008 case 5: 1009 c=(UChar)((bits<<2)|(base64Value>>4)); 1010 if(isLegalIMAP(c)) { 1011 /* illegal */ 1012 inDirectMode=TRUE; 1013 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1014 goto endloop; 1015 } 1016 *target++=c; 1017 if(offsets!=NULL) { 1018 *offsets++=sourceIndex; 1019 sourceIndex=nextSourceIndex-1; 1020 } 1021 bytes[0]=b; /* keep this byte in case an error occurs */ 1022 byteIndex=1; 1023 bits=(uint16_t)(base64Value&15); 1024 base64Counter=6; 1025 break; 1026 case 7: 1027 c=(UChar)((bits<<6)|base64Value); 1028 if(isLegalIMAP(c)) { 1029 /* illegal */ 1030 inDirectMode=TRUE; 1031 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1032 goto endloop; 1033 } 1034 *target++=c; 1035 if(offsets!=NULL) { 1036 *offsets++=sourceIndex; 1037 sourceIndex=nextSourceIndex; 1038 } 1039 byteIndex=0; 1040 bits=0; 1041 base64Counter=0; 1042 break; 1043 default: 1044 /* will never occur */ 1045 break; 1046 } 1047 } else 
if(base64Value==-2) { 1048 /* minus sign terminates the base64 sequence */ 1049 inDirectMode=TRUE; 1050 if(base64Counter==-1) { 1051 /* &- i.e. a minus immediately following an ampersand */ 1052 *target++=AMPERSAND; 1053 if(offsets!=NULL) { 1054 *offsets++=sourceIndex-1; 1055 } 1056 } else { 1057 /* absorb the minus and leave the Unicode Mode */ 1058 if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) { 1059 /* bits are illegally left over, a UChar is incomplete */ 1060 /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */ 1061 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1062 break; 1063 } 1064 } 1065 sourceIndex=nextSourceIndex; 1066 goto directMode; 1067 } else { 1068 if(base64Counter==-1) { 1069 /* illegal: & immediately followed by something other than base64 or minus sign */ 1070 /* include the ampersand in the reported sequence */ 1071 --sourceIndex; 1072 bytes[0]=AMPERSAND; 1073 bytes[1]=b; 1074 byteIndex=2; 1075 } 1076 /* base64Value==-1 for characters that are illegal only in Unicode mode */ 1077 /* base64Value==-3 for illegal characters */ 1078 /* illegal */ 1079 inDirectMode=TRUE; 1080 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1081 break; 1082 } 1083 } else { 1084 /* target is full */ 1085 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1086 break; 1087 } 1088 } 1089 } 1090 endloop: 1091 1092 /* 1093 * the end of the input stream and detection of truncated input 1094 * are handled by the framework, but here we must check if we are in Unicode 1095 * mode and byteIndex==0 because we must end in direct mode 1096 * 1097 * conditions: 1098 * successful 1099 * in Unicode mode and byteIndex==0 1100 * end of input and no truncated input 1101 */ 1102 if( U_SUCCESS(*pErrorCode) && 1103 !inDirectMode && byteIndex==0 && 1104 pArgs->flush && source>=sourceLimit 1105 ) { 1106 if(base64Counter==-1) { 1107 /* & at the very end of the input */ 1108 /* make the ampersand the reported sequence */ 1109 bytes[0]=AMPERSAND; 1110 byteIndex=1; 1111 } 1112 
/* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */ 1113 1114 inDirectMode=TRUE; /* avoid looping */ 1115 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 1116 } 1117 1118 /* set the converter state back into UConverter */ 1119 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits; 1120 cnv->toULength=byteIndex; 1121 1122 /* write back the updated pointers */ 1123 pArgs->source=(const char *)source; 1124 pArgs->target=target; 1125 pArgs->offsets=offsets; 1126 return; 1127 } 1128 1129 static void 1130 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1131 UErrorCode *pErrorCode) { 1132 UConverter *cnv; 1133 const UChar *source, *sourceLimit; 1134 uint8_t *target, *targetLimit; 1135 int32_t *offsets; 1136 1137 int32_t length, targetCapacity, sourceIndex; 1138 UChar c; 1139 uint8_t b; 1140 1141 /* UTF-7 state */ 1142 uint8_t bits; 1143 int8_t base64Counter; 1144 UBool inDirectMode; 1145 1146 /* set up the local pointers */ 1147 cnv=pArgs->converter; 1148 1149 /* set up the local pointers */ 1150 source=pArgs->source; 1151 sourceLimit=pArgs->sourceLimit; 1152 target=(uint8_t *)pArgs->target; 1153 targetLimit=(uint8_t *)pArgs->targetLimit; 1154 offsets=pArgs->offsets; 1155 1156 /* get the state machine state */ 1157 { 1158 uint32_t status=cnv->fromUnicodeStatus; 1159 inDirectMode=(UBool)((status>>24)&1); 1160 base64Counter=(int8_t)(status>>16); 1161 bits=(uint8_t)status; 1162 } 1163 1164 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */ 1165 sourceIndex=0; 1166 1167 if(inDirectMode) { 1168 directMode: 1169 length=(int32_t)(sourceLimit-source); 1170 targetCapacity=(int32_t)(targetLimit-target); 1171 if(length>targetCapacity) { 1172 length=targetCapacity; 1173 } 1174 while(length>0) { 1175 c=*source++; 1176 /* encode 0x20..0x7e except '&' directly */ 1177 if(inSetDIMAP(c)) { 1178 /* encode directly */ 1179 *target++=(uint8_t)c; 1180 
if(offsets!=NULL) { 1181 *offsets++=sourceIndex++; 1182 } 1183 } else if(c==AMPERSAND) { 1184 /* output &- for & */ 1185 *target++=AMPERSAND; 1186 if(target<targetLimit) { 1187 *target++=MINUS; 1188 if(offsets!=NULL) { 1189 *offsets++=sourceIndex; 1190 *offsets++=sourceIndex++; 1191 } 1192 /* realign length and targetCapacity */ 1193 goto directMode; 1194 } else { 1195 if(offsets!=NULL) { 1196 *offsets++=sourceIndex++; 1197 } 1198 cnv->charErrorBuffer[0]=MINUS; 1199 cnv->charErrorBufferLength=1; 1200 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1201 break; 1202 } 1203 } else { 1204 /* un-read this character and switch to Unicode Mode */ 1205 --source; 1206 *target++=AMPERSAND; 1207 if(offsets!=NULL) { 1208 *offsets++=sourceIndex; 1209 } 1210 inDirectMode=FALSE; 1211 base64Counter=0; 1212 goto unicodeMode; 1213 } 1214 --length; 1215 } 1216 if(source<sourceLimit && target>=targetLimit) { 1217 /* target is full */ 1218 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1219 } 1220 } else { 1221 unicodeMode: 1222 while(source<sourceLimit) { 1223 if(target<targetLimit) { 1224 c=*source++; 1225 if(isLegalIMAP(c)) { 1226 /* encode directly */ 1227 inDirectMode=TRUE; 1228 1229 /* trick: back out this character to make this easier */ 1230 --source; 1231 1232 /* terminate the base64 sequence */ 1233 if(base64Counter!=0) { 1234 /* write remaining bits for the previous character */ 1235 *target++=TO_BASE64_IMAP(bits); 1236 if(offsets!=NULL) { 1237 *offsets++=sourceIndex-1; 1238 } 1239 } 1240 /* need to terminate with a minus */ 1241 if(target<targetLimit) { 1242 *target++=MINUS; 1243 if(offsets!=NULL) { 1244 *offsets++=sourceIndex-1; 1245 } 1246 } else { 1247 cnv->charErrorBuffer[0]=MINUS; 1248 cnv->charErrorBufferLength=1; 1249 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1250 break; 1251 } 1252 goto directMode; 1253 } else { 1254 /* 1255 * base64 this character: 1256 * Output 2 or 3 base64 bytes for the remaining bits of the previous character 1257 * and the bits of this character, each implicitly in 
UTF-16BE. 1258 * 1259 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one 1260 * character to the next. The actual 2 or 4 bits are shifted to the left edge 1261 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier. 1262 */ 1263 switch(base64Counter) { 1264 case 0: 1265 b=(uint8_t)(c>>10); 1266 *target++=TO_BASE64_IMAP(b); 1267 if(target<targetLimit) { 1268 b=(uint8_t)((c>>4)&0x3f); 1269 *target++=TO_BASE64_IMAP(b); 1270 if(offsets!=NULL) { 1271 *offsets++=sourceIndex; 1272 *offsets++=sourceIndex++; 1273 } 1274 } else { 1275 if(offsets!=NULL) { 1276 *offsets++=sourceIndex++; 1277 } 1278 b=(uint8_t)((c>>4)&0x3f); 1279 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); 1280 cnv->charErrorBufferLength=1; 1281 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1282 } 1283 bits=(uint8_t)((c&15)<<2); 1284 base64Counter=1; 1285 break; 1286 case 1: 1287 b=(uint8_t)(bits|(c>>14)); 1288 *target++=TO_BASE64_IMAP(b); 1289 if(target<targetLimit) { 1290 b=(uint8_t)((c>>8)&0x3f); 1291 *target++=TO_BASE64_IMAP(b); 1292 if(target<targetLimit) { 1293 b=(uint8_t)((c>>2)&0x3f); 1294 *target++=TO_BASE64_IMAP(b); 1295 if(offsets!=NULL) { 1296 *offsets++=sourceIndex; 1297 *offsets++=sourceIndex; 1298 *offsets++=sourceIndex++; 1299 } 1300 } else { 1301 if(offsets!=NULL) { 1302 *offsets++=sourceIndex; 1303 *offsets++=sourceIndex++; 1304 } 1305 b=(uint8_t)((c>>2)&0x3f); 1306 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); 1307 cnv->charErrorBufferLength=1; 1308 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1309 } 1310 } else { 1311 if(offsets!=NULL) { 1312 *offsets++=sourceIndex++; 1313 } 1314 b=(uint8_t)((c>>8)&0x3f); 1315 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); 1316 b=(uint8_t)((c>>2)&0x3f); 1317 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b); 1318 cnv->charErrorBufferLength=2; 1319 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1320 } 1321 bits=(uint8_t)((c&3)<<4); 1322 base64Counter=2; 1323 break; 1324 case 2: 1325 b=(uint8_t)(bits|(c>>12)); 1326 *target++=TO_BASE64_IMAP(b); 1327 
if(target<targetLimit) { 1328 b=(uint8_t)((c>>6)&0x3f); 1329 *target++=TO_BASE64_IMAP(b); 1330 if(target<targetLimit) { 1331 b=(uint8_t)(c&0x3f); 1332 *target++=TO_BASE64_IMAP(b); 1333 if(offsets!=NULL) { 1334 *offsets++=sourceIndex; 1335 *offsets++=sourceIndex; 1336 *offsets++=sourceIndex++; 1337 } 1338 } else { 1339 if(offsets!=NULL) { 1340 *offsets++=sourceIndex; 1341 *offsets++=sourceIndex++; 1342 } 1343 b=(uint8_t)(c&0x3f); 1344 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); 1345 cnv->charErrorBufferLength=1; 1346 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1347 } 1348 } else { 1349 if(offsets!=NULL) { 1350 *offsets++=sourceIndex++; 1351 } 1352 b=(uint8_t)((c>>6)&0x3f); 1353 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); 1354 b=(uint8_t)(c&0x3f); 1355 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b); 1356 cnv->charErrorBufferLength=2; 1357 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1358 } 1359 bits=0; 1360 base64Counter=0; 1361 break; 1362 default: 1363 /* will never occur */ 1364 break; 1365 } 1366 } 1367 } else { 1368 /* target is full */ 1369 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1370 break; 1371 } 1372 } 1373 } 1374 1375 if(pArgs->flush && source>=sourceLimit) { 1376 /* flush remaining bits to the target */ 1377 if(!inDirectMode) { 1378 if(base64Counter!=0) { 1379 if(target<targetLimit) { 1380 *target++=TO_BASE64_IMAP(bits); 1381 if(offsets!=NULL) { 1382 *offsets++=sourceIndex-1; 1383 } 1384 } else { 1385 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits); 1386 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1387 } 1388 } 1389 /* need to terminate with a minus */ 1390 if(target<targetLimit) { 1391 *target++=MINUS; 1392 if(offsets!=NULL) { 1393 *offsets++=sourceIndex-1; 1394 } 1395 } else { 1396 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS; 1397 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1398 } 1399 } 1400 /* reset the state for the next conversion */ 1401 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */ 1402 } 
else { 1403 /* set the converter state back into UConverter */ 1404 cnv->fromUnicodeStatus= 1405 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/ 1406 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits; 1407 } 1408 1409 /* write back the updated pointers */ 1410 pArgs->source=source; 1411 pArgs->target=(char *)target; 1412 pArgs->offsets=offsets; 1413 return; 1414 } 1415 1416 static const UConverterImpl _IMAPImpl={ 1417 UCNV_IMAP_MAILBOX, 1418 1419 NULL, 1420 NULL, 1421 1422 _UTF7Open, 1423 NULL, 1424 _UTF7Reset, 1425 1426 _IMAPToUnicodeWithOffsets, 1427 _IMAPToUnicodeWithOffsets, 1428 _IMAPFromUnicodeWithOffsets, 1429 _IMAPFromUnicodeWithOffsets, 1430 NULL, 1431 1432 NULL, 1433 NULL, 1434 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */ 1435 NULL, 1436 ucnv_getCompleteUnicodeSet 1437 }; 1438 1439 static const UConverterStaticData _IMAPStaticData={ 1440 sizeof(UConverterStaticData), 1441 "IMAP-mailbox-name", 1442 0, /* TODO CCSID for IMAP-mailbox-name */ 1443 UCNV_IBM, UCNV_IMAP_MAILBOX, 1444 1, 4, 1445 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */ 1446 FALSE, FALSE, 1447 0, 1448 0, 1449 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1450 }; 1451 1452 const UConverterSharedData _IMAPData={ 1453 sizeof(UConverterSharedData), ~((uint32_t)0), 1454 NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl, 1455 0 1456 }; 1457 1458 #endif 1459