1 /* 2 * libjingle 3 * Copyright 2011, Google Inc. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, 9 * this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright notice, 11 * this list of conditions and the following disclaimer in the documentation 12 * and/or other materials provided with the distribution. 13 * 3. The name of the author may not be used to endorse or promote products 14 * derived from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED 17 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 18 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 19 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 22 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 24 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 25 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "talk/base/stringencode.h" 29 30 #include <cstdio> 31 #include <cstdlib> 32 33 #include "talk/base/basictypes.h" 34 #include "talk/base/common.h" 35 #include "talk/base/stringutils.h" 36 37 namespace talk_base { 38 39 ///////////////////////////////////////////////////////////////////////////// 40 // String Encoding Utilities 41 ///////////////////////////////////////////////////////////////////////////// 42 43 static const char HEX[] = "0123456789abcdef"; 44 45 char hex_encode(unsigned char val) { 46 ASSERT(val < 16); 47 return (val < 16) ? HEX[val] : '!'; 48 } 49 50 bool hex_decode(char ch, unsigned char* val) { 51 if ((ch >= '0') && (ch <= '9')) { 52 *val = ch - '0'; 53 } else if ((ch >= 'A') && (ch <= 'Z')) { 54 *val = (ch - 'A') + 10; 55 } else if ((ch >= 'a') && (ch <= 'z')) { 56 *val = (ch - 'a') + 10; 57 } else { 58 return false; 59 } 60 return true; 61 } 62 63 size_t escape(char * buffer, size_t buflen, 64 const char * source, size_t srclen, 65 const char * illegal, char escape) { 66 ASSERT(NULL != buffer); // TODO: estimate output size 67 if (buflen <= 0) 68 return 0; 69 70 size_t srcpos = 0, bufpos = 0; 71 while ((srcpos < srclen) && (bufpos + 1 < buflen)) { 72 char ch = source[srcpos++]; 73 if ((ch == escape) || ::strchr(illegal, ch)) { 74 if (bufpos + 2 >= buflen) 75 break; 76 buffer[bufpos++] = escape; 77 } 78 buffer[bufpos++] = ch; 79 } 80 81 buffer[bufpos] = '\0'; 82 return bufpos; 83 } 84 85 size_t unescape(char * buffer, size_t buflen, 86 const char * source, size_t srclen, 87 char escape) { 88 ASSERT(NULL != buffer); // TODO: estimate output size 89 if (buflen <= 0) 90 return 0; 91 92 size_t srcpos = 0, bufpos = 0; 93 while ((srcpos < srclen) && (bufpos + 1 < buflen)) { 94 char ch = source[srcpos++]; 95 if ((ch == escape) && (srcpos < srclen)) { 96 ch = source[srcpos++]; 97 } 98 buffer[bufpos++] = ch; 99 } 100 buffer[bufpos] = '\0'; 101 return bufpos; 102 } 103 104 size_t encode(char * buffer, size_t buflen, 105 const char * source, size_t srclen, 106 const char * illegal, char escape) { 107 ASSERT(NULL != buffer); // TODO: estimate output size 108 if (buflen <= 0) 109 return 0; 110 111 size_t srcpos = 0, bufpos = 0; 112 while ((srcpos < srclen) && (bufpos + 1 < buflen)) { 113 char ch = source[srcpos++]; 114 if ((ch != escape) && !::strchr(illegal, ch)) { 115 buffer[bufpos++] = ch; 116 } else if (bufpos + 3 >= buflen) { 117 break; 118 } else { 119 buffer[bufpos+0] = escape; 120 buffer[bufpos+1] = hex_encode((static_cast<unsigned char>(ch) >> 4) & 0xF); 121 buffer[bufpos+2] = hex_encode((static_cast<unsigned char>(ch) ) & 0xF); 122 bufpos += 3; 123 } 124 } 125 buffer[bufpos] = '\0'; 126 return bufpos; 127 } 128 129 size_t decode(char * buffer, size_t buflen, 130 const char * source, size_t srclen, 131 char escape) { 132 if (buflen <= 0) 133 return 0; 134 135 unsigned char h1, h2; 136 size_t srcpos = 0, bufpos = 0; 137 while ((srcpos < srclen) && (bufpos + 1 < buflen)) { 138 char ch = source[srcpos++]; 139 if ((ch == escape) 140 && (srcpos + 1 < srclen) 141 && hex_decode(source[srcpos], &h1) 142 && hex_decode(source[srcpos+1], &h2)) { 143 buffer[bufpos++] = (h1 << 4) | h2; 144 srcpos += 2; 145 } else { 146 buffer[bufpos++] = ch; 147 } 148 } 149 buffer[bufpos] = '\0'; 150 return bufpos; 151 } 152 153 const char* unsafe_filename_characters() { 154 // It might be better to have a single specification which is the union of 155 // all operating systems, unless one system is overly restrictive. 156 #ifdef WIN32 157 return "\\/:*?\"<>|"; 158 #else // !WIN32 159 // TODO 160 ASSERT(false); 161 return ""; 162 #endif // !WIN23 163 } 164 165 const unsigned char URL_UNSAFE = 0x1; // 0-33 "#$%&+,/:;<=>?@[\]^`{|} 127 166 const unsigned char XML_UNSAFE = 0x2; // "&'<> 167 const unsigned char HTML_UNSAFE = 0x2; // "&'<> 168 169 // ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 6 5 7 8 9 : ; < = > ? 170 //@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ 171 //` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ 172 173 const unsigned char ASCII_CLASS[128] = { 174 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 175 1,0,3,1,1,1,3,2,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,3,1,3,1, 176 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0, 177 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1, 178 }; 179 180 size_t url_encode(char * buffer, size_t buflen, 181 const char * source, size_t srclen) { 182 if (NULL == buffer) 183 return srclen * 3 + 1; 184 if (buflen <= 0) 185 return 0; 186 187 size_t srcpos = 0, bufpos = 0; 188 while ((srcpos < srclen) && (bufpos + 1 < buflen)) { 189 unsigned char ch = source[srcpos++]; 190 if ((ch < 128) && (ASCII_CLASS[ch] & URL_UNSAFE)) { 191 if (bufpos + 3 >= buflen) { 192 break; 193 } 194 buffer[bufpos+0] = '%'; 195 buffer[bufpos+1] = hex_encode((ch >> 4) & 0xF); 196 buffer[bufpos+2] = hex_encode((ch ) & 0xF); 197 bufpos += 3; 198 } else { 199 buffer[bufpos++] = ch; 200 } 201 } 202 buffer[bufpos] = '\0'; 203 return bufpos; 204 } 205 206 size_t url_decode(char * buffer, size_t buflen, 207 const char * source, size_t srclen) { 208 if (NULL == buffer) 209 return srclen + 1; 210 if (buflen <= 0) 211 return 0; 212 213 unsigned char h1, h2; 214 size_t srcpos = 0, bufpos = 0; 215 while ((srcpos < srclen) && (bufpos + 1 < buflen)) { 216 unsigned char ch = source[srcpos++]; 217 if (ch == '+') { 218 buffer[bufpos++] = ' '; 219 } else if ((ch == '%') 220 && (srcpos + 1 < srclen) 221 && hex_decode(source[srcpos], &h1) 222 && hex_decode(source[srcpos+1], &h2)) 223 { 224 buffer[bufpos++] = (h1 << 4) | h2; 225 srcpos += 2; 226 } else { 227 buffer[bufpos++] = ch; 228 } 229 } 230 buffer[bufpos] = '\0'; 231 return bufpos; 232 } 233 234 size_t utf8_decode(const char* source, size_t srclen, unsigned long* value) { 235 const unsigned char* s = reinterpret_cast<const unsigned char*>(source); 236 if ((s[0] & 0x80) == 0x00) { // Check s[0] == 0xxxxxxx 237 *value = s[0]; 238 return 1; 239 } 240 if ((srclen < 2) || ((s[1] & 0xC0) != 0x80)) { // Check s[1] != 10xxxxxx 241 return 0; 242 } 243 // Accumulate the trailer byte values in value16, and combine it with the 244 // relevant bits from s[0], once we've determined the sequence length. 245 unsigned long value16 = (s[1] & 0x3F); 246 if ((s[0] & 0xE0) == 0xC0) { // Check s[0] == 110xxxxx 247 *value = ((s[0] & 0x1F) << 6) | value16; 248 return 2; 249 } 250 if ((srclen < 3) || ((s[2] & 0xC0) != 0x80)) { // Check s[2] != 10xxxxxx 251 return 0; 252 } 253 value16 = (value16 << 6) | (s[2] & 0x3F); 254 if ((s[0] & 0xF0) == 0xE0) { // Check s[0] == 1110xxxx 255 *value = ((s[0] & 0x0F) << 12) | value16; 256 return 3; 257 } 258 if ((srclen < 4) || ((s[3] & 0xC0) != 0x80)) { // Check s[3] != 10xxxxxx 259 return 0; 260 } 261 value16 = (value16 << 6) | (s[3] & 0x3F); 262 if ((s[0] & 0xF8) == 0xF0) { // Check s[0] == 11110xxx 263 *value = ((s[0] & 0x07) << 18) | value16; 264 return 4; 265 } 266 return 0; 267 } 268 269 size_t utf8_encode(char* buffer, size_t buflen, unsigned long value) { 270 if ((value <= 0x7F) && (buflen >= 1)) { 271 buffer[0] = static_cast<unsigned char>(value); 272 return 1; 273 } 274 if ((value <= 0x7FF) && (buflen >= 2)) { 275 buffer[0] = 0xC0 | static_cast<unsigned char>(value >> 6); 276 buffer[1] = 0x80 | static_cast<unsigned char>(value & 0x3F); 277 return 2; 278 } 279 if ((value <= 0xFFFF) && (buflen >= 3)) { 280 buffer[0] = 0xE0 | static_cast<unsigned char>(value >> 12); 281 buffer[1] = 0x80 | static_cast<unsigned char>((value >> 6) & 0x3F); 282 buffer[2] = 0x80 | static_cast<unsigned char>(value & 0x3F); 283 return 3; 284 } 285 if ((value <= 0x1FFFFF) && (buflen >= 4)) { 286 buffer[0] = 0xF0 | static_cast<unsigned char>(value >> 18); 287 buffer[1] = 0x80 | static_cast<unsigned char>((value >> 12) & 0x3F); 288 buffer[2] = 0x80 | static_cast<unsigned char>((value >> 6) & 0x3F); 289 buffer[3] = 0x80 | static_cast<unsigned char>(value & 0x3F); 290 return 4; 291 } 292 return 0; 293 } 294 295 size_t html_encode(char * buffer, size_t buflen, 296 const char * source, size_t srclen) { 297 ASSERT(NULL != buffer); // TODO: estimate output size 298 if (buflen <= 0) 299 return 0; 300 301 size_t srcpos = 0, bufpos = 0; 302 while ((srcpos < srclen) && (bufpos + 1 < buflen)) { 303 unsigned char ch = source[srcpos]; 304 if (ch < 128) { 305 srcpos += 1; 306 if (ASCII_CLASS[ch] & HTML_UNSAFE) { 307 const char * escseq = 0; 308 size_t esclen = 0; 309 switch (ch) { 310 case '<': escseq = "<"; esclen = 4; break; 311 case '>': escseq = ">"; esclen = 4; break; 312 case '\'': escseq = "'"; esclen = 5; break; 313 case '\"': escseq = """; esclen = 6; break; 314 case '&': escseq = "&"; esclen = 5; break; 315 default: ASSERT(false); 316 } 317 if (bufpos + esclen >= buflen) { 318 break; 319 } 320 memcpy(buffer + bufpos, escseq, esclen); 321 bufpos += esclen; 322 } else { 323 buffer[bufpos++] = ch; 324 } 325 } else { 326 // Largest value is 0x1FFFFF => � (10 characters) 327 char escseq[11]; 328 unsigned long val; 329 if (size_t vallen = utf8_decode(&source[srcpos], srclen - srcpos, &val)) { 330 srcpos += vallen; 331 } else { 332 // Not a valid utf8 sequence, just use the raw character. 333 val = static_cast<unsigned char>(source[srcpos++]); 334 } 335 size_t esclen = sprintfn(escseq, ARRAY_SIZE(escseq), "&#%lu;", val); 336 if (bufpos + esclen >= buflen) { 337 break; 338 } 339 memcpy(buffer + bufpos, escseq, esclen); 340 bufpos += esclen; 341 } 342 } 343 buffer[bufpos] = '\0'; 344 return bufpos; 345 } 346 347 size_t html_decode(char * buffer, size_t buflen, 348 const char * source, size_t srclen) { 349 ASSERT(NULL != buffer); // TODO: estimate output size 350 return xml_decode(buffer, buflen, source, srclen); 351 } 352 353 size_t xml_encode(char * buffer, size_t buflen, 354 const char * source, size_t srclen) { 355 ASSERT(NULL != buffer); // TODO: estimate output size 356 if (buflen <= 0) 357 return 0; 358 359 size_t srcpos = 0, bufpos = 0; 360 while ((srcpos < srclen) && (bufpos + 1 < buflen)) { 361 unsigned char ch = source[srcpos++]; 362 if ((ch < 128) && (ASCII_CLASS[ch] & XML_UNSAFE)) { 363 const char * escseq = 0; 364 size_t esclen = 0; 365 switch (ch) { 366 case '<': escseq = "<"; esclen = 4; break; 367 case '>': escseq = ">"; esclen = 4; break; 368 case '\'': escseq = "'"; esclen = 6; break; 369 case '\"': escseq = """; esclen = 6; break; 370 case '&': escseq = "&"; esclen = 5; break; 371 default: ASSERT(false); 372 } 373 if (bufpos + esclen >= buflen) { 374 break; 375 } 376 memcpy(buffer + bufpos, escseq, esclen); 377 bufpos += esclen; 378 } else { 379 buffer[bufpos++] = ch; 380 } 381 } 382 buffer[bufpos] = '\0'; 383 return bufpos; 384 } 385 386 size_t xml_decode(char * buffer, size_t buflen, 387 const char * source, size_t srclen) { 388 ASSERT(NULL != buffer); // TODO: estimate output size 389 if (buflen <= 0) 390 return 0; 391 392 size_t srcpos = 0, bufpos = 0; 393 while ((srcpos < srclen) && (bufpos + 1 < buflen)) { 394 unsigned char ch = source[srcpos++]; 395 if (ch != '&') { 396 buffer[bufpos++] = ch; 397 } else if ((srcpos + 2 < srclen) 398 && (memcmp(source + srcpos, "lt;", 3) == 0)) { 399 buffer[bufpos++] = '<'; 400 srcpos += 3; 401 } else if ((srcpos + 2 < srclen) 402 && (memcmp(source + srcpos, "gt;", 3) == 0)) { 403 buffer[bufpos++] = '>'; 404 srcpos += 3; 405 } else if ((srcpos + 4 < srclen) 406 && (memcmp(source + srcpos, "apos;", 5) == 0)) { 407 buffer[bufpos++] = '\''; 408 srcpos += 5; 409 } else if ((srcpos + 4 < srclen) 410 && (memcmp(source + srcpos, "quot;", 5) == 0)) { 411 buffer[bufpos++] = '\"'; 412 srcpos += 5; 413 } else if ((srcpos + 3 < srclen) 414 && (memcmp(source + srcpos, "amp;", 4) == 0)) { 415 buffer[bufpos++] = '&'; 416 srcpos += 4; 417 } else if ((srcpos < srclen) && (source[srcpos] == '#')) { 418 int int_base = 10; 419 if ((srcpos + 1 < srclen) && (source[srcpos+1] == 'x')) { 420 int_base = 16; 421 srcpos += 1; 422 } 423 char * ptr; 424 // TODO: Fix hack (ptr may go past end of data) 425 unsigned long val = strtoul(source + srcpos + 1, &ptr, int_base); 426 if ((static_cast<size_t>(ptr - source) < srclen) && (*ptr == ';')) { 427 srcpos = ptr - source + 1; 428 } else { 429 // Not a valid escape sequence. 430 break; 431 } 432 if (size_t esclen = utf8_encode(buffer + bufpos, buflen - bufpos, val)) { 433 bufpos += esclen; 434 } else { 435 // Not enough room to encode the character, or illegal character 436 break; 437 } 438 } else { 439 // Unrecognized escape sequence. 440 break; 441 } 442 } 443 buffer[bufpos] = '\0'; 444 return bufpos; 445 } 446 447 std::string hex_encode(const char * source, size_t srclen) { 448 const size_t kBufferSize = srclen * 2 + 1; 449 char* buffer = STACK_ARRAY(char, kBufferSize); 450 size_t length = hex_encode(buffer, kBufferSize, source, srclen); 451 return std::string(buffer, length); 452 } 453 454 size_t hex_encode(char * buffer, size_t buflen, 455 const char * csource, size_t srclen) { 456 ASSERT(NULL != buffer); // TODO: estimate output size 457 if (buflen <= 0) 458 return 0; 459 460 const unsigned char * bsource = 461 reinterpret_cast<const unsigned char *>(csource); 462 463 size_t srcpos = 0, bufpos = 0; 464 srclen = _min(srclen, (buflen - 1) / 2); 465 while (srcpos < srclen) { 466 unsigned char ch = bsource[srcpos++]; 467 buffer[bufpos ] = hex_encode((ch >> 4) & 0xF); 468 buffer[bufpos+1] = hex_encode((ch ) & 0xF); 469 bufpos += 2; 470 } 471 buffer[bufpos] = '\0'; 472 return bufpos; 473 } 474 475 size_t hex_decode(char * cbuffer, size_t buflen, 476 const char * source, size_t srclen) { 477 ASSERT(NULL != cbuffer); // TODO: estimate output size 478 if (buflen <= 0) 479 return 0; 480 481 unsigned char * bbuffer = reinterpret_cast<unsigned char *>(cbuffer); 482 483 unsigned char h1, h2; 484 size_t srcpos = 0, bufpos = 0; 485 while ((srcpos + 1 < srclen) 486 && (bufpos + 1 < buflen) 487 && hex_decode(source[srcpos], &h1) 488 && hex_decode(source[srcpos+1], &h2)) 489 { 490 bbuffer[bufpos++] = (h1 << 4) | h2; 491 srcpos += 2; 492 } 493 bbuffer[bufpos] = '\0'; 494 return bufpos; 495 } 496 497 size_t transform(std::string& value, size_t maxlen, const std::string& source, 498 Transform t) { 499 char* buffer = STACK_ARRAY(char, maxlen + 1); 500 size_t length = t(buffer, maxlen + 1, source.data(), source.length()); 501 value.assign(buffer, length); 502 return length; 503 } 504 505 std::string s_transform(const std::string& source, Transform t) { 506 // Ask transformation function to approximate the destination size (returns upper bound) 507 size_t maxlen = t(NULL, 0, source.data(), source.length()); 508 char * buffer = STACK_ARRAY(char, maxlen); 509 size_t len = t(buffer, maxlen, source.data(), source.length()); 510 std::string result(buffer, len); 511 return result; 512 } 513 514 size_t tokenize(const std::string& source, char delimiter, 515 std::vector<std::string>* fields) { 516 ASSERT(NULL != fields); 517 fields->clear(); 518 size_t last = 0; 519 for (size_t i = 0; i < source.length(); ++i) { 520 if (source[i] == delimiter) { 521 if (i != last) { 522 fields->push_back(source.substr(last, i - last)); 523 } 524 last = i + 1; 525 } 526 } 527 if (last != source.length()) { 528 fields->push_back(source.substr(last, source.length() - last)); 529 } 530 return fields->size(); 531 } 532 533 size_t split(const std::string& source, char delimiter, 534 std::vector<std::string>* fields) { 535 ASSERT(NULL != fields); 536 fields->clear(); 537 size_t last = 0; 538 for (size_t i = 0; i < source.length(); ++i) { 539 if (source[i] == delimiter) { 540 fields->push_back(source.substr(last, i - last)); 541 last = i + 1; 542 } 543 } 544 fields->push_back(source.substr(last, source.length() - last)); 545 return fields->size(); 546 } 547 548 char make_char_safe_for_filename(char c) { 549 if (c < 32) 550 return '_'; 551 552 switch (c) { 553 case '<': 554 case '>': 555 case ':': 556 case '"': 557 case '/': 558 case '\\': 559 case '|': 560 case '*': 561 case '?': 562 return '_'; 563 564 default: 565 return c; 566 } 567 } 568 569 /* 570 void sprintf(std::string& value, size_t maxlen, const char * format, ...) { 571 char * buffer = STACK_ARRAY(char, maxlen + 1); 572 va_list args; 573 va_start(args, format); 574 value.assign(buffer, vsprintfn(buffer, maxlen + 1, format, args)); 575 va_end(args); 576 } 577 */ 578 579 ///////////////////////////////////////////////////////////////////////////// 580 581 } // namespace talk_base 582