1 // Protocol Buffers - Google's data interchange format 2 // Copyright 2008 Google Inc. All rights reserved. 3 // https://developers.google.com/protocol-buffers/ 4 // 5 // Redistribution and use in source and binary forms, with or without 6 // modification, are permitted provided that the following conditions are 7 // met: 8 // 9 // * Redistributions of source code must retain the above copyright 10 // notice, this list of conditions and the following disclaimer. 11 // * Redistributions in binary form must reproduce the above 12 // copyright notice, this list of conditions and the following disclaimer 13 // in the documentation and/or other materials provided with the 14 // distribution. 15 // * Neither the name of Google Inc. nor the names of its 16 // contributors may be used to endorse or promote products derived from 17 // this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 // from google3/strings/strutil.cc 32 33 #include <google/protobuf/stubs/strutil.h> 34 #include <google/protobuf/stubs/mathlimits.h> 35 36 #include <errno.h> 37 #include <float.h> // FLT_DIG and DBL_DIG 38 #include <limits> 39 #include <limits.h> 40 #include <stdio.h> 41 #include <iterator> 42 43 #include <google/protobuf/stubs/stl_util.h> 44 45 #ifdef _WIN32 46 // MSVC has only _snprintf, not snprintf. 47 // 48 // MinGW has both snprintf and _snprintf, but they appear to be different 49 // functions. The former is buggy. When invoked like so: 50 // char buffer[32]; 51 // snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f); 52 // it prints "1.23000e+10". This is plainly wrong: %g should never print 53 // trailing zeros after the decimal point. For some reason this bug only 54 // occurs with some input values, not all. In any case, _snprintf does the 55 // right thing, so we use it. 56 #define snprintf _snprintf 57 #endif 58 59 namespace google { 60 namespace protobuf { 61 62 // These are defined as macros on some platforms. #undef them so that we can 63 // redefine them. 64 #undef isxdigit 65 #undef isprint 66 67 // The definitions of these in ctype.h change based on locale. Since our 68 // string manipulation is all in relation to the protocol buffer and C++ 69 // languages, we always want to use the C locale. So, we re-define these 70 // exactly as we want them. 71 inline bool isxdigit(char c) { 72 return ('0' <= c && c <= '9') || 73 ('a' <= c && c <= 'f') || 74 ('A' <= c && c <= 'F'); 75 } 76 77 inline bool isprint(char c) { 78 return c >= 0x20 && c <= 0x7E; 79 } 80 81 // ---------------------------------------------------------------------- 82 // StripString 83 // Replaces any occurrence of the character 'remove' (or the characters 84 // in 'remove') with the character 'replacewith'. 85 // ---------------------------------------------------------------------- 86 void StripString(string* s, const char* remove, char replacewith) { 87 const char * str_start = s->c_str(); 88 const char * str = str_start; 89 for (str = strpbrk(str, remove); 90 str != NULL; 91 str = strpbrk(str + 1, remove)) { 92 (*s)[str - str_start] = replacewith; 93 } 94 } 95 96 void StripWhitespace(string* str) { 97 int str_length = str->length(); 98 99 // Strip off leading whitespace. 100 int first = 0; 101 while (first < str_length && ascii_isspace(str->at(first))) { 102 ++first; 103 } 104 // If entire string is white space. 105 if (first == str_length) { 106 str->clear(); 107 return; 108 } 109 if (first > 0) { 110 str->erase(0, first); 111 str_length -= first; 112 } 113 114 // Strip off trailing whitespace. 115 int last = str_length - 1; 116 while (last >= 0 && ascii_isspace(str->at(last))) { 117 --last; 118 } 119 if (last != (str_length - 1) && last >= 0) { 120 str->erase(last + 1, string::npos); 121 } 122 } 123 124 // ---------------------------------------------------------------------- 125 // StringReplace() 126 // Replace the "old" pattern with the "new" pattern in a string, 127 // and append the result to "res". If replace_all is false, 128 // it only replaces the first instance of "old." 129 // ---------------------------------------------------------------------- 130 131 void StringReplace(const string& s, const string& oldsub, 132 const string& newsub, bool replace_all, 133 string* res) { 134 if (oldsub.empty()) { 135 res->append(s); // if empty, append the given string. 136 return; 137 } 138 139 string::size_type start_pos = 0; 140 string::size_type pos; 141 do { 142 pos = s.find(oldsub, start_pos); 143 if (pos == string::npos) { 144 break; 145 } 146 res->append(s, start_pos, pos - start_pos); 147 res->append(newsub); 148 start_pos = pos + oldsub.size(); // start searching again after the "old" 149 } while (replace_all); 150 res->append(s, start_pos, s.length() - start_pos); 151 } 152 153 // ---------------------------------------------------------------------- 154 // StringReplace() 155 // Give me a string and two patterns "old" and "new", and I replace 156 // the first instance of "old" in the string with "new", if it 157 // exists. If "global" is true; call this repeatedly until it 158 // fails. RETURN a new string, regardless of whether the replacement 159 // happened or not. 160 // ---------------------------------------------------------------------- 161 162 string StringReplace(const string& s, const string& oldsub, 163 const string& newsub, bool replace_all) { 164 string ret; 165 StringReplace(s, oldsub, newsub, replace_all, &ret); 166 return ret; 167 } 168 169 // ---------------------------------------------------------------------- 170 // SplitStringUsing() 171 // Split a string using a character delimiter. Append the components 172 // to 'result'. 173 // 174 // Note: For multi-character delimiters, this routine will split on *ANY* of 175 // the characters in the string, not the entire string as a single delimiter. 176 // ---------------------------------------------------------------------- 177 template <typename ITR> 178 static inline 179 void SplitStringToIteratorUsing(const string& full, 180 const char* delim, 181 ITR& result) { 182 // Optimize the common case where delim is a single character. 183 if (delim[0] != '\0' && delim[1] == '\0') { 184 char c = delim[0]; 185 const char* p = full.data(); 186 const char* end = p + full.size(); 187 while (p != end) { 188 if (*p == c) { 189 ++p; 190 } else { 191 const char* start = p; 192 while (++p != end && *p != c); 193 *result++ = string(start, p - start); 194 } 195 } 196 return; 197 } 198 199 string::size_type begin_index, end_index; 200 begin_index = full.find_first_not_of(delim); 201 while (begin_index != string::npos) { 202 end_index = full.find_first_of(delim, begin_index); 203 if (end_index == string::npos) { 204 *result++ = full.substr(begin_index); 205 return; 206 } 207 *result++ = full.substr(begin_index, (end_index - begin_index)); 208 begin_index = full.find_first_not_of(delim, end_index); 209 } 210 } 211 212 void SplitStringUsing(const string& full, 213 const char* delim, 214 vector<string>* result) { 215 back_insert_iterator< vector<string> > it(*result); 216 SplitStringToIteratorUsing(full, delim, it); 217 } 218 219 // Split a string using a character delimiter. Append the components 220 // to 'result'. If there are consecutive delimiters, this function 221 // will return corresponding empty strings. The string is split into 222 // at most the specified number of pieces greedily. This means that the 223 // last piece may possibly be split further. To split into as many pieces 224 // as possible, specify 0 as the number of pieces. 225 // 226 // If "full" is the empty string, yields an empty string as the only value. 227 // 228 // If "pieces" is negative for some reason, it returns the whole string 229 // ---------------------------------------------------------------------- 230 template <typename StringType, typename ITR> 231 static inline 232 void SplitStringToIteratorAllowEmpty(const StringType& full, 233 const char* delim, 234 int pieces, 235 ITR& result) { 236 string::size_type begin_index, end_index; 237 begin_index = 0; 238 239 for (int i = 0; (i < pieces-1) || (pieces == 0); i++) { 240 end_index = full.find_first_of(delim, begin_index); 241 if (end_index == string::npos) { 242 *result++ = full.substr(begin_index); 243 return; 244 } 245 *result++ = full.substr(begin_index, (end_index - begin_index)); 246 begin_index = end_index + 1; 247 } 248 *result++ = full.substr(begin_index); 249 } 250 251 void SplitStringAllowEmpty(const string& full, const char* delim, 252 vector<string>* result) { 253 back_insert_iterator<vector<string> > it(*result); 254 SplitStringToIteratorAllowEmpty(full, delim, 0, it); 255 } 256 257 // ---------------------------------------------------------------------- 258 // JoinStrings() 259 // This merges a vector of string components with delim inserted 260 // as separaters between components. 261 // 262 // ---------------------------------------------------------------------- 263 template <class ITERATOR> 264 static void JoinStringsIterator(const ITERATOR& start, 265 const ITERATOR& end, 266 const char* delim, 267 string* result) { 268 GOOGLE_CHECK(result != NULL); 269 result->clear(); 270 int delim_length = strlen(delim); 271 272 // Precompute resulting length so we can reserve() memory in one shot. 273 int length = 0; 274 for (ITERATOR iter = start; iter != end; ++iter) { 275 if (iter != start) { 276 length += delim_length; 277 } 278 length += iter->size(); 279 } 280 result->reserve(length); 281 282 // Now combine everything. 283 for (ITERATOR iter = start; iter != end; ++iter) { 284 if (iter != start) { 285 result->append(delim, delim_length); 286 } 287 result->append(iter->data(), iter->size()); 288 } 289 } 290 291 void JoinStrings(const vector<string>& components, 292 const char* delim, 293 string * result) { 294 JoinStringsIterator(components.begin(), components.end(), delim, result); 295 } 296 297 // ---------------------------------------------------------------------- 298 // UnescapeCEscapeSequences() 299 // This does all the unescaping that C does: \ooo, \r, \n, etc 300 // Returns length of resulting string. 301 // The implementation of \x parses any positive number of hex digits, 302 // but it is an error if the value requires more than 8 bits, and the 303 // result is truncated to 8 bits. 304 // 305 // The second call stores its errors in a supplied string vector. 306 // If the string vector pointer is NULL, it reports the errors with LOG(). 307 // ---------------------------------------------------------------------- 308 309 #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7')) 310 311 // Protocol buffers doesn't ever care about errors, but I don't want to remove 312 // the code. 313 #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false) 314 315 int UnescapeCEscapeSequences(const char* source, char* dest) { 316 return UnescapeCEscapeSequences(source, dest, NULL); 317 } 318 319 int UnescapeCEscapeSequences(const char* source, char* dest, 320 vector<string> *errors) { 321 GOOGLE_DCHECK(errors == NULL) << "Error reporting not implemented."; 322 323 char* d = dest; 324 const char* p = source; 325 326 // Small optimization for case where source = dest and there's no escaping 327 while ( p == d && *p != '\0' && *p != '\\' ) 328 p++, d++; 329 330 while (*p != '\0') { 331 if (*p != '\\') { 332 *d++ = *p++; 333 } else { 334 switch ( *++p ) { // skip past the '\\' 335 case '\0': 336 LOG_STRING(ERROR, errors) << "String cannot end with \\"; 337 *d = '\0'; 338 return d - dest; // we're done with p 339 case 'a': *d++ = '\a'; break; 340 case 'b': *d++ = '\b'; break; 341 case 'f': *d++ = '\f'; break; 342 case 'n': *d++ = '\n'; break; 343 case 'r': *d++ = '\r'; break; 344 case 't': *d++ = '\t'; break; 345 case 'v': *d++ = '\v'; break; 346 case '\\': *d++ = '\\'; break; 347 case '?': *d++ = '\?'; break; // \? Who knew? 348 case '\'': *d++ = '\''; break; 349 case '"': *d++ = '\"'; break; 350 case '0': case '1': case '2': case '3': // octal digit: 1 to 3 digits 351 case '4': case '5': case '6': case '7': { 352 char ch = *p - '0'; 353 if ( IS_OCTAL_DIGIT(p[1]) ) 354 ch = ch * 8 + *++p - '0'; 355 if ( IS_OCTAL_DIGIT(p[1]) ) // safe (and easy) to do this twice 356 ch = ch * 8 + *++p - '0'; // now points at last digit 357 *d++ = ch; 358 break; 359 } 360 case 'x': case 'X': { 361 if (!isxdigit(p[1])) { 362 if (p[1] == '\0') { 363 LOG_STRING(ERROR, errors) << "String cannot end with \\x"; 364 } else { 365 LOG_STRING(ERROR, errors) << 366 "\\x cannot be followed by non-hex digit: \\" << *p << p[1]; 367 } 368 break; 369 } 370 unsigned int ch = 0; 371 const char *hex_start = p; 372 while (isxdigit(p[1])) // arbitrarily many hex digits 373 ch = (ch << 4) + hex_digit_to_int(*++p); 374 if (ch > 0xFF) 375 LOG_STRING(ERROR, errors) << "Value of " << 376 "\\" << string(hex_start, p+1-hex_start) << " exceeds 8 bits"; 377 *d++ = ch; 378 break; 379 } 380 #if 0 // TODO(kenton): Support \u and \U? Requires runetochar(). 381 case 'u': { 382 // \uhhhh => convert 4 hex digits to UTF-8 383 char32 rune = 0; 384 const char *hex_start = p; 385 for (int i = 0; i < 4; ++i) { 386 if (isxdigit(p[1])) { // Look one char ahead. 387 rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p. 388 } else { 389 LOG_STRING(ERROR, errors) 390 << "\\u must be followed by 4 hex digits: \\" 391 << string(hex_start, p+1-hex_start); 392 break; 393 } 394 } 395 d += runetochar(d, &rune); 396 break; 397 } 398 case 'U': { 399 // \Uhhhhhhhh => convert 8 hex digits to UTF-8 400 char32 rune = 0; 401 const char *hex_start = p; 402 for (int i = 0; i < 8; ++i) { 403 if (isxdigit(p[1])) { // Look one char ahead. 404 // Don't change rune until we're sure this 405 // is within the Unicode limit, but do advance p. 406 char32 newrune = (rune << 4) + hex_digit_to_int(*++p); 407 if (newrune > 0x10FFFF) { 408 LOG_STRING(ERROR, errors) 409 << "Value of \\" 410 << string(hex_start, p + 1 - hex_start) 411 << " exceeds Unicode limit (0x10FFFF)"; 412 break; 413 } else { 414 rune = newrune; 415 } 416 } else { 417 LOG_STRING(ERROR, errors) 418 << "\\U must be followed by 8 hex digits: \\" 419 << string(hex_start, p+1-hex_start); 420 break; 421 } 422 } 423 d += runetochar(d, &rune); 424 break; 425 } 426 #endif 427 default: 428 LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p; 429 } 430 p++; // read past letter we escaped 431 } 432 } 433 *d = '\0'; 434 return d - dest; 435 } 436 437 // ---------------------------------------------------------------------- 438 // UnescapeCEscapeString() 439 // This does the same thing as UnescapeCEscapeSequences, but creates 440 // a new string. The caller does not need to worry about allocating 441 // a dest buffer. This should be used for non performance critical 442 // tasks such as printing debug messages. It is safe for src and dest 443 // to be the same. 444 // 445 // The second call stores its errors in a supplied string vector. 446 // If the string vector pointer is NULL, it reports the errors with LOG(). 447 // 448 // In the first and second calls, the length of dest is returned. In the 449 // the third call, the new string is returned. 450 // ---------------------------------------------------------------------- 451 int UnescapeCEscapeString(const string& src, string* dest) { 452 return UnescapeCEscapeString(src, dest, NULL); 453 } 454 455 int UnescapeCEscapeString(const string& src, string* dest, 456 vector<string> *errors) { 457 scoped_array<char> unescaped(new char[src.size() + 1]); 458 int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors); 459 GOOGLE_CHECK(dest); 460 dest->assign(unescaped.get(), len); 461 return len; 462 } 463 464 string UnescapeCEscapeString(const string& src) { 465 scoped_array<char> unescaped(new char[src.size() + 1]); 466 int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), NULL); 467 return string(unescaped.get(), len); 468 } 469 470 // ---------------------------------------------------------------------- 471 // CEscapeString() 472 // CHexEscapeString() 473 // Copies 'src' to 'dest', escaping dangerous characters using 474 // C-style escape sequences. This is very useful for preparing query 475 // flags. 'src' and 'dest' should not overlap. The 'Hex' version uses 476 // hexadecimal rather than octal sequences. 477 // Returns the number of bytes written to 'dest' (not including the \0) 478 // or -1 if there was insufficient space. 479 // 480 // Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped. 481 // ---------------------------------------------------------------------- 482 int CEscapeInternal(const char* src, int src_len, char* dest, 483 int dest_len, bool use_hex, bool utf8_safe) { 484 const char* src_end = src + src_len; 485 int used = 0; 486 bool last_hex_escape = false; // true if last output char was \xNN 487 488 for (; src < src_end; src++) { 489 if (dest_len - used < 2) // Need space for two letter escape 490 return -1; 491 492 bool is_hex_escape = false; 493 switch (*src) { 494 case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break; 495 case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break; 496 case '\t': dest[used++] = '\\'; dest[used++] = 't'; break; 497 case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break; 498 case '\'': dest[used++] = '\\'; dest[used++] = '\''; break; 499 case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break; 500 default: 501 // Note that if we emit \xNN and the src character after that is a hex 502 // digit then that digit must be escaped too to prevent it being 503 // interpreted as part of the character code by C. 504 if ((!utf8_safe || static_cast<uint8>(*src) < 0x80) && 505 (!isprint(*src) || 506 (last_hex_escape && isxdigit(*src)))) { 507 if (dest_len - used < 4) // need space for 4 letter escape 508 return -1; 509 sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"), 510 static_cast<uint8>(*src)); 511 is_hex_escape = use_hex; 512 used += 4; 513 } else { 514 dest[used++] = *src; break; 515 } 516 } 517 last_hex_escape = is_hex_escape; 518 } 519 520 if (dest_len - used < 1) // make sure that there is room for \0 521 return -1; 522 523 dest[used] = '\0'; // doesn't count towards return value though 524 return used; 525 } 526 527 // Calculates the length of the C-style escaped version of 'src'. 528 // Assumes that non-printable characters are escaped using octal sequences, and 529 // that UTF-8 bytes are not handled specially. 530 static inline size_t CEscapedLength(StringPiece src) { 531 static char c_escaped_len[256] = { 532 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4, // \t, \n, \r 533 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 534 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // ", ' 535 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // '0'..'9' 536 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'A'..'O' 537 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, // 'P'..'Z', '\' 538 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'a'..'o' 539 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, // 'p'..'z', DEL 540 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 541 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 542 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 543 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 544 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 545 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 546 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 547 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 548 }; 549 550 size_t escaped_len = 0; 551 for (int i = 0; i < src.size(); ++i) { 552 unsigned char c = static_cast<unsigned char>(src[i]); 553 escaped_len += c_escaped_len[c]; 554 } 555 return escaped_len; 556 } 557 558 // ---------------------------------------------------------------------- 559 // Escapes 'src' using C-style escape sequences, and appends the escaped string 560 // to 'dest'. This version is faster than calling CEscapeInternal as it computes 561 // the required space using a lookup table, and also does not do any special 562 // handling for Hex or UTF-8 characters. 563 // ---------------------------------------------------------------------- 564 void CEscapeAndAppend(StringPiece src, string* dest) { 565 size_t escaped_len = CEscapedLength(src); 566 if (escaped_len == src.size()) { 567 dest->append(src.data(), src.size()); 568 return; 569 } 570 571 size_t cur_dest_len = dest->size(); 572 dest->resize(cur_dest_len + escaped_len); 573 char* append_ptr = &(*dest)[cur_dest_len]; 574 575 for (int i = 0; i < src.size(); ++i) { 576 unsigned char c = static_cast<unsigned char>(src[i]); 577 switch (c) { 578 case '\n': *append_ptr++ = '\\'; *append_ptr++ = 'n'; break; 579 case '\r': *append_ptr++ = '\\'; *append_ptr++ = 'r'; break; 580 case '\t': *append_ptr++ = '\\'; *append_ptr++ = 't'; break; 581 case '\"': *append_ptr++ = '\\'; *append_ptr++ = '\"'; break; 582 case '\'': *append_ptr++ = '\\'; *append_ptr++ = '\''; break; 583 case '\\': *append_ptr++ = '\\'; *append_ptr++ = '\\'; break; 584 default: 585 if (!isprint(c)) { 586 *append_ptr++ = '\\'; 587 *append_ptr++ = '0' + c / 64; 588 *append_ptr++ = '0' + (c % 64) / 8; 589 *append_ptr++ = '0' + c % 8; 590 } else { 591 *append_ptr++ = c; 592 } 593 break; 594 } 595 } 596 } 597 598 string CEscape(const string& src) { 599 string dest; 600 CEscapeAndAppend(src, &dest); 601 return dest; 602 } 603 604 namespace strings { 605 606 string Utf8SafeCEscape(const string& src) { 607 const int dest_length = src.size() * 4 + 1; // Maximum possible expansion 608 scoped_array<char> dest(new char[dest_length]); 609 const int len = CEscapeInternal(src.data(), src.size(), 610 dest.get(), dest_length, false, true); 611 GOOGLE_DCHECK_GE(len, 0); 612 return string(dest.get(), len); 613 } 614 615 string CHexEscape(const string& src) { 616 const int dest_length = src.size() * 4 + 1; // Maximum possible expansion 617 scoped_array<char> dest(new char[dest_length]); 618 const int len = CEscapeInternal(src.data(), src.size(), 619 dest.get(), dest_length, true, false); 620 GOOGLE_DCHECK_GE(len, 0); 621 return string(dest.get(), len); 622 } 623 624 } // namespace strings 625 626 // ---------------------------------------------------------------------- 627 // strto32_adaptor() 628 // strtou32_adaptor() 629 // Implementation of strto[u]l replacements that have identical 630 // overflow and underflow characteristics for both ILP-32 and LP-64 631 // platforms, including errno preservation in error-free calls. 632 // ---------------------------------------------------------------------- 633 634 int32 strto32_adaptor(const char *nptr, char **endptr, int base) { 635 const int saved_errno = errno; 636 errno = 0; 637 const long result = strtol(nptr, endptr, base); 638 if (errno == ERANGE && result == LONG_MIN) { 639 return kint32min; 640 } else if (errno == ERANGE && result == LONG_MAX) { 641 return kint32max; 642 } else if (errno == 0 && result < kint32min) { 643 errno = ERANGE; 644 return kint32min; 645 } else if (errno == 0 && result > kint32max) { 646 errno = ERANGE; 647 return kint32max; 648 } 649 if (errno == 0) 650 errno = saved_errno; 651 return static_cast<int32>(result); 652 } 653 654 uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) { 655 const int saved_errno = errno; 656 errno = 0; 657 const unsigned long result = strtoul(nptr, endptr, base); 658 if (errno == ERANGE && result == ULONG_MAX) { 659 return kuint32max; 660 } else if (errno == 0 && result > kuint32max) { 661 errno = ERANGE; 662 return kuint32max; 663 } 664 if (errno == 0) 665 errno = saved_errno; 666 return static_cast<uint32>(result); 667 } 668 669 inline bool safe_parse_sign(string* text /*inout*/, 670 bool* negative_ptr /*output*/) { 671 const char* start = text->data(); 672 const char* end = start + text->size(); 673 674 // Consume whitespace. 675 while (start < end && (start[0] == ' ')) { 676 ++start; 677 } 678 while (start < end && (end[-1] == ' ')) { 679 --end; 680 } 681 if (start >= end) { 682 return false; 683 } 684 685 // Consume sign. 686 *negative_ptr = (start[0] == '-'); 687 if (*negative_ptr || start[0] == '+') { 688 ++start; 689 if (start >= end) { 690 return false; 691 } 692 } 693 *text = text->substr(start - text->data(), end - start); 694 return true; 695 } 696 697 template<typename IntType> 698 bool safe_parse_positive_int( 699 string text, IntType* value_p) { 700 int base = 10; 701 IntType value = 0; 702 const IntType vmax = std::numeric_limits<IntType>::max(); 703 assert(vmax > 0); 704 assert(vmax >= base); 705 const IntType vmax_over_base = vmax / base; 706 const char* start = text.data(); 707 const char* end = start + text.size(); 708 // loop over digits 709 for (; start < end; ++start) { 710 unsigned char c = static_cast<unsigned char>(start[0]); 711 int digit = c - '0'; 712 if (digit >= base || digit < 0) { 713 *value_p = value; 714 return false; 715 } 716 if (value > vmax_over_base) { 717 *value_p = vmax; 718 return false; 719 } 720 value *= base; 721 if (value > vmax - digit) { 722 *value_p = vmax; 723 return false; 724 } 725 value += digit; 726 } 727 *value_p = value; 728 return true; 729 } 730 731 template<typename IntType> 732 bool safe_parse_negative_int( 733 const string& text, IntType* value_p) { 734 int base = 10; 735 IntType value = 0; 736 const IntType vmin = std::numeric_limits<IntType>::min(); 737 assert(vmin < 0); 738 assert(vmin <= 0 - base); 739 IntType vmin_over_base = vmin / base; 740 // 2003 c++ standard [expr.mul] 741 // "... the sign of the remainder is implementation-defined." 742 // Although (vmin/base)*base + vmin%base is always vmin. 743 // 2011 c++ standard tightens the spec but we cannot rely on it. 744 if (vmin % base > 0) { 745 vmin_over_base += 1; 746 } 747 const char* start = text.data(); 748 const char* end = start + text.size(); 749 // loop over digits 750 for (; start < end; ++start) { 751 unsigned char c = static_cast<unsigned char>(start[0]); 752 int digit = c - '0'; 753 if (digit >= base || digit < 0) { 754 *value_p = value; 755 return false; 756 } 757 if (value < vmin_over_base) { 758 *value_p = vmin; 759 return false; 760 } 761 value *= base; 762 if (value < vmin + digit) { 763 *value_p = vmin; 764 return false; 765 } 766 value -= digit; 767 } 768 *value_p = value; 769 return true; 770 } 771 772 template<typename IntType> 773 bool safe_int_internal(string text, IntType* value_p) { 774 *value_p = 0; 775 bool negative; 776 if (!safe_parse_sign(&text, &negative)) { 777 return false; 778 } 779 if (!negative) { 780 return safe_parse_positive_int(text, value_p); 781 } else { 782 return safe_parse_negative_int(text, value_p); 783 } 784 } 785 786 template<typename IntType> 787 bool safe_uint_internal(string text, IntType* value_p) { 788 *value_p = 0; 789 bool negative; 790 if (!safe_parse_sign(&text, &negative) || negative) { 791 return false; 792 } 793 return safe_parse_positive_int(text, value_p); 794 } 795 796 // ---------------------------------------------------------------------- 797 // FastIntToBuffer() 798 // FastInt64ToBuffer() 799 // FastHexToBuffer() 800 // FastHex64ToBuffer() 801 // FastHex32ToBuffer() 802 // ---------------------------------------------------------------------- 803 804 // Offset into buffer where FastInt64ToBuffer places the end of string 805 // null character. Also used by FastInt64ToBufferLeft. 806 static const int kFastInt64ToBufferOffset = 21; 807 808 char *FastInt64ToBuffer(int64 i, char* buffer) { 809 // We could collapse the positive and negative sections, but that 810 // would be slightly slower for positive numbers... 811 // 22 bytes is enough to store -2**64, -18446744073709551616. 812 char* p = buffer + kFastInt64ToBufferOffset; 813 *p-- = '\0'; 814 if (i >= 0) { 815 do { 816 *p-- = '0' + i % 10; 817 i /= 10; 818 } while (i > 0); 819 return p + 1; 820 } else { 821 // On different platforms, % and / have different behaviors for 822 // negative numbers, so we need to jump through hoops to make sure 823 // we don't divide negative numbers. 824 if (i > -10) { 825 i = -i; 826 *p-- = '0' + i; 827 *p = '-'; 828 return p; 829 } else { 830 // Make sure we aren't at MIN_INT, in which case we can't say i = -i 831 i = i + 10; 832 i = -i; 833 *p-- = '0' + i % 10; 834 // Undo what we did a moment ago 835 i = i / 10 + 1; 836 do { 837 *p-- = '0' + i % 10; 838 i /= 10; 839 } while (i > 0); 840 *p = '-'; 841 return p; 842 } 843 } 844 } 845 846 // Offset into buffer where FastInt32ToBuffer places the end of string 847 // null character. Also used by FastInt32ToBufferLeft 848 static const int kFastInt32ToBufferOffset = 11; 849 850 // Yes, this is a duplicate of FastInt64ToBuffer. But, we need this for the 851 // compiler to generate 32 bit arithmetic instructions. It's much faster, at 852 // least with 32 bit binaries. 853 char *FastInt32ToBuffer(int32 i, char* buffer) { 854 // We could collapse the positive and negative sections, but that 855 // would be slightly slower for positive numbers... 856 // 12 bytes is enough to store -2**32, -4294967296. 857 char* p = buffer + kFastInt32ToBufferOffset; 858 *p-- = '\0'; 859 if (i >= 0) { 860 do { 861 *p-- = '0' + i % 10; 862 i /= 10; 863 } while (i > 0); 864 return p + 1; 865 } else { 866 // On different platforms, % and / have different behaviors for 867 // negative numbers, so we need to jump through hoops to make sure 868 // we don't divide negative numbers. 869 if (i > -10) { 870 i = -i; 871 *p-- = '0' + i; 872 *p = '-'; 873 return p; 874 } else { 875 // Make sure we aren't at MIN_INT, in which case we can't say i = -i 876 i = i + 10; 877 i = -i; 878 *p-- = '0' + i % 10; 879 // Undo what we did a moment ago 880 i = i / 10 + 1; 881 do { 882 *p-- = '0' + i % 10; 883 i /= 10; 884 } while (i > 0); 885 *p = '-'; 886 return p; 887 } 888 } 889 } 890 891 char *FastHexToBuffer(int i, char* buffer) { 892 GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i; 893 894 static const char *hexdigits = "0123456789abcdef"; 895 char *p = buffer + 21; 896 *p-- = '\0'; 897 do { 898 *p-- = hexdigits[i & 15]; // mod by 16 899 i >>= 4; // divide by 16 900 } while (i > 0); 901 return p + 1; 902 } 903 904 char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) { 905 static const char *hexdigits = "0123456789abcdef"; 906 buffer[num_byte] = '\0'; 907 for (int i = num_byte - 1; i >= 0; i--) { 908 #ifdef _M_X64 909 // MSVC x64 platform has a bug optimizing the uint32(value) in the #else 910 // block. Given that the uint32 cast was to improve performance on 32-bit 911 // platforms, we use 64-bit '&' directly. 912 buffer[i] = hexdigits[value & 0xf]; 913 #else 914 buffer[i] = hexdigits[uint32(value) & 0xf]; 915 #endif 916 value >>= 4; 917 } 918 return buffer; 919 } 920 921 char *FastHex64ToBuffer(uint64 value, char* buffer) { 922 return InternalFastHexToBuffer(value, buffer, 16); 923 } 924 925 char *FastHex32ToBuffer(uint32 value, char* buffer) { 926 return InternalFastHexToBuffer(value, buffer, 8); 927 } 928 929 // ---------------------------------------------------------------------- 930 // FastInt32ToBufferLeft() 931 // FastUInt32ToBufferLeft() 932 // FastInt64ToBufferLeft() 933 // FastUInt64ToBufferLeft() 934 // 935 // Like the Fast*ToBuffer() functions above, these are intended for speed. 936 // Unlike the Fast*ToBuffer() functions, however, these functions write 937 // their output to the beginning of the buffer (hence the name, as the 938 // output is left-aligned). The caller is responsible for ensuring that 939 // the buffer has enough space to hold the output. 940 // 941 // Returns a pointer to the end of the string (i.e. the null character 942 // terminating the string). 943 // ---------------------------------------------------------------------- 944 945 static const char two_ASCII_digits[100][2] = { 946 {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'}, 947 {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'}, 948 {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, 949 {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'}, 950 {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, 951 {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'}, 952 {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, 953 {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'}, 954 {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, 955 {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'}, 956 {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, 957 {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'}, 958 {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, 959 {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'}, 960 {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, 961 {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'}, 962 {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, 963 {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'}, 964 {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, 965 {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'} 966 }; 967 968 char* FastUInt32ToBufferLeft(uint32 u, char* buffer) { 969 int digits; 970 const char *ASCII_digits = NULL; 971 // The idea of this implementation is to trim the number of divides to as few 972 // as possible by using multiplication and subtraction rather than mod (%), 973 // and by outputting two digits at a time rather than one. 974 // The huge-number case is first, in the hopes that the compiler will output 975 // that case in one branch-free block of code, and only output conditional 976 // branches into it from below. 977 if (u >= 1000000000) { // >= 1,000,000,000 978 digits = u / 100000000; // 100,000,000 979 ASCII_digits = two_ASCII_digits[digits]; 980 buffer[0] = ASCII_digits[0]; 981 buffer[1] = ASCII_digits[1]; 982 buffer += 2; 983 sublt100_000_000: 984 u -= digits * 100000000; // 100,000,000 985 lt100_000_000: 986 digits = u / 1000000; // 1,000,000 987 ASCII_digits = two_ASCII_digits[digits]; 988 buffer[0] = ASCII_digits[0]; 989 buffer[1] = ASCII_digits[1]; 990 buffer += 2; 991 sublt1_000_000: 992 u -= digits * 1000000; // 1,000,000 993 lt1_000_000: 994 digits = u / 10000; // 10,000 995 ASCII_digits = two_ASCII_digits[digits]; 996 buffer[0] = ASCII_digits[0]; 997 buffer[1] = ASCII_digits[1]; 998 buffer += 2; 999 sublt10_000: 1000 u -= digits * 10000; // 10,000 1001 lt10_000: 1002 digits = u / 100; 1003 ASCII_digits = two_ASCII_digits[digits]; 1004 buffer[0] = ASCII_digits[0]; 1005 buffer[1] = ASCII_digits[1]; 1006 buffer += 2; 1007 sublt100: 1008 u -= digits * 100; 1009 lt100: 1010 digits = u; 1011 ASCII_digits = two_ASCII_digits[digits]; 1012 buffer[0] = ASCII_digits[0]; 1013 buffer[1] = ASCII_digits[1]; 1014 buffer += 2; 1015 done: 1016 *buffer = 0; 1017 return buffer; 1018 } 1019 1020 if (u < 100) { 1021 digits = u; 1022 if (u >= 10) goto lt100; 1023 *buffer++ = '0' + digits; 1024 goto done; 1025 } 1026 if (u < 10000) { // 10,000 1027 if (u >= 1000) goto lt10_000; 1028 digits = u / 100; 1029 *buffer++ = '0' + digits; 1030 goto sublt100; 1031 } 1032 if (u < 1000000) { // 1,000,000 1033 if (u >= 100000) goto lt1_000_000; 1034 digits = u / 10000; // 10,000 1035 *buffer++ = '0' + digits; 1036 goto sublt10_000; 1037 } 1038 if (u < 100000000) { // 100,000,000 1039 if (u >= 10000000) goto lt100_000_000; 1040 digits = u / 1000000; // 1,000,000 1041 *buffer++ = '0' + digits; 1042 goto sublt1_000_000; 1043 } 1044 // we already know that u < 1,000,000,000 1045 digits = u / 100000000; // 100,000,000 1046 *buffer++ = '0' + digits; 1047 goto sublt100_000_000; 1048 } 1049 1050 char* FastInt32ToBufferLeft(int32 i, char* buffer) { 1051 uint32 u = i; 1052 if (i < 0) { 1053 *buffer++ = '-'; 1054 u = -i; 1055 } 1056 return FastUInt32ToBufferLeft(u, buffer); 1057 } 1058 1059 char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) { 1060 int digits; 1061 const char *ASCII_digits = NULL; 1062 1063 uint32 u = static_cast<uint32>(u64); 1064 if (u == u64) return FastUInt32ToBufferLeft(u, buffer); 1065 1066 uint64 top_11_digits = u64 / 1000000000; 1067 buffer = FastUInt64ToBufferLeft(top_11_digits, buffer); 1068 u = u64 - (top_11_digits * 1000000000); 1069 1070 digits = u / 10000000; // 10,000,000 1071 GOOGLE_DCHECK_LT(digits, 100); 1072 ASCII_digits = two_ASCII_digits[digits]; 1073 buffer[0] = ASCII_digits[0]; 1074 buffer[1] = ASCII_digits[1]; 1075 buffer += 2; 1076 u -= digits * 10000000; // 10,000,000 1077 digits = u / 100000; // 100,000 1078 ASCII_digits = two_ASCII_digits[digits]; 1079 buffer[0] = ASCII_digits[0]; 1080 buffer[1] = ASCII_digits[1]; 1081 buffer += 2; 1082 u -= digits * 100000; // 100,000 1083 digits = u / 1000; // 1,000 1084 ASCII_digits = two_ASCII_digits[digits]; 1085 buffer[0] = ASCII_digits[0]; 1086 buffer[1] = ASCII_digits[1]; 1087 buffer += 2; 1088 u -= digits * 1000; // 1,000 1089 digits = u / 10; 1090 ASCII_digits = two_ASCII_digits[digits]; 1091 buffer[0] = ASCII_digits[0]; 1092 buffer[1] = ASCII_digits[1]; 1093 buffer += 2; 1094 u -= digits * 10; 1095 digits = u; 1096 *buffer++ = '0' + digits; 1097 *buffer = 0; 1098 return buffer; 1099 } 1100 1101 char* FastInt64ToBufferLeft(int64 i, char* buffer) { 1102 uint64 u = i; 1103 if (i < 0) { 1104 *buffer++ = '-'; 1105 u = -i; 1106 } 1107 return FastUInt64ToBufferLeft(u, buffer); 1108 } 1109 1110 // ---------------------------------------------------------------------- 1111 // SimpleItoa() 1112 // Description: converts an integer to a string. 1113 // 1114 // Return value: string 1115 // ---------------------------------------------------------------------- 1116 1117 string SimpleItoa(int i) { 1118 char buffer[kFastToBufferSize]; 1119 return (sizeof(i) == 4) ? 1120 FastInt32ToBuffer(i, buffer) : 1121 FastInt64ToBuffer(i, buffer); 1122 } 1123 1124 string SimpleItoa(unsigned int i) { 1125 char buffer[kFastToBufferSize]; 1126 return string(buffer, (sizeof(i) == 4) ? 1127 FastUInt32ToBufferLeft(i, buffer) : 1128 FastUInt64ToBufferLeft(i, buffer)); 1129 } 1130 1131 string SimpleItoa(long i) { 1132 char buffer[kFastToBufferSize]; 1133 return (sizeof(i) == 4) ? 1134 FastInt32ToBuffer(i, buffer) : 1135 FastInt64ToBuffer(i, buffer); 1136 } 1137 1138 string SimpleItoa(unsigned long i) { 1139 char buffer[kFastToBufferSize]; 1140 return string(buffer, (sizeof(i) == 4) ? 1141 FastUInt32ToBufferLeft(i, buffer) : 1142 FastUInt64ToBufferLeft(i, buffer)); 1143 } 1144 1145 string SimpleItoa(long long i) { 1146 char buffer[kFastToBufferSize]; 1147 return (sizeof(i) == 4) ? 1148 FastInt32ToBuffer(i, buffer) : 1149 FastInt64ToBuffer(i, buffer); 1150 } 1151 1152 string SimpleItoa(unsigned long long i) { 1153 char buffer[kFastToBufferSize]; 1154 return string(buffer, (sizeof(i) == 4) ? 1155 FastUInt32ToBufferLeft(i, buffer) : 1156 FastUInt64ToBufferLeft(i, buffer)); 1157 } 1158 1159 // ---------------------------------------------------------------------- 1160 // SimpleDtoa() 1161 // SimpleFtoa() 1162 // DoubleToBuffer() 1163 // FloatToBuffer() 1164 // We want to print the value without losing precision, but we also do 1165 // not want to print more digits than necessary. This turns out to be 1166 // trickier than it sounds. Numbers like 0.2 cannot be represented 1167 // exactly in binary. If we print 0.2 with a very large precision, 1168 // e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167". 1169 // On the other hand, if we set the precision too low, we lose 1170 // significant digits when printing numbers that actually need them. 1171 // It turns out there is no precision value that does the right thing 1172 // for all numbers. 1173 // 1174 // Our strategy is to first try printing with a precision that is never 1175 // over-precise, then parse the result with strtod() to see if it 1176 // matches. If not, we print again with a precision that will always 1177 // give a precise result, but may use more digits than necessary. 1178 // 1179 // An arguably better strategy would be to use the algorithm described 1180 // in "How to Print Floating-Point Numbers Accurately" by Steele & 1181 // White, e.g. as implemented by David M. Gay's dtoa(). It turns out, 1182 // however, that the following implementation is about as fast as 1183 // DMG's code. Furthermore, DMG's code locks mutexes, which means it 1184 // will not scale well on multi-core machines. DMG's code is slightly 1185 // more accurate (in that it will never use more digits than 1186 // necessary), but this is probably irrelevant for most users. 1187 // 1188 // Rob Pike and Ken Thompson also have an implementation of dtoa() in 1189 // third_party/fmt/fltfmt.cc. Their implementation is similar to this 1190 // one in that it makes guesses and then uses strtod() to check them. 1191 // Their implementation is faster because they use their own code to 1192 // generate the digits in the first place rather than use snprintf(), 1193 // thus avoiding format string parsing overhead. However, this makes 1194 // it considerably more complicated than the following implementation, 1195 // and it is embedded in a larger library. If speed turns out to be 1196 // an issue, we could re-implement this in terms of their 1197 // implementation. 1198 // ---------------------------------------------------------------------- 1199 1200 string SimpleDtoa(double value) { 1201 char buffer[kDoubleToBufferSize]; 1202 return DoubleToBuffer(value, buffer); 1203 } 1204 1205 string SimpleFtoa(float value) { 1206 char buffer[kFloatToBufferSize]; 1207 return FloatToBuffer(value, buffer); 1208 } 1209 1210 static inline bool IsValidFloatChar(char c) { 1211 return ('0' <= c && c <= '9') || 1212 c == 'e' || c == 'E' || 1213 c == '+' || c == '-'; 1214 } 1215 1216 void DelocalizeRadix(char* buffer) { 1217 // Fast check: if the buffer has a normal decimal point, assume no 1218 // translation is needed. 1219 if (strchr(buffer, '.') != NULL) return; 1220 1221 // Find the first unknown character. 1222 while (IsValidFloatChar(*buffer)) ++buffer; 1223 1224 if (*buffer == '\0') { 1225 // No radix character found. 1226 return; 1227 } 1228 1229 // We are now pointing at the locale-specific radix character. Replace it 1230 // with '.'. 1231 *buffer = '.'; 1232 ++buffer; 1233 1234 if (!IsValidFloatChar(*buffer) && *buffer != '\0') { 1235 // It appears the radix was a multi-byte character. We need to remove the 1236 // extra bytes. 1237 char* target = buffer; 1238 do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0'); 1239 memmove(target, buffer, strlen(buffer) + 1); 1240 } 1241 } 1242 1243 char* DoubleToBuffer(double value, char* buffer) { 1244 // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all 1245 // platforms these days. Just in case some system exists where DBL_DIG 1246 // is significantly larger -- and risks overflowing our buffer -- we have 1247 // this assert. 1248 GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big); 1249 1250 if (value == numeric_limits<double>::infinity()) { 1251 strcpy(buffer, "inf"); 1252 return buffer; 1253 } else if (value == -numeric_limits<double>::infinity()) { 1254 strcpy(buffer, "-inf"); 1255 return buffer; 1256 } else if (MathLimits<double>::IsNaN(value)) { 1257 strcpy(buffer, "nan"); 1258 return buffer; 1259 } 1260 1261 int snprintf_result = 1262 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value); 1263 1264 // The snprintf should never overflow because the buffer is significantly 1265 // larger than the precision we asked for. 1266 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize); 1267 1268 // We need to make parsed_value volatile in order to force the compiler to 1269 // write it out to the stack. Otherwise, it may keep the value in a 1270 // register, and if it does that, it may keep it as a long double instead 1271 // of a double. This long double may have extra bits that make it compare 1272 // unequal to "value" even though it would be exactly equal if it were 1273 // truncated to a double. 1274 volatile double parsed_value = strtod(buffer, NULL); 1275 if (parsed_value != value) { 1276 int snprintf_result = 1277 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value); 1278 1279 // Should never overflow; see above. 1280 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize); 1281 } 1282 1283 DelocalizeRadix(buffer); 1284 return buffer; 1285 } 1286 1287 static int memcasecmp(const char *s1, const char *s2, size_t len) { 1288 const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1); 1289 const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2); 1290 1291 for ( int i = 0; i < len; i++ ) { 1292 const int diff = 1293 static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) - 1294 static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i]))); 1295 if (diff != 0) return diff; 1296 } 1297 return 0; 1298 } 1299 1300 inline bool CaseEqual(StringPiece s1, StringPiece s2) { 1301 if (s1.size() != s2.size()) return false; 1302 return memcasecmp(s1.data(), s2.data(), s1.size()) == 0; 1303 } 1304 1305 bool safe_strtob(StringPiece str, bool* value) { 1306 GOOGLE_CHECK(value != NULL) << "NULL output boolean given."; 1307 if (CaseEqual(str, "true") || CaseEqual(str, "t") || 1308 CaseEqual(str, "yes") || CaseEqual(str, "y") || 1309 CaseEqual(str, "1")) { 1310 *value = true; 1311 return true; 1312 } 1313 if (CaseEqual(str, "false") || CaseEqual(str, "f") || 1314 CaseEqual(str, "no") || CaseEqual(str, "n") || 1315 CaseEqual(str, "0")) { 1316 *value = false; 1317 return true; 1318 } 1319 return false; 1320 } 1321 1322 bool safe_strtof(const char* str, float* value) { 1323 char* endptr; 1324 errno = 0; // errno only gets set on errors 1325 #if defined(_WIN32) || defined (__hpux) // has no strtof() 1326 *value = strtod(str, &endptr); 1327 #else 1328 *value = strtof(str, &endptr); 1329 #endif 1330 return *str != 0 && *endptr == 0 && errno == 0; 1331 } 1332 1333 bool safe_strtod(const char* str, double* value) { 1334 char* endptr; 1335 *value = strtod(str, &endptr); 1336 if (endptr != str) { 1337 while (ascii_isspace(*endptr)) ++endptr; 1338 } 1339 // Ignore range errors from strtod. The values it 1340 // returns on underflow and overflow are the right 1341 // fallback in a robust setting. 1342 return *str != '\0' && *endptr == '\0'; 1343 } 1344 1345 bool safe_strto32(const string& str, int32* value) { 1346 return safe_int_internal(str, value); 1347 } 1348 1349 bool safe_strtou32(const string& str, uint32* value) { 1350 return safe_uint_internal(str, value); 1351 } 1352 1353 bool safe_strto64(const string& str, int64* value) { 1354 return safe_int_internal(str, value); 1355 } 1356 1357 bool safe_strtou64(const string& str, uint64* value) { 1358 return safe_uint_internal(str, value); 1359 } 1360 1361 char* FloatToBuffer(float value, char* buffer) { 1362 // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all 1363 // platforms these days. Just in case some system exists where FLT_DIG 1364 // is significantly larger -- and risks overflowing our buffer -- we have 1365 // this assert. 1366 GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big); 1367 1368 if (value == numeric_limits<double>::infinity()) { 1369 strcpy(buffer, "inf"); 1370 return buffer; 1371 } else if (value == -numeric_limits<double>::infinity()) { 1372 strcpy(buffer, "-inf"); 1373 return buffer; 1374 } else if (MathLimits<float>::IsNaN(value)) { 1375 strcpy(buffer, "nan"); 1376 return buffer; 1377 } 1378 1379 int snprintf_result = 1380 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value); 1381 1382 // The snprintf should never overflow because the buffer is significantly 1383 // larger than the precision we asked for. 1384 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize); 1385 1386 float parsed_value; 1387 if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) { 1388 int snprintf_result = 1389 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+2, value); 1390 1391 // Should never overflow; see above. 1392 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize); 1393 } 1394 1395 DelocalizeRadix(buffer); 1396 return buffer; 1397 } 1398 1399 namespace strings { 1400 1401 AlphaNum::AlphaNum(strings::Hex hex) { 1402 char *const end = &digits[kFastToBufferSize]; 1403 char *writer = end; 1404 uint64 value = hex.value; 1405 uint64 width = hex.spec; 1406 // We accomplish minimum width by OR'ing in 0x10000 to the user's value, 1407 // where 0x10000 is the smallest hex number that is as wide as the user 1408 // asked for. 1409 uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value; 1410 static const char hexdigits[] = "0123456789abcdef"; 1411 do { 1412 *--writer = hexdigits[value & 0xF]; 1413 value >>= 4; 1414 mask >>= 4; 1415 } while (mask != 0); 1416 piece_data_ = writer; 1417 piece_size_ = end - writer; 1418 } 1419 1420 } // namespace strings 1421 1422 // ---------------------------------------------------------------------- 1423 // StrCat() 1424 // This merges the given strings or integers, with no delimiter. This 1425 // is designed to be the fastest possible way to construct a string out 1426 // of a mix of raw C strings, C++ strings, and integer values. 1427 // ---------------------------------------------------------------------- 1428 1429 // Append is merely a version of memcpy that returns the address of the byte 1430 // after the area just overwritten. It comes in multiple flavors to minimize 1431 // call overhead. 1432 static char *Append1(char *out, const AlphaNum &x) { 1433 memcpy(out, x.data(), x.size()); 1434 return out + x.size(); 1435 } 1436 1437 static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) { 1438 memcpy(out, x1.data(), x1.size()); 1439 out += x1.size(); 1440 1441 memcpy(out, x2.data(), x2.size()); 1442 return out + x2.size(); 1443 } 1444 1445 static char *Append4(char *out, 1446 const AlphaNum &x1, const AlphaNum &x2, 1447 const AlphaNum &x3, const AlphaNum &x4) { 1448 memcpy(out, x1.data(), x1.size()); 1449 out += x1.size(); 1450 1451 memcpy(out, x2.data(), x2.size()); 1452 out += x2.size(); 1453 1454 memcpy(out, x3.data(), x3.size()); 1455 out += x3.size(); 1456 1457 memcpy(out, x4.data(), x4.size()); 1458 return out + x4.size(); 1459 } 1460 1461 string StrCat(const AlphaNum &a, const AlphaNum &b) { 1462 string result; 1463 result.resize(a.size() + b.size()); 1464 char *const begin = &*result.begin(); 1465 char *out = Append2(begin, a, b); 1466 GOOGLE_DCHECK_EQ(out, begin + result.size()); 1467 return result; 1468 } 1469 1470 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) { 1471 string result; 1472 result.resize(a.size() + b.size() + c.size()); 1473 char *const begin = &*result.begin(); 1474 char *out = Append2(begin, a, b); 1475 out = Append1(out, c); 1476 GOOGLE_DCHECK_EQ(out, begin + result.size()); 1477 return result; 1478 } 1479 1480 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, 1481 const AlphaNum &d) { 1482 string result; 1483 result.resize(a.size() + b.size() + c.size() + d.size()); 1484 char *const begin = &*result.begin(); 1485 char *out = Append4(begin, a, b, c, d); 1486 GOOGLE_DCHECK_EQ(out, begin + result.size()); 1487 return result; 1488 } 1489 1490 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, 1491 const AlphaNum &d, const AlphaNum &e) { 1492 string result; 1493 result.resize(a.size() + b.size() + c.size() + d.size() + e.size()); 1494 char *const begin = &*result.begin(); 1495 char *out = Append4(begin, a, b, c, d); 1496 out = Append1(out, e); 1497 GOOGLE_DCHECK_EQ(out, begin + result.size()); 1498 return result; 1499 } 1500 1501 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, 1502 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) { 1503 string result; 1504 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() + 1505 f.size()); 1506 char *const begin = &*result.begin(); 1507 char *out = Append4(begin, a, b, c, d); 1508 out = Append2(out, e, f); 1509 GOOGLE_DCHECK_EQ(out, begin + result.size()); 1510 return result; 1511 } 1512 1513 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, 1514 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, 1515 const AlphaNum &g) { 1516 string result; 1517 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() + 1518 f.size() + g.size()); 1519 char *const begin = &*result.begin(); 1520 char *out = Append4(begin, a, b, c, d); 1521 out = Append2(out, e, f); 1522 out = Append1(out, g); 1523 GOOGLE_DCHECK_EQ(out, begin + result.size()); 1524 return result; 1525 } 1526 1527 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, 1528 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, 1529 const AlphaNum &g, const AlphaNum &h) { 1530 string result; 1531 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() + 1532 f.size() + g.size() + h.size()); 1533 char *const begin = &*result.begin(); 1534 char *out = Append4(begin, a, b, c, d); 1535 out = Append4(out, e, f, g, h); 1536 GOOGLE_DCHECK_EQ(out, begin + result.size()); 1537 return result; 1538 } 1539 1540 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, 1541 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, 1542 const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) { 1543 string result; 1544 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() + 1545 f.size() + g.size() + h.size() + i.size()); 1546 char *const begin = &*result.begin(); 1547 char *out = Append4(begin, a, b, c, d); 1548 out = Append4(out, e, f, g, h); 1549 out = Append1(out, i); 1550 GOOGLE_DCHECK_EQ(out, begin + result.size()); 1551 return result; 1552 } 1553 1554 // It's possible to call StrAppend with a char * pointer that is partway into 1555 // the string we're appending to. However the results of this are random. 1556 // Therefore, check for this in debug mode. Use unsigned math so we only have 1557 // to do one comparison. 1558 #define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \ 1559 GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \ 1560 uintptr_t((dest).size())) 1561 1562 void StrAppend(string *result, const AlphaNum &a) { 1563 GOOGLE_DCHECK_NO_OVERLAP(*result, a); 1564 result->append(a.data(), a.size()); 1565 } 1566 1567 void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) { 1568 GOOGLE_DCHECK_NO_OVERLAP(*result, a); 1569 GOOGLE_DCHECK_NO_OVERLAP(*result, b); 1570 string::size_type old_size = result->size(); 1571 result->resize(old_size + a.size() + b.size()); 1572 char *const begin = &*result->begin(); 1573 char *out = Append2(begin + old_size, a, b); 1574 GOOGLE_DCHECK_EQ(out, begin + result->size()); 1575 } 1576 1577 void StrAppend(string *result, 1578 const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) { 1579 GOOGLE_DCHECK_NO_OVERLAP(*result, a); 1580 GOOGLE_DCHECK_NO_OVERLAP(*result, b); 1581 GOOGLE_DCHECK_NO_OVERLAP(*result, c); 1582 string::size_type old_size = result->size(); 1583 result->resize(old_size + a.size() + b.size() + c.size()); 1584 char *const begin = &*result->begin(); 1585 char *out = Append2(begin + old_size, a, b); 1586 out = Append1(out, c); 1587 GOOGLE_DCHECK_EQ(out, begin + result->size()); 1588 } 1589 1590 void StrAppend(string *result, 1591 const AlphaNum &a, const AlphaNum &b, 1592 const AlphaNum &c, const AlphaNum &d) { 1593 GOOGLE_DCHECK_NO_OVERLAP(*result, a); 1594 GOOGLE_DCHECK_NO_OVERLAP(*result, b); 1595 GOOGLE_DCHECK_NO_OVERLAP(*result, c); 1596 GOOGLE_DCHECK_NO_OVERLAP(*result, d); 1597 string::size_type old_size = result->size(); 1598 result->resize(old_size + a.size() + b.size() + c.size() + d.size()); 1599 char *const begin = &*result->begin(); 1600 char *out = Append4(begin + old_size, a, b, c, d); 1601 GOOGLE_DCHECK_EQ(out, begin + result->size()); 1602 } 1603 1604 int GlobalReplaceSubstring(const string& substring, 1605 const string& replacement, 1606 string* s) { 1607 GOOGLE_CHECK(s != NULL); 1608 if (s->empty() || substring.empty()) 1609 return 0; 1610 string tmp; 1611 int num_replacements = 0; 1612 int pos = 0; 1613 for (int match_pos = s->find(substring.data(), pos, substring.length()); 1614 match_pos != string::npos; 1615 pos = match_pos + substring.length(), 1616 match_pos = s->find(substring.data(), pos, substring.length())) { 1617 ++num_replacements; 1618 // Append the original content before the match. 1619 tmp.append(*s, pos, match_pos - pos); 1620 // Append the replacement for the match. 1621 tmp.append(replacement.begin(), replacement.end()); 1622 } 1623 // Append the content after the last match. If no replacements were made, the 1624 // original string is left untouched. 1625 if (num_replacements > 0) { 1626 tmp.append(*s, pos, s->length() - pos); 1627 s->swap(tmp); 1628 } 1629 return num_replacements; 1630 } 1631 1632 int CalculateBase64EscapedLen(int input_len, bool do_padding) { 1633 // Base64 encodes three bytes of input at a time. If the input is not 1634 // divisible by three, we pad as appropriate. 1635 // 1636 // (from http://tools.ietf.org/html/rfc3548) 1637 // Special processing is performed if fewer than 24 bits are available 1638 // at the end of the data being encoded. A full encoding quantum is 1639 // always completed at the end of a quantity. When fewer than 24 input 1640 // bits are available in an input group, zero bits are added (on the 1641 // right) to form an integral number of 6-bit groups. Padding at the 1642 // end of the data is performed using the '=' character. Since all base 1643 // 64 input is an integral number of octets, only the following cases 1644 // can arise: 1645 1646 1647 // Base64 encodes each three bytes of input into four bytes of output. 1648 int len = (input_len / 3) * 4; 1649 1650 if (input_len % 3 == 0) { 1651 // (from http://tools.ietf.org/html/rfc3548) 1652 // (1) the final quantum of encoding input is an integral multiple of 24 1653 // bits; here, the final unit of encoded output will be an integral 1654 // multiple of 4 characters with no "=" padding, 1655 } else if (input_len % 3 == 1) { 1656 // (from http://tools.ietf.org/html/rfc3548) 1657 // (2) the final quantum of encoding input is exactly 8 bits; here, the 1658 // final unit of encoded output will be two characters followed by two 1659 // "=" padding characters, or 1660 len += 2; 1661 if (do_padding) { 1662 len += 2; 1663 } 1664 } else { // (input_len % 3 == 2) 1665 // (from http://tools.ietf.org/html/rfc3548) 1666 // (3) the final quantum of encoding input is exactly 16 bits; here, the 1667 // final unit of encoded output will be three characters followed by one 1668 // "=" padding character. 1669 len += 3; 1670 if (do_padding) { 1671 len += 1; 1672 } 1673 } 1674 1675 assert(len >= input_len); // make sure we didn't overflow 1676 return len; 1677 } 1678 1679 // Base64Escape does padding, so this calculation includes padding. 1680 int CalculateBase64EscapedLen(int input_len) { 1681 return CalculateBase64EscapedLen(input_len, true); 1682 } 1683 1684 // ---------------------------------------------------------------------- 1685 // int Base64Unescape() - base64 decoder 1686 // int Base64Escape() - base64 encoder 1687 // int WebSafeBase64Unescape() - Google's variation of base64 decoder 1688 // int WebSafeBase64Escape() - Google's variation of base64 encoder 1689 // 1690 // Check out 1691 // http://tools.ietf.org/html/rfc2045 for formal description, but what we 1692 // care about is that... 1693 // Take the encoded stuff in groups of 4 characters and turn each 1694 // character into a code 0 to 63 thus: 1695 // A-Z map to 0 to 25 1696 // a-z map to 26 to 51 1697 // 0-9 map to 52 to 61 1698 // +(- for WebSafe) maps to 62 1699 // /(_ for WebSafe) maps to 63 1700 // There will be four numbers, all less than 64 which can be represented 1701 // by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively). 1702 // Arrange the 6 digit binary numbers into three bytes as such: 1703 // aaaaaabb bbbbcccc ccdddddd 1704 // Equals signs (one or two) are used at the end of the encoded block to 1705 // indicate that the text was not an integer multiple of three bytes long. 1706 // ---------------------------------------------------------------------- 1707 1708 int Base64UnescapeInternal(const char *src_param, int szsrc, 1709 char *dest, int szdest, 1710 const signed char* unbase64) { 1711 static const char kPad64Equals = '='; 1712 static const char kPad64Dot = '.'; 1713 1714 int decode = 0; 1715 int destidx = 0; 1716 int state = 0; 1717 unsigned int ch = 0; 1718 unsigned int temp = 0; 1719 1720 // If "char" is signed by default, using *src as an array index results in 1721 // accessing negative array elements. Treat the input as a pointer to 1722 // unsigned char to avoid this. 1723 const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param); 1724 1725 // The GET_INPUT macro gets the next input character, skipping 1726 // over any whitespace, and stopping when we reach the end of the 1727 // string or when we read any non-data character. The arguments are 1728 // an arbitrary identifier (used as a label for goto) and the number 1729 // of data bytes that must remain in the input to avoid aborting the 1730 // loop. 1731 #define GET_INPUT(label, remain) \ 1732 label: \ 1733 --szsrc; \ 1734 ch = *src++; \ 1735 decode = unbase64[ch]; \ 1736 if (decode < 0) { \ 1737 if (ascii_isspace(ch) && szsrc >= remain) \ 1738 goto label; \ 1739 state = 4 - remain; \ 1740 break; \ 1741 } 1742 1743 // if dest is null, we're just checking to see if it's legal input 1744 // rather than producing output. (I suspect this could just be done 1745 // with a regexp...). We duplicate the loop so this test can be 1746 // outside it instead of in every iteration. 1747 1748 if (dest) { 1749 // This loop consumes 4 input bytes and produces 3 output bytes 1750 // per iteration. We can't know at the start that there is enough 1751 // data left in the string for a full iteration, so the loop may 1752 // break out in the middle; if so 'state' will be set to the 1753 // number of input bytes read. 1754 1755 while (szsrc >= 4) { 1756 // We'll start by optimistically assuming that the next four 1757 // bytes of the string (src[0..3]) are four good data bytes 1758 // (that is, no nulls, whitespace, padding chars, or illegal 1759 // chars). We need to test src[0..2] for nulls individually 1760 // before constructing temp to preserve the property that we 1761 // never read past a null in the string (no matter how long 1762 // szsrc claims the string is). 1763 1764 if (!src[0] || !src[1] || !src[2] || 1765 (temp = ((unsigned(unbase64[src[0]]) << 18) | 1766 (unsigned(unbase64[src[1]]) << 12) | 1767 (unsigned(unbase64[src[2]]) << 6) | 1768 (unsigned(unbase64[src[3]])))) & 0x80000000) { 1769 // Iff any of those four characters was bad (null, illegal, 1770 // whitespace, padding), then temp's high bit will be set 1771 // (because unbase64[] is -1 for all bad characters). 1772 // 1773 // We'll back up and resort to the slower decoder, which knows 1774 // how to handle those cases. 1775 1776 GET_INPUT(first, 4); 1777 temp = decode; 1778 GET_INPUT(second, 3); 1779 temp = (temp << 6) | decode; 1780 GET_INPUT(third, 2); 1781 temp = (temp << 6) | decode; 1782 GET_INPUT(fourth, 1); 1783 temp = (temp << 6) | decode; 1784 } else { 1785 // We really did have four good data bytes, so advance four 1786 // characters in the string. 1787 1788 szsrc -= 4; 1789 src += 4; 1790 decode = -1; 1791 ch = '\0'; 1792 } 1793 1794 // temp has 24 bits of input, so write that out as three bytes. 1795 1796 if (destidx+3 > szdest) return -1; 1797 dest[destidx+2] = temp; 1798 temp >>= 8; 1799 dest[destidx+1] = temp; 1800 temp >>= 8; 1801 dest[destidx] = temp; 1802 destidx += 3; 1803 } 1804 } else { 1805 while (szsrc >= 4) { 1806 if (!src[0] || !src[1] || !src[2] || 1807 (temp = ((unsigned(unbase64[src[0]]) << 18) | 1808 (unsigned(unbase64[src[1]]) << 12) | 1809 (unsigned(unbase64[src[2]]) << 6) | 1810 (unsigned(unbase64[src[3]])))) & 0x80000000) { 1811 GET_INPUT(first_no_dest, 4); 1812 GET_INPUT(second_no_dest, 3); 1813 GET_INPUT(third_no_dest, 2); 1814 GET_INPUT(fourth_no_dest, 1); 1815 } else { 1816 szsrc -= 4; 1817 src += 4; 1818 decode = -1; 1819 ch = '\0'; 1820 } 1821 destidx += 3; 1822 } 1823 } 1824 1825 #undef GET_INPUT 1826 1827 // if the loop terminated because we read a bad character, return 1828 // now. 1829 if (decode < 0 && ch != '\0' && 1830 ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch)) 1831 return -1; 1832 1833 if (ch == kPad64Equals || ch == kPad64Dot) { 1834 // if we stopped by hitting an '=' or '.', un-read that character -- we'll 1835 // look at it again when we count to check for the proper number of 1836 // equals signs at the end. 1837 ++szsrc; 1838 --src; 1839 } else { 1840 // This loop consumes 1 input byte per iteration. It's used to 1841 // clean up the 0-3 input bytes remaining when the first, faster 1842 // loop finishes. 'temp' contains the data from 'state' input 1843 // characters read by the first loop. 1844 while (szsrc > 0) { 1845 --szsrc; 1846 ch = *src++; 1847 decode = unbase64[ch]; 1848 if (decode < 0) { 1849 if (ascii_isspace(ch)) { 1850 continue; 1851 } else if (ch == '\0') { 1852 break; 1853 } else if (ch == kPad64Equals || ch == kPad64Dot) { 1854 // back up one character; we'll read it again when we check 1855 // for the correct number of pad characters at the end. 1856 ++szsrc; 1857 --src; 1858 break; 1859 } else { 1860 return -1; 1861 } 1862 } 1863 1864 // Each input character gives us six bits of output. 1865 temp = (temp << 6) | decode; 1866 ++state; 1867 if (state == 4) { 1868 // If we've accumulated 24 bits of output, write that out as 1869 // three bytes. 1870 if (dest) { 1871 if (destidx+3 > szdest) return -1; 1872 dest[destidx+2] = temp; 1873 temp >>= 8; 1874 dest[destidx+1] = temp; 1875 temp >>= 8; 1876 dest[destidx] = temp; 1877 } 1878 destidx += 3; 1879 state = 0; 1880 temp = 0; 1881 } 1882 } 1883 } 1884 1885 // Process the leftover data contained in 'temp' at the end of the input. 1886 int expected_equals = 0; 1887 switch (state) { 1888 case 0: 1889 // Nothing left over; output is a multiple of 3 bytes. 1890 break; 1891 1892 case 1: 1893 // Bad input; we have 6 bits left over. 1894 return -1; 1895 1896 case 2: 1897 // Produce one more output byte from the 12 input bits we have left. 1898 if (dest) { 1899 if (destidx+1 > szdest) return -1; 1900 temp >>= 4; 1901 dest[destidx] = temp; 1902 } 1903 ++destidx; 1904 expected_equals = 2; 1905 break; 1906 1907 case 3: 1908 // Produce two more output bytes from the 18 input bits we have left. 1909 if (dest) { 1910 if (destidx+2 > szdest) return -1; 1911 temp >>= 2; 1912 dest[destidx+1] = temp; 1913 temp >>= 8; 1914 dest[destidx] = temp; 1915 } 1916 destidx += 2; 1917 expected_equals = 1; 1918 break; 1919 1920 default: 1921 // state should have no other values at this point. 1922 GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state; 1923 } 1924 1925 // The remainder of the string should be all whitespace, mixed with 1926 // exactly 0 equals signs, or exactly 'expected_equals' equals 1927 // signs. (Always accepting 0 equals signs is a google extension 1928 // not covered in the RFC, as is accepting dot as the pad character.) 1929 1930 int equals = 0; 1931 while (szsrc > 0 && *src) { 1932 if (*src == kPad64Equals || *src == kPad64Dot) 1933 ++equals; 1934 else if (!ascii_isspace(*src)) 1935 return -1; 1936 --szsrc; 1937 ++src; 1938 } 1939 1940 return (equals == 0 || equals == expected_equals) ? destidx : -1; 1941 } 1942 1943 // The arrays below were generated by the following code 1944 // #include <sys/time.h> 1945 // #include <stdlib.h> 1946 // #include <string.h> 1947 // main() 1948 // { 1949 // static const char Base64[] = 1950 // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 1951 // char *pos; 1952 // int idx, i, j; 1953 // printf(" "); 1954 // for (i = 0; i < 255; i += 8) { 1955 // for (j = i; j < i + 8; j++) { 1956 // pos = strchr(Base64, j); 1957 // if ((pos == NULL) || (j == 0)) 1958 // idx = -1; 1959 // else 1960 // idx = pos - Base64; 1961 // if (idx == -1) 1962 // printf(" %2d, ", idx); 1963 // else 1964 // printf(" %2d/*%c*/,", idx, j); 1965 // } 1966 // printf("\n "); 1967 // } 1968 // } 1969 // 1970 // where the value of "Base64[]" was replaced by one of the base-64 conversion 1971 // tables from the functions below. 1972 static const signed char kUnBase64[] = { 1973 -1, -1, -1, -1, -1, -1, -1, -1, 1974 -1, -1, -1, -1, -1, -1, -1, -1, 1975 -1, -1, -1, -1, -1, -1, -1, -1, 1976 -1, -1, -1, -1, -1, -1, -1, -1, 1977 -1, -1, -1, -1, -1, -1, -1, -1, 1978 -1, -1, -1, 62/*+*/, -1, -1, -1, 63/*/ */, 1979 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/, 1980 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1, 1981 -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/, 1982 07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/, 1983 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/, 1984 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, -1, 1985 -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/, 1986 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/, 1987 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/, 1988 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1, 1989 -1, -1, -1, -1, -1, -1, -1, -1, 1990 -1, -1, -1, -1, -1, -1, -1, -1, 1991 -1, -1, -1, -1, -1, -1, -1, -1, 1992 -1, -1, -1, -1, -1, -1, -1, -1, 1993 -1, -1, -1, -1, -1, -1, -1, -1, 1994 -1, -1, -1, -1, -1, -1, -1, -1, 1995 -1, -1, -1, -1, -1, -1, -1, -1, 1996 -1, -1, -1, -1, -1, -1, -1, -1, 1997 -1, -1, -1, -1, -1, -1, -1, -1, 1998 -1, -1, -1, -1, -1, -1, -1, -1, 1999 -1, -1, -1, -1, -1, -1, -1, -1, 2000 -1, -1, -1, -1, -1, -1, -1, -1, 2001 -1, -1, -1, -1, -1, -1, -1, -1, 2002 -1, -1, -1, -1, -1, -1, -1, -1, 2003 -1, -1, -1, -1, -1, -1, -1, -1, 2004 -1, -1, -1, -1, -1, -1, -1, -1 2005 }; 2006 static const signed char kUnWebSafeBase64[] = { 2007 -1, -1, -1, -1, -1, -1, -1, -1, 2008 -1, -1, -1, -1, -1, -1, -1, -1, 2009 -1, -1, -1, -1, -1, -1, -1, -1, 2010 -1, -1, -1, -1, -1, -1, -1, -1, 2011 -1, -1, -1, -1, -1, -1, -1, -1, 2012 -1, -1, -1, -1, -1, 62/*-*/, -1, -1, 2013 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/, 2014 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1, 2015 -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/, 2016 07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/, 2017 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/, 2018 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, 63/*_*/, 2019 -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/, 2020 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/, 2021 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/, 2022 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1, 2023 -1, -1, -1, -1, -1, -1, -1, -1, 2024 -1, -1, -1, -1, -1, -1, -1, -1, 2025 -1, -1, -1, -1, -1, -1, -1, -1, 2026 -1, -1, -1, -1, -1, -1, -1, -1, 2027 -1, -1, -1, -1, -1, -1, -1, -1, 2028 -1, -1, -1, -1, -1, -1, -1, -1, 2029 -1, -1, -1, -1, -1, -1, -1, -1, 2030 -1, -1, -1, -1, -1, -1, -1, -1, 2031 -1, -1, -1, -1, -1, -1, -1, -1, 2032 -1, -1, -1, -1, -1, -1, -1, -1, 2033 -1, -1, -1, -1, -1, -1, -1, -1, 2034 -1, -1, -1, -1, -1, -1, -1, -1, 2035 -1, -1, -1, -1, -1, -1, -1, -1, 2036 -1, -1, -1, -1, -1, -1, -1, -1, 2037 -1, -1, -1, -1, -1, -1, -1, -1, 2038 -1, -1, -1, -1, -1, -1, -1, -1 2039 }; 2040 2041 int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) { 2042 return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64); 2043 } 2044 2045 static bool Base64UnescapeInternal(const char* src, int slen, string* dest, 2046 const signed char* unbase64) { 2047 // Determine the size of the output string. Base64 encodes every 3 bytes into 2048 // 4 characters. any leftover chars are added directly for good measure. 2049 // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548 2050 const int dest_len = 3 * (slen / 4) + (slen % 4); 2051 2052 dest->resize(dest_len); 2053 2054 // We are getting the destination buffer by getting the beginning of the 2055 // string and converting it into a char *. 2056 const int len = Base64UnescapeInternal(src, slen, string_as_array(dest), 2057 dest_len, unbase64); 2058 if (len < 0) { 2059 dest->clear(); 2060 return false; 2061 } 2062 2063 // could be shorter if there was padding 2064 GOOGLE_DCHECK_LE(len, dest_len); 2065 dest->erase(len); 2066 2067 return true; 2068 } 2069 2070 bool Base64Unescape(StringPiece src, string* dest) { 2071 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64); 2072 } 2073 2074 bool WebSafeBase64Unescape(StringPiece src, string* dest) { 2075 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64); 2076 } 2077 2078 int Base64EscapeInternal(const unsigned char *src, int szsrc, 2079 char *dest, int szdest, const char *base64, 2080 bool do_padding) { 2081 static const char kPad64 = '='; 2082 2083 if (szsrc <= 0) return 0; 2084 2085 if (szsrc * 4 > szdest * 3) return 0; 2086 2087 char *cur_dest = dest; 2088 const unsigned char *cur_src = src; 2089 2090 char *limit_dest = dest + szdest; 2091 const unsigned char *limit_src = src + szsrc; 2092 2093 // Three bytes of data encodes to four characters of cyphertext. 2094 // So we can pump through three-byte chunks atomically. 2095 while (cur_src < limit_src - 3) { // keep going as long as we have >= 32 bits 2096 uint32 in = BigEndian::Load32(cur_src) >> 8; 2097 2098 cur_dest[0] = base64[in >> 18]; 2099 in &= 0x3FFFF; 2100 cur_dest[1] = base64[in >> 12]; 2101 in &= 0xFFF; 2102 cur_dest[2] = base64[in >> 6]; 2103 in &= 0x3F; 2104 cur_dest[3] = base64[in]; 2105 2106 cur_dest += 4; 2107 cur_src += 3; 2108 } 2109 // To save time, we didn't update szdest or szsrc in the loop. So do it now. 2110 szdest = limit_dest - cur_dest; 2111 szsrc = limit_src - cur_src; 2112 2113 /* now deal with the tail (<=3 bytes) */ 2114 switch (szsrc) { 2115 case 0: 2116 // Nothing left; nothing more to do. 2117 break; 2118 case 1: { 2119 // One byte left: this encodes to two characters, and (optionally) 2120 // two pad characters to round out the four-character cypherblock. 2121 if ((szdest -= 2) < 0) return 0; 2122 uint32 in = cur_src[0]; 2123 cur_dest[0] = base64[in >> 2]; 2124 in &= 0x3; 2125 cur_dest[1] = base64[in << 4]; 2126 cur_dest += 2; 2127 if (do_padding) { 2128 if ((szdest -= 2) < 0) return 0; 2129 cur_dest[0] = kPad64; 2130 cur_dest[1] = kPad64; 2131 cur_dest += 2; 2132 } 2133 break; 2134 } 2135 case 2: { 2136 // Two bytes left: this encodes to three characters, and (optionally) 2137 // one pad character to round out the four-character cypherblock. 2138 if ((szdest -= 3) < 0) return 0; 2139 uint32 in = BigEndian::Load16(cur_src); 2140 cur_dest[0] = base64[in >> 10]; 2141 in &= 0x3FF; 2142 cur_dest[1] = base64[in >> 4]; 2143 in &= 0x00F; 2144 cur_dest[2] = base64[in << 2]; 2145 cur_dest += 3; 2146 if (do_padding) { 2147 if ((szdest -= 1) < 0) return 0; 2148 cur_dest[0] = kPad64; 2149 cur_dest += 1; 2150 } 2151 break; 2152 } 2153 case 3: { 2154 // Three bytes left: same as in the big loop above. We can't do this in 2155 // the loop because the loop above always reads 4 bytes, and the fourth 2156 // byte is past the end of the input. 2157 if ((szdest -= 4) < 0) return 0; 2158 uint32 in = (cur_src[0] << 16) + BigEndian::Load16(cur_src + 1); 2159 cur_dest[0] = base64[in >> 18]; 2160 in &= 0x3FFFF; 2161 cur_dest[1] = base64[in >> 12]; 2162 in &= 0xFFF; 2163 cur_dest[2] = base64[in >> 6]; 2164 in &= 0x3F; 2165 cur_dest[3] = base64[in]; 2166 cur_dest += 4; 2167 break; 2168 } 2169 default: 2170 // Should not be reached: blocks of 4 bytes are handled 2171 // in the while loop before this switch statement. 2172 GOOGLE_LOG(FATAL) << "Logic problem? szsrc = " << szsrc; 2173 break; 2174 } 2175 return (cur_dest - dest); 2176 } 2177 2178 static const char kBase64Chars[] = 2179 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 2180 2181 static const char kWebSafeBase64Chars[] = 2182 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; 2183 2184 int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) { 2185 return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true); 2186 } 2187 int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest, 2188 int szdest, bool do_padding) { 2189 return Base64EscapeInternal(src, szsrc, dest, szdest, 2190 kWebSafeBase64Chars, do_padding); 2191 } 2192 2193 void Base64EscapeInternal(const unsigned char* src, int szsrc, 2194 string* dest, bool do_padding, 2195 const char* base64_chars) { 2196 const int calc_escaped_size = 2197 CalculateBase64EscapedLen(szsrc, do_padding); 2198 dest->resize(calc_escaped_size); 2199 const int escaped_len = Base64EscapeInternal(src, szsrc, 2200 string_as_array(dest), 2201 dest->size(), 2202 base64_chars, 2203 do_padding); 2204 GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len); 2205 dest->erase(escaped_len); 2206 } 2207 2208 void Base64Escape(const unsigned char *src, int szsrc, 2209 string* dest, bool do_padding) { 2210 Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars); 2211 } 2212 2213 void WebSafeBase64Escape(const unsigned char *src, int szsrc, 2214 string *dest, bool do_padding) { 2215 Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars); 2216 } 2217 2218 void Base64Escape(StringPiece src, string* dest) { 2219 Base64Escape(reinterpret_cast<const unsigned char*>(src.data()), 2220 src.size(), dest, true); 2221 } 2222 2223 void WebSafeBase64Escape(StringPiece src, string* dest) { 2224 WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()), 2225 src.size(), dest, false); 2226 } 2227 2228 void WebSafeBase64EscapeWithPadding(StringPiece src, string* dest) { 2229 WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()), 2230 src.size(), dest, true); 2231 } 2232 2233 // Helper to append a Unicode code point to a string as UTF8, without bringing 2234 // in any external dependencies. 2235 int EncodeAsUTF8Char(uint32 code_point, char* output) { 2236 uint32 tmp = 0; 2237 int len = 0; 2238 if (code_point <= 0x7f) { 2239 tmp = code_point; 2240 len = 1; 2241 } else if (code_point <= 0x07ff) { 2242 tmp = 0x0000c080 | 2243 ((code_point & 0x07c0) << 2) | 2244 (code_point & 0x003f); 2245 len = 2; 2246 } else if (code_point <= 0xffff) { 2247 tmp = 0x00e08080 | 2248 ((code_point & 0xf000) << 4) | 2249 ((code_point & 0x0fc0) << 2) | 2250 (code_point & 0x003f); 2251 len = 3; 2252 } else { 2253 // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is 2254 // normally only defined up to there as well. 2255 tmp = 0xf0808080 | 2256 ((code_point & 0x1c0000) << 6) | 2257 ((code_point & 0x03f000) << 4) | 2258 ((code_point & 0x000fc0) << 2) | 2259 (code_point & 0x003f); 2260 len = 4; 2261 } 2262 tmp = ghtonl(tmp); 2263 memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len); 2264 return len; 2265 } 2266 2267 // Table of UTF-8 character lengths, based on first byte 2268 static const unsigned char kUTF8LenTbl[256] = { 2269 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 2270 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 2271 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 2272 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 2273 2274 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 2275 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 2276 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2277 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4 2278 }; 2279 2280 // Return length of a single UTF-8 source character 2281 int UTF8FirstLetterNumBytes(const char* src, int len) { 2282 if (len == 0) { 2283 return 0; 2284 } 2285 return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)]; 2286 } 2287 2288 } // namespace protobuf 2289 } // namespace google 2290