1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/core/lib/strings/str_util.h" 17 18 #include <ctype.h> 19 #include <algorithm> 20 #include <cstring> 21 #include <vector> 22 #include "tensorflow/core/lib/strings/numbers.h" 23 #include "tensorflow/core/lib/strings/stringprintf.h" 24 #include "tensorflow/core/platform/logging.h" 25 26 namespace tensorflow { 27 namespace str_util { 28 29 static char hex_char[] = "0123456789abcdef"; 30 31 string CEscape(StringPiece src) { 32 string dest; 33 34 for (unsigned char c : src) { 35 switch (c) { 36 case '\n': 37 dest.append("\\n"); 38 break; 39 case '\r': 40 dest.append("\\r"); 41 break; 42 case '\t': 43 dest.append("\\t"); 44 break; 45 case '\"': 46 dest.append("\\\""); 47 break; 48 case '\'': 49 dest.append("\\'"); 50 break; 51 case '\\': 52 dest.append("\\\\"); 53 break; 54 default: 55 // Note that if we emit \xNN and the src character after that is a hex 56 // digit then that digit must be escaped too to prevent it being 57 // interpreted as part of the character code by C. 58 if ((c >= 0x80) || !isprint(c)) { 59 dest.append("\\"); 60 dest.push_back(hex_char[c / 64]); 61 dest.push_back(hex_char[(c % 64) / 8]); 62 dest.push_back(hex_char[c % 8]); 63 } else { 64 dest.push_back(c); 65 break; 66 } 67 } 68 } 69 70 return dest; 71 } 72 73 namespace { // Private helpers for CUnescape(). 74 75 inline bool is_octal_digit(unsigned char c) { return c >= '0' && c <= '7'; } 76 77 inline bool ascii_isxdigit(unsigned char c) { 78 return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || 79 (c >= 'A' && c <= 'F'); 80 } 81 82 inline int hex_digit_to_int(char c) { 83 int x = static_cast<unsigned char>(c); 84 if (x > '9') { 85 x += 9; 86 } 87 return x & 0xf; 88 } 89 90 bool CUnescapeInternal(StringPiece source, string* dest, 91 string::size_type* dest_len, string* error) { 92 const char* p = source.data(); 93 const char* end = source.end(); 94 const char* last_byte = end - 1; 95 96 // We are going to write the result to dest with its iterator. If our string 97 // implementation uses copy-on-write, this will trigger a copy-on-write of 98 // dest's buffer; that is, dest will be assigned a new buffer. 99 // 100 // Note that the following way is NOT a legal way to modify a string's 101 // content: 102 // 103 // char* d = const_cast<char*>(dest->data()); 104 // 105 // This won't trigger copy-on-write of the string, and so is dangerous when 106 // the buffer is shared. 107 auto d = dest->begin(); 108 109 // Small optimization for case where source = dest and there's no escaping 110 if (source.data() == dest->data()) { 111 while (p < end && *p != '\\') { 112 p++; 113 d++; 114 } 115 } 116 117 while (p < end) { 118 if (*p != '\\') { 119 *d++ = *p++; 120 } else { 121 if (++p > last_byte) { // skip past the '\\' 122 if (error) *error = "String cannot end with \\"; 123 return false; 124 } 125 switch (*p) { 126 case 'a': 127 *d++ = '\a'; 128 break; 129 case 'b': 130 *d++ = '\b'; 131 break; 132 case 'f': 133 *d++ = '\f'; 134 break; 135 case 'n': 136 *d++ = '\n'; 137 break; 138 case 'r': 139 *d++ = '\r'; 140 break; 141 case 't': 142 *d++ = '\t'; 143 break; 144 case 'v': 145 *d++ = '\v'; 146 break; 147 case '\\': 148 *d++ = '\\'; 149 break; 150 case '?': 151 *d++ = '\?'; 152 break; // \? Who knew? 153 case '\'': 154 *d++ = '\''; 155 break; 156 case '"': 157 *d++ = '\"'; 158 break; 159 case '0': 160 case '1': 161 case '2': 162 case '3': // octal digit: 1 to 3 digits 163 case '4': 164 case '5': 165 case '6': 166 case '7': { 167 const char* octal_start = p; 168 unsigned int ch = *p - '0'; 169 if (p < last_byte && is_octal_digit(p[1])) ch = ch * 8 + *++p - '0'; 170 if (p < last_byte && is_octal_digit(p[1])) 171 ch = ch * 8 + *++p - '0'; // now points at last digit 172 if (ch > 0xff) { 173 if (error) { 174 *error = "Value of \\" + 175 string(octal_start, p + 1 - octal_start) + 176 " exceeds 0xff"; 177 } 178 return false; 179 } 180 *d++ = ch; 181 break; 182 } 183 case 'x': 184 case 'X': { 185 if (p >= last_byte) { 186 if (error) *error = "String cannot end with \\x"; 187 return false; 188 } else if (!ascii_isxdigit(p[1])) { 189 if (error) *error = "\\x cannot be followed by a non-hex digit"; 190 return false; 191 } 192 unsigned int ch = 0; 193 const char* hex_start = p; 194 while (p < last_byte && ascii_isxdigit(p[1])) 195 // Arbitrarily many hex digits 196 ch = (ch << 4) + hex_digit_to_int(*++p); 197 if (ch > 0xFF) { 198 if (error) { 199 *error = "Value of \\" + string(hex_start, p + 1 - hex_start) + 200 " exceeds 0xff"; 201 } 202 return false; 203 } 204 *d++ = ch; 205 break; 206 } 207 default: { 208 if (error) *error = string("Unknown escape sequence: \\") + *p; 209 return false; 210 } 211 } 212 p++; // read past letter we escaped 213 } 214 } 215 *dest_len = d - dest->begin(); 216 return true; 217 } 218 219 template <typename T> 220 bool SplitAndParseAsInts(StringPiece text, char delim, 221 std::function<bool(StringPiece, T*)> converter, 222 std::vector<T>* result) { 223 result->clear(); 224 std::vector<string> num_strings = Split(text, delim); 225 for (const auto& s : num_strings) { 226 T num; 227 if (!converter(s, &num)) return false; 228 result->push_back(num); 229 } 230 return true; 231 } 232 233 } // namespace 234 235 bool CUnescape(StringPiece source, string* dest, string* error) { 236 dest->resize(source.size()); 237 string::size_type dest_size; 238 if (!CUnescapeInternal(source, dest, &dest_size, error)) { 239 return false; 240 } 241 dest->erase(dest_size); 242 return true; 243 } 244 245 void StripTrailingWhitespace(string* s) { 246 string::size_type i; 247 for (i = s->size(); i > 0 && isspace((*s)[i - 1]); --i) { 248 } 249 s->resize(i); 250 } 251 252 // Return lower-cased version of s. 253 string Lowercase(StringPiece s) { 254 string result(s.data(), s.size()); 255 for (char& c : result) { 256 c = tolower(c); 257 } 258 return result; 259 } 260 261 // Return upper-cased version of s. 262 string Uppercase(StringPiece s) { 263 string result(s.data(), s.size()); 264 for (char& c : result) { 265 c = toupper(c); 266 } 267 return result; 268 } 269 270 string ArgDefCase(StringPiece s) { 271 const size_t n = s.size(); 272 273 // Compute the size of resulting string. 274 // Number of extra underscores we will need to add. 275 size_t extra_us = 0; 276 // Number of non-alpha chars in the beginning to skip. 277 size_t to_skip = 0; 278 for (size_t i = 0; i < n; ++i) { 279 // If we are skipping and current letter is non-alpha, skip it as well 280 if (i == to_skip && !isalpha(s[i])) { 281 ++to_skip; 282 continue; 283 } 284 285 // If we are here, we are not skipping any more. 286 // If this letter is upper case, not the very first char in the 287 // resulting string, and previous letter isn't replaced with an underscore, 288 // we will need to insert an underscore. 289 if (isupper(s[i]) && i != to_skip && i > 0 && isalnum(s[i - 1])) { 290 ++extra_us; 291 } 292 } 293 294 // Initialize result with all '_'s. There is no string 295 // constructor that does not initialize memory. 296 string result(n + extra_us - to_skip, '_'); 297 // i - index into s 298 // j - index into result 299 for (size_t i = to_skip, j = 0; i < n; ++i, ++j) { 300 DCHECK_LT(j, result.size()); 301 char c = s[i]; 302 // If c is not alphanumeric, we don't need to do anything 303 // since there is already an underscore in its place. 304 if (isalnum(c)) { 305 if (isupper(c)) { 306 // If current char is upper case, we might need to insert an 307 // underscore. 308 if (i != to_skip) { 309 DCHECK_GT(j, 0); 310 if (result[j - 1] != '_') ++j; 311 } 312 result[j] = tolower(c); 313 } else { 314 result[j] = c; 315 } 316 } 317 } 318 319 return result; 320 } 321 322 void TitlecaseString(string* s, StringPiece delimiters) { 323 bool upper = true; 324 for (string::iterator ss = s->begin(); ss != s->end(); ++ss) { 325 if (upper) { 326 *ss = toupper(*ss); 327 } 328 upper = (delimiters.find(*ss) != StringPiece::npos); 329 } 330 } 331 332 string StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub, 333 bool replace_all) { 334 // TODO(jlebar): We could avoid having to shift data around in the string if 335 // we had a StringPiece::find() overload that searched for a StringPiece. 336 string res(s); 337 size_t pos = 0; 338 while ((pos = res.find(oldsub.data(), pos, oldsub.size())) != string::npos) { 339 res.replace(pos, oldsub.size(), newsub.data(), newsub.size()); 340 pos += newsub.size(); 341 if (oldsub.empty()) { 342 pos++; // Match at the beginning of the text and after every byte 343 } 344 if (!replace_all) { 345 break; 346 } 347 } 348 return res; 349 } 350 351 size_t RemoveLeadingWhitespace(StringPiece* text) { 352 size_t count = 0; 353 const char* ptr = text->data(); 354 while (count < text->size() && isspace(*ptr)) { 355 count++; 356 ptr++; 357 } 358 text->remove_prefix(count); 359 return count; 360 } 361 362 size_t RemoveTrailingWhitespace(StringPiece* text) { 363 size_t count = 0; 364 const char* ptr = text->data() + text->size() - 1; 365 while (count < text->size() && isspace(*ptr)) { 366 ++count; 367 --ptr; 368 } 369 text->remove_suffix(count); 370 return count; 371 } 372 373 size_t RemoveWhitespaceContext(StringPiece* text) { 374 // use RemoveLeadingWhitespace() and RemoveTrailingWhitespace() to do the job 375 return (RemoveLeadingWhitespace(text) + RemoveTrailingWhitespace(text)); 376 } 377 378 bool ConsumePrefix(StringPiece* s, StringPiece expected) { 379 if (StartsWith(*s, expected)) { 380 s->remove_prefix(expected.size()); 381 return true; 382 } 383 return false; 384 } 385 386 bool ConsumeSuffix(StringPiece* s, StringPiece expected) { 387 if (EndsWith(*s, expected)) { 388 s->remove_suffix(expected.size()); 389 return true; 390 } 391 return false; 392 } 393 394 bool ConsumeLeadingDigits(StringPiece* s, uint64* val) { 395 const char* p = s->data(); 396 const char* limit = p + s->size(); 397 uint64 v = 0; 398 while (p < limit) { 399 const char c = *p; 400 if (c < '0' || c > '9') break; 401 uint64 new_v = (v * 10) + (c - '0'); 402 if (new_v / 8 < v) { 403 // Overflow occurred 404 return false; 405 } 406 v = new_v; 407 p++; 408 } 409 if (p > s->data()) { 410 // Consume some digits 411 s->remove_prefix(p - s->data()); 412 *val = v; 413 return true; 414 } else { 415 return false; 416 } 417 } 418 419 bool ConsumeNonWhitespace(StringPiece* s, StringPiece* val) { 420 const char* p = s->data(); 421 const char* limit = p + s->size(); 422 while (p < limit) { 423 const char c = *p; 424 if (isspace(c)) break; 425 p++; 426 } 427 const size_t n = p - s->data(); 428 if (n > 0) { 429 *val = StringPiece(s->data(), n); 430 s->remove_prefix(n); 431 return true; 432 } else { 433 *val = StringPiece(); 434 return false; 435 } 436 } 437 438 bool SplitAndParseAsInts(StringPiece text, char delim, 439 std::vector<int32>* result) { 440 return SplitAndParseAsInts<int32>(text, delim, strings::safe_strto32, result); 441 } 442 443 bool SplitAndParseAsInts(StringPiece text, char delim, 444 std::vector<int64>* result) { 445 return SplitAndParseAsInts<int64>(text, delim, strings::safe_strto64, result); 446 } 447 448 bool SplitAndParseAsFloats(StringPiece text, char delim, 449 std::vector<float>* result) { 450 return SplitAndParseAsInts<float>(text, delim, 451 [](StringPiece str, float* value) { 452 return strings::safe_strtof(str, value); 453 }, 454 result); 455 } 456 457 size_t Strnlen(const char* str, const size_t string_max_len) { 458 size_t len = 0; 459 while (len < string_max_len && str[len] != '\0') { 460 ++len; 461 } 462 return len; 463 } 464 465 bool StrContains(StringPiece haystack, StringPiece needle) { 466 return std::search(haystack.begin(), haystack.end(), needle.begin(), 467 needle.end()) != haystack.end(); 468 } 469 470 bool StartsWith(StringPiece text, StringPiece prefix) { 471 return prefix.empty() || 472 (text.size() >= prefix.size() && 473 memcmp(text.data(), prefix.data(), prefix.size()) == 0); 474 } 475 476 bool EndsWith(StringPiece text, StringPiece suffix) { 477 return suffix.empty() || (text.size() >= suffix.size() && 478 memcmp(text.data() + (text.size() - suffix.size()), 479 suffix.data(), suffix.size()) == 0); 480 } 481 482 } // namespace str_util 483 } // namespace tensorflow 484