1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/core/lib/strings/str_util.h" 17 18 #include <ctype.h> 19 #include <vector> 20 #include "tensorflow/core/lib/strings/numbers.h" 21 #include "tensorflow/core/lib/strings/stringprintf.h" 22 23 namespace tensorflow { 24 namespace str_util { 25 26 static char hex_char[] = "0123456789abcdef"; 27 28 string CEscape(StringPiece src) { 29 string dest; 30 31 for (unsigned char c : src) { 32 switch (c) { 33 case '\n': 34 dest.append("\\n"); 35 break; 36 case '\r': 37 dest.append("\\r"); 38 break; 39 case '\t': 40 dest.append("\\t"); 41 break; 42 case '\"': 43 dest.append("\\\""); 44 break; 45 case '\'': 46 dest.append("\\'"); 47 break; 48 case '\\': 49 dest.append("\\\\"); 50 break; 51 default: 52 // Note that if we emit \xNN and the src character after that is a hex 53 // digit then that digit must be escaped too to prevent it being 54 // interpreted as part of the character code by C. 55 if ((c >= 0x80) || !isprint(c)) { 56 dest.append("\\"); 57 dest.push_back(hex_char[c / 64]); 58 dest.push_back(hex_char[(c % 64) / 8]); 59 dest.push_back(hex_char[c % 8]); 60 } else { 61 dest.push_back(c); 62 break; 63 } 64 } 65 } 66 67 return dest; 68 } 69 70 namespace { // Private helpers for CUnescape(). 71 72 inline bool is_octal_digit(unsigned char c) { return c >= '0' && c <= '7'; } 73 74 inline bool ascii_isxdigit(unsigned char c) { 75 return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || 76 (c >= 'A' && c <= 'F'); 77 } 78 79 inline int hex_digit_to_int(char c) { 80 int x = static_cast<unsigned char>(c); 81 if (x > '9') { 82 x += 9; 83 } 84 return x & 0xf; 85 } 86 87 bool CUnescapeInternal(StringPiece source, string* dest, 88 string::size_type* dest_len, string* error) { 89 const char* p = source.data(); 90 const char* end = source.end(); 91 const char* last_byte = end - 1; 92 93 // We are going to write the result to dest with its iterator. If our string 94 // implementation uses copy-on-write, this will trigger a copy-on-write of 95 // dest's buffer; that is, dest will be assigned a new buffer. 96 // 97 // Note that the following way is NOT a legal way to modify a string's 98 // content: 99 // 100 // char* d = const_cast<char*>(dest->data()); 101 // 102 // This won't trigger copy-on-write of the string, and so is dangerous when 103 // the buffer is shared. 104 auto d = dest->begin(); 105 106 // Small optimization for case where source = dest and there's no escaping 107 if (source.data() == dest->data()) { 108 while (p < end && *p != '\\') { 109 p++; 110 d++; 111 } 112 } 113 114 while (p < end) { 115 if (*p != '\\') { 116 *d++ = *p++; 117 } else { 118 if (++p > last_byte) { // skip past the '\\' 119 if (error) *error = "String cannot end with \\"; 120 return false; 121 } 122 switch (*p) { 123 case 'a': 124 *d++ = '\a'; 125 break; 126 case 'b': 127 *d++ = '\b'; 128 break; 129 case 'f': 130 *d++ = '\f'; 131 break; 132 case 'n': 133 *d++ = '\n'; 134 break; 135 case 'r': 136 *d++ = '\r'; 137 break; 138 case 't': 139 *d++ = '\t'; 140 break; 141 case 'v': 142 *d++ = '\v'; 143 break; 144 case '\\': 145 *d++ = '\\'; 146 break; 147 case '?': 148 *d++ = '\?'; 149 break; // \? Who knew? 150 case '\'': 151 *d++ = '\''; 152 break; 153 case '"': 154 *d++ = '\"'; 155 break; 156 case '0': 157 case '1': 158 case '2': 159 case '3': // octal digit: 1 to 3 digits 160 case '4': 161 case '5': 162 case '6': 163 case '7': { 164 const char* octal_start = p; 165 unsigned int ch = *p - '0'; 166 if (p < last_byte && is_octal_digit(p[1])) ch = ch * 8 + *++p - '0'; 167 if (p < last_byte && is_octal_digit(p[1])) 168 ch = ch * 8 + *++p - '0'; // now points at last digit 169 if (ch > 0xff) { 170 if (error) { 171 *error = "Value of \\" + 172 string(octal_start, p + 1 - octal_start) + 173 " exceeds 0xff"; 174 } 175 return false; 176 } 177 *d++ = ch; 178 break; 179 } 180 case 'x': 181 case 'X': { 182 if (p >= last_byte) { 183 if (error) *error = "String cannot end with \\x"; 184 return false; 185 } else if (!ascii_isxdigit(p[1])) { 186 if (error) *error = "\\x cannot be followed by a non-hex digit"; 187 return false; 188 } 189 unsigned int ch = 0; 190 const char* hex_start = p; 191 while (p < last_byte && ascii_isxdigit(p[1])) 192 // Arbitrarily many hex digits 193 ch = (ch << 4) + hex_digit_to_int(*++p); 194 if (ch > 0xFF) { 195 if (error) { 196 *error = "Value of \\" + string(hex_start, p + 1 - hex_start) + 197 " exceeds 0xff"; 198 } 199 return false; 200 } 201 *d++ = ch; 202 break; 203 } 204 default: { 205 if (error) *error = string("Unknown escape sequence: \\") + *p; 206 return false; 207 } 208 } 209 p++; // read past letter we escaped 210 } 211 } 212 *dest_len = d - dest->begin(); 213 return true; 214 } 215 216 template <typename T> 217 bool SplitAndParseAsInts(StringPiece text, char delim, 218 std::function<bool(StringPiece, T*)> converter, 219 std::vector<T>* result) { 220 result->clear(); 221 std::vector<string> num_strings = Split(text, delim); 222 for (const auto& s : num_strings) { 223 T num; 224 if (!converter(s, &num)) return false; 225 result->push_back(num); 226 } 227 return true; 228 } 229 230 } // namespace 231 232 bool CUnescape(StringPiece source, string* dest, string* error) { 233 dest->resize(source.size()); 234 string::size_type dest_size; 235 if (!CUnescapeInternal(source, dest, &dest_size, error)) { 236 return false; 237 } 238 dest->erase(dest_size); 239 return true; 240 } 241 242 void StripTrailingWhitespace(string* s) { 243 string::size_type i; 244 for (i = s->size(); i > 0 && isspace((*s)[i - 1]); --i) { 245 } 246 s->resize(i); 247 } 248 249 // Return lower-cased version of s. 250 string Lowercase(StringPiece s) { 251 string result(s.data(), s.size()); 252 for (char& c : result) { 253 c = tolower(c); 254 } 255 return result; 256 } 257 258 // Return upper-cased version of s. 259 string Uppercase(StringPiece s) { 260 string result(s.data(), s.size()); 261 for (char& c : result) { 262 c = toupper(c); 263 } 264 return result; 265 } 266 267 string ArgDefCase(StringPiece s) { 268 const size_t n = s.size(); 269 270 // Compute the size of resulting string. 271 // Number of extra underscores we will need to add. 272 size_t extra_us = 0; 273 // Number of non-alpha chars in the beginning to skip. 274 size_t to_skip = 0; 275 for (size_t i = 0; i < n; ++i) { 276 // If we are skipping and current letter is non-alpha, skip it as well 277 if (i == to_skip && !isalpha(s[i])) { 278 ++to_skip; 279 continue; 280 } 281 282 // If we are here, we are not skipping any more. 283 // If this letter is upper case, not the very first char in the 284 // resulting string, and previous letter isn't replaced with an underscore, 285 // we will need to insert an underscore. 286 if (isupper(s[i]) && i != to_skip && i > 0 && isalnum(s[i - 1])) { 287 ++extra_us; 288 } 289 } 290 291 // Initialize result with all '_'s. There is no string 292 // constructor that does not initialize memory. 293 string result(n + extra_us - to_skip, '_'); 294 // i - index into s 295 // j - index into result 296 for (size_t i = to_skip, j = 0; i < n; ++i, ++j) { 297 DCHECK_LT(j, result.size()); 298 char c = s[i]; 299 // If c is not alphanumeric, we don't need to do anything 300 // since there is already an underscore in its place. 301 if (isalnum(c)) { 302 if (isupper(c)) { 303 // If current char is upper case, we might need to insert an 304 // underscore. 305 if (i != to_skip) { 306 DCHECK_GT(j, 0); 307 if (result[j - 1] != '_') ++j; 308 } 309 result[j] = tolower(c); 310 } else { 311 result[j] = c; 312 } 313 } 314 } 315 316 return result; 317 } 318 319 void TitlecaseString(string* s, StringPiece delimiters) { 320 bool upper = true; 321 for (string::iterator ss = s->begin(); ss != s->end(); ++ss) { 322 if (upper) { 323 *ss = toupper(*ss); 324 } 325 upper = (delimiters.find(*ss) != StringPiece::npos); 326 } 327 } 328 329 string StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub, 330 bool replace_all) { 331 // TODO(jlebar): We could avoid having to shift data around in the string if 332 // we had a StringPiece::find() overload that searched for a StringPiece. 333 string res = s.ToString(); 334 size_t pos = 0; 335 while ((pos = res.find(oldsub.data(), pos, oldsub.size())) != string::npos) { 336 res.replace(pos, oldsub.size(), newsub.data(), newsub.size()); 337 pos += newsub.size(); 338 if (oldsub.empty()) { 339 pos++; // Match at the beginning of the text and after every byte 340 } 341 if (!replace_all) { 342 break; 343 } 344 } 345 return res; 346 } 347 348 size_t RemoveLeadingWhitespace(StringPiece* text) { 349 size_t count = 0; 350 const char* ptr = text->data(); 351 while (count < text->size() && isspace(*ptr)) { 352 count++; 353 ptr++; 354 } 355 text->remove_prefix(count); 356 return count; 357 } 358 359 size_t RemoveTrailingWhitespace(StringPiece* text) { 360 size_t count = 0; 361 const char* ptr = text->data() + text->size() - 1; 362 while (count < text->size() && isspace(*ptr)) { 363 ++count; 364 --ptr; 365 } 366 text->remove_suffix(count); 367 return count; 368 } 369 370 size_t RemoveWhitespaceContext(StringPiece* text) { 371 // use RemoveLeadingWhitespace() and RemoveTrailingWhitespace() to do the job 372 return (RemoveLeadingWhitespace(text) + RemoveTrailingWhitespace(text)); 373 } 374 375 bool ConsumePrefix(StringPiece* s, StringPiece expected) { 376 if (s->starts_with(expected)) { 377 s->remove_prefix(expected.size()); 378 return true; 379 } 380 return false; 381 } 382 383 bool ConsumeSuffix(StringPiece* s, StringPiece expected) { 384 if (s->ends_with(expected)) { 385 s->remove_suffix(expected.size()); 386 return true; 387 } 388 return false; 389 } 390 391 bool ConsumeLeadingDigits(StringPiece* s, uint64* val) { 392 const char* p = s->data(); 393 const char* limit = p + s->size(); 394 uint64 v = 0; 395 while (p < limit) { 396 const char c = *p; 397 if (c < '0' || c > '9') break; 398 uint64 new_v = (v * 10) + (c - '0'); 399 if (new_v / 8 < v) { 400 // Overflow occurred 401 return false; 402 } 403 v = new_v; 404 p++; 405 } 406 if (p > s->data()) { 407 // Consume some digits 408 s->remove_prefix(p - s->data()); 409 *val = v; 410 return true; 411 } else { 412 return false; 413 } 414 } 415 416 bool ConsumeNonWhitespace(StringPiece* s, StringPiece* val) { 417 const char* p = s->data(); 418 const char* limit = p + s->size(); 419 while (p < limit) { 420 const char c = *p; 421 if (isspace(c)) break; 422 p++; 423 } 424 const size_t n = p - s->data(); 425 if (n > 0) { 426 *val = StringPiece(s->data(), n); 427 s->remove_prefix(n); 428 return true; 429 } else { 430 *val = StringPiece(); 431 return false; 432 } 433 } 434 435 bool SplitAndParseAsInts(StringPiece text, char delim, 436 std::vector<int32>* result) { 437 return SplitAndParseAsInts<int32>(text, delim, strings::safe_strto32, result); 438 } 439 440 bool SplitAndParseAsInts(StringPiece text, char delim, 441 std::vector<int64>* result) { 442 return SplitAndParseAsInts<int64>(text, delim, strings::safe_strto64, result); 443 } 444 445 bool SplitAndParseAsFloats(StringPiece text, char delim, 446 std::vector<float>* result) { 447 return SplitAndParseAsInts<float>(text, delim, 448 [](StringPiece str, float* value) { 449 return strings::safe_strtof( 450 str.ToString().c_str(), value); 451 }, 452 result); 453 } 454 455 } // namespace str_util 456 } // namespace tensorflow 457