1 // Protocol Buffers - Google's data interchange format 2 // Copyright 2008 Google Inc. All rights reserved. 3 // http://code.google.com/p/protobuf/ 4 // 5 // Redistribution and use in source and binary forms, with or without 6 // modification, are permitted provided that the following conditions are 7 // met: 8 // 9 // * Redistributions of source code must retain the above copyright 10 // notice, this list of conditions and the following disclaimer. 11 // * Redistributions in binary form must reproduce the above 12 // copyright notice, this list of conditions and the following disclaimer 13 // in the documentation and/or other materials provided with the 14 // distribution. 15 // * Neither the name of Google Inc. nor the names of its 16 // contributors may be used to endorse or promote products derived from 17 // this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 31 package com.google.protobuf; 32 33 import com.google.protobuf.Descriptors.Descriptor; 34 import com.google.protobuf.Descriptors.FieldDescriptor; 35 import com.google.protobuf.Descriptors.EnumDescriptor; 36 import com.google.protobuf.Descriptors.EnumValueDescriptor; 37 38 import java.io.IOException; 39 import java.nio.CharBuffer; 40 import java.math.BigInteger; 41 import java.util.ArrayList; 42 import java.util.List; 43 import java.util.Locale; 44 import java.util.Map; 45 import java.util.regex.Matcher; 46 import java.util.regex.Pattern; 47 48 /** 49 * Provide ascii text parsing and formatting support for proto2 instances. 50 * The implementation largely follows google/protobuf/text_format.cc. 51 * 52 * @author wenboz (at) google.com Wenbo Zhu 53 * @author kenton (at) google.com Kenton Varda 54 */ 55 public final class TextFormat { 56 private TextFormat() { 57 } 58 59 /** 60 * Outputs a textual representation of the Protocol Message supplied into 61 * the parameter output. (This representation is the new version of the 62 * classic "ProtocolPrinter" output from the original Protocol Buffer system) 63 */ 64 public static void print(final Message message, final Appendable output) 65 throws IOException { 66 final TextGenerator generator = new TextGenerator(output); 67 print(message, generator); 68 } 69 70 /** Outputs a textual representation of {@code fields} to {@code output}. */ 71 public static void print(final UnknownFieldSet fields, 72 final Appendable output) 73 throws IOException { 74 final TextGenerator generator = new TextGenerator(output); 75 printUnknownFields(fields, generator); 76 } 77 78 /** 79 * Like {@code print()}, but writes directly to a {@code String} and 80 * returns it. 
81 */ 82 public static String printToString(final Message message) { 83 try { 84 final StringBuilder text = new StringBuilder(); 85 print(message, text); 86 return text.toString(); 87 } catch (IOException e) { 88 throw new RuntimeException( 89 "Writing to a StringBuilder threw an IOException (should never " + 90 "happen).", e); 91 } 92 } 93 94 /** 95 * Like {@code print()}, but writes directly to a {@code String} and 96 * returns it. 97 */ 98 public static String printToString(final UnknownFieldSet fields) { 99 try { 100 final StringBuilder text = new StringBuilder(); 101 print(fields, text); 102 return text.toString(); 103 } catch (IOException e) { 104 throw new RuntimeException( 105 "Writing to a StringBuilder threw an IOException (should never " + 106 "happen).", e); 107 } 108 } 109 110 private static void print(final Message message, 111 final TextGenerator generator) 112 throws IOException { 113 for (final Map.Entry<FieldDescriptor, Object> field : 114 message.getAllFields().entrySet()) { 115 printField(field.getKey(), field.getValue(), generator); 116 } 117 printUnknownFields(message.getUnknownFields(), generator); 118 } 119 120 public static void printField(final FieldDescriptor field, 121 final Object value, 122 final Appendable output) 123 throws IOException { 124 final TextGenerator generator = new TextGenerator(output); 125 printField(field, value, generator); 126 } 127 128 public static String printFieldToString(final FieldDescriptor field, 129 final Object value) { 130 try { 131 final StringBuilder text = new StringBuilder(); 132 printField(field, value, text); 133 return text.toString(); 134 } catch (IOException e) { 135 throw new RuntimeException( 136 "Writing to a StringBuilder threw an IOException (should never " + 137 "happen).", e); 138 } 139 } 140 141 private static void printField(final FieldDescriptor field, 142 final Object value, 143 final TextGenerator generator) 144 throws IOException { 145 if (field.isRepeated()) { 146 // Repeated field. 
Print each element. 147 for (final Object element : (List) value) { 148 printSingleField(field, element, generator); 149 } 150 } else { 151 printSingleField(field, value, generator); 152 } 153 } 154 155 private static void printSingleField(final FieldDescriptor field, 156 final Object value, 157 final TextGenerator generator) 158 throws IOException { 159 if (field.isExtension()) { 160 generator.print("["); 161 // We special-case MessageSet elements for compatibility with proto1. 162 if (field.getContainingType().getOptions().getMessageSetWireFormat() 163 && (field.getType() == FieldDescriptor.Type.MESSAGE) 164 && (field.isOptional()) 165 // object equality 166 && (field.getExtensionScope() == field.getMessageType())) { 167 generator.print(field.getMessageType().getFullName()); 168 } else { 169 generator.print(field.getFullName()); 170 } 171 generator.print("]"); 172 } else { 173 if (field.getType() == FieldDescriptor.Type.GROUP) { 174 // Groups must be serialized with their original capitalization. 175 generator.print(field.getMessageType().getName()); 176 } else { 177 generator.print(field.getName()); 178 } 179 } 180 181 if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) { 182 generator.print(" {\n"); 183 generator.indent(); 184 } else { 185 generator.print(": "); 186 } 187 188 printFieldValue(field, value, generator); 189 190 if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) { 191 generator.outdent(); 192 generator.print("}"); 193 } 194 generator.print("\n"); 195 } 196 197 private static void printFieldValue(final FieldDescriptor field, 198 final Object value, 199 final TextGenerator generator) 200 throws IOException { 201 switch (field.getType()) { 202 case INT32: 203 case INT64: 204 case SINT32: 205 case SINT64: 206 case SFIXED32: 207 case SFIXED64: 208 case FLOAT: 209 case DOUBLE: 210 case BOOL: 211 // Good old toString() does what we want for these types. 
212 generator.print(value.toString()); 213 break; 214 215 case UINT32: 216 case FIXED32: 217 generator.print(unsignedToString((Integer) value)); 218 break; 219 220 case UINT64: 221 case FIXED64: 222 generator.print(unsignedToString((Long) value)); 223 break; 224 225 case STRING: 226 generator.print("\""); 227 generator.print(escapeText((String) value)); 228 generator.print("\""); 229 break; 230 231 case BYTES: 232 generator.print("\""); 233 generator.print(escapeBytes((ByteString) value)); 234 generator.print("\""); 235 break; 236 237 case ENUM: 238 generator.print(((EnumValueDescriptor) value).getName()); 239 break; 240 241 case MESSAGE: 242 case GROUP: 243 print((Message) value, generator); 244 break; 245 } 246 } 247 248 private static void printUnknownFields(final UnknownFieldSet unknownFields, 249 final TextGenerator generator) 250 throws IOException { 251 for (final Map.Entry<Integer, UnknownFieldSet.Field> entry : 252 unknownFields.asMap().entrySet()) { 253 final String prefix = entry.getKey().toString() + ": "; 254 final UnknownFieldSet.Field field = entry.getValue(); 255 256 for (final long value : field.getVarintList()) { 257 generator.print(entry.getKey().toString()); 258 generator.print(": "); 259 generator.print(unsignedToString(value)); 260 generator.print("\n"); 261 } 262 for (final int value : field.getFixed32List()) { 263 generator.print(entry.getKey().toString()); 264 generator.print(": "); 265 generator.print(String.format((Locale) null, "0x%08x", value)); 266 generator.print("\n"); 267 } 268 for (final long value : field.getFixed64List()) { 269 generator.print(entry.getKey().toString()); 270 generator.print(": "); 271 generator.print(String.format((Locale) null, "0x%016x", value)); 272 generator.print("\n"); 273 } 274 for (final ByteString value : field.getLengthDelimitedList()) { 275 generator.print(entry.getKey().toString()); 276 generator.print(": \""); 277 generator.print(escapeBytes(value)); 278 generator.print("\"\n"); 279 } 280 for (final 
UnknownFieldSet value : field.getGroupList()) { 281 generator.print(entry.getKey().toString()); 282 generator.print(" {\n"); 283 generator.indent(); 284 printUnknownFields(value, generator); 285 generator.outdent(); 286 generator.print("}\n"); 287 } 288 } 289 } 290 291 /** Convert an unsigned 32-bit integer to a string. */ 292 private static String unsignedToString(final int value) { 293 if (value >= 0) { 294 return Integer.toString(value); 295 } else { 296 return Long.toString(((long) value) & 0x00000000FFFFFFFFL); 297 } 298 } 299 300 /** Convert an unsigned 64-bit integer to a string. */ 301 private static String unsignedToString(final long value) { 302 if (value >= 0) { 303 return Long.toString(value); 304 } else { 305 // Pull off the most-significant bit so that BigInteger doesn't think 306 // the number is negative, then set it again using setBit(). 307 return BigInteger.valueOf(value & 0x7FFFFFFFFFFFFFFFL) 308 .setBit(63).toString(); 309 } 310 } 311 312 /** 313 * An inner class for writing text to the output stream. 314 */ 315 private static final class TextGenerator { 316 private Appendable output; 317 private boolean atStartOfLine = true; 318 private final StringBuilder indent = new StringBuilder(); 319 320 private TextGenerator(final Appendable output) { 321 this.output = output; 322 } 323 324 /** 325 * Indent text by two spaces. After calling Indent(), two spaces will be 326 * inserted at the beginning of each line of text. Indent() may be called 327 * multiple times to produce deeper indents. 328 */ 329 public void indent() { 330 indent.append(" "); 331 } 332 333 /** 334 * Reduces the current indent level by two spaces, or crashes if the indent 335 * level is zero. 336 */ 337 public void outdent() { 338 final int length = indent.length(); 339 if (length == 0) { 340 throw new IllegalArgumentException( 341 " Outdent() without matching Indent()."); 342 } 343 indent.delete(length - 2, length); 344 } 345 346 /** 347 * Print text to the output stream. 
348 */ 349 public void print(final CharSequence text) throws IOException { 350 final int size = text.length(); 351 int pos = 0; 352 353 for (int i = 0; i < size; i++) { 354 if (text.charAt(i) == '\n') { 355 write(text.subSequence(pos, size), i - pos + 1); 356 pos = i + 1; 357 atStartOfLine = true; 358 } 359 } 360 write(text.subSequence(pos, size), size - pos); 361 } 362 363 private void write(final CharSequence data, final int size) 364 throws IOException { 365 if (size == 0) { 366 return; 367 } 368 if (atStartOfLine) { 369 atStartOfLine = false; 370 output.append(indent); 371 } 372 output.append(data); 373 } 374 } 375 376 // ================================================================= 377 // Parsing 378 379 /** 380 * Represents a stream of tokens parsed from a {@code String}. 381 * 382 * <p>The Java standard library provides many classes that you might think 383 * would be useful for implementing this, but aren't. For example: 384 * 385 * <ul> 386 * <li>{@code java.io.StreamTokenizer}: This almost does what we want -- or, 387 * at least, something that would get us close to what we want -- except 388 * for one fatal flaw: It automatically un-escapes strings using Java 389 * escape sequences, which do not include all the escape sequences we 390 * need to support (e.g. '\x'). 391 * <li>{@code java.util.Scanner}: This seems like a great way at least to 392 * parse regular expressions out of a stream (so we wouldn't have to load 393 * the entire input into a single string before parsing). Sadly, 394 * {@code Scanner} requires that tokens be delimited with some delimiter. 395 * Thus, although the text "foo:" should parse to two tokens ("foo" and 396 * ":"), {@code Scanner} would recognize it only as a single token. 397 * Furthermore, {@code Scanner} provides no way to inspect the contents 398 * of delimiters, making it impossible to keep track of line and column 399 * numbers. 
* </ul>
   *
   * <p>Luckily, Java's regular expression support does manage to be useful to
   * us.  (Barely:  We need {@code Matcher.usePattern()}, which is new in
   * Java 1.5.)  So, we can use that, at least.  Unfortunately, this implies
   * that we need to have the entire input in one contiguous string.
   */
  private static final class Tokenizer {
    // The complete input being tokenized.
    private final CharSequence text;
    // Single matcher over `text`; its pattern is switched between WHITESPACE
    // and TOKEN, and its region start marks the next unread position.
    private final Matcher matcher;
    // The token most recently read; the empty string signals end of input.
    private String currentToken;

    // The character index within this.text at which the current token begins.
    private int pos = 0;

    // The line and column numbers of the current token.
    private int line = 0;
    private int column = 0;

    // The line and column numbers of the previous token (allows throwing
    // errors *after* consuming).
    private int previousLine = 0;
    private int previousColumn = 0;

    // We use possessive quantifiers (*+ and ++) because otherwise the Java
    // regex matcher has stack overflows on large inputs.

    // A run of whitespace and/or '#' comments (each comment runs to the end
    // of its line).
    private static final Pattern WHITESPACE =
      Pattern.compile("(\\s|(#.*$))++", Pattern.MULTILINE);
    // One token: identifier, number, or quoted string.  A string may be left
    // unterminated at end of line; callers detect the missing quote.
    private static final Pattern TOKEN = Pattern.compile(
      "[a-zA-Z_][0-9a-zA-Z_+-]*+|" +                // an identifier
      "[.]?[0-9+-][0-9a-zA-Z_.+-]*+|" +             // a number
      "\"([^\"\n\\\\]|\\\\.)*+(\"|\\\\?$)|" +       // a double-quoted string
      "\'([^\'\n\\\\]|\\\\.)*+(\'|\\\\?$)",         // a single-quoted string
      Pattern.MULTILINE);

    // Case-insensitive spellings of infinity and NaN; the float variants
    // additionally accept a trailing 'f'.
    private static final Pattern DOUBLE_INFINITY = Pattern.compile(
      "-?inf(inity)?",
      Pattern.CASE_INSENSITIVE);
    private static final Pattern FLOAT_INFINITY = Pattern.compile(
      "-?inf(inity)?f?",
      Pattern.CASE_INSENSITIVE);
    private static final Pattern FLOAT_NAN = Pattern.compile(
      "nanf?",
      Pattern.CASE_INSENSITIVE);

    /** Construct a tokenizer that parses tokens from the given text.
*/ 446 private Tokenizer(final CharSequence text) { 447 this.text = text; 448 this.matcher = WHITESPACE.matcher(text); 449 skipWhitespace(); 450 nextToken(); 451 } 452 453 /** Are we at the end of the input? */ 454 public boolean atEnd() { 455 return currentToken.length() == 0; 456 } 457 458 /** Advance to the next token. */ 459 public void nextToken() { 460 previousLine = line; 461 previousColumn = column; 462 463 // Advance the line counter to the current position. 464 while (pos < matcher.regionStart()) { 465 if (text.charAt(pos) == '\n') { 466 ++line; 467 column = 0; 468 } else { 469 ++column; 470 } 471 ++pos; 472 } 473 474 // Match the next token. 475 if (matcher.regionStart() == matcher.regionEnd()) { 476 // EOF 477 currentToken = ""; 478 } else { 479 matcher.usePattern(TOKEN); 480 if (matcher.lookingAt()) { 481 currentToken = matcher.group(); 482 matcher.region(matcher.end(), matcher.regionEnd()); 483 } else { 484 // Take one character. 485 currentToken = String.valueOf(text.charAt(pos)); 486 matcher.region(pos + 1, matcher.regionEnd()); 487 } 488 489 skipWhitespace(); 490 } 491 } 492 493 /** 494 * Skip over any whitespace so that the matcher region starts at the next 495 * token. 496 */ 497 private void skipWhitespace() { 498 matcher.usePattern(WHITESPACE); 499 if (matcher.lookingAt()) { 500 matcher.region(matcher.end(), matcher.regionEnd()); 501 } 502 } 503 504 /** 505 * If the next token exactly matches {@code token}, consume it and return 506 * {@code true}. Otherwise, return {@code false} without doing anything. 507 */ 508 public boolean tryConsume(final String token) { 509 if (currentToken.equals(token)) { 510 nextToken(); 511 return true; 512 } else { 513 return false; 514 } 515 } 516 517 /** 518 * If the next token exactly matches {@code token}, consume it. Otherwise, 519 * throw a {@link ParseException}. 
520 */ 521 public void consume(final String token) throws ParseException { 522 if (!tryConsume(token)) { 523 throw parseException("Expected \"" + token + "\"."); 524 } 525 } 526 527 /** 528 * Returns {@code true} if the next token is an integer, but does 529 * not consume it. 530 */ 531 public boolean lookingAtInteger() { 532 if (currentToken.length() == 0) { 533 return false; 534 } 535 536 final char c = currentToken.charAt(0); 537 return ('0' <= c && c <= '9') || 538 c == '-' || c == '+'; 539 } 540 541 /** 542 * If the next token is an identifier, consume it and return its value. 543 * Otherwise, throw a {@link ParseException}. 544 */ 545 public String consumeIdentifier() throws ParseException { 546 for (int i = 0; i < currentToken.length(); i++) { 547 final char c = currentToken.charAt(i); 548 if (('a' <= c && c <= 'z') || 549 ('A' <= c && c <= 'Z') || 550 ('0' <= c && c <= '9') || 551 (c == '_') || (c == '.')) { 552 // OK 553 } else { 554 throw parseException("Expected identifier."); 555 } 556 } 557 558 final String result = currentToken; 559 nextToken(); 560 return result; 561 } 562 563 /** 564 * If the next token is a 32-bit signed integer, consume it and return its 565 * value. Otherwise, throw a {@link ParseException}. 566 */ 567 public int consumeInt32() throws ParseException { 568 try { 569 final int result = parseInt32(currentToken); 570 nextToken(); 571 return result; 572 } catch (NumberFormatException e) { 573 throw integerParseException(e); 574 } 575 } 576 577 /** 578 * If the next token is a 32-bit unsigned integer, consume it and return its 579 * value. Otherwise, throw a {@link ParseException}. 580 */ 581 public int consumeUInt32() throws ParseException { 582 try { 583 final int result = parseUInt32(currentToken); 584 nextToken(); 585 return result; 586 } catch (NumberFormatException e) { 587 throw integerParseException(e); 588 } 589 } 590 591 /** 592 * If the next token is a 64-bit signed integer, consume it and return its 593 * value. 
Otherwise, throw a {@link ParseException}. 594 */ 595 public long consumeInt64() throws ParseException { 596 try { 597 final long result = parseInt64(currentToken); 598 nextToken(); 599 return result; 600 } catch (NumberFormatException e) { 601 throw integerParseException(e); 602 } 603 } 604 605 /** 606 * If the next token is a 64-bit unsigned integer, consume it and return its 607 * value. Otherwise, throw a {@link ParseException}. 608 */ 609 public long consumeUInt64() throws ParseException { 610 try { 611 final long result = parseUInt64(currentToken); 612 nextToken(); 613 return result; 614 } catch (NumberFormatException e) { 615 throw integerParseException(e); 616 } 617 } 618 619 /** 620 * If the next token is a double, consume it and return its value. 621 * Otherwise, throw a {@link ParseException}. 622 */ 623 public double consumeDouble() throws ParseException { 624 // We need to parse infinity and nan separately because 625 // Double.parseDouble() does not accept "inf", "infinity", or "nan". 626 if (DOUBLE_INFINITY.matcher(currentToken).matches()) { 627 final boolean negative = currentToken.startsWith("-"); 628 nextToken(); 629 return negative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; 630 } 631 if (currentToken.equalsIgnoreCase("nan")) { 632 nextToken(); 633 return Double.NaN; 634 } 635 try { 636 final double result = Double.parseDouble(currentToken); 637 nextToken(); 638 return result; 639 } catch (NumberFormatException e) { 640 throw floatParseException(e); 641 } 642 } 643 644 /** 645 * If the next token is a float, consume it and return its value. 646 * Otherwise, throw a {@link ParseException}. 647 */ 648 public float consumeFloat() throws ParseException { 649 // We need to parse infinity and nan separately because 650 // Float.parseFloat() does not accept "inf", "infinity", or "nan". 651 if (FLOAT_INFINITY.matcher(currentToken).matches()) { 652 final boolean negative = currentToken.startsWith("-"); 653 nextToken(); 654 return negative ? 
Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY; 655 } 656 if (FLOAT_NAN.matcher(currentToken).matches()) { 657 nextToken(); 658 return Float.NaN; 659 } 660 try { 661 final float result = Float.parseFloat(currentToken); 662 nextToken(); 663 return result; 664 } catch (NumberFormatException e) { 665 throw floatParseException(e); 666 } 667 } 668 669 /** 670 * If the next token is a boolean, consume it and return its value. 671 * Otherwise, throw a {@link ParseException}. 672 */ 673 public boolean consumeBoolean() throws ParseException { 674 if (currentToken.equals("true")) { 675 nextToken(); 676 return true; 677 } else if (currentToken.equals("false")) { 678 nextToken(); 679 return false; 680 } else { 681 throw parseException("Expected \"true\" or \"false\"."); 682 } 683 } 684 685 /** 686 * If the next token is a string, consume it and return its (unescaped) 687 * value. Otherwise, throw a {@link ParseException}. 688 */ 689 public String consumeString() throws ParseException { 690 return consumeByteString().toStringUtf8(); 691 } 692 693 /** 694 * If the next token is a string, consume it, unescape it as a 695 * {@link ByteString}, and return it. Otherwise, throw a 696 * {@link ParseException}. 697 */ 698 public ByteString consumeByteString() throws ParseException { 699 List<ByteString> list = new ArrayList<ByteString>(); 700 consumeByteString(list); 701 while (currentToken.startsWith("'") || currentToken.startsWith("\"")) { 702 consumeByteString(list); 703 } 704 return ByteString.copyFrom(list); 705 } 706 707 /** 708 * Like {@link #consumeByteString()} but adds each token of the string to 709 * the given list. String literals (whether bytes or text) may come in 710 * multiple adjacent tokens which are automatically concatenated, like in 711 * C or Python. 712 */ 713 private void consumeByteString(List<ByteString> list) throws ParseException { 714 final char quote = currentToken.length() > 0 ? 
currentToken.charAt(0) 715 : '\0'; 716 if (quote != '\"' && quote != '\'') { 717 throw parseException("Expected string."); 718 } 719 720 if (currentToken.length() < 2 || 721 currentToken.charAt(currentToken.length() - 1) != quote) { 722 throw parseException("String missing ending quote."); 723 } 724 725 try { 726 final String escaped = 727 currentToken.substring(1, currentToken.length() - 1); 728 final ByteString result = unescapeBytes(escaped); 729 nextToken(); 730 list.add(result); 731 } catch (InvalidEscapeSequenceException e) { 732 throw parseException(e.getMessage()); 733 } 734 } 735 736 /** 737 * Returns a {@link ParseException} with the current line and column 738 * numbers in the description, suitable for throwing. 739 */ 740 public ParseException parseException(final String description) { 741 // Note: People generally prefer one-based line and column numbers. 742 return new ParseException( 743 (line + 1) + ":" + (column + 1) + ": " + description); 744 } 745 746 /** 747 * Returns a {@link ParseException} with the line and column numbers of 748 * the previous token in the description, suitable for throwing. 749 */ 750 public ParseException parseExceptionPreviousToken( 751 final String description) { 752 // Note: People generally prefer one-based line and column numbers. 753 return new ParseException( 754 (previousLine + 1) + ":" + (previousColumn + 1) + ": " + description); 755 } 756 757 /** 758 * Constructs an appropriate {@link ParseException} for the given 759 * {@code NumberFormatException} when trying to parse an integer. 760 */ 761 private ParseException integerParseException( 762 final NumberFormatException e) { 763 return parseException("Couldn't parse integer: " + e.getMessage()); 764 } 765 766 /** 767 * Constructs an appropriate {@link ParseException} for the given 768 * {@code NumberFormatException} when trying to parse a float or double. 
769 */ 770 private ParseException floatParseException(final NumberFormatException e) { 771 return parseException("Couldn't parse number: " + e.getMessage()); 772 } 773 } 774 775 /** Thrown when parsing an invalid text format message. */ 776 public static class ParseException extends IOException { 777 private static final long serialVersionUID = 3196188060225107702L; 778 779 public ParseException(final String message) { 780 super(message); 781 } 782 } 783 784 /** 785 * Parse a text-format message from {@code input} and merge the contents 786 * into {@code builder}. 787 */ 788 public static void merge(final Readable input, 789 final Message.Builder builder) 790 throws IOException { 791 merge(input, ExtensionRegistry.getEmptyRegistry(), builder); 792 } 793 794 /** 795 * Parse a text-format message from {@code input} and merge the contents 796 * into {@code builder}. 797 */ 798 public static void merge(final CharSequence input, 799 final Message.Builder builder) 800 throws ParseException { 801 merge(input, ExtensionRegistry.getEmptyRegistry(), builder); 802 } 803 804 /** 805 * Parse a text-format message from {@code input} and merge the contents 806 * into {@code builder}. Extensions will be recognized if they are 807 * registered in {@code extensionRegistry}. 808 */ 809 public static void merge(final Readable input, 810 final ExtensionRegistry extensionRegistry, 811 final Message.Builder builder) 812 throws IOException { 813 // Read the entire input to a String then parse that. 814 815 // If StreamTokenizer were not quite so crippled, or if there were a kind 816 // of Reader that could read in chunks that match some particular regex, 817 // or if we wanted to write a custom Reader to tokenize our stream, then 818 // we would not have to read to one big String. Alas, none of these is 819 // the case. Oh well. 
820 821 merge(toStringBuilder(input), extensionRegistry, builder); 822 } 823 824 private static final int BUFFER_SIZE = 4096; 825 826 // TODO(chrisn): See if working around java.io.Reader#read(CharBuffer) 827 // overhead is worthwhile 828 private static StringBuilder toStringBuilder(final Readable input) 829 throws IOException { 830 final StringBuilder text = new StringBuilder(); 831 final CharBuffer buffer = CharBuffer.allocate(BUFFER_SIZE); 832 while (true) { 833 final int n = input.read(buffer); 834 if (n == -1) { 835 break; 836 } 837 buffer.flip(); 838 text.append(buffer, 0, n); 839 } 840 return text; 841 } 842 843 /** 844 * Parse a text-format message from {@code input} and merge the contents 845 * into {@code builder}. Extensions will be recognized if they are 846 * registered in {@code extensionRegistry}. 847 */ 848 public static void merge(final CharSequence input, 849 final ExtensionRegistry extensionRegistry, 850 final Message.Builder builder) 851 throws ParseException { 852 final Tokenizer tokenizer = new Tokenizer(input); 853 854 while (!tokenizer.atEnd()) { 855 mergeField(tokenizer, extensionRegistry, builder); 856 } 857 } 858 859 /** 860 * Parse a single field from {@code tokenizer} and merge it into 861 * {@code builder}. 862 */ 863 private static void mergeField(final Tokenizer tokenizer, 864 final ExtensionRegistry extensionRegistry, 865 final Message.Builder builder) 866 throws ParseException { 867 FieldDescriptor field; 868 final Descriptor type = builder.getDescriptorForType(); 869 ExtensionRegistry.ExtensionInfo extension = null; 870 871 if (tokenizer.tryConsume("[")) { 872 // An extension. 
873 final StringBuilder name = 874 new StringBuilder(tokenizer.consumeIdentifier()); 875 while (tokenizer.tryConsume(".")) { 876 name.append('.'); 877 name.append(tokenizer.consumeIdentifier()); 878 } 879 880 extension = extensionRegistry.findExtensionByName(name.toString()); 881 882 if (extension == null) { 883 throw tokenizer.parseExceptionPreviousToken( 884 "Extension \"" + name + "\" not found in the ExtensionRegistry."); 885 } else if (extension.descriptor.getContainingType() != type) { 886 throw tokenizer.parseExceptionPreviousToken( 887 "Extension \"" + name + "\" does not extend message type \"" + 888 type.getFullName() + "\"."); 889 } 890 891 tokenizer.consume("]"); 892 893 field = extension.descriptor; 894 } else { 895 final String name = tokenizer.consumeIdentifier(); 896 field = type.findFieldByName(name); 897 898 // Group names are expected to be capitalized as they appear in the 899 // .proto file, which actually matches their type names, not their field 900 // names. 901 if (field == null) { 902 // Explicitly specify US locale so that this code does not break when 903 // executing in Turkey. 904 final String lowerName = name.toLowerCase(Locale.US); 905 field = type.findFieldByName(lowerName); 906 // If the case-insensitive match worked but the field is NOT a group, 907 if (field != null && field.getType() != FieldDescriptor.Type.GROUP) { 908 field = null; 909 } 910 } 911 // Again, special-case group names as described above. 
912 if (field != null && field.getType() == FieldDescriptor.Type.GROUP && 913 !field.getMessageType().getName().equals(name)) { 914 field = null; 915 } 916 917 if (field == null) { 918 throw tokenizer.parseExceptionPreviousToken( 919 "Message type \"" + type.getFullName() + 920 "\" has no field named \"" + name + "\"."); 921 } 922 } 923 924 Object value = null; 925 926 if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) { 927 tokenizer.tryConsume(":"); // optional 928 929 final String endToken; 930 if (tokenizer.tryConsume("<")) { 931 endToken = ">"; 932 } else { 933 tokenizer.consume("{"); 934 endToken = "}"; 935 } 936 937 final Message.Builder subBuilder; 938 if (extension == null) { 939 subBuilder = builder.newBuilderForField(field); 940 } else { 941 subBuilder = extension.defaultInstance.newBuilderForType(); 942 } 943 944 while (!tokenizer.tryConsume(endToken)) { 945 if (tokenizer.atEnd()) { 946 throw tokenizer.parseException( 947 "Expected \"" + endToken + "\"."); 948 } 949 mergeField(tokenizer, extensionRegistry, subBuilder); 950 } 951 952 value = subBuilder.build(); 953 954 } else { 955 tokenizer.consume(":"); 956 957 switch (field.getType()) { 958 case INT32: 959 case SINT32: 960 case SFIXED32: 961 value = tokenizer.consumeInt32(); 962 break; 963 964 case INT64: 965 case SINT64: 966 case SFIXED64: 967 value = tokenizer.consumeInt64(); 968 break; 969 970 case UINT32: 971 case FIXED32: 972 value = tokenizer.consumeUInt32(); 973 break; 974 975 case UINT64: 976 case FIXED64: 977 value = tokenizer.consumeUInt64(); 978 break; 979 980 case FLOAT: 981 value = tokenizer.consumeFloat(); 982 break; 983 984 case DOUBLE: 985 value = tokenizer.consumeDouble(); 986 break; 987 988 case BOOL: 989 value = tokenizer.consumeBoolean(); 990 break; 991 992 case STRING: 993 value = tokenizer.consumeString(); 994 break; 995 996 case BYTES: 997 value = tokenizer.consumeByteString(); 998 break; 999 1000 case ENUM: 1001 final EnumDescriptor enumType = field.getEnumType(); 
1002 1003 if (tokenizer.lookingAtInteger()) { 1004 final int number = tokenizer.consumeInt32(); 1005 value = enumType.findValueByNumber(number); 1006 if (value == null) { 1007 throw tokenizer.parseExceptionPreviousToken( 1008 "Enum type \"" + enumType.getFullName() + 1009 "\" has no value with number " + number + '.'); 1010 } 1011 } else { 1012 final String id = tokenizer.consumeIdentifier(); 1013 value = enumType.findValueByName(id); 1014 if (value == null) { 1015 throw tokenizer.parseExceptionPreviousToken( 1016 "Enum type \"" + enumType.getFullName() + 1017 "\" has no value named \"" + id + "\"."); 1018 } 1019 } 1020 1021 break; 1022 1023 case MESSAGE: 1024 case GROUP: 1025 throw new RuntimeException("Can't get here."); 1026 } 1027 } 1028 1029 if (field.isRepeated()) { 1030 builder.addRepeatedField(field, value); 1031 } else { 1032 builder.setField(field, value); 1033 } 1034 } 1035 1036 // ================================================================= 1037 // Utility functions 1038 // 1039 // Some of these methods are package-private because Descriptors.java uses 1040 // them. 1041 1042 /** 1043 * Escapes bytes in the format used in protocol buffer text format, which 1044 * is the same as the format used for C string literals. All bytes 1045 * that are not printable 7-bit ASCII characters are escaped, as well as 1046 * backslash, single-quote, and double-quote characters. Characters for 1047 * which no defined short-hand escape sequence is defined will be escaped 1048 * using 3-digit octal sequences. 1049 */ 1050 static String escapeBytes(final ByteString input) { 1051 final StringBuilder builder = new StringBuilder(input.size()); 1052 for (int i = 0; i < input.size(); i++) { 1053 final byte b = input.byteAt(i); 1054 switch (b) { 1055 // Java does not recognize \a or \v, apparently. 
1056 case 0x07: builder.append("\\a" ); break; 1057 case '\b': builder.append("\\b" ); break; 1058 case '\f': builder.append("\\f" ); break; 1059 case '\n': builder.append("\\n" ); break; 1060 case '\r': builder.append("\\r" ); break; 1061 case '\t': builder.append("\\t" ); break; 1062 case 0x0b: builder.append("\\v" ); break; 1063 case '\\': builder.append("\\\\"); break; 1064 case '\'': builder.append("\\\'"); break; 1065 case '"' : builder.append("\\\""); break; 1066 default: 1067 if (b >= 0x20) { 1068 builder.append((char) b); 1069 } else { 1070 builder.append('\\'); 1071 builder.append((char) ('0' + ((b >>> 6) & 3))); 1072 builder.append((char) ('0' + ((b >>> 3) & 7))); 1073 builder.append((char) ('0' + (b & 7))); 1074 } 1075 break; 1076 } 1077 } 1078 return builder.toString(); 1079 } 1080 1081 /** 1082 * Un-escape a byte sequence as escaped using 1083 * {@link #escapeBytes(ByteString)}. Two-digit hex escapes (starting with 1084 * "\x") are also recognized. 1085 */ 1086 static ByteString unescapeBytes(final CharSequence input) 1087 throws InvalidEscapeSequenceException { 1088 final byte[] result = new byte[input.length()]; 1089 int pos = 0; 1090 for (int i = 0; i < input.length(); i++) { 1091 char c = input.charAt(i); 1092 if (c == '\\') { 1093 if (i + 1 < input.length()) { 1094 ++i; 1095 c = input.charAt(i); 1096 if (isOctal(c)) { 1097 // Octal escape. 
1098 int code = digitValue(c); 1099 if (i + 1 < input.length() && isOctal(input.charAt(i + 1))) { 1100 ++i; 1101 code = code * 8 + digitValue(input.charAt(i)); 1102 } 1103 if (i + 1 < input.length() && isOctal(input.charAt(i + 1))) { 1104 ++i; 1105 code = code * 8 + digitValue(input.charAt(i)); 1106 } 1107 result[pos++] = (byte)code; 1108 } else { 1109 switch (c) { 1110 case 'a' : result[pos++] = 0x07; break; 1111 case 'b' : result[pos++] = '\b'; break; 1112 case 'f' : result[pos++] = '\f'; break; 1113 case 'n' : result[pos++] = '\n'; break; 1114 case 'r' : result[pos++] = '\r'; break; 1115 case 't' : result[pos++] = '\t'; break; 1116 case 'v' : result[pos++] = 0x0b; break; 1117 case '\\': result[pos++] = '\\'; break; 1118 case '\'': result[pos++] = '\''; break; 1119 case '"' : result[pos++] = '\"'; break; 1120 1121 case 'x': 1122 // hex escape 1123 int code = 0; 1124 if (i + 1 < input.length() && isHex(input.charAt(i + 1))) { 1125 ++i; 1126 code = digitValue(input.charAt(i)); 1127 } else { 1128 throw new InvalidEscapeSequenceException( 1129 "Invalid escape sequence: '\\x' with no digits"); 1130 } 1131 if (i + 1 < input.length() && isHex(input.charAt(i + 1))) { 1132 ++i; 1133 code = code * 16 + digitValue(input.charAt(i)); 1134 } 1135 result[pos++] = (byte)code; 1136 break; 1137 1138 default: 1139 throw new InvalidEscapeSequenceException( 1140 "Invalid escape sequence: '\\" + c + '\''); 1141 } 1142 } 1143 } else { 1144 throw new InvalidEscapeSequenceException( 1145 "Invalid escape sequence: '\\' at end of string."); 1146 } 1147 } else { 1148 result[pos++] = (byte)c; 1149 } 1150 } 1151 1152 return ByteString.copyFrom(result, 0, pos); 1153 } 1154 1155 /** 1156 * Thrown by {@link TextFormat#unescapeBytes} and 1157 * {@link TextFormat#unescapeText} when an invalid escape sequence is seen. 
1158 */ 1159 static class InvalidEscapeSequenceException extends IOException { 1160 private static final long serialVersionUID = -8164033650142593304L; 1161 1162 InvalidEscapeSequenceException(final String description) { 1163 super(description); 1164 } 1165 } 1166 1167 /** 1168 * Like {@link #escapeBytes(ByteString)}, but escapes a text string. 1169 * Non-ASCII characters are first encoded as UTF-8, then each byte is escaped 1170 * individually as a 3-digit octal escape. Yes, it's weird. 1171 */ 1172 static String escapeText(final String input) { 1173 return escapeBytes(ByteString.copyFromUtf8(input)); 1174 } 1175 1176 /** 1177 * Un-escape a text string as escaped using {@link #escapeText(String)}. 1178 * Two-digit hex escapes (starting with "\x") are also recognized. 1179 */ 1180 static String unescapeText(final String input) 1181 throws InvalidEscapeSequenceException { 1182 return unescapeBytes(input).toStringUtf8(); 1183 } 1184 1185 /** Is this an octal digit? */ 1186 private static boolean isOctal(final char c) { 1187 return '0' <= c && c <= '7'; 1188 } 1189 1190 /** Is this a hex digit? */ 1191 private static boolean isHex(final char c) { 1192 return ('0' <= c && c <= '9') || 1193 ('a' <= c && c <= 'f') || 1194 ('A' <= c && c <= 'F'); 1195 } 1196 1197 /** 1198 * Interpret a character as a digit (in any base up to 36) and return the 1199 * numeric value. This is like {@code Character.digit()} but we don't accept 1200 * non-ASCII digits. 1201 */ 1202 private static int digitValue(final char c) { 1203 if ('0' <= c && c <= '9') { 1204 return c - '0'; 1205 } else if ('a' <= c && c <= 'z') { 1206 return c - 'a' + 10; 1207 } else { 1208 return c - 'A' + 10; 1209 } 1210 } 1211 1212 /** 1213 * Parse a 32-bit signed integer from the text. Unlike the Java standard 1214 * {@code Integer.parseInt()}, this function recognizes the prefixes "0x" 1215 * and "0" to signify hexidecimal and octal numbers, respectively. 
1216 */ 1217 static int parseInt32(final String text) throws NumberFormatException { 1218 return (int) parseInteger(text, true, false); 1219 } 1220 1221 /** 1222 * Parse a 32-bit unsigned integer from the text. Unlike the Java standard 1223 * {@code Integer.parseInt()}, this function recognizes the prefixes "0x" 1224 * and "0" to signify hexidecimal and octal numbers, respectively. The 1225 * result is coerced to a (signed) {@code int} when returned since Java has 1226 * no unsigned integer type. 1227 */ 1228 static int parseUInt32(final String text) throws NumberFormatException { 1229 return (int) parseInteger(text, false, false); 1230 } 1231 1232 /** 1233 * Parse a 64-bit signed integer from the text. Unlike the Java standard 1234 * {@code Integer.parseInt()}, this function recognizes the prefixes "0x" 1235 * and "0" to signify hexidecimal and octal numbers, respectively. 1236 */ 1237 static long parseInt64(final String text) throws NumberFormatException { 1238 return parseInteger(text, true, true); 1239 } 1240 1241 /** 1242 * Parse a 64-bit unsigned integer from the text. Unlike the Java standard 1243 * {@code Integer.parseInt()}, this function recognizes the prefixes "0x" 1244 * and "0" to signify hexidecimal and octal numbers, respectively. The 1245 * result is coerced to a (signed) {@code long} when returned since Java has 1246 * no unsigned long type. 
1247 */ 1248 static long parseUInt64(final String text) throws NumberFormatException { 1249 return parseInteger(text, false, true); 1250 } 1251 1252 private static long parseInteger(final String text, 1253 final boolean isSigned, 1254 final boolean isLong) 1255 throws NumberFormatException { 1256 int pos = 0; 1257 1258 boolean negative = false; 1259 if (text.startsWith("-", pos)) { 1260 if (!isSigned) { 1261 throw new NumberFormatException("Number must be positive: " + text); 1262 } 1263 ++pos; 1264 negative = true; 1265 } 1266 1267 int radix = 10; 1268 if (text.startsWith("0x", pos)) { 1269 pos += 2; 1270 radix = 16; 1271 } else if (text.startsWith("0", pos)) { 1272 radix = 8; 1273 } 1274 1275 final String numberText = text.substring(pos); 1276 1277 long result = 0; 1278 if (numberText.length() < 16) { 1279 // Can safely assume no overflow. 1280 result = Long.parseLong(numberText, radix); 1281 if (negative) { 1282 result = -result; 1283 } 1284 1285 // Check bounds. 1286 // No need to check for 64-bit numbers since they'd have to be 16 chars 1287 // or longer to overflow. 1288 if (!isLong) { 1289 if (isSigned) { 1290 if (result > Integer.MAX_VALUE || result < Integer.MIN_VALUE) { 1291 throw new NumberFormatException( 1292 "Number out of range for 32-bit signed integer: " + text); 1293 } 1294 } else { 1295 if (result >= (1L << 32) || result < 0) { 1296 throw new NumberFormatException( 1297 "Number out of range for 32-bit unsigned integer: " + text); 1298 } 1299 } 1300 } 1301 } else { 1302 BigInteger bigValue = new BigInteger(numberText, radix); 1303 if (negative) { 1304 bigValue = bigValue.negate(); 1305 } 1306 1307 // Check bounds. 
1308 if (!isLong) { 1309 if (isSigned) { 1310 if (bigValue.bitLength() > 31) { 1311 throw new NumberFormatException( 1312 "Number out of range for 32-bit signed integer: " + text); 1313 } 1314 } else { 1315 if (bigValue.bitLength() > 32) { 1316 throw new NumberFormatException( 1317 "Number out of range for 32-bit unsigned integer: " + text); 1318 } 1319 } 1320 } else { 1321 if (isSigned) { 1322 if (bigValue.bitLength() > 63) { 1323 throw new NumberFormatException( 1324 "Number out of range for 64-bit signed integer: " + text); 1325 } 1326 } else { 1327 if (bigValue.bitLength() > 64) { 1328 throw new NumberFormatException( 1329 "Number out of range for 64-bit unsigned integer: " + text); 1330 } 1331 } 1332 } 1333 1334 result = bigValue.longValue(); 1335 } 1336 1337 return result; 1338 } 1339 } 1340