Home | History | Annotate | Download | only in protobuf
      1 // Protocol Buffers - Google's data interchange format
      2 // Copyright 2008 Google Inc.  All rights reserved.
      3 // http://code.google.com/p/protobuf/
      4 //
      5 // Redistribution and use in source and binary forms, with or without
      6 // modification, are permitted provided that the following conditions are
      7 // met:
      8 //
      9 //     * Redistributions of source code must retain the above copyright
     10 // notice, this list of conditions and the following disclaimer.
     11 //     * Redistributions in binary form must reproduce the above
     12 // copyright notice, this list of conditions and the following disclaimer
     13 // in the documentation and/or other materials provided with the
     14 // distribution.
     15 //     * Neither the name of Google Inc. nor the names of its
     16 // contributors may be used to endorse or promote products derived from
     17 // this software without specific prior written permission.
     18 //
     19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     30 
     31 package com.google.protobuf;
     32 
     33 import com.google.protobuf.Descriptors.Descriptor;
     34 import com.google.protobuf.Descriptors.FieldDescriptor;
     35 import com.google.protobuf.Descriptors.EnumDescriptor;
     36 import com.google.protobuf.Descriptors.EnumValueDescriptor;
     37 
     38 import java.io.IOException;
     39 import java.nio.CharBuffer;
     40 import java.math.BigInteger;
     41 import java.util.ArrayList;
     42 import java.util.List;
     43 import java.util.Locale;
     44 import java.util.Map;
     45 import java.util.regex.Matcher;
     46 import java.util.regex.Pattern;
     47 
     48 /**
     49  * Provide ascii text parsing and formatting support for proto2 instances.
     50  * The implementation largely follows google/protobuf/text_format.cc.
     51  *
     52  * @author wenboz (at) google.com Wenbo Zhu
     53  * @author kenton (at) google.com Kenton Varda
     54  */
     55 public final class TextFormat {
     56   private TextFormat() {
     57   }
     58 
     59   /**
     60    * Outputs a textual representation of the Protocol Message supplied into
     61    * the parameter output. (This representation is the new version of the
     62    * classic "ProtocolPrinter" output from the original Protocol Buffer system)
     63    */
     64   public static void print(final Message message, final Appendable output)
     65                            throws IOException {
     66     final TextGenerator generator = new TextGenerator(output);
     67     print(message, generator);
     68   }
     69 
     70   /** Outputs a textual representation of {@code fields} to {@code output}. */
     71   public static void print(final UnknownFieldSet fields,
     72                            final Appendable output)
     73                            throws IOException {
     74     final TextGenerator generator = new TextGenerator(output);
     75     printUnknownFields(fields, generator);
     76   }
     77 
     78   /**
     79    * Like {@code print()}, but writes directly to a {@code String} and
     80    * returns it.
     81    */
     82   public static String printToString(final Message message) {
     83     try {
     84       final StringBuilder text = new StringBuilder();
     85       print(message, text);
     86       return text.toString();
     87     } catch (IOException e) {
     88       throw new RuntimeException(
     89         "Writing to a StringBuilder threw an IOException (should never " +
     90         "happen).", e);
     91     }
     92   }
     93 
     94   /**
     95    * Like {@code print()}, but writes directly to a {@code String} and
     96    * returns it.
     97    */
     98   public static String printToString(final UnknownFieldSet fields) {
     99     try {
    100       final StringBuilder text = new StringBuilder();
    101       print(fields, text);
    102       return text.toString();
    103     } catch (IOException e) {
    104       throw new RuntimeException(
    105         "Writing to a StringBuilder threw an IOException (should never " +
    106         "happen).", e);
    107     }
    108   }
    109 
    110   private static void print(final Message message,
    111                             final TextGenerator generator)
    112       throws IOException {
    113     for (final Map.Entry<FieldDescriptor, Object> field :
    114          message.getAllFields().entrySet()) {
    115       printField(field.getKey(), field.getValue(), generator);
    116     }
    117     printUnknownFields(message.getUnknownFields(), generator);
    118   }
    119 
    120   public static void printField(final FieldDescriptor field,
    121                                 final Object value,
    122                                 final Appendable output)
    123                                 throws IOException {
    124     final TextGenerator generator = new TextGenerator(output);
    125     printField(field, value, generator);
    126   }
    127 
    128   public static String printFieldToString(final FieldDescriptor field,
    129                                           final Object value) {
    130     try {
    131       final StringBuilder text = new StringBuilder();
    132       printField(field, value, text);
    133       return text.toString();
    134     } catch (IOException e) {
    135       throw new RuntimeException(
    136         "Writing to a StringBuilder threw an IOException (should never " +
    137         "happen).", e);
    138     }
    139   }
    140 
    141   private static void printField(final FieldDescriptor field,
    142                                 final Object value,
    143                                 final TextGenerator generator)
    144                                 throws IOException {
    145     if (field.isRepeated()) {
    146       // Repeated field.  Print each element.
    147       for (final Object element : (List) value) {
    148         printSingleField(field, element, generator);
    149       }
    150     } else {
    151       printSingleField(field, value, generator);
    152     }
    153   }
    154 
    155   private static void printSingleField(final FieldDescriptor field,
    156                                        final Object value,
    157                                        final TextGenerator generator)
    158                                        throws IOException {
    159     if (field.isExtension()) {
    160       generator.print("[");
    161       // We special-case MessageSet elements for compatibility with proto1.
    162       if (field.getContainingType().getOptions().getMessageSetWireFormat()
    163           && (field.getType() == FieldDescriptor.Type.MESSAGE)
    164           && (field.isOptional())
    165           // object equality
    166           && (field.getExtensionScope() == field.getMessageType())) {
    167         generator.print(field.getMessageType().getFullName());
    168       } else {
    169         generator.print(field.getFullName());
    170       }
    171       generator.print("]");
    172     } else {
    173       if (field.getType() == FieldDescriptor.Type.GROUP) {
    174         // Groups must be serialized with their original capitalization.
    175         generator.print(field.getMessageType().getName());
    176       } else {
    177         generator.print(field.getName());
    178       }
    179     }
    180 
    181     if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
    182       generator.print(" {\n");
    183       generator.indent();
    184     } else {
    185       generator.print(": ");
    186     }
    187 
    188     printFieldValue(field, value, generator);
    189 
    190     if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
    191       generator.outdent();
    192       generator.print("}");
    193     }
    194     generator.print("\n");
    195   }
    196 
    197   private static void printFieldValue(final FieldDescriptor field,
    198                                       final Object value,
    199                                       final TextGenerator generator)
    200                                       throws IOException {
    201     switch (field.getType()) {
    202       case INT32:
    203       case INT64:
    204       case SINT32:
    205       case SINT64:
    206       case SFIXED32:
    207       case SFIXED64:
    208       case FLOAT:
    209       case DOUBLE:
    210       case BOOL:
    211         // Good old toString() does what we want for these types.
    212         generator.print(value.toString());
    213         break;
    214 
    215       case UINT32:
    216       case FIXED32:
    217         generator.print(unsignedToString((Integer) value));
    218         break;
    219 
    220       case UINT64:
    221       case FIXED64:
    222         generator.print(unsignedToString((Long) value));
    223         break;
    224 
    225       case STRING:
    226         generator.print("\"");
    227         generator.print(escapeText((String) value));
    228         generator.print("\"");
    229         break;
    230 
    231       case BYTES:
    232         generator.print("\"");
    233         generator.print(escapeBytes((ByteString) value));
    234         generator.print("\"");
    235         break;
    236 
    237       case ENUM:
    238         generator.print(((EnumValueDescriptor) value).getName());
    239         break;
    240 
    241       case MESSAGE:
    242       case GROUP:
    243         print((Message) value, generator);
    244         break;
    245     }
    246   }
    247 
    248   private static void printUnknownFields(final UnknownFieldSet unknownFields,
    249                                          final TextGenerator generator)
    250                                          throws IOException {
    251     for (final Map.Entry<Integer, UnknownFieldSet.Field> entry :
    252          unknownFields.asMap().entrySet()) {
    253       final String prefix = entry.getKey().toString() + ": ";
    254       final UnknownFieldSet.Field field = entry.getValue();
    255 
    256       for (final long value : field.getVarintList()) {
    257         generator.print(entry.getKey().toString());
    258         generator.print(": ");
    259         generator.print(unsignedToString(value));
    260         generator.print("\n");
    261       }
    262       for (final int value : field.getFixed32List()) {
    263         generator.print(entry.getKey().toString());
    264         generator.print(": ");
    265         generator.print(String.format((Locale) null, "0x%08x", value));
    266         generator.print("\n");
    267       }
    268       for (final long value : field.getFixed64List()) {
    269         generator.print(entry.getKey().toString());
    270         generator.print(": ");
    271         generator.print(String.format((Locale) null, "0x%016x", value));
    272         generator.print("\n");
    273       }
    274       for (final ByteString value : field.getLengthDelimitedList()) {
    275         generator.print(entry.getKey().toString());
    276         generator.print(": \"");
    277         generator.print(escapeBytes(value));
    278         generator.print("\"\n");
    279       }
    280       for (final UnknownFieldSet value : field.getGroupList()) {
    281         generator.print(entry.getKey().toString());
    282         generator.print(" {\n");
    283         generator.indent();
    284         printUnknownFields(value, generator);
    285         generator.outdent();
    286         generator.print("}\n");
    287       }
    288     }
    289   }
    290 
    291   /** Convert an unsigned 32-bit integer to a string. */
    292   private static String unsignedToString(final int value) {
    293     if (value >= 0) {
    294       return Integer.toString(value);
    295     } else {
    296       return Long.toString(((long) value) & 0x00000000FFFFFFFFL);
    297     }
    298   }
    299 
    300   /** Convert an unsigned 64-bit integer to a string. */
    301   private static String unsignedToString(final long value) {
    302     if (value >= 0) {
    303       return Long.toString(value);
    304     } else {
    305       // Pull off the most-significant bit so that BigInteger doesn't think
    306       // the number is negative, then set it again using setBit().
    307       return BigInteger.valueOf(value & 0x7FFFFFFFFFFFFFFFL)
    308                        .setBit(63).toString();
    309     }
    310   }
    311 
    312   /**
    313    * An inner class for writing text to the output stream.
    314    */
    315   private static final class TextGenerator {
    316     private Appendable output;
    317     private boolean atStartOfLine = true;
    318     private final StringBuilder indent = new StringBuilder();
    319 
    320     private TextGenerator(final Appendable output) {
    321       this.output = output;
    322     }
    323 
    324     /**
    325      * Indent text by two spaces.  After calling Indent(), two spaces will be
    326      * inserted at the beginning of each line of text.  Indent() may be called
    327      * multiple times to produce deeper indents.
    328      */
    329     public void indent() {
    330       indent.append("  ");
    331     }
    332 
    333     /**
    334      * Reduces the current indent level by two spaces, or crashes if the indent
    335      * level is zero.
    336      */
    337     public void outdent() {
    338       final int length = indent.length();
    339       if (length == 0) {
    340         throw new IllegalArgumentException(
    341             " Outdent() without matching Indent().");
    342       }
    343       indent.delete(length - 2, length);
    344     }
    345 
    346     /**
    347      * Print text to the output stream.
    348      */
    349     public void print(final CharSequence text) throws IOException {
    350       final int size = text.length();
    351       int pos = 0;
    352 
    353       for (int i = 0; i < size; i++) {
    354         if (text.charAt(i) == '\n') {
    355           write(text.subSequence(pos, size), i - pos + 1);
    356           pos = i + 1;
    357           atStartOfLine = true;
    358         }
    359       }
    360       write(text.subSequence(pos, size), size - pos);
    361     }
    362 
    363     private void write(final CharSequence data, final int size)
    364                        throws IOException {
    365       if (size == 0) {
    366         return;
    367       }
    368       if (atStartOfLine) {
    369         atStartOfLine = false;
    370         output.append(indent);
    371       }
    372       output.append(data);
    373     }
    374   }
    375 
    376   // =================================================================
    377   // Parsing
    378 
    379   /**
    380    * Represents a stream of tokens parsed from a {@code String}.
    381    *
    382    * <p>The Java standard library provides many classes that you might think
    383    * would be useful for implementing this, but aren't.  For example:
    384    *
    385    * <ul>
    386    * <li>{@code java.io.StreamTokenizer}:  This almost does what we want -- or,
    387    *   at least, something that would get us close to what we want -- except
    388    *   for one fatal flaw:  It automatically un-escapes strings using Java
    389    *   escape sequences, which do not include all the escape sequences we
    390    *   need to support (e.g. '\x').
    391    * <li>{@code java.util.Scanner}:  This seems like a great way at least to
    392    *   parse regular expressions out of a stream (so we wouldn't have to load
    393    *   the entire input into a single string before parsing).  Sadly,
    394    *   {@code Scanner} requires that tokens be delimited with some delimiter.
    395    *   Thus, although the text "foo:" should parse to two tokens ("foo" and
    396    *   ":"), {@code Scanner} would recognize it only as a single token.
    397    *   Furthermore, {@code Scanner} provides no way to inspect the contents
    398    *   of delimiters, making it impossible to keep track of line and column
    399    *   numbers.
    400    * </ul>
    401    *
    402    * <p>Luckily, Java's regular expression support does manage to be useful to
    403    * us.  (Barely:  We need {@code Matcher.usePattern()}, which is new in
    404    * Java 1.5.)  So, we can use that, at least.  Unfortunately, this implies
    405    * that we need to have the entire input in one contiguous string.
    406    */
    407   private static final class Tokenizer {
    408     private final CharSequence text;
    409     private final Matcher matcher;
    410     private String currentToken;
    411 
    412     // The character index within this.text at which the current token begins.
    413     private int pos = 0;
    414 
    415     // The line and column numbers of the current token.
    416     private int line = 0;
    417     private int column = 0;
    418 
    419     // The line and column numbers of the previous token (allows throwing
    420     // errors *after* consuming).
    421     private int previousLine = 0;
    422     private int previousColumn = 0;
    423 
    424     // We use possesive quantifiers (*+ and ++) because otherwise the Java
    425     // regex matcher has stack overflows on large inputs.
    426     private static final Pattern WHITESPACE =
    427       Pattern.compile("(\\s|(#.*$))++", Pattern.MULTILINE);
    428     private static final Pattern TOKEN = Pattern.compile(
    429       "[a-zA-Z_][0-9a-zA-Z_+-]*+|" +                // an identifier
    430       "[.]?[0-9+-][0-9a-zA-Z_.+-]*+|" +             // a number
    431       "\"([^\"\n\\\\]|\\\\.)*+(\"|\\\\?$)|" +       // a double-quoted string
    432       "\'([^\'\n\\\\]|\\\\.)*+(\'|\\\\?$)",         // a single-quoted string
    433       Pattern.MULTILINE);
    434 
    435     private static final Pattern DOUBLE_INFINITY = Pattern.compile(
    436       "-?inf(inity)?",
    437       Pattern.CASE_INSENSITIVE);
    438     private static final Pattern FLOAT_INFINITY = Pattern.compile(
    439       "-?inf(inity)?f?",
    440       Pattern.CASE_INSENSITIVE);
    441     private static final Pattern FLOAT_NAN = Pattern.compile(
    442       "nanf?",
    443       Pattern.CASE_INSENSITIVE);
    444 
    445     /** Construct a tokenizer that parses tokens from the given text. */
    446     private Tokenizer(final CharSequence text) {
    447       this.text = text;
    448       this.matcher = WHITESPACE.matcher(text);
    449       skipWhitespace();
    450       nextToken();
    451     }
    452 
    453     /** Are we at the end of the input? */
    454     public boolean atEnd() {
    455       return currentToken.length() == 0;
    456     }
    457 
    458     /** Advance to the next token. */
    459     public void nextToken() {
    460       previousLine = line;
    461       previousColumn = column;
    462 
    463       // Advance the line counter to the current position.
    464       while (pos < matcher.regionStart()) {
    465         if (text.charAt(pos) == '\n') {
    466           ++line;
    467           column = 0;
    468         } else {
    469           ++column;
    470         }
    471         ++pos;
    472       }
    473 
    474       // Match the next token.
    475       if (matcher.regionStart() == matcher.regionEnd()) {
    476         // EOF
    477         currentToken = "";
    478       } else {
    479         matcher.usePattern(TOKEN);
    480         if (matcher.lookingAt()) {
    481           currentToken = matcher.group();
    482           matcher.region(matcher.end(), matcher.regionEnd());
    483         } else {
    484           // Take one character.
    485           currentToken = String.valueOf(text.charAt(pos));
    486           matcher.region(pos + 1, matcher.regionEnd());
    487         }
    488 
    489         skipWhitespace();
    490       }
    491     }
    492 
    493     /**
    494      * Skip over any whitespace so that the matcher region starts at the next
    495      * token.
    496      */
    497     private void skipWhitespace() {
    498       matcher.usePattern(WHITESPACE);
    499       if (matcher.lookingAt()) {
    500         matcher.region(matcher.end(), matcher.regionEnd());
    501       }
    502     }
    503 
    504     /**
    505      * If the next token exactly matches {@code token}, consume it and return
    506      * {@code true}.  Otherwise, return {@code false} without doing anything.
    507      */
    508     public boolean tryConsume(final String token) {
    509       if (currentToken.equals(token)) {
    510         nextToken();
    511         return true;
    512       } else {
    513         return false;
    514       }
    515     }
    516 
    517     /**
    518      * If the next token exactly matches {@code token}, consume it.  Otherwise,
    519      * throw a {@link ParseException}.
    520      */
    521     public void consume(final String token) throws ParseException {
    522       if (!tryConsume(token)) {
    523         throw parseException("Expected \"" + token + "\".");
    524       }
    525     }
    526 
    527     /**
    528      * Returns {@code true} if the next token is an integer, but does
    529      * not consume it.
    530      */
    531     public boolean lookingAtInteger() {
    532       if (currentToken.length() == 0) {
    533         return false;
    534       }
    535 
    536       final char c = currentToken.charAt(0);
    537       return ('0' <= c && c <= '9') ||
    538              c == '-' || c == '+';
    539     }
    540 
    541     /**
    542      * If the next token is an identifier, consume it and return its value.
    543      * Otherwise, throw a {@link ParseException}.
    544      */
    545     public String consumeIdentifier() throws ParseException {
    546       for (int i = 0; i < currentToken.length(); i++) {
    547         final char c = currentToken.charAt(i);
    548         if (('a' <= c && c <= 'z') ||
    549             ('A' <= c && c <= 'Z') ||
    550             ('0' <= c && c <= '9') ||
    551             (c == '_') || (c == '.')) {
    552           // OK
    553         } else {
    554           throw parseException("Expected identifier.");
    555         }
    556       }
    557 
    558       final String result = currentToken;
    559       nextToken();
    560       return result;
    561     }
    562 
    563     /**
    564      * If the next token is a 32-bit signed integer, consume it and return its
    565      * value.  Otherwise, throw a {@link ParseException}.
    566      */
    567     public int consumeInt32() throws ParseException {
    568       try {
    569         final int result = parseInt32(currentToken);
    570         nextToken();
    571         return result;
    572       } catch (NumberFormatException e) {
    573         throw integerParseException(e);
    574       }
    575     }
    576 
    577     /**
    578      * If the next token is a 32-bit unsigned integer, consume it and return its
    579      * value.  Otherwise, throw a {@link ParseException}.
    580      */
    581     public int consumeUInt32() throws ParseException {
    582       try {
    583         final int result = parseUInt32(currentToken);
    584         nextToken();
    585         return result;
    586       } catch (NumberFormatException e) {
    587         throw integerParseException(e);
    588       }
    589     }
    590 
    591     /**
    592      * If the next token is a 64-bit signed integer, consume it and return its
    593      * value.  Otherwise, throw a {@link ParseException}.
    594      */
    595     public long consumeInt64() throws ParseException {
    596       try {
    597         final long result = parseInt64(currentToken);
    598         nextToken();
    599         return result;
    600       } catch (NumberFormatException e) {
    601         throw integerParseException(e);
    602       }
    603     }
    604 
    605     /**
    606      * If the next token is a 64-bit unsigned integer, consume it and return its
    607      * value.  Otherwise, throw a {@link ParseException}.
    608      */
    609     public long consumeUInt64() throws ParseException {
    610       try {
    611         final long result = parseUInt64(currentToken);
    612         nextToken();
    613         return result;
    614       } catch (NumberFormatException e) {
    615         throw integerParseException(e);
    616       }
    617     }
    618 
    619     /**
    620      * If the next token is a double, consume it and return its value.
    621      * Otherwise, throw a {@link ParseException}.
    622      */
    623     public double consumeDouble() throws ParseException {
    624       // We need to parse infinity and nan separately because
    625       // Double.parseDouble() does not accept "inf", "infinity", or "nan".
    626       if (DOUBLE_INFINITY.matcher(currentToken).matches()) {
    627         final boolean negative = currentToken.startsWith("-");
    628         nextToken();
    629         return negative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
    630       }
    631       if (currentToken.equalsIgnoreCase("nan")) {
    632         nextToken();
    633         return Double.NaN;
    634       }
    635       try {
    636         final double result = Double.parseDouble(currentToken);
    637         nextToken();
    638         return result;
    639       } catch (NumberFormatException e) {
    640         throw floatParseException(e);
    641       }
    642     }
    643 
    644     /**
    645      * If the next token is a float, consume it and return its value.
    646      * Otherwise, throw a {@link ParseException}.
    647      */
    648     public float consumeFloat() throws ParseException {
    649       // We need to parse infinity and nan separately because
    650       // Float.parseFloat() does not accept "inf", "infinity", or "nan".
    651       if (FLOAT_INFINITY.matcher(currentToken).matches()) {
    652         final boolean negative = currentToken.startsWith("-");
    653         nextToken();
    654         return negative ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
    655       }
    656       if (FLOAT_NAN.matcher(currentToken).matches()) {
    657         nextToken();
    658         return Float.NaN;
    659       }
    660       try {
    661         final float result = Float.parseFloat(currentToken);
    662         nextToken();
    663         return result;
    664       } catch (NumberFormatException e) {
    665         throw floatParseException(e);
    666       }
    667     }
    668 
    669     /**
    670      * If the next token is a boolean, consume it and return its value.
    671      * Otherwise, throw a {@link ParseException}.
    672      */
    673     public boolean consumeBoolean() throws ParseException {
    674       if (currentToken.equals("true")) {
    675         nextToken();
    676         return true;
    677       } else if (currentToken.equals("false")) {
    678         nextToken();
    679         return false;
    680       } else {
    681         throw parseException("Expected \"true\" or \"false\".");
    682       }
    683     }
    684 
    685     /**
    686      * If the next token is a string, consume it and return its (unescaped)
    687      * value.  Otherwise, throw a {@link ParseException}.
    688      */
    689     public String consumeString() throws ParseException {
    690       return consumeByteString().toStringUtf8();
    691     }
    692 
    693     /**
    694      * If the next token is a string, consume it, unescape it as a
    695      * {@link ByteString}, and return it.  Otherwise, throw a
    696      * {@link ParseException}.
    697      */
    698     public ByteString consumeByteString() throws ParseException {
    699       List<ByteString> list = new ArrayList<ByteString>();
    700       consumeByteString(list);
    701       while (currentToken.startsWith("'") || currentToken.startsWith("\"")) {
    702         consumeByteString(list);
    703       }
    704       return ByteString.copyFrom(list);
    705     }
    706 
    707     /**
    708      * Like {@link #consumeByteString()} but adds each token of the string to
    709      * the given list.  String literals (whether bytes or text) may come in
    710      * multiple adjacent tokens which are automatically concatenated, like in
    711      * C or Python.
    712      */
    713     private void consumeByteString(List<ByteString> list) throws ParseException {
    714       final char quote = currentToken.length() > 0 ? currentToken.charAt(0)
    715                                                    : '\0';
    716       if (quote != '\"' && quote != '\'') {
    717         throw parseException("Expected string.");
    718       }
    719 
    720       if (currentToken.length() < 2 ||
    721           currentToken.charAt(currentToken.length() - 1) != quote) {
    722         throw parseException("String missing ending quote.");
    723       }
    724 
    725       try {
    726         final String escaped =
    727             currentToken.substring(1, currentToken.length() - 1);
    728         final ByteString result = unescapeBytes(escaped);
    729         nextToken();
    730         list.add(result);
    731       } catch (InvalidEscapeSequenceException e) {
    732         throw parseException(e.getMessage());
    733       }
    734     }
    735 
    736     /**
    737      * Returns a {@link ParseException} with the current line and column
    738      * numbers in the description, suitable for throwing.
    739      */
    740     public ParseException parseException(final String description) {
    741       // Note:  People generally prefer one-based line and column numbers.
    742       return new ParseException(
    743         (line + 1) + ":" + (column + 1) + ": " + description);
    744     }
    745 
    746     /**
    747      * Returns a {@link ParseException} with the line and column numbers of
    748      * the previous token in the description, suitable for throwing.
    749      */
    750     public ParseException parseExceptionPreviousToken(
    751         final String description) {
    752       // Note:  People generally prefer one-based line and column numbers.
    753       return new ParseException(
    754         (previousLine + 1) + ":" + (previousColumn + 1) + ": " + description);
    755     }
    756 
    757     /**
    758      * Constructs an appropriate {@link ParseException} for the given
    759      * {@code NumberFormatException} when trying to parse an integer.
    760      */
    761     private ParseException integerParseException(
    762         final NumberFormatException e) {
    763       return parseException("Couldn't parse integer: " + e.getMessage());
    764     }
    765 
    766     /**
    767      * Constructs an appropriate {@link ParseException} for the given
    768      * {@code NumberFormatException} when trying to parse a float or double.
    769      */
    770     private ParseException floatParseException(final NumberFormatException e) {
    771       return parseException("Couldn't parse number: " + e.getMessage());
    772     }
    773   }
    774 
    775   /** Thrown when parsing an invalid text format message. */
    776   public static class ParseException extends IOException {
    777     private static final long serialVersionUID = 3196188060225107702L;
    778 
    779     public ParseException(final String message) {
    780       super(message);
    781     }
    782   }
    783 
    784   /**
    785    * Parse a text-format message from {@code input} and merge the contents
    786    * into {@code builder}.
    787    */
    788   public static void merge(final Readable input,
    789                            final Message.Builder builder)
    790                            throws IOException {
    791     merge(input, ExtensionRegistry.getEmptyRegistry(), builder);
    792   }
    793 
    794   /**
    795    * Parse a text-format message from {@code input} and merge the contents
    796    * into {@code builder}.
    797    */
    798   public static void merge(final CharSequence input,
    799                            final Message.Builder builder)
    800                            throws ParseException {
    801     merge(input, ExtensionRegistry.getEmptyRegistry(), builder);
    802   }
    803 
    804   /**
    805    * Parse a text-format message from {@code input} and merge the contents
    806    * into {@code builder}.  Extensions will be recognized if they are
    807    * registered in {@code extensionRegistry}.
    808    */
    809   public static void merge(final Readable input,
    810                            final ExtensionRegistry extensionRegistry,
    811                            final Message.Builder builder)
    812                            throws IOException {
    813     // Read the entire input to a String then parse that.
    814 
    815     // If StreamTokenizer were not quite so crippled, or if there were a kind
    816     // of Reader that could read in chunks that match some particular regex,
    817     // or if we wanted to write a custom Reader to tokenize our stream, then
    818     // we would not have to read to one big String.  Alas, none of these is
    819     // the case.  Oh well.
    820 
    821     merge(toStringBuilder(input), extensionRegistry, builder);
    822   }
    823 
    824   private static final int BUFFER_SIZE = 4096;
    825 
    826   // TODO(chrisn): See if working around java.io.Reader#read(CharBuffer)
    827   // overhead is worthwhile
    828   private static StringBuilder toStringBuilder(final Readable input)
    829       throws IOException {
    830     final StringBuilder text = new StringBuilder();
    831     final CharBuffer buffer = CharBuffer.allocate(BUFFER_SIZE);
    832     while (true) {
    833       final int n = input.read(buffer);
    834       if (n == -1) {
    835         break;
    836       }
    837       buffer.flip();
    838       text.append(buffer, 0, n);
    839     }
    840     return text;
    841   }
    842 
    843   /**
    844    * Parse a text-format message from {@code input} and merge the contents
    845    * into {@code builder}.  Extensions will be recognized if they are
    846    * registered in {@code extensionRegistry}.
    847    */
    848   public static void merge(final CharSequence input,
    849                            final ExtensionRegistry extensionRegistry,
    850                            final Message.Builder builder)
    851                            throws ParseException {
    852     final Tokenizer tokenizer = new Tokenizer(input);
    853 
    854     while (!tokenizer.atEnd()) {
    855       mergeField(tokenizer, extensionRegistry, builder);
    856     }
    857   }
    858 
    859   /**
    860    * Parse a single field from {@code tokenizer} and merge it into
    861    * {@code builder}.
    862    */
    863   private static void mergeField(final Tokenizer tokenizer,
    864                                  final ExtensionRegistry extensionRegistry,
    865                                  final Message.Builder builder)
    866                                  throws ParseException {
    867     FieldDescriptor field;
    868     final Descriptor type = builder.getDescriptorForType();
    869     ExtensionRegistry.ExtensionInfo extension = null;
    870 
    871     if (tokenizer.tryConsume("[")) {
    872       // An extension.
    873       final StringBuilder name =
    874           new StringBuilder(tokenizer.consumeIdentifier());
    875       while (tokenizer.tryConsume(".")) {
    876         name.append('.');
    877         name.append(tokenizer.consumeIdentifier());
    878       }
    879 
    880       extension = extensionRegistry.findExtensionByName(name.toString());
    881 
    882       if (extension == null) {
    883         throw tokenizer.parseExceptionPreviousToken(
    884           "Extension \"" + name + "\" not found in the ExtensionRegistry.");
    885       } else if (extension.descriptor.getContainingType() != type) {
    886         throw tokenizer.parseExceptionPreviousToken(
    887           "Extension \"" + name + "\" does not extend message type \"" +
    888           type.getFullName() + "\".");
    889       }
    890 
    891       tokenizer.consume("]");
    892 
    893       field = extension.descriptor;
    894     } else {
    895       final String name = tokenizer.consumeIdentifier();
    896       field = type.findFieldByName(name);
    897 
    898       // Group names are expected to be capitalized as they appear in the
    899       // .proto file, which actually matches their type names, not their field
    900       // names.
    901       if (field == null) {
    902         // Explicitly specify US locale so that this code does not break when
    903         // executing in Turkey.
    904         final String lowerName = name.toLowerCase(Locale.US);
    905         field = type.findFieldByName(lowerName);
    906         // If the case-insensitive match worked but the field is NOT a group,
    907         if (field != null && field.getType() != FieldDescriptor.Type.GROUP) {
    908           field = null;
    909         }
    910       }
    911       // Again, special-case group names as described above.
    912       if (field != null && field.getType() == FieldDescriptor.Type.GROUP &&
    913           !field.getMessageType().getName().equals(name)) {
    914         field = null;
    915       }
    916 
    917       if (field == null) {
    918         throw tokenizer.parseExceptionPreviousToken(
    919           "Message type \"" + type.getFullName() +
    920           "\" has no field named \"" + name + "\".");
    921       }
    922     }
    923 
    924     Object value = null;
    925 
    926     if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
    927       tokenizer.tryConsume(":");  // optional
    928 
    929       final String endToken;
    930       if (tokenizer.tryConsume("<")) {
    931         endToken = ">";
    932       } else {
    933         tokenizer.consume("{");
    934         endToken = "}";
    935       }
    936 
    937       final Message.Builder subBuilder;
    938       if (extension == null) {
    939         subBuilder = builder.newBuilderForField(field);
    940       } else {
    941         subBuilder = extension.defaultInstance.newBuilderForType();
    942       }
    943 
    944       while (!tokenizer.tryConsume(endToken)) {
    945         if (tokenizer.atEnd()) {
    946           throw tokenizer.parseException(
    947             "Expected \"" + endToken + "\".");
    948         }
    949         mergeField(tokenizer, extensionRegistry, subBuilder);
    950       }
    951 
    952       value = subBuilder.build();
    953 
    954     } else {
    955       tokenizer.consume(":");
    956 
    957       switch (field.getType()) {
    958         case INT32:
    959         case SINT32:
    960         case SFIXED32:
    961           value = tokenizer.consumeInt32();
    962           break;
    963 
    964         case INT64:
    965         case SINT64:
    966         case SFIXED64:
    967           value = tokenizer.consumeInt64();
    968           break;
    969 
    970         case UINT32:
    971         case FIXED32:
    972           value = tokenizer.consumeUInt32();
    973           break;
    974 
    975         case UINT64:
    976         case FIXED64:
    977           value = tokenizer.consumeUInt64();
    978           break;
    979 
    980         case FLOAT:
    981           value = tokenizer.consumeFloat();
    982           break;
    983 
    984         case DOUBLE:
    985           value = tokenizer.consumeDouble();
    986           break;
    987 
    988         case BOOL:
    989           value = tokenizer.consumeBoolean();
    990           break;
    991 
    992         case STRING:
    993           value = tokenizer.consumeString();
    994           break;
    995 
    996         case BYTES:
    997           value = tokenizer.consumeByteString();
    998           break;
    999 
   1000         case ENUM:
   1001           final EnumDescriptor enumType = field.getEnumType();
   1002 
   1003           if (tokenizer.lookingAtInteger()) {
   1004             final int number = tokenizer.consumeInt32();
   1005             value = enumType.findValueByNumber(number);
   1006             if (value == null) {
   1007               throw tokenizer.parseExceptionPreviousToken(
   1008                 "Enum type \"" + enumType.getFullName() +
   1009                 "\" has no value with number " + number + '.');
   1010             }
   1011           } else {
   1012             final String id = tokenizer.consumeIdentifier();
   1013             value = enumType.findValueByName(id);
   1014             if (value == null) {
   1015               throw tokenizer.parseExceptionPreviousToken(
   1016                 "Enum type \"" + enumType.getFullName() +
   1017                 "\" has no value named \"" + id + "\".");
   1018             }
   1019           }
   1020 
   1021           break;
   1022 
   1023         case MESSAGE:
   1024         case GROUP:
   1025           throw new RuntimeException("Can't get here.");
   1026       }
   1027     }
   1028 
   1029     if (field.isRepeated()) {
   1030       builder.addRepeatedField(field, value);
   1031     } else {
   1032       builder.setField(field, value);
   1033     }
   1034   }
   1035 
   1036   // =================================================================
   1037   // Utility functions
   1038   //
   1039   // Some of these methods are package-private because Descriptors.java uses
   1040   // them.
   1041 
   1042   /**
   1043    * Escapes bytes in the format used in protocol buffer text format, which
   1044    * is the same as the format used for C string literals.  All bytes
   1045    * that are not printable 7-bit ASCII characters are escaped, as well as
   1046    * backslash, single-quote, and double-quote characters.  Characters for
   1047    * which no defined short-hand escape sequence is defined will be escaped
   1048    * using 3-digit octal sequences.
   1049    */
   1050   static String escapeBytes(final ByteString input) {
   1051     final StringBuilder builder = new StringBuilder(input.size());
   1052     for (int i = 0; i < input.size(); i++) {
   1053       final byte b = input.byteAt(i);
   1054       switch (b) {
   1055         // Java does not recognize \a or \v, apparently.
   1056         case 0x07: builder.append("\\a" ); break;
   1057         case '\b': builder.append("\\b" ); break;
   1058         case '\f': builder.append("\\f" ); break;
   1059         case '\n': builder.append("\\n" ); break;
   1060         case '\r': builder.append("\\r" ); break;
   1061         case '\t': builder.append("\\t" ); break;
   1062         case 0x0b: builder.append("\\v" ); break;
   1063         case '\\': builder.append("\\\\"); break;
   1064         case '\'': builder.append("\\\'"); break;
   1065         case '"' : builder.append("\\\""); break;
   1066         default:
   1067           if (b >= 0x20) {
   1068             builder.append((char) b);
   1069           } else {
   1070             builder.append('\\');
   1071             builder.append((char) ('0' + ((b >>> 6) & 3)));
   1072             builder.append((char) ('0' + ((b >>> 3) & 7)));
   1073             builder.append((char) ('0' + (b & 7)));
   1074           }
   1075           break;
   1076       }
   1077     }
   1078     return builder.toString();
   1079   }
   1080 
   1081   /**
   1082    * Un-escape a byte sequence as escaped using
   1083    * {@link #escapeBytes(ByteString)}.  Two-digit hex escapes (starting with
   1084    * "\x") are also recognized.
   1085    */
   1086   static ByteString unescapeBytes(final CharSequence input)
   1087       throws InvalidEscapeSequenceException {
   1088     final byte[] result = new byte[input.length()];
   1089     int pos = 0;
   1090     for (int i = 0; i < input.length(); i++) {
   1091       char c = input.charAt(i);
   1092       if (c == '\\') {
   1093         if (i + 1 < input.length()) {
   1094           ++i;
   1095           c = input.charAt(i);
   1096           if (isOctal(c)) {
   1097             // Octal escape.
   1098             int code = digitValue(c);
   1099             if (i + 1 < input.length() && isOctal(input.charAt(i + 1))) {
   1100               ++i;
   1101               code = code * 8 + digitValue(input.charAt(i));
   1102             }
   1103             if (i + 1 < input.length() && isOctal(input.charAt(i + 1))) {
   1104               ++i;
   1105               code = code * 8 + digitValue(input.charAt(i));
   1106             }
   1107             result[pos++] = (byte)code;
   1108           } else {
   1109             switch (c) {
   1110               case 'a' : result[pos++] = 0x07; break;
   1111               case 'b' : result[pos++] = '\b'; break;
   1112               case 'f' : result[pos++] = '\f'; break;
   1113               case 'n' : result[pos++] = '\n'; break;
   1114               case 'r' : result[pos++] = '\r'; break;
   1115               case 't' : result[pos++] = '\t'; break;
   1116               case 'v' : result[pos++] = 0x0b; break;
   1117               case '\\': result[pos++] = '\\'; break;
   1118               case '\'': result[pos++] = '\''; break;
   1119               case '"' : result[pos++] = '\"'; break;
   1120 
   1121               case 'x':
   1122                 // hex escape
   1123                 int code = 0;
   1124                 if (i + 1 < input.length() && isHex(input.charAt(i + 1))) {
   1125                   ++i;
   1126                   code = digitValue(input.charAt(i));
   1127                 } else {
   1128                   throw new InvalidEscapeSequenceException(
   1129                     "Invalid escape sequence: '\\x' with no digits");
   1130                 }
   1131                 if (i + 1 < input.length() && isHex(input.charAt(i + 1))) {
   1132                   ++i;
   1133                   code = code * 16 + digitValue(input.charAt(i));
   1134                 }
   1135                 result[pos++] = (byte)code;
   1136                 break;
   1137 
   1138               default:
   1139                 throw new InvalidEscapeSequenceException(
   1140                   "Invalid escape sequence: '\\" + c + '\'');
   1141             }
   1142           }
   1143         } else {
   1144           throw new InvalidEscapeSequenceException(
   1145             "Invalid escape sequence: '\\' at end of string.");
   1146         }
   1147       } else {
   1148         result[pos++] = (byte)c;
   1149       }
   1150     }
   1151 
   1152     return ByteString.copyFrom(result, 0, pos);
   1153   }
   1154 
   1155   /**
   1156    * Thrown by {@link TextFormat#unescapeBytes} and
   1157    * {@link TextFormat#unescapeText} when an invalid escape sequence is seen.
   1158    */
   1159   static class InvalidEscapeSequenceException extends IOException {
   1160     private static final long serialVersionUID = -8164033650142593304L;
   1161 
   1162     InvalidEscapeSequenceException(final String description) {
   1163       super(description);
   1164     }
   1165   }
   1166 
   1167   /**
   1168    * Like {@link #escapeBytes(ByteString)}, but escapes a text string.
   1169    * Non-ASCII characters are first encoded as UTF-8, then each byte is escaped
   1170    * individually as a 3-digit octal escape.  Yes, it's weird.
   1171    */
   1172   static String escapeText(final String input) {
   1173     return escapeBytes(ByteString.copyFromUtf8(input));
   1174   }
   1175 
   1176   /**
   1177    * Un-escape a text string as escaped using {@link #escapeText(String)}.
   1178    * Two-digit hex escapes (starting with "\x") are also recognized.
   1179    */
   1180   static String unescapeText(final String input)
   1181                              throws InvalidEscapeSequenceException {
   1182     return unescapeBytes(input).toStringUtf8();
   1183   }
   1184 
   1185   /** Is this an octal digit? */
   1186   private static boolean isOctal(final char c) {
   1187     return '0' <= c && c <= '7';
   1188   }
   1189 
   1190   /** Is this a hex digit? */
   1191   private static boolean isHex(final char c) {
   1192     return ('0' <= c && c <= '9') ||
   1193            ('a' <= c && c <= 'f') ||
   1194            ('A' <= c && c <= 'F');
   1195   }
   1196 
   1197   /**
   1198    * Interpret a character as a digit (in any base up to 36) and return the
   1199    * numeric value.  This is like {@code Character.digit()} but we don't accept
   1200    * non-ASCII digits.
   1201    */
   1202   private static int digitValue(final char c) {
   1203     if ('0' <= c && c <= '9') {
   1204       return c - '0';
   1205     } else if ('a' <= c && c <= 'z') {
   1206       return c - 'a' + 10;
   1207     } else {
   1208       return c - 'A' + 10;
   1209     }
   1210   }
   1211 
   1212   /**
   1213    * Parse a 32-bit signed integer from the text.  Unlike the Java standard
   1214    * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
   1215    * and "0" to signify hexidecimal and octal numbers, respectively.
   1216    */
   1217   static int parseInt32(final String text) throws NumberFormatException {
   1218     return (int) parseInteger(text, true, false);
   1219   }
   1220 
   1221   /**
   1222    * Parse a 32-bit unsigned integer from the text.  Unlike the Java standard
   1223    * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
   1224    * and "0" to signify hexidecimal and octal numbers, respectively.  The
   1225    * result is coerced to a (signed) {@code int} when returned since Java has
   1226    * no unsigned integer type.
   1227    */
   1228   static int parseUInt32(final String text) throws NumberFormatException {
   1229     return (int) parseInteger(text, false, false);
   1230   }
   1231 
   1232   /**
   1233    * Parse a 64-bit signed integer from the text.  Unlike the Java standard
   1234    * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
   1235    * and "0" to signify hexidecimal and octal numbers, respectively.
   1236    */
   1237   static long parseInt64(final String text) throws NumberFormatException {
   1238     return parseInteger(text, true, true);
   1239   }
   1240 
   1241   /**
   1242    * Parse a 64-bit unsigned integer from the text.  Unlike the Java standard
   1243    * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
   1244    * and "0" to signify hexidecimal and octal numbers, respectively.  The
   1245    * result is coerced to a (signed) {@code long} when returned since Java has
   1246    * no unsigned long type.
   1247    */
   1248   static long parseUInt64(final String text) throws NumberFormatException {
   1249     return parseInteger(text, false, true);
   1250   }
   1251 
   1252   private static long parseInteger(final String text,
   1253                                    final boolean isSigned,
   1254                                    final boolean isLong)
   1255                                    throws NumberFormatException {
   1256     int pos = 0;
   1257 
   1258     boolean negative = false;
   1259     if (text.startsWith("-", pos)) {
   1260       if (!isSigned) {
   1261         throw new NumberFormatException("Number must be positive: " + text);
   1262       }
   1263       ++pos;
   1264       negative = true;
   1265     }
   1266 
   1267     int radix = 10;
   1268     if (text.startsWith("0x", pos)) {
   1269       pos += 2;
   1270       radix = 16;
   1271     } else if (text.startsWith("0", pos)) {
   1272       radix = 8;
   1273     }
   1274 
   1275     final String numberText = text.substring(pos);
   1276 
   1277     long result = 0;
   1278     if (numberText.length() < 16) {
   1279       // Can safely assume no overflow.
   1280       result = Long.parseLong(numberText, radix);
   1281       if (negative) {
   1282         result = -result;
   1283       }
   1284 
   1285       // Check bounds.
   1286       // No need to check for 64-bit numbers since they'd have to be 16 chars
   1287       // or longer to overflow.
   1288       if (!isLong) {
   1289         if (isSigned) {
   1290           if (result > Integer.MAX_VALUE || result < Integer.MIN_VALUE) {
   1291             throw new NumberFormatException(
   1292               "Number out of range for 32-bit signed integer: " + text);
   1293           }
   1294         } else {
   1295           if (result >= (1L << 32) || result < 0) {
   1296             throw new NumberFormatException(
   1297               "Number out of range for 32-bit unsigned integer: " + text);
   1298           }
   1299         }
   1300       }
   1301     } else {
   1302       BigInteger bigValue = new BigInteger(numberText, radix);
   1303       if (negative) {
   1304         bigValue = bigValue.negate();
   1305       }
   1306 
   1307       // Check bounds.
   1308       if (!isLong) {
   1309         if (isSigned) {
   1310           if (bigValue.bitLength() > 31) {
   1311             throw new NumberFormatException(
   1312               "Number out of range for 32-bit signed integer: " + text);
   1313           }
   1314         } else {
   1315           if (bigValue.bitLength() > 32) {
   1316             throw new NumberFormatException(
   1317               "Number out of range for 32-bit unsigned integer: " + text);
   1318           }
   1319         }
   1320       } else {
   1321         if (isSigned) {
   1322           if (bigValue.bitLength() > 63) {
   1323             throw new NumberFormatException(
   1324               "Number out of range for 64-bit signed integer: " + text);
   1325           }
   1326         } else {
   1327           if (bigValue.bitLength() > 64) {
   1328             throw new NumberFormatException(
   1329               "Number out of range for 64-bit unsigned integer: " + text);
   1330           }
   1331         }
   1332       }
   1333 
   1334       result = bigValue.longValue();
   1335     }
   1336 
   1337     return result;
   1338   }
   1339 }
   1340