# protobuf/python/google/protobuf/text_format.py
      1 # Protocol Buffers - Google's data interchange format
      2 # Copyright 2008 Google Inc.  All rights reserved.
      3 # https://developers.google.com/protocol-buffers/
      4 #
      5 # Redistribution and use in source and binary forms, with or without
      6 # modification, are permitted provided that the following conditions are
      7 # met:
      8 #
      9 #     * Redistributions of source code must retain the above copyright
     10 # notice, this list of conditions and the following disclaimer.
     11 #     * Redistributions in binary form must reproduce the above
     12 # copyright notice, this list of conditions and the following disclaimer
     13 # in the documentation and/or other materials provided with the
     14 # distribution.
     15 #     * Neither the name of Google Inc. nor the names of its
     16 # contributors may be used to endorse or promote products derived from
     17 # this software without specific prior written permission.
     18 #
     19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     23 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     24 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     25 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     29 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     30 
     31 """Contains routines for printing protocol messages in text format.
     32 
     33 Simple usage example:
     34 
     35   # Create a proto object and serialize it to a text proto string.
     36   message = my_proto_pb2.MyMessage(foo='bar')
     37   text_proto = text_format.MessageToString(message)
     38 
     39   # Parse a text proto string.
     40   message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
     41 """
     42 
__author__ = 'kenton@google.com (Kenton Varda)'
     44 
     45 import io
     46 import re
     47 
     48 import six
     49 
     50 if six.PY3:
     51   long = int
     52 
     53 from google.protobuf.internal import type_checkers
     54 from google.protobuf import descriptor
     55 from google.protobuf import text_encoding
     56 
__all__ = ['MessageToString', 'PrintMessage', 'PrintField',
           'PrintFieldValue', 'Merge']


# Value checkers used when converting integer tokens during parsing; one per
# (signedness, width) combination.
_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Accepted textual spellings of float infinity/NaN (case-insensitive;
# optional trailing 'f' as emitted by some writers).
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?', re.IGNORECASE)
_FLOAT_NAN = re.compile('nanf?', re.IGNORECASE)
# cpp_types whose printing honors the float_format option.
_FLOAT_TYPES = frozenset([descriptor.FieldDescriptor.CPPTYPE_FLOAT,
                          descriptor.FieldDescriptor.CPPTYPE_DOUBLE])
# Characters that may open/close a quoted string token.
_QUOTES = frozenset(("'", '"'))
     70 
     71 
class Error(Exception):
  """Top-level module error for text_format."""
     75 
class ParseError(Error):
  """Thrown in case of text parsing error."""
     79 
     80 class TextWriter(object):
     81   def __init__(self, as_utf8):
     82     if six.PY2:
     83       self._writer = io.BytesIO()
     84     else:
     85       self._writer = io.StringIO()
     86 
     87   def write(self, val):
     88     if six.PY2:
     89       if isinstance(val, six.text_type):
     90         val = val.encode('utf-8')
     91     return self._writer.write(val)
     92 
     93   def close(self):
     94     return self._writer.close()
     95 
     96   def getvalue(self):
     97     return self._writer.getvalue()
     98 
     99 
    100 def MessageToString(message, as_utf8=False, as_one_line=False,
    101                     pointy_brackets=False, use_index_order=False,
    102                     float_format=None, use_field_number=False):
    103   """Convert protobuf message to text format.
    104 
    105   Floating point values can be formatted compactly with 15 digits of
    106   precision (which is the most that IEEE 754 "double" can guarantee)
    107   using float_format='.15g'. To ensure that converting to text and back to a
    108   proto will result in an identical value, float_format='.17g' should be used.
    109 
    110   Args:
    111     message: The protocol buffers message.
    112     as_utf8: Produce text output in UTF8 format.
    113     as_one_line: Don't introduce newlines between fields.
    114     pointy_brackets: If True, use angle brackets instead of curly braces for
    115       nesting.
    116     use_index_order: If True, print fields of a proto message using the order
    117       defined in source code instead of the field number. By default, use the
    118       field number order.
    119     float_format: If set, use this to specify floating point number formatting
    120       (per the "Format Specification Mini-Language"); otherwise, str() is used.
    121     use_field_number: If True, print field numbers instead of names.
    122 
    123   Returns:
    124     A string of the text formatted protocol buffer message.
    125   """
    126   out = TextWriter(as_utf8)
    127   printer = _Printer(out, 0, as_utf8, as_one_line,
    128                      pointy_brackets, use_index_order, float_format,
    129                      use_field_number)
    130   printer.PrintMessage(message)
    131   result = out.getvalue()
    132   out.close()
    133   if as_one_line:
    134     return result.rstrip()
    135   return result
    136 
    137 
    138 def _IsMapEntry(field):
    139   return (field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
    140           field.message_type.has_options and
    141           field.message_type.GetOptions().map_entry)
    142 
    143 
    144 def PrintMessage(message, out, indent=0, as_utf8=False, as_one_line=False,
    145                  pointy_brackets=False, use_index_order=False,
    146                  float_format=None, use_field_number=False):
    147   printer = _Printer(out, indent, as_utf8, as_one_line,
    148                      pointy_brackets, use_index_order, float_format,
    149                      use_field_number)
    150   printer.PrintMessage(message)
    151 
    152 
    153 def PrintField(field, value, out, indent=0, as_utf8=False, as_one_line=False,
    154                pointy_brackets=False, use_index_order=False, float_format=None):
    155   """Print a single field name/value pair."""
    156   printer = _Printer(out, indent, as_utf8, as_one_line,
    157                      pointy_brackets, use_index_order, float_format)
    158   printer.PrintField(field, value)
    159 
    160 
    161 def PrintFieldValue(field, value, out, indent=0, as_utf8=False,
    162                     as_one_line=False, pointy_brackets=False,
    163                     use_index_order=False,
    164                     float_format=None):
    165   """Print a single field value (not including name)."""
    166   printer = _Printer(out, indent, as_utf8, as_one_line,
    167                      pointy_brackets, use_index_order, float_format)
    168   printer.PrintFieldValue(field, value)
    169 
    170 
class _Printer(object):
  """Text format printer for protocol message."""

  def __init__(self, out, indent=0, as_utf8=False, as_one_line=False,
               pointy_brackets=False, use_index_order=False, float_format=None,
               use_field_number=False):
    """Initialize the Printer.

    Floating point values can be formatted compactly with 15 digits of
    precision (which is the most that IEEE 754 "double" can guarantee)
    using float_format='.15g'. To ensure that converting to text and back to a
    proto will result in an identical value, float_format='.17g' should be used.

    Args:
      out: To record the text format result.
      indent: The indent level for pretty print.
      as_utf8: Produce text output in UTF8 format.
      as_one_line: Don't introduce newlines between fields.
      pointy_brackets: If True, use angle brackets instead of curly braces for
        nesting.
      use_index_order: If True, print fields of a proto message using the order
        defined in source code instead of the field number. By default, use the
        field number order.
      float_format: If set, use this to specify floating point number formatting
        (per the "Format Specification Mini-Language"); otherwise, str() is
        used.
      use_field_number: If True, print field numbers instead of names.
    """
    self.out = out
    self.indent = indent
    self.as_utf8 = as_utf8
    self.as_one_line = as_one_line
    self.pointy_brackets = pointy_brackets
    self.use_index_order = use_index_order
    self.float_format = float_format
    self.use_field_number = use_field_number

  def PrintMessage(self, message):
    """Convert protobuf message to text format.

    Args:
      message: The protocol buffers message.
    """
    fields = message.ListFields()
    if self.use_index_order:
      # Sort by declaration order (field index) instead of field number.
      fields.sort(key=lambda x: x[0].index)
    for field, value in fields:
      if _IsMapEntry(field):
        # Maps print one synthesized key/value entry message per key, in
        # sorted key order for deterministic output.
        for key in sorted(value):
          # This is slow for maps with submessage entries because it copies the
          # entire tree.  Unfortunately this would take significant refactoring
          # of this file to work around.
          #
          # TODO(haberman): refactor and optimize if this becomes an issue.
          entry_submsg = field.message_type._concrete_class(
              key=key, value=value[key])
          self.PrintField(field, entry_submsg)
      elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
        # Repeated fields print one name/value pair per element.
        for element in value:
          self.PrintField(field, element)
      else:
        self.PrintField(field, value)

  def PrintField(self, field, value):
    """Print a single field name/value pair."""
    out = self.out
    out.write(' ' * self.indent)
    if self.use_field_number:
      out.write(str(field.number))
    else:
      if field.is_extension:
        # Extensions are written as [full.name]; MessageSet items use the
        # contained message's full name instead.
        out.write('[')
        if (field.containing_type.GetOptions().message_set_wire_format and
            field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
            field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
          out.write(field.message_type.full_name)
        else:
          out.write(field.full_name)
        out.write(']')
      elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
        # For groups, use the capitalized name.
        out.write(field.message_type.name)
      else:
        out.write(field.name)

    if field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
      # The colon is optional in this case, but our cross-language golden files
      # don't include it.
      out.write(': ')

    self.PrintFieldValue(field, value)
    if self.as_one_line:
      out.write(' ')
    else:
      out.write('\n')

  def PrintFieldValue(self, field, value):
    """Print a single field value (not including name).

    For repeated fields, the value should be a single element.

    Args:
      field: The descriptor of the field to be printed.
      value: The value of the field.
    """
    out = self.out
    # Nested messages are delimited by {} by default, or <> when
    # pointy_brackets was requested.
    if self.pointy_brackets:
      openb = '<'
      closeb = '>'
    else:
      openb = '{'
      closeb = '}'

    if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
      if self.as_one_line:
        out.write(' %s ' % openb)
        self.PrintMessage(value)
        out.write(closeb)
      else:
        # Multi-line form: recurse with two extra spaces of indentation.
        out.write(' %s\n' % openb)
        self.indent += 2
        self.PrintMessage(value)
        self.indent -= 2
        out.write(' ' * self.indent + closeb)
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
      # Prefer the symbolic enum name; fall back to the raw number for
      # values not declared in the enum.
      enum_value = field.enum_type.values_by_number.get(value, None)
      if enum_value is not None:
        out.write(enum_value.name)
      else:
        out.write(str(value))
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
      out.write('\"')
      if isinstance(value, six.text_type):
        out_value = value.encode('utf-8')
      else:
        out_value = value
      if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
        # We need to escape non-UTF8 chars in TYPE_BYTES field.
        out_as_utf8 = False
      else:
        out_as_utf8 = self.as_utf8
      out.write(text_encoding.CEscape(out_value, out_as_utf8))
      out.write('\"')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
      # Text format spells booleans in lowercase.
      if value:
        out.write('true')
      else:
        out.write('false')
    elif field.cpp_type in _FLOAT_TYPES and self.float_format is not None:
      out.write('{1:{0}}'.format(self.float_format, value))
    else:
      out.write(str(value))
    323 
    324 
    325 def Parse(text, message,
    326           allow_unknown_extension=False, allow_field_number=False):
    327   """Parses an text representation of a protocol message into a message.
    328 
    329   Args:
    330     text: Message text representation.
    331     message: A protocol buffer message to merge into.
    332     allow_unknown_extension: if True, skip over missing extensions and keep
    333       parsing
    334     allow_field_number: if True, both field number and field name are allowed.
    335 
    336   Returns:
    337     The same message passed as argument.
    338 
    339   Raises:
    340     ParseError: On text parsing problems.
    341   """
    342   if not isinstance(text, str):
    343     text = text.decode('utf-8')
    344   return ParseLines(text.split('\n'), message, allow_unknown_extension,
    345                     allow_field_number)
    346 
    347 
    348 def Merge(text, message, allow_unknown_extension=False,
    349           allow_field_number=False):
    350   """Parses an text representation of a protocol message into a message.
    351 
    352   Like Parse(), but allows repeated values for a non-repeated field, and uses
    353   the last one.
    354 
    355   Args:
    356     text: Message text representation.
    357     message: A protocol buffer message to merge into.
    358     allow_unknown_extension: if True, skip over missing extensions and keep
    359       parsing
    360     allow_field_number: if True, both field number and field name are allowed.
    361 
    362   Returns:
    363     The same message passed as argument.
    364 
    365   Raises:
    366     ParseError: On text parsing problems.
    367   """
    368   return MergeLines(text.split('\n'), message, allow_unknown_extension,
    369                     allow_field_number)
    370 
    371 
    372 def ParseLines(lines, message, allow_unknown_extension=False,
    373                allow_field_number=False):
    374   """Parses an text representation of a protocol message into a message.
    375 
    376   Args:
    377     lines: An iterable of lines of a message's text representation.
    378     message: A protocol buffer message to merge into.
    379     allow_unknown_extension: if True, skip over missing extensions and keep
    380       parsing
    381     allow_field_number: if True, both field number and field name are allowed.
    382 
    383   Returns:
    384     The same message passed as argument.
    385 
    386   Raises:
    387     ParseError: On text parsing problems.
    388   """
    389   parser = _Parser(allow_unknown_extension, allow_field_number)
    390   return parser.ParseLines(lines, message)
    391 
    392 
    393 def MergeLines(lines, message, allow_unknown_extension=False,
    394                allow_field_number=False):
    395   """Parses an text representation of a protocol message into a message.
    396 
    397   Args:
    398     lines: An iterable of lines of a message's text representation.
    399     message: A protocol buffer message to merge into.
    400     allow_unknown_extension: if True, skip over missing extensions and keep
    401       parsing
    402     allow_field_number: if True, both field number and field name are allowed.
    403 
    404   Returns:
    405     The same message passed as argument.
    406 
    407   Raises:
    408     ParseError: On text parsing problems.
    409   """
    410   parser = _Parser(allow_unknown_extension, allow_field_number)
    411   return parser.MergeLines(lines, message)
    412 
    413 
    414 class _Parser(object):
    415   """Text format parser for protocol message."""
    416 
    417   def __init__(self, allow_unknown_extension=False, allow_field_number=False):
    418     self.allow_unknown_extension = allow_unknown_extension
    419     self.allow_field_number = allow_field_number
    420 
    421   def ParseFromString(self, text, message):
    422     """Parses an text representation of a protocol message into a message."""
    423     if not isinstance(text, str):
    424       text = text.decode('utf-8')
    425     return self.ParseLines(text.split('\n'), message)
    426 
    427   def ParseLines(self, lines, message):
    428     """Parses an text representation of a protocol message into a message."""
    429     self._allow_multiple_scalars = False
    430     self._ParseOrMerge(lines, message)
    431     return message
    432 
    433   def MergeFromString(self, text, message):
    434     """Merges an text representation of a protocol message into a message."""
    435     return self._MergeLines(text.split('\n'), message)
    436 
    437   def MergeLines(self, lines, message):
    438     """Merges an text representation of a protocol message into a message."""
    439     self._allow_multiple_scalars = True
    440     self._ParseOrMerge(lines, message)
    441     return message
    442 
    443   def _ParseOrMerge(self, lines, message):
    444     """Converts an text representation of a protocol message into a message.
    445 
    446     Args:
    447       lines: Lines of a message's text representation.
    448       message: A protocol buffer message to merge into.
    449 
    450     Raises:
    451       ParseError: On text parsing problems.
    452     """
    453     tokenizer = _Tokenizer(lines)
    454     while not tokenizer.AtEnd():
    455       self._MergeField(tokenizer, message)
    456 
    457   def _MergeField(self, tokenizer, message):
    458     """Merges a single protocol message field into a message.
    459 
    460     Args:
    461       tokenizer: A tokenizer to parse the field name and values.
    462       message: A protocol message to record the data.
    463 
    464     Raises:
    465       ParseError: In case of text parsing problems.
    466     """
    467     message_descriptor = message.DESCRIPTOR
    468     if (hasattr(message_descriptor, 'syntax') and
    469         message_descriptor.syntax == 'proto3'):
    470       # Proto3 doesn't represent presence so we can't test if multiple
    471       # scalars have occurred.  We have to allow them.
    472       self._allow_multiple_scalars = True
    473     if tokenizer.TryConsume('['):
    474       name = [tokenizer.ConsumeIdentifier()]
    475       while tokenizer.TryConsume('.'):
    476         name.append(tokenizer.ConsumeIdentifier())
    477       name = '.'.join(name)
    478 
    479       if not message_descriptor.is_extendable:
    480         raise tokenizer.ParseErrorPreviousToken(
    481             'Message type "%s" does not have extensions.' %
    482             message_descriptor.full_name)
    483       # pylint: disable=protected-access
    484       field = message.Extensions._FindExtensionByName(name)
    485       # pylint: enable=protected-access
    486       if not field:
    487         if self.allow_unknown_extension:
    488           field = None
    489         else:
    490           raise tokenizer.ParseErrorPreviousToken(
    491               'Extension "%s" not registered.' % name)
    492       elif message_descriptor != field.containing_type:
    493         raise tokenizer.ParseErrorPreviousToken(
    494             'Extension "%s" does not extend message type "%s".' % (
    495                 name, message_descriptor.full_name))
    496 
    497       tokenizer.Consume(']')
    498 
    499     else:
    500       name = tokenizer.ConsumeIdentifier()
    501       if self.allow_field_number and name.isdigit():
    502         number = ParseInteger(name, True, True)
    503         field = message_descriptor.fields_by_number.get(number, None)
    504         if not field and message_descriptor.is_extendable:
    505           field = message.Extensions._FindExtensionByNumber(number)
    506       else:
    507         field = message_descriptor.fields_by_name.get(name, None)
    508 
    509         # Group names are expected to be capitalized as they appear in the
    510         # .proto file, which actually matches their type names, not their field
    511         # names.
    512         if not field:
    513           field = message_descriptor.fields_by_name.get(name.lower(), None)
    514           if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
    515             field = None
    516 
    517         if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
    518             field.message_type.name != name):
    519           field = None
    520 
    521       if not field:
    522         raise tokenizer.ParseErrorPreviousToken(
    523             'Message type "%s" has no field named "%s".' % (
    524                 message_descriptor.full_name, name))
    525 
    526     if field:
    527       if not self._allow_multiple_scalars and field.containing_oneof:
    528         # Check if there's a different field set in this oneof.
    529         # Note that we ignore the case if the same field was set before, and we
    530         # apply _allow_multiple_scalars to non-scalar fields as well.
    531         which_oneof = message.WhichOneof(field.containing_oneof.name)
    532         if which_oneof is not None and which_oneof != field.name:
    533           raise tokenizer.ParseErrorPreviousToken(
    534               'Field "%s" is specified along with field "%s", another member '
    535               'of oneof "%s" for message type "%s".' % (
    536                   field.name, which_oneof, field.containing_oneof.name,
    537                   message_descriptor.full_name))
    538 
    539       if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    540         tokenizer.TryConsume(':')
    541         merger = self._MergeMessageField
    542       else:
    543         tokenizer.Consume(':')
    544         merger = self._MergeScalarField
    545 
    546       if (field.label == descriptor.FieldDescriptor.LABEL_REPEATED
    547           and tokenizer.TryConsume('[')):
    548         # Short repeated format, e.g. "foo: [1, 2, 3]"
    549         while True:
    550           merger(tokenizer, message, field)
    551           if tokenizer.TryConsume(']'): break
    552           tokenizer.Consume(',')
    553 
    554       else:
    555         merger(tokenizer, message, field)
    556 
    557     else:  # Proto field is unknown.
    558       assert self.allow_unknown_extension
    559       _SkipFieldContents(tokenizer)
    560 
    561     # For historical reasons, fields may optionally be separated by commas or
    562     # semicolons.
    563     if not tokenizer.TryConsume(','):
    564       tokenizer.TryConsume(';')
    565 
    566   def _MergeMessageField(self, tokenizer, message, field):
    567     """Merges a single scalar field into a message.
    568 
    569     Args:
    570       tokenizer: A tokenizer to parse the field value.
    571       message: The message of which field is a member.
    572       field: The descriptor of the field to be merged.
    573 
    574     Raises:
    575       ParseError: In case of text parsing problems.
    576     """
    577     is_map_entry = _IsMapEntry(field)
    578 
    579     if tokenizer.TryConsume('<'):
    580       end_token = '>'
    581     else:
    582       tokenizer.Consume('{')
    583       end_token = '}'
    584 
    585     if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
    586       if field.is_extension:
    587         sub_message = message.Extensions[field].add()
    588       elif is_map_entry:
    589         # pylint: disable=protected-access
    590         sub_message = field.message_type._concrete_class()
    591       else:
    592         sub_message = getattr(message, field.name).add()
    593     else:
    594       if field.is_extension:
    595         sub_message = message.Extensions[field]
    596       else:
    597         sub_message = getattr(message, field.name)
    598       sub_message.SetInParent()
    599 
    600     while not tokenizer.TryConsume(end_token):
    601       if tokenizer.AtEnd():
    602         raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token,))
    603       self._MergeField(tokenizer, sub_message)
    604 
    605     if is_map_entry:
    606       value_cpptype = field.message_type.fields_by_name['value'].cpp_type
    607       if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    608         value = getattr(message, field.name)[sub_message.key]
    609         value.MergeFrom(sub_message.value)
    610       else:
    611         getattr(message, field.name)[sub_message.key] = sub_message.value
    612 
    613   def _MergeScalarField(self, tokenizer, message, field):
    614     """Merges a single scalar field into a message.
    615 
    616     Args:
    617       tokenizer: A tokenizer to parse the field value.
    618       message: A protocol message to record the data.
    619       field: The descriptor of the field to be merged.
    620 
    621     Raises:
    622       ParseError: In case of text parsing problems.
    623       RuntimeError: On runtime errors.
    624     """
    625     _ = self.allow_unknown_extension
    626     value = None
    627 
    628     if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
    629                       descriptor.FieldDescriptor.TYPE_SINT32,
    630                       descriptor.FieldDescriptor.TYPE_SFIXED32):
    631       value = tokenizer.ConsumeInt32()
    632     elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
    633                         descriptor.FieldDescriptor.TYPE_SINT64,
    634                         descriptor.FieldDescriptor.TYPE_SFIXED64):
    635       value = tokenizer.ConsumeInt64()
    636     elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
    637                         descriptor.FieldDescriptor.TYPE_FIXED32):
    638       value = tokenizer.ConsumeUint32()
    639     elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
    640                         descriptor.FieldDescriptor.TYPE_FIXED64):
    641       value = tokenizer.ConsumeUint64()
    642     elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
    643                         descriptor.FieldDescriptor.TYPE_DOUBLE):
    644       value = tokenizer.ConsumeFloat()
    645     elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
    646       value = tokenizer.ConsumeBool()
    647     elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
    648       value = tokenizer.ConsumeString()
    649     elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
    650       value = tokenizer.ConsumeByteString()
    651     elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
    652       value = tokenizer.ConsumeEnum(field)
    653     else:
    654       raise RuntimeError('Unknown field type %d' % field.type)
    655 
    656     if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
    657       if field.is_extension:
    658         message.Extensions[field].append(value)
    659       else:
    660         getattr(message, field.name).append(value)
    661     else:
    662       if field.is_extension:
    663         if not self._allow_multiple_scalars and message.HasExtension(field):
    664           raise tokenizer.ParseErrorPreviousToken(
    665               'Message type "%s" should not have multiple "%s" extensions.' %
    666               (message.DESCRIPTOR.full_name, field.full_name))
    667         else:
    668           message.Extensions[field] = value
    669       else:
    670         if not self._allow_multiple_scalars and message.HasField(field.name):
    671           raise tokenizer.ParseErrorPreviousToken(
    672               'Message type "%s" should not have multiple "%s" fields.' %
    673               (message.DESCRIPTOR.full_name, field.name))
    674         else:
    675           setattr(message, field.name, value)
    676 
    677 
    678 def _SkipFieldContents(tokenizer):
    679   """Skips over contents (value or message) of a field.
    680 
    681   Args:
    682     tokenizer: A tokenizer to parse the field name and values.
    683   """
    684   # Try to guess the type of this field.
    685   # If this field is not a message, there should be a ":" between the
    686   # field name and the field value and also the field value should not
    687   # start with "{" or "<" which indicates the beginning of a message body.
    688   # If there is no ":" or there is a "{" or "<" after ":", this field has
    689   # to be a message or the input is ill-formed.
    690   if tokenizer.TryConsume(':') and not tokenizer.LookingAt(
    691       '{') and not tokenizer.LookingAt('<'):
    692     _SkipFieldValue(tokenizer)
    693   else:
    694     _SkipFieldMessage(tokenizer)
    695 
    696 
    697 def _SkipField(tokenizer):
    698   """Skips over a complete field (name and value/message).
    699 
    700   Args:
    701     tokenizer: A tokenizer to parse the field name and values.
    702   """
    703   if tokenizer.TryConsume('['):
    704     # Consume extension name.
    705     tokenizer.ConsumeIdentifier()
    706     while tokenizer.TryConsume('.'):
    707       tokenizer.ConsumeIdentifier()
    708     tokenizer.Consume(']')
    709   else:
    710     tokenizer.ConsumeIdentifier()
    711 
    712   _SkipFieldContents(tokenizer)
    713 
    714   # For historical reasons, fields may optionally be separated by commas or
    715   # semicolons.
    716   if not tokenizer.TryConsume(','):
    717     tokenizer.TryConsume(';')
    718 
    719 
    720 def _SkipFieldMessage(tokenizer):
    721   """Skips over a field message.
    722 
    723   Args:
    724     tokenizer: A tokenizer to parse the field name and values.
    725   """
    726 
    727   if tokenizer.TryConsume('<'):
    728     delimiter = '>'
    729   else:
    730     tokenizer.Consume('{')
    731     delimiter = '}'
    732 
    733   while not tokenizer.LookingAt('>') and not tokenizer.LookingAt('}'):
    734     _SkipField(tokenizer)
    735 
    736   tokenizer.Consume(delimiter)
    737 
    738 
    739 def _SkipFieldValue(tokenizer):
    740   """Skips over a field value.
    741 
    742   Args:
    743     tokenizer: A tokenizer to parse the field name and values.
    744 
    745   Raises:
    746     ParseError: In case an invalid field value is found.
    747   """
    748   # String/bytes tokens can come in multiple adjacent string literals.
    749   # If we can consume one, consume as many as we can.
    750   if tokenizer.TryConsumeByteString():
    751     while tokenizer.TryConsumeByteString():
    752       pass
    753     return
    754 
    755   if (not tokenizer.TryConsumeIdentifier() and
    756       not tokenizer.TryConsumeInt64() and
    757       not tokenizer.TryConsumeUint64() and
    758       not tokenizer.TryConsumeFloat()):
    759     raise ParseError('Invalid field value: ' + tokenizer.token)
    760 
    761 
class _Tokenizer(object):
  """Protocol buffer text representation tokenizer.

  This class handles the lower level string parsing by splitting it into
  meaningful tokens.

  It was directly ported from the Java protocol buffer API.
  """

  # Whitespace and "#" line comments, skipped between tokens.
  _WHITESPACE = re.compile('(\\s|(#.*$))+', re.MULTILINE)
  # One alternative per token class; tried left-to-right at the current column.
  _TOKEN = re.compile('|'.join([
      r'[a-zA-Z_][0-9a-zA-Z_+-]*',             # an identifier
      r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*',  # a number
  ] + [                                        # quoted str for each quote mark
      r'{qt}([^{qt}\n\\]|\\.)*({qt}|\\?$)'.format(qt=mark) for mark in _QUOTES
  ]))

  # Word characters only; used to validate field identifiers.
  _IDENTIFIER = re.compile(r'\w+')

  def __init__(self, lines):
    """Initializes the tokenizer and reads the first token.

    Args:
      lines: An iterable of text lines to tokenize.
    """
    self._position = 0
    self._line = -1  # 0-based index of the line currently being tokenized.
    self._column = 0  # 0-based column where the current token starts.
    self._token_start = None
    self.token = ''  # The current token; '' once the end of input is reached.
    self._lines = iter(lines)
    self._current_line = ''
    self._previous_line = 0  # Position of the previously read token, used
    self._previous_column = 0  # by ParseErrorPreviousToken().
    self._more_lines = True
    self._SkipWhitespace()
    self.NextToken()

  def LookingAt(self, token):
    """Returns True iff the current token equals `token` (nothing consumed)."""
    return self.token == token

  def AtEnd(self):
    """Checks the end of the text was reached.

    Returns:
      True iff the end was reached.
    """
    return not self.token

  def _PopLine(self):
    """Advances to the next input line once the current one is exhausted."""
    while len(self._current_line) <= self._column:
      try:
        self._current_line = next(self._lines)
      except StopIteration:
        # Out of input: NextToken() will set the token to '' next time.
        self._current_line = ''
        self._more_lines = False
        return
      else:
        self._line += 1
        self._column = 0

  def _SkipWhitespace(self):
    """Skips whitespace and '#' comments, pulling new lines as needed."""
    while True:
      self._PopLine()
      match = self._WHITESPACE.match(self._current_line, self._column)
      if not match:
        break
      length = len(match.group(0))
      self._column += length

  def TryConsume(self, token):
    """Tries to consume a given piece of text.

    Args:
      token: Text to consume.

    Returns:
      True iff the text was consumed.
    """
    if self.token == token:
      self.NextToken()
      return True
    return False

  def Consume(self, token):
    """Consumes a piece of text.

    Args:
      token: Text to consume.

    Raises:
      ParseError: If the text couldn't be consumed.
    """
    if not self.TryConsume(token):
      raise self._ParseError('Expected "%s".' % token)

  def TryConsumeIdentifier(self):
    """Tries to consume an identifier; returns True iff one was consumed."""
    try:
      self.ConsumeIdentifier()
      return True
    except ParseError:
      return False

  def ConsumeIdentifier(self):
    """Consumes protocol message field identifier.

    Returns:
      Identifier string.

    Raises:
      ParseError: If an identifier couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER.match(result):
      raise self._ParseError('Expected identifier.')
    self.NextToken()
    return result

  def ConsumeInt32(self):
    """Consumes a signed 32bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If a signed 32bit integer couldn't be consumed.
    """
    try:
      result = ParseInteger(self.token, is_signed=True, is_long=False)
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeUint32(self):
    """Consumes an unsigned 32bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If an unsigned 32bit integer couldn't be consumed.
    """
    try:
      result = ParseInteger(self.token, is_signed=False, is_long=False)
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeInt64(self):
    """Tries to consume a signed 64bit integer; returns True on success."""
    try:
      self.ConsumeInt64()
      return True
    except ParseError:
      return False

  def ConsumeInt64(self):
    """Consumes a signed 64bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If a signed 64bit integer couldn't be consumed.
    """
    try:
      result = ParseInteger(self.token, is_signed=True, is_long=True)
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeUint64(self):
    """Tries to consume an unsigned 64bit integer; returns True on success."""
    try:
      self.ConsumeUint64()
      return True
    except ParseError:
      return False

  def ConsumeUint64(self):
    """Consumes an unsigned 64bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If an unsigned 64bit integer couldn't be consumed.
    """
    try:
      result = ParseInteger(self.token, is_signed=False, is_long=True)
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeFloat(self):
    """Tries to consume a floating point number; returns True on success."""
    try:
      self.ConsumeFloat()
      return True
    except ParseError:
      return False

  def ConsumeFloat(self):
    """Consumes an floating point number.

    Returns:
      The number parsed.

    Raises:
      ParseError: If a floating point number couldn't be consumed.
    """
    try:
      result = ParseFloat(self.token)
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeBool(self):
    """Consumes a boolean value.

    Returns:
      The bool parsed.

    Raises:
      ParseError: If a boolean value couldn't be consumed.
    """
    try:
      result = ParseBool(self.token)
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeByteString(self):
    """Tries to consume a string/bytes literal; returns True on success."""
    try:
      self.ConsumeByteString()
      return True
    except ParseError:
      return False

  def ConsumeString(self):
    """Consumes a string value.

    Returns:
      The string parsed.

    Raises:
      ParseError: If a string value couldn't be consumed.
    """
    the_bytes = self.ConsumeByteString()
    try:
      # The wire bytes are decoded as UTF-8 into a text string.
      return six.text_type(the_bytes, 'utf-8')
    except UnicodeDecodeError as e:
      raise self._StringParseError(e)

  def ConsumeByteString(self):
    """Consumes a byte array value.

    Returns:
      The array parsed (as a string).

    Raises:
      ParseError: If a byte array value couldn't be consumed.
    """
    # Adjacent string literals are concatenated, like in C or Python.
    the_list = [self._ConsumeSingleByteString()]
    while self.token and self.token[0] in _QUOTES:
      the_list.append(self._ConsumeSingleByteString())
    return b''.join(the_list)

  def _ConsumeSingleByteString(self):
    """Consume one token of a string literal.

    String literals (whether bytes or text) can come in multiple adjacent
    tokens which are automatically concatenated, like in C or Python.  This
    method only consumes one token.

    Returns:
      The token parsed.
    Raises:
      ParseError: When the wrong format data is found.
    """
    text = self.token
    if len(text) < 1 or text[0] not in _QUOTES:
      raise self._ParseError('Expected string but found: %r' % (text,))

    if len(text) < 2 or text[-1] != text[0]:
      raise self._ParseError('String missing ending quote: %r' % (text,))

    try:
      # Strip the surrounding quotes and resolve C-style escape sequences.
      result = text_encoding.CUnescape(text[1:-1])
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeEnum(self, field):
    """Consumes an enum value (name or number) for the given enum field.

    Args:
      field: The enum field descriptor constraining the allowed values.

    Returns:
      The enum value number.

    Raises:
      ParseError: If the token is not a valid value for the enum.
    """
    try:
      result = ParseEnum(field, self.token)
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ParseErrorPreviousToken(self, message):
    """Creates and *returns* a ParseError for the previously read token.

    Args:
      message: A message to set for the exception.

    Returns:
      A ParseError instance.
    """
    # Positions are reported 1-based, while they are tracked 0-based.
    return ParseError('%d:%d : %s' % (
        self._previous_line + 1, self._previous_column + 1, message))

  def _ParseError(self, message):
    """Creates and *returns* a ParseError for the current token."""
    return ParseError('%d:%d : %s' % (
        self._line + 1, self._column + 1, message))

  def _StringParseError(self, e):
    """Creates and *returns* a ParseError wrapping a Unicode decode error."""
    return self._ParseError('Couldn\'t parse string: ' + str(e))

  def NextToken(self):
    """Reads the next meaningful token."""
    # Remember where the token being left behind started, so that
    # ParseErrorPreviousToken() can point at it.
    self._previous_line = self._line
    self._previous_column = self._column

    self._column += len(self.token)
    self._SkipWhitespace()

    if not self._more_lines:
      self.token = ''
      return

    match = self._TOKEN.match(self._current_line, self._column)
    if match:
      token = match.group(0)
      self.token = token
    else:
      # No regular token matched: expose the single character so callers
      # like Consume('{') can handle punctuation.
      self.token = self._current_line[self._column]
   1100 
   1101 
   1102 def ParseInteger(text, is_signed=False, is_long=False):
   1103   """Parses an integer.
   1104 
   1105   Args:
   1106     text: The text to parse.
   1107     is_signed: True if a signed integer must be parsed.
   1108     is_long: True if a long integer must be parsed.
   1109 
   1110   Returns:
   1111     The integer value.
   1112 
   1113   Raises:
   1114     ValueError: Thrown Iff the text is not a valid integer.
   1115   """
   1116   # Do the actual parsing. Exception handling is propagated to caller.
   1117   try:
   1118     # We force 32-bit values to int and 64-bit values to long to make
   1119     # alternate implementations where the distinction is more significant
   1120     # (e.g. the C++ implementation) simpler.
   1121     if is_long:
   1122       result = long(text, 0)
   1123     else:
   1124       result = int(text, 0)
   1125   except ValueError:
   1126     raise ValueError('Couldn\'t parse integer: %s' % text)
   1127 
   1128   # Check if the integer is sane. Exceptions handled by callers.
   1129   checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
   1130   checker.CheckValue(result)
   1131   return result
   1132 
   1133 
   1134 def ParseFloat(text):
   1135   """Parse a floating point number.
   1136 
   1137   Args:
   1138     text: Text to parse.
   1139 
   1140   Returns:
   1141     The number parsed.
   1142 
   1143   Raises:
   1144     ValueError: If a floating point number couldn't be parsed.
   1145   """
   1146   try:
   1147     # Assume Python compatible syntax.
   1148     return float(text)
   1149   except ValueError:
   1150     # Check alternative spellings.
   1151     if _FLOAT_INFINITY.match(text):
   1152       if text[0] == '-':
   1153         return float('-inf')
   1154       else:
   1155         return float('inf')
   1156     elif _FLOAT_NAN.match(text):
   1157       return float('nan')
   1158     else:
   1159       # assume '1.0f' format
   1160       try:
   1161         return float(text.rstrip('f'))
   1162       except ValueError:
   1163         raise ValueError('Couldn\'t parse float: %s' % text)
   1164 
   1165 
   1166 def ParseBool(text):
   1167   """Parse a boolean value.
   1168 
   1169   Args:
   1170     text: Text to parse.
   1171 
   1172   Returns:
   1173     Boolean values parsed
   1174 
   1175   Raises:
   1176     ValueError: If text is not a valid boolean.
   1177   """
   1178   if text in ('true', 't', '1'):
   1179     return True
   1180   elif text in ('false', 'f', '0'):
   1181     return False
   1182   else:
   1183     raise ValueError('Expected "true" or "false".')
   1184 
   1185 
   1186 def ParseEnum(field, value):
   1187   """Parse an enum value.
   1188 
   1189   The value can be specified by a number (the enum value), or by
   1190   a string literal (the enum name).
   1191 
   1192   Args:
   1193     field: Enum field descriptor.
   1194     value: String value.
   1195 
   1196   Returns:
   1197     Enum value number.
   1198 
   1199   Raises:
   1200     ValueError: If the enum value could not be parsed.
   1201   """
   1202   enum_descriptor = field.enum_type
   1203   try:
   1204     number = int(value, 0)
   1205   except ValueError:
   1206     # Identifier.
   1207     enum_value = enum_descriptor.values_by_name.get(value, None)
   1208     if enum_value is None:
   1209       raise ValueError(
   1210           'Enum type "%s" has no value named %s.' % (
   1211               enum_descriptor.full_name, value))
   1212   else:
   1213     # Numeric value.
   1214     enum_value = enum_descriptor.values_by_number.get(number, None)
   1215     if enum_value is None:
   1216       raise ValueError(
   1217           'Enum type "%s" has no value with number %d.' % (
   1218               enum_descriptor.full_name, number))
   1219   return enum_value.number
   1220