Home | History | Annotate | Download | only in scanner
      1 /**
      2  * Copyright (c) 2008, http://www.snakeyaml.org
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 package org.yaml.snakeyaml.scanner;
     17 
     18 import java.nio.ByteBuffer;
     19 import java.nio.charset.CharacterCodingException;
     20 import java.util.ArrayList;
     21 import java.util.HashMap;
     22 import java.util.Iterator;
     23 import java.util.LinkedHashMap;
     24 import java.util.List;
     25 import java.util.Map;
     26 import java.util.regex.Pattern;
     27 
     28 import org.yaml.snakeyaml.error.Mark;
     29 import org.yaml.snakeyaml.error.YAMLException;
     30 import org.yaml.snakeyaml.reader.StreamReader;
     31 import org.yaml.snakeyaml.tokens.AliasToken;
     32 import org.yaml.snakeyaml.tokens.AnchorToken;
     33 import org.yaml.snakeyaml.tokens.BlockEndToken;
     34 import org.yaml.snakeyaml.tokens.BlockEntryToken;
     35 import org.yaml.snakeyaml.tokens.BlockMappingStartToken;
     36 import org.yaml.snakeyaml.tokens.BlockSequenceStartToken;
     37 import org.yaml.snakeyaml.tokens.DirectiveToken;
     38 import org.yaml.snakeyaml.tokens.DocumentEndToken;
     39 import org.yaml.snakeyaml.tokens.DocumentStartToken;
     40 import org.yaml.snakeyaml.tokens.FlowEntryToken;
     41 import org.yaml.snakeyaml.tokens.FlowMappingEndToken;
     42 import org.yaml.snakeyaml.tokens.FlowMappingStartToken;
     43 import org.yaml.snakeyaml.tokens.FlowSequenceEndToken;
     44 import org.yaml.snakeyaml.tokens.FlowSequenceStartToken;
     45 import org.yaml.snakeyaml.tokens.KeyToken;
     46 import org.yaml.snakeyaml.tokens.ScalarToken;
     47 import org.yaml.snakeyaml.tokens.StreamEndToken;
     48 import org.yaml.snakeyaml.tokens.StreamStartToken;
     49 import org.yaml.snakeyaml.tokens.TagToken;
     50 import org.yaml.snakeyaml.tokens.TagTuple;
     51 import org.yaml.snakeyaml.tokens.Token;
     52 import org.yaml.snakeyaml.tokens.ValueToken;
     53 import org.yaml.snakeyaml.util.ArrayStack;
     54 import org.yaml.snakeyaml.util.UriEncoder;
     55 
     56 /**
     57  * <pre>
     58  * Scanner produces tokens of the following types:
     59  * STREAM-START
     60  * STREAM-END
     61  * DIRECTIVE(name, value)
     62  * DOCUMENT-START
     63  * DOCUMENT-END
     64  * BLOCK-SEQUENCE-START
     65  * BLOCK-MAPPING-START
     66  * BLOCK-END
     67  * FLOW-SEQUENCE-START
     68  * FLOW-MAPPING-START
     69  * FLOW-SEQUENCE-END
     70  * FLOW-MAPPING-END
     71  * BLOCK-ENTRY
     72  * FLOW-ENTRY
     73  * KEY
     74  * VALUE
     75  * ALIAS(value)
     76  * ANCHOR(value)
     77  * TAG(value)
     78  * SCALAR(value, plain, style)
     79  * Read comments in the Scanner code for more details.
     80  * </pre>
     81  */
     82 public final class ScannerImpl implements Scanner {
     83     /**
     84      * A regular expression matching characters which are not in the hexadecimal
     85      * set (0-9, A-F, a-f).
     86      */
     87     private final static Pattern NOT_HEXA = Pattern.compile("[^0-9A-Fa-f]");
     88 
     89     /**
     90      * A mapping from an escaped character in the input stream to the character
     91      * that they should be replaced with.
     92      *
     93      * YAML defines several common and a few uncommon escape sequences.
     94      *
     95      * @see <a href="http://www.yaml.org/spec/current.html#id2517668">4.1.6.
     96      *      Escape Sequences</a>
     97      */
     98     public final static Map<Character, String> ESCAPE_REPLACEMENTS = new HashMap<Character, String>();
     99 
    100     /**
    101      * A mapping from a character to a number of bytes to read-ahead for that
    102      * escape sequence. These escape sequences are used to handle unicode
    103      * escaping in the following formats, where H is a hexadecimal character:
    104      *
    105      * <pre>
    106      * &#92;xHH         : escaped 8-bit Unicode character
    107      * &#92;uHHHH       : escaped 16-bit Unicode character
    108      * &#92;UHHHHHHHH   : escaped 32-bit Unicode character
    109      * </pre>
    110      *
    111      * @see <a href="http://yaml.org/spec/1.1/current.html#id872840">5.6. Escape
    112      *      Sequences</a>
    113      */
    114     public final static Map<Character, Integer> ESCAPE_CODES = new HashMap<Character, Integer>();
    115 
    static {
        // Single-character escapes: the key is the character that follows the
        // backslash in a double-quoted scalar, the value is the text it
        // stands for.
        ESCAPE_REPLACEMENTS.put('0', "\0"); // ASCII null
        ESCAPE_REPLACEMENTS.put('a', "\u0007"); // ASCII bell
        ESCAPE_REPLACEMENTS.put('b', "\u0008"); // ASCII backspace
        ESCAPE_REPLACEMENTS.put('t', "\u0009"); // ASCII horizontal tab
        ESCAPE_REPLACEMENTS.put('n', "\n"); // ASCII line feed (0x0A)
        ESCAPE_REPLACEMENTS.put('v', "\u000B"); // ASCII vertical tab
        ESCAPE_REPLACEMENTS.put('f', "\u000C"); // ASCII form feed
        ESCAPE_REPLACEMENTS.put('r', "\r"); // ASCII carriage return (0x0D)
        ESCAPE_REPLACEMENTS.put('e', "\u001B"); // ASCII escape (Esc)
        ESCAPE_REPLACEMENTS.put(' ', "\u0020"); // ASCII space
        ESCAPE_REPLACEMENTS.put('"', "\""); // ASCII double quote
        ESCAPE_REPLACEMENTS.put('\\', "\\"); // ASCII backslash
        ESCAPE_REPLACEMENTS.put('N', "\u0085"); // Unicode next line
        ESCAPE_REPLACEMENTS.put('_', "\u00A0"); // Unicode non-breaking space
        ESCAPE_REPLACEMENTS.put('L', "\u2028"); // Unicode line separator
        ESCAPE_REPLACEMENTS.put('P', "\u2029"); // Unicode paragraph separator

        // Numeric escapes: the key is the character that follows the
        // backslash, the value is how many hexadecimal digits come after it.
        ESCAPE_CODES.put('x', 2); // 8-bit Unicode
        ESCAPE_CODES.put('u', 4); // 16-bit Unicode
        ESCAPE_CODES.put('U', 8); // 32-bit Unicode (supplementary characters are supported)
    }
    157     private final StreamReader reader;
    158     // Had we reached the end of the stream?
    159     private boolean done = false;
    160 
    161     // The number of unclosed '{' and '['. `flow_level == 0` means block
    162     // context.
    163     private int flowLevel = 0;
    164 
    165     // List of processed tokens that are not yet emitted.
    166     private List<Token> tokens;
    167 
    168     // Number of tokens that were emitted through the `get_token` method.
    169     private int tokensTaken = 0;
    170 
    171     // The current indentation level.
    172     private int indent = -1;
    173 
    174     // Past indentation levels.
    175     private ArrayStack<Integer> indents;
    176 
    177     // Variables related to simple keys treatment. See PyYAML.
    178 
    179     /**
    180      * <pre>
    181      * A simple key is a key that is not denoted by the '?' indicator.
    182      * Example of simple keys:
    183      *   ---
    184      *   block simple key: value
    185      *   ? not a simple key:
    186      *   : { flow simple key: value }
    187      * We emit the KEY token before all keys, so when we find a potential
    188      * simple key, we try to locate the corresponding ':' indicator.
    189      * Simple keys should be limited to a single line and 1024 characters.
    190      *
    191      * Can a simple key start at the current position? A simple key may
    192      * start:
    193      * - at the beginning of the line, not counting indentation spaces
    194      *       (in block context),
    195      * - after '{', '[', ',' (in the flow context),
    196      * - after '?', ':', '-' (in the block context).
    197      * In the block context, this flag also signifies if a block collection
    198      * may start at the current position.
    199      * </pre>
    200      */
    201     private boolean allowSimpleKey = true;
    202 
    203     /*
    204      * Keep track of possible simple keys. This is a dictionary. The key is
     * `flow_level`; there can be no more than one possible simple key for each
    206      * level. The value is a SimpleKey record: (token_number, required, index,
    207      * line, column, mark) A simple key may start with ALIAS, ANCHOR, TAG,
    208      * SCALAR(flow), '[', or '{' tokens.
    209      */
    210     private Map<Integer, SimpleKey> possibleSimpleKeys;
    211 
    public ScannerImpl(StreamReader reader) {
        // A LinkedHashMap keeps insertion order, which nextPossibleSimpleKey()
        // relies on when it takes the first entry.
        this.possibleSimpleKeys = new LinkedHashMap<Integer, SimpleKey>();
        this.indents = new ArrayStack<Integer>(10);
        this.tokens = new ArrayList<Token>(100);
        this.reader = reader;
        // Queue the STREAM-START token so it is always the first one emitted.
        fetchStreamStart();
    }
    220 
    221     /**
    222      * Check whether the next token is one of the given types.
    223      */
    224     public boolean checkToken(Token.ID... choices) {
    225         while (needMoreTokens()) {
    226             fetchMoreTokens();
    227         }
    228         if (!this.tokens.isEmpty()) {
    229             if (choices.length == 0) {
    230                 return true;
    231             }
    232             // since profiler puts this method on top (it is used a lot), we
    233             // should not use 'foreach' here because of the performance reasons
    234             Token.ID first = this.tokens.get(0).getTokenId();
    235             for (int i = 0; i < choices.length; i++) {
    236                 if (first == choices[i]) {
    237                     return true;
    238                 }
    239             }
    240         }
    241         return false;
    242     }
    243 
    244     /**
    245      * Return the next token, but do not delete it from the queue.
    246      */
    247     public Token peekToken() {
    248         while (needMoreTokens()) {
    249             fetchMoreTokens();
    250         }
    251         return this.tokens.get(0);
    252     }
    253 
    254     /**
    255      * Return the next token, removing it from the queue.
    256      */
    257     public Token getToken() {
    258         if (!this.tokens.isEmpty()) {
    259             this.tokensTaken++;
    260             return this.tokens.remove(0);
    261         }
    262         return null;
    263     }
    264 
    265     // Private methods.
    266     /**
    267      * Returns true if more tokens should be scanned.
    268      */
    269     private boolean needMoreTokens() {
    270         // If we are done, we do not require more tokens.
    271         if (this.done) {
    272             return false;
    273         }
    274         // If we aren't done, but we have no tokens, we need to scan more.
    275         if (this.tokens.isEmpty()) {
    276             return true;
    277         }
    278         // The current token may be a potential simple key, so we
    279         // need to look further.
    280         stalePossibleSimpleKeys();
    281         return nextPossibleSimpleKey() == this.tokensTaken;
    282     }
    283 
    284     /**
    285      * Fetch one or more tokens from the StreamReader.
    286      */
    287     private void fetchMoreTokens() {
    288         // Eat whitespaces and comments until we reach the next token.
    289         scanToNextToken();
    290         // Remove obsolete possible simple keys.
    291         stalePossibleSimpleKeys();
    292         // Compare the current indentation and column. It may add some tokens
    293         // and decrease the current indentation level.
    294         unwindIndent(reader.getColumn());
    295         // Peek the next character, to decide what the next group of tokens
    296         // will look like.
    297         char ch = reader.peek();
    298         switch (ch) {
    299         case '\0':
    300             // Is it the end of stream?
    301             fetchStreamEnd();
    302             return;
    303         case '%':
    304             // Is it a directive?
    305             if (checkDirective()) {
    306                 fetchDirective();
    307                 return;
    308             }
    309             break;
    310         case '-':
    311             // Is it the document start?
    312             if (checkDocumentStart()) {
    313                 fetchDocumentStart();
    314                 return;
    315                 // Is it the block entry indicator?
    316             } else if (checkBlockEntry()) {
    317                 fetchBlockEntry();
    318                 return;
    319             }
    320             break;
    321         case '.':
    322             // Is it the document end?
    323             if (checkDocumentEnd()) {
    324                 fetchDocumentEnd();
    325                 return;
    326             }
    327             break;
    328         // TODO support for BOM within a stream. (not implemented in PyYAML)
    329         case '[':
    330             // Is it the flow sequence start indicator?
    331             fetchFlowSequenceStart();
    332             return;
    333         case '{':
    334             // Is it the flow mapping start indicator?
    335             fetchFlowMappingStart();
    336             return;
    337         case ']':
    338             // Is it the flow sequence end indicator?
    339             fetchFlowSequenceEnd();
    340             return;
    341         case '}':
    342             // Is it the flow mapping end indicator?
    343             fetchFlowMappingEnd();
    344             return;
    345         case ',':
    346             // Is it the flow entry indicator?
    347             fetchFlowEntry();
    348             return;
    349             // see block entry indicator above
    350         case '?':
    351             // Is it the key indicator?
    352             if (checkKey()) {
    353                 fetchKey();
    354                 return;
    355             }
    356             break;
    357         case ':':
    358             // Is it the value indicator?
    359             if (checkValue()) {
    360                 fetchValue();
    361                 return;
    362             }
    363             break;
    364         case '*':
    365             // Is it an alias?
    366             fetchAlias();
    367             return;
    368         case '&':
    369             // Is it an anchor?
    370             fetchAnchor();
    371             return;
    372         case '!':
    373             // Is it a tag?
    374             fetchTag();
    375             return;
    376         case '|':
    377             // Is it a literal scalar?
    378             if (this.flowLevel == 0) {
    379                 fetchLiteral();
    380                 return;
    381             }
    382             break;
    383         case '>':
    384             // Is it a folded scalar?
    385             if (this.flowLevel == 0) {
    386                 fetchFolded();
    387                 return;
    388             }
    389             break;
    390         case '\'':
    391             // Is it a single quoted scalar?
    392             fetchSingle();
    393             return;
    394         case '"':
    395             // Is it a double quoted scalar?
    396             fetchDouble();
    397             return;
    398         }
    399         // It must be a plain scalar then.
    400         if (checkPlain()) {
    401             fetchPlain();
    402             return;
    403         }
    404         // No? It's an error. Let's produce a nice error message.We do this by
    405         // converting escaped characters into their escape sequences. This is a
    406         // backwards use of the ESCAPE_REPLACEMENTS map.
    407         String chRepresentation = String.valueOf(ch);
    408         for (Character s : ESCAPE_REPLACEMENTS.keySet()) {
    409             String v = ESCAPE_REPLACEMENTS.get(s);
    410             if (v.equals(chRepresentation)) {
    411                 chRepresentation = "\\" + s;// ' ' -> '\t'
    412                 break;
    413             }
    414         }
    415         if (ch == '\t')
    416             chRepresentation += "(TAB)";
    417         String text = String
    418                 .format("found character '%s' that cannot start any token. (Do not use %s for indentation)",
    419                         chRepresentation, chRepresentation);
    420         throw new ScannerException("while scanning for the next token", null, text,
    421                 reader.getMark());
    422     }
    423 
    424     // Simple keys treatment.
    425 
    426     /**
    427      * Return the number of the nearest possible simple key. Actually we don't
    428      * need to loop through the whole dictionary.
    429      */
    430     private int nextPossibleSimpleKey() {
    431         /*
    432          * the implementation is not as in PyYAML. Because
    433          * this.possibleSimpleKeys is ordered we can simply take the first key
    434          */
    435         if (!this.possibleSimpleKeys.isEmpty()) {
    436             return this.possibleSimpleKeys.values().iterator().next().getTokenNumber();
    437         }
    438         return -1;
    439     }
    440 
    441     /**
    442      * <pre>
    443      * Remove entries that are no longer possible simple keys. According to
    444      * the YAML specification, simple keys
    445      * - should be limited to a single line,
    446      * - should be no longer than 1024 characters.
    447      * Disabling this procedure will allow simple keys of any length and
    448      * height (may cause problems if indentation is broken though).
    449      * </pre>
    450      */
    451     private void stalePossibleSimpleKeys() {
    452         if (!this.possibleSimpleKeys.isEmpty()) {
    453             for (Iterator<SimpleKey> iterator = this.possibleSimpleKeys.values().iterator(); iterator
    454                     .hasNext();) {
    455                 SimpleKey key = iterator.next();
    456                 if ((key.getLine() != reader.getLine())
    457                         || (reader.getIndex() - key.getIndex() > 1024)) {
    458                     // If the key is not on the same line as the current
    459                     // position OR the difference in column between the token
    460                     // start and the current position is more than the maximum
    461                     // simple key length, then this cannot be a simple key.
    462                     if (key.isRequired()) {
    463                         // If the key was required, this implies an error
    464                         // condition.
    465                         throw new ScannerException("while scanning a simple key", key.getMark(),
    466                                 "could not find expected ':'", reader.getMark());
    467                     }
    468                     iterator.remove();
    469                 }
    470             }
    471         }
    472     }
    473 
    474     /**
    475      * The next token may start a simple key. We check if it's possible and save
    476      * its position. This function is called for ALIAS, ANCHOR, TAG,
    477      * SCALAR(flow), '[', and '{'.
    478      */
    479     private void savePossibleSimpleKey() {
    480         // The next token may start a simple key. We check if it's possible
    481         // and save its position. This function is called for
    482         // ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
    483 
    484         // Check if a simple key is required at the current position.
    485         // A simple key is required if this position is the root flowLevel, AND
    486         // the current indentation level is the same as the last indent-level.
    487         boolean required = (this.flowLevel == 0) && (this.indent == this.reader.getColumn());
    488 
    489         if (allowSimpleKey || !required) {
    490             // A simple key is required only if it is the first token in the
    491             // current line. Therefore it is always allowed.
    492         } else {
    493             throw new YAMLException(
    494                     "A simple key is required only if it is the first token in the current line");
    495         }
    496 
    497         // The next token might be a simple key. Let's save it's number and
    498         // position.
    499         if (this.allowSimpleKey) {
    500             removePossibleSimpleKey();
    501             int tokenNumber = this.tokensTaken + this.tokens.size();
    502             SimpleKey key = new SimpleKey(tokenNumber, required, reader.getIndex(),
    503                     reader.getLine(), this.reader.getColumn(), this.reader.getMark());
    504             this.possibleSimpleKeys.put(this.flowLevel, key);
    505         }
    506     }
    507 
    508     /**
    509      * Remove the saved possible key position at the current flow level.
    510      */
    511     private void removePossibleSimpleKey() {
    512         SimpleKey key = possibleSimpleKeys.remove(flowLevel);
    513         if (key != null && key.isRequired()) {
    514             throw new ScannerException("while scanning a simple key", key.getMark(),
    515                     "could not find expected ':'", reader.getMark());
    516         }
    517     }
    518 
    519     // Indentation functions.
    520 
    521     /**
     * Handle implicitly ending multiple levels of block nodes by decreased
    523      * indentation. This function becomes important on lines 4 and 7 of this
    524      * example:
    525      *
    526      * <pre>
    527      * 1) book one:
    528      * 2)   part one:
    529      * 3)     chapter one
    530      * 4)   part two:
    531      * 5)     chapter one
    532      * 6)     chapter two
    533      * 7) book two:
    534      * </pre>
    535      *
     * In flow context, tokens should respect indentation. Actually the
     * condition should be `self.indent &gt;= column` according to the spec. But
     * this condition will prohibit intuitively correct constructions such as
     *
     * <pre>
     * key : {
     * }
     * </pre>
    540      */
    541     private void unwindIndent(int col) {
    542         // In the flow context, indentation is ignored. We make the scanner less
    543         // restrictive then specification requires.
    544         if (this.flowLevel != 0) {
    545             return;
    546         }
    547 
    548         // In block context, we may need to issue the BLOCK-END tokens.
    549         while (this.indent > col) {
    550             Mark mark = reader.getMark();
    551             this.indent = this.indents.pop();
    552             this.tokens.add(new BlockEndToken(mark, mark));
    553         }
    554     }
    555 
    556     /**
    557      * Check if we need to increase indentation.
    558      */
    559     private boolean addIndent(int column) {
    560         if (this.indent < column) {
    561             this.indents.push(this.indent);
    562             this.indent = column;
    563             return true;
    564         }
    565         return false;
    566     }
    567 
    568     // Fetchers.
    569 
    570     /**
    571      * We always add STREAM-START as the first token and STREAM-END as the last
    572      * token.
    573      */
    574     private void fetchStreamStart() {
    575         // Read the token.
    576         Mark mark = reader.getMark();
    577 
    578         // Add STREAM-START.
    579         Token token = new StreamStartToken(mark, mark);
    580         this.tokens.add(token);
    581     }
    582 
    583     private void fetchStreamEnd() {
    584         // Set the current intendation to -1.
    585         unwindIndent(-1);
    586 
    587         // Reset simple keys.
    588         removePossibleSimpleKey();
    589         this.allowSimpleKey = false;
    590         this.possibleSimpleKeys.clear();
    591 
    592         // Read the token.
    593         Mark mark = reader.getMark();
    594 
    595         // Add STREAM-END.
    596         Token token = new StreamEndToken(mark, mark);
    597         this.tokens.add(token);
    598 
    599         // The stream is finished.
    600         this.done = true;
    601     }
    602 
    603     /**
    604      * Fetch a YAML directive. Directives are presentation details that are
    605      * interpreted as instructions to the processor. YAML defines two kinds of
    606      * directives, YAML and TAG; all other types are reserved for future use.
    607      *
    608      * @see <a href="http://www.yaml.org/spec/1.1/#id864824"></a>
    609      */
    610     private void fetchDirective() {
    611         // Set the current intendation to -1.
    612         unwindIndent(-1);
    613 
    614         // Reset simple keys.
    615         removePossibleSimpleKey();
    616         this.allowSimpleKey = false;
    617 
    618         // Scan and add DIRECTIVE.
    619         Token tok = scanDirective();
    620         this.tokens.add(tok);
    621     }
    622 
    623     /**
    624      * Fetch a document-start token ("---").
    625      */
    626     private void fetchDocumentStart() {
    627         fetchDocumentIndicator(true);
    628     }
    629 
    630     /**
    631      * Fetch a document-end token ("...").
    632      */
    633     private void fetchDocumentEnd() {
    634         fetchDocumentIndicator(false);
    635     }
    636 
    637     /**
     * Fetch a document indicator, either "---" for "document-start", or else
     * "..." for "document-end". The type is chosen by the given boolean.
    640      */
    641     private void fetchDocumentIndicator(boolean isDocumentStart) {
    642         // Set the current intendation to -1.
    643         unwindIndent(-1);
    644 
    645         // Reset simple keys. Note that there could not be a block collection
    646         // after '---'.
    647         removePossibleSimpleKey();
    648         this.allowSimpleKey = false;
    649 
    650         // Add DOCUMENT-START or DOCUMENT-END.
    651         Mark startMark = reader.getMark();
    652         reader.forward(3);
    653         Mark endMark = reader.getMark();
    654         Token token;
    655         if (isDocumentStart) {
    656             token = new DocumentStartToken(startMark, endMark);
    657         } else {
    658             token = new DocumentEndToken(startMark, endMark);
    659         }
    660         this.tokens.add(token);
    661     }
    662 
    663     private void fetchFlowSequenceStart() {
    664         fetchFlowCollectionStart(false);
    665     }
    666 
    667     private void fetchFlowMappingStart() {
    668         fetchFlowCollectionStart(true);
    669     }
    670 
    671     /**
    672      * Fetch a flow-style collection start, which is either a sequence or a
    673      * mapping. The type is determined by the given boolean.
    674      *
    675      * A flow-style collection is in a format similar to JSON. Sequences are
    676      * started by '[' and ended by ']'; mappings are started by '{' and ended by
    677      * '}'.
    678      *
    679      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
    680      *
    681      * @param isMappingStart
    682      */
    683     private void fetchFlowCollectionStart(boolean isMappingStart) {
    684         // '[' and '{' may start a simple key.
    685         savePossibleSimpleKey();
    686 
    687         // Increase the flow level.
    688         this.flowLevel++;
    689 
    690         // Simple keys are allowed after '[' and '{'.
    691         this.allowSimpleKey = true;
    692 
    693         // Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
    694         Mark startMark = reader.getMark();
    695         reader.forward(1);
    696         Mark endMark = reader.getMark();
    697         Token token;
    698         if (isMappingStart) {
    699             token = new FlowMappingStartToken(startMark, endMark);
    700         } else {
    701             token = new FlowSequenceStartToken(startMark, endMark);
    702         }
    703         this.tokens.add(token);
    704     }
    705 
    706     private void fetchFlowSequenceEnd() {
    707         fetchFlowCollectionEnd(false);
    708     }
    709 
    710     private void fetchFlowMappingEnd() {
    711         fetchFlowCollectionEnd(true);
    712     }
    713 
    714     /**
    715      * Fetch a flow-style collection end, which is either a sequence or a
    716      * mapping. The type is determined by the given boolean.
    717      *
    718      * A flow-style collection is in a format similar to JSON. Sequences are
    719      * started by '[' and ended by ']'; mappings are started by '{' and ended by
    720      * '}'.
    721      *
    722      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
    723      */
    724     private void fetchFlowCollectionEnd(boolean isMappingEnd) {
    725         // Reset possible simple key on the current level.
    726         removePossibleSimpleKey();
    727 
    728         // Decrease the flow level.
    729         this.flowLevel--;
    730 
    731         // No simple keys after ']' or '}'.
    732         this.allowSimpleKey = false;
    733 
    734         // Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
    735         Mark startMark = reader.getMark();
    736         reader.forward();
    737         Mark endMark = reader.getMark();
    738         Token token;
    739         if (isMappingEnd) {
    740             token = new FlowMappingEndToken(startMark, endMark);
    741         } else {
    742             token = new FlowSequenceEndToken(startMark, endMark);
    743         }
    744         this.tokens.add(token);
    745     }
    746 
    747     /**
    748      * Fetch an entry in the flow style. Flow-style entries occur either
    749      * immediately after the start of a collection, or else after a comma.
    750      *
    751      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
    752      */
    753     private void fetchFlowEntry() {
    754         // Simple keys are allowed after ','.
    755         this.allowSimpleKey = true;
    756 
    757         // Reset possible simple key on the current level.
    758         removePossibleSimpleKey();
    759 
    760         // Add FLOW-ENTRY.
    761         Mark startMark = reader.getMark();
    762         reader.forward();
    763         Mark endMark = reader.getMark();
    764         Token token = new FlowEntryToken(startMark, endMark);
    765         this.tokens.add(token);
    766     }
    767 
    768     /**
    769      * Fetch an entry in the block style.
    770      *
    771      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
    772      */
    773     private void fetchBlockEntry() {
    774         // Block context needs additional checks.
    775         if (this.flowLevel == 0) {
    776             // Are we allowed to start a new entry?
    777             if (!this.allowSimpleKey) {
    778                 throw new ScannerException(null, null, "sequence entries are not allowed here",
    779                         reader.getMark());
    780             }
    781 
    782             // We may need to add BLOCK-SEQUENCE-START.
    783             if (addIndent(this.reader.getColumn())) {
    784                 Mark mark = reader.getMark();
    785                 this.tokens.add(new BlockSequenceStartToken(mark, mark));
    786             }
    787         } else {
    788             // It's an error for the block entry to occur in the flow
    789             // context,but we let the parser detect this.
    790         }
    791         // Simple keys are allowed after '-'.
    792         this.allowSimpleKey = true;
    793 
    794         // Reset possible simple key on the current level.
    795         removePossibleSimpleKey();
    796 
    797         // Add BLOCK-ENTRY.
    798         Mark startMark = reader.getMark();
    799         reader.forward();
    800         Mark endMark = reader.getMark();
    801         Token token = new BlockEntryToken(startMark, endMark);
    802         this.tokens.add(token);
    803     }
    804 
    805     /**
    806      * Fetch a key in a block-style mapping.
    807      *
    808      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
    809      */
    810     private void fetchKey() {
    811         // Block context needs additional checks.
    812         if (this.flowLevel == 0) {
    813             // Are we allowed to start a key (not necessary a simple)?
    814             if (!this.allowSimpleKey) {
    815                 throw new ScannerException(null, null, "mapping keys are not allowed here",
    816                         reader.getMark());
    817             }
    818             // We may need to add BLOCK-MAPPING-START.
    819             if (addIndent(this.reader.getColumn())) {
    820                 Mark mark = reader.getMark();
    821                 this.tokens.add(new BlockMappingStartToken(mark, mark));
    822             }
    823         }
    824         // Simple keys are allowed after '?' in the block context.
    825         this.allowSimpleKey = this.flowLevel == 0;
    826 
    827         // Reset possible simple key on the current level.
    828         removePossibleSimpleKey();
    829 
    830         // Add KEY.
    831         Mark startMark = reader.getMark();
    832         reader.forward();
    833         Mark endMark = reader.getMark();
    834         Token token = new KeyToken(startMark, endMark);
    835         this.tokens.add(token);
    836     }
    837 
    838     /**
    839      * Fetch a value in a block-style mapping.
    840      *
    841      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
    842      */
    843     private void fetchValue() {
    844         // Do we determine a simple key?
    845         SimpleKey key = this.possibleSimpleKeys.remove(this.flowLevel);
    846         if (key != null) {
    847             // Add KEY.
    848             this.tokens.add(key.getTokenNumber() - this.tokensTaken, new KeyToken(key.getMark(),
    849                     key.getMark()));
    850 
    851             // If this key starts a new block mapping, we need to add
    852             // BLOCK-MAPPING-START.
    853             if (this.flowLevel == 0) {
    854                 if (addIndent(key.getColumn())) {
    855                     this.tokens.add(key.getTokenNumber() - this.tokensTaken,
    856                             new BlockMappingStartToken(key.getMark(), key.getMark()));
    857                 }
    858             }
    859             // There cannot be two simple keys one after another.
    860             this.allowSimpleKey = false;
    861 
    862         } else {
    863             // It must be a part of a complex key.
    864             // Block context needs additional checks. Do we really need them?
    865             // They will be caught by the parser anyway.
    866             if (this.flowLevel == 0) {
    867 
    868                 // We are allowed to start a complex value if and only if we can
    869                 // start a simple key.
    870                 if (!this.allowSimpleKey) {
    871                     throw new ScannerException(null, null, "mapping values are not allowed here",
    872                             reader.getMark());
    873                 }
    874             }
    875 
    876             // If this value starts a new block mapping, we need to add
    877             // BLOCK-MAPPING-START. It will be detected as an error later by
    878             // the parser.
    879             if (flowLevel == 0) {
    880                 if (addIndent(reader.getColumn())) {
    881                     Mark mark = reader.getMark();
    882                     this.tokens.add(new BlockMappingStartToken(mark, mark));
    883                 }
    884             }
    885 
    886             // Simple keys are allowed after ':' in the block context.
    887             allowSimpleKey = flowLevel == 0;
    888 
    889             // Reset possible simple key on the current level.
    890             removePossibleSimpleKey();
    891         }
    892         // Add VALUE.
    893         Mark startMark = reader.getMark();
    894         reader.forward();
    895         Mark endMark = reader.getMark();
    896         Token token = new ValueToken(startMark, endMark);
    897         this.tokens.add(token);
    898     }
    899 
    900     /**
    901      * Fetch an alias, which is a reference to an anchor. Aliases take the
    902      * format:
    903      *
    904      * <pre>
    905      * *(anchor name)
    906      * </pre>
    907      *
    908      * @see <a href="http://www.yaml.org/spec/1.1/#id863390"></a>
    909      */
    910     private void fetchAlias() {
    911         // ALIAS could be a simple key.
    912         savePossibleSimpleKey();
    913 
    914         // No simple keys after ALIAS.
    915         this.allowSimpleKey = false;
    916 
    917         // Scan and add ALIAS.
    918         Token tok = scanAnchor(false);
    919         this.tokens.add(tok);
    920     }
    921 
    922     /**
    923      * Fetch an anchor. Anchors take the form:
    924      *
    925      * <pre>
    926      * &(anchor name)
    927      * </pre>
    928      *
    929      * @see <a href="http://www.yaml.org/spec/1.1/#id863390"></a>
    930      */
    931     private void fetchAnchor() {
    932         // ANCHOR could start a simple key.
    933         savePossibleSimpleKey();
    934 
    935         // No simple keys after ANCHOR.
    936         this.allowSimpleKey = false;
    937 
    938         // Scan and add ANCHOR.
    939         Token tok = scanAnchor(true);
    940         this.tokens.add(tok);
    941     }
    942 
    943     /**
    944      * Fetch a tag. Tags take a complex form.
    945      *
    946      * @see <a href="http://www.yaml.org/spec/1.1/#id861700"></a>
    947      */
    948     private void fetchTag() {
    949         // TAG could start a simple key.
    950         savePossibleSimpleKey();
    951 
    952         // No simple keys after TAG.
    953         this.allowSimpleKey = false;
    954 
    955         // Scan and add TAG.
    956         Token tok = scanTag();
    957         this.tokens.add(tok);
    958     }
    959 
    960     /**
    961      * Fetch a literal scalar, denoted with a vertical-bar. This is the type
    962      * best used for source code and other content, such as binary data, which
    963      * must be included verbatim.
    964      *
    965      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
    966      */
    967     private void fetchLiteral() {
    968         fetchBlockScalar('|');
    969     }
    970 
    971     /**
    972      * Fetch a folded scalar, denoted with a greater-than sign. This is the type
    973      * best used for long content, such as the text of a chapter or description.
    974      *
    975      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
    976      */
    977     private void fetchFolded() {
    978         fetchBlockScalar('>');
    979     }
    980 
    981     /**
    982      * Fetch a block scalar (literal or folded).
    983      *
    984      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
    985      *
    986      * @param style
    987      */
    988     private void fetchBlockScalar(char style) {
    989         // A simple key may follow a block scalar.
    990         this.allowSimpleKey = true;
    991 
    992         // Reset possible simple key on the current level.
    993         removePossibleSimpleKey();
    994 
    995         // Scan and add SCALAR.
    996         Token tok = scanBlockScalar(style);
    997         this.tokens.add(tok);
    998     }
    999 
   1000     /**
   1001      * Fetch a single-quoted (') scalar.
   1002      */
   1003     private void fetchSingle() {
   1004         fetchFlowScalar('\'');
   1005     }
   1006 
   1007     /**
   1008      * Fetch a double-quoted (") scalar.
   1009      */
   1010     private void fetchDouble() {
   1011         fetchFlowScalar('"');
   1012     }
   1013 
   1014     /**
   1015      * Fetch a flow scalar (single- or double-quoted).
   1016      *
   1017      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
   1018      *
   1019      * @param style
   1020      */
   1021     private void fetchFlowScalar(char style) {
   1022         // A flow scalar could be a simple key.
   1023         savePossibleSimpleKey();
   1024 
   1025         // No simple keys after flow scalars.
   1026         this.allowSimpleKey = false;
   1027 
   1028         // Scan and add SCALAR.
   1029         Token tok = scanFlowScalar(style);
   1030         this.tokens.add(tok);
   1031     }
   1032 
   1033     /**
   1034      * Fetch a plain scalar.
   1035      */
   1036     private void fetchPlain() {
   1037         // A plain scalar could be a simple key.
   1038         savePossibleSimpleKey();
   1039 
   1040         // No simple keys after plain scalars. But note that `scan_plain` will
   1041         // change this flag if the scan is finished at the beginning of the
   1042         // line.
   1043         this.allowSimpleKey = false;
   1044 
   1045         // Scan and add SCALAR. May change `allow_simple_key`.
   1046         Token tok = scanPlain();
   1047         this.tokens.add(tok);
   1048     }
   1049 
   1050     // Checkers.
   1051     /**
   1052      * Returns true if the next thing on the reader is a directive, given that
   1053      * the leading '%' has already been checked.
   1054      *
   1055      * @see <a href="http://www.yaml.org/spec/1.1/#id864824"></a>
   1056      */
   1057     private boolean checkDirective() {
   1058         // DIRECTIVE: ^ '%' ...
   1059         // The '%' indicator is already checked.
   1060         return reader.getColumn() == 0;
   1061     }
   1062 
   1063     /**
   1064      * Returns true if the next thing on the reader is a document-start ("---").
   1065      * A document-start is always followed immediately by a new line.
   1066      */
   1067     private boolean checkDocumentStart() {
   1068         // DOCUMENT-START: ^ '---' (' '|'\n')
   1069         if (reader.getColumn() == 0) {
   1070             if ("---".equals(reader.prefix(3)) && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
   1071                 return true;
   1072             }
   1073         }
   1074         return false;
   1075     }
   1076 
   1077     /**
   1078      * Returns true if the next thing on the reader is a document-end ("..."). A
   1079      * document-end is always followed immediately by a new line.
   1080      */
   1081     private boolean checkDocumentEnd() {
   1082         // DOCUMENT-END: ^ '...' (' '|'\n')
   1083         if (reader.getColumn() == 0) {
   1084             if ("...".equals(reader.prefix(3)) && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
   1085                 return true;
   1086             }
   1087         }
   1088         return false;
   1089     }
   1090 
   1091     /**
   1092      * Returns true if the next thing on the reader is a block token.
   1093      */
   1094     private boolean checkBlockEntry() {
   1095         // BLOCK-ENTRY: '-' (' '|'\n')
   1096         return Constant.NULL_BL_T_LINEBR.has(reader.peek(1));
   1097     }
   1098 
   1099     /**
   1100      * Returns true if the next thing on the reader is a key token.
   1101      */
   1102     private boolean checkKey() {
   1103         // KEY(flow context): '?'
   1104         if (this.flowLevel != 0) {
   1105             return true;
   1106         } else {
   1107             // KEY(block context): '?' (' '|'\n')
   1108             return Constant.NULL_BL_T_LINEBR.has(reader.peek(1));
   1109         }
   1110     }
   1111 
   1112     /**
   1113      * Returns true if the next thing on the reader is a value token.
   1114      */
   1115     private boolean checkValue() {
   1116         // VALUE(flow context): ':'
   1117         if (flowLevel != 0) {
   1118             return true;
   1119         } else {
   1120             // VALUE(block context): ':' (' '|'\n')
   1121             return Constant.NULL_BL_T_LINEBR.has(reader.peek(1));
   1122         }
   1123     }
   1124 
   1125     /**
   1126      * Returns true if the next thing on the reader is a plain token.
   1127      */
   1128     private boolean checkPlain() {
   1129         /**
   1130          * <pre>
   1131          * A plain scalar may start with any non-space character except:
   1132          *   '-', '?', ':', ',', '[', ']', '{', '}',
   1133          *   '#', '&amp;', '*', '!', '|', '&gt;', '\'', '\&quot;',
   1134          *   '%', '@', '`'.
   1135          *
   1136          * It may also start with
   1137          *   '-', '?', ':'
   1138          * if it is followed by a non-space character.
   1139          *
   1140          * Note that we limit the last rule to the block context (except the
   1141          * '-' character) because we want the flow context to be space
   1142          * independent.
   1143          * </pre>
   1144          */
   1145         char ch = reader.peek();
   1146         // If the next char is NOT one of the forbidden chars above or
   1147         // whitespace, then this is the start of a plain scalar.
   1148         return Constant.NULL_BL_T_LINEBR.hasNo(ch, "-?:,[]{}#&*!|>\'\"%@`")
   1149                 || (Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(1)) && (ch == '-' || (this.flowLevel == 0 && "?:"
   1150                         .indexOf(ch) != -1)));
   1151     }
   1152 
   1153     // Scanners.
   1154 
   1155     /**
   1156      * <pre>
   1157      * We ignore spaces, line breaks and comments.
   1158      * If we find a line break in the block context, we set the flag
   1159      * `allow_simple_key` on.
   1160      * The byte order mark is stripped if it's the first character in the
   1161      * stream. We do not yet support BOM inside the stream as the
   1162      * specification requires. Any such mark will be considered as a part
   1163      * of the document.
   1164      * TODO: We need to make tab handling rules more sane. A good rule is
   1165      *   Tabs cannot precede tokens
   1166      *   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
   1167      *   KEY(block), VALUE(block), BLOCK-ENTRY
   1168      * So the checking code is
   1169      *   if &lt;TAB&gt;:
   1170      *       self.allow_simple_keys = False
   1171      * We also need to add the check for `allow_simple_keys == True` to
   1172      * `unwind_indent` before issuing BLOCK-END.
   1173      * Scanners for block, flow, and plain scalars need to be modified.
   1174      * </pre>
   1175      */
   1176     private void scanToNextToken() {
   1177         // If there is a byte order mark (BOM) at the beginning of the stream,
   1178         // forward past it.
   1179         if (reader.getIndex() == 0 && reader.peek() == '\uFEFF') {
   1180             reader.forward();
   1181         }
   1182         boolean found = false;
   1183         while (!found) {
   1184             int ff = 0;
   1185             // Peek ahead until we find the first non-space character, then
   1186             // move forward directly to that character.
   1187             while (reader.peek(ff) == ' ') {
   1188                 ff++;
   1189             }
   1190             if (ff > 0) {
   1191                 reader.forward(ff);
   1192             }
   1193             // If the character we have skipped forward to is a comment (#),
   1194             // then peek ahead until we find the next end of line. YAML
   1195             // comments are from a # to the next new-line. We then forward
   1196             // past the comment.
   1197             if (reader.peek() == '#') {
   1198                 ff = 0;
   1199                 while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
   1200                     ff++;
   1201                 }
   1202                 if (ff > 0) {
   1203                     reader.forward(ff);
   1204                 }
   1205             }
   1206             // If we scanned a line break, then (depending on flow level),
   1207             // simple keys may be allowed.
   1208             if (scanLineBreak().length() != 0) {// found a line-break
   1209                 if (this.flowLevel == 0) {
   1210                     // Simple keys are allowed at flow-level 0 after a line
   1211                     // break
   1212                     this.allowSimpleKey = true;
   1213                 }
   1214             } else {
   1215                 found = true;
   1216             }
   1217         }
   1218     }
   1219 
   1220     @SuppressWarnings({ "unchecked", "rawtypes" })
   1221     private Token scanDirective() {
   1222         // See the specification for details.
   1223         Mark startMark = reader.getMark();
   1224         Mark endMark;
   1225         reader.forward();
   1226         String name = scanDirectiveName(startMark);
   1227         List<?> value = null;
   1228         if ("YAML".equals(name)) {
   1229             value = scanYamlDirectiveValue(startMark);
   1230             endMark = reader.getMark();
   1231         } else if ("TAG".equals(name)) {
   1232             value = scanTagDirectiveValue(startMark);
   1233             endMark = reader.getMark();
   1234         } else {
   1235             endMark = reader.getMark();
   1236             int ff = 0;
   1237             while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
   1238                 ff++;
   1239             }
   1240             if (ff > 0) {
   1241                 reader.forward(ff);
   1242             }
   1243         }
   1244         scanDirectiveIgnoredLine(startMark);
   1245         return new DirectiveToken(name, value, startMark, endMark);
   1246     }
   1247 
   1248     /**
   1249      * Scan a directive name. Directive names are a series of non-space
   1250      * characters.
   1251      *
   1252      * @see <a href="http://www.yaml.org/spec/1.1/#id895217"></a>
   1253      */
   1254     private String scanDirectiveName(Mark startMark) {
   1255         // See the specification for details.
   1256         int length = 0;
   1257         // A Directive-name is a sequence of alphanumeric characters
   1258         // (a-z,A-Z,0-9). We scan until we find something that isn't.
   1259         // FIXME this disagrees with the specification.
   1260         char ch = reader.peek(length);
   1261         while (Constant.ALPHA.has(ch)) {
   1262             length++;
   1263             ch = reader.peek(length);
   1264         }
   1265         // If the name would be empty, an error occurs.
   1266         if (length == 0) {
   1267             throw new ScannerException("while scanning a directive", startMark,
   1268                     "expected alphabetic or numeric character, but found " + ch + "(" + ((int) ch)
   1269                             + ")", reader.getMark());
   1270         }
   1271         String value = reader.prefixForward(length);
   1272         ch = reader.peek();
   1273         if (Constant.NULL_BL_LINEBR.hasNo(ch)) {
   1274             throw new ScannerException("while scanning a directive", startMark,
   1275                     "expected alphabetic or numeric character, but found " + ch + "(" + ((int) ch)
   1276                             + ")", reader.getMark());
   1277         }
   1278         return value;
   1279     }
   1280 
   1281     private List<Integer> scanYamlDirectiveValue(Mark startMark) {
   1282         // See the specification for details.
   1283         while (reader.peek() == ' ') {
   1284             reader.forward();
   1285         }
   1286         Integer major = scanYamlDirectiveNumber(startMark);
   1287         if (reader.peek() != '.') {
   1288             throw new ScannerException("while scanning a directive", startMark,
   1289                     "expected a digit or '.', but found " + reader.peek() + "("
   1290                             + ((int) reader.peek()) + ")", reader.getMark());
   1291         }
   1292         reader.forward();
   1293         Integer minor = scanYamlDirectiveNumber(startMark);
   1294         if (Constant.NULL_BL_LINEBR.hasNo(reader.peek())) {
   1295             throw new ScannerException("while scanning a directive", startMark,
   1296                     "expected a digit or ' ', but found " + reader.peek() + "("
   1297                             + ((int) reader.peek()) + ")", reader.getMark());
   1298         }
   1299         List<Integer> result = new ArrayList<Integer>(2);
   1300         result.add(major);
   1301         result.add(minor);
   1302         return result;
   1303     }
   1304 
   1305     /**
   1306      * Read a %YAML directive number: this is either the major or the minor
   1307      * part. Stop reading at a non-digit character (usually either '.' or '\n').
   1308      *
   1309      * @see <a href="http://www.yaml.org/spec/1.1/#id895631"></a>
   1310      * @see <a href="http://www.yaml.org/spec/1.1/#ns-dec-digit"></a>
   1311      */
   1312     private Integer scanYamlDirectiveNumber(Mark startMark) {
   1313         // See the specification for details.
   1314         char ch = reader.peek();
   1315         if (!Character.isDigit(ch)) {
   1316             throw new ScannerException("while scanning a directive", startMark,
   1317                     "expected a digit, but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
   1318         }
   1319         int length = 0;
   1320         while (Character.isDigit(reader.peek(length))) {
   1321             length++;
   1322         }
   1323         Integer value = Integer.parseInt(reader.prefixForward(length));
   1324         return value;
   1325     }
   1326 
   1327     /**
   1328      * <p>
   1329      * Read a %TAG directive value:
   1330      *
   1331      * <pre>
   1332      * s-ignored-space+ c-tag-handle s-ignored-space+ ns-tag-prefix s-l-comments
   1333      * </pre>
   1334      *
   1335      * </p>
   1336      *
   1337      * @see <a href="http://www.yaml.org/spec/1.1/#id896044"></a>
   1338      */
   1339     private List<String> scanTagDirectiveValue(Mark startMark) {
   1340         // See the specification for details.
   1341         while (reader.peek() == ' ') {
   1342             reader.forward();
   1343         }
   1344         String handle = scanTagDirectiveHandle(startMark);
   1345         while (reader.peek() == ' ') {
   1346             reader.forward();
   1347         }
   1348         String prefix = scanTagDirectivePrefix(startMark);
   1349         List<String> result = new ArrayList<String>(2);
   1350         result.add(handle);
   1351         result.add(prefix);
   1352         return result;
   1353     }
   1354 
   1355     /**
   1356      * Scan a %TAG directive's handle. This is YAML's c-tag-handle.
   1357      *
   1358      * @see <a href="http://www.yaml.org/spec/1.1/#id896876"></a>
   1359      * @param startMark
   1360      * @return
   1361      */
   1362     private String scanTagDirectiveHandle(Mark startMark) {
   1363         // See the specification for details.
   1364         String value = scanTagHandle("directive", startMark);
   1365         char ch = reader.peek();
   1366         if (ch != ' ') {
   1367             throw new ScannerException("while scanning a directive", startMark,
   1368                     "expected ' ', but found " + reader.peek() + "(" + ch + ")", reader.getMark());
   1369         }
   1370         return value;
   1371     }
   1372 
   1373     /**
   1374      * Scan a %TAG directive's prefix. This is YAML's ns-tag-prefix.
   1375      *
   1376      * @see <a href="http://www.yaml.org/spec/1.1/#ns-tag-prefix"></a>
   1377      */
   1378     private String scanTagDirectivePrefix(Mark startMark) {
   1379         // See the specification for details.
   1380         String value = scanTagUri("directive", startMark);
   1381         if (Constant.NULL_BL_LINEBR.hasNo(reader.peek())) {
   1382             throw new ScannerException("while scanning a directive", startMark,
   1383                     "expected ' ', but found " + reader.peek() + "(" + ((int) reader.peek()) + ")",
   1384                     reader.getMark());
   1385         }
   1386         return value;
   1387     }
   1388 
   1389     private String scanDirectiveIgnoredLine(Mark startMark) {
   1390         // See the specification for details.
   1391         int ff = 0;
   1392         while (reader.peek(ff) == ' ') {
   1393             ff++;
   1394         }
   1395         if (ff > 0) {
   1396             reader.forward(ff);
   1397         }
   1398         if (reader.peek() == '#') {
   1399             ff = 0;
   1400             while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
   1401                 ff++;
   1402             }
   1403             reader.forward(ff);
   1404         }
   1405         char ch = reader.peek();
   1406         String lineBreak = scanLineBreak();
   1407         if (lineBreak.length() == 0 && ch != '\0') {
   1408             throw new ScannerException("while scanning a directive", startMark,
   1409                     "expected a comment or a line break, but found " + ch + "(" + ((int) ch) + ")",
   1410                     reader.getMark());
   1411         }
   1412         return lineBreak;
   1413     }
   1414 
   1415     /**
   1416      * <pre>
   1417      * The specification does not restrict characters for anchors and
   1418      * aliases. This may lead to problems, for instance, the document:
   1419      *   [ *alias, value ]
   1420      * can be interpreted in two ways, as
   1421      *   [ &quot;value&quot; ]
   1422      * and
   1423      *   [ *alias , &quot;value&quot; ]
   1424      * Therefore we restrict aliases to numbers and ASCII letters.
   1425      * </pre>
   1426      */
   1427     private Token scanAnchor(boolean isAnchor) {
   1428         Mark startMark = reader.getMark();
   1429         char indicator = reader.peek();
   1430         String name = indicator == '*' ? "alias" : "anchor";
   1431         reader.forward();
   1432         int length = 0;
   1433         char ch = reader.peek(length);
   1434         while (Constant.ALPHA.has(ch)) {
   1435             length++;
   1436             ch = reader.peek(length);
   1437         }
   1438         if (length == 0) {
   1439             throw new ScannerException("while scanning an " + name, startMark,
   1440                     "expected alphabetic or numeric character, but found " + ch,
   1441                     reader.getMark());
   1442         }
   1443         String value = reader.prefixForward(length);
   1444         ch = reader.peek();
   1445         if (Constant.NULL_BL_T_LINEBR.hasNo(ch, "?:,]}%@`")) {
   1446             throw new ScannerException("while scanning an " + name, startMark,
   1447                     "expected alphabetic or numeric character, but found " + ch + "("
   1448                             + ((int) reader.peek()) + ")", reader.getMark());
   1449         }
   1450         Mark endMark = reader.getMark();
   1451         Token tok;
   1452         if (isAnchor) {
   1453             tok = new AnchorToken(value, startMark, endMark);
   1454         } else {
   1455             tok = new AliasToken(value, startMark, endMark);
   1456         }
   1457         return tok;
   1458     }
   1459 
   1460     /**
   1461      * <p>
   1462      * Scan a Tag property. A Tag property may be specified in one of three
   1463      * ways: c-verbatim-tag, c-ns-shorthand-tag, or c-ns-non-specific-tag
   1464      * </p>
   1465      *
   1466      * <p>
   1467      * c-verbatim-tag takes the form !&lt;ns-uri-char+&gt; and must be delivered
   1468      * verbatim (as-is) to the application. In particular, verbatim tags are not
   1469      * subject to tag resolution.
   1470      * </p>
   1471      *
   1472      * <p>
   1473      * c-ns-shorthand-tag is a valid tag handle followed by a non-empty suffix.
   1474      * If the tag handle is a c-primary-tag-handle ('!') then the suffix must
   1475      * have all exclamation marks properly URI-escaped (%21); otherwise, the
   1476      * string will look like a named tag handle: !foo!bar would be interpreted
   1477      * as (handle="!foo!", suffix="bar").
   1478      * </p>
   1479      *
   1480      * <p>
   1481      * c-ns-non-specific-tag is always a lone '!'; this is only useful for plain
   1482      * scalars, where its specification means that the scalar MUST be resolved
   1483      * to have type tag:yaml.org,2002:str.
   1484      * </p>
   1485      *
   1486      * TODO SnakeYaml incorrectly ignores c-ns-non-specific-tag right now.
   1487      *
   1488      * @see <a href="http://www.yaml.org/spec/1.1/#id900262"></a>
   1489      *
   1490      *      TODO Note that this method does not enforce rules about local versus
   1491      *      global tags!
   1492      */
   1493     private Token scanTag() {
   1494         // See the specification for details.
   1495         Mark startMark = reader.getMark();
   1496         // Determine the type of tag property based on the first character
   1497         // encountered
   1498         char ch = reader.peek(1);
   1499         String handle = null;
   1500         String suffix = null;
   1501         // Verbatim tag! (c-verbatim-tag)
   1502         if (ch == '<') {
   1503             // Skip the exclamation mark and &gt;, then read the tag suffix (as
   1504             // a URI).
   1505             reader.forward(2);
   1506             suffix = scanTagUri("tag", startMark);
   1507             if (reader.peek() != '>') {
   1508                 // If there are any characters between the end of the tag-suffix
   1509                 // URI and the closing &gt;, then an error has occurred.
   1510                 throw new ScannerException("while scanning a tag", startMark,
   1511                         "expected '>', but found '" + reader.peek() + "' (" + ((int) reader.peek())
   1512                                 + ")", reader.getMark());
   1513             }
   1514             reader.forward();
   1515         } else if (Constant.NULL_BL_T_LINEBR.has(ch)) {
   1516             // A NUL, blank, tab, or line-break means that this was a
   1517             // c-ns-non-specific tag.
   1518             suffix = "!";
   1519             reader.forward();
   1520         } else {
   1521             // Any other character implies c-ns-shorthand-tag type.
   1522 
   1523             // Look ahead in the stream to determine whether this tag property
   1524             // is of the form !foo or !foo!bar.
   1525             int length = 1;
   1526             boolean useHandle = false;
   1527             while (Constant.NULL_BL_LINEBR.hasNo(ch)) {
   1528                 if (ch == '!') {
   1529                     useHandle = true;
   1530                     break;
   1531                 }
   1532                 length++;
   1533                 ch = reader.peek(length);
   1534             }
   1535             handle = "!";
   1536             // If we need to use a handle, scan it in; otherwise, the handle is
   1537             // presumed to be '!'.
   1538             if (useHandle) {
   1539                 handle = scanTagHandle("tag", startMark);
   1540             } else {
   1541                 handle = "!";
   1542                 reader.forward();
   1543             }
   1544             suffix = scanTagUri("tag", startMark);
   1545         }
   1546         ch = reader.peek();
   1547         // Check that the next character is allowed to follow a tag-property;
   1548         // if it is not, raise the error.
   1549         if (Constant.NULL_BL_LINEBR.hasNo(ch)) {
   1550             throw new ScannerException("while scanning a tag", startMark,
   1551                     "expected ' ', but found '" + ch + "' (" + ((int) ch) + ")", reader.getMark());
   1552         }
   1553         TagTuple value = new TagTuple(handle, suffix);
   1554         Mark endMark = reader.getMark();
   1555         return new TagToken(value, startMark, endMark);
   1556     }
   1557 
   1558     private Token scanBlockScalar(char style) {
   1559         // See the specification for details.
   1560         boolean folded;
   1561         // Depending on the given style, we determine whether the scalar is
   1562         // folded ('>') or literal ('|')
   1563         if (style == '>') {
   1564             folded = true;
   1565         } else {
   1566             folded = false;
   1567         }
   1568         StringBuilder chunks = new StringBuilder();
   1569         Mark startMark = reader.getMark();
   1570         // Scan the header.
   1571         reader.forward();
   1572         Chomping chompi = scanBlockScalarIndicators(startMark);
   1573         int increment = chompi.getIncrement();
   1574         scanBlockScalarIgnoredLine(startMark);
   1575 
   1576         // Determine the indentation level and go to the first non-empty line.
   1577         int minIndent = this.indent + 1;
   1578         if (minIndent < 1) {
   1579             minIndent = 1;
   1580         }
   1581         String breaks = null;
   1582         int maxIndent = 0;
   1583         int indent = 0;
   1584         Mark endMark;
   1585         if (increment == -1) {
   1586             Object[] brme = scanBlockScalarIndentation();
   1587             breaks = (String) brme[0];
   1588             maxIndent = ((Integer) brme[1]).intValue();
   1589             endMark = (Mark) brme[2];
   1590             indent = Math.max(minIndent, maxIndent);
   1591         } else {
   1592             indent = minIndent + increment - 1;
   1593             Object[] brme = scanBlockScalarBreaks(indent);
   1594             breaks = (String) brme[0];
   1595             endMark = (Mark) brme[1];
   1596         }
   1597 
   1598         String lineBreak = "";
   1599 
   1600         // Scan the inner part of the block scalar.
   1601         while (this.reader.getColumn() == indent && reader.peek() != '\0') {
   1602             chunks.append(breaks);
   1603             boolean leadingNonSpace = " \t".indexOf(reader.peek()) == -1;
   1604             int length = 0;
   1605             while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(length))) {
   1606                 length++;
   1607             }
   1608             chunks.append(reader.prefixForward(length));
   1609             lineBreak = scanLineBreak();
   1610             Object[] brme = scanBlockScalarBreaks(indent);
   1611             breaks = (String) brme[0];
   1612             endMark = (Mark) brme[1];
   1613             if (this.reader.getColumn() == indent && reader.peek() != '\0') {
   1614 
   1615                 // Unfortunately, folding rules are ambiguous.
   1616                 //
   1617                 // This is the folding according to the specification:
   1618                 if (folded && "\n".equals(lineBreak) && leadingNonSpace
   1619                         && " \t".indexOf(reader.peek()) == -1) {
   1620                     if (breaks.length() == 0) {
   1621                         chunks.append(" ");
   1622                     }
   1623                 } else {
   1624                     chunks.append(lineBreak);
   1625                 }
   1626                 // Clark Evans's interpretation (also in the spec examples) not
   1627                 // imported from PyYAML
   1628             } else {
   1629                 break;
   1630             }
   1631         }
   1632         // Chomp the tail.
   1633         if (chompi.chompTailIsNotFalse()) {
   1634             chunks.append(lineBreak);
   1635         }
   1636         if (chompi.chompTailIsTrue()) {
   1637             chunks.append(breaks);
   1638         }
   1639         // We are done.
   1640         return new ScalarToken(chunks.toString(), false, startMark, endMark, style);
   1641     }
   1642 
   1643     /**
   1644      * Scan a block scalar indicator. The block scalar indicator includes two
   1645      * optional components, which may appear in either order.
   1646      *
   1647      * A block indentation indicator is a non-zero digit describing the
   1648      * indentation level of the block scalar to follow. This indentation is an
   1649      * additional number of spaces relative to the current indentation level.
   1650      *
   1651      * A block chomping indicator is a + or -, selecting the chomping mode away
   1652      * from the default (clip) to either -(strip) or +(keep).
   1653      *
   1654      * @see <a href="http://www.yaml.org/spec/1.1/#id868988"></a>
   1655      * @see <a href="http://www.yaml.org/spec/1.1/#id927035"></a>
   1656      * @see <a href="http://www.yaml.org/spec/1.1/#id927557"></a>
   1657      */
   1658     private Chomping scanBlockScalarIndicators(Mark startMark) {
   1659         // See the specification for details.
   1660         Boolean chomping = null;
   1661         int increment = -1;
   1662         char ch = reader.peek();
   1663         if (ch == '-' || ch == '+') {
   1664             if (ch == '+') {
   1665                 chomping = Boolean.TRUE;
   1666             } else {
   1667                 chomping = Boolean.FALSE;
   1668             }
   1669             reader.forward();
   1670             ch = reader.peek();
   1671             if (Character.isDigit(ch)) {
   1672                 increment = Integer.parseInt(String.valueOf(ch));
   1673                 if (increment == 0) {
   1674                     throw new ScannerException("while scanning a block scalar", startMark,
   1675                             "expected indentation indicator in the range 1-9, but found 0",
   1676                             reader.getMark());
   1677                 }
   1678                 reader.forward();
   1679             }
   1680         } else if (Character.isDigit(ch)) {
   1681             increment = Integer.parseInt(String.valueOf(ch));
   1682             if (increment == 0) {
   1683                 throw new ScannerException("while scanning a block scalar", startMark,
   1684                         "expected indentation indicator in the range 1-9, but found 0",
   1685                         reader.getMark());
   1686             }
   1687             reader.forward();
   1688             ch = reader.peek();
   1689             if (ch == '-' || ch == '+') {
   1690                 if (ch == '+') {
   1691                     chomping = Boolean.TRUE;
   1692                 } else {
   1693                     chomping = Boolean.FALSE;
   1694                 }
   1695                 reader.forward();
   1696             }
   1697         }
   1698         ch = reader.peek();
   1699         if (Constant.NULL_BL_LINEBR.hasNo(ch)) {
   1700             throw new ScannerException("while scanning a block scalar", startMark,
   1701                     "expected chomping or indentation indicators, but found " + ch,
   1702                     reader.getMark());
   1703         }
   1704         return new Chomping(chomping, increment);
   1705     }
   1706 
   1707     /**
   1708      * Scan to the end of the line after a block scalar has been scanned; the
   1709      * only things that are permitted at this time are comments and spaces.
   1710      */
   1711     private String scanBlockScalarIgnoredLine(Mark startMark) {
   1712         // See the specification for details.
   1713         int ff = 0;
   1714         // Forward past any number of trailing spaces
   1715         while (reader.peek(ff) == ' ') {
   1716             ff++;
   1717         }
   1718         if (ff > 0) {
   1719             reader.forward(ff);
   1720         }
   1721         // If a comment occurs, scan to just before the end of line.
   1722         if (reader.peek() == '#') {
   1723             ff = 0;
   1724             while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
   1725                 ff++;
   1726             }
   1727             if (ff > 0) {
   1728                 reader.forward(ff);
   1729             }
   1730         }
   1731         // If the next character is not a null or line break, an error has
   1732         // occurred.
   1733         char ch = reader.peek();
   1734         String lineBreak = scanLineBreak();
   1735         if (lineBreak.length() == 0 && ch != '\0') {
   1736             throw new ScannerException("while scanning a block scalar", startMark,
   1737                     "expected a comment or a line break, but found " + ch, reader.getMark());
   1738         }
   1739         return lineBreak;
   1740     }
   1741 
   1742     /**
   1743      * Scans for the indentation of a block scalar implicitly. This mechanism is
   1744      * used only if the block did not explicitly state an indentation to be
   1745      * used.
   1746      *
   1747      * @see <a href="http://www.yaml.org/spec/1.1/#id927035"></a>
   1748      */
   1749     private Object[] scanBlockScalarIndentation() {
   1750         // See the specification for details.
   1751         StringBuilder chunks = new StringBuilder();
   1752         int maxIndent = 0;
   1753         Mark endMark = reader.getMark();
   1754         // Look ahead some number of lines until the first non-blank character
   1755         // occurs; the determined indentation will be the maximum number of
   1756         // leading spaces on any of these lines.
   1757         while (Constant.LINEBR.has(reader.peek(), " \r")) {
   1758             if (reader.peek() != ' ') {
   1759                 // If the character isn't a space, it must be some kind of
   1760                 // line-break; scan the line break and track it.
   1761                 chunks.append(scanLineBreak());
   1762                 endMark = reader.getMark();
   1763             } else {
   1764                 // If the character is a space, move forward to the next
   1765                 // character; if we surpass our previous maximum for indent
   1766                 // level, update that too.
   1767                 reader.forward();
   1768                 if (this.reader.getColumn() > maxIndent) {
   1769                     maxIndent = reader.getColumn();
   1770                 }
   1771             }
   1772         }
   1773         // Pass several results back together.
   1774         return new Object[] { chunks.toString(), maxIndent, endMark };
   1775     }
   1776 
   1777     private Object[] scanBlockScalarBreaks(int indent) {
   1778         // See the specification for details.
   1779         StringBuilder chunks = new StringBuilder();
   1780         Mark endMark = reader.getMark();
   1781         int ff = 0;
   1782         int col = this.reader.getColumn();
   1783         // Scan for up to the expected indentation-level of spaces, then move
   1784         // forward past that amount.
   1785         while (col < indent && reader.peek(ff) == ' ') {
   1786             ff++;
   1787             col++;
   1788         }
   1789         if (ff > 0) {
   1790             reader.forward(ff);
   1791         }
   1792         // Consume one or more line breaks followed by any amount of spaces,
   1793         // until we find something that isn't a line-break.
   1794         String lineBreak = null;
   1795         while ((lineBreak = scanLineBreak()).length() != 0) {
   1796             chunks.append(lineBreak);
   1797             endMark = reader.getMark();
   1798             // Scan past up to (indent) spaces on the next line, then forward
   1799             // past them.
   1800             ff = 0;
   1801             col = this.reader.getColumn();
   1802             while (col < indent && reader.peek(ff) == ' ') {
   1803                 ff++;
   1804                 col++;
   1805             }
   1806             if (ff > 0) {
   1807                 reader.forward(ff);
   1808             }
   1809         }
   1810         // Return both the assembled intervening string and the end-mark.
   1811         return new Object[] { chunks.toString(), endMark };
   1812     }
   1813 
   1814     /**
   1815      * Scan a flow-style scalar. Flow scalars are presented in one of two forms;
   1816      * first, a flow scalar may be a double-quoted string; second, a flow scalar
   1817      * may be a single-quoted string.
   1818      *
   1819      * @see <a href="http://www.yaml.org/spec/1.1/#flow"></a> style/syntax
   1820      *
   1821      *      <pre>
   1822      * See the specification for details.
   1823      * Note that we loosen indentation rules for quoted scalars. Quoted
   1824      * scalars don't need to adhere to indentation because &quot; and ' clearly
   1825      * mark the beginning and the end of them. Therefore we are less
   1826      * restrictive than the specification requires. We only need to check
   1827      * that document separators are not included in scalars.
   1828      * </pre>
   1829      */
   1830     private Token scanFlowScalar(char style) {
   1831         boolean _double;
   1832         // The style will be either single- or double-quoted; we determine this
   1833         // by the first character in the entry (supplied)
   1834         if (style == '"') {
   1835             _double = true;
   1836         } else {
   1837             _double = false;
   1838         }
   1839         StringBuilder chunks = new StringBuilder();
   1840         Mark startMark = reader.getMark();
   1841         char quote = reader.peek();
   1842         reader.forward();
   1843         chunks.append(scanFlowScalarNonSpaces(_double, startMark));
   1844         while (reader.peek() != quote) {
   1845             chunks.append(scanFlowScalarSpaces(startMark));
   1846             chunks.append(scanFlowScalarNonSpaces(_double, startMark));
   1847         }
   1848         reader.forward();
   1849         Mark endMark = reader.getMark();
   1850         return new ScalarToken(chunks.toString(), false, startMark, endMark, style);
   1851     }
   1852 
   1853     /**
   1854      * Scan some number of flow-scalar non-space characters.
   1855      */
   1856     private String scanFlowScalarNonSpaces(boolean doubleQuoted, Mark startMark) {
   1857         // See the specification for details.
   1858         StringBuilder chunks = new StringBuilder();
   1859         while (true) {
   1860             // Scan through any number of characters which are not: NUL, blank,
   1861             // tabs, line breaks, single-quotes, double-quotes, or backslashes.
   1862             int length = 0;
   1863             while (Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(length), "\'\"\\")) {
   1864                 length++;
   1865             }
   1866             if (length != 0) {
   1867                 chunks.append(reader.prefixForward(length));
   1868             }
   1869             // Depending on our quoting-type, the characters ', " and \ have
   1870             // differing meanings.
   1871             char ch = reader.peek();
   1872             if (!doubleQuoted && ch == '\'' && reader.peek(1) == '\'') {
   1873                 chunks.append("'");
   1874                 reader.forward(2);
   1875             } else if ((doubleQuoted && ch == '\'') || (!doubleQuoted && "\"\\".indexOf(ch) != -1)) {
   1876                 chunks.append(ch);
   1877                 reader.forward();
   1878             } else if (doubleQuoted && ch == '\\') {
   1879                 reader.forward();
   1880                 ch = reader.peek();
   1881                 if (ESCAPE_REPLACEMENTS.containsKey(Character.valueOf(ch))) {
   1882                     // The character is one of the single-replacement
   1883                     // types; these are replaced with a literal character
   1884                     // from the mapping.
   1885                     chunks.append(ESCAPE_REPLACEMENTS.get(Character.valueOf(ch)));
   1886                     reader.forward();
   1887                 } else if (ESCAPE_CODES.containsKey(Character.valueOf(ch))) {
   1888                     // The character is a multi-digit escape sequence, with
   1889                     // length defined by the value in the ESCAPE_CODES map.
   1890                     length = ESCAPE_CODES.get(Character.valueOf(ch)).intValue();
   1891                     reader.forward();
   1892                     String hex = reader.prefix(length);
   1893                     if (NOT_HEXA.matcher(hex).find()) {
   1894                         throw new ScannerException("while scanning a double-quoted scalar",
   1895                                 startMark, "expected escape sequence of " + length
   1896                                         + " hexadecimal numbers, but found: " + hex,
   1897                                 reader.getMark());
   1898                     }
   1899                     int decimal = Integer.parseInt(hex, 16);
   1900                     String unicode = new String(Character.toChars(decimal));
   1901                     chunks.append(unicode);
   1902                     reader.forward(length);
   1903                 } else if (scanLineBreak().length() != 0) {
   1904                     chunks.append(scanFlowScalarBreaks(startMark));
   1905                 } else {
   1906                     throw new ScannerException("while scanning a double-quoted scalar", startMark,
   1907                             "found unknown escape character " + ch + "(" + ((int) ch) + ")",
   1908                             reader.getMark());
   1909                 }
   1910             } else {
   1911                 return chunks.toString();
   1912             }
   1913         }
   1914     }
   1915 
   1916     private String scanFlowScalarSpaces(Mark startMark) {
   1917         // See the specification for details.
   1918         StringBuilder chunks = new StringBuilder();
   1919         int length = 0;
   1920         // Scan through any number of whitespace (space, tab) characters,
   1921         // consuming them.
   1922         while (" \t".indexOf(reader.peek(length)) != -1) {
   1923             length++;
   1924         }
   1925         String whitespaces = reader.prefixForward(length);
   1926         char ch = reader.peek();
   1927         if (ch == '\0') {
   1928             // A flow scalar cannot end with an end-of-stream
   1929             throw new ScannerException("while scanning a quoted scalar", startMark,
   1930                     "found unexpected end of stream", reader.getMark());
   1931         }
   1932         // If we encounter a line break, scan it into our assembled string...
   1933         String lineBreak = scanLineBreak();
   1934         if (lineBreak.length() != 0) {
   1935             String breaks = scanFlowScalarBreaks(startMark);
   1936             if (!"\n".equals(lineBreak)) {
   1937                 chunks.append(lineBreak);
   1938             } else if (breaks.length() == 0) {
   1939                 chunks.append(" ");
   1940             }
   1941             chunks.append(breaks);
   1942         } else {
   1943             chunks.append(whitespaces);
   1944         }
   1945         return chunks.toString();
   1946     }
   1947 
   1948     private String scanFlowScalarBreaks(Mark startMark) {
   1949         // See the specification for details.
   1950         StringBuilder chunks = new StringBuilder();
   1951         while (true) {
   1952             // Instead of checking indentation, we check for document
   1953             // separators.
   1954             String prefix = reader.prefix(3);
   1955             if (("---".equals(prefix) || "...".equals(prefix))
   1956                     && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
   1957                 throw new ScannerException("while scanning a quoted scalar", startMark,
   1958                         "found unexpected document separator", reader.getMark());
   1959             }
   1960             // Scan past any number of spaces and tabs, ignoring them
   1961             while (" \t".indexOf(reader.peek()) != -1) {
   1962                 reader.forward();
   1963             }
   1964             // If we stopped at a line break, add that; otherwise, return the
   1965             // assembled set of scalar breaks.
   1966             String lineBreak = scanLineBreak();
   1967             if (lineBreak.length() != 0) {
   1968                 chunks.append(lineBreak);
   1969             } else {
   1970                 return chunks.toString();
   1971             }
   1972         }
   1973     }
   1974 
   1975     /**
   1976      * Scan a plain scalar.
   1977      *
   1978      * <pre>
   1979      * See the specification for details.
   1980      * We add an additional restriction for the flow context:
   1981      *   plain scalars in the flow context cannot contain ',', ':' and '?'.
   1982      * We also keep track of the `allow_simple_key` flag here.
   1983      * Indentation rules are loosened for the flow context.
   1984      * </pre>
   1985      */
   1986     private Token scanPlain() {
   1987         StringBuilder chunks = new StringBuilder();
   1988         Mark startMark = reader.getMark();
   1989         Mark endMark = startMark;
   1990         int indent = this.indent + 1;
   1991         String spaces = "";
   1992         while (true) {
   1993             char ch;
   1994             int length = 0;
   1995             // A comment indicates the end of the scalar.
   1996             if (reader.peek() == '#') {
   1997                 break;
   1998             }
   1999             while (true) {
   2000                 ch = reader.peek(length);
   2001                 if (Constant.NULL_BL_T_LINEBR.has(ch)
   2002                         || (this.flowLevel == 0 && ch == ':' && Constant.NULL_BL_T_LINEBR
   2003                                 .has(reader.peek(length + 1)))
   2004                         || (this.flowLevel != 0 && ",:?[]{}".indexOf(ch) != -1)) {
   2005                     break;
   2006                 }
   2007                 length++;
   2008             }
   2009             // It's not clear what we should do with ':' in the flow context.
   2010             if (this.flowLevel != 0 && ch == ':'
   2011                     && Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(length + 1), ",[]{}")) {
   2012                 reader.forward(length);
   2013                 throw new ScannerException("while scanning a plain scalar", startMark,
   2014                         "found unexpected ':'", reader.getMark(),
   2015                         "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.");
   2016             }
   2017             if (length == 0) {
   2018                 break;
   2019             }
   2020             this.allowSimpleKey = false;
   2021             chunks.append(spaces);
   2022             chunks.append(reader.prefixForward(length));
   2023             endMark = reader.getMark();
   2024             spaces = scanPlainSpaces();
   2025             // System.out.printf("spaces[%s]\n", spaces);
   2026             if (spaces.length() == 0 || reader.peek() == '#'
   2027                     || (this.flowLevel == 0 && this.reader.getColumn() < indent)) {
   2028                 break;
   2029             }
   2030         }
   2031         return new ScalarToken(chunks.toString(), startMark, endMark, true);
   2032     }
   2033 
   2034     /**
   2035      * See the specification for details. SnakeYAML and libyaml allow tabs
   2036      * inside plain scalar
   2037      */
   2038     private String scanPlainSpaces() {
   2039         int length = 0;
   2040         while (reader.peek(length) == ' ' || reader.peek(length) == '\t') {
   2041             length++;
   2042         }
   2043         String whitespaces = reader.prefixForward(length);
   2044         String lineBreak = scanLineBreak();
   2045         if (lineBreak.length() != 0) {
   2046             this.allowSimpleKey = true;
   2047             String prefix = reader.prefix(3);
   2048             if ("---".equals(prefix) || "...".equals(prefix)
   2049                     && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
   2050                 return "";
   2051             }
   2052             StringBuilder breaks = new StringBuilder();
   2053             while (true) {
   2054                 if (reader.peek() == ' ') {
   2055                     reader.forward();
   2056                 } else {
   2057                     String lb = scanLineBreak();
   2058                     if (lb.length() != 0) {
   2059                         breaks.append(lb);
   2060                         prefix = reader.prefix(3);
   2061                         if ("---".equals(prefix) || "...".equals(prefix)
   2062                                 && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
   2063                             return "";
   2064                         }
   2065                     } else {
   2066                         break;
   2067                     }
   2068                 }
   2069             }
   2070             if (!"\n".equals(lineBreak)) {
   2071                 return lineBreak + breaks;
   2072             } else if (breaks.length() == 0) {
   2073                 return " ";
   2074             }
   2075             return breaks.toString();
   2076         }
   2077         return whitespaces;
   2078     }
   2079 
   2080     /**
   2081      * <p>
   2082      * Scan a Tag handle. A Tag handle takes one of three forms:
   2083      *
   2084      * <pre>
   2085      * "!" (c-primary-tag-handle)
   2086      * "!!" (ns-secondary-tag-handle)
   2087      * "!(name)!" (c-named-tag-handle)
   2088      * </pre>
   2089      *
   2090      * Where (name) must be formatted as an ns-word-char.
   2091      * </p>
   2092      *
   2093      * @see <a href="http://www.yaml.org/spec/1.1/#c-tag-handle"></a>
   2094      * @see <a href="http://www.yaml.org/spec/1.1/#ns-word-char"></a>
   2095      *
   2096      *      <pre>
   2097      * See the specification for details.
   2098      * For some strange reasons, the specification does not allow '_' in
   2099      * tag handles. I have allowed it anyway.
   2100      * </pre>
   2101      */
   2102     private String scanTagHandle(String name, Mark startMark) {
   2103         char ch = reader.peek();
   2104         if (ch != '!') {
   2105             throw new ScannerException("while scanning a " + name, startMark,
   2106                     "expected '!', but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
   2107         }
   2108         // Look for the next '!' in the stream, stopping if we hit a
   2109         // non-word-character. If the first character is a space, then the
   2110         // tag-handle is a c-primary-tag-handle ('!').
   2111         int length = 1;
   2112         ch = reader.peek(length);
   2113         if (ch != ' ') {
   2114             // Scan through 0+ alphabetic characters.
   2115             // FIXME According to the specification, these should be
   2116             // ns-word-char only, which prohibits '_'. This might be a
   2117             // candidate for a configuration option.
   2118             while (Constant.ALPHA.has(ch)) {
   2119                 length++;
   2120                 ch = reader.peek(length);
   2121             }
   2122             // Found the next non-word-char. If this is not a space and not an
   2123             // '!', then this is an error, as the tag-handle was specified as:
   2124             // !(name) or similar; the trailing '!' is missing.
   2125             if (ch != '!') {
   2126                 reader.forward(length);
   2127                 throw new ScannerException("while scanning a " + name, startMark,
   2128                         "expected '!', but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
   2129             }
   2130             length++;
   2131         }
   2132         String value = reader.prefixForward(length);
   2133         return value;
   2134     }
   2135 
   2136     /**
   2137      * <p>
   2138      * Scan a Tag URI. This scanning is valid for both local and global tag
   2139      * directives, because both appear to be valid URIs as far as scanning is
   2140      * concerned. The difference may be distinguished later, in parsing. This
   2141      * method will scan for ns-uri-char*, which covers both cases.
   2142      * </p>
   2143      *
   2144      * <p>
   2145      * This method performs no verification that the scanned URI conforms to any
   2146      * particular kind of URI specification.
   2147      * </p>
   2148      *
   2149      * @see <a href="http://www.yaml.org/spec/1.1/#ns-uri-char"></a>
   2150      */
   2151     private String scanTagUri(String name, Mark startMark) {
   2152         // See the specification for details.
   2153         // Note: we do not check if URI is well-formed.
   2154         StringBuilder chunks = new StringBuilder();
   2155         // Scan through accepted URI characters, which includes the standard
   2156         // URI characters, plus the start-escape character ('%'). When we get
   2157         // to a start-escape, scan the escaped sequence, then return.
   2158         int length = 0;
   2159         char ch = reader.peek(length);
   2160         while (Constant.URI_CHARS.has(ch)) {
   2161             if (ch == '%') {
   2162                 chunks.append(reader.prefixForward(length));
   2163                 length = 0;
   2164                 chunks.append(scanUriEscapes(name, startMark));
   2165             } else {
   2166                 length++;
   2167             }
   2168             ch = reader.peek(length);
   2169         }
   2170         // Consume the last "chunk", which would not otherwise be consumed by
   2171         // the loop above.
   2172         if (length != 0) {
   2173             chunks.append(reader.prefixForward(length));
   2174             length = 0;
   2175         }
   2176         if (chunks.length() == 0) {
   2177             // If no URI was found, an error has occurred.
   2178             throw new ScannerException("while scanning a " + name, startMark,
   2179                     "expected URI, but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
   2180         }
   2181         return chunks.toString();
   2182     }
   2183 
   2184     /**
   2185      * <p>
   2186      * Scan a sequence of %-escaped URI escape codes and convert them into a
   2187      * String representing the unescaped values.
   2188      * </p>
   2189      *
   2190      * FIXME This method fails for more than 256 bytes' worth of URI-encoded
   2191      * characters in a row. Is this possible? Is this a use-case?
   2192      *
   2193      * @see <a href="http://www.ietf.org/rfc/rfc2396.txt"></a>, section 2.4, Escaped Encoding.
   2194      */
   2195     private String scanUriEscapes(String name, Mark startMark) {
   2196         // First, look ahead to see how many URI-escaped characters we should
   2197         // expect, so we can use the correct buffer size.
   2198         int length = 1;
   2199         while (reader.peek(length * 3) == '%') {
   2200             length++;
   2201         }
   2202         // See the specification for details.
   2203         // URIs containing 16 and 32 bit Unicode characters are
   2204         // encoded in UTF-8, and then each octet is written as a
   2205         // separate character.
   2206         Mark beginningMark = reader.getMark();
   2207         ByteBuffer buff = ByteBuffer.allocate(length);
   2208         while (reader.peek() == '%') {
   2209             reader.forward();
   2210             try {
   2211                 byte code = (byte) Integer.parseInt(reader.prefix(2), 16);
   2212                 buff.put(code);
   2213             } catch (NumberFormatException nfe) {
   2214                 throw new ScannerException("while scanning a " + name, startMark,
   2215                         "expected URI escape sequence of 2 hexadecimal numbers, but found "
   2216                                 + reader.peek() + "(" + ((int) reader.peek()) + ") and "
   2217                                 + reader.peek(1) + "(" + ((int) reader.peek(1)) + ")",
   2218                         reader.getMark());
   2219             }
   2220             reader.forward(2);
   2221         }
   2222         buff.flip();
   2223         try {
   2224             return UriEncoder.decode(buff);
   2225         } catch (CharacterCodingException e) {
   2226             throw new ScannerException("while scanning a " + name, startMark,
   2227                     "expected URI in UTF-8: " + e.getMessage(), beginningMark);
   2228         }
   2229     }
   2230 
   2231     /**
   2232      * Scan a line break, transforming:
   2233      *
   2234      * <pre>
   2235      * '\r\n' : '\n'
   2236      * '\r' : '\n'
   2237      * '\n' : '\n'
   2238      * '\x85' : '\n'
   2239      * default : ''
   2240      * </pre>
   2241      */
   2242     private String scanLineBreak() {
   2243         // Transforms:
   2244         // '\r\n' : '\n'
   2245         // '\r' : '\n'
   2246         // '\n' : '\n'
   2247         // '\x85' : '\n'
   2248         // default : ''
   2249         char ch = reader.peek();
   2250         if (ch == '\r' || ch == '\n' || ch == '\u0085') {
   2251             if (ch == '\r' && '\n' == reader.peek(1)) {
   2252                 reader.forward(2);
   2253             } else {
   2254                 reader.forward();
   2255             }
   2256             return "\n";
   2257         } else if (ch == '\u2028' || ch == '\u2029') {
   2258             reader.forward();
   2259             return String.valueOf(ch);
   2260         }
   2261         return "";
   2262     }
   2263 
   2264     /**
   2265      * Chomping the tail may have 3 values - yes, no, not defined.
   2266      */
   2267     private static class Chomping {
   2268         private final Boolean value;
   2269         private final int increment;
   2270 
   2271         public Chomping(Boolean value, int increment) {
   2272             this.value = value;
   2273             this.increment = increment;
   2274         }
   2275 
   2276         public boolean chompTailIsNotFalse() {
   2277             return value == null || value;
   2278         }
   2279 
   2280         public boolean chompTailIsTrue() {
   2281             return value != null && value;
   2282         }
   2283 
   2284         public int getIncrement() {
   2285             return increment;
   2286         }
   2287     }
   2288 }
   2289