1 /** 2 * Copyright (c) 2008, http://www.snakeyaml.org 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package org.yaml.snakeyaml.scanner; 17 18 import java.nio.ByteBuffer; 19 import java.nio.charset.CharacterCodingException; 20 import java.util.ArrayList; 21 import java.util.HashMap; 22 import java.util.Iterator; 23 import java.util.LinkedHashMap; 24 import java.util.List; 25 import java.util.Map; 26 import java.util.regex.Pattern; 27 28 import org.yaml.snakeyaml.error.Mark; 29 import org.yaml.snakeyaml.error.YAMLException; 30 import org.yaml.snakeyaml.reader.StreamReader; 31 import org.yaml.snakeyaml.tokens.AliasToken; 32 import org.yaml.snakeyaml.tokens.AnchorToken; 33 import org.yaml.snakeyaml.tokens.BlockEndToken; 34 import org.yaml.snakeyaml.tokens.BlockEntryToken; 35 import org.yaml.snakeyaml.tokens.BlockMappingStartToken; 36 import org.yaml.snakeyaml.tokens.BlockSequenceStartToken; 37 import org.yaml.snakeyaml.tokens.DirectiveToken; 38 import org.yaml.snakeyaml.tokens.DocumentEndToken; 39 import org.yaml.snakeyaml.tokens.DocumentStartToken; 40 import org.yaml.snakeyaml.tokens.FlowEntryToken; 41 import org.yaml.snakeyaml.tokens.FlowMappingEndToken; 42 import org.yaml.snakeyaml.tokens.FlowMappingStartToken; 43 import org.yaml.snakeyaml.tokens.FlowSequenceEndToken; 44 import org.yaml.snakeyaml.tokens.FlowSequenceStartToken; 45 import org.yaml.snakeyaml.tokens.KeyToken; 46 import org.yaml.snakeyaml.tokens.ScalarToken; 47 import 
org.yaml.snakeyaml.tokens.StreamEndToken; 48 import org.yaml.snakeyaml.tokens.StreamStartToken; 49 import org.yaml.snakeyaml.tokens.TagToken; 50 import org.yaml.snakeyaml.tokens.TagTuple; 51 import org.yaml.snakeyaml.tokens.Token; 52 import org.yaml.snakeyaml.tokens.ValueToken; 53 import org.yaml.snakeyaml.util.ArrayStack; 54 import org.yaml.snakeyaml.util.UriEncoder; 55 56 /** 57 * <pre> 58 * Scanner produces tokens of the following types: 59 * STREAM-START 60 * STREAM-END 61 * DIRECTIVE(name, value) 62 * DOCUMENT-START 63 * DOCUMENT-END 64 * BLOCK-SEQUENCE-START 65 * BLOCK-MAPPING-START 66 * BLOCK-END 67 * FLOW-SEQUENCE-START 68 * FLOW-MAPPING-START 69 * FLOW-SEQUENCE-END 70 * FLOW-MAPPING-END 71 * BLOCK-ENTRY 72 * FLOW-ENTRY 73 * KEY 74 * VALUE 75 * ALIAS(value) 76 * ANCHOR(value) 77 * TAG(value) 78 * SCALAR(value, plain, style) 79 * Read comments in the Scanner code for more details. 80 * </pre> 81 */ 82 public final class ScannerImpl implements Scanner { 83 /** 84 * A regular expression matching characters which are not in the hexadecimal 85 * set (0-9, A-F, a-f). 86 */ 87 private final static Pattern NOT_HEXA = Pattern.compile("[^0-9A-Fa-f]"); 88 89 /** 90 * A mapping from an escaped character in the input stream to the character 91 * that they should be replaced with. 92 * 93 * YAML defines several common and a few uncommon escape sequences. 94 * 95 * @see <a href="http://www.yaml.org/spec/current.html#id2517668">4.1.6. 96 * Escape Sequences</a> 97 */ 98 public final static Map<Character, String> ESCAPE_REPLACEMENTS = new HashMap<Character, String>(); 99 100 /** 101 * A mapping from a character to a number of bytes to read-ahead for that 102 * escape sequence. 
These escape sequences are used to handle unicode
     * escaping in the following formats, where H is a hexadecimal character:
     *
     * <pre>
     * &#92;xHH         : escaped 8-bit Unicode character
     * &#92;uHHHH       : escaped 16-bit Unicode character
     * &#92;UHHHHHHHH   : escaped 32-bit Unicode character
     * </pre>
     *
     * The backslash is spelled as an HTML entity above on purpose: a literal
     * backslash followed by 'u' and non-hexadecimal characters is rejected by
     * the Java compiler even inside a comment.
     *
     * @see <a href="http://yaml.org/spec/1.1/current.html#id872840">5.6. Escape
     *      Sequences</a>
     */
    public final static Map<Character, Integer> ESCAPE_CODES = new HashMap<Character, Integer>();

    // Populate both escape tables once, when the class is loaded.
    static {
        // ASCII null
        ESCAPE_REPLACEMENTS.put(Character.valueOf('0'), "\0");
        // ASCII bell
        ESCAPE_REPLACEMENTS.put(Character.valueOf('a'), "\u0007");
        // ASCII backspace
        ESCAPE_REPLACEMENTS.put(Character.valueOf('b'), "\u0008");
        // ASCII horizontal tab
        ESCAPE_REPLACEMENTS.put(Character.valueOf('t'), "\u0009");
        // ASCII newline (line feed; \n maps to 0x0A)
        ESCAPE_REPLACEMENTS.put(Character.valueOf('n'), "\n");
        // ASCII vertical tab
        ESCAPE_REPLACEMENTS.put(Character.valueOf('v'), "\u000B");
        // ASCII form-feed
        ESCAPE_REPLACEMENTS.put(Character.valueOf('f'), "\u000C");
        // carriage-return (\r maps to 0x0D)
        ESCAPE_REPLACEMENTS.put(Character.valueOf('r'), "\r");
        // ASCII escape character (Esc)
        ESCAPE_REPLACEMENTS.put(Character.valueOf('e'), "\u001B");
        // ASCII space
        ESCAPE_REPLACEMENTS.put(Character.valueOf(' '), "\u0020");
        // ASCII double-quote
        ESCAPE_REPLACEMENTS.put(Character.valueOf('"'), "\"");
        // ASCII backslash
        ESCAPE_REPLACEMENTS.put(Character.valueOf('\\'), "\\");
        // Unicode next line
        ESCAPE_REPLACEMENTS.put(Character.valueOf('N'), "\u0085");
        // Unicode non-breaking-space
        ESCAPE_REPLACEMENTS.put(Character.valueOf('_'), "\u00A0");
        // Unicode line-separator
        ESCAPE_REPLACEMENTS.put(Character.valueOf('L'), "\u2028");
        // Unicode paragraph separator
        ESCAPE_REPLACEMENTS.put(Character.valueOf('P'), "\u2029");

        // 8-bit Unicode: two hexadecimal digits follow the escape character
        ESCAPE_CODES.put(Character.valueOf('x'), 2);
        // 16-bit Unicode: four hexadecimal digits follow
        ESCAPE_CODES.put(Character.valueOf('u'), 4);
        // 32-bit Unicode (Supplementary characters are supported): eight
        // hexadecimal digits follow
        ESCAPE_CODES.put(Character.valueOf('U'), 8);
    }
    // The character source that is being tokenized.
    private final StreamReader reader;
    // Had we reached the end of the stream? Set once STREAM-END is queued.
    private boolean done = false;

    // The number of unclosed '{' and '['. `flowLevel == 0` means block
    // context.
    private int flowLevel = 0;

    // List of processed tokens that are not yet emitted.
    private List<Token> tokens;

    // Number of tokens that were emitted through the getToken() method.
    private int tokensTaken = 0;

    // The current indentation level; -1 until the first block collection.
    private int indent = -1;

    // Past indentation levels.
    private ArrayStack<Integer> indents;

    // Variables related to simple keys treatment. See PyYAML.

    /**
     * <pre>
     * A simple key is a key that is not denoted by the '?' indicator.
     * Example of simple keys:
     *   ---
     *   block simple key: value
     *   ? not a simple key:
     *   : { flow simple key: value }
     * We emit the KEY token before all keys, so when we find a potential
     * simple key, we try to locate the corresponding ':' indicator.
     * Simple keys should be limited to a single line and 1024 characters.
     *
     * Can a simple key start at the current position? A simple key may
     * start:
     * - at the beginning of the line, not counting indentation spaces
     *       (in block context),
     * - after '{', '[', ',' (in the flow context),
     * - after '?', ':', '-' (in the block context).
     * In the block context, this flag also signifies if a block collection
     * may start at the current position.
     * </pre>
     */
    private boolean allowSimpleKey = true;

    /*
     * Keep track of possible simple keys. This is a dictionary. The key is
     * `flow_level`; there can be no more than one possible simple key for each
     * level.
The value is a SimpleKey record: (token_number, required, index, 207 * line, column, mark) A simple key may start with ALIAS, ANCHOR, TAG, 208 * SCALAR(flow), '[', or '{' tokens. 209 */ 210 private Map<Integer, SimpleKey> possibleSimpleKeys; 211 212 public ScannerImpl(StreamReader reader) { 213 this.reader = reader; 214 this.tokens = new ArrayList<Token>(100); 215 this.indents = new ArrayStack<Integer>(10); 216 // The order in possibleSimpleKeys is kept for nextPossibleSimpleKey() 217 this.possibleSimpleKeys = new LinkedHashMap<Integer, SimpleKey>(); 218 fetchStreamStart();// Add the STREAM-START token. 219 } 220 221 /** 222 * Check whether the next token is one of the given types. 223 */ 224 public boolean checkToken(Token.ID... choices) { 225 while (needMoreTokens()) { 226 fetchMoreTokens(); 227 } 228 if (!this.tokens.isEmpty()) { 229 if (choices.length == 0) { 230 return true; 231 } 232 // since profiler puts this method on top (it is used a lot), we 233 // should not use 'foreach' here because of the performance reasons 234 Token.ID first = this.tokens.get(0).getTokenId(); 235 for (int i = 0; i < choices.length; i++) { 236 if (first == choices[i]) { 237 return true; 238 } 239 } 240 } 241 return false; 242 } 243 244 /** 245 * Return the next token, but do not delete it from the queue. 246 */ 247 public Token peekToken() { 248 while (needMoreTokens()) { 249 fetchMoreTokens(); 250 } 251 return this.tokens.get(0); 252 } 253 254 /** 255 * Return the next token, removing it from the queue. 256 */ 257 public Token getToken() { 258 if (!this.tokens.isEmpty()) { 259 this.tokensTaken++; 260 return this.tokens.remove(0); 261 } 262 return null; 263 } 264 265 // Private methods. 266 /** 267 * Returns true if more tokens should be scanned. 268 */ 269 private boolean needMoreTokens() { 270 // If we are done, we do not require more tokens. 271 if (this.done) { 272 return false; 273 } 274 // If we aren't done, but we have no tokens, we need to scan more. 
275 if (this.tokens.isEmpty()) { 276 return true; 277 } 278 // The current token may be a potential simple key, so we 279 // need to look further. 280 stalePossibleSimpleKeys(); 281 return nextPossibleSimpleKey() == this.tokensTaken; 282 } 283 284 /** 285 * Fetch one or more tokens from the StreamReader. 286 */ 287 private void fetchMoreTokens() { 288 // Eat whitespaces and comments until we reach the next token. 289 scanToNextToken(); 290 // Remove obsolete possible simple keys. 291 stalePossibleSimpleKeys(); 292 // Compare the current indentation and column. It may add some tokens 293 // and decrease the current indentation level. 294 unwindIndent(reader.getColumn()); 295 // Peek the next character, to decide what the next group of tokens 296 // will look like. 297 char ch = reader.peek(); 298 switch (ch) { 299 case '\0': 300 // Is it the end of stream? 301 fetchStreamEnd(); 302 return; 303 case '%': 304 // Is it a directive? 305 if (checkDirective()) { 306 fetchDirective(); 307 return; 308 } 309 break; 310 case '-': 311 // Is it the document start? 312 if (checkDocumentStart()) { 313 fetchDocumentStart(); 314 return; 315 // Is it the block entry indicator? 316 } else if (checkBlockEntry()) { 317 fetchBlockEntry(); 318 return; 319 } 320 break; 321 case '.': 322 // Is it the document end? 323 if (checkDocumentEnd()) { 324 fetchDocumentEnd(); 325 return; 326 } 327 break; 328 // TODO support for BOM within a stream. (not implemented in PyYAML) 329 case '[': 330 // Is it the flow sequence start indicator? 331 fetchFlowSequenceStart(); 332 return; 333 case '{': 334 // Is it the flow mapping start indicator? 335 fetchFlowMappingStart(); 336 return; 337 case ']': 338 // Is it the flow sequence end indicator? 339 fetchFlowSequenceEnd(); 340 return; 341 case '}': 342 // Is it the flow mapping end indicator? 343 fetchFlowMappingEnd(); 344 return; 345 case ',': 346 // Is it the flow entry indicator? 
347 fetchFlowEntry(); 348 return; 349 // see block entry indicator above 350 case '?': 351 // Is it the key indicator? 352 if (checkKey()) { 353 fetchKey(); 354 return; 355 } 356 break; 357 case ':': 358 // Is it the value indicator? 359 if (checkValue()) { 360 fetchValue(); 361 return; 362 } 363 break; 364 case '*': 365 // Is it an alias? 366 fetchAlias(); 367 return; 368 case '&': 369 // Is it an anchor? 370 fetchAnchor(); 371 return; 372 case '!': 373 // Is it a tag? 374 fetchTag(); 375 return; 376 case '|': 377 // Is it a literal scalar? 378 if (this.flowLevel == 0) { 379 fetchLiteral(); 380 return; 381 } 382 break; 383 case '>': 384 // Is it a folded scalar? 385 if (this.flowLevel == 0) { 386 fetchFolded(); 387 return; 388 } 389 break; 390 case '\'': 391 // Is it a single quoted scalar? 392 fetchSingle(); 393 return; 394 case '"': 395 // Is it a double quoted scalar? 396 fetchDouble(); 397 return; 398 } 399 // It must be a plain scalar then. 400 if (checkPlain()) { 401 fetchPlain(); 402 return; 403 } 404 // No? It's an error. Let's produce a nice error message.We do this by 405 // converting escaped characters into their escape sequences. This is a 406 // backwards use of the ESCAPE_REPLACEMENTS map. 407 String chRepresentation = String.valueOf(ch); 408 for (Character s : ESCAPE_REPLACEMENTS.keySet()) { 409 String v = ESCAPE_REPLACEMENTS.get(s); 410 if (v.equals(chRepresentation)) { 411 chRepresentation = "\\" + s;// ' ' -> '\t' 412 break; 413 } 414 } 415 if (ch == '\t') 416 chRepresentation += "(TAB)"; 417 String text = String 418 .format("found character '%s' that cannot start any token. (Do not use %s for indentation)", 419 chRepresentation, chRepresentation); 420 throw new ScannerException("while scanning for the next token", null, text, 421 reader.getMark()); 422 } 423 424 // Simple keys treatment. 425 426 /** 427 * Return the number of the nearest possible simple key. Actually we don't 428 * need to loop through the whole dictionary. 
429 */ 430 private int nextPossibleSimpleKey() { 431 /* 432 * the implementation is not as in PyYAML. Because 433 * this.possibleSimpleKeys is ordered we can simply take the first key 434 */ 435 if (!this.possibleSimpleKeys.isEmpty()) { 436 return this.possibleSimpleKeys.values().iterator().next().getTokenNumber(); 437 } 438 return -1; 439 } 440 441 /** 442 * <pre> 443 * Remove entries that are no longer possible simple keys. According to 444 * the YAML specification, simple keys 445 * - should be limited to a single line, 446 * - should be no longer than 1024 characters. 447 * Disabling this procedure will allow simple keys of any length and 448 * height (may cause problems if indentation is broken though). 449 * </pre> 450 */ 451 private void stalePossibleSimpleKeys() { 452 if (!this.possibleSimpleKeys.isEmpty()) { 453 for (Iterator<SimpleKey> iterator = this.possibleSimpleKeys.values().iterator(); iterator 454 .hasNext();) { 455 SimpleKey key = iterator.next(); 456 if ((key.getLine() != reader.getLine()) 457 || (reader.getIndex() - key.getIndex() > 1024)) { 458 // If the key is not on the same line as the current 459 // position OR the difference in column between the token 460 // start and the current position is more than the maximum 461 // simple key length, then this cannot be a simple key. 462 if (key.isRequired()) { 463 // If the key was required, this implies an error 464 // condition. 465 throw new ScannerException("while scanning a simple key", key.getMark(), 466 "could not find expected ':'", reader.getMark()); 467 } 468 iterator.remove(); 469 } 470 } 471 } 472 } 473 474 /** 475 * The next token may start a simple key. We check if it's possible and save 476 * its position. This function is called for ALIAS, ANCHOR, TAG, 477 * SCALAR(flow), '[', and '{'. 478 */ 479 private void savePossibleSimpleKey() { 480 // The next token may start a simple key. We check if it's possible 481 // and save its position. 
This function is called for 482 // ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'. 483 484 // Check if a simple key is required at the current position. 485 // A simple key is required if this position is the root flowLevel, AND 486 // the current indentation level is the same as the last indent-level. 487 boolean required = (this.flowLevel == 0) && (this.indent == this.reader.getColumn()); 488 489 if (allowSimpleKey || !required) { 490 // A simple key is required only if it is the first token in the 491 // current line. Therefore it is always allowed. 492 } else { 493 throw new YAMLException( 494 "A simple key is required only if it is the first token in the current line"); 495 } 496 497 // The next token might be a simple key. Let's save it's number and 498 // position. 499 if (this.allowSimpleKey) { 500 removePossibleSimpleKey(); 501 int tokenNumber = this.tokensTaken + this.tokens.size(); 502 SimpleKey key = new SimpleKey(tokenNumber, required, reader.getIndex(), 503 reader.getLine(), this.reader.getColumn(), this.reader.getMark()); 504 this.possibleSimpleKeys.put(this.flowLevel, key); 505 } 506 } 507 508 /** 509 * Remove the saved possible key position at the current flow level. 510 */ 511 private void removePossibleSimpleKey() { 512 SimpleKey key = possibleSimpleKeys.remove(flowLevel); 513 if (key != null && key.isRequired()) { 514 throw new ScannerException("while scanning a simple key", key.getMark(), 515 "could not find expected ':'", reader.getMark()); 516 } 517 } 518 519 // Indentation functions. 520 521 /** 522 * * Handle implicitly ending multiple levels of block nodes by decreased 523 * indentation. This function becomes important on lines 4 and 7 of this 524 * example: 525 * 526 * <pre> 527 * 1) book one: 528 * 2) part one: 529 * 3) chapter one 530 * 4) part two: 531 * 5) chapter one 532 * 6) chapter two 533 * 7) book two: 534 * </pre> 535 * 536 * In flow context, tokens should respect indentation. 
Actually the 537 * condition should be `self.indent >= column` according to the spec. But 538 * this condition will prohibit intuitively correct constructions such as 539 * key : { } </pre> 540 */ 541 private void unwindIndent(int col) { 542 // In the flow context, indentation is ignored. We make the scanner less 543 // restrictive then specification requires. 544 if (this.flowLevel != 0) { 545 return; 546 } 547 548 // In block context, we may need to issue the BLOCK-END tokens. 549 while (this.indent > col) { 550 Mark mark = reader.getMark(); 551 this.indent = this.indents.pop(); 552 this.tokens.add(new BlockEndToken(mark, mark)); 553 } 554 } 555 556 /** 557 * Check if we need to increase indentation. 558 */ 559 private boolean addIndent(int column) { 560 if (this.indent < column) { 561 this.indents.push(this.indent); 562 this.indent = column; 563 return true; 564 } 565 return false; 566 } 567 568 // Fetchers. 569 570 /** 571 * We always add STREAM-START as the first token and STREAM-END as the last 572 * token. 573 */ 574 private void fetchStreamStart() { 575 // Read the token. 576 Mark mark = reader.getMark(); 577 578 // Add STREAM-START. 579 Token token = new StreamStartToken(mark, mark); 580 this.tokens.add(token); 581 } 582 583 private void fetchStreamEnd() { 584 // Set the current intendation to -1. 585 unwindIndent(-1); 586 587 // Reset simple keys. 588 removePossibleSimpleKey(); 589 this.allowSimpleKey = false; 590 this.possibleSimpleKeys.clear(); 591 592 // Read the token. 593 Mark mark = reader.getMark(); 594 595 // Add STREAM-END. 596 Token token = new StreamEndToken(mark, mark); 597 this.tokens.add(token); 598 599 // The stream is finished. 600 this.done = true; 601 } 602 603 /** 604 * Fetch a YAML directive. Directives are presentation details that are 605 * interpreted as instructions to the processor. YAML defines two kinds of 606 * directives, YAML and TAG; all other types are reserved for future use. 
607 * 608 * @see <a href="http://www.yaml.org/spec/1.1/#id864824"></a> 609 */ 610 private void fetchDirective() { 611 // Set the current intendation to -1. 612 unwindIndent(-1); 613 614 // Reset simple keys. 615 removePossibleSimpleKey(); 616 this.allowSimpleKey = false; 617 618 // Scan and add DIRECTIVE. 619 Token tok = scanDirective(); 620 this.tokens.add(tok); 621 } 622 623 /** 624 * Fetch a document-start token ("---"). 625 */ 626 private void fetchDocumentStart() { 627 fetchDocumentIndicator(true); 628 } 629 630 /** 631 * Fetch a document-end token ("..."). 632 */ 633 private void fetchDocumentEnd() { 634 fetchDocumentIndicator(false); 635 } 636 637 /** 638 * Fetch a document indicator, either "---" for "document-start", or else 639 * "..." for "document-end. The type is chosen by the given boolean. 640 */ 641 private void fetchDocumentIndicator(boolean isDocumentStart) { 642 // Set the current intendation to -1. 643 unwindIndent(-1); 644 645 // Reset simple keys. Note that there could not be a block collection 646 // after '---'. 647 removePossibleSimpleKey(); 648 this.allowSimpleKey = false; 649 650 // Add DOCUMENT-START or DOCUMENT-END. 651 Mark startMark = reader.getMark(); 652 reader.forward(3); 653 Mark endMark = reader.getMark(); 654 Token token; 655 if (isDocumentStart) { 656 token = new DocumentStartToken(startMark, endMark); 657 } else { 658 token = new DocumentEndToken(startMark, endMark); 659 } 660 this.tokens.add(token); 661 } 662 663 private void fetchFlowSequenceStart() { 664 fetchFlowCollectionStart(false); 665 } 666 667 private void fetchFlowMappingStart() { 668 fetchFlowCollectionStart(true); 669 } 670 671 /** 672 * Fetch a flow-style collection start, which is either a sequence or a 673 * mapping. The type is determined by the given boolean. 674 * 675 * A flow-style collection is in a format similar to JSON. Sequences are 676 * started by '[' and ended by ']'; mappings are started by '{' and ended by 677 * '}'. 
678 * 679 * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a> 680 * 681 * @param isMappingStart 682 */ 683 private void fetchFlowCollectionStart(boolean isMappingStart) { 684 // '[' and '{' may start a simple key. 685 savePossibleSimpleKey(); 686 687 // Increase the flow level. 688 this.flowLevel++; 689 690 // Simple keys are allowed after '[' and '{'. 691 this.allowSimpleKey = true; 692 693 // Add FLOW-SEQUENCE-START or FLOW-MAPPING-START. 694 Mark startMark = reader.getMark(); 695 reader.forward(1); 696 Mark endMark = reader.getMark(); 697 Token token; 698 if (isMappingStart) { 699 token = new FlowMappingStartToken(startMark, endMark); 700 } else { 701 token = new FlowSequenceStartToken(startMark, endMark); 702 } 703 this.tokens.add(token); 704 } 705 706 private void fetchFlowSequenceEnd() { 707 fetchFlowCollectionEnd(false); 708 } 709 710 private void fetchFlowMappingEnd() { 711 fetchFlowCollectionEnd(true); 712 } 713 714 /** 715 * Fetch a flow-style collection end, which is either a sequence or a 716 * mapping. The type is determined by the given boolean. 717 * 718 * A flow-style collection is in a format similar to JSON. Sequences are 719 * started by '[' and ended by ']'; mappings are started by '{' and ended by 720 * '}'. 721 * 722 * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a> 723 */ 724 private void fetchFlowCollectionEnd(boolean isMappingEnd) { 725 // Reset possible simple key on the current level. 726 removePossibleSimpleKey(); 727 728 // Decrease the flow level. 729 this.flowLevel--; 730 731 // No simple keys after ']' or '}'. 732 this.allowSimpleKey = false; 733 734 // Add FLOW-SEQUENCE-END or FLOW-MAPPING-END. 
735 Mark startMark = reader.getMark(); 736 reader.forward(); 737 Mark endMark = reader.getMark(); 738 Token token; 739 if (isMappingEnd) { 740 token = new FlowMappingEndToken(startMark, endMark); 741 } else { 742 token = new FlowSequenceEndToken(startMark, endMark); 743 } 744 this.tokens.add(token); 745 } 746 747 /** 748 * Fetch an entry in the flow style. Flow-style entries occur either 749 * immediately after the start of a collection, or else after a comma. 750 * 751 * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a> 752 */ 753 private void fetchFlowEntry() { 754 // Simple keys are allowed after ','. 755 this.allowSimpleKey = true; 756 757 // Reset possible simple key on the current level. 758 removePossibleSimpleKey(); 759 760 // Add FLOW-ENTRY. 761 Mark startMark = reader.getMark(); 762 reader.forward(); 763 Mark endMark = reader.getMark(); 764 Token token = new FlowEntryToken(startMark, endMark); 765 this.tokens.add(token); 766 } 767 768 /** 769 * Fetch an entry in the block style. 770 * 771 * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a> 772 */ 773 private void fetchBlockEntry() { 774 // Block context needs additional checks. 775 if (this.flowLevel == 0) { 776 // Are we allowed to start a new entry? 777 if (!this.allowSimpleKey) { 778 throw new ScannerException(null, null, "sequence entries are not allowed here", 779 reader.getMark()); 780 } 781 782 // We may need to add BLOCK-SEQUENCE-START. 783 if (addIndent(this.reader.getColumn())) { 784 Mark mark = reader.getMark(); 785 this.tokens.add(new BlockSequenceStartToken(mark, mark)); 786 } 787 } else { 788 // It's an error for the block entry to occur in the flow 789 // context,but we let the parser detect this. 790 } 791 // Simple keys are allowed after '-'. 792 this.allowSimpleKey = true; 793 794 // Reset possible simple key on the current level. 795 removePossibleSimpleKey(); 796 797 // Add BLOCK-ENTRY. 
798 Mark startMark = reader.getMark(); 799 reader.forward(); 800 Mark endMark = reader.getMark(); 801 Token token = new BlockEntryToken(startMark, endMark); 802 this.tokens.add(token); 803 } 804 805 /** 806 * Fetch a key in a block-style mapping. 807 * 808 * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a> 809 */ 810 private void fetchKey() { 811 // Block context needs additional checks. 812 if (this.flowLevel == 0) { 813 // Are we allowed to start a key (not necessary a simple)? 814 if (!this.allowSimpleKey) { 815 throw new ScannerException(null, null, "mapping keys are not allowed here", 816 reader.getMark()); 817 } 818 // We may need to add BLOCK-MAPPING-START. 819 if (addIndent(this.reader.getColumn())) { 820 Mark mark = reader.getMark(); 821 this.tokens.add(new BlockMappingStartToken(mark, mark)); 822 } 823 } 824 // Simple keys are allowed after '?' in the block context. 825 this.allowSimpleKey = this.flowLevel == 0; 826 827 // Reset possible simple key on the current level. 828 removePossibleSimpleKey(); 829 830 // Add KEY. 831 Mark startMark = reader.getMark(); 832 reader.forward(); 833 Mark endMark = reader.getMark(); 834 Token token = new KeyToken(startMark, endMark); 835 this.tokens.add(token); 836 } 837 838 /** 839 * Fetch a value in a block-style mapping. 840 * 841 * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a> 842 */ 843 private void fetchValue() { 844 // Do we determine a simple key? 845 SimpleKey key = this.possibleSimpleKeys.remove(this.flowLevel); 846 if (key != null) { 847 // Add KEY. 848 this.tokens.add(key.getTokenNumber() - this.tokensTaken, new KeyToken(key.getMark(), 849 key.getMark())); 850 851 // If this key starts a new block mapping, we need to add 852 // BLOCK-MAPPING-START. 
853 if (this.flowLevel == 0) { 854 if (addIndent(key.getColumn())) { 855 this.tokens.add(key.getTokenNumber() - this.tokensTaken, 856 new BlockMappingStartToken(key.getMark(), key.getMark())); 857 } 858 } 859 // There cannot be two simple keys one after another. 860 this.allowSimpleKey = false; 861 862 } else { 863 // It must be a part of a complex key. 864 // Block context needs additional checks. Do we really need them? 865 // They will be caught by the parser anyway. 866 if (this.flowLevel == 0) { 867 868 // We are allowed to start a complex value if and only if we can 869 // start a simple key. 870 if (!this.allowSimpleKey) { 871 throw new ScannerException(null, null, "mapping values are not allowed here", 872 reader.getMark()); 873 } 874 } 875 876 // If this value starts a new block mapping, we need to add 877 // BLOCK-MAPPING-START. It will be detected as an error later by 878 // the parser. 879 if (flowLevel == 0) { 880 if (addIndent(reader.getColumn())) { 881 Mark mark = reader.getMark(); 882 this.tokens.add(new BlockMappingStartToken(mark, mark)); 883 } 884 } 885 886 // Simple keys are allowed after ':' in the block context. 887 allowSimpleKey = flowLevel == 0; 888 889 // Reset possible simple key on the current level. 890 removePossibleSimpleKey(); 891 } 892 // Add VALUE. 893 Mark startMark = reader.getMark(); 894 reader.forward(); 895 Mark endMark = reader.getMark(); 896 Token token = new ValueToken(startMark, endMark); 897 this.tokens.add(token); 898 } 899 900 /** 901 * Fetch an alias, which is a reference to an anchor. Aliases take the 902 * format: 903 * 904 * <pre> 905 * *(anchor name) 906 * </pre> 907 * 908 * @see <a href="http://www.yaml.org/spec/1.1/#id863390"></a> 909 */ 910 private void fetchAlias() { 911 // ALIAS could be a simple key. 912 savePossibleSimpleKey(); 913 914 // No simple keys after ALIAS. 915 this.allowSimpleKey = false; 916 917 // Scan and add ALIAS. 
918 Token tok = scanAnchor(false); 919 this.tokens.add(tok); 920 } 921 922 /** 923 * Fetch an anchor. Anchors take the form: 924 * 925 * <pre> 926 * &(anchor name) 927 * </pre> 928 * 929 * @see <a href="http://www.yaml.org/spec/1.1/#id863390"></a> 930 */ 931 private void fetchAnchor() { 932 // ANCHOR could start a simple key. 933 savePossibleSimpleKey(); 934 935 // No simple keys after ANCHOR. 936 this.allowSimpleKey = false; 937 938 // Scan and add ANCHOR. 939 Token tok = scanAnchor(true); 940 this.tokens.add(tok); 941 } 942 943 /** 944 * Fetch a tag. Tags take a complex form. 945 * 946 * @see <a href="http://www.yaml.org/spec/1.1/#id861700"></a> 947 */ 948 private void fetchTag() { 949 // TAG could start a simple key. 950 savePossibleSimpleKey(); 951 952 // No simple keys after TAG. 953 this.allowSimpleKey = false; 954 955 // Scan and add TAG. 956 Token tok = scanTag(); 957 this.tokens.add(tok); 958 } 959 960 /** 961 * Fetch a literal scalar, denoted with a vertical-bar. This is the type 962 * best used for source code and other content, such as binary data, which 963 * must be included verbatim. 964 * 965 * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a> 966 */ 967 private void fetchLiteral() { 968 fetchBlockScalar('|'); 969 } 970 971 /** 972 * Fetch a folded scalar, denoted with a greater-than sign. This is the type 973 * best used for long content, such as the text of a chapter or description. 974 * 975 * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a> 976 */ 977 private void fetchFolded() { 978 fetchBlockScalar('>'); 979 } 980 981 /** 982 * Fetch a block scalar (literal or folded). 983 * 984 * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a> 985 * 986 * @param style 987 */ 988 private void fetchBlockScalar(char style) { 989 // A simple key may follow a block scalar. 990 this.allowSimpleKey = true; 991 992 // Reset possible simple key on the current level. 993 removePossibleSimpleKey(); 994 995 // Scan and add SCALAR. 
996 Token tok = scanBlockScalar(style); 997 this.tokens.add(tok); 998 } 999 1000 /** 1001 * Fetch a single-quoted (') scalar. 1002 */ 1003 private void fetchSingle() { 1004 fetchFlowScalar('\''); 1005 } 1006 1007 /** 1008 * Fetch a double-quoted (") scalar. 1009 */ 1010 private void fetchDouble() { 1011 fetchFlowScalar('"'); 1012 } 1013 1014 /** 1015 * Fetch a flow scalar (single- or double-quoted). 1016 * 1017 * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a> 1018 * 1019 * @param style 1020 */ 1021 private void fetchFlowScalar(char style) { 1022 // A flow scalar could be a simple key. 1023 savePossibleSimpleKey(); 1024 1025 // No simple keys after flow scalars. 1026 this.allowSimpleKey = false; 1027 1028 // Scan and add SCALAR. 1029 Token tok = scanFlowScalar(style); 1030 this.tokens.add(tok); 1031 } 1032 1033 /** 1034 * Fetch a plain scalar. 1035 */ 1036 private void fetchPlain() { 1037 // A plain scalar could be a simple key. 1038 savePossibleSimpleKey(); 1039 1040 // No simple keys after plain scalars. But note that `scan_plain` will 1041 // change this flag if the scan is finished at the beginning of the 1042 // line. 1043 this.allowSimpleKey = false; 1044 1045 // Scan and add SCALAR. May change `allow_simple_key`. 1046 Token tok = scanPlain(); 1047 this.tokens.add(tok); 1048 } 1049 1050 // Checkers. 1051 /** 1052 * Returns true if the next thing on the reader is a directive, given that 1053 * the leading '%' has already been checked. 1054 * 1055 * @see <a href="http://www.yaml.org/spec/1.1/#id864824"></a> 1056 */ 1057 private boolean checkDirective() { 1058 // DIRECTIVE: ^ '%' ... 1059 // The '%' indicator is already checked. 1060 return reader.getColumn() == 0; 1061 } 1062 1063 /** 1064 * Returns true if the next thing on the reader is a document-start ("---"). 1065 * A document-start is always followed immediately by a new line. 
1066 */ 1067 private boolean checkDocumentStart() { 1068 // DOCUMENT-START: ^ '---' (' '|'\n') 1069 if (reader.getColumn() == 0) { 1070 if ("---".equals(reader.prefix(3)) && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) { 1071 return true; 1072 } 1073 } 1074 return false; 1075 } 1076 1077 /** 1078 * Returns true if the next thing on the reader is a document-end ("..."). A 1079 * document-end is always followed immediately by a new line. 1080 */ 1081 private boolean checkDocumentEnd() { 1082 // DOCUMENT-END: ^ '...' (' '|'\n') 1083 if (reader.getColumn() == 0) { 1084 if ("...".equals(reader.prefix(3)) && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) { 1085 return true; 1086 } 1087 } 1088 return false; 1089 } 1090 1091 /** 1092 * Returns true if the next thing on the reader is a block token. 1093 */ 1094 private boolean checkBlockEntry() { 1095 // BLOCK-ENTRY: '-' (' '|'\n') 1096 return Constant.NULL_BL_T_LINEBR.has(reader.peek(1)); 1097 } 1098 1099 /** 1100 * Returns true if the next thing on the reader is a key token. 1101 */ 1102 private boolean checkKey() { 1103 // KEY(flow context): '?' 1104 if (this.flowLevel != 0) { 1105 return true; 1106 } else { 1107 // KEY(block context): '?' (' '|'\n') 1108 return Constant.NULL_BL_T_LINEBR.has(reader.peek(1)); 1109 } 1110 } 1111 1112 /** 1113 * Returns true if the next thing on the reader is a value token. 1114 */ 1115 private boolean checkValue() { 1116 // VALUE(flow context): ':' 1117 if (flowLevel != 0) { 1118 return true; 1119 } else { 1120 // VALUE(block context): ':' (' '|'\n') 1121 return Constant.NULL_BL_T_LINEBR.has(reader.peek(1)); 1122 } 1123 } 1124 1125 /** 1126 * Returns true if the next thing on the reader is a plain token. 1127 */ 1128 private boolean checkPlain() { 1129 /** 1130 * <pre> 1131 * A plain scalar may start with any non-space character except: 1132 * '-', '?', ':', ',', '[', ']', '{', '}', 1133 * '#', '&', '*', '!', '|', '>', '\'', '\"', 1134 * '%', '@', '`'. 
1135 * 1136 * It may also start with 1137 * '-', '?', ':' 1138 * if it is followed by a non-space character. 1139 * 1140 * Note that we limit the last rule to the block context (except the 1141 * '-' character) because we want the flow context to be space 1142 * independent. 1143 * </pre> 1144 */ 1145 char ch = reader.peek(); 1146 // If the next char is NOT one of the forbidden chars above or 1147 // whitespace, then this is the start of a plain scalar. 1148 return Constant.NULL_BL_T_LINEBR.hasNo(ch, "-?:,[]{}#&*!|>\'\"%@`") 1149 || (Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(1)) && (ch == '-' || (this.flowLevel == 0 && "?:" 1150 .indexOf(ch) != -1))); 1151 } 1152 1153 // Scanners. 1154 1155 /** 1156 * <pre> 1157 * We ignore spaces, line breaks and comments. 1158 * If we find a line break in the block context, we set the flag 1159 * `allow_simple_key` on. 1160 * The byte order mark is stripped if it's the first character in the 1161 * stream. We do not yet support BOM inside the stream as the 1162 * specification requires. Any such mark will be considered as a part 1163 * of the document. 1164 * TODO: We need to make tab handling rules more sane. A good rule is 1165 * Tabs cannot precede tokens 1166 * BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END, 1167 * KEY(block), VALUE(block), BLOCK-ENTRY 1168 * So the checking code is 1169 * if <TAB>: 1170 * self.allow_simple_keys = False 1171 * We also need to add the check for `allow_simple_keys == True` to 1172 * `unwind_indent` before issuing BLOCK-END. 1173 * Scanners for block, flow, and plain scalars need to be modified. 1174 * </pre> 1175 */ 1176 private void scanToNextToken() { 1177 // If there is a byte order mark (BOM) at the beginning of the stream, 1178 // forward past it. 
1179 if (reader.getIndex() == 0 && reader.peek() == '\uFEFF') { 1180 reader.forward(); 1181 } 1182 boolean found = false; 1183 while (!found) { 1184 int ff = 0; 1185 // Peek ahead until we find the first non-space character, then 1186 // move forward directly to that character. 1187 while (reader.peek(ff) == ' ') { 1188 ff++; 1189 } 1190 if (ff > 0) { 1191 reader.forward(ff); 1192 } 1193 // If the character we have skipped forward to is a comment (#), 1194 // then peek ahead until we find the next end of line. YAML 1195 // comments are from a # to the next new-line. We then forward 1196 // past the comment. 1197 if (reader.peek() == '#') { 1198 ff = 0; 1199 while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) { 1200 ff++; 1201 } 1202 if (ff > 0) { 1203 reader.forward(ff); 1204 } 1205 } 1206 // If we scanned a line break, then (depending on flow level), 1207 // simple keys may be allowed. 1208 if (scanLineBreak().length() != 0) {// found a line-break 1209 if (this.flowLevel == 0) { 1210 // Simple keys are allowed at flow-level 0 after a line 1211 // break 1212 this.allowSimpleKey = true; 1213 } 1214 } else { 1215 found = true; 1216 } 1217 } 1218 } 1219 1220 @SuppressWarnings({ "unchecked", "rawtypes" }) 1221 private Token scanDirective() { 1222 // See the specification for details. 
1223 Mark startMark = reader.getMark(); 1224 Mark endMark; 1225 reader.forward(); 1226 String name = scanDirectiveName(startMark); 1227 List<?> value = null; 1228 if ("YAML".equals(name)) { 1229 value = scanYamlDirectiveValue(startMark); 1230 endMark = reader.getMark(); 1231 } else if ("TAG".equals(name)) { 1232 value = scanTagDirectiveValue(startMark); 1233 endMark = reader.getMark(); 1234 } else { 1235 endMark = reader.getMark(); 1236 int ff = 0; 1237 while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) { 1238 ff++; 1239 } 1240 if (ff > 0) { 1241 reader.forward(ff); 1242 } 1243 } 1244 scanDirectiveIgnoredLine(startMark); 1245 return new DirectiveToken(name, value, startMark, endMark); 1246 } 1247 1248 /** 1249 * Scan a directive name. Directive names are a series of non-space 1250 * characters. 1251 * 1252 * @see <a href="http://www.yaml.org/spec/1.1/#id895217"></a> 1253 */ 1254 private String scanDirectiveName(Mark startMark) { 1255 // See the specification for details. 1256 int length = 0; 1257 // A Directive-name is a sequence of alphanumeric characters 1258 // (a-z,A-Z,0-9). We scan until we find something that isn't. 1259 // FIXME this disagrees with the specification. 1260 char ch = reader.peek(length); 1261 while (Constant.ALPHA.has(ch)) { 1262 length++; 1263 ch = reader.peek(length); 1264 } 1265 // If the name would be empty, an error occurs. 
1266 if (length == 0) { 1267 throw new ScannerException("while scanning a directive", startMark, 1268 "expected alphabetic or numeric character, but found " + ch + "(" + ((int) ch) 1269 + ")", reader.getMark()); 1270 } 1271 String value = reader.prefixForward(length); 1272 ch = reader.peek(); 1273 if (Constant.NULL_BL_LINEBR.hasNo(ch)) { 1274 throw new ScannerException("while scanning a directive", startMark, 1275 "expected alphabetic or numeric character, but found " + ch + "(" + ((int) ch) 1276 + ")", reader.getMark()); 1277 } 1278 return value; 1279 } 1280 1281 private List<Integer> scanYamlDirectiveValue(Mark startMark) { 1282 // See the specification for details. 1283 while (reader.peek() == ' ') { 1284 reader.forward(); 1285 } 1286 Integer major = scanYamlDirectiveNumber(startMark); 1287 if (reader.peek() != '.') { 1288 throw new ScannerException("while scanning a directive", startMark, 1289 "expected a digit or '.', but found " + reader.peek() + "(" 1290 + ((int) reader.peek()) + ")", reader.getMark()); 1291 } 1292 reader.forward(); 1293 Integer minor = scanYamlDirectiveNumber(startMark); 1294 if (Constant.NULL_BL_LINEBR.hasNo(reader.peek())) { 1295 throw new ScannerException("while scanning a directive", startMark, 1296 "expected a digit or ' ', but found " + reader.peek() + "(" 1297 + ((int) reader.peek()) + ")", reader.getMark()); 1298 } 1299 List<Integer> result = new ArrayList<Integer>(2); 1300 result.add(major); 1301 result.add(minor); 1302 return result; 1303 } 1304 1305 /** 1306 * Read a %YAML directive number: this is either the major or the minor 1307 * part. Stop reading at a non-digit character (usually either '.' or '\n'). 1308 * 1309 * @see <a href="http://www.yaml.org/spec/1.1/#id895631"></a> 1310 * @see <a href="http://www.yaml.org/spec/1.1/#ns-dec-digit"></a> 1311 */ 1312 private Integer scanYamlDirectiveNumber(Mark startMark) { 1313 // See the specification for details. 
1314 char ch = reader.peek(); 1315 if (!Character.isDigit(ch)) { 1316 throw new ScannerException("while scanning a directive", startMark, 1317 "expected a digit, but found " + ch + "(" + ((int) ch) + ")", reader.getMark()); 1318 } 1319 int length = 0; 1320 while (Character.isDigit(reader.peek(length))) { 1321 length++; 1322 } 1323 Integer value = Integer.parseInt(reader.prefixForward(length)); 1324 return value; 1325 } 1326 1327 /** 1328 * <p> 1329 * Read a %TAG directive value: 1330 * 1331 * <pre> 1332 * s-ignored-space+ c-tag-handle s-ignored-space+ ns-tag-prefix s-l-comments 1333 * </pre> 1334 * 1335 * </p> 1336 * 1337 * @see <a href="http://www.yaml.org/spec/1.1/#id896044"></a> 1338 */ 1339 private List<String> scanTagDirectiveValue(Mark startMark) { 1340 // See the specification for details. 1341 while (reader.peek() == ' ') { 1342 reader.forward(); 1343 } 1344 String handle = scanTagDirectiveHandle(startMark); 1345 while (reader.peek() == ' ') { 1346 reader.forward(); 1347 } 1348 String prefix = scanTagDirectivePrefix(startMark); 1349 List<String> result = new ArrayList<String>(2); 1350 result.add(handle); 1351 result.add(prefix); 1352 return result; 1353 } 1354 1355 /** 1356 * Scan a %TAG directive's handle. This is YAML's c-tag-handle. 1357 * 1358 * @see <a href="http://www.yaml.org/spec/1.1/#id896876"></a> 1359 * @param startMark 1360 * @return 1361 */ 1362 private String scanTagDirectiveHandle(Mark startMark) { 1363 // See the specification for details. 1364 String value = scanTagHandle("directive", startMark); 1365 char ch = reader.peek(); 1366 if (ch != ' ') { 1367 throw new ScannerException("while scanning a directive", startMark, 1368 "expected ' ', but found " + reader.peek() + "(" + ch + ")", reader.getMark()); 1369 } 1370 return value; 1371 } 1372 1373 /** 1374 * Scan a %TAG directive's prefix. This is YAML's ns-tag-prefix. 
1375 * 1376 * @see <a href="http://www.yaml.org/spec/1.1/#ns-tag-prefix"></a> 1377 */ 1378 private String scanTagDirectivePrefix(Mark startMark) { 1379 // See the specification for details. 1380 String value = scanTagUri("directive", startMark); 1381 if (Constant.NULL_BL_LINEBR.hasNo(reader.peek())) { 1382 throw new ScannerException("while scanning a directive", startMark, 1383 "expected ' ', but found " + reader.peek() + "(" + ((int) reader.peek()) + ")", 1384 reader.getMark()); 1385 } 1386 return value; 1387 } 1388 1389 private String scanDirectiveIgnoredLine(Mark startMark) { 1390 // See the specification for details. 1391 int ff = 0; 1392 while (reader.peek(ff) == ' ') { 1393 ff++; 1394 } 1395 if (ff > 0) { 1396 reader.forward(ff); 1397 } 1398 if (reader.peek() == '#') { 1399 ff = 0; 1400 while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) { 1401 ff++; 1402 } 1403 reader.forward(ff); 1404 } 1405 char ch = reader.peek(); 1406 String lineBreak = scanLineBreak(); 1407 if (lineBreak.length() == 0 && ch != '\0') { 1408 throw new ScannerException("while scanning a directive", startMark, 1409 "expected a comment or a line break, but found " + ch + "(" + ((int) ch) + ")", 1410 reader.getMark()); 1411 } 1412 return lineBreak; 1413 } 1414 1415 /** 1416 * <pre> 1417 * The specification does not restrict characters for anchors and 1418 * aliases. This may lead to problems, for instance, the document: 1419 * [ *alias, value ] 1420 * can be interpreted in two ways, as 1421 * [ "value" ] 1422 * and 1423 * [ *alias , "value" ] 1424 * Therefore we restrict aliases to numbers and ASCII letters. 1425 * </pre> 1426 */ 1427 private Token scanAnchor(boolean isAnchor) { 1428 Mark startMark = reader.getMark(); 1429 char indicator = reader.peek(); 1430 String name = indicator == '*' ? 
"alias" : "anchor"; 1431 reader.forward(); 1432 int length = 0; 1433 char ch = reader.peek(length); 1434 while (Constant.ALPHA.has(ch)) { 1435 length++; 1436 ch = reader.peek(length); 1437 } 1438 if (length == 0) { 1439 throw new ScannerException("while scanning an " + name, startMark, 1440 "expected alphabetic or numeric character, but found " + ch, 1441 reader.getMark()); 1442 } 1443 String value = reader.prefixForward(length); 1444 ch = reader.peek(); 1445 if (Constant.NULL_BL_T_LINEBR.hasNo(ch, "?:,]}%@`")) { 1446 throw new ScannerException("while scanning an " + name, startMark, 1447 "expected alphabetic or numeric character, but found " + ch + "(" 1448 + ((int) reader.peek()) + ")", reader.getMark()); 1449 } 1450 Mark endMark = reader.getMark(); 1451 Token tok; 1452 if (isAnchor) { 1453 tok = new AnchorToken(value, startMark, endMark); 1454 } else { 1455 tok = new AliasToken(value, startMark, endMark); 1456 } 1457 return tok; 1458 } 1459 1460 /** 1461 * <p> 1462 * Scan a Tag property. A Tag property may be specified in one of three 1463 * ways: c-verbatim-tag, c-ns-shorthand-tag, or c-ns-non-specific-tag 1464 * </p> 1465 * 1466 * <p> 1467 * c-verbatim-tag takes the form !<ns-uri-char+> and must be delivered 1468 * verbatim (as-is) to the application. In particular, verbatim tags are not 1469 * subject to tag resolution. 1470 * </p> 1471 * 1472 * <p> 1473 * c-ns-shorthand-tag is a valid tag handle followed by a non-empty suffix. 1474 * If the tag handle is a c-primary-tag-handle ('!') then the suffix must 1475 * have all exclamation marks properly URI-escaped (%21); otherwise, the 1476 * string will look like a named tag handle: !foo!bar would be interpreted 1477 * as (handle="!foo!", suffix="bar"). 1478 * </p> 1479 * 1480 * <p> 1481 * c-ns-non-specific-tag is always a lone '!'; this is only useful for plain 1482 * scalars, where its specification means that the scalar MUST be resolved 1483 * to have type tag:yaml.org,2002:str. 
1484 * </p> 1485 * 1486 * TODO SnakeYaml incorrectly ignores c-ns-non-specific-tag right now. 1487 * 1488 * @see <a href="http://www.yaml.org/spec/1.1/#id900262"></a> 1489 * 1490 * TODO Note that this method does not enforce rules about local versus 1491 * global tags! 1492 */ 1493 private Token scanTag() { 1494 // See the specification for details. 1495 Mark startMark = reader.getMark(); 1496 // Determine the type of tag property based on the first character 1497 // encountered 1498 char ch = reader.peek(1); 1499 String handle = null; 1500 String suffix = null; 1501 // Verbatim tag! (c-verbatim-tag) 1502 if (ch == '<') { 1503 // Skip the exclamation mark and >, then read the tag suffix (as 1504 // a URI). 1505 reader.forward(2); 1506 suffix = scanTagUri("tag", startMark); 1507 if (reader.peek() != '>') { 1508 // If there are any characters between the end of the tag-suffix 1509 // URI and the closing >, then an error has occurred. 1510 throw new ScannerException("while scanning a tag", startMark, 1511 "expected '>', but found '" + reader.peek() + "' (" + ((int) reader.peek()) 1512 + ")", reader.getMark()); 1513 } 1514 reader.forward(); 1515 } else if (Constant.NULL_BL_T_LINEBR.has(ch)) { 1516 // A NUL, blank, tab, or line-break means that this was a 1517 // c-ns-non-specific tag. 1518 suffix = "!"; 1519 reader.forward(); 1520 } else { 1521 // Any other character implies c-ns-shorthand-tag type. 1522 1523 // Look ahead in the stream to determine whether this tag property 1524 // is of the form !foo or !foo!bar. 1525 int length = 1; 1526 boolean useHandle = false; 1527 while (Constant.NULL_BL_LINEBR.hasNo(ch)) { 1528 if (ch == '!') { 1529 useHandle = true; 1530 break; 1531 } 1532 length++; 1533 ch = reader.peek(length); 1534 } 1535 handle = "!"; 1536 // If we need to use a handle, scan it in; otherwise, the handle is 1537 // presumed to be '!'. 
1538 if (useHandle) { 1539 handle = scanTagHandle("tag", startMark); 1540 } else { 1541 handle = "!"; 1542 reader.forward(); 1543 } 1544 suffix = scanTagUri("tag", startMark); 1545 } 1546 ch = reader.peek(); 1547 // Check that the next character is allowed to follow a tag-property; 1548 // if it is not, raise the error. 1549 if (Constant.NULL_BL_LINEBR.hasNo(ch)) { 1550 throw new ScannerException("while scanning a tag", startMark, 1551 "expected ' ', but found '" + ch + "' (" + ((int) ch) + ")", reader.getMark()); 1552 } 1553 TagTuple value = new TagTuple(handle, suffix); 1554 Mark endMark = reader.getMark(); 1555 return new TagToken(value, startMark, endMark); 1556 } 1557 1558 private Token scanBlockScalar(char style) { 1559 // See the specification for details. 1560 boolean folded; 1561 // Depending on the given style, we determine whether the scalar is 1562 // folded ('>') or literal ('|') 1563 if (style == '>') { 1564 folded = true; 1565 } else { 1566 folded = false; 1567 } 1568 StringBuilder chunks = new StringBuilder(); 1569 Mark startMark = reader.getMark(); 1570 // Scan the header. 1571 reader.forward(); 1572 Chomping chompi = scanBlockScalarIndicators(startMark); 1573 int increment = chompi.getIncrement(); 1574 scanBlockScalarIgnoredLine(startMark); 1575 1576 // Determine the indentation level and go to the first non-empty line. 
1577 int minIndent = this.indent + 1; 1578 if (minIndent < 1) { 1579 minIndent = 1; 1580 } 1581 String breaks = null; 1582 int maxIndent = 0; 1583 int indent = 0; 1584 Mark endMark; 1585 if (increment == -1) { 1586 Object[] brme = scanBlockScalarIndentation(); 1587 breaks = (String) brme[0]; 1588 maxIndent = ((Integer) brme[1]).intValue(); 1589 endMark = (Mark) brme[2]; 1590 indent = Math.max(minIndent, maxIndent); 1591 } else { 1592 indent = minIndent + increment - 1; 1593 Object[] brme = scanBlockScalarBreaks(indent); 1594 breaks = (String) brme[0]; 1595 endMark = (Mark) brme[1]; 1596 } 1597 1598 String lineBreak = ""; 1599 1600 // Scan the inner part of the block scalar. 1601 while (this.reader.getColumn() == indent && reader.peek() != '\0') { 1602 chunks.append(breaks); 1603 boolean leadingNonSpace = " \t".indexOf(reader.peek()) == -1; 1604 int length = 0; 1605 while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(length))) { 1606 length++; 1607 } 1608 chunks.append(reader.prefixForward(length)); 1609 lineBreak = scanLineBreak(); 1610 Object[] brme = scanBlockScalarBreaks(indent); 1611 breaks = (String) brme[0]; 1612 endMark = (Mark) brme[1]; 1613 if (this.reader.getColumn() == indent && reader.peek() != '\0') { 1614 1615 // Unfortunately, folding rules are ambiguous. 1616 // 1617 // This is the folding according to the specification: 1618 if (folded && "\n".equals(lineBreak) && leadingNonSpace 1619 && " \t".indexOf(reader.peek()) == -1) { 1620 if (breaks.length() == 0) { 1621 chunks.append(" "); 1622 } 1623 } else { 1624 chunks.append(lineBreak); 1625 } 1626 // Clark Evans's interpretation (also in the spec examples) not 1627 // imported from PyYAML 1628 } else { 1629 break; 1630 } 1631 } 1632 // Chomp the tail. 1633 if (chompi.chompTailIsNotFalse()) { 1634 chunks.append(lineBreak); 1635 } 1636 if (chompi.chompTailIsTrue()) { 1637 chunks.append(breaks); 1638 } 1639 // We are done. 
1640 return new ScalarToken(chunks.toString(), false, startMark, endMark, style); 1641 } 1642 1643 /** 1644 * Scan a block scalar indicator. The block scalar indicator includes two 1645 * optional components, which may appear in either order. 1646 * 1647 * A block indentation indicator is a non-zero digit describing the 1648 * indentation level of the block scalar to follow. This indentation is an 1649 * additional number of spaces relative to the current indentation level. 1650 * 1651 * A block chomping indicator is a + or -, selecting the chomping mode away 1652 * from the default (clip) to either -(strip) or +(keep). 1653 * 1654 * @see <a href="http://www.yaml.org/spec/1.1/#id868988"></a> 1655 * @see <a href="http://www.yaml.org/spec/1.1/#id927035"></a> 1656 * @see <a href="http://www.yaml.org/spec/1.1/#id927557"></a> 1657 */ 1658 private Chomping scanBlockScalarIndicators(Mark startMark) { 1659 // See the specification for details. 1660 Boolean chomping = null; 1661 int increment = -1; 1662 char ch = reader.peek(); 1663 if (ch == '-' || ch == '+') { 1664 if (ch == '+') { 1665 chomping = Boolean.TRUE; 1666 } else { 1667 chomping = Boolean.FALSE; 1668 } 1669 reader.forward(); 1670 ch = reader.peek(); 1671 if (Character.isDigit(ch)) { 1672 increment = Integer.parseInt(String.valueOf(ch)); 1673 if (increment == 0) { 1674 throw new ScannerException("while scanning a block scalar", startMark, 1675 "expected indentation indicator in the range 1-9, but found 0", 1676 reader.getMark()); 1677 } 1678 reader.forward(); 1679 } 1680 } else if (Character.isDigit(ch)) { 1681 increment = Integer.parseInt(String.valueOf(ch)); 1682 if (increment == 0) { 1683 throw new ScannerException("while scanning a block scalar", startMark, 1684 "expected indentation indicator in the range 1-9, but found 0", 1685 reader.getMark()); 1686 } 1687 reader.forward(); 1688 ch = reader.peek(); 1689 if (ch == '-' || ch == '+') { 1690 if (ch == '+') { 1691 chomping = Boolean.TRUE; 1692 } else { 1693 
chomping = Boolean.FALSE; 1694 } 1695 reader.forward(); 1696 } 1697 } 1698 ch = reader.peek(); 1699 if (Constant.NULL_BL_LINEBR.hasNo(ch)) { 1700 throw new ScannerException("while scanning a block scalar", startMark, 1701 "expected chomping or indentation indicators, but found " + ch, 1702 reader.getMark()); 1703 } 1704 return new Chomping(chomping, increment); 1705 } 1706 1707 /** 1708 * Scan to the end of the line after a block scalar has been scanned; the 1709 * only things that are permitted at this time are comments and spaces. 1710 */ 1711 private String scanBlockScalarIgnoredLine(Mark startMark) { 1712 // See the specification for details. 1713 int ff = 0; 1714 // Forward past any number of trailing spaces 1715 while (reader.peek(ff) == ' ') { 1716 ff++; 1717 } 1718 if (ff > 0) { 1719 reader.forward(ff); 1720 } 1721 // If a comment occurs, scan to just before the end of line. 1722 if (reader.peek() == '#') { 1723 ff = 0; 1724 while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) { 1725 ff++; 1726 } 1727 if (ff > 0) { 1728 reader.forward(ff); 1729 } 1730 } 1731 // If the next character is not a null or line break, an error has 1732 // occurred. 1733 char ch = reader.peek(); 1734 String lineBreak = scanLineBreak(); 1735 if (lineBreak.length() == 0 && ch != '\0') { 1736 throw new ScannerException("while scanning a block scalar", startMark, 1737 "expected a comment or a line break, but found " + ch, reader.getMark()); 1738 } 1739 return lineBreak; 1740 } 1741 1742 /** 1743 * Scans for the indentation of a block scalar implicitly. This mechanism is 1744 * used only if the block did not explicitly state an indentation to be 1745 * used. 1746 * 1747 * @see <a href="http://www.yaml.org/spec/1.1/#id927035"></a> 1748 */ 1749 private Object[] scanBlockScalarIndentation() { 1750 // See the specification for details. 
1751 StringBuilder chunks = new StringBuilder(); 1752 int maxIndent = 0; 1753 Mark endMark = reader.getMark(); 1754 // Look ahead some number of lines until the first non-blank character 1755 // occurs; the determined indentation will be the maximum number of 1756 // leading spaces on any of these lines. 1757 while (Constant.LINEBR.has(reader.peek(), " \r")) { 1758 if (reader.peek() != ' ') { 1759 // If the character isn't a space, it must be some kind of 1760 // line-break; scan the line break and track it. 1761 chunks.append(scanLineBreak()); 1762 endMark = reader.getMark(); 1763 } else { 1764 // If the character is a space, move forward to the next 1765 // character; if we surpass our previous maximum for indent 1766 // level, update that too. 1767 reader.forward(); 1768 if (this.reader.getColumn() > maxIndent) { 1769 maxIndent = reader.getColumn(); 1770 } 1771 } 1772 } 1773 // Pass several results back together. 1774 return new Object[] { chunks.toString(), maxIndent, endMark }; 1775 } 1776 1777 private Object[] scanBlockScalarBreaks(int indent) { 1778 // See the specification for details. 1779 StringBuilder chunks = new StringBuilder(); 1780 Mark endMark = reader.getMark(); 1781 int ff = 0; 1782 int col = this.reader.getColumn(); 1783 // Scan for up to the expected indentation-level of spaces, then move 1784 // forward past that amount. 1785 while (col < indent && reader.peek(ff) == ' ') { 1786 ff++; 1787 col++; 1788 } 1789 if (ff > 0) { 1790 reader.forward(ff); 1791 } 1792 // Consume one or more line breaks followed by any amount of spaces, 1793 // until we find something that isn't a line-break. 1794 String lineBreak = null; 1795 while ((lineBreak = scanLineBreak()).length() != 0) { 1796 chunks.append(lineBreak); 1797 endMark = reader.getMark(); 1798 // Scan past up to (indent) spaces on the next line, then forward 1799 // past them. 
1800 ff = 0; 1801 col = this.reader.getColumn(); 1802 while (col < indent && reader.peek(ff) == ' ') { 1803 ff++; 1804 col++; 1805 } 1806 if (ff > 0) { 1807 reader.forward(ff); 1808 } 1809 } 1810 // Return both the assembled intervening string and the end-mark. 1811 return new Object[] { chunks.toString(), endMark }; 1812 } 1813 1814 /** 1815 * Scan a flow-style scalar. Flow scalars are presented in one of two forms; 1816 * first, a flow scalar may be a double-quoted string; second, a flow scalar 1817 * may be a single-quoted string. 1818 * 1819 * @see <a href="http://www.yaml.org/spec/1.1/#flow"></a> style/syntax 1820 * 1821 * <pre> 1822 * See the specification for details. 1823 * Note that we loose indentation rules for quoted scalars. Quoted 1824 * scalars don't need to adhere indentation because " and ' clearly 1825 * mark the beginning and the end of them. Therefore we are less 1826 * restrictive then the specification requires. We only need to check 1827 * that document separators are not included in scalars. 1828 * </pre> 1829 */ 1830 private Token scanFlowScalar(char style) { 1831 boolean _double; 1832 // The style will be either single- or double-quoted; we determine this 1833 // by the first character in the entry (supplied) 1834 if (style == '"') { 1835 _double = true; 1836 } else { 1837 _double = false; 1838 } 1839 StringBuilder chunks = new StringBuilder(); 1840 Mark startMark = reader.getMark(); 1841 char quote = reader.peek(); 1842 reader.forward(); 1843 chunks.append(scanFlowScalarNonSpaces(_double, startMark)); 1844 while (reader.peek() != quote) { 1845 chunks.append(scanFlowScalarSpaces(startMark)); 1846 chunks.append(scanFlowScalarNonSpaces(_double, startMark)); 1847 } 1848 reader.forward(); 1849 Mark endMark = reader.getMark(); 1850 return new ScalarToken(chunks.toString(), false, startMark, endMark, style); 1851 } 1852 1853 /** 1854 * Scan some number of flow-scalar non-space characters. 
1855 */ 1856 private String scanFlowScalarNonSpaces(boolean doubleQuoted, Mark startMark) { 1857 // See the specification for details. 1858 StringBuilder chunks = new StringBuilder(); 1859 while (true) { 1860 // Scan through any number of characters which are not: NUL, blank, 1861 // tabs, line breaks, single-quotes, double-quotes, or backslashes. 1862 int length = 0; 1863 while (Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(length), "\'\"\\")) { 1864 length++; 1865 } 1866 if (length != 0) { 1867 chunks.append(reader.prefixForward(length)); 1868 } 1869 // Depending on our quoting-type, the characters ', " and \ have 1870 // differing meanings. 1871 char ch = reader.peek(); 1872 if (!doubleQuoted && ch == '\'' && reader.peek(1) == '\'') { 1873 chunks.append("'"); 1874 reader.forward(2); 1875 } else if ((doubleQuoted && ch == '\'') || (!doubleQuoted && "\"\\".indexOf(ch) != -1)) { 1876 chunks.append(ch); 1877 reader.forward(); 1878 } else if (doubleQuoted && ch == '\\') { 1879 reader.forward(); 1880 ch = reader.peek(); 1881 if (ESCAPE_REPLACEMENTS.containsKey(Character.valueOf(ch))) { 1882 // The character is one of the single-replacement 1883 // types; these are replaced with a literal character 1884 // from the mapping. 1885 chunks.append(ESCAPE_REPLACEMENTS.get(Character.valueOf(ch))); 1886 reader.forward(); 1887 } else if (ESCAPE_CODES.containsKey(Character.valueOf(ch))) { 1888 // The character is a multi-digit escape sequence, with 1889 // length defined by the value in the ESCAPE_CODES map. 
1890 length = ESCAPE_CODES.get(Character.valueOf(ch)).intValue(); 1891 reader.forward(); 1892 String hex = reader.prefix(length); 1893 if (NOT_HEXA.matcher(hex).find()) { 1894 throw new ScannerException("while scanning a double-quoted scalar", 1895 startMark, "expected escape sequence of " + length 1896 + " hexadecimal numbers, but found: " + hex, 1897 reader.getMark()); 1898 } 1899 int decimal = Integer.parseInt(hex, 16); 1900 String unicode = new String(Character.toChars(decimal)); 1901 chunks.append(unicode); 1902 reader.forward(length); 1903 } else if (scanLineBreak().length() != 0) { 1904 chunks.append(scanFlowScalarBreaks(startMark)); 1905 } else { 1906 throw new ScannerException("while scanning a double-quoted scalar", startMark, 1907 "found unknown escape character " + ch + "(" + ((int) ch) + ")", 1908 reader.getMark()); 1909 } 1910 } else { 1911 return chunks.toString(); 1912 } 1913 } 1914 } 1915 1916 private String scanFlowScalarSpaces(Mark startMark) { 1917 // See the specification for details. 1918 StringBuilder chunks = new StringBuilder(); 1919 int length = 0; 1920 // Scan through any number of whitespace (space, tab) characters, 1921 // consuming them. 1922 while (" \t".indexOf(reader.peek(length)) != -1) { 1923 length++; 1924 } 1925 String whitespaces = reader.prefixForward(length); 1926 char ch = reader.peek(); 1927 if (ch == '\0') { 1928 // A flow scalar cannot end with an end-of-stream 1929 throw new ScannerException("while scanning a quoted scalar", startMark, 1930 "found unexpected end of stream", reader.getMark()); 1931 } 1932 // If we encounter a line break, scan it into our assembled string... 
1933 String lineBreak = scanLineBreak(); 1934 if (lineBreak.length() != 0) { 1935 String breaks = scanFlowScalarBreaks(startMark); 1936 if (!"\n".equals(lineBreak)) { 1937 chunks.append(lineBreak); 1938 } else if (breaks.length() == 0) { 1939 chunks.append(" "); 1940 } 1941 chunks.append(breaks); 1942 } else { 1943 chunks.append(whitespaces); 1944 } 1945 return chunks.toString(); 1946 } 1947 1948 private String scanFlowScalarBreaks(Mark startMark) { 1949 // See the specification for details. 1950 StringBuilder chunks = new StringBuilder(); 1951 while (true) { 1952 // Instead of checking indentation, we check for document 1953 // separators. 1954 String prefix = reader.prefix(3); 1955 if (("---".equals(prefix) || "...".equals(prefix)) 1956 && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) { 1957 throw new ScannerException("while scanning a quoted scalar", startMark, 1958 "found unexpected document separator", reader.getMark()); 1959 } 1960 // Scan past any number of spaces and tabs, ignoring them 1961 while (" \t".indexOf(reader.peek()) != -1) { 1962 reader.forward(); 1963 } 1964 // If we stopped at a line break, add that; otherwise, return the 1965 // assembled set of scalar breaks. 1966 String lineBreak = scanLineBreak(); 1967 if (lineBreak.length() != 0) { 1968 chunks.append(lineBreak); 1969 } else { 1970 return chunks.toString(); 1971 } 1972 } 1973 } 1974 1975 /** 1976 * Scan a plain scalar. 1977 * 1978 * <pre> 1979 * See the specification for details. 1980 * We add an additional restriction for the flow context: 1981 * plain scalars in the flow context cannot contain ',', ':' and '?'. 1982 * We also keep track of the `allow_simple_key` flag here. 1983 * Indentation rules are loosed for the flow context. 
1984 * </pre> 1985 */ 1986 private Token scanPlain() { 1987 StringBuilder chunks = new StringBuilder(); 1988 Mark startMark = reader.getMark(); 1989 Mark endMark = startMark; 1990 int indent = this.indent + 1; 1991 String spaces = ""; 1992 while (true) { 1993 char ch; 1994 int length = 0; 1995 // A comment indicates the end of the scalar. 1996 if (reader.peek() == '#') { 1997 break; 1998 } 1999 while (true) { 2000 ch = reader.peek(length); 2001 if (Constant.NULL_BL_T_LINEBR.has(ch) 2002 || (this.flowLevel == 0 && ch == ':' && Constant.NULL_BL_T_LINEBR 2003 .has(reader.peek(length + 1))) 2004 || (this.flowLevel != 0 && ",:?[]{}".indexOf(ch) != -1)) { 2005 break; 2006 } 2007 length++; 2008 } 2009 // It's not clear what we should do with ':' in the flow context. 2010 if (this.flowLevel != 0 && ch == ':' 2011 && Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(length + 1), ",[]{}")) { 2012 reader.forward(length); 2013 throw new ScannerException("while scanning a plain scalar", startMark, 2014 "found unexpected ':'", reader.getMark(), 2015 "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details."); 2016 } 2017 if (length == 0) { 2018 break; 2019 } 2020 this.allowSimpleKey = false; 2021 chunks.append(spaces); 2022 chunks.append(reader.prefixForward(length)); 2023 endMark = reader.getMark(); 2024 spaces = scanPlainSpaces(); 2025 // System.out.printf("spaces[%s]\n", spaces); 2026 if (spaces.length() == 0 || reader.peek() == '#' 2027 || (this.flowLevel == 0 && this.reader.getColumn() < indent)) { 2028 break; 2029 } 2030 } 2031 return new ScalarToken(chunks.toString(), startMark, endMark, true); 2032 } 2033 2034 /** 2035 * See the specification for details. 
SnakeYAML and libyaml allow tabs 2036 * inside plain scalar 2037 */ 2038 private String scanPlainSpaces() { 2039 int length = 0; 2040 while (reader.peek(length) == ' ' || reader.peek(length) == '\t') { 2041 length++; 2042 } 2043 String whitespaces = reader.prefixForward(length); 2044 String lineBreak = scanLineBreak(); 2045 if (lineBreak.length() != 0) { 2046 this.allowSimpleKey = true; 2047 String prefix = reader.prefix(3); 2048 if ("---".equals(prefix) || "...".equals(prefix) 2049 && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) { 2050 return ""; 2051 } 2052 StringBuilder breaks = new StringBuilder(); 2053 while (true) { 2054 if (reader.peek() == ' ') { 2055 reader.forward(); 2056 } else { 2057 String lb = scanLineBreak(); 2058 if (lb.length() != 0) { 2059 breaks.append(lb); 2060 prefix = reader.prefix(3); 2061 if ("---".equals(prefix) || "...".equals(prefix) 2062 && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) { 2063 return ""; 2064 } 2065 } else { 2066 break; 2067 } 2068 } 2069 } 2070 if (!"\n".equals(lineBreak)) { 2071 return lineBreak + breaks; 2072 } else if (breaks.length() == 0) { 2073 return " "; 2074 } 2075 return breaks.toString(); 2076 } 2077 return whitespaces; 2078 } 2079 2080 /** 2081 * <p> 2082 * Scan a Tag handle. A Tag handle takes one of three forms: 2083 * 2084 * <pre> 2085 * "!" (c-primary-tag-handle) 2086 * "!!" (ns-secondary-tag-handle) 2087 * "!(name)!" (c-named-tag-handle) 2088 * </pre> 2089 * 2090 * Where (name) must be formatted as an ns-word-char. 2091 * </p> 2092 * 2093 * @see <a href="http://www.yaml.org/spec/1.1/#c-tag-handle"></a> 2094 * @see <a href="http://www.yaml.org/spec/1.1/#ns-word-char"></a> 2095 * 2096 * <pre> 2097 * See the specification for details. 2098 * For some strange reasons, the specification does not allow '_' in 2099 * tag handles. I have allowed it anyway. 
2100 * </pre> 2101 */ 2102 private String scanTagHandle(String name, Mark startMark) { 2103 char ch = reader.peek(); 2104 if (ch != '!') { 2105 throw new ScannerException("while scanning a " + name, startMark, 2106 "expected '!', but found " + ch + "(" + ((int) ch) + ")", reader.getMark()); 2107 } 2108 // Look for the next '!' in the stream, stopping if we hit a 2109 // non-word-character. If the first character is a space, then the 2110 // tag-handle is a c-primary-tag-handle ('!'). 2111 int length = 1; 2112 ch = reader.peek(length); 2113 if (ch != ' ') { 2114 // Scan through 0+ alphabetic characters. 2115 // FIXME According to the specification, these should be 2116 // ns-word-char only, which prohibits '_'. This might be a 2117 // candidate for a configuration option. 2118 while (Constant.ALPHA.has(ch)) { 2119 length++; 2120 ch = reader.peek(length); 2121 } 2122 // Found the next non-word-char. If this is not a space and not an 2123 // '!', then this is an error, as the tag-handle was specified as: 2124 // !(name) or similar; the trailing '!' is missing. 2125 if (ch != '!') { 2126 reader.forward(length); 2127 throw new ScannerException("while scanning a " + name, startMark, 2128 "expected '!', but found " + ch + "(" + ((int) ch) + ")", reader.getMark()); 2129 } 2130 length++; 2131 } 2132 String value = reader.prefixForward(length); 2133 return value; 2134 } 2135 2136 /** 2137 * <p> 2138 * Scan a Tag URI. This scanning is valid for both local and global tag 2139 * directives, because both appear to be valid URIs as far as scanning is 2140 * concerned. The difference may be distinguished later, in parsing. This 2141 * method will scan for ns-uri-char*, which covers both cases. 2142 * </p> 2143 * 2144 * <p> 2145 * This method performs no verification that the scanned URI conforms to any 2146 * particular kind of URI specification. 
2147 * </p> 2148 * 2149 * @see <a href="http://www.yaml.org/spec/1.1/#ns-uri-char"></a> 2150 */ 2151 private String scanTagUri(String name, Mark startMark) { 2152 // See the specification for details. 2153 // Note: we do not check if URI is well-formed. 2154 StringBuilder chunks = new StringBuilder(); 2155 // Scan through accepted URI characters, which includes the standard 2156 // URI characters, plus the start-escape character ('%'). When we get 2157 // to a start-escape, scan the escaped sequence, then return. 2158 int length = 0; 2159 char ch = reader.peek(length); 2160 while (Constant.URI_CHARS.has(ch)) { 2161 if (ch == '%') { 2162 chunks.append(reader.prefixForward(length)); 2163 length = 0; 2164 chunks.append(scanUriEscapes(name, startMark)); 2165 } else { 2166 length++; 2167 } 2168 ch = reader.peek(length); 2169 } 2170 // Consume the last "chunk", which would not otherwise be consumed by 2171 // the loop above. 2172 if (length != 0) { 2173 chunks.append(reader.prefixForward(length)); 2174 length = 0; 2175 } 2176 if (chunks.length() == 0) { 2177 // If no URI was found, an error has occurred. 2178 throw new ScannerException("while scanning a " + name, startMark, 2179 "expected URI, but found " + ch + "(" + ((int) ch) + ")", reader.getMark()); 2180 } 2181 return chunks.toString(); 2182 } 2183 2184 /** 2185 * <p> 2186 * Scan a sequence of %-escaped URI escape codes and convert them into a 2187 * String representing the unescaped values. 2188 * </p> 2189 * 2190 * FIXME This method fails for more than 256 bytes' worth of URI-encoded 2191 * characters in a row. Is this possible? Is this a use-case? 2192 * 2193 * @see <a href="http://www.ietf.org/rfc/rfc2396.txt"></a>, section 2.4, Escaped Encoding. 2194 */ 2195 private String scanUriEscapes(String name, Mark startMark) { 2196 // First, look ahead to see how many URI-escaped characters we should 2197 // expect, so we can use the correct buffer size. 
2198 int length = 1; 2199 while (reader.peek(length * 3) == '%') { 2200 length++; 2201 } 2202 // See the specification for details. 2203 // URIs containing 16 and 32 bit Unicode characters are 2204 // encoded in UTF-8, and then each octet is written as a 2205 // separate character. 2206 Mark beginningMark = reader.getMark(); 2207 ByteBuffer buff = ByteBuffer.allocate(length); 2208 while (reader.peek() == '%') { 2209 reader.forward(); 2210 try { 2211 byte code = (byte) Integer.parseInt(reader.prefix(2), 16); 2212 buff.put(code); 2213 } catch (NumberFormatException nfe) { 2214 throw new ScannerException("while scanning a " + name, startMark, 2215 "expected URI escape sequence of 2 hexadecimal numbers, but found " 2216 + reader.peek() + "(" + ((int) reader.peek()) + ") and " 2217 + reader.peek(1) + "(" + ((int) reader.peek(1)) + ")", 2218 reader.getMark()); 2219 } 2220 reader.forward(2); 2221 } 2222 buff.flip(); 2223 try { 2224 return UriEncoder.decode(buff); 2225 } catch (CharacterCodingException e) { 2226 throw new ScannerException("while scanning a " + name, startMark, 2227 "expected URI in UTF-8: " + e.getMessage(), beginningMark); 2228 } 2229 } 2230 2231 /** 2232 * Scan a line break, transforming: 2233 * 2234 * <pre> 2235 * '\r\n' : '\n' 2236 * '\r' : '\n' 2237 * '\n' : '\n' 2238 * '\x85' : '\n' 2239 * default : '' 2240 * </pre> 2241 */ 2242 private String scanLineBreak() { 2243 // Transforms: 2244 // '\r\n' : '\n' 2245 // '\r' : '\n' 2246 // '\n' : '\n' 2247 // '\x85' : '\n' 2248 // default : '' 2249 char ch = reader.peek(); 2250 if (ch == '\r' || ch == '\n' || ch == '\u0085') { 2251 if (ch == '\r' && '\n' == reader.peek(1)) { 2252 reader.forward(2); 2253 } else { 2254 reader.forward(); 2255 } 2256 return "\n"; 2257 } else if (ch == '\u2028' || ch == '\u2029') { 2258 reader.forward(); 2259 return String.valueOf(ch); 2260 } 2261 return ""; 2262 } 2263 2264 /** 2265 * Chomping the tail may have 3 values - yes, no, not defined. 
* The tri-state is held in a nullable {@link Boolean}; {@code null}
     * presumably stands for "not defined".
     */
    private static class Chomping {
        private final Boolean value;
        private final int increment;

        public Chomping(Boolean value, int increment) {
            this.value = value;
            this.increment = increment;
        }

        /**
         * @return {@code true} unless the value is {@code false}, i.e. also
         *         when the value is {@code null}
         */
        public boolean chompTailIsNotFalse() {
            return !Boolean.FALSE.equals(value);
        }

        /**
         * @return {@code true} only when the value is non-null and
         *         {@code true}
         */
        public boolean chompTailIsTrue() {
            return Boolean.TRUE.equals(value);
        }

        /**
         * @return the increment given to the constructor
         */
        public int getIncrement() {
            return increment;
        }
    }
}