/** XML parser by Oliver Zeigermann October 10, 2005 */
// Note: this file is a *lexer* grammar. DOCUMENT is the only non-fragment
// rule; every other rule is a fragment reached from DOCUMENT. The embedded
// actions record what was recognized by appending strings to `lout`.
lexer grammar t012lexerXML;

options {
    language = JavaScript;
}

@lexer::members {
// Collected trace lines, in the order the actions below fire.
this.lout = [];

// Appends one trace line to `lout`. Called from the rule actions.
this.output = function(line) {
    this.lout.push(line);
};
}

// Entry rule: optional XML declaration, optional DOCTYPE, then exactly one
// root ELEMENT, with optional whitespace between and after the parts.
DOCUMENT
    :   XMLDECL? WS? DOCTYPE? WS? ELEMENT WS?
    ;

// <!DOCTYPE name [SYSTEM sysId | PUBLIC pubId sysId] [internal-dtd] >
// Logs the root element name, then the SYSTEM / PUBLIC identifiers and the
// internal DTD subset when they are present.
fragment DOCTYPE
    :
        '<!DOCTYPE' WS rootElementName=GENERIC_ID
        {this.output("ROOTELEMENT: "+$rootElementName.text);}
        WS
        (
            (   'SYSTEM' WS sys1=VALUE
                {this.output("SYSTEM: "+$sys1.text);}

            |   'PUBLIC' WS pub=VALUE WS sys2=VALUE
                {this.output("PUBLIC: "+$pub.text);}
                {this.output("SYSTEM: "+$sys2.text);}
            )
            ( WS )?
        )?
        (   dtd=INTERNAL_DTD
            {this.output("INTERNAL DTD: "+$dtd.text);}
        )?
        '>'
    ;

// Bracketed internal DTD subset. The non-greedy loop stops at the first ']'.
fragment INTERNAL_DTD
    :   '[' (options {greedy=false;} : .)* ']'
    ;

// Processing instruction: '<?' target, then zero or more attributes, '?>'.
// Logs the target name.
fragment PI
    :   '<?' target=GENERIC_ID WS?
        {this.output("PI: "+$target.text);}
        ( ATTRIBUTE WS? )* '?>'
    ;

// XML declaration: '<?xml' (each letter in either case), attributes, '?>'.
fragment XMLDECL
    :   '<?' ('x'|'X') ('m'|'M') ('l'|'L') WS?
        {this.output("XML declaration");}
        ( ATTRIBUTE WS? )* '?>'
    ;

// Either a start tag, any mix of nested elements / character data / CDATA
// sections / comments / processing instructions, and an end tag; or a
// self-closing empty element. Text, CDATA and comment content is logged here;
// tags and PIs are logged by their own rules.
fragment ELEMENT
    :   (   START_TAG
            (   ELEMENT
            |   t=PCDATA
                {this.output("PCDATA: \""+$t.text+"\"");}
            |   t=CDATA
                {this.output("CDATA: \""+$t.text+"\"");}
            |   t=COMMENT
                {this.output("Comment: \""+$t.text+"\"");}
            |   pi=PI
            )*
            END_TAG
        |   EMPTY_ELEMENT
        )
    ;

// '<' name attributes* '>'. The tag name is logged before the attributes,
// so the "Start Tag" line precedes its "Attr" lines.
fragment START_TAG
    :   '<' WS? name=GENERIC_ID WS?
        {this.output("Start Tag: "+$name.text);}
        ( ATTRIBUTE WS? )* '>'
    ;

// '<' name attributes* '/>'. Same shape as START_TAG but self-closing.
fragment EMPTY_ELEMENT
    :   '<' WS? name=GENERIC_ID WS?
        {this.output("Empty Element: "+$name.text);}
        ( ATTRIBUTE WS? )* '/>'
    ;

// name = value. The logged value is the matched VALUE text, which includes
// its surrounding quote characters.
fragment ATTRIBUTE
    :   name=GENERIC_ID WS? '=' WS? value=VALUE
        {this.output("Attr: "+$name.text+"="+$value.text);}
    ;

// '</' name '>'. The name is not checked against the opening tag here.
fragment END_TAG
    :   '</' WS? name=GENERIC_ID WS? '>'
        {this.output("End Tag: "+$name.text);}
    ;

// '<!--' ... '-->'; the non-greedy loop ends at the first '-->'.
fragment COMMENT
    :   '<!--' (options {greedy=false;} : .)* '-->'
    ;

// '<![CDATA[' ... ']]>'; the non-greedy loop ends at the first ']]>'.
fragment CDATA
    :   '<![CDATA[' (options {greedy=false;} : .)* ']]>'
    ;

// Character data: one or more characters other than '<'.
fragment PCDATA
    :   (~'<')+
    ;

// Double- or single-quoted string. No escape sequences are recognized: the
// string ends at the next occurrence of its opening quote character.
fragment VALUE
    :   (   '\"' (~'\"')* '\"'
        |   '\'' (~'\'')* '\''
        )
    ;

// Name: starts with a letter, '_' or ':', followed by any number of
// letters, digits, '.', '-', '_' or ':'.
fragment GENERIC_ID
    :   ( LETTER | '_' | ':' )
        ( options {greedy=true;} : LETTER | '0'..'9' | '.' | '-' | '_' | ':' )*
    ;

// ASCII letters only.
fragment LETTER
    :   'a'..'z'
    |   'A'..'Z'
    ;

// One or more spaces, tabs or line terminators ('\n', '\r\n' or '\r').
fragment WS
    :   (   ' '
        |   '\t'
        |   (   '\n'
            |   '\r\n'
            |   '\r'
            )
        )+
    ;