Home | History | Annotate | Download | only in lexer
      1 #!/usr/bin/ruby
      2 # encoding: utf-8
      3 
      4 require 'antlr3/test/functional'
      5 
      # Functional spec for an inline ANTLR XML lexer grammar (Ruby target): it
      # lexes a sample document and compares the text emitted by the grammar's
      # `say` actions against an expected transcript.
      # NOTE(review): `say` and `output` are presumably provided by
      # ANTLR3::Test::CaptureOutput, which the grammar mixes in -- confirm.
      6 class XMLLexerTest < ANTLR3::Test::Functional
          # The grammar is a raw (single-quoted) heredoc, so Ruby applies no
          # interpolation or escape processing to it. DOCUMENT is the only rule
          # not declared `fragment`; every other rule is referenced from it,
          # directly or indirectly.
      7   inline_grammar( <<-'END' )
      8     lexer grammar XML;
      9     options { language = Ruby; }
     10     
     11     @members {
     12       include ANTLR3::Test::CaptureOutput
     13       include ANTLR3::Test::RaiseErrors
     14       
     15       def quote(text)
     16         text = text.gsub(/\"/, '\\"')
     17         \%("#{ text }")
     18       end
     19     }
     20     
     21     DOCUMENT
     22         :  XMLDECL? WS? DOCTYPE? WS? ELEMENT WS? 
     23         ;
     24     
     25     fragment DOCTYPE
     26         :
     27             '<!DOCTYPE' WS rootElementName=GENERIC_ID 
     28             {say("ROOTELEMENT: " + $rootElementName.text)}
     29             WS
     30             ( 
     31                 ( 'SYSTEM' WS sys1=VALUE
     32             {say("SYSTEM: " + $sys1.text)}
     33                     
     34                 | 'PUBLIC' WS pub=VALUE WS sys2=VALUE
     35                     {say("PUBLIC: " + $pub.text)}
     36                     {say("SYSTEM: " + $sys2.text)}   
     37                 )
     38                 ( WS )?
     39             )?
     40             ( dtd=INTERNAL_DTD
     41                 {say("INTERNAL DTD: " + $dtd.text)}
     42             )?
     43         '>'
     44       ;
     45     
     46     fragment INTERNAL_DTD : '[' (options {greedy=false;} : .)* ']' ;
     47     
     48     fragment PI :
     49             '<?' target=GENERIC_ID WS? 
     50               {say("PI: " + $target.text)}
     51             ( ATTRIBUTE WS? )*  '?>'
     52       ;
     53     
     54     fragment XMLDECL :
     55             '<?' ('x'|'X') ('m'|'M') ('l'|'L') WS? 
     56               {say("XML declaration")}
     57             ( ATTRIBUTE WS? )*  '?>'
     58       ;
     59     
     60     
     61     fragment ELEMENT
     62         : ( START_TAG
     63                 (ELEMENT
     64                 | t=PCDATA
     65                     {say("PCDATA: " << quote($t.text))}
     66                 | t=CDATA
     67                     {say("CDATA: " << quote($t.text))}
     68                 | t=COMMENT
     69                     {say("Comment: " << quote($t.text))}
     70                 | pi=PI
     71                 )*
     72                 END_TAG
     73             | EMPTY_ELEMENT
     74             )
     75         ;
     76     
     77     fragment START_TAG 
     78         : '<' WS? name=GENERIC_ID WS?
     79               {say("Start Tag: " + $name.text)}
     80             ( ATTRIBUTE WS? )* '>'
     81         ;
     82     
     83     fragment EMPTY_ELEMENT 
     84         : '<' WS? name=GENERIC_ID WS?
     85               {say("Empty Element: " + $name.text)}
     86             ( ATTRIBUTE WS? )* '/>'
     87         ;
     88     
     89     fragment ATTRIBUTE 
     90         : name=GENERIC_ID WS? '=' WS? value=VALUE
     91             {say("Attr: " + $name.text + " = "+ $value.text)}
     92         ;
     93     
     94     fragment END_TAG 
     95         : '</' WS? name=GENERIC_ID WS? '>'
     96             {say("End Tag: " + $name.text)}
     97         ;
     98     
     99     fragment COMMENT
    100       :	'<!--' (options {greedy=false;} : .)* '-->'
    101       ;
    102     
    103     fragment CDATA
    104       :	'<![CDATA[' (options {greedy=false;} : .)* ']]>'
    105       ;
    106     
    107     fragment PCDATA : (~'<')+ ; 
    108     
    109     fragment VALUE : 
    110             ( '\"' (~'\"')* '\"'
    111             | '\'' (~'\'')* '\''
    112             )
    113       ;
    114     
    115     fragment GENERIC_ID 
    116         : ( LETTER | '_' | ':') 
    117             ( options {greedy=true;} : LETTER | '0'..'9' | '.' | '-' | '_' | ':' )*
    118       ;
    119     
    120     fragment LETTER
    121       : 'a'..'z' 
    122       | 'A'..'Z'
    123       ;
    124     
    125     fragment WS  :
    126             (   ' '
    127             |   '\t'
    128             |  ( '\n'
    129                 |	'\r\n'
    130                 |	'\r'
    131                 )
    132             )+
    133         ;    
    134   END
    135   
          # End-to-end check over a document containing an XML declaration, a
          # DOCTYPE with an internal subset, attributes, a comment, text, CDATA, a
          # processing instruction, an empty element and a nested element.
    136   it "should be valid" do
            # INTERNAL_DTD matches any characters up to the first ']', so the subset
            # (including the misspelled `<!ELMENT`) is echoed verbatim, not parsed.
    137     lexer = XML::Lexer.new( <<-'END'.fixed_indent( 0 ) )
    138       <?xml version='1.0'?>
    139       <!DOCTYPE component [
    140       <!ELEMENT component (PCDATA|sub)*>
    141       <!ATTLIST component
    142                 attr CDATA #IMPLIED
    143                 attr2 CDATA #IMPLIED
    144       >
    145       <!ELMENT sub EMPTY>
    146       
    147       ]>
    148       <component attr="val'ue" attr2='val"ue'>
    149       <!-- This is a comment -->
    150       Text
    151       <![CDATA[huhu]]>
    152       
    153       &amp;
    154       &lt;
    155       <?xtal cursor='11'?>
    156       <sub/>
    157       <sub></sub>
    158       </component>
    159     END
    160     
            # Enumerate all tokens and discard them; presumably this is what drives
            # the lexer through the input so the grammar actions run -- confirm.
    161     lexer.map { |tk| tk }
    162     
            # The captured output must equal this transcript exactly.
    163     lexer.output.should == <<-'END'.fixed_indent( 0 )
    164       XML declaration
    165       Attr: version = '1.0'
    166       ROOTELEMENT: component
    167       INTERNAL DTD: [
    168       <!ELEMENT component (PCDATA|sub)*>
    169       <!ATTLIST component
    170                 attr CDATA #IMPLIED
    171                 attr2 CDATA #IMPLIED
    172       >
    173       <!ELMENT sub EMPTY>
    174       
    175       ]
    176       Start Tag: component
    177       Attr: attr = "val'ue"
    178       Attr: attr2 = 'val"ue'
    179       PCDATA: "
    180       "
    181       Comment: "<!-- This is a comment -->"
    182       PCDATA: "
    183       Text
    184       "
    185       CDATA: "<![CDATA[huhu]]>"
    186       PCDATA: "
    187       
    188       &amp;
    189       &lt;
    190       "
    191       PI: xtal
    192       Attr: cursor = '11'
    193       PCDATA: "
    194       "
    195       Empty Element: sub
    196       PCDATA: "
    197       "
    198       Start Tag: sub
    199       End Tag: sub
    200       PCDATA: "
    201       "
    202       End Tag: component
    203     END
    204   end
    205 
    206 end
    207