Home | History | Annotate | Download | only in Runtime
      1 package ANTLR::Runtime::Lexer;
      2 
      3 use English qw( -no_match_vars );
      4 use Readonly;
      5 use Carp;
      6 use Switch;
      7 
      8 use ANTLR::Runtime::Token;
      9 use ANTLR::Runtime::CommonToken;
     10 use ANTLR::Runtime::CharStream;
     11 use ANTLR::Runtime::MismatchedTokenException;
     12 
     13 use Moose;
     14 
     15 extends 'ANTLR::Runtime::BaseRecognizer';
     16 with 'ANTLR::Runtime::TokenSource';
     17 
     18 has 'input' => (
     19     is => 'rw',
     20     does => 'ANTLR::Runtime::CharStream',
     21 );
     22 
     23 sub reset {
     24     my ($self) = @_;
     25 
     26     # reset all recognizer state variables
     27     $self->SUPER::reset();
     28 
     29     # wack Lexer state variables
     30     if (defined $self->input) {
     31         # rewind the input
     32         $self->input->seek(0);
     33     }
     34 
     35     if (defined $self->state) {
     36         $self->state->token(undef);
     37         $self->state->type(ANTLR::Runtime::Token->INVALID_TOKEN_TYPE);
     38         $self->state->channel(ANTLR::Runtime::Token->DEFAULT_CHANNEL);
     39         $self->state->token_start_char_index(-1);
     40         $self->state->token_start_char_position_in_line(-1);
     41         $self->state->start_line(-1);
     42         $self->state->text(undef);
     43     }
     44 }
     45 
     46 # Return a token from this source; i.e., match a token on the char
     47 # stream.
     48 sub next_token {
     49     my ($self) = @_;
     50 
     51     while (1) {
     52         $self->state->token(undef);
     53         $self->state->channel(ANTLR::Runtime::Token->DEFAULT_CHANNEL);
     54         $self->state->token_start_char_index($self->input->index());
     55         $self->state->token_start_char_position_in_line($self->input->get_char_position_in_line());
     56         $self->state->token_start_line($self->input->get_line());
     57         $self->state->text(undef);
     58 
     59         if ($self->input->LA(1) eq ANTLR::Runtime::CharStream->EOF) {
     60             return ANTLR::Runtime::Token->EOF_TOKEN;
     61         }
     62 
     63         my $rv;
     64         my $op = '';
     65         eval {
     66             $self->m_tokens();
     67             if (!defined $self->state->token) {
     68                 $self->emit();
     69             }
     70             elsif ($self->state->token == ANTLR::Runtime::Token->SKIP_TOKEN) {
     71                 $op = 'next';
     72                 return;
     73             }
     74             $op = 'return';
     75             $rv = $self->state->token;
     76         };
     77         return $rv if $op eq 'return';
     78         next if $op eq 'next';
     79 
     80         if ($EVAL_ERROR) {
     81             my $exception = $EVAL_ERROR;
     82             if ($exception->isa('ANTLR::Runtime::RecognitionException')) {
     83                 $self->report_error($exception);
     84                 $self->recover($exception);
     85             } else {
     86                 croak $exception;
     87             }
     88         }
     89     }
     90 }
     91 
     92 # Instruct the lexer to skip creating a token for current lexer rule
     93 # and look for another token.  nextToken() knows to keep looking when
     94 # a lexer rule finishes with token set to SKIP_TOKEN.  Recall that
     95 # if token==null at end of any token rule, it creates one for you
     96 # and emits it.
     97 sub skip {
     98     my ($self) = @_;
     99 
    100     $self->state->token(ANTLR::Runtime::Token->SKIP_TOKEN);
    101     return;
    102 }
    103 
    104 # This is the lexer entry point that sets instance var 'token'
    105 sub m_tokens {
    106     croak "Unimplemented";
    107 }
    108 
    109 # Set the char stream and reset the lexer
    110 sub set_char_stream {
    111     my ($self, $input) = @_;
    112 
    113     $self->input(undef);
    114     $self->reset();
    115     $self->input($input);
    116 }
    117 
    118 sub get_char_stream {
    119     my ($self) = @_;
    120     return $self->input;
    121 }
    122 
    123 sub get_source_name {
    124     my ($self) = @_;
    125     return $self->input->get_source_name();
    126 }
    127 
    128 sub emit {
    129     if (@_ == 1) {
    130         my ($self) = @_;
    131 	# The standard method called to automatically emit a token at the
    132 	# outermost lexical rule.  The token object should point into the
    133 	# char buffer start..stop.  If there is a text override in 'text',
    134 	# use that to set the token's text.  Override this method to emit
    135 	# custom Token objects.
    136         my $t = ANTLR::Runtime::CommonToken->new({
    137             input => $self->input,
    138             type  => $self->state->type,
    139             channel => $self->state->channel,
    140             start => $self->state->token_start_char_index,
    141             stop => $self->get_char_index() - 1
    142         });
    143 
    144         $t->set_line($self->state->token_start_line);
    145         $t->set_text($self->state->text);
    146         $t->set_char_position_in_line($self->state->token_start_char_position_in_line);
    147         $self->emit($t);
    148         return $t;
    149     } elsif (@_ == 2) {
    150         my ($self, $token) = @_;
    151 	# Currently does not support multiple emits per nextToken invocation
    152 	# for efficiency reasons.  Subclass and override this method and
    153 	# nextToken (to push tokens into a list and pull from that list rather
    154 	# than a single variable as this implementation does).
    155         $self->state->token($token);
    156     }
    157 }
    158 
    159 sub match {
    160     my ($self, $s) = @_;
    161 
    162     foreach my $c (split //, $s) {
    163         if ($self->input->LA(1) ne $c) {
    164             if ($self->state->backtracking > 0) {
    165                 $self->state->failed(1);
    166                 return;
    167             }
    168             my $mte = ANTLR::Runtime::MismatchedTokenException->new({
    169                 expecting => $c,
    170                 input => $self->input
    171             });
    172             $self->recover($mte);
    173             croak $mte;
    174         }
    175         $self->input->consume();
    176         $self->state->failed(0);
    177     }
    178 }
    179 
    180 sub match_any {
    181     my ($self) = @_;
    182 
    183     $self->input->consume();
    184 }
    185 
    186 sub match_range {
    187     my ($self, $a, $b) = @_;
    188 
    189     if ($self->input->LA(1) lt $a || $self->input->LA(1) gt $b) {
    190         if ($self->state->backtracking > 0) {
    191             $self->state->failed(1);
    192             return;
    193         }
    194 
    195         my $mre = ANTLR::Runtime::MismatchedRangeException($a, $b, $self->input);
    196         $self->recover($mre);
    197         croak $mre;
    198     }
    199 
    200     $self->input->consume();
    201     $self->state->failed(0);
    202 }
    203 
    204 sub get_line {
    205     my ($self) = @_;
    206 
    207     return $self->input->get_line();
    208 }
    209 
    210 sub get_char_position_in_line {
    211     my ($self) = @_;
    212 
    213     return $self->input->get_char_position_in_line();
    214 }
    215 
    216 # What is the index of the current character of lookahead?
    217 sub get_char_index {
    218     my ($self) = @_;
    219 
    220     return $self->input->index();
    221 }
    222 
    223 # Return the text matched so far for the current token or any
    224 # text override.
    225 sub get_text {
    226     my ($self) = @_;
    227 
    228     if (defined $self->state->text) {
    229         return $self->state->text;
    230     }
    231     return $self->input->substring($self->state->token_start_char_index, $self->get_char_index() - 1);
    232 }
    233 
    234 # Set the complete text of this token; it wipes any previous
    235 # changes to the text.
    236 sub set_text {
    237     my ($self, $text) = @_;
    238 
    239     $self->state->text($text);
    240 }
    241 
    242 sub report_error {
    243     Readonly my $usage => 'void report_error(RecognitionException e)';
    244     croak $usage if @_ != 2;
    245     my ($self, $e) = @_;
    246 
    247     $self->display_recognition_error($self->get_token_names(), $e);
    248 }
    249 
    250 sub get_error_message {
    251     my ($self, $e, $token_names) = @_;
    252 
    253     my $msg;
    254     if ($e->isa('ANTLR::Runtime::MismatchedTokenException')) {
    255         $msg = 'mismatched character '
    256           . $self->get_char_error_display($e->get_c())
    257           . ' expecting '
    258           . $self->get_char_error_display($e->expecting);
    259     } elsif ($e->isa('ANTLR::Runtime::NoViableAltException')) {
    260         $msg = 'no viable alternative at character ' . $self->get_char_error_display($e->get_c());
    261     } elsif ($e->isa('ANTLR::Runtime::EarlyExitException')) {
    262         $msg = 'required (...)+ loop did not match anything at character '
    263           . $self->get_char_error_display($e->get_c());
    264     } elsif ($e->isa('ANTLR::Runtime::MismatchedSetException')) {
    265         $msg = 'mismatched character ' . $self->get_char_error_display($e->get_c())
    266           . ' expecting set ' . $e->expecting;
    267     } elsif ($e->isa('ANTLR::Runtime::MismatchedNotSetException')) {
    268         $msg = 'mismatched character ' . $self->get_char_error_display($e->get_c())
    269           . ' expecting set ' . $e->expecting;
    270     } elsif ($e->isa('ANTLR::Runtime::MismatchedRangeException')) {
    271         $msg = 'mismatched character ' . $self->get_char_error_display($e->get_c())
    272           . ' expecting set ' . $self->get_char_error_display($e->a)
    273           . '..' . $self->get_char_error_display($e->b);
    274     } else {
    275         $msg = $self->SUPER::get_error_message($e, $token_names);
    276     }
    277     return $msg;
    278 }
    279 
    280 sub get_char_error_display {
    281     my ($self, $c) = @_;
    282 
    283     my $s;
    284     if ($c eq ANTLR::Runtime::Token->EOF) {
    285         $s = '<EOF>';
    286     } elsif ($c eq "\n") {
    287         $s = '\n';
    288     } elsif ($c eq "\t") {
    289         $s = '\t';
    290     } elsif ($c eq "\r") {
    291         $s = '\r';
    292     } else {
    293         $s = $c;
    294     }
    295 
    296     return "'$s'";
    297 }
    298 
    299 # Lexers can normally match any char in it's vocabulary after matching
    300 # a token, so do the easy thing and just kill a character and hope
    301 # it all works out.  You can instead use the rule invocation stack
    302 # to do sophisticated error recovery if you are in a fragment rule.
    303 sub recover {
    304     my ($self, $re) = @_;
    305 
    306     $self->input->consume();
    307 }
    308 
    309 sub trace_in {
    310     my ($self, $rule_name, $rule_index) = @_;
    311 
    312     my $input_symbol = $self->input->LT(1) . ' line=' . $self->get_line() . ':' . $self->get_char_position_in_line();
    313     $self->SUPER::trace_in($rule_name, $rule_index, $input_symbol);
    314 }
    315 
    316 sub trace_out {
    317     my ($self, $rule_name, $rule_index) = @_;
    318 
    319     my $input_symbol = $self->input->LT(1) . ' line=' . $self->get_line() . ':' . $self->get_char_position_in_line();
    320     $self->SUPER::trace_out($rule_name, $rule_index, $input_symbol);
    321 }
    322 
    323 no Moose;
    324 __PACKAGE__->meta->make_immutable();
    325 1;
    326