Home | History | Annotate | Download | only in qobject
      1 /*
      2  * JSON lexer
      3  *
      4  * Copyright IBM, Corp. 2009
      5  *
      6  * Authors:
      7  *  Anthony Liguori   <aliguori (at) us.ibm.com>
      8  *
      9  * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
     10  * See the COPYING.LIB file in the top-level directory.
     11  *
     12  */
     13 
     14 #include "qapi/qmp/qstring.h"
     15 #include "qapi/qmp/qlist.h"
     16 #include "qapi/qmp/qdict.h"
     17 #include "qapi/qmp/qint.h"
     18 #include "qemu-common.h"
     19 #include "qapi/qmp/json-lexer.h"
     20 
/* Upper bound on the length of a single accumulated token: 64 MiB. */
#define MAX_TOKEN_SIZE (64ULL << 20)
     22 
/*
 * Informal summary of the tokens recognized by the state machine below
 * (the transition table is authoritative, e.g. for which raw bytes are
 * accepted inside strings):
 *
 * "([^\\"]|\\["'\\/bfnrt]|\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])*"
 * '([^\\']|\\["'\\/bfnrt]|\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])*'
 * -?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?
 * [{}\[\],:]
 * [a-z]+
 * %(d|i|p|s|f|ld|lld|I64d)
 *
 */
     31 
/*
 * Internal states of the lexer.  Each value is used as a row index into
 * the json_lexer transition table, and the table's entries hold either
 * one of these states or a JSON_* token type (declared outside this
 * file), so the two sets of values must not overlap.
 */
enum json_lexer_state {
    /* Must stay 0: table entries that are not initialized explicitly
     * are zero and therefore mean "error". */
    IN_ERROR = 0,
    /* Double-quoted string: the four hex digits of a \u escape ... */
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    /* ... the character following a backslash, and the string body. */
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,
    /* Single-quoted string: same structure as the double-quoted one. */
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,
    /* Numbers. */
    IN_ZERO,
    IN_DIGITS,
    IN_DIGIT,
    IN_EXP_E,
    IN_MANTISSA,
    IN_MANTISSA_DIGITS,
    IN_NONZERO_NUMBER,
    IN_NEG_NONZERO_NUMBER,
    /* Lower-case keywords. */
    IN_KEYWORD,
    /* '%' escapes (%d, %ld, %lld, %I64d, ...). */
    IN_ESCAPE,
    IN_ESCAPE_L,
    IN_ESCAPE_LL,
    IN_ESCAPE_I,
    IN_ESCAPE_I6,
    IN_ESCAPE_I64,
    /* Whitespace between tokens. */
    IN_WHITESPACE,
    /* Initial state; the lexer returns here after every token. */
    IN_START,
};
     64 
/* Row initializer for the table below: make every byte in 0x00..0x7F
   lead to STATE.  Entries listed after it in the same row replace this
   default for individual characters.  */
#define TERMINAL(state) [0 ... 0x7F] = (state)

/* Return whether TERMINAL is a terminal state and the transition to it
   from OLD_STATE required lookahead.  This happens whenever the table
   below uses the TERMINAL macro.  The check looks at column 0 of
   OLD_STATE's row, which holds the TERMINAL value for rows that use the
   macro and 0 (IN_ERROR) for rows that do not.  */
#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
            (json_lexer[(old_state)][0] == (terminal))
     72 
     73 static const uint8_t json_lexer[][256] =  {
     74     /* double quote string */
     75     [IN_DQ_UCODE3] = {
     76         ['0' ... '9'] = IN_DQ_STRING,
     77         ['a' ... 'f'] = IN_DQ_STRING,
     78         ['A' ... 'F'] = IN_DQ_STRING,
     79     },
     80     [IN_DQ_UCODE2] = {
     81         ['0' ... '9'] = IN_DQ_UCODE3,
     82         ['a' ... 'f'] = IN_DQ_UCODE3,
     83         ['A' ... 'F'] = IN_DQ_UCODE3,
     84     },
     85     [IN_DQ_UCODE1] = {
     86         ['0' ... '9'] = IN_DQ_UCODE2,
     87         ['a' ... 'f'] = IN_DQ_UCODE2,
     88         ['A' ... 'F'] = IN_DQ_UCODE2,
     89     },
     90     [IN_DQ_UCODE0] = {
     91         ['0' ... '9'] = IN_DQ_UCODE1,
     92         ['a' ... 'f'] = IN_DQ_UCODE1,
     93         ['A' ... 'F'] = IN_DQ_UCODE1,
     94     },
     95     [IN_DQ_STRING_ESCAPE] = {
     96         ['b'] = IN_DQ_STRING,
     97         ['f'] =  IN_DQ_STRING,
     98         ['n'] =  IN_DQ_STRING,
     99         ['r'] =  IN_DQ_STRING,
    100         ['t'] =  IN_DQ_STRING,
    101         ['/'] = IN_DQ_STRING,
    102         ['\\'] = IN_DQ_STRING,
    103         ['\''] = IN_DQ_STRING,
    104         ['\"'] = IN_DQ_STRING,
    105         ['u'] = IN_DQ_UCODE0,
    106     },
    107     [IN_DQ_STRING] = {
    108         [1 ... 0xBF] = IN_DQ_STRING,
    109         [0xC2 ... 0xF4] = IN_DQ_STRING,
    110         ['\\'] = IN_DQ_STRING_ESCAPE,
    111         ['"'] = JSON_STRING,
    112     },
    113 
    114     /* single quote string */
    115     [IN_SQ_UCODE3] = {
    116         ['0' ... '9'] = IN_SQ_STRING,
    117         ['a' ... 'f'] = IN_SQ_STRING,
    118         ['A' ... 'F'] = IN_SQ_STRING,
    119     },
    120     [IN_SQ_UCODE2] = {
    121         ['0' ... '9'] = IN_SQ_UCODE3,
    122         ['a' ... 'f'] = IN_SQ_UCODE3,
    123         ['A' ... 'F'] = IN_SQ_UCODE3,
    124     },
    125     [IN_SQ_UCODE1] = {
    126         ['0' ... '9'] = IN_SQ_UCODE2,
    127         ['a' ... 'f'] = IN_SQ_UCODE2,
    128         ['A' ... 'F'] = IN_SQ_UCODE2,
    129     },
    130     [IN_SQ_UCODE0] = {
    131         ['0' ... '9'] = IN_SQ_UCODE1,
    132         ['a' ... 'f'] = IN_SQ_UCODE1,
    133         ['A' ... 'F'] = IN_SQ_UCODE1,
    134     },
    135     [IN_SQ_STRING_ESCAPE] = {
    136         ['b'] = IN_SQ_STRING,
    137         ['f'] =  IN_SQ_STRING,
    138         ['n'] =  IN_SQ_STRING,
    139         ['r'] =  IN_SQ_STRING,
    140         ['t'] =  IN_SQ_STRING,
    141         ['/'] = IN_DQ_STRING,
    142         ['\\'] = IN_DQ_STRING,
    143         ['\''] = IN_SQ_STRING,
    144         ['\"'] = IN_SQ_STRING,
    145         ['u'] = IN_SQ_UCODE0,
    146     },
    147     [IN_SQ_STRING] = {
    148         [1 ... 0xBF] = IN_SQ_STRING,
    149         [0xC2 ... 0xF4] = IN_SQ_STRING,
    150         ['\\'] = IN_SQ_STRING_ESCAPE,
    151         ['\''] = JSON_STRING,
    152     },
    153 
    154     /* Zero */
    155     [IN_ZERO] = {
    156         TERMINAL(JSON_INTEGER),
    157         ['0' ... '9'] = IN_ERROR,
    158         ['.'] = IN_MANTISSA,
    159     },
    160 
    161     /* Float */
    162     [IN_DIGITS] = {
    163         TERMINAL(JSON_FLOAT),
    164         ['0' ... '9'] = IN_DIGITS,
    165     },
    166 
    167     [IN_DIGIT] = {
    168         ['0' ... '9'] = IN_DIGITS,
    169     },
    170 
    171     [IN_EXP_E] = {
    172         ['-'] = IN_DIGIT,
    173         ['+'] = IN_DIGIT,
    174         ['0' ... '9'] = IN_DIGITS,
    175     },
    176 
    177     [IN_MANTISSA_DIGITS] = {
    178         TERMINAL(JSON_FLOAT),
    179         ['0' ... '9'] = IN_MANTISSA_DIGITS,
    180         ['e'] = IN_EXP_E,
    181         ['E'] = IN_EXP_E,
    182     },
    183 
    184     [IN_MANTISSA] = {
    185         ['0' ... '9'] = IN_MANTISSA_DIGITS,
    186     },
    187 
    188     /* Number */
    189     [IN_NONZERO_NUMBER] = {
    190         TERMINAL(JSON_INTEGER),
    191         ['0' ... '9'] = IN_NONZERO_NUMBER,
    192         ['e'] = IN_EXP_E,
    193         ['E'] = IN_EXP_E,
    194         ['.'] = IN_MANTISSA,
    195     },
    196 
    197     [IN_NEG_NONZERO_NUMBER] = {
    198         ['0'] = IN_ZERO,
    199         ['1' ... '9'] = IN_NONZERO_NUMBER,
    200     },
    201 
    202     /* keywords */
    203     [IN_KEYWORD] = {
    204         TERMINAL(JSON_KEYWORD),
    205         ['a' ... 'z'] = IN_KEYWORD,
    206     },
    207 
    208     /* whitespace */
    209     [IN_WHITESPACE] = {
    210         TERMINAL(JSON_SKIP),
    211         [' '] = IN_WHITESPACE,
    212         ['\t'] = IN_WHITESPACE,
    213         ['\r'] = IN_WHITESPACE,
    214         ['\n'] = IN_WHITESPACE,
    215     },
    216 
    217     /* escape */
    218     [IN_ESCAPE_LL] = {
    219         ['d'] = JSON_ESCAPE,
    220     },
    221 
    222     [IN_ESCAPE_L] = {
    223         ['d'] = JSON_ESCAPE,
    224         ['l'] = IN_ESCAPE_LL,
    225     },
    226 
    227     [IN_ESCAPE_I64] = {
    228         ['d'] = JSON_ESCAPE,
    229     },
    230 
    231     [IN_ESCAPE_I6] = {
    232         ['4'] = IN_ESCAPE_I64,
    233     },
    234 
    235     [IN_ESCAPE_I] = {
    236         ['6'] = IN_ESCAPE_I6,
    237     },
    238 
    239     [IN_ESCAPE] = {
    240         ['d'] = JSON_ESCAPE,
    241         ['i'] = JSON_ESCAPE,
    242         ['p'] = JSON_ESCAPE,
    243         ['s'] = JSON_ESCAPE,
    244         ['f'] = JSON_ESCAPE,
    245         ['l'] = IN_ESCAPE_L,
    246         ['I'] = IN_ESCAPE_I,
    247     },
    248 
    249     /* top level rule */
    250     [IN_START] = {
    251         ['"'] = IN_DQ_STRING,
    252         ['\''] = IN_SQ_STRING,
    253         ['0'] = IN_ZERO,
    254         ['1' ... '9'] = IN_NONZERO_NUMBER,
    255         ['-'] = IN_NEG_NONZERO_NUMBER,
    256         ['{'] = JSON_OPERATOR,
    257         ['}'] = JSON_OPERATOR,
    258         ['['] = JSON_OPERATOR,
    259         [']'] = JSON_OPERATOR,
    260         [','] = JSON_OPERATOR,
    261         [':'] = JSON_OPERATOR,
    262         ['a' ... 'z'] = IN_KEYWORD,
    263         ['%'] = IN_ESCAPE,
    264         [' '] = IN_WHITESPACE,
    265         ['\t'] = IN_WHITESPACE,
    266         ['\r'] = IN_WHITESPACE,
    267         ['\n'] = IN_WHITESPACE,
    268     },
    269 };
    270 
    271 void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
    272 {
    273     lexer->emit = func;
    274     lexer->state = IN_START;
    275     lexer->token = qstring_new();
    276     lexer->x = lexer->y = 0;
    277 }
    278 
    279 static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
    280 {
    281     int char_consumed, new_state;
    282 
    283     lexer->x++;
    284     if (ch == '\n') {
    285         lexer->x = 0;
    286         lexer->y++;
    287     }
    288 
    289     do {
    290         new_state = json_lexer[lexer->state][(uint8_t)ch];
    291         char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
    292         if (char_consumed) {
    293             qstring_append_chr(lexer->token, ch);
    294         }
    295 
    296         switch (new_state) {
    297         case JSON_OPERATOR:
    298         case JSON_ESCAPE:
    299         case JSON_INTEGER:
    300         case JSON_FLOAT:
    301         case JSON_KEYWORD:
    302         case JSON_STRING:
    303             lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
    304             /* fall through */
    305         case JSON_SKIP:
    306             QDECREF(lexer->token);
    307             lexer->token = qstring_new();
    308             new_state = IN_START;
    309             break;
    310         case IN_ERROR:
    311             /* XXX: To avoid having previous bad input leaving the parser in an
    312              * unresponsive state where we consume unpredictable amounts of
    313              * subsequent "good" input, percolate this error state up to the
    314              * tokenizer/parser by forcing a NULL object to be emitted, then
    315              * reset state.
    316              *
    317              * Also note that this handling is required for reliable channel
    318              * negotiation between QMP and the guest agent, since chr(0xFF)
    319              * is placed at the beginning of certain events to ensure proper
    320              * delivery when the channel is in an unknown state. chr(0xFF) is
    321              * never a valid ASCII/UTF-8 sequence, so this should reliably
    322              * induce an error/flush state.
    323              */
    324             lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y);
    325             QDECREF(lexer->token);
    326             lexer->token = qstring_new();
    327             new_state = IN_START;
    328             lexer->state = new_state;
    329             return 0;
    330         default:
    331             break;
    332         }
    333         lexer->state = new_state;
    334     } while (!char_consumed && !flush);
    335 
    336     /* Do not let a single token grow to an arbitrarily large size,
    337      * this is a security consideration.
    338      */
    339     if (lexer->token->length > MAX_TOKEN_SIZE) {
    340         lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
    341         QDECREF(lexer->token);
    342         lexer->token = qstring_new();
    343         lexer->state = IN_START;
    344     }
    345 
    346     return 0;
    347 }
    348 
    349 int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
    350 {
    351     size_t i;
    352 
    353     for (i = 0; i < size; i++) {
    354         int err;
    355 
    356         err = json_lexer_feed_char(lexer, buffer[i], false);
    357         if (err < 0) {
    358             return err;
    359         }
    360     }
    361 
    362     return 0;
    363 }
    364 
    365 int json_lexer_flush(JSONLexer *lexer)
    366 {
    367     return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true);
    368 }
    369 
/*
 * Release the token buffer held by LEXER by dropping the lexer's
 * reference to it.  The JSONLexer structure itself is not freed here.
 */
void json_lexer_destroy(JSONLexer *lexer)
{
    QDECREF(lexer->token);
}
    374