1 /* 2 * JSON lexer 3 * 4 * Copyright IBM, Corp. 2009 5 * 6 * Authors: 7 * Anthony Liguori <aliguori (at) us.ibm.com> 8 * 9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. 10 * See the COPYING.LIB file in the top-level directory. 11 * 12 */ 13 14 #include "qapi/qmp/qstring.h" 15 #include "qapi/qmp/qlist.h" 16 #include "qapi/qmp/qdict.h" 17 #include "qapi/qmp/qint.h" 18 #include "qemu-common.h" 19 #include "qapi/qmp/json-lexer.h" 20 21 #define MAX_TOKEN_SIZE (64ULL << 20) 22 23 /* 24 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\" 25 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*' 26 * 0|([1-9][0-9]*(.[0-9]+)?([eE]([-+])?[0-9]+)) 27 * [{}\[\],:] 28 * [a-z]+ 29 * 30 */ 31 32 enum json_lexer_state { 33 IN_ERROR = 0, 34 IN_DQ_UCODE3, 35 IN_DQ_UCODE2, 36 IN_DQ_UCODE1, 37 IN_DQ_UCODE0, 38 IN_DQ_STRING_ESCAPE, 39 IN_DQ_STRING, 40 IN_SQ_UCODE3, 41 IN_SQ_UCODE2, 42 IN_SQ_UCODE1, 43 IN_SQ_UCODE0, 44 IN_SQ_STRING_ESCAPE, 45 IN_SQ_STRING, 46 IN_ZERO, 47 IN_DIGITS, 48 IN_DIGIT, 49 IN_EXP_E, 50 IN_MANTISSA, 51 IN_MANTISSA_DIGITS, 52 IN_NONZERO_NUMBER, 53 IN_NEG_NONZERO_NUMBER, 54 IN_KEYWORD, 55 IN_ESCAPE, 56 IN_ESCAPE_L, 57 IN_ESCAPE_LL, 58 IN_ESCAPE_I, 59 IN_ESCAPE_I6, 60 IN_ESCAPE_I64, 61 IN_WHITESPACE, 62 IN_START, 63 }; 64 65 #define TERMINAL(state) [0 ... 0x7F] = (state) 66 67 /* Return whether TERMINAL is a terminal state and the transition to it 68 from OLD_STATE required lookahead. This happens whenever the table 69 below uses the TERMINAL macro. */ 70 #define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \ 71 (json_lexer[(old_state)][0] == (terminal)) 72 73 static const uint8_t json_lexer[][256] = { 74 /* double quote string */ 75 [IN_DQ_UCODE3] = { 76 ['0' ... '9'] = IN_DQ_STRING, 77 ['a' ... 'f'] = IN_DQ_STRING, 78 ['A' ... 'F'] = IN_DQ_STRING, 79 }, 80 [IN_DQ_UCODE2] = { 81 ['0' ... '9'] = IN_DQ_UCODE3, 82 ['a' ... 'f'] = IN_DQ_UCODE3, 83 ['A' ... 'F'] = IN_DQ_UCODE3, 84 }, 85 [IN_DQ_UCODE1] = { 86 ['0' ... '9'] = IN_DQ_UCODE2, 87 ['a' ... 'f'] = IN_DQ_UCODE2, 88 ['A' ... 'F'] = IN_DQ_UCODE2, 89 }, 90 [IN_DQ_UCODE0] = { 91 ['0' ... '9'] = IN_DQ_UCODE1, 92 ['a' ... 'f'] = IN_DQ_UCODE1, 93 ['A' ... 'F'] = IN_DQ_UCODE1, 94 }, 95 [IN_DQ_STRING_ESCAPE] = { 96 ['b'] = IN_DQ_STRING, 97 ['f'] = IN_DQ_STRING, 98 ['n'] = IN_DQ_STRING, 99 ['r'] = IN_DQ_STRING, 100 ['t'] = IN_DQ_STRING, 101 ['/'] = IN_DQ_STRING, 102 ['\\'] = IN_DQ_STRING, 103 ['\''] = IN_DQ_STRING, 104 ['\"'] = IN_DQ_STRING, 105 ['u'] = IN_DQ_UCODE0, 106 }, 107 [IN_DQ_STRING] = { 108 [1 ... 0xBF] = IN_DQ_STRING, 109 [0xC2 ... 0xF4] = IN_DQ_STRING, 110 ['\\'] = IN_DQ_STRING_ESCAPE, 111 ['"'] = JSON_STRING, 112 }, 113 114 /* single quote string */ 115 [IN_SQ_UCODE3] = { 116 ['0' ... '9'] = IN_SQ_STRING, 117 ['a' ... 'f'] = IN_SQ_STRING, 118 ['A' ... 'F'] = IN_SQ_STRING, 119 }, 120 [IN_SQ_UCODE2] = { 121 ['0' ... '9'] = IN_SQ_UCODE3, 122 ['a' ... 'f'] = IN_SQ_UCODE3, 123 ['A' ... 'F'] = IN_SQ_UCODE3, 124 }, 125 [IN_SQ_UCODE1] = { 126 ['0' ... '9'] = IN_SQ_UCODE2, 127 ['a' ... 'f'] = IN_SQ_UCODE2, 128 ['A' ... 'F'] = IN_SQ_UCODE2, 129 }, 130 [IN_SQ_UCODE0] = { 131 ['0' ... '9'] = IN_SQ_UCODE1, 132 ['a' ... 'f'] = IN_SQ_UCODE1, 133 ['A' ... 'F'] = IN_SQ_UCODE1, 134 }, 135 [IN_SQ_STRING_ESCAPE] = { 136 ['b'] = IN_SQ_STRING, 137 ['f'] = IN_SQ_STRING, 138 ['n'] = IN_SQ_STRING, 139 ['r'] = IN_SQ_STRING, 140 ['t'] = IN_SQ_STRING, 141 ['/'] = IN_DQ_STRING, 142 ['\\'] = IN_DQ_STRING, 143 ['\''] = IN_SQ_STRING, 144 ['\"'] = IN_SQ_STRING, 145 ['u'] = IN_SQ_UCODE0, 146 }, 147 [IN_SQ_STRING] = { 148 [1 ... 0xBF] = IN_SQ_STRING, 149 [0xC2 ... 0xF4] = IN_SQ_STRING, 150 ['\\'] = IN_SQ_STRING_ESCAPE, 151 ['\''] = JSON_STRING, 152 }, 153 154 /* Zero */ 155 [IN_ZERO] = { 156 TERMINAL(JSON_INTEGER), 157 ['0' ... '9'] = IN_ERROR, 158 ['.'] = IN_MANTISSA, 159 }, 160 161 /* Float */ 162 [IN_DIGITS] = { 163 TERMINAL(JSON_FLOAT), 164 ['0' ... '9'] = IN_DIGITS, 165 }, 166 167 [IN_DIGIT] = { 168 ['0' ... '9'] = IN_DIGITS, 169 }, 170 171 [IN_EXP_E] = { 172 ['-'] = IN_DIGIT, 173 ['+'] = IN_DIGIT, 174 ['0' ... '9'] = IN_DIGITS, 175 }, 176 177 [IN_MANTISSA_DIGITS] = { 178 TERMINAL(JSON_FLOAT), 179 ['0' ... '9'] = IN_MANTISSA_DIGITS, 180 ['e'] = IN_EXP_E, 181 ['E'] = IN_EXP_E, 182 }, 183 184 [IN_MANTISSA] = { 185 ['0' ... '9'] = IN_MANTISSA_DIGITS, 186 }, 187 188 /* Number */ 189 [IN_NONZERO_NUMBER] = { 190 TERMINAL(JSON_INTEGER), 191 ['0' ... '9'] = IN_NONZERO_NUMBER, 192 ['e'] = IN_EXP_E, 193 ['E'] = IN_EXP_E, 194 ['.'] = IN_MANTISSA, 195 }, 196 197 [IN_NEG_NONZERO_NUMBER] = { 198 ['0'] = IN_ZERO, 199 ['1' ... '9'] = IN_NONZERO_NUMBER, 200 }, 201 202 /* keywords */ 203 [IN_KEYWORD] = { 204 TERMINAL(JSON_KEYWORD), 205 ['a' ... 'z'] = IN_KEYWORD, 206 }, 207 208 /* whitespace */ 209 [IN_WHITESPACE] = { 210 TERMINAL(JSON_SKIP), 211 [' '] = IN_WHITESPACE, 212 ['\t'] = IN_WHITESPACE, 213 ['\r'] = IN_WHITESPACE, 214 ['\n'] = IN_WHITESPACE, 215 }, 216 217 /* escape */ 218 [IN_ESCAPE_LL] = { 219 ['d'] = JSON_ESCAPE, 220 }, 221 222 [IN_ESCAPE_L] = { 223 ['d'] = JSON_ESCAPE, 224 ['l'] = IN_ESCAPE_LL, 225 }, 226 227 [IN_ESCAPE_I64] = { 228 ['d'] = JSON_ESCAPE, 229 }, 230 231 [IN_ESCAPE_I6] = { 232 ['4'] = IN_ESCAPE_I64, 233 }, 234 235 [IN_ESCAPE_I] = { 236 ['6'] = IN_ESCAPE_I6, 237 }, 238 239 [IN_ESCAPE] = { 240 ['d'] = JSON_ESCAPE, 241 ['i'] = JSON_ESCAPE, 242 ['p'] = JSON_ESCAPE, 243 ['s'] = JSON_ESCAPE, 244 ['f'] = JSON_ESCAPE, 245 ['l'] = IN_ESCAPE_L, 246 ['I'] = IN_ESCAPE_I, 247 }, 248 249 /* top level rule */ 250 [IN_START] = { 251 ['"'] = IN_DQ_STRING, 252 ['\''] = IN_SQ_STRING, 253 ['0'] = IN_ZERO, 254 ['1' ... '9'] = IN_NONZERO_NUMBER, 255 ['-'] = IN_NEG_NONZERO_NUMBER, 256 ['{'] = JSON_OPERATOR, 257 ['}'] = JSON_OPERATOR, 258 ['['] = JSON_OPERATOR, 259 [']'] = JSON_OPERATOR, 260 [','] = JSON_OPERATOR, 261 [':'] = JSON_OPERATOR, 262 ['a' ... 'z'] = IN_KEYWORD, 263 ['%'] = IN_ESCAPE, 264 [' '] = IN_WHITESPACE, 265 ['\t'] = IN_WHITESPACE, 266 ['\r'] = IN_WHITESPACE, 267 ['\n'] = IN_WHITESPACE, 268 }, 269 }; 270 271 void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func) 272 { 273 lexer->emit = func; 274 lexer->state = IN_START; 275 lexer->token = qstring_new(); 276 lexer->x = lexer->y = 0; 277 } 278 279 static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush) 280 { 281 int char_consumed, new_state; 282 283 lexer->x++; 284 if (ch == '\n') { 285 lexer->x = 0; 286 lexer->y++; 287 } 288 289 do { 290 new_state = json_lexer[lexer->state][(uint8_t)ch]; 291 char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state); 292 if (char_consumed) { 293 qstring_append_chr(lexer->token, ch); 294 } 295 296 switch (new_state) { 297 case JSON_OPERATOR: 298 case JSON_ESCAPE: 299 case JSON_INTEGER: 300 case JSON_FLOAT: 301 case JSON_KEYWORD: 302 case JSON_STRING: 303 lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y); 304 /* fall through */ 305 case JSON_SKIP: 306 QDECREF(lexer->token); 307 lexer->token = qstring_new(); 308 new_state = IN_START; 309 break; 310 case IN_ERROR: 311 /* XXX: To avoid having previous bad input leaving the parser in an 312 * unresponsive state where we consume unpredictable amounts of 313 * subsequent "good" input, percolate this error state up to the 314 * tokenizer/parser by forcing a NULL object to be emitted, then 315 * reset state. 316 * 317 * Also note that this handling is required for reliable channel 318 * negotiation between QMP and the guest agent, since chr(0xFF) 319 * is placed at the beginning of certain events to ensure proper 320 * delivery when the channel is in an unknown state. chr(0xFF) is 321 * never a valid ASCII/UTF-8 sequence, so this should reliably 322 * induce an error/flush state. 323 */ 324 lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y); 325 QDECREF(lexer->token); 326 lexer->token = qstring_new(); 327 new_state = IN_START; 328 lexer->state = new_state; 329 return 0; 330 default: 331 break; 332 } 333 lexer->state = new_state; 334 } while (!char_consumed && !flush); 335 336 /* Do not let a single token grow to an arbitrarily large size, 337 * this is a security consideration. 338 */ 339 if (lexer->token->length > MAX_TOKEN_SIZE) { 340 lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y); 341 QDECREF(lexer->token); 342 lexer->token = qstring_new(); 343 lexer->state = IN_START; 344 } 345 346 return 0; 347 } 348 349 int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size) 350 { 351 size_t i; 352 353 for (i = 0; i < size; i++) { 354 int err; 355 356 err = json_lexer_feed_char(lexer, buffer[i], false); 357 if (err < 0) { 358 return err; 359 } 360 } 361 362 return 0; 363 } 364 365 int json_lexer_flush(JSONLexer *lexer) 366 { 367 return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true); 368 } 369 370 void json_lexer_destroy(JSONLexer *lexer) 371 { 372 QDECREF(lexer->token); 373 } 374