1 /* 2 * JSON lexer 3 * 4 * Copyright IBM, Corp. 2009 5 * 6 * Authors: 7 * Anthony Liguori <aliguori (at) us.ibm.com> 8 * 9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. 10 * See the COPYING.LIB file in the top-level directory. 11 * 12 */ 13 14 #include "qstring.h" 15 #include "qlist.h" 16 #include "qdict.h" 17 #include "qint.h" 18 #include "qemu-common.h" 19 #include "json-lexer.h" 20 21 /* 22 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\" 23 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*' 24 * 0|([1-9][0-9]*(.[0-9]+)?([eE]([-+])?[0-9]+)) 25 * [{}\[\],:] 26 * [a-z]+ 27 * 28 */ 29 30 enum json_lexer_state { 31 IN_ERROR = 0, 32 IN_DQ_UCODE3, 33 IN_DQ_UCODE2, 34 IN_DQ_UCODE1, 35 IN_DQ_UCODE0, 36 IN_DQ_STRING_ESCAPE, 37 IN_DQ_STRING, 38 IN_SQ_UCODE3, 39 IN_SQ_UCODE2, 40 IN_SQ_UCODE1, 41 IN_SQ_UCODE0, 42 IN_SQ_STRING_ESCAPE, 43 IN_SQ_STRING, 44 IN_ZERO, 45 IN_DIGITS, 46 IN_DIGIT, 47 IN_EXP_E, 48 IN_MANTISSA, 49 IN_MANTISSA_DIGITS, 50 IN_NONZERO_NUMBER, 51 IN_NEG_NONZERO_NUMBER, 52 IN_KEYWORD, 53 IN_ESCAPE, 54 IN_ESCAPE_L, 55 IN_ESCAPE_LL, 56 IN_ESCAPE_I, 57 IN_ESCAPE_I6, 58 IN_ESCAPE_I64, 59 IN_WHITESPACE, 60 IN_START, 61 }; 62 63 #define TERMINAL(state) [0 ... 0x7F] = (state) 64 65 /* Return whether TERMINAL is a terminal state and the transition to it 66 from OLD_STATE required lookahead. This happens whenever the table 67 below uses the TERMINAL macro. */ 68 #define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \ 69 (json_lexer[(old_state)][0] == (terminal)) 70 71 static const uint8_t json_lexer[][256] = { 72 /* double quote string */ 73 [IN_DQ_UCODE3] = { 74 ['0' ... '9'] = IN_DQ_STRING, 75 ['a' ... 'f'] = IN_DQ_STRING, 76 ['A' ... 'F'] = IN_DQ_STRING, 77 }, 78 [IN_DQ_UCODE2] = { 79 ['0' ... '9'] = IN_DQ_UCODE3, 80 ['a' ... 'f'] = IN_DQ_UCODE3, 81 ['A' ... 'F'] = IN_DQ_UCODE3, 82 }, 83 [IN_DQ_UCODE1] = { 84 ['0' ... '9'] = IN_DQ_UCODE2, 85 ['a' ... 'f'] = IN_DQ_UCODE2, 86 ['A' ... 'F'] = IN_DQ_UCODE2, 87 }, 88 [IN_DQ_UCODE0] = { 89 ['0' ... '9'] = IN_DQ_UCODE1, 90 ['a' ... 'f'] = IN_DQ_UCODE1, 91 ['A' ... 'F'] = IN_DQ_UCODE1, 92 }, 93 [IN_DQ_STRING_ESCAPE] = { 94 ['b'] = IN_DQ_STRING, 95 ['f'] = IN_DQ_STRING, 96 ['n'] = IN_DQ_STRING, 97 ['r'] = IN_DQ_STRING, 98 ['t'] = IN_DQ_STRING, 99 ['/'] = IN_DQ_STRING, 100 ['\\'] = IN_DQ_STRING, 101 ['\''] = IN_DQ_STRING, 102 ['\"'] = IN_DQ_STRING, 103 ['u'] = IN_DQ_UCODE0, 104 }, 105 [IN_DQ_STRING] = { 106 [1 ... 0xFF] = IN_DQ_STRING, 107 ['\\'] = IN_DQ_STRING_ESCAPE, 108 ['"'] = JSON_STRING, 109 }, 110 111 /* single quote string */ 112 [IN_SQ_UCODE3] = { 113 ['0' ... '9'] = IN_SQ_STRING, 114 ['a' ... 'f'] = IN_SQ_STRING, 115 ['A' ... 'F'] = IN_SQ_STRING, 116 }, 117 [IN_SQ_UCODE2] = { 118 ['0' ... '9'] = IN_SQ_UCODE3, 119 ['a' ... 'f'] = IN_SQ_UCODE3, 120 ['A' ... 'F'] = IN_SQ_UCODE3, 121 }, 122 [IN_SQ_UCODE1] = { 123 ['0' ... '9'] = IN_SQ_UCODE2, 124 ['a' ... 'f'] = IN_SQ_UCODE2, 125 ['A' ... 'F'] = IN_SQ_UCODE2, 126 }, 127 [IN_SQ_UCODE0] = { 128 ['0' ... '9'] = IN_SQ_UCODE1, 129 ['a' ... 'f'] = IN_SQ_UCODE1, 130 ['A' ... 'F'] = IN_SQ_UCODE1, 131 }, 132 [IN_SQ_STRING_ESCAPE] = { 133 ['b'] = IN_SQ_STRING, 134 ['f'] = IN_SQ_STRING, 135 ['n'] = IN_SQ_STRING, 136 ['r'] = IN_SQ_STRING, 137 ['t'] = IN_SQ_STRING, 138 ['/'] = IN_DQ_STRING, 139 ['\\'] = IN_DQ_STRING, 140 ['\''] = IN_SQ_STRING, 141 ['\"'] = IN_SQ_STRING, 142 ['u'] = IN_SQ_UCODE0, 143 }, 144 [IN_SQ_STRING] = { 145 [1 ... 0xFF] = IN_SQ_STRING, 146 ['\\'] = IN_SQ_STRING_ESCAPE, 147 ['\''] = JSON_STRING, 148 }, 149 150 /* Zero */ 151 [IN_ZERO] = { 152 TERMINAL(JSON_INTEGER), 153 ['0' ... '9'] = IN_ERROR, 154 ['.'] = IN_MANTISSA, 155 }, 156 157 /* Float */ 158 [IN_DIGITS] = { 159 TERMINAL(JSON_FLOAT), 160 ['0' ... '9'] = IN_DIGITS, 161 }, 162 163 [IN_DIGIT] = { 164 ['0' ... '9'] = IN_DIGITS, 165 }, 166 167 [IN_EXP_E] = { 168 ['-'] = IN_DIGIT, 169 ['+'] = IN_DIGIT, 170 ['0' ... '9'] = IN_DIGITS, 171 }, 172 173 [IN_MANTISSA_DIGITS] = { 174 TERMINAL(JSON_FLOAT), 175 ['0' ... '9'] = IN_MANTISSA_DIGITS, 176 ['e'] = IN_EXP_E, 177 ['E'] = IN_EXP_E, 178 }, 179 180 [IN_MANTISSA] = { 181 ['0' ... '9'] = IN_MANTISSA_DIGITS, 182 }, 183 184 /* Number */ 185 [IN_NONZERO_NUMBER] = { 186 TERMINAL(JSON_INTEGER), 187 ['0' ... '9'] = IN_NONZERO_NUMBER, 188 ['e'] = IN_EXP_E, 189 ['E'] = IN_EXP_E, 190 ['.'] = IN_MANTISSA, 191 }, 192 193 [IN_NEG_NONZERO_NUMBER] = { 194 ['0'] = IN_ZERO, 195 ['1' ... '9'] = IN_NONZERO_NUMBER, 196 }, 197 198 /* keywords */ 199 [IN_KEYWORD] = { 200 TERMINAL(JSON_KEYWORD), 201 ['a' ... 'z'] = IN_KEYWORD, 202 }, 203 204 /* whitespace */ 205 [IN_WHITESPACE] = { 206 TERMINAL(JSON_SKIP), 207 [' '] = IN_WHITESPACE, 208 ['\t'] = IN_WHITESPACE, 209 ['\r'] = IN_WHITESPACE, 210 ['\n'] = IN_WHITESPACE, 211 }, 212 213 /* escape */ 214 [IN_ESCAPE_LL] = { 215 ['d'] = JSON_ESCAPE, 216 }, 217 218 [IN_ESCAPE_L] = { 219 ['d'] = JSON_ESCAPE, 220 ['l'] = IN_ESCAPE_LL, 221 }, 222 223 [IN_ESCAPE_I64] = { 224 ['d'] = JSON_ESCAPE, 225 }, 226 227 [IN_ESCAPE_I6] = { 228 ['4'] = IN_ESCAPE_I64, 229 }, 230 231 [IN_ESCAPE_I] = { 232 ['6'] = IN_ESCAPE_I6, 233 }, 234 235 [IN_ESCAPE] = { 236 ['d'] = JSON_ESCAPE, 237 ['i'] = JSON_ESCAPE, 238 ['p'] = JSON_ESCAPE, 239 ['s'] = JSON_ESCAPE, 240 ['f'] = JSON_ESCAPE, 241 ['l'] = IN_ESCAPE_L, 242 ['I'] = IN_ESCAPE_I, 243 }, 244 245 /* top level rule */ 246 [IN_START] = { 247 ['"'] = IN_DQ_STRING, 248 ['\''] = IN_SQ_STRING, 249 ['0'] = IN_ZERO, 250 ['1' ... '9'] = IN_NONZERO_NUMBER, 251 ['-'] = IN_NEG_NONZERO_NUMBER, 252 ['{'] = JSON_OPERATOR, 253 ['}'] = JSON_OPERATOR, 254 ['['] = JSON_OPERATOR, 255 [']'] = JSON_OPERATOR, 256 [','] = JSON_OPERATOR, 257 [':'] = JSON_OPERATOR, 258 ['a' ... 'z'] = IN_KEYWORD, 259 ['%'] = IN_ESCAPE, 260 [' '] = IN_WHITESPACE, 261 ['\t'] = IN_WHITESPACE, 262 ['\r'] = IN_WHITESPACE, 263 ['\n'] = IN_WHITESPACE, 264 }, 265 }; 266 267 void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func) 268 { 269 lexer->emit = func; 270 lexer->state = IN_START; 271 lexer->token = qstring_new(); 272 lexer->x = lexer->y = 0; 273 } 274 275 static int json_lexer_feed_char(JSONLexer *lexer, char ch) 276 { 277 int char_consumed, new_state; 278 279 lexer->x++; 280 if (ch == '\n') { 281 lexer->x = 0; 282 lexer->y++; 283 } 284 285 do { 286 new_state = json_lexer[lexer->state][(uint8_t)ch]; 287 char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state); 288 if (char_consumed) { 289 qstring_append_chr(lexer->token, ch); 290 } 291 292 switch (new_state) { 293 case JSON_OPERATOR: 294 case JSON_ESCAPE: 295 case JSON_INTEGER: 296 case JSON_FLOAT: 297 case JSON_KEYWORD: 298 case JSON_STRING: 299 lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y); 300 case JSON_SKIP: 301 QDECREF(lexer->token); 302 lexer->token = qstring_new(); 303 new_state = IN_START; 304 break; 305 case IN_ERROR: 306 return -EINVAL; 307 default: 308 break; 309 } 310 lexer->state = new_state; 311 } while (!char_consumed); 312 return 0; 313 } 314 315 int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size) 316 { 317 size_t i; 318 319 for (i = 0; i < size; i++) { 320 int err; 321 322 err = json_lexer_feed_char(lexer, buffer[i]); 323 if (err < 0) { 324 return err; 325 } 326 } 327 328 return 0; 329 } 330 331 int json_lexer_flush(JSONLexer *lexer) 332 { 333 return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0); 334 } 335 336 void json_lexer_destroy(JSONLexer *lexer) 337 { 338 QDECREF(lexer->token); 339 } 340