1 /* 2 * JSON lexer 3 * 4 * Copyright IBM, Corp. 2009 5 * 6 * Authors: 7 * Anthony Liguori <aliguori (at) us.ibm.com> 8 * 9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. 10 * See the COPYING.LIB file in the top-level directory. 11 * 12 */ 13 14 #include "qstring.h" 15 #include "qlist.h" 16 #include "qdict.h" 17 #include "qint.h" 18 #include "qemu-common.h" 19 #include "json-lexer.h" 20 21 /* 22 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\" 23 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*' 24 * 0|([1-9][0-9]*(.[0-9]+)?([eE]([-+])?[0-9]+)) 25 * [{}\[\],:] 26 * [a-z]+ 27 * 28 */ 29 30 #undef ERROR 31 32 enum json_lexer_state { 33 ERROR = 0, 34 IN_DONE_STRING, 35 IN_DQ_UCODE3, 36 IN_DQ_UCODE2, 37 IN_DQ_UCODE1, 38 IN_DQ_UCODE0, 39 IN_DQ_STRING_ESCAPE, 40 IN_DQ_STRING, 41 IN_SQ_UCODE3, 42 IN_SQ_UCODE2, 43 IN_SQ_UCODE1, 44 IN_SQ_UCODE0, 45 IN_SQ_STRING_ESCAPE, 46 IN_SQ_STRING, 47 IN_ZERO, 48 IN_DIGITS, 49 IN_DIGIT, 50 IN_EXP_E, 51 IN_MANTISSA, 52 IN_MANTISSA_DIGITS, 53 IN_NONZERO_NUMBER, 54 IN_NEG_NONZERO_NUMBER, 55 IN_KEYWORD, 56 IN_ESCAPE, 57 IN_ESCAPE_L, 58 IN_ESCAPE_LL, 59 IN_ESCAPE_I, 60 IN_ESCAPE_I6, 61 IN_ESCAPE_I64, 62 IN_ESCAPE_DONE, 63 IN_WHITESPACE, 64 IN_OPERATOR_DONE, 65 IN_START, 66 }; 67 68 #define TERMINAL(state) [0 ... 0x7F] = (state) 69 70 static const uint8_t json_lexer[][256] = { 71 [IN_DONE_STRING] = { 72 TERMINAL(JSON_STRING), 73 }, 74 75 /* double quote string */ 76 [IN_DQ_UCODE3] = { 77 ['0' ... '9'] = IN_DQ_STRING, 78 ['a' ... 'f'] = IN_DQ_STRING, 79 ['A' ... 'F'] = IN_DQ_STRING, 80 }, 81 [IN_DQ_UCODE2] = { 82 ['0' ... '9'] = IN_DQ_UCODE3, 83 ['a' ... 'f'] = IN_DQ_UCODE3, 84 ['A' ... 'F'] = IN_DQ_UCODE3, 85 }, 86 [IN_DQ_UCODE1] = { 87 ['0' ... '9'] = IN_DQ_UCODE2, 88 ['a' ... 'f'] = IN_DQ_UCODE2, 89 ['A' ... 'F'] = IN_DQ_UCODE2, 90 }, 91 [IN_DQ_UCODE0] = { 92 ['0' ... '9'] = IN_DQ_UCODE1, 93 ['a' ... 'f'] = IN_DQ_UCODE1, 94 ['A' ... 'F'] = IN_DQ_UCODE1, 95 }, 96 [IN_DQ_STRING_ESCAPE] = { 97 ['b'] = IN_DQ_STRING, 98 ['f'] = IN_DQ_STRING, 99 ['n'] = IN_DQ_STRING, 100 ['r'] = IN_DQ_STRING, 101 ['t'] = IN_DQ_STRING, 102 ['\''] = IN_DQ_STRING, 103 ['\"'] = IN_DQ_STRING, 104 ['u'] = IN_DQ_UCODE0, 105 }, 106 [IN_DQ_STRING] = { 107 [1 ... 0xFF] = IN_DQ_STRING, 108 ['\\'] = IN_DQ_STRING_ESCAPE, 109 ['"'] = IN_DONE_STRING, 110 }, 111 112 /* single quote string */ 113 [IN_SQ_UCODE3] = { 114 ['0' ... '9'] = IN_SQ_STRING, 115 ['a' ... 'f'] = IN_SQ_STRING, 116 ['A' ... 'F'] = IN_SQ_STRING, 117 }, 118 [IN_SQ_UCODE2] = { 119 ['0' ... '9'] = IN_SQ_UCODE3, 120 ['a' ... 'f'] = IN_SQ_UCODE3, 121 ['A' ... 'F'] = IN_SQ_UCODE3, 122 }, 123 [IN_SQ_UCODE1] = { 124 ['0' ... '9'] = IN_SQ_UCODE2, 125 ['a' ... 'f'] = IN_SQ_UCODE2, 126 ['A' ... 'F'] = IN_SQ_UCODE2, 127 }, 128 [IN_SQ_UCODE0] = { 129 ['0' ... '9'] = IN_SQ_UCODE1, 130 ['a' ... 'f'] = IN_SQ_UCODE1, 131 ['A' ... 'F'] = IN_SQ_UCODE1, 132 }, 133 [IN_SQ_STRING_ESCAPE] = { 134 ['b'] = IN_SQ_STRING, 135 ['f'] = IN_SQ_STRING, 136 ['n'] = IN_SQ_STRING, 137 ['r'] = IN_SQ_STRING, 138 ['t'] = IN_SQ_STRING, 139 ['\''] = IN_SQ_STRING, 140 ['\"'] = IN_SQ_STRING, 141 ['u'] = IN_SQ_UCODE0, 142 }, 143 [IN_SQ_STRING] = { 144 [1 ... 0xFF] = IN_SQ_STRING, 145 ['\\'] = IN_SQ_STRING_ESCAPE, 146 ['\''] = IN_DONE_STRING, 147 }, 148 149 /* Zero */ 150 [IN_ZERO] = { 151 TERMINAL(JSON_INTEGER), 152 ['0' ... '9'] = ERROR, 153 ['.'] = IN_MANTISSA, 154 }, 155 156 /* Float */ 157 [IN_DIGITS] = { 158 TERMINAL(JSON_FLOAT), 159 ['0' ... '9'] = IN_DIGITS, 160 }, 161 162 [IN_DIGIT] = { 163 ['0' ... '9'] = IN_DIGITS, 164 }, 165 166 [IN_EXP_E] = { 167 ['-'] = IN_DIGIT, 168 ['+'] = IN_DIGIT, 169 ['0' ... '9'] = IN_DIGITS, 170 }, 171 172 [IN_MANTISSA_DIGITS] = { 173 TERMINAL(JSON_FLOAT), 174 ['0' ... '9'] = IN_MANTISSA_DIGITS, 175 ['e'] = IN_EXP_E, 176 ['E'] = IN_EXP_E, 177 }, 178 179 [IN_MANTISSA] = { 180 ['0' ... '9'] = IN_MANTISSA_DIGITS, 181 }, 182 183 /* Number */ 184 [IN_NONZERO_NUMBER] = { 185 TERMINAL(JSON_INTEGER), 186 ['0' ... '9'] = IN_NONZERO_NUMBER, 187 ['e'] = IN_EXP_E, 188 ['E'] = IN_EXP_E, 189 ['.'] = IN_MANTISSA, 190 }, 191 192 [IN_NEG_NONZERO_NUMBER] = { 193 ['0'] = IN_ZERO, 194 ['1' ... '9'] = IN_NONZERO_NUMBER, 195 }, 196 197 /* keywords */ 198 [IN_KEYWORD] = { 199 TERMINAL(JSON_KEYWORD), 200 ['a' ... 'z'] = IN_KEYWORD, 201 }, 202 203 /* whitespace */ 204 [IN_WHITESPACE] = { 205 TERMINAL(JSON_SKIP), 206 [' '] = IN_WHITESPACE, 207 ['\t'] = IN_WHITESPACE, 208 ['\r'] = IN_WHITESPACE, 209 ['\n'] = IN_WHITESPACE, 210 }, 211 212 /* operator */ 213 [IN_OPERATOR_DONE] = { 214 TERMINAL(JSON_OPERATOR), 215 }, 216 217 /* escape */ 218 [IN_ESCAPE_DONE] = { 219 TERMINAL(JSON_ESCAPE), 220 }, 221 222 [IN_ESCAPE_LL] = { 223 ['d'] = IN_ESCAPE_DONE, 224 }, 225 226 [IN_ESCAPE_L] = { 227 ['d'] = IN_ESCAPE_DONE, 228 ['l'] = IN_ESCAPE_LL, 229 }, 230 231 [IN_ESCAPE_I64] = { 232 ['d'] = IN_ESCAPE_DONE, 233 }, 234 235 [IN_ESCAPE_I6] = { 236 ['4'] = IN_ESCAPE_I64, 237 }, 238 239 [IN_ESCAPE_I] = { 240 ['6'] = IN_ESCAPE_I6, 241 }, 242 243 [IN_ESCAPE] = { 244 ['d'] = IN_ESCAPE_DONE, 245 ['i'] = IN_ESCAPE_DONE, 246 ['p'] = IN_ESCAPE_DONE, 247 ['s'] = IN_ESCAPE_DONE, 248 ['f'] = IN_ESCAPE_DONE, 249 ['l'] = IN_ESCAPE_L, 250 ['I'] = IN_ESCAPE_I, 251 }, 252 253 /* top level rule */ 254 [IN_START] = { 255 ['"'] = IN_DQ_STRING, 256 ['\''] = IN_SQ_STRING, 257 ['0'] = IN_ZERO, 258 ['1' ... '9'] = IN_NONZERO_NUMBER, 259 ['-'] = IN_NEG_NONZERO_NUMBER, 260 ['{'] = IN_OPERATOR_DONE, 261 ['}'] = IN_OPERATOR_DONE, 262 ['['] = IN_OPERATOR_DONE, 263 [']'] = IN_OPERATOR_DONE, 264 [','] = IN_OPERATOR_DONE, 265 [':'] = IN_OPERATOR_DONE, 266 ['a' ... 'z'] = IN_KEYWORD, 267 ['%'] = IN_ESCAPE, 268 [' '] = IN_WHITESPACE, 269 ['\t'] = IN_WHITESPACE, 270 ['\r'] = IN_WHITESPACE, 271 ['\n'] = IN_WHITESPACE, 272 }, 273 }; 274 275 void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func) 276 { 277 lexer->emit = func; 278 lexer->state = IN_START; 279 lexer->token = qstring_new(); 280 } 281 282 static int json_lexer_feed_char(JSONLexer *lexer, char ch) 283 { 284 char buf[2]; 285 286 lexer->x++; 287 if (ch == '\n') { 288 lexer->x = 0; 289 lexer->y++; 290 } 291 292 lexer->state = json_lexer[lexer->state][(uint8_t)ch]; 293 294 switch (lexer->state) { 295 case JSON_OPERATOR: 296 case JSON_ESCAPE: 297 case JSON_INTEGER: 298 case JSON_FLOAT: 299 case JSON_KEYWORD: 300 case JSON_STRING: 301 lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y); 302 case JSON_SKIP: 303 lexer->state = json_lexer[IN_START][(uint8_t)ch]; 304 QDECREF(lexer->token); 305 lexer->token = qstring_new(); 306 break; 307 case ERROR: 308 return -EINVAL; 309 default: 310 break; 311 } 312 313 buf[0] = ch; 314 buf[1] = 0; 315 316 qstring_append(lexer->token, buf); 317 318 return 0; 319 } 320 321 int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size) 322 { 323 size_t i; 324 325 for (i = 0; i < size; i++) { 326 int err; 327 328 err = json_lexer_feed_char(lexer, buffer[i]); 329 if (err < 0) { 330 return err; 331 } 332 } 333 334 return 0; 335 } 336 337 int json_lexer_flush(JSONLexer *lexer) 338 { 339 return json_lexer_feed_char(lexer, 0); 340 } 341 342 void json_lexer_destroy(JSONLexer *lexer) 343 { 344 QDECREF(lexer->token); 345 } 346