1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "net/tools/flip_server/balsa_frame.h" 6 7 #include <assert.h> 8 #include <emmintrin.h> 9 #include <strings.h> 10 11 #include <limits> 12 #include <iostream> 13 #include <string> 14 #include <utility> 15 #include <vector> 16 17 #include "base/logging.h" 18 #include "base/port.h" 19 #include "base/string_piece.h" 20 #include "net/tools/flip_server/balsa_enums.h" 21 #include "net/tools/flip_server/balsa_headers.h" 22 #include "net/tools/flip_server/balsa_visitor_interface.h" 23 #include "net/tools/flip_server/buffer_interface.h" 24 #include "net/tools/flip_server/simple_buffer.h" 25 #include "net/tools/flip_server/split.h" 26 #include "net/tools/flip_server/string_piece_utils.h" 27 28 namespace net { 29 30 // Constants holding some header names for headers which can affect the way the 31 // HTTP message is framed, and so must be processed specially: 32 static const char kContentLength[] = "content-length"; 33 static const size_t kContentLengthSize = sizeof(kContentLength) - 1; 34 static const char kTransferEncoding[] = "transfer-encoding"; 35 static const size_t kTransferEncodingSize = sizeof(kTransferEncoding) - 1; 36 37 void BalsaFrame::Reset() { 38 last_char_was_slash_r_ = false; 39 saw_non_newline_char_ = false; 40 start_was_space_ = true; 41 chunk_length_character_extracted_ = false; 42 // is_request_ = true; // not reset between messages. 43 // request_was_head_ = false; // not reset between messages. 44 // max_header_length_ = 4096; // not reset between messages. 45 // max_request_uri_length_ = 2048; // not reset between messages. 46 // visitor_ = &do_nothing_visitor_; // not reset between messages. 
47 chunk_length_remaining_ = 0; 48 content_length_remaining_ = 0; 49 last_slash_n_loc_ = NULL; 50 last_recorded_slash_n_loc_ = NULL; 51 last_slash_n_idx_ = 0; 52 term_chars_ = 0; 53 parse_state_ = BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE; 54 last_error_ = BalsaFrameEnums::NO_ERROR; 55 lines_.clear(); 56 if (headers_ != NULL) { 57 headers_->Clear(); 58 } 59 } 60 61 const char* BalsaFrameEnums::ParseStateToString( 62 BalsaFrameEnums::ParseState error_code) { 63 switch (error_code) { 64 case ERROR: 65 return "ERROR"; 66 case READING_HEADER_AND_FIRSTLINE: 67 return "READING_HEADER_AND_FIRSTLINE"; 68 case READING_CHUNK_LENGTH: 69 return "READING_CHUNK_LENGTH"; 70 case READING_CHUNK_EXTENSION: 71 return "READING_CHUNK_EXTENSION"; 72 case READING_CHUNK_DATA: 73 return "READING_CHUNK_DATA"; 74 case READING_CHUNK_TERM: 75 return "READING_CHUNK_TERM"; 76 case READING_LAST_CHUNK_TERM: 77 return "READING_LAST_CHUNK_TERM"; 78 case READING_TRAILER: 79 return "READING_TRAILER"; 80 case READING_UNTIL_CLOSE: 81 return "READING_UNTIL_CLOSE"; 82 case READING_CONTENT: 83 return "READING_CONTENT"; 84 case MESSAGE_FULLY_READ: 85 return "MESSAGE_FULLY_READ"; 86 case NUM_STATES: 87 return "UNKNOWN_STATE"; 88 } 89 return "UNKNOWN_STATE"; 90 } 91 92 const char* BalsaFrameEnums::ErrorCodeToString( 93 BalsaFrameEnums::ErrorCode error_code) { 94 switch (error_code) { 95 case NO_ERROR: 96 return "NO_ERROR"; 97 case NO_STATUS_LINE_IN_RESPONSE: 98 return "NO_STATUS_LINE_IN_RESPONSE"; 99 case NO_REQUEST_LINE_IN_REQUEST: 100 return "NO_REQUEST_LINE_IN_REQUEST"; 101 case FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION: 102 return "FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION"; 103 case FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD: 104 return "FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD"; 105 case FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE: 106 return "FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE"; 107 case FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI: 108 return "FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI"; 109 case 
FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE: 110 return "FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE"; 111 case FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION: 112 return "FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION"; 113 case FAILED_CONVERTING_STATUS_CODE_TO_INT: 114 return "FAILED_CONVERTING_STATUS_CODE_TO_INT"; 115 case REQUEST_URI_TOO_LONG: 116 return "REQUEST_URI_TOO_LONG"; 117 case HEADERS_TOO_LONG: 118 return "HEADERS_TOO_LONG"; 119 case UNPARSABLE_CONTENT_LENGTH: 120 return "UNPARSABLE_CONTENT_LENGTH"; 121 case MAYBE_BODY_BUT_NO_CONTENT_LENGTH: 122 return "MAYBE_BODY_BUT_NO_CONTENT_LENGTH"; 123 case REQUIRED_BODY_BUT_NO_CONTENT_LENGTH: 124 return "REQUIRED_BODY_BUT_NO_CONTENT_LENGTH"; 125 case HEADER_MISSING_COLON: 126 return "HEADER_MISSING_COLON"; 127 case INVALID_CHUNK_LENGTH: 128 return "INVALID_CHUNK_LENGTH"; 129 case CHUNK_LENGTH_OVERFLOW: 130 return "CHUNK_LENGTH_OVERFLOW"; 131 case CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO: 132 return "CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO"; 133 case CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT: 134 return "CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT"; 135 case MULTIPLE_CONTENT_LENGTH_KEYS: 136 return "MULTIPLE_CONTENT_LENGTH_KEYS"; 137 case MULTIPLE_TRANSFER_ENCODING_KEYS: 138 return "MULTIPLE_TRANSFER_ENCODING_KEYS"; 139 case UNKNOWN_TRANSFER_ENCODING: 140 return "UNKNOWN_TRANSFER_ENCODING"; 141 case INVALID_HEADER_FORMAT: 142 return "INVALID_HEADER_FORMAT"; 143 case INTERNAL_LOGIC_ERROR: 144 return "INTERNAL_LOGIC_ERROR"; 145 case NUM_ERROR_CODES: 146 return "UNKNOWN_ERROR"; 147 } 148 return "UNKNOWN_ERROR"; 149 } 150 151 // Summary: 152 // Parses the first line of either a request or response. 153 // Note that in the case of a detected warning, error_code will be set 154 // but the function will not return false. 155 // Exactly zero or one warning or error (but not both) may be detected 156 // by this function. 
//     Note that this function will not write the data of the first-line
//   into the header's buffer (that should already have been done elsewhere).
//
// Pre-conditions:
//     begin != end
//     *begin should be a character which is > ' '. This implies that there
//   is at least one non-whitespace characters between [begin, end).
//     headers is a valid pointer to a BalsaHeaders class.
//     error_code is a valid pointer to a BalsaFrameEnums::ErrorCode value.
//     Entire first line must exist between [begin, end)
//     Exactly zero or one newlines -may- exist between [begin, end)
//     [begin, end) should exist in the header's buffer.
//
// Parameters (beyond those covered by the pre-conditions):
//     is_request selects request parsing (method / request-URI / version) or
//   response parsing (version / status code / reason phrase).
//     max_request_uri_length is only consulted for requests: a request-URI
//   longer than this yields REQUEST_URI_TOO_LONG and a false return.
//
// Side-effects:
//     headers will be modified (the whitespace_N_idx_ / non_whitespace_N_idx_
//   offsets, all relative to begin, and for responses parsed_response_code_)
//     error_code may be modified if either a warning or error is detected
//
// Returns:
//     True if no error (as opposed to warning) is detected.
//     False if an error (as opposed to warning) is detected.

//
// If there is indeed non-whitespace in the line, then the following
// will take care of this for you:
//  while (*begin <= ' ') ++begin;
//  ParseHTTPFirstLine(begin, end, is_request, max_request_uri_length,
//                     &headers, &error_code);
//
bool ParseHTTPFirstLine(const char* begin,
                        const char* end,
                        bool is_request,
                        size_t max_request_uri_length,
                        BalsaHeaders* headers,
                        BalsaFrameEnums::ErrorCode* error_code) {
  const char* current = begin;
  // HTTP firstlines all have the following structure:
  //  LWS         NONWS  LWS    NONWS   LWS    NONWS   NOTCRLF  CRLF
  //  [\t \r\n]+ [^\t ]+ [\t ]+ [^\t ]+ [\t ]+ [^\t ]+ [^\r\n]+ "\r\n"
  //  ws1        nws1    ws2    nws2    ws3    nws3             ws4
  //  |          [-------)      [-------)      [----------------)
  //    REQ:     method         request_uri    version
  //   RESP:     version        statuscode     reason
  //
  //   The first NONWS->LWS component we'll call firstline_a.
  //   The second firstline_b, and the third firstline_c.
  //
  //   firstline_a goes from nws1 to (but not including) ws2
  //   firstline_b goes from nws2 to (but not including) ws3
  //   firstline_c goes from nws3 to (but not including) ws4
  //
  // In the code:
  //    ws1 == whitespace_1_idx_
  //   nws1 == non_whitespace_1_idx_
  //    ws2 == whitespace_2_idx_
  //   nws2 == non_whitespace_2_idx_
  //    ws3 == whitespace_3_idx_
  //   nws3 == non_whitespace_3_idx_
  //    ws4 == whitespace_4_idx_

  // Kill all whitespace (including '\r\n') at the end of the line.
  // The caller must hand us a line whose last character is '\n'; anything
  // else is reported as an internal logic error.
  --end;
  if (*end != '\n') {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }
  while (begin < end && *end <= ' ') {
    --end;
  }
  // If the scan walked all the way back to begin and still sits on '\n', the
  // "non-whitespace first character" pre-condition was violated.
  DCHECK(*end != '\n');
  if (*end == '\n') {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }
  // end now points one past the last non-whitespace character of the line.
  ++end;

  // The following condition should not be possible.
  if (end == begin) {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }

  // whitespace_1_idx_
  headers->whitespace_1_idx_ = current - begin;
  // This loop is commented out as it is never used in current code.  This is
  // true only because we don't begin parsing the headers at all until we've
  // encountered a non whitespace character at the beginning of the stream, at
  // which point we begin our demarcation of header-start.  If we did -not- do
  // this (for instance, only looked for [\r\n] instead of (< ' ')), this loop
  // would be necessary for the proper functioning of this parsing.
  // This is left here as this function may (in the future) be refactored out
  // of the BalsaFrame class so that it may be shared between code in
  // BalsaFrame and BalsaHeaders (where it would be used in some variant of the
  // set_first_line() function (at which point it would be necessary).
#if 0
  while (*current <= ' ') {
    ++current;
  }
#endif
  // non_whitespace_1_idx_
  headers->non_whitespace_1_idx_ = current - begin;
  do {
    // The first time through, we're guaranteed that the current character
    // won't be a whitespace (else the loop above wouldn't have terminated).
    // That implies that we're guaranteed to get at least one non-whitespace
    // character if we get into this loop at all.
    ++current;
    if (current == end) {
      // The line consists of a single token: every remaining index collapses
      // onto the end of that token.
      headers->whitespace_2_idx_ = current - begin;
      headers->non_whitespace_2_idx_ = current - begin;
      headers->whitespace_3_idx_ = current - begin;
      headers->non_whitespace_3_idx_ = current - begin;
      headers->whitespace_4_idx_ = current - begin;
      // FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD   for request
      // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION for response
      // NOTE(review): adding is_request (0 or 1) assumes the request
      // enumerator immediately follows the response one in balsa_enums.h --
      // confirm against the enum definition.
      *error_code =
        static_cast<BalsaFrameEnums::ErrorCode>(
            BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION +
            is_request);
      if (!is_request) {  // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION
        return false;
      }
      goto output_exhausted;
    }
  } while (*current > ' ');
  // whitespace_2_idx_
  headers->whitespace_2_idx_ = current - begin;
  do {
    ++current;
    // Note that due to the loop which consumes all of the whitespace
    // at the end of the line, current can never == end while in this function.
  } while (*current <= ' ');
  // non_whitespace_2_idx_
  headers->non_whitespace_2_idx_ = current - begin;
  do {
    ++current;
    if (current == end) {
      // Only two tokens were present; the third field is empty.
      headers->whitespace_3_idx_ = current - begin;
      headers->non_whitespace_3_idx_ = current - begin;
      headers->whitespace_4_idx_ = current - begin;
      // FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI for request
      // FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE for response
      // This is recorded as a warning: parsing continues at output_exhausted.
      *error_code =
        static_cast<BalsaFrameEnums::ErrorCode>(
            BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE
            + is_request);
      goto output_exhausted;
    }
  } while (*current > ' ');
  // whitespace_3_idx_
  headers->whitespace_3_idx_ = current - begin;
  do {
    ++current;
    // Note that due to the loop which consumes all of the whitespace
    // at the end of the line, current can never == end while in this function.
  } while (*current <= ' ');
  // non_whitespace_3_idx_
  headers->non_whitespace_3_idx_ = current - begin;
  // The third field runs to the (trimmed) end of the line, embedded
  // whitespace included.
  headers->whitespace_4_idx_ = end - begin;

 output_exhausted:
  // Note that we don't fail the parse immediately when parsing of the
  // firstline fails.  Depending on the protocol type, we may want to accept
  // a firstline with only one or two elements, e.g., for HTTP/0.9:
  //   GET\r\n
  // or
  //   GET /\r\n
  // should be parsed without issue (though the visitor should know that
  // parsing the entire line was not exactly as it should be).
  //
  // Eventually, these errors may be removed altogether, as the visitor can
  // detect them on its own by examining the size of the various fields.
  // headers->set_first_line(non_whitespace_1_idx_, current);

  if (is_request) {
    // Second field of a request is the request-URI; enforce the length cap.
    if ((headers->whitespace_3_idx_ - headers->non_whitespace_2_idx_) >
        max_request_uri_length) {
      // For requests, we need at least the method.  We could assume that a
      // blank URI means "/".  If version isn't stated, it should be assumed
      // to be HTTP/0.9 by the visitor.
      *error_code = BalsaFrameEnums::REQUEST_URI_TOO_LONG;
      return false;
    }
  } else {
    // Second field of a response is the status code; convert it to a number.
    // An empty field leaves parsed_response_code_ at 0.
    headers->parsed_response_code_ = 0;
    {
      const char* parsed_response_code_current =
        begin + headers->non_whitespace_2_idx_;
      const char* parsed_response_code_end = begin + headers->whitespace_3_idx_;
      const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;

      // Convert a string of [0-9]* into an int.
      // Note that this allows for the conversion of response codes which
      // are outside the bounds of normal HTTP response codes (no checking
      // is done to ensure that these are valid-- they're merely parsed)!
      while (parsed_response_code_current < parsed_response_code_end) {
        if (*parsed_response_code_current < '0' ||
            *parsed_response_code_current > '9') {
          *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
          return false;
        }
        size_t status_code_x_10 = headers->parsed_response_code_ * 10;
        uint8 c = *parsed_response_code_current - '0';
        // Reject the digit if either the multiply-by-ten or the addition of
        // the digit would exceed the size_t range.
        if ((headers->parsed_response_code_ > kMaxDiv10) ||
            (std::numeric_limits<size_t>::max() - status_code_x_10) < c) {
          // overflow.
          *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
          return false;
        }
        headers->parsed_response_code_ = status_code_x_10 + c;
        ++parsed_response_code_current;
      }
    }
  }
  return true;
}

// begin - beginning of the firstline
// end - end of the firstline
//
// A precondition for this function is that there is non-whitespace between
// [begin, end). If this precondition is not met, the function will not perform
// as expected (and bad things may happen, and it will eat your first, second,
// and third unborn children!).
//
// Another precondition for this function is that [begin, end) includes
// at most one newline, which must be at the end of the line.
388 void BalsaFrame::ProcessFirstLine(const char* begin, const char* end) { 389 BalsaFrameEnums::ErrorCode previous_error = last_error_; 390 if (!ParseHTTPFirstLine(begin, 391 end, 392 is_request_, 393 max_request_uri_length_, 394 headers_, 395 &last_error_)) { 396 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 397 visitor_->HandleHeaderError(this); 398 return; 399 } 400 if (previous_error != last_error_) { 401 visitor_->HandleHeaderWarning(this); 402 } 403 404 if (is_request_) { 405 int version_length = 406 headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_; 407 visitor_->ProcessRequestFirstLine( 408 begin + headers_->non_whitespace_1_idx_, 409 headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_, 410 begin + headers_->non_whitespace_1_idx_, 411 headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_, 412 begin + headers_->non_whitespace_2_idx_, 413 headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_, 414 begin + headers_->non_whitespace_3_idx_, 415 version_length); 416 if (version_length == 0) 417 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 418 } else { 419 visitor_->ProcessResponseFirstLine( 420 begin + headers_->non_whitespace_1_idx_, 421 headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_, 422 begin + headers_->non_whitespace_1_idx_, 423 headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_, 424 begin + headers_->non_whitespace_2_idx_, 425 headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_, 426 begin + headers_->non_whitespace_3_idx_, 427 headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_); 428 } 429 } 430 431 // 'stream_begin' points to the first character of the headers buffer. 432 // 'line_begin' points to the first character of the line. 433 // 'current' points to a char which is ':'. 434 // 'line_end' points to the position of '\n' + 1. 435 // 'line_begin' points to the position of first character of line. 
436 void BalsaFrame::CleanUpKeyValueWhitespace( 437 const char* stream_begin, 438 const char* line_begin, 439 const char* current, 440 const char* line_end, 441 HeaderLineDescription* current_header_line) { 442 const char* colon_loc = current; 443 DCHECK_LT(colon_loc, line_end); 444 DCHECK_EQ(':', *colon_loc); 445 DCHECK_EQ(':', *current); 446 DCHECK_GE(' ', *line_end) 447 << "\"" << std::string(line_begin, line_end) << "\""; 448 449 // TODO(fenix): Investigate whether or not the bounds tests in the 450 // while loops here are redundant, and if so, remove them. 451 --current; 452 while (current > line_begin && *current <= ' ') --current; 453 current += (current != colon_loc); 454 current_header_line->key_end_idx = current - stream_begin; 455 456 current = colon_loc; 457 DCHECK_EQ(':', *current); 458 ++current; 459 while (current < line_end && *current <= ' ') ++current; 460 current_header_line->value_begin_idx = current - stream_begin; 461 462 DCHECK_GE(current_header_line->key_end_idx, 463 current_header_line->first_char_idx); 464 DCHECK_GE(current_header_line->value_begin_idx, 465 current_header_line->key_end_idx); 466 DCHECK_GE(current_header_line->last_char_idx, 467 current_header_line->value_begin_idx); 468 } 469 470 inline void BalsaFrame::FindColonsAndParseIntoKeyValue() { 471 DCHECK(!lines_.empty()); 472 const char* stream_begin = headers_->OriginalHeaderStreamBegin(); 473 // The last line is always just a newline (and is uninteresting). 474 const Lines::size_type lines_size_m1 = lines_.size() - 1; 475 #if __SSE2__ 476 const __v16qi colons = { ':', ':', ':', ':', ':', ':', ':', ':', 477 ':', ':', ':', ':', ':', ':', ':', ':'}; 478 const char* header_lines_end_m16 = headers_->OriginalHeaderStreamEnd() - 16; 479 #endif // __SSE2__ 480 const char* current = stream_begin + lines_[1].first; 481 // This code is a bit more subtle than it may appear at first glance. 482 // This code looks for a colon in the current line... 
but it also looks 483 // beyond the current line. If there is no colon in the current line, then 484 // for each subsequent line (until the colon which -has- been found is 485 // associated with a line), no searching for a colon will be performed. In 486 // this way, we minimize the amount of bytes we have scanned for a colon. 487 for (Lines::size_type i = 1; i < lines_size_m1;) { 488 const char* line_begin = stream_begin + lines_[i].first; 489 490 // Here we handle possible continuations. Note that we do not replace 491 // the '\n' in the line before a continuation (at least, as of now), 492 // which implies that any code which looks for a value must deal with 493 // "\r\n", etc -within- the line (and not just at the end of it). 494 for (++i; i < lines_size_m1; ++i) { 495 const char c = *(stream_begin + lines_[i].first); 496 if (c > ' ') { 497 // Not a continuation, so stop. Note that if the 'original' i = 1, 498 // and the next line is not a continuation, we'll end up with i = 2 499 // when we break. This handles the incrementing of i for the outer 500 // loop. 501 break; 502 } 503 } 504 const char* line_end = stream_begin + lines_[i - 1].second; 505 DCHECK_LT(line_begin - stream_begin, line_end - stream_begin); 506 507 // We cleanup the whitespace at the end of the line before doing anything 508 // else of interest as it allows us to do nothing when irregularly formatted 509 // headers are parsed (e.g. those with only keys, only values, or no colon). 510 // 511 // We're guaranteed to have *line_end > ' ' while line_end >= line_begin. 
512 --line_end; 513 DCHECK_EQ('\n', *line_end) 514 << "\"" << std::string(line_begin, line_end) << "\""; 515 while (*line_end <= ' ' && line_end > line_begin) { 516 --line_end; 517 } 518 ++line_end; 519 DCHECK_GE(' ', *line_end); 520 DCHECK_LT(line_begin, line_end); 521 522 // We use '0' for the block idx, because we're always writing to the first 523 // block from the framer (we do this because the framer requires that the 524 // entire header sequence be in a contiguous buffer). 525 headers_->header_lines_.push_back( 526 HeaderLineDescription(line_begin - stream_begin, 527 line_end - stream_begin, 528 line_end - stream_begin, 529 line_end - stream_begin, 530 0)); 531 if (current >= line_end) { 532 last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON; 533 visitor_->HandleHeaderWarning(this); 534 // Then the next colon will not be found within this header line-- time 535 // to try again with another header-line. 536 continue; 537 } else if (current < line_begin) { 538 // When this condition is true, the last detected colon was part of a 539 // previous line. We reset to the beginning of the line as we don't care 540 // about the presence of any colon before the beginning of the current 541 // line. 542 current = line_begin; 543 } 544 #if __SSE2__ 545 while (current < header_lines_end_m16) { 546 __m128i header_bytes = 547 _mm_loadu_si128(reinterpret_cast<const __m128i *>(current)); 548 __m128i colon_cmp = 549 _mm_cmpeq_epi8(header_bytes, reinterpret_cast<__m128i>(colons)); 550 int colon_msk = _mm_movemask_epi8(colon_cmp); 551 if (colon_msk == 0) { 552 current += 16; 553 continue; 554 } 555 current += (ffs(colon_msk) - 1); 556 if (current > line_end) { 557 break; 558 } 559 goto found_colon; 560 } 561 #endif // __SSE2__ 562 for (; current < line_end; ++current) { 563 if (*current != ':') { 564 continue; 565 } 566 goto found_colon; 567 } 568 // If we've gotten to here, then there was no colon 569 // in the line. 
The arguments we passed into the construction 570 // for the HeaderLineDescription object should be OK-- it assumes 571 // that the entire content is 'key' by default (which is true, as 572 // there was no colon, there can be no value). Note that this is a 573 // construct which is technically not allowed by the spec. 574 last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON; 575 visitor_->HandleHeaderWarning(this); 576 continue; 577 found_colon: 578 DCHECK_EQ(*current, ':'); 579 DCHECK_LE(current - stream_begin, line_end - stream_begin); 580 DCHECK_LE(stream_begin - stream_begin, current - stream_begin); 581 582 HeaderLineDescription& current_header_line = headers_->header_lines_.back(); 583 current_header_line.key_end_idx = current - stream_begin; 584 current_header_line.value_begin_idx = current_header_line.key_end_idx; 585 if (current < line_end) { 586 ++current_header_line.key_end_idx; 587 588 CleanUpKeyValueWhitespace(stream_begin, 589 line_begin, 590 current, 591 line_end, 592 ¤t_header_line); 593 } 594 } 595 } 596 597 void BalsaFrame::ProcessContentLengthLine( 598 HeaderLines::size_type line_idx, 599 BalsaHeadersEnums::ContentLengthStatus* status, 600 size_t* length) { 601 const HeaderLineDescription& header_line = headers_->header_lines_[line_idx]; 602 const char* stream_begin = headers_->OriginalHeaderStreamBegin(); 603 const char* line_end = stream_begin + header_line.last_char_idx; 604 const char* value_begin = (stream_begin + header_line.value_begin_idx); 605 606 if (value_begin >= line_end) { 607 // There is no non-whitespace value data. 608 #if DEBUGFRAMER 609 LOG(INFO) << "invalid content-length -- no non-whitespace value data"; 610 #endif 611 *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH; 612 return; 613 } 614 615 *length = 0; 616 while (value_begin < line_end) { 617 if (*value_begin < '0' || *value_begin > '9') { 618 // bad! content-length found, and couldn't parse all of it! 
619 *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH; 620 #if DEBUGFRAMER 621 LOG(INFO) << "invalid content-length - non numeric character detected"; 622 #endif // DEBUGFRAMER 623 return; 624 } 625 const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10; 626 size_t length_x_10 = *length * 10; 627 const unsigned char c = *value_begin - '0'; 628 if (*length > kMaxDiv10 || 629 (std::numeric_limits<size_t>::max() - length_x_10) < c) { 630 *status = BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW; 631 #if DEBUGFRAMER 632 LOG(INFO) << "content-length overflow"; 633 #endif // DEBUGFRAMER 634 return; 635 } 636 *length = length_x_10 + c; 637 ++value_begin; 638 } 639 #if DEBUGFRAMER 640 LOG(INFO) << "content_length parsed: " << *length; 641 #endif // DEBUGFRAMER 642 *status = BalsaHeadersEnums::VALID_CONTENT_LENGTH; 643 } 644 645 void BalsaFrame::ProcessTransferEncodingLine(HeaderLines::size_type line_idx) { 646 const HeaderLineDescription& header_line = headers_->header_lines_[line_idx]; 647 const char* stream_begin = headers_->OriginalHeaderStreamBegin(); 648 const char* line_end = stream_begin + header_line.last_char_idx; 649 const char* value_begin = stream_begin + header_line.value_begin_idx; 650 size_t value_length = line_end - value_begin; 651 652 if ((value_length == 7) && 653 !strncasecmp(value_begin, "chunked", 7)) { 654 headers_->transfer_encoding_is_chunked_ = true; 655 } else if ((value_length == 8) && 656 !strncasecmp(value_begin, "identity", 8)) { 657 headers_->transfer_encoding_is_chunked_ = false; 658 } else { 659 last_error_ = BalsaFrameEnums::UNKNOWN_TRANSFER_ENCODING; 660 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 661 visitor_->HandleHeaderError(this); 662 return; 663 } 664 } 665 666 namespace { 667 bool SplitStringPiece(base::StringPiece original, char delim, 668 base::StringPiece* before, base::StringPiece* after) { 669 const char* p = original.data(); 670 const char* end = p + original.size(); 671 672 while (p != end) { 673 if (*p == delim) { 
674 ++p; 675 } else { 676 const char* start = p; 677 while (++p != end && *p != delim) { 678 // Skip to the next occurence of the delimiter. 679 } 680 *before = base::StringPiece(start, p - start); 681 if (p != end) 682 *after = base::StringPiece(p + 1, end - (p + 1)); 683 else 684 *after = base::StringPiece(""); 685 StringPieceUtils::RemoveWhitespaceContext(before); 686 StringPieceUtils::RemoveWhitespaceContext(after); 687 return true; 688 } 689 } 690 691 *before = original; 692 *after = ""; 693 return false; 694 } 695 696 // TODO(phython): Fix this function to properly deal with quoted values. 697 // E.g. ";;foo", "\";;\"", or \"aa; 698 // The last example, the semi-colon is a separator between extensions. 699 void ProcessChunkExtensionsManual(base::StringPiece all_extensions, 700 BalsaHeaders* extensions) { 701 base::StringPiece extension; 702 base::StringPiece remaining; 703 StringPieceUtils::RemoveWhitespaceContext(&all_extensions); 704 SplitStringPiece(all_extensions, ';', &extension, &remaining); 705 while (!extension.empty()) { 706 base::StringPiece key; 707 base::StringPiece value; 708 SplitStringPiece(extension, '=', &key, &value); 709 if (!value.empty()) { 710 // Strip quotation marks if they exist. 711 if (!value.empty() && value[0] == '"') 712 value.remove_prefix(1); 713 if (!value.empty() && value[value.length() - 1] == '"') 714 value.remove_suffix(1); 715 } 716 717 extensions->AppendHeader(key, value); 718 719 StringPieceUtils::RemoveWhitespaceContext(&remaining); 720 SplitStringPiece(remaining, ';', &extension, &remaining); 721 } 722 } 723 724 // TODO(phython): Fix this function to properly deal with quoted values. 725 // E.g. ";;foo", "\";;\"", or \"aa; 726 // The last example, the semi-colon is a separator between extensions. 
// Alternative chunk-extension parser built on SplitStringPieceToVector().
// Each ';'-separated piece is split at its first '='; the value has
// surrounding whitespace and one leading / one trailing '"' removed, the key
// has surrounding whitespace removed, and the pair is appended to
// |extensions|. Currently compiled but not called (see the "#if 0" in
// BalsaFrame::ProcessChunkExtensions).
void ProcessChunkExtensionsGoogle3(const char* input, size_t size,
                                   BalsaHeaders* extensions) {
  std::vector<base::StringPiece> key_values;
  SplitStringPieceToVector(base::StringPiece(input, size), ";",
                           &key_values, true);
  for (unsigned int i = 0; i < key_values.size(); ++i) {
    // With no '=' present find() yields npos, so |key| is the whole piece and
    // |value| stays empty.
    base::StringPiece key = key_values[i].substr(0, key_values[i].find('='));
    base::StringPiece value;
    if (key.length() < key_values[i].length()) {
      value = key_values[i].substr(key.length() + 1);
      // Remove any leading and trailing whitespace.
      StringPieceUtils::RemoveWhitespaceContext(&value);

      // Strip quotation marks if they exist.
      if (!value.empty() && value[0] == '"')
        value.remove_prefix(1);
      if (!value.empty() && value[value.length() - 1] == '"')
        value.remove_suffix(1);
    }

    // Strip the key whitespace after checking that there is a value.
    StringPieceUtils::RemoveWhitespaceContext(&key);
    extensions->AppendHeader(key, value);
  }
}

}  // anonymous namespace

// Parses the chunk-extension text in [input, input + size) and appends each
// extension to |extensions|. The hand-written parser is the one in use; the
// SplitStringPieceToVector()-based variant is disabled.
void BalsaFrame::ProcessChunkExtensions(const char* input, size_t size,
                                        BalsaHeaders* extensions) {
#if 0
  ProcessChunkExtensionsGoogle3(input, size, extensions);
#else
  ProcessChunkExtensionsManual(base::StringPiece(input, size), extensions);
#endif
}

// Splits the buffered header block into key/value lines and then inspects the
// headers that influence framing (content-length and transfer-encoding).
// On a framing-relevant problem (conflicting content-length headers, repeated
// transfer-encoding, malformed first header, unknown transfer-encoding) the
// framer is put into PARSE_ERROR and the visitor's HandleHeaderError() is
// called.
void BalsaFrame::ProcessHeaderLines() {
  // Both indices are stored as (line index + 1) so that 0 can mean "not seen".
  HeaderLines::size_type content_length_idx = 0;
  HeaderLines::size_type transfer_encoding_idx = 0;

  DCHECK(!lines_.empty());
#if DEBUGFRAMER
  LOG(INFO) << "******@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@**********\n";
#endif  // DEBUGFRAMER

  // There is no need to attempt to process headers if no header lines exist.
  // There are at least two lines in the message which are not header lines.
  // These two non-header lines are the first line of the message, and the
  // last line of the message (which is an empty line).
  // Thus, we test to see if we have more than two lines total before attempting
  // to parse any header lines.
  if (lines_.size() > 2) {
    const char* stream_begin = headers_->OriginalHeaderStreamBegin();

    // Then, for the rest of the header data, we parse these into key-value
    // pairs.
    FindColonsAndParseIntoKeyValue();
    // At this point, we've parsed all of the headers.  Time to look for those
    // headers which we require for framing.
    const HeaderLines::size_type
      header_lines_size = headers_->header_lines_.size();
    for (HeaderLines::size_type i = 0; i < header_lines_size; ++i) {
      const HeaderLineDescription& current_header_line =
        headers_->header_lines_[i];
      const char* key_begin =
        (stream_begin + current_header_line.first_char_idx);
      const char* key_end = (stream_begin + current_header_line.key_end_idx);
      const size_t key_len = key_end - key_begin;
      const char c = *key_begin;
#if DEBUGFRAMER
      LOG(INFO) << "[" << i << "]: " << std::string(key_begin, key_len)
                << " c: '" << c << "' key_len: " << key_len;
#endif  // DEBUGFRAMER
      // If a header begins with either lowercase or uppercase 'c' or 't', then
      // the header may be one of content-length, connection, content-encoding
      // or transfer-encoding. These headers are special, as they change the way
      // that the message is framed, and so the framer is required to search
      // for them.
      // (Only content-length and transfer-encoding are acted upon here; the
      // first-character test is a cheap filter before the full comparison.)


      if (c == 'c' || c == 'C') {
        if ((key_len == kContentLengthSize) &&
            0 == strncasecmp(key_begin, kContentLength, kContentLengthSize)) {
          BalsaHeadersEnums::ContentLengthStatus content_length_status =
            BalsaHeadersEnums::NO_CONTENT_LENGTH;
          size_t length = 0;
          ProcessContentLengthLine(i, &content_length_status, &length);
          if (content_length_idx != 0) {  // then we've already seen one!
            // A repeated content-length is tolerated only when it agrees with
            // the first one (same status and, if valid, same value).
            if ((headers_->content_length_status_ != content_length_status) ||
                ((headers_->content_length_status_ ==
                  BalsaHeadersEnums::VALID_CONTENT_LENGTH) &&
                 length != headers_->content_length_)) {
              last_error_ = BalsaFrameEnums::MULTIPLE_CONTENT_LENGTH_KEYS;
              parse_state_ = BalsaFrameEnums::PARSE_ERROR;
              visitor_->HandleHeaderError(this);
              return;
            }
            continue;
          } else {
            content_length_idx = i + 1;
            headers_->content_length_status_ = content_length_status;
            headers_->content_length_ = length;
            content_length_remaining_ = length;
          }

        }
      } else if (c == 't' || c == 'T') {
        if ((key_len == kTransferEncodingSize) &&
            0 == strncasecmp(key_begin, kTransferEncoding,
                             kTransferEncodingSize)) {
          // Unlike content-length, any repetition of transfer-encoding is an
          // error, even if the values agree.
          if (transfer_encoding_idx != 0) {
            last_error_ = BalsaFrameEnums::MULTIPLE_TRANSFER_ENCODING_KEYS;
            parse_state_ = BalsaFrameEnums::PARSE_ERROR;
            visitor_->HandleHeaderError(this);
            return;
          }
          transfer_encoding_idx = i + 1;
        }
      } else if (i == 0 && (key_len == 0 || c == ' ')) {
        // The very first header line has an empty key or starts with a space.
        last_error_ = BalsaFrameEnums::INVALID_HEADER_FORMAT;
        parse_state_ = BalsaFrameEnums::PARSE_ERROR;
        visitor_->HandleHeaderError(this);
        return;
      }
    }
    // NOTE(review): this test runs before ProcessTransferEncodingLine() below,
    // which is the only place in this file that sets
    // transfer_encoding_is_chunked_. For a freshly cleared headers object the
    // flag therefore appears to still be false here, so the content-length
    // fields are not reset for the message that introduces "chunked" --
    // confirm whether this ordering is intentional before changing it.
    if (headers_->transfer_encoding_is_chunked_) {
      headers_->content_length_ = 0;
      headers_->content_length_status_ = BalsaHeadersEnums::NO_CONTENT_LENGTH;
      content_length_remaining_ = 0;
    }
    if (transfer_encoding_idx != 0) {
      ProcessTransferEncodingLine(transfer_encoding_idx - 1);
    }
  }
}

void BalsaFrame::AssignParseStateAfterHeadersHaveBeenParsed() {
  // For responses, can't have a body if the request was a HEAD, or if it is
  // one of these response-codes.  rfc2616 section 4.3
  parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
  if (is_request_ ||
      !(request_was_head_ ||
        (headers_->parsed_response_code_ >= 100 &&
         headers_->parsed_response_code_ < 200) ||
        (headers_->parsed_response_code_ == 204) ||
        (headers_->parsed_response_code_ == 304))) {
    // Then we can have a body.
    if (headers_->transfer_encoding_is_chunked_) {
      // Note that
      //   if ( Transfer-Encoding: chunked &&  Content-length: )
      // then Transfer-Encoding: chunked trumps.
      // This is as specified in the spec.
      // rfc2616 section 4.4.3
      parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
    } else {
      // Errors parsing content-length definitely can cause
      // protocol errors/warnings
      switch (headers_->content_length_status_) {
        // If we have a content-length, and it is parsed
        // properly, there are two options.
        // 1) zero content, in which case the message is done, and
        // 2) nonzero content, in which case we have to
        //    consume the body.
        case BalsaHeadersEnums::VALID_CONTENT_LENGTH:
          if (headers_->content_length_ == 0) {
            parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
          } else {
            parse_state_ = BalsaFrameEnums::READING_CONTENT;
          }
          break;
        case BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW:
        case BalsaHeadersEnums::INVALID_CONTENT_LENGTH:
          // If there were characters left-over after parsing the
          // content length, we should flag an error and stop.
          parse_state_ = BalsaFrameEnums::PARSE_ERROR;
          last_error_ = BalsaFrameEnums::UNPARSABLE_CONTENT_LENGTH;
          visitor_->HandleHeaderError(this);
          break;
          // We can have: no transfer-encoding, no content length, and no
          // connection: close...
          // Unfortunately, this case doesn't seem to be covered in the spec.
909 // We'll assume that the safest thing to do here is what the google 910 // binaries before 2008 already do, which is to assume that 911 // everything until the connection is closed is body. 912 case BalsaHeadersEnums::NO_CONTENT_LENGTH: 913 if (is_request_) { 914 base::StringPiece method = headers_->request_method(); 915 // POSTs and PUTs should have a detectable body length. If they 916 // do not we consider it an error. 917 if ((method.size() == 4 && 918 strncmp(method.data(), "POST", 4) == 0) || 919 (method.size() == 3 && 920 strncmp(method.data(), "PUT", 3) == 0)) { 921 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 922 last_error_ = 923 BalsaFrameEnums::REQUIRED_BODY_BUT_NO_CONTENT_LENGTH; 924 visitor_->HandleHeaderError(this); 925 break; 926 } 927 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 928 } else { 929 parse_state_ = BalsaFrameEnums::READING_UNTIL_CLOSE; 930 last_error_ = BalsaFrameEnums::MAYBE_BODY_BUT_NO_CONTENT_LENGTH; 931 visitor_->HandleHeaderWarning(this); 932 } 933 break; 934 // The COV_NF_... statements here provide hints to the apparatus 935 // which computes coverage reports/ratios that this code is never 936 // intended to be executed, and should technically be impossible. 
937 // COV_NF_START 938 default: 939 LOG(FATAL) << "Saw a content_length_status: " 940 << headers_->content_length_status_ << " which is unknown."; 941 // COV_NF_END 942 } 943 } 944 } 945 } 946 947 size_t BalsaFrame::ProcessHeaders(const char* message_start, 948 size_t message_length) { 949 const char* const original_message_start = message_start; 950 const char* const message_end = message_start + message_length; 951 const char* message_current = message_start; 952 const char* checkpoint = message_start; 953 954 if (message_length == 0) { 955 goto bottom; 956 } 957 958 while (message_current < message_end) { 959 size_t base_idx = headers_->GetReadableBytesFromHeaderStream(); 960 961 // Yes, we could use strchr (assuming null termination), or 962 // memchr, but as it turns out that is slower than this tight loop 963 // for the input that we see. 964 if (!saw_non_newline_char_) { 965 do { 966 const char c = *message_current; 967 if (c != '\r' && c != '\n') { 968 if (c <= ' ') { 969 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 970 last_error_ = BalsaFrameEnums::NO_REQUEST_LINE_IN_REQUEST; 971 visitor_->HandleHeaderError(this); 972 goto bottom; 973 } else { 974 saw_non_newline_char_ = true; 975 checkpoint = message_start = message_current; 976 goto read_real_message; 977 } 978 } 979 ++message_current; 980 } while (message_current < message_end); 981 goto bottom; // this is necessary to skip 'last_char_was_slash_r' checks 982 } else { 983 read_real_message: 984 // Note that SSE2 can be enabled on certain piii platforms. 
985 #if __SSE2__ 986 { 987 const char* const message_end_m16 = message_end - 16; 988 __v16qi newlines = { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', 989 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' }; 990 while (message_current < message_end_m16) { 991 // What this does (using compiler intrinsics): 992 // 993 // Load 16 '\n's into an xmm register 994 // Load 16 bytes of currennt message into an xmm register 995 // Do byte-wise equals on those two xmm registers 996 // Take the first bit of each byte, and put that into the first 997 // 16 bits of a mask 998 // If the mask is zero, no '\n' found. increment by 16 and try again 999 // Else scan forward to find the first set bit. 1000 // Increment current by the index of the first set bit 1001 // (ffs returns index of first set bit + 1) 1002 __m128i msg_bytes = 1003 _mm_loadu_si128(const_cast<__m128i *>( 1004 reinterpret_cast<const __m128i *>(message_current))); 1005 __m128i newline_cmp = 1006 _mm_cmpeq_epi8(msg_bytes, reinterpret_cast<__m128i>(newlines)); 1007 int newline_msk = _mm_movemask_epi8(newline_cmp); 1008 if (newline_msk == 0) { 1009 message_current += 16; 1010 continue; 1011 } 1012 message_current += (ffs(newline_msk) - 1); 1013 const size_t relative_idx = message_current - message_start; 1014 const size_t message_current_idx = 1 + base_idx + relative_idx; 1015 lines_.push_back(std::make_pair(last_slash_n_idx_, 1016 message_current_idx)); 1017 if (lines_.size() == 1) { 1018 headers_->WriteFromFramer(checkpoint, 1019 1 + message_current - checkpoint); 1020 checkpoint = message_current + 1; 1021 const char* begin = headers_->OriginalHeaderStreamBegin(); 1022 #if DEBUGFRAMER 1023 LOG(INFO) << "First line " << std::string(begin, lines_[0].second); 1024 LOG(INFO) << "is_request_: " << is_request_; 1025 #endif 1026 ProcessFirstLine(begin, begin + lines_[0].second); 1027 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) 1028 goto process_lines; 1029 else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) 
1030 goto bottom; 1031 } 1032 const size_t chars_since_last_slash_n = (message_current_idx - 1033 last_slash_n_idx_); 1034 last_slash_n_idx_ = message_current_idx; 1035 if (chars_since_last_slash_n > 2) { 1036 // We have a slash-n, but the last slash n was 1037 // more than 2 characters away from this. Thus, we know 1038 // that this cannot be an end-of-header. 1039 ++message_current; 1040 continue; 1041 } 1042 if ((chars_since_last_slash_n == 1) || 1043 (((message_current > message_start) && 1044 (*(message_current - 1) == '\r')) || 1045 (last_char_was_slash_r_))) { 1046 goto process_lines; 1047 } 1048 ++message_current; 1049 } 1050 } 1051 #endif // __SSE2__ 1052 while (message_current < message_end) { 1053 if (*message_current != '\n') { 1054 ++message_current; 1055 continue; 1056 } 1057 const size_t relative_idx = message_current - message_start; 1058 const size_t message_current_idx = 1 + base_idx + relative_idx; 1059 lines_.push_back(std::make_pair(last_slash_n_idx_, 1060 message_current_idx)); 1061 if (lines_.size() == 1) { 1062 headers_->WriteFromFramer(checkpoint, 1063 1 + message_current - checkpoint); 1064 checkpoint = message_current + 1; 1065 const char* begin = headers_->OriginalHeaderStreamBegin(); 1066 #if DEBUGFRAMER 1067 LOG(INFO) << "First line " << std::string(begin, lines_[0].second); 1068 LOG(INFO) << "is_request_: " << is_request_; 1069 #endif 1070 ProcessFirstLine(begin, begin + lines_[0].second); 1071 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) 1072 goto process_lines; 1073 else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) 1074 goto bottom; 1075 } 1076 const size_t chars_since_last_slash_n = (message_current_idx - 1077 last_slash_n_idx_); 1078 last_slash_n_idx_ = message_current_idx; 1079 if (chars_since_last_slash_n > 2) { 1080 // false positive. 
1081 ++message_current; 1082 continue; 1083 } 1084 if ((chars_since_last_slash_n == 1) || 1085 (((message_current > message_start) && 1086 (*(message_current - 1) == '\r')) || 1087 (last_char_was_slash_r_))) { 1088 goto process_lines; 1089 } 1090 ++message_current; 1091 } 1092 } 1093 continue; 1094 process_lines: 1095 ++message_current; 1096 DCHECK(message_current >= message_start); 1097 if (message_current > message_start) { 1098 headers_->WriteFromFramer(checkpoint, message_current - checkpoint); 1099 } 1100 1101 // Check if we have exceeded maximum headers length 1102 // Although we check for this limit before and after we call this function 1103 // we check it here as well to make sure that in case the visitor changed 1104 // the max_header_length_ (for example after processing the first line) 1105 // we handle it gracefully. 1106 if (headers_->GetReadableBytesFromHeaderStream() > max_header_length_) { 1107 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1108 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; 1109 visitor_->HandleHeaderError(this); 1110 goto bottom; 1111 } 1112 1113 // Since we know that we won't be writing any more bytes of the header, 1114 // we tell that to the headers object. The headers object may make 1115 // more efficient allocation decisions when this is signaled. 1116 headers_->DoneWritingFromFramer(); 1117 { 1118 const char* readable_ptr = NULL; 1119 size_t readable_size = 0; 1120 headers_->GetReadablePtrFromHeaderStream(&readable_ptr, &readable_size); 1121 visitor_->ProcessHeaderInput(readable_ptr, readable_size); 1122 } 1123 1124 // Ok, now that we've written everything into our header buffer, it is 1125 // time to process the header lines (extract proper values for headers 1126 // which are important for framing). 
1127 ProcessHeaderLines(); 1128 if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) { 1129 goto bottom; 1130 } 1131 AssignParseStateAfterHeadersHaveBeenParsed(); 1132 if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) { 1133 goto bottom; 1134 } 1135 visitor_->ProcessHeaders(*headers_); 1136 visitor_->HeaderDone(); 1137 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) { 1138 visitor_->MessageDone(); 1139 } 1140 goto bottom; 1141 } 1142 // If we've gotten to here, it means that we've consumed all of the 1143 // available input. We need to record whether or not the last character we 1144 // saw was a '\r' so that a subsequent call to ProcessInput correctly finds 1145 // a header framing that is split across the two calls. 1146 last_char_was_slash_r_ = (*(message_end - 1) == '\r'); 1147 DCHECK(message_current >= message_start); 1148 if (message_current > message_start) { 1149 headers_->WriteFromFramer(checkpoint, message_current - checkpoint); 1150 } 1151 bottom: 1152 return message_current - original_message_start; 1153 } 1154 1155 1156 size_t BalsaFrame::BytesSafeToSplice() const { 1157 switch (parse_state_) { 1158 case BalsaFrameEnums::READING_CHUNK_DATA: 1159 return chunk_length_remaining_; 1160 case BalsaFrameEnums::READING_UNTIL_CLOSE: 1161 return std::numeric_limits<size_t>::max(); 1162 case BalsaFrameEnums::READING_CONTENT: 1163 return content_length_remaining_; 1164 default: 1165 return 0; 1166 } 1167 } 1168 1169 void BalsaFrame::BytesSpliced(size_t bytes_spliced) { 1170 switch (parse_state_) { 1171 case BalsaFrameEnums::READING_CHUNK_DATA: 1172 if (chunk_length_remaining_ >= bytes_spliced) { 1173 chunk_length_remaining_ -= bytes_spliced; 1174 if (chunk_length_remaining_ == 0) { 1175 parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM; 1176 } 1177 return; 1178 } else { 1179 last_error_ = 1180 BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT; 1181 goto error_exit; 1182 } 1183 1184 case BalsaFrameEnums::READING_UNTIL_CLOSE: 1185 return; 
1186 1187 case BalsaFrameEnums::READING_CONTENT: 1188 if (content_length_remaining_ >= bytes_spliced) { 1189 content_length_remaining_ -= bytes_spliced; 1190 if (content_length_remaining_ == 0) { 1191 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1192 visitor_->MessageDone(); 1193 } 1194 return; 1195 } else { 1196 last_error_ = 1197 BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT; 1198 goto error_exit; 1199 } 1200 1201 default: 1202 last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO; 1203 goto error_exit; 1204 } 1205 1206 error_exit: 1207 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1208 visitor_->HandleBodyError(this); 1209 }; 1210 1211 // You may note that the state-machine contained within this function has both 1212 // switch and goto labels for nearly the same thing. For instance, the 1213 // following two labels refer to the same code block: 1214 // label_reading_chunk_data: 1215 // case BalsaFrameEnums::READING_CHUNK_DATA: 1216 // The 'case' statement is required for the switch statement which occurs when 1217 // ProcessInput is invoked. The goto label is required as the state-machine 1218 // does not use a computed goto in any subsequent operations. 1219 // 1220 // Since several states exit the state machine for various reasons, there is 1221 // also one label at the bottom of the function. When it is appropriate to 1222 // return from the function, that part of the state machine instead issues a 1223 // goto bottom; This results in less code duplication, and makes debugging 1224 // easier (as you can add a statement to a section of code which is guaranteed 1225 // to be invoked when the function is exiting. 
1226 size_t BalsaFrame::ProcessInput(const char* input, size_t size) { 1227 const char* current = input; 1228 const char* on_entry = current; 1229 const char* end = current + size; 1230 #if DEBUGFRAMER 1231 LOG(INFO) << "\n==============" 1232 << BalsaFrameEnums::ParseStateToString(parse_state_) 1233 << "===============\n"; 1234 #endif // DEBUGFRAMER 1235 1236 DCHECK(headers_ != NULL); 1237 if (headers_ == NULL) return 0; 1238 1239 if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) { 1240 const size_t header_length = headers_->GetReadableBytesFromHeaderStream(); 1241 // Yes, we still have to check this here as the user can change the 1242 // max_header_length amount! 1243 // Also it is possible that we have reached the maximum allowed header size, 1244 // and we have more to consume (remember we are still inside 1245 // READING_HEADER_AND_FIRSTLINE) in which case we directly declare an error. 1246 if (header_length > max_header_length_ || 1247 (header_length == max_header_length_ && size > 0)) { 1248 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1249 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; 1250 visitor_->HandleHeaderError(this); 1251 goto bottom; 1252 } 1253 size_t bytes_to_process = max_header_length_ - header_length; 1254 if (bytes_to_process > size) { 1255 bytes_to_process = size; 1256 } 1257 current += ProcessHeaders(input, bytes_to_process); 1258 // If we are still reading headers check if we have crossed the headers 1259 // limit. Note that we check for >= as opposed to >. This is because if 1260 // header_length_after equals max_header_length_ and we are still in the 1261 // parse_state_ BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE we know for 1262 // sure that the headers limit will be crossed later on 1263 if ((parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE)) { 1264 // Note that headers_ is valid only if we are still reading headers. 
1265 const size_t header_length_after = 1266 headers_->GetReadableBytesFromHeaderStream(); 1267 if (header_length_after >= max_header_length_) { 1268 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1269 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; 1270 visitor_->HandleHeaderError(this); 1271 } 1272 } 1273 goto bottom; 1274 } else if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ || 1275 parse_state_ == BalsaFrameEnums::PARSE_ERROR) { 1276 // Can do nothing more 'till we're reset. 1277 goto bottom; 1278 } 1279 1280 while (current < end) { 1281 switch (parse_state_) { 1282 label_reading_chunk_length: 1283 case BalsaFrameEnums::READING_CHUNK_LENGTH: 1284 // In this state we read the chunk length. 1285 // Note that once we hit a character which is not in: 1286 // [0-9;A-Fa-f\n], we transition to a different state. 1287 // 1288 { 1289 // If we used strtol, etc, we'd have to buffer this line. 1290 // This is more annoying than simply doing the conversion 1291 // here. This code accounts for overflow. 
1292 static const signed char buf[] = { 1293 // %0 %1 %2 %3 %4 %5 %6 %7 %8 \t \n %b %c \r %e %f 1294 -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -1, -1, 1295 // %10 %11 %12 %13 %14 %15 %16 %17 %18 %19 %1a %1b %1c %1d %1e %1f 1296 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1297 // ' ' %21 %22 %23 %24 %25 %26 %27 %28 %29 %2a %2b %2c %2d %2e %2f 1298 -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1299 // %30 %31 %32 %33 %34 %35 %36 %37 %38 %39 %3a ';' %3c %3d %3e %3f 1300 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -2, -1, -1, -1, -1, 1301 // %40 'A' 'B' 'C' 'D' 'E' 'F' %47 %48 %49 %4a %4b %4c %4d %4e %4f 1302 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1303 // %50 %51 %52 %53 %54 %55 %56 %57 %58 %59 %5a %5b %5c %5d %5e %5f 1304 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1305 // %60 'a' 'b' 'c' 'd' 'e' 'f' %67 %68 %69 %6a %6b %6c %6d %6e %6f 1306 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1307 // %70 %71 %72 %73 %74 %75 %76 %77 %78 %79 %7a %7b %7c %7d %7e %7f 1308 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1309 }; 1310 // valid cases: 1311 // "09123\n" // -> 09123 1312 // "09123\r\n" // -> 09123 1313 // "09123 \n" // -> 09123 1314 // "09123 \r\n" // -> 09123 1315 // "09123 12312\n" // -> 09123 1316 // "09123 12312\r\n" // -> 09123 1317 // "09123; foo=bar\n" // -> 09123 1318 // "09123; foo=bar\r\n" // -> 09123 1319 // "FFFFFFFFFFFFFFFF\r\n" // -> FFFFFFFFFFFFFFFF 1320 // "FFFFFFFFFFFFFFFF 22\r\n" // -> FFFFFFFFFFFFFFFF 1321 // invalid cases: 1322 // "[ \t]+[^\n]*\n" 1323 // "FFFFFFFFFFFFFFFFF\r\n" (would overflow) 1324 // "\r\n" 1325 // "\n" 1326 while (current < end) { 1327 const char c = *current; 1328 ++current; 1329 const signed char addition = buf[static_cast<int>(c)]; 1330 if (addition >= 0) { 1331 chunk_length_character_extracted_ = true; 1332 size_t length_x_16 = chunk_length_remaining_ * 16; 1333 const size_t kMaxDiv16 = 
std::numeric_limits<size_t>::max() / 16; 1334 if ((chunk_length_remaining_ > kMaxDiv16) || 1335 ((std::numeric_limits<size_t>::max() - length_x_16) < 1336 static_cast<size_t>(addition))) { 1337 // overflow -- asked for a chunk-length greater than 2^64 - 1!! 1338 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1339 last_error_ = BalsaFrameEnums::CHUNK_LENGTH_OVERFLOW; 1340 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1341 visitor_->HandleChunkingError(this); 1342 goto bottom; 1343 } 1344 chunk_length_remaining_ = length_x_16 + addition; 1345 continue; 1346 } 1347 1348 if (!chunk_length_character_extracted_ || addition == -1) { 1349 // ^[0-9;A-Fa-f][ \t\n] -- was not matched, either because no 1350 // characters were converted, or an unexpected character was 1351 // seen. 1352 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1353 last_error_ = BalsaFrameEnums::INVALID_CHUNK_LENGTH; 1354 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1355 visitor_->HandleChunkingError(this); 1356 goto bottom; 1357 } 1358 1359 --current; 1360 parse_state_ = BalsaFrameEnums::READING_CHUNK_EXTENSION; 1361 visitor_->ProcessChunkLength(chunk_length_remaining_); 1362 goto label_reading_chunk_extension; 1363 } 1364 } 1365 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1366 goto bottom; // case BalsaFrameEnums::READING_CHUNK_LENGTH 1367 1368 label_reading_chunk_extension: 1369 case BalsaFrameEnums::READING_CHUNK_EXTENSION: 1370 { 1371 // TODO(phython): Convert this scanning to be 16 bytes at a time if 1372 // there is data to be read. 1373 const char* extensions_start = current; 1374 size_t extensions_length = 0; 1375 while (current < end) { 1376 const char c = *current; 1377 if (c == '\r' || c == '\n') { 1378 extensions_length = 1379 (extensions_start == current) ? 
1380 0 : 1381 current - extensions_start - 1; 1382 } 1383 1384 ++current; 1385 if (c == '\n') { 1386 chunk_length_character_extracted_ = false; 1387 visitor_->ProcessChunkExtensions( 1388 extensions_start, extensions_length); 1389 if (chunk_length_remaining_ != 0) { 1390 parse_state_ = BalsaFrameEnums::READING_CHUNK_DATA; 1391 goto label_reading_chunk_data; 1392 } 1393 HeaderFramingFound('\n'); 1394 parse_state_ = BalsaFrameEnums::READING_LAST_CHUNK_TERM; 1395 goto label_reading_last_chunk_term; 1396 } 1397 } 1398 visitor_->ProcessChunkExtensions( 1399 extensions_start, extensions_length); 1400 } 1401 1402 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1403 goto bottom; // case BalsaFrameEnums::READING_CHUNK_EXTENSION 1404 1405 label_reading_chunk_data: 1406 case BalsaFrameEnums::READING_CHUNK_DATA: 1407 while (current < end) { 1408 if (chunk_length_remaining_ == 0) { 1409 break; 1410 } 1411 // read in the chunk 1412 size_t bytes_remaining = end - current; 1413 size_t consumed_bytes = (chunk_length_remaining_ < bytes_remaining) ? 
1414 chunk_length_remaining_ : bytes_remaining; 1415 const char* tmp_current = current + consumed_bytes; 1416 visitor_->ProcessBodyInput(on_entry, tmp_current - on_entry); 1417 visitor_->ProcessBodyData(current, consumed_bytes); 1418 on_entry = current = tmp_current; 1419 chunk_length_remaining_ -= consumed_bytes; 1420 } 1421 if (chunk_length_remaining_ == 0) { 1422 parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM; 1423 goto label_reading_chunk_term; 1424 } 1425 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1426 goto bottom; // case BalsaFrameEnums::READING_CHUNK_DATA 1427 1428 label_reading_chunk_term: 1429 case BalsaFrameEnums::READING_CHUNK_TERM: 1430 while (current < end) { 1431 const char c = *current; 1432 ++current; 1433 1434 if (c == '\n') { 1435 parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH; 1436 goto label_reading_chunk_length; 1437 } 1438 } 1439 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1440 goto bottom; // case BalsaFrameEnums::READING_CHUNK_TERM 1441 1442 label_reading_last_chunk_term: 1443 case BalsaFrameEnums::READING_LAST_CHUNK_TERM: 1444 while (current < end) { 1445 const char c = *current; 1446 1447 if (!HeaderFramingFound(c)) { 1448 // If not, however, since the spec only suggests that the 1449 // client SHOULD indicate the presence of trailers, we get to 1450 // *test* that they did or didn't. 1451 // If all of the bytes we've seen since: 1452 // OPTIONAL_WS 0 OPTIONAL_STUFF CRLF 1453 // are either '\r', or '\n', then we can assume that we don't yet 1454 // know if we need to parse headers, or if the next byte will make 1455 // the HeaderFramingFound condition (above) true. 1456 if (HeaderFramingMayBeFound()) { 1457 // If true, then we have seen only characters '\r' or '\n'. 1458 ++current; 1459 1460 // Lets try again! There is no state change here. 1461 continue; 1462 } else { 1463 // If (!HeaderFramingMayBeFound()), then we know that we must be 1464 // reading the first non CRLF character of a trailer. 
1465 parse_state_ = BalsaFrameEnums::READING_TRAILER; 1466 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1467 on_entry = current; 1468 goto label_reading_trailer; 1469 } 1470 } else { 1471 // If we've found a "\r\n\r\n", then the message 1472 // is done. 1473 ++current; 1474 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1475 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1476 visitor_->MessageDone(); 1477 goto bottom; 1478 } 1479 break; // from while loop 1480 } 1481 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1482 goto bottom; // case BalsaFrameEnums::READING_LAST_CHUNK_TERM 1483 1484 label_reading_trailer: 1485 case BalsaFrameEnums::READING_TRAILER: 1486 while (current < end) { 1487 const char c = *current; 1488 ++current; 1489 // TODO(fenix): If we ever care about trailers as part of framing, 1490 // deal with them here (see below for part of the 'solution') 1491 // if (LineFramingFound(c)) { 1492 // trailer_lines_.push_back(make_pair(start_of_line_, 1493 // trailer_length_ - 1)); 1494 // start_of_line_ = trailer_length_; 1495 // } 1496 if (HeaderFramingFound(c)) { 1497 // ProcessTrailers(visitor_, &trailers_); 1498 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1499 visitor_->ProcessTrailerInput(on_entry, current - on_entry); 1500 visitor_->MessageDone(); 1501 goto bottom; 1502 } 1503 } 1504 visitor_->ProcessTrailerInput(on_entry, current - on_entry); 1505 break; // case BalsaFrameEnums::READING_TRAILER 1506 1507 // Note that there is no label: 1508 // 'label_reading_until_close' 1509 // here. This is because the state-machine exists immediately after 1510 // reading the headers instead of transitioning here (as it would 1511 // do if it was consuming all the data it could, all the time). 
1512 case BalsaFrameEnums::READING_UNTIL_CLOSE: 1513 { 1514 const size_t bytes_remaining = end - current; 1515 if (bytes_remaining > 0) { 1516 visitor_->ProcessBodyInput(current, bytes_remaining); 1517 visitor_->ProcessBodyData(current, bytes_remaining); 1518 current += bytes_remaining; 1519 } 1520 } 1521 goto bottom; // case BalsaFrameEnums::READING_UNTIL_CLOSE 1522 1523 // label_reading_content: 1524 case BalsaFrameEnums::READING_CONTENT: 1525 #if DEBUGFRAMER 1526 LOG(INFO) << "ReadingContent: " << content_length_remaining_; 1527 #endif // DEBUGFRAMER 1528 while (content_length_remaining_ && current < end) { 1529 // read in the content 1530 const size_t bytes_remaining = end - current; 1531 const size_t consumed_bytes = 1532 (content_length_remaining_ < bytes_remaining) ? 1533 content_length_remaining_ : bytes_remaining; 1534 visitor_->ProcessBodyInput(current, consumed_bytes); 1535 visitor_->ProcessBodyData(current, consumed_bytes); 1536 current += consumed_bytes; 1537 content_length_remaining_ -= consumed_bytes; 1538 } 1539 if (content_length_remaining_ == 0) { 1540 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1541 visitor_->MessageDone(); 1542 } 1543 goto bottom; // case BalsaFrameEnums::READING_CONTENT 1544 1545 default: 1546 // The state-machine should never be in a state that isn't handled 1547 // above. This is a glaring logic error, and we should do something 1548 // drastic to ensure that this gets looked-at and fixed. 
1549 LOG(FATAL) << "Unknown state: " << parse_state_ // COV_NF_LINE 1550 << " memory corruption?!"; // COV_NF_LINE 1551 } 1552 } 1553 bottom: 1554 #if DEBUGFRAMER 1555 LOG(INFO) << "\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n" 1556 << std::string(input, current) 1557 << "\n$$$$$$$$$$$$$$" 1558 << BalsaFrameEnums::ParseStateToString(parse_state_) 1559 << "$$$$$$$$$$$$$$$" 1560 << " consumed: " << (current - input); 1561 if (Error()) { 1562 LOG(INFO) << BalsaFrameEnums::ErrorCodeToString(ErrorCode()); 1563 } 1564 #endif // DEBUGFRAMER 1565 return current - input; 1566 } 1567 1568 const uint32 BalsaFrame::kValidTerm1; 1569 const uint32 BalsaFrame::kValidTerm1Mask; 1570 const uint32 BalsaFrame::kValidTerm2; 1571 const uint32 BalsaFrame::kValidTerm2Mask; 1572 1573 } // namespace net 1574 1575