1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "net/tools/balsa/balsa_frame.h" 6 7 // Visual C++ defines _M_IX86_FP as 2 if the /arch:SSE2 compiler option is 8 // specified. 9 #if !defined(__SSE2__) && _M_IX86_FP == 2 10 #define __SSE2__ 1 11 #endif 12 13 #include <assert.h> 14 #if __SSE2__ 15 #include <emmintrin.h> 16 #endif // __SSE2__ 17 18 #include <limits> 19 #include <string> 20 #include <utility> 21 #include <vector> 22 23 #include "base/logging.h" 24 #include "base/port.h" 25 #include "base/strings/string_piece.h" 26 #include "net/tools/balsa/balsa_enums.h" 27 #include "net/tools/balsa/balsa_headers.h" 28 #include "net/tools/balsa/balsa_visitor_interface.h" 29 #include "net/tools/balsa/buffer_interface.h" 30 #include "net/tools/balsa/simple_buffer.h" 31 #include "net/tools/balsa/split.h" 32 #include "net/tools/balsa/string_piece_utils.h" 33 34 #if defined(COMPILER_MSVC) 35 #include <intrin.h> 36 #include <string.h> 37 38 #pragma intrinsic(_BitScanForward) 39 40 static int ffs(int i) { 41 unsigned long index; 42 return _BitScanForward(&index, i) ? index + 1 : 0; 43 } 44 45 #define strncasecmp _strnicmp 46 #else 47 #include <strings.h> 48 #endif 49 50 namespace net { 51 52 // Constants holding some header names for headers which can affect the way the 53 // HTTP message is framed, and so must be processed specially: 54 static const char kContentLength[] = "content-length"; 55 static const size_t kContentLengthSize = sizeof(kContentLength) - 1; 56 static const char kTransferEncoding[] = "transfer-encoding"; 57 static const size_t kTransferEncodingSize = sizeof(kTransferEncoding) - 1; 58 59 BalsaFrame::BalsaFrame() 60 : last_char_was_slash_r_(false), 61 saw_non_newline_char_(false), 62 start_was_space_(true), 63 chunk_length_character_extracted_(false), 64 is_request_(true), 65 request_was_head_(false), 66 max_header_length_(16 * 1024), 67 max_request_uri_length_(2048), 68 visitor_(&do_nothing_visitor_), 69 chunk_length_remaining_(0), 70 content_length_remaining_(0), 71 last_slash_n_loc_(NULL), 72 last_recorded_slash_n_loc_(NULL), 73 last_slash_n_idx_(0), 74 term_chars_(0), 75 parse_state_(BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE), 76 last_error_(BalsaFrameEnums::NO_ERROR), 77 headers_(NULL) { 78 } 79 80 BalsaFrame::~BalsaFrame() {} 81 82 void BalsaFrame::Reset() { 83 last_char_was_slash_r_ = false; 84 saw_non_newline_char_ = false; 85 start_was_space_ = true; 86 chunk_length_character_extracted_ = false; 87 // is_request_ = true; // not reset between messages. 88 // request_was_head_ = false; // not reset between messages. 89 // max_header_length_ = 4096; // not reset between messages. 90 // max_request_uri_length_ = 2048; // not reset between messages. 91 // visitor_ = &do_nothing_visitor_; // not reset between messages. 92 chunk_length_remaining_ = 0; 93 content_length_remaining_ = 0; 94 last_slash_n_loc_ = NULL; 95 last_recorded_slash_n_loc_ = NULL; 96 last_slash_n_idx_ = 0; 97 term_chars_ = 0; 98 parse_state_ = BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE; 99 last_error_ = BalsaFrameEnums::NO_ERROR; 100 lines_.clear(); 101 if (headers_ != NULL) { 102 headers_->Clear(); 103 } 104 } 105 106 const char* BalsaFrameEnums::ParseStateToString( 107 BalsaFrameEnums::ParseState error_code) { 108 switch (error_code) { 109 case PARSE_ERROR: 110 return "PARSE_ERROR"; 111 case READING_HEADER_AND_FIRSTLINE: 112 return "READING_HEADER_AND_FIRSTLINE"; 113 case READING_CHUNK_LENGTH: 114 return "READING_CHUNK_LENGTH"; 115 case READING_CHUNK_EXTENSION: 116 return "READING_CHUNK_EXTENSION"; 117 case READING_CHUNK_DATA: 118 return "READING_CHUNK_DATA"; 119 case READING_CHUNK_TERM: 120 return "READING_CHUNK_TERM"; 121 case READING_LAST_CHUNK_TERM: 122 return "READING_LAST_CHUNK_TERM"; 123 case READING_TRAILER: 124 return "READING_TRAILER"; 125 case READING_UNTIL_CLOSE: 126 return "READING_UNTIL_CLOSE"; 127 case READING_CONTENT: 128 return "READING_CONTENT"; 129 case MESSAGE_FULLY_READ: 130 return "MESSAGE_FULLY_READ"; 131 case NUM_STATES: 132 return "UNKNOWN_STATE"; 133 } 134 return "UNKNOWN_STATE"; 135 } 136 137 const char* BalsaFrameEnums::ErrorCodeToString( 138 BalsaFrameEnums::ErrorCode error_code) { 139 switch (error_code) { 140 case NO_ERROR: 141 return "NO_ERROR"; 142 case NO_STATUS_LINE_IN_RESPONSE: 143 return "NO_STATUS_LINE_IN_RESPONSE"; 144 case NO_REQUEST_LINE_IN_REQUEST: 145 return "NO_REQUEST_LINE_IN_REQUEST"; 146 case FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION: 147 return "FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION"; 148 case FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD: 149 return "FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD"; 150 case FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE: 151 return "FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE"; 152 case FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI: 153 return "FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI"; 154 case FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE: 155 return "FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE"; 156 case FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION: 157 return "FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION"; 158 case FAILED_CONVERTING_STATUS_CODE_TO_INT: 159 return "FAILED_CONVERTING_STATUS_CODE_TO_INT"; 160 case REQUEST_URI_TOO_LONG: 161 return "REQUEST_URI_TOO_LONG"; 162 case HEADERS_TOO_LONG: 163 return "HEADERS_TOO_LONG"; 164 case UNPARSABLE_CONTENT_LENGTH: 165 return "UNPARSABLE_CONTENT_LENGTH"; 166 case MAYBE_BODY_BUT_NO_CONTENT_LENGTH: 167 return "MAYBE_BODY_BUT_NO_CONTENT_LENGTH"; 168 case REQUIRED_BODY_BUT_NO_CONTENT_LENGTH: 169 return "REQUIRED_BODY_BUT_NO_CONTENT_LENGTH"; 170 case HEADER_MISSING_COLON: 171 return "HEADER_MISSING_COLON"; 172 case INVALID_CHUNK_LENGTH: 173 return "INVALID_CHUNK_LENGTH"; 174 case CHUNK_LENGTH_OVERFLOW: 175 return "CHUNK_LENGTH_OVERFLOW"; 176 case CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO: 177 return "CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO"; 178 case CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT: 179 return "CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT"; 180 case MULTIPLE_CONTENT_LENGTH_KEYS: 181 return "MULTIPLE_CONTENT_LENGTH_KEYS"; 182 case MULTIPLE_TRANSFER_ENCODING_KEYS: 183 return "MULTIPLE_TRANSFER_ENCODING_KEYS"; 184 case UNKNOWN_TRANSFER_ENCODING: 185 return "UNKNOWN_TRANSFER_ENCODING"; 186 case INVALID_HEADER_FORMAT: 187 return "INVALID_HEADER_FORMAT"; 188 case INTERNAL_LOGIC_ERROR: 189 return "INTERNAL_LOGIC_ERROR"; 190 case NUM_ERROR_CODES: 191 return "UNKNOWN_ERROR"; 192 } 193 return "UNKNOWN_ERROR"; 194 } 195 196 // Summary: 197 // Parses the first line of either a request or response. 198 // Note that in the case of a detected warning, error_code will be set 199 // but the function will not return false. 200 // Exactly zero or one warning or error (but not both) may be detected 201 // by this function. 202 // Note that this function will not write the data of the first-line 203 // into the header's buffer (that should already have been done elsewhere). 204 // 205 // Pre-conditions: 206 // begin != end 207 // *begin should be a character which is > ' '. This implies that there 208 // is at least one non-whitespace characters between [begin, end). 209 // headers is a valid pointer to a BalsaHeaders class. 210 // error_code is a valid pointer to a BalsaFrameEnums::ErrorCode value. 211 // Entire first line must exist between [begin, end) 212 // Exactly zero or one newlines -may- exist between [begin, end) 213 // [begin, end) should exist in the header's buffer. 214 // 215 // Side-effects: 216 // headers will be modified 217 // error_code may be modified if either a warning or error is detected 218 // 219 // Returns: 220 // True if no error (as opposed to warning) is detected. 221 // False if an error (as opposed to warning) is detected. 222 223 // 224 // If there is indeed non-whitespace in the line, then the following 225 // will take care of this for you: 226 // while (*begin <= ' ') ++begin; 227 // ProcessFirstLine(begin, end, is_request, &headers, &error_code); 228 // 229 bool ParseHTTPFirstLine(const char* begin, 230 const char* end, 231 bool is_request, 232 size_t max_request_uri_length, 233 BalsaHeaders* headers, 234 BalsaFrameEnums::ErrorCode* error_code) { 235 const char* current = begin; 236 // HTTP firstlines all have the following structure: 237 // LWS NONWS LWS NONWS LWS NONWS NOTCRLF CRLF 238 // [\t \r\n]+ [^\t ]+ [\t ]+ [^\t ]+ [\t ]+ [^\t ]+ [^\r\n]+ "\r\n" 239 // ws1 nws1 ws2 nws2 ws3 nws3 ws4 240 // | [-------) [-------) [----------------) 241 // REQ: method request_uri version 242 // RESP: version statuscode reason 243 // 244 // The first NONWS->LWS component we'll call firstline_a. 245 // The second firstline_b, and the third firstline_c. 246 // 247 // firstline_a goes from nws1 to (but not including) ws2 248 // firstline_b goes from nws2 to (but not including) ws3 249 // firstline_c goes from nws3 to (but not including) ws4 250 // 251 // In the code: 252 // ws1 == whitespace_1_idx_ 253 // nws1 == non_whitespace_1_idx_ 254 // ws2 == whitespace_2_idx_ 255 // nws2 == non_whitespace_2_idx_ 256 // ws3 == whitespace_3_idx_ 257 // nws3 == non_whitespace_3_idx_ 258 // ws4 == whitespace_4_idx_ 259 260 // Kill all whitespace (including '\r\n') at the end of the line. 261 --end; 262 if (*end != '\n') { 263 *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR; 264 LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n" 265 << headers->OriginalHeadersForDebugging(); 266 return false; 267 } 268 while (begin < end && *end <= ' ') { 269 --end; 270 } 271 DCHECK(*end != '\n'); 272 if (*end == '\n') { 273 *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR; 274 LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n" 275 << headers->OriginalHeadersForDebugging(); 276 return false; 277 } 278 ++end; 279 280 // The two following statements should not be possible. 281 if (end == begin) { 282 *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR; 283 LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n" 284 << headers->OriginalHeadersForDebugging(); 285 return false; 286 } 287 288 // whitespace_1_idx_ 289 headers->whitespace_1_idx_ = current - begin; 290 // This loop is commented out as it is never used in current code. This is 291 // true only because we don't begin parsing the headers at all until we've 292 // encountered a non whitespace character at the beginning of the stream, at 293 // which point we begin our demarcation of header-start. If we did -not- do 294 // this (for instance, only looked for [\r\n] instead of (< ' ')), this loop 295 // would be necessary for the proper functioning of this parsing. 296 // This is left here as this function may (in the future) be refactored out 297 // of the BalsaFrame class so that it may be shared between code in 298 // BalsaFrame and BalsaHeaders (where it would be used in some variant of the 299 // set_first_line() function (at which point it would be necessary). 300 #if 0 301 while (*current <= ' ') { 302 ++current; 303 } 304 #endif 305 // non_whitespace_1_idx_ 306 headers->non_whitespace_1_idx_ = current - begin; 307 do { 308 // The first time through, we're guaranteed that the current character 309 // won't be a whitespace (else the loop above wouldn't have terminated). 310 // That implies that we're guaranteed to get at least one non-whitespace 311 // character if we get into this loop at all. 312 ++current; 313 if (current == end) { 314 headers->whitespace_2_idx_ = current - begin; 315 headers->non_whitespace_2_idx_ = current - begin; 316 headers->whitespace_3_idx_ = current - begin; 317 headers->non_whitespace_3_idx_ = current - begin; 318 headers->whitespace_4_idx_ = current - begin; 319 // FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD for request 320 // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION for response 321 *error_code = 322 static_cast<BalsaFrameEnums::ErrorCode>( 323 BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION + 324 is_request); 325 if (!is_request) { // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION 326 return false; 327 } 328 goto output_exhausted; 329 } 330 } while (*current > ' '); 331 // whitespace_2_idx_ 332 headers->whitespace_2_idx_ = current - begin; 333 do { 334 ++current; 335 // Note that due to the loop which consumes all of the whitespace 336 // at the end of the line, current can never == end while in this function. 337 } while (*current <= ' '); 338 // non_whitespace_2_idx_ 339 headers->non_whitespace_2_idx_ = current - begin; 340 do { 341 ++current; 342 if (current == end) { 343 headers->whitespace_3_idx_ = current - begin; 344 headers->non_whitespace_3_idx_ = current - begin; 345 headers->whitespace_4_idx_ = current - begin; 346 // FAILED_TO_FIND_START_OF_REQUEST_REQUEST_URI for request 347 // FAILED_TO_FIND_START_OF_RESPONSE_STATUSCODE for response 348 *error_code = 349 static_cast<BalsaFrameEnums::ErrorCode>( 350 BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE 351 + is_request); 352 goto output_exhausted; 353 } 354 } while (*current > ' '); 355 // whitespace_3_idx_ 356 headers->whitespace_3_idx_ = current - begin; 357 do { 358 ++current; 359 // Note that due to the loop which consumes all of the whitespace 360 // at the end of the line, current can never == end while in this function. 361 } while (*current <= ' '); 362 // non_whitespace_3_idx_ 363 headers->non_whitespace_3_idx_ = current - begin; 364 headers->whitespace_4_idx_ = end - begin; 365 366 output_exhausted: 367 // Note that we don't fail the parse immediately when parsing of the 368 // firstline fails. Depending on the protocol type, we may want to accept 369 // a firstline with only one or two elements, e.g., for HTTP/0.9: 370 // GET\r\n 371 // or 372 // GET /\r\n 373 // should be parsed without issue (though the visitor should know that 374 // parsing the entire line was not exactly as it should be). 375 // 376 // Eventually, these errors may be removed alltogether, as the visitor can 377 // detect them on its own by examining the size of the various fields. 378 // headers->set_first_line(non_whitespace_1_idx_, current); 379 380 if (is_request) { 381 if ((headers->whitespace_3_idx_ - headers->non_whitespace_2_idx_) > 382 max_request_uri_length) { 383 // For requests, we need at least the method. We could assume that a 384 // blank URI means "/". If version isn't stated, it should be assumed 385 // to be HTTP/0.9 by the visitor. 386 *error_code = BalsaFrameEnums::REQUEST_URI_TOO_LONG; 387 return false; 388 } 389 } else { 390 headers->parsed_response_code_ = 0; 391 { 392 const char* parsed_response_code_current = 393 begin + headers->non_whitespace_2_idx_; 394 const char* parsed_response_code_end = begin + headers->whitespace_3_idx_; 395 const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10; 396 397 // Convert a string of [0-9]* into an int. 398 // Note that this allows for the conversion of response codes which 399 // are outside the bounds of normal HTTP response codes (no checking 400 // is done to ensure that these are valid-- they're merely parsed)! 401 while (parsed_response_code_current < parsed_response_code_end) { 402 if (*parsed_response_code_current < '0' || 403 *parsed_response_code_current > '9') { 404 *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT; 405 return false; 406 } 407 size_t status_code_x_10 = headers->parsed_response_code_ * 10; 408 uint8 c = *parsed_response_code_current - '0'; 409 if ((headers->parsed_response_code_ > kMaxDiv10) || 410 (std::numeric_limits<size_t>::max() - status_code_x_10) < c) { 411 // overflow. 412 *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT; 413 return false; 414 } 415 headers->parsed_response_code_ = status_code_x_10 + c; 416 ++parsed_response_code_current; 417 } 418 } 419 } 420 return true; 421 } 422 423 // begin - beginning of the firstline 424 // end - end of the firstline 425 // 426 // A precondition for this function is that there is non-whitespace between 427 // [begin, end). If this precondition is not met, the function will not perform 428 // as expected (and bad things may happen, and it will eat your first, second, 429 // and third unborn children!). 430 // 431 // Another precondition for this function is that [begin, end) includes 432 // at most one newline, which must be at the end of the line. 433 void BalsaFrame::ProcessFirstLine(const char* begin, const char* end) { 434 BalsaFrameEnums::ErrorCode previous_error = last_error_; 435 if (!ParseHTTPFirstLine(begin, 436 end, 437 is_request_, 438 max_request_uri_length_, 439 headers_, 440 &last_error_)) { 441 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 442 visitor_->HandleHeaderError(this); 443 return; 444 } 445 if (previous_error != last_error_) { 446 visitor_->HandleHeaderWarning(this); 447 } 448 449 if (is_request_) { 450 size_t version_length = 451 headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_; 452 visitor_->ProcessRequestFirstLine( 453 begin + headers_->non_whitespace_1_idx_, 454 headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_, 455 begin + headers_->non_whitespace_1_idx_, 456 headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_, 457 begin + headers_->non_whitespace_2_idx_, 458 headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_, 459 begin + headers_->non_whitespace_3_idx_, 460 version_length); 461 if (version_length == 0) 462 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 463 } else { 464 visitor_->ProcessResponseFirstLine( 465 begin + headers_->non_whitespace_1_idx_, 466 headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_, 467 begin + headers_->non_whitespace_1_idx_, 468 headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_, 469 begin + headers_->non_whitespace_2_idx_, 470 headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_, 471 begin + headers_->non_whitespace_3_idx_, 472 headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_); 473 } 474 } 475 476 // 'stream_begin' points to the first character of the headers buffer. 477 // 'line_begin' points to the first character of the line. 478 // 'current' points to a char which is ':'. 479 // 'line_end' points to the position of '\n' + 1. 480 // 'line_begin' points to the position of first character of line. 481 void BalsaFrame::CleanUpKeyValueWhitespace( 482 const char* stream_begin, 483 const char* line_begin, 484 const char* current, 485 const char* line_end, 486 HeaderLineDescription* current_header_line) { 487 const char* colon_loc = current; 488 DCHECK_LT(colon_loc, line_end); 489 DCHECK_EQ(':', *colon_loc); 490 DCHECK_EQ(':', *current); 491 DCHECK_GE(' ', *line_end) 492 << "\"" << std::string(line_begin, line_end) << "\""; 493 494 // TODO(fenix): Investigate whether or not the bounds tests in the 495 // while loops here are redundant, and if so, remove them. 496 --current; 497 while (current > line_begin && *current <= ' ') --current; 498 current += (current != colon_loc); 499 current_header_line->key_end_idx = current - stream_begin; 500 501 current = colon_loc; 502 DCHECK_EQ(':', *current); 503 ++current; 504 while (current < line_end && *current <= ' ') ++current; 505 current_header_line->value_begin_idx = current - stream_begin; 506 507 DCHECK_GE(current_header_line->key_end_idx, 508 current_header_line->first_char_idx); 509 DCHECK_GE(current_header_line->value_begin_idx, 510 current_header_line->key_end_idx); 511 DCHECK_GE(current_header_line->last_char_idx, 512 current_header_line->value_begin_idx); 513 } 514 515 inline void BalsaFrame::FindColonsAndParseIntoKeyValue() { 516 DCHECK(!lines_.empty()); 517 const char* stream_begin = headers_->OriginalHeaderStreamBegin(); 518 // The last line is always just a newline (and is uninteresting). 519 const Lines::size_type lines_size_m1 = lines_.size() - 1; 520 #if __SSE2__ 521 const __m128i colons = _mm_set1_epi8(':'); 522 const char* header_lines_end_m16 = headers_->OriginalHeaderStreamEnd() - 16; 523 #endif // __SSE2__ 524 const char* current = stream_begin + lines_[1].first; 525 // This code is a bit more subtle than it may appear at first glance. 526 // This code looks for a colon in the current line... but it also looks 527 // beyond the current line. If there is no colon in the current line, then 528 // for each subsequent line (until the colon which -has- been found is 529 // associated with a line), no searching for a colon will be performed. In 530 // this way, we minimize the amount of bytes we have scanned for a colon. 531 for (Lines::size_type i = 1; i < lines_size_m1;) { 532 const char* line_begin = stream_begin + lines_[i].first; 533 534 // Here we handle possible continuations. Note that we do not replace 535 // the '\n' in the line before a continuation (at least, as of now), 536 // which implies that any code which looks for a value must deal with 537 // "\r\n", etc -within- the line (and not just at the end of it). 538 for (++i; i < lines_size_m1; ++i) { 539 const char c = *(stream_begin + lines_[i].first); 540 if (c > ' ') { 541 // Not a continuation, so stop. Note that if the 'original' i = 1, 542 // and the next line is not a continuation, we'll end up with i = 2 543 // when we break. This handles the incrementing of i for the outer 544 // loop. 545 break; 546 } 547 } 548 const char* line_end = stream_begin + lines_[i - 1].second; 549 DCHECK_LT(line_begin - stream_begin, line_end - stream_begin); 550 551 // We cleanup the whitespace at the end of the line before doing anything 552 // else of interest as it allows us to do nothing when irregularly formatted 553 // headers are parsed (e.g. those with only keys, only values, or no colon). 554 // 555 // We're guaranteed to have *line_end > ' ' while line_end >= line_begin. 556 --line_end; 557 DCHECK_EQ('\n', *line_end) 558 << "\"" << std::string(line_begin, line_end) << "\""; 559 while (*line_end <= ' ' && line_end > line_begin) { 560 --line_end; 561 } 562 ++line_end; 563 DCHECK_GE(' ', *line_end); 564 DCHECK_LT(line_begin, line_end); 565 566 // We use '0' for the block idx, because we're always writing to the first 567 // block from the framer (we do this because the framer requires that the 568 // entire header sequence be in a contiguous buffer). 569 headers_->header_lines_.push_back( 570 HeaderLineDescription(line_begin - stream_begin, 571 line_end - stream_begin, 572 line_end - stream_begin, 573 line_end - stream_begin, 574 0)); 575 if (current >= line_end) { 576 last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON; 577 visitor_->HandleHeaderWarning(this); 578 // Then the next colon will not be found within this header line-- time 579 // to try again with another header-line. 580 continue; 581 } else if (current < line_begin) { 582 // When this condition is true, the last detected colon was part of a 583 // previous line. We reset to the beginning of the line as we don't care 584 // about the presence of any colon before the beginning of the current 585 // line. 586 current = line_begin; 587 } 588 #if __SSE2__ 589 while (current < header_lines_end_m16) { 590 __m128i header_bytes = 591 _mm_loadu_si128(reinterpret_cast<const __m128i *>(current)); 592 __m128i colon_cmp = _mm_cmpeq_epi8(header_bytes, colons); 593 int colon_msk = _mm_movemask_epi8(colon_cmp); 594 if (colon_msk == 0) { 595 current += 16; 596 continue; 597 } 598 current += (ffs(colon_msk) - 1); 599 if (current > line_end) { 600 break; 601 } 602 goto found_colon; 603 } 604 #endif // __SSE2__ 605 for (; current < line_end; ++current) { 606 if (*current != ':') { 607 continue; 608 } 609 goto found_colon; 610 } 611 // If we've gotten to here, then there was no colon 612 // in the line. The arguments we passed into the construction 613 // for the HeaderLineDescription object should be OK-- it assumes 614 // that the entire content is 'key' by default (which is true, as 615 // there was no colon, there can be no value). Note that this is a 616 // construct which is technically not allowed by the spec. 617 last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON; 618 visitor_->HandleHeaderWarning(this); 619 continue; 620 found_colon: 621 DCHECK_EQ(*current, ':'); 622 DCHECK_LE(current - stream_begin, line_end - stream_begin); 623 DCHECK_LE(stream_begin - stream_begin, current - stream_begin); 624 625 HeaderLineDescription& current_header_line = headers_->header_lines_.back(); 626 current_header_line.key_end_idx = current - stream_begin; 627 current_header_line.value_begin_idx = current_header_line.key_end_idx; 628 if (current < line_end) { 629 ++current_header_line.key_end_idx; 630 631 CleanUpKeyValueWhitespace(stream_begin, 632 line_begin, 633 current, 634 line_end, 635 ¤t_header_line); 636 } 637 } 638 } 639 640 void BalsaFrame::ProcessContentLengthLine( 641 HeaderLines::size_type line_idx, 642 BalsaHeadersEnums::ContentLengthStatus* status, 643 size_t* length) { 644 const HeaderLineDescription& header_line = headers_->header_lines_[line_idx]; 645 const char* stream_begin = headers_->OriginalHeaderStreamBegin(); 646 const char* line_end = stream_begin + header_line.last_char_idx; 647 const char* value_begin = (stream_begin + header_line.value_begin_idx); 648 649 if (value_begin >= line_end) { 650 // There is no non-whitespace value data. 651 #if DEBUGFRAMER 652 LOG(INFO) << "invalid content-length -- no non-whitespace value data"; 653 #endif 654 *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH; 655 return; 656 } 657 658 *length = 0; 659 while (value_begin < line_end) { 660 if (*value_begin < '0' || *value_begin > '9') { 661 // bad! content-length found, and couldn't parse all of it! 662 *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH; 663 #if DEBUGFRAMER 664 LOG(INFO) << "invalid content-length - non numeric character detected"; 665 #endif // DEBUGFRAMER 666 return; 667 } 668 const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10; 669 size_t length_x_10 = *length * 10; 670 const unsigned char c = *value_begin - '0'; 671 if (*length > kMaxDiv10 || 672 (std::numeric_limits<size_t>::max() - length_x_10) < c) { 673 *status = BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW; 674 #if DEBUGFRAMER 675 LOG(INFO) << "content-length overflow"; 676 #endif // DEBUGFRAMER 677 return; 678 } 679 *length = length_x_10 + c; 680 ++value_begin; 681 } 682 #if DEBUGFRAMER 683 LOG(INFO) << "content_length parsed: " << *length; 684 #endif // DEBUGFRAMER 685 *status = BalsaHeadersEnums::VALID_CONTENT_LENGTH; 686 } 687 688 void BalsaFrame::ProcessTransferEncodingLine(HeaderLines::size_type line_idx) { 689 const HeaderLineDescription& header_line = headers_->header_lines_[line_idx]; 690 const char* stream_begin = headers_->OriginalHeaderStreamBegin(); 691 const char* line_end = stream_begin + header_line.last_char_idx; 692 const char* value_begin = stream_begin + header_line.value_begin_idx; 693 size_t value_length = line_end - value_begin; 694 695 if ((value_length == 7) && 696 !strncasecmp(value_begin, "chunked", 7)) { 697 headers_->transfer_encoding_is_chunked_ = true; 698 } else if ((value_length == 8) && 699 !strncasecmp(value_begin, "identity", 8)) { 700 headers_->transfer_encoding_is_chunked_ = false; 701 } else { 702 last_error_ = BalsaFrameEnums::UNKNOWN_TRANSFER_ENCODING; 703 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 704 visitor_->HandleHeaderError(this); 705 return; 706 } 707 } 708 709 namespace { 710 bool SplitStringPiece(base::StringPiece original, char delim, 711 base::StringPiece* before, base::StringPiece* after) { 712 const char* p = original.data(); 713 const char* end = p + original.size(); 714 715 while (p != end) { 716 if (*p == delim) { 717 ++p; 718 } else { 719 const char* start = p; 720 while (++p != end && *p != delim) { 721 // Skip to the next occurence of the delimiter. 722 } 723 *before = base::StringPiece(start, p - start); 724 if (p != end) 725 *after = base::StringPiece(p + 1, end - (p + 1)); 726 else 727 *after = base::StringPiece(""); 728 StringPieceUtils::RemoveWhitespaceContext(before); 729 StringPieceUtils::RemoveWhitespaceContext(after); 730 return true; 731 } 732 } 733 734 *before = original; 735 *after = ""; 736 return false; 737 } 738 739 // TODO(phython): Fix this function to properly deal with quoted values. 740 // E.g. ";;foo", "\";;\"", or \"aa; 741 // The last example, the semi-colon is a separator between extensions. 742 void ProcessChunkExtensionsManual(base::StringPiece all_extensions, 743 BalsaHeaders* extensions) { 744 base::StringPiece extension; 745 base::StringPiece remaining; 746 StringPieceUtils::RemoveWhitespaceContext(&all_extensions); 747 SplitStringPiece(all_extensions, ';', &extension, &remaining); 748 while (!extension.empty()) { 749 base::StringPiece key; 750 base::StringPiece value; 751 SplitStringPiece(extension, '=', &key, &value); 752 if (!value.empty()) { 753 // Strip quotation marks if they exist. 754 if (!value.empty() && value[0] == '"') 755 value.remove_prefix(1); 756 if (!value.empty() && value[value.length() - 1] == '"') 757 value.remove_suffix(1); 758 } 759 760 extensions->AppendHeader(key, value); 761 762 StringPieceUtils::RemoveWhitespaceContext(&remaining); 763 SplitStringPiece(remaining, ';', &extension, &remaining); 764 } 765 } 766 767 } // anonymous namespace 768 769 void BalsaFrame::ProcessChunkExtensions(const char* input, size_t size, 770 BalsaHeaders* extensions) { 771 ProcessChunkExtensionsManual(base::StringPiece(input, size), extensions); 772 } 773 774 void BalsaFrame::ProcessHeaderLines() { 775 HeaderLines::size_type content_length_idx = 0; 776 HeaderLines::size_type transfer_encoding_idx = 0; 777 778 DCHECK(!lines_.empty()); 779 #if DEBUGFRAMER 780 LOG(INFO) << "******@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@**********\n"; 781 #endif // DEBUGFRAMER 782 783 // There is no need to attempt to process headers if no header lines exist. 784 // There are at least two lines in the message which are not header lines. 785 // These two non-header lines are the first line of the message, and the 786 // last line of the message (which is an empty line). 787 // Thus, we test to see if we have more than two lines total before attempting 788 // to parse any header lines. 789 if (lines_.size() > 2) { 790 const char* stream_begin = headers_->OriginalHeaderStreamBegin(); 791 792 // Then, for the rest of the header data, we parse these into key-value 793 // pairs. 794 FindColonsAndParseIntoKeyValue(); 795 // At this point, we've parsed all of the headers. Time to look for those 796 // headers which we require for framing. 797 const HeaderLines::size_type 798 header_lines_size = headers_->header_lines_.size(); 799 for (HeaderLines::size_type i = 0; i < header_lines_size; ++i) { 800 const HeaderLineDescription& current_header_line = 801 headers_->header_lines_[i]; 802 const char* key_begin = 803 (stream_begin + current_header_line.first_char_idx); 804 const char* key_end = (stream_begin + current_header_line.key_end_idx); 805 const size_t key_len = key_end - key_begin; 806 const char c = *key_begin; 807 #if DEBUGFRAMER 808 LOG(INFO) << "[" << i << "]: " << std::string(key_begin, key_len) 809 << " c: '" << c << "' key_len: " << key_len; 810 #endif // DEBUGFRAMER 811 // If a header begins with either lowercase or uppercase 'c' or 't', then 812 // the header may be one of content-length, connection, content-encoding 813 // or transfer-encoding. These headers are special, as they change the way 814 // that the message is framed, and so the framer is required to search 815 // for them. 816 817 818 if (c == 'c' || c == 'C') { 819 if ((key_len == kContentLengthSize) && 820 0 == strncasecmp(key_begin, kContentLength, kContentLengthSize)) { 821 BalsaHeadersEnums::ContentLengthStatus content_length_status = 822 BalsaHeadersEnums::NO_CONTENT_LENGTH; 823 size_t length = 0; 824 ProcessContentLengthLine(i, &content_length_status, &length); 825 if (content_length_idx != 0) { // then we've already seen one! 826 if ((headers_->content_length_status_ != content_length_status) || 827 ((headers_->content_length_status_ == 828 BalsaHeadersEnums::VALID_CONTENT_LENGTH) && 829 length != headers_->content_length_)) { 830 last_error_ = BalsaFrameEnums::MULTIPLE_CONTENT_LENGTH_KEYS; 831 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 832 visitor_->HandleHeaderError(this); 833 return; 834 } 835 continue; 836 } else { 837 content_length_idx = i + 1; 838 headers_->content_length_status_ = content_length_status; 839 headers_->content_length_ = length; 840 content_length_remaining_ = length; 841 } 842 843 } 844 } else if (c == 't' || c == 'T') { 845 if ((key_len == kTransferEncodingSize) && 846 0 == strncasecmp(key_begin, kTransferEncoding, 847 kTransferEncodingSize)) { 848 if (transfer_encoding_idx != 0) { 849 last_error_ = BalsaFrameEnums::MULTIPLE_TRANSFER_ENCODING_KEYS; 850 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 851 visitor_->HandleHeaderError(this); 852 return; 853 } 854 transfer_encoding_idx = i + 1; 855 } 856 } else if (i == 0 && (key_len == 0 || c == ' ')) { 857 last_error_ = BalsaFrameEnums::INVALID_HEADER_FORMAT; 858 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 859 visitor_->HandleHeaderError(this); 860 return; 861 } 862 } 863 if (headers_->transfer_encoding_is_chunked_) { 864 headers_->content_length_ = 0; 865 headers_->content_length_status_ = BalsaHeadersEnums::NO_CONTENT_LENGTH; 866 content_length_remaining_ = 0; 867 } 868 if (transfer_encoding_idx != 0) { 869 ProcessTransferEncodingLine(transfer_encoding_idx - 1); 870 } 871 } 872 } 873 874 void BalsaFrame::AssignParseStateAfterHeadersHaveBeenParsed() { 875 // For responses, can't have a body if the request was a HEAD, or if it is 876 // one of these response-codes. rfc2616 section 4.3 877 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 878 if (is_request_ || 879 !(request_was_head_ || 880 (headers_->parsed_response_code_ >= 100 && 881 headers_->parsed_response_code_ < 200) || 882 (headers_->parsed_response_code_ == 204) || 883 (headers_->parsed_response_code_ == 304))) { 884 // Then we can have a body. 885 if (headers_->transfer_encoding_is_chunked_) { 886 // Note that 887 // if ( Transfer-Encoding: chunked && Content-length: ) 888 // then Transfer-Encoding: chunked trumps. 889 // This is as specified in the spec. 890 // rfc2616 section 4.4.3 891 parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH; 892 } else { 893 // Errors parsing content-length definitely can cause 894 // protocol errors/warnings 895 switch (headers_->content_length_status_) { 896 // If we have a content-length, and it is parsed 897 // properly, there are two options. 898 // 1) zero content, in which case the message is done, and 899 // 2) nonzero content, in which case we have to 900 // consume the body. 901 case BalsaHeadersEnums::VALID_CONTENT_LENGTH: 902 if (headers_->content_length_ == 0) { 903 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 904 } else { 905 parse_state_ = BalsaFrameEnums::READING_CONTENT; 906 } 907 break; 908 case BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW: 909 case BalsaHeadersEnums::INVALID_CONTENT_LENGTH: 910 // If there were characters left-over after parsing the 911 // content length, we should flag an error and stop. 912 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 913 last_error_ = BalsaFrameEnums::UNPARSABLE_CONTENT_LENGTH; 914 visitor_->HandleHeaderError(this); 915 break; 916 // We can have: no transfer-encoding, no content length, and no 917 // connection: close... 918 // Unfortunately, this case doesn't seem to be covered in the spec. 919 // We'll assume that the safest thing to do here is what the google 920 // binaries before 2008 already do, which is to assume that 921 // everything until the connection is closed is body. 922 case BalsaHeadersEnums::NO_CONTENT_LENGTH: 923 if (is_request_) { 924 base::StringPiece method = headers_->request_method(); 925 // POSTs and PUTs should have a detectable body length. If they 926 // do not we consider it an error. 927 if ((method.size() == 4 && 928 strncmp(method.data(), "POST", 4) == 0) || 929 (method.size() == 3 && 930 strncmp(method.data(), "PUT", 3) == 0)) { 931 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 932 last_error_ = 933 BalsaFrameEnums::REQUIRED_BODY_BUT_NO_CONTENT_LENGTH; 934 visitor_->HandleHeaderError(this); 935 break; 936 } 937 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 938 } else { 939 parse_state_ = BalsaFrameEnums::READING_UNTIL_CLOSE; 940 last_error_ = BalsaFrameEnums::MAYBE_BODY_BUT_NO_CONTENT_LENGTH; 941 visitor_->HandleHeaderWarning(this); 942 } 943 break; 944 // The COV_NF_... statements here provide hints to the apparatus 945 // which computes coverage reports/ratios that this code is never 946 // intended to be executed, and should technically be impossible. 947 // COV_NF_START 948 default: 949 LOG(FATAL) << "Saw a content_length_status: " 950 << headers_->content_length_status_ << " which is unknown."; 951 // COV_NF_END 952 } 953 } 954 } 955 } 956 957 size_t BalsaFrame::ProcessHeaders(const char* message_start, 958 size_t message_length) { 959 const char* const original_message_start = message_start; 960 const char* const message_end = message_start + message_length; 961 const char* message_current = message_start; 962 const char* checkpoint = message_start; 963 964 if (message_length == 0) { 965 goto bottom; 966 } 967 968 while (message_current < message_end) { 969 size_t base_idx = headers_->GetReadableBytesFromHeaderStream(); 970 971 // Yes, we could use strchr (assuming null termination), or 972 // memchr, but as it turns out that is slower than this tight loop 973 // for the input that we see. 974 if (!saw_non_newline_char_) { 975 do { 976 const char c = *message_current; 977 if (c != '\r' && c != '\n') { 978 if (c <= ' ') { 979 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 980 last_error_ = BalsaFrameEnums::NO_REQUEST_LINE_IN_REQUEST; 981 visitor_->HandleHeaderError(this); 982 goto bottom; 983 } else { 984 saw_non_newline_char_ = true; 985 checkpoint = message_start = message_current; 986 goto read_real_message; 987 } 988 } 989 ++message_current; 990 } while (message_current < message_end); 991 goto bottom; // this is necessary to skip 'last_char_was_slash_r' checks 992 } else { 993 read_real_message: 994 // Note that SSE2 can be enabled on certain piii platforms. 995 #if __SSE2__ 996 { 997 const char* const message_end_m16 = message_end - 16; 998 __m128i newlines = _mm_set1_epi8('\n'); 999 while (message_current < message_end_m16) { 1000 // What this does (using compiler intrinsics): 1001 // 1002 // Load 16 '\n's into an xmm register 1003 // Load 16 bytes of currennt message into an xmm register 1004 // Do byte-wise equals on those two xmm registers 1005 // Take the first bit of each byte, and put that into the first 1006 // 16 bits of a mask 1007 // If the mask is zero, no '\n' found. increment by 16 and try again 1008 // Else scan forward to find the first set bit. 1009 // Increment current by the index of the first set bit 1010 // (ffs returns index of first set bit + 1) 1011 __m128i msg_bytes = 1012 _mm_loadu_si128(const_cast<__m128i *>( 1013 reinterpret_cast<const __m128i *>(message_current))); 1014 __m128i newline_cmp = _mm_cmpeq_epi8(msg_bytes, newlines); 1015 int newline_msk = _mm_movemask_epi8(newline_cmp); 1016 if (newline_msk == 0) { 1017 message_current += 16; 1018 continue; 1019 } 1020 message_current += (ffs(newline_msk) - 1); 1021 const size_t relative_idx = message_current - message_start; 1022 const size_t message_current_idx = 1 + base_idx + relative_idx; 1023 lines_.push_back(std::make_pair(last_slash_n_idx_, 1024 message_current_idx)); 1025 if (lines_.size() == 1) { 1026 headers_->WriteFromFramer(checkpoint, 1027 1 + message_current - checkpoint); 1028 checkpoint = message_current + 1; 1029 const char* begin = headers_->OriginalHeaderStreamBegin(); 1030 #if DEBUGFRAMER 1031 LOG(INFO) << "First line " << std::string(begin, lines_[0].second); 1032 LOG(INFO) << "is_request_: " << is_request_; 1033 #endif 1034 ProcessFirstLine(begin, begin + lines_[0].second); 1035 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) 1036 goto process_lines; 1037 else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) 1038 goto bottom; 1039 } 1040 const size_t chars_since_last_slash_n = (message_current_idx - 1041 last_slash_n_idx_); 1042 last_slash_n_idx_ = message_current_idx; 1043 if (chars_since_last_slash_n > 2) { 1044 // We have a slash-n, but the last slash n was 1045 // more than 2 characters away from this. Thus, we know 1046 // that this cannot be an end-of-header. 1047 ++message_current; 1048 continue; 1049 } 1050 if ((chars_since_last_slash_n == 1) || 1051 (((message_current > message_start) && 1052 (*(message_current - 1) == '\r')) || 1053 (last_char_was_slash_r_))) { 1054 goto process_lines; 1055 } 1056 ++message_current; 1057 } 1058 } 1059 #endif // __SSE2__ 1060 while (message_current < message_end) { 1061 if (*message_current != '\n') { 1062 ++message_current; 1063 continue; 1064 } 1065 const size_t relative_idx = message_current - message_start; 1066 const size_t message_current_idx = 1 + base_idx + relative_idx; 1067 lines_.push_back(std::make_pair(last_slash_n_idx_, 1068 message_current_idx)); 1069 if (lines_.size() == 1) { 1070 headers_->WriteFromFramer(checkpoint, 1071 1 + message_current - checkpoint); 1072 checkpoint = message_current + 1; 1073 const char* begin = headers_->OriginalHeaderStreamBegin(); 1074 #if DEBUGFRAMER 1075 LOG(INFO) << "First line " << std::string(begin, lines_[0].second); 1076 LOG(INFO) << "is_request_: " << is_request_; 1077 #endif 1078 ProcessFirstLine(begin, begin + lines_[0].second); 1079 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) 1080 goto process_lines; 1081 else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) 1082 goto bottom; 1083 } 1084 const size_t chars_since_last_slash_n = (message_current_idx - 1085 last_slash_n_idx_); 1086 last_slash_n_idx_ = message_current_idx; 1087 if (chars_since_last_slash_n > 2) { 1088 // false positive. 1089 ++message_current; 1090 continue; 1091 } 1092 if ((chars_since_last_slash_n == 1) || 1093 (((message_current > message_start) && 1094 (*(message_current - 1) == '\r')) || 1095 (last_char_was_slash_r_))) { 1096 goto process_lines; 1097 } 1098 ++message_current; 1099 } 1100 } 1101 continue; 1102 process_lines: 1103 ++message_current; 1104 DCHECK(message_current >= message_start); 1105 if (message_current > message_start) { 1106 headers_->WriteFromFramer(checkpoint, message_current - checkpoint); 1107 } 1108 1109 // Check if we have exceeded maximum headers length 1110 // Although we check for this limit before and after we call this function 1111 // we check it here as well to make sure that in case the visitor changed 1112 // the max_header_length_ (for example after processing the first line) 1113 // we handle it gracefully. 1114 if (headers_->GetReadableBytesFromHeaderStream() > max_header_length_) { 1115 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1116 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; 1117 visitor_->HandleHeaderError(this); 1118 goto bottom; 1119 } 1120 1121 // Since we know that we won't be writing any more bytes of the header, 1122 // we tell that to the headers object. The headers object may make 1123 // more efficient allocation decisions when this is signaled. 1124 headers_->DoneWritingFromFramer(); 1125 { 1126 const char* readable_ptr = NULL; 1127 size_t readable_size = 0; 1128 headers_->GetReadablePtrFromHeaderStream(&readable_ptr, &readable_size); 1129 visitor_->ProcessHeaderInput(readable_ptr, readable_size); 1130 } 1131 1132 // Ok, now that we've written everything into our header buffer, it is 1133 // time to process the header lines (extract proper values for headers 1134 // which are important for framing). 1135 ProcessHeaderLines(); 1136 if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) { 1137 goto bottom; 1138 } 1139 AssignParseStateAfterHeadersHaveBeenParsed(); 1140 if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) { 1141 goto bottom; 1142 } 1143 visitor_->ProcessHeaders(*headers_); 1144 visitor_->HeaderDone(); 1145 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) { 1146 visitor_->MessageDone(); 1147 } 1148 goto bottom; 1149 } 1150 // If we've gotten to here, it means that we've consumed all of the 1151 // available input. We need to record whether or not the last character we 1152 // saw was a '\r' so that a subsequent call to ProcessInput correctly finds 1153 // a header framing that is split across the two calls. 1154 last_char_was_slash_r_ = (*(message_end - 1) == '\r'); 1155 DCHECK(message_current >= message_start); 1156 if (message_current > message_start) { 1157 headers_->WriteFromFramer(checkpoint, message_current - checkpoint); 1158 } 1159 bottom: 1160 return message_current - original_message_start; 1161 } 1162 1163 1164 size_t BalsaFrame::BytesSafeToSplice() const { 1165 switch (parse_state_) { 1166 case BalsaFrameEnums::READING_CHUNK_DATA: 1167 return chunk_length_remaining_; 1168 case BalsaFrameEnums::READING_UNTIL_CLOSE: 1169 return std::numeric_limits<size_t>::max(); 1170 case BalsaFrameEnums::READING_CONTENT: 1171 return content_length_remaining_; 1172 default: 1173 return 0; 1174 } 1175 } 1176 1177 void BalsaFrame::BytesSpliced(size_t bytes_spliced) { 1178 switch (parse_state_) { 1179 case BalsaFrameEnums::READING_CHUNK_DATA: 1180 if (chunk_length_remaining_ >= bytes_spliced) { 1181 chunk_length_remaining_ -= bytes_spliced; 1182 if (chunk_length_remaining_ == 0) { 1183 parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM; 1184 } 1185 return; 1186 } else { 1187 last_error_ = 1188 BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT; 1189 goto error_exit; 1190 } 1191 1192 case BalsaFrameEnums::READING_UNTIL_CLOSE: 1193 return; 1194 1195 case BalsaFrameEnums::READING_CONTENT: 1196 if (content_length_remaining_ >= bytes_spliced) { 1197 content_length_remaining_ -= bytes_spliced; 1198 if (content_length_remaining_ == 0) { 1199 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1200 visitor_->MessageDone(); 1201 } 1202 return; 1203 } else { 1204 last_error_ = 1205 BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT; 1206 goto error_exit; 1207 } 1208 1209 default: 1210 last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO; 1211 goto error_exit; 1212 } 1213 1214 error_exit: 1215 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1216 visitor_->HandleBodyError(this); 1217 }; 1218 1219 // You may note that the state-machine contained within this function has both 1220 // switch and goto labels for nearly the same thing. For instance, the 1221 // following two labels refer to the same code block: 1222 // label_reading_chunk_data: 1223 // case BalsaFrameEnums::READING_CHUNK_DATA: 1224 // The 'case' statement is required for the switch statement which occurs when 1225 // ProcessInput is invoked. The goto label is required as the state-machine 1226 // does not use a computed goto in any subsequent operations. 1227 // 1228 // Since several states exit the state machine for various reasons, there is 1229 // also one label at the bottom of the function. When it is appropriate to 1230 // return from the function, that part of the state machine instead issues a 1231 // goto bottom; This results in less code duplication, and makes debugging 1232 // easier (as you can add a statement to a section of code which is guaranteed 1233 // to be invoked when the function is exiting. 1234 size_t BalsaFrame::ProcessInput(const char* input, size_t size) { 1235 const char* current = input; 1236 const char* on_entry = current; 1237 const char* end = current + size; 1238 #if DEBUGFRAMER 1239 LOG(INFO) << "\n==============" 1240 << BalsaFrameEnums::ParseStateToString(parse_state_) 1241 << "===============\n"; 1242 #endif // DEBUGFRAMER 1243 1244 DCHECK(headers_ != NULL); 1245 if (headers_ == NULL) return 0; 1246 1247 if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) { 1248 const size_t header_length = headers_->GetReadableBytesFromHeaderStream(); 1249 // Yes, we still have to check this here as the user can change the 1250 // max_header_length amount! 1251 // Also it is possible that we have reached the maximum allowed header size, 1252 // and we have more to consume (remember we are still inside 1253 // READING_HEADER_AND_FIRSTLINE) in which case we directly declare an error. 1254 if (header_length > max_header_length_ || 1255 (header_length == max_header_length_ && size > 0)) { 1256 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1257 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; 1258 visitor_->HandleHeaderError(this); 1259 goto bottom; 1260 } 1261 size_t bytes_to_process = max_header_length_ - header_length; 1262 if (bytes_to_process > size) { 1263 bytes_to_process = size; 1264 } 1265 current += ProcessHeaders(input, bytes_to_process); 1266 // If we are still reading headers check if we have crossed the headers 1267 // limit. Note that we check for >= as opposed to >. This is because if 1268 // header_length_after equals max_header_length_ and we are still in the 1269 // parse_state_ BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE we know for 1270 // sure that the headers limit will be crossed later on 1271 if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) { 1272 // Note that headers_ is valid only if we are still reading headers. 1273 const size_t header_length_after = 1274 headers_->GetReadableBytesFromHeaderStream(); 1275 if (header_length_after >= max_header_length_) { 1276 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1277 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; 1278 visitor_->HandleHeaderError(this); 1279 } 1280 } 1281 goto bottom; 1282 } else if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ || 1283 parse_state_ == BalsaFrameEnums::PARSE_ERROR) { 1284 // Can do nothing more 'till we're reset. 1285 goto bottom; 1286 } 1287 1288 while (current < end) { 1289 switch (parse_state_) { 1290 label_reading_chunk_length: 1291 case BalsaFrameEnums::READING_CHUNK_LENGTH: 1292 // In this state we read the chunk length. 1293 // Note that once we hit a character which is not in: 1294 // [0-9;A-Fa-f\n], we transition to a different state. 1295 // 1296 { 1297 // If we used strtol, etc, we'd have to buffer this line. 1298 // This is more annoying than simply doing the conversion 1299 // here. This code accounts for overflow. 1300 static const signed char buf[] = { 1301 // %0 %1 %2 %3 %4 %5 %6 %7 %8 \t \n %b %c \r %e %f 1302 -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -1, -1, 1303 // %10 %11 %12 %13 %14 %15 %16 %17 %18 %19 %1a %1b %1c %1d %1e %1f 1304 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1305 // ' ' %21 %22 %23 %24 %25 %26 %27 %28 %29 %2a %2b %2c %2d %2e %2f 1306 -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1307 // %30 %31 %32 %33 %34 %35 %36 %37 %38 %39 %3a ';' %3c %3d %3e %3f 1308 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -2, -1, -1, -1, -1, 1309 // %40 'A' 'B' 'C' 'D' 'E' 'F' %47 %48 %49 %4a %4b %4c %4d %4e %4f 1310 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1311 // %50 %51 %52 %53 %54 %55 %56 %57 %58 %59 %5a %5b %5c %5d %5e %5f 1312 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1313 // %60 'a' 'b' 'c' 'd' 'e' 'f' %67 %68 %69 %6a %6b %6c %6d %6e %6f 1314 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1315 // %70 %71 %72 %73 %74 %75 %76 %77 %78 %79 %7a %7b %7c %7d %7e %7f 1316 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1317 }; 1318 // valid cases: 1319 // "09123\n" // -> 09123 1320 // "09123\r\n" // -> 09123 1321 // "09123 \n" // -> 09123 1322 // "09123 \r\n" // -> 09123 1323 // "09123 12312\n" // -> 09123 1324 // "09123 12312\r\n" // -> 09123 1325 // "09123; foo=bar\n" // -> 09123 1326 // "09123; foo=bar\r\n" // -> 09123 1327 // "FFFFFFFFFFFFFFFF\r\n" // -> FFFFFFFFFFFFFFFF 1328 // "FFFFFFFFFFFFFFFF 22\r\n" // -> FFFFFFFFFFFFFFFF 1329 // invalid cases: 1330 // "[ \t]+[^\n]*\n" 1331 // "FFFFFFFFFFFFFFFFF\r\n" (would overflow) 1332 // "\r\n" 1333 // "\n" 1334 while (current < end) { 1335 const char c = *current; 1336 ++current; 1337 const signed char addition = buf[static_cast<int>(c)]; 1338 if (addition >= 0) { 1339 chunk_length_character_extracted_ = true; 1340 size_t length_x_16 = chunk_length_remaining_ * 16; 1341 const size_t kMaxDiv16 = std::numeric_limits<size_t>::max() / 16; 1342 if ((chunk_length_remaining_ > kMaxDiv16) || 1343 ((std::numeric_limits<size_t>::max() - length_x_16) < 1344 static_cast<size_t>(addition))) { 1345 // overflow -- asked for a chunk-length greater than 2^64 - 1!! 1346 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1347 last_error_ = BalsaFrameEnums::CHUNK_LENGTH_OVERFLOW; 1348 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1349 visitor_->HandleChunkingError(this); 1350 goto bottom; 1351 } 1352 chunk_length_remaining_ = length_x_16 + addition; 1353 continue; 1354 } 1355 1356 if (!chunk_length_character_extracted_ || addition == -1) { 1357 // ^[0-9;A-Fa-f][ \t\n] -- was not matched, either because no 1358 // characters were converted, or an unexpected character was 1359 // seen. 1360 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1361 last_error_ = BalsaFrameEnums::INVALID_CHUNK_LENGTH; 1362 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1363 visitor_->HandleChunkingError(this); 1364 goto bottom; 1365 } 1366 1367 --current; 1368 parse_state_ = BalsaFrameEnums::READING_CHUNK_EXTENSION; 1369 visitor_->ProcessChunkLength(chunk_length_remaining_); 1370 goto label_reading_chunk_extension; 1371 } 1372 } 1373 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1374 goto bottom; // case BalsaFrameEnums::READING_CHUNK_LENGTH 1375 1376 label_reading_chunk_extension: 1377 case BalsaFrameEnums::READING_CHUNK_EXTENSION: 1378 { 1379 // TODO(phython): Convert this scanning to be 16 bytes at a time if 1380 // there is data to be read. 1381 const char* extensions_start = current; 1382 size_t extensions_length = 0; 1383 while (current < end) { 1384 const char c = *current; 1385 if (c == '\r' || c == '\n') { 1386 extensions_length = 1387 (extensions_start == current) ? 1388 0 : 1389 current - extensions_start - 1; 1390 } 1391 1392 ++current; 1393 if (c == '\n') { 1394 chunk_length_character_extracted_ = false; 1395 visitor_->ProcessChunkExtensions( 1396 extensions_start, extensions_length); 1397 if (chunk_length_remaining_ != 0) { 1398 parse_state_ = BalsaFrameEnums::READING_CHUNK_DATA; 1399 goto label_reading_chunk_data; 1400 } 1401 HeaderFramingFound('\n'); 1402 parse_state_ = BalsaFrameEnums::READING_LAST_CHUNK_TERM; 1403 goto label_reading_last_chunk_term; 1404 } 1405 } 1406 visitor_->ProcessChunkExtensions( 1407 extensions_start, extensions_length); 1408 } 1409 1410 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1411 goto bottom; // case BalsaFrameEnums::READING_CHUNK_EXTENSION 1412 1413 label_reading_chunk_data: 1414 case BalsaFrameEnums::READING_CHUNK_DATA: 1415 while (current < end) { 1416 if (chunk_length_remaining_ == 0) { 1417 break; 1418 } 1419 // read in the chunk 1420 size_t bytes_remaining = end - current; 1421 size_t consumed_bytes = (chunk_length_remaining_ < bytes_remaining) ? 1422 chunk_length_remaining_ : bytes_remaining; 1423 const char* tmp_current = current + consumed_bytes; 1424 visitor_->ProcessBodyInput(on_entry, tmp_current - on_entry); 1425 visitor_->ProcessBodyData(current, consumed_bytes); 1426 on_entry = current = tmp_current; 1427 chunk_length_remaining_ -= consumed_bytes; 1428 } 1429 if (chunk_length_remaining_ == 0) { 1430 parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM; 1431 goto label_reading_chunk_term; 1432 } 1433 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1434 goto bottom; // case BalsaFrameEnums::READING_CHUNK_DATA 1435 1436 label_reading_chunk_term: 1437 case BalsaFrameEnums::READING_CHUNK_TERM: 1438 while (current < end) { 1439 const char c = *current; 1440 ++current; 1441 1442 if (c == '\n') { 1443 parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH; 1444 goto label_reading_chunk_length; 1445 } 1446 } 1447 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1448 goto bottom; // case BalsaFrameEnums::READING_CHUNK_TERM 1449 1450 label_reading_last_chunk_term: 1451 case BalsaFrameEnums::READING_LAST_CHUNK_TERM: 1452 while (current < end) { 1453 const char c = *current; 1454 1455 if (!HeaderFramingFound(c)) { 1456 // If not, however, since the spec only suggests that the 1457 // client SHOULD indicate the presence of trailers, we get to 1458 // *test* that they did or didn't. 1459 // If all of the bytes we've seen since: 1460 // OPTIONAL_WS 0 OPTIONAL_STUFF CRLF 1461 // are either '\r', or '\n', then we can assume that we don't yet 1462 // know if we need to parse headers, or if the next byte will make 1463 // the HeaderFramingFound condition (above) true. 1464 if (HeaderFramingMayBeFound()) { 1465 // If true, then we have seen only characters '\r' or '\n'. 1466 ++current; 1467 1468 // Lets try again! There is no state change here. 1469 continue; 1470 } else { 1471 // If (!HeaderFramingMayBeFound()), then we know that we must be 1472 // reading the first non CRLF character of a trailer. 1473 parse_state_ = BalsaFrameEnums::READING_TRAILER; 1474 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1475 on_entry = current; 1476 goto label_reading_trailer; 1477 } 1478 } else { 1479 // If we've found a "\r\n\r\n", then the message 1480 // is done. 1481 ++current; 1482 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1483 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1484 visitor_->MessageDone(); 1485 goto bottom; 1486 } 1487 break; // from while loop 1488 } 1489 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1490 goto bottom; // case BalsaFrameEnums::READING_LAST_CHUNK_TERM 1491 1492 label_reading_trailer: 1493 case BalsaFrameEnums::READING_TRAILER: 1494 while (current < end) { 1495 const char c = *current; 1496 ++current; 1497 // TODO(fenix): If we ever care about trailers as part of framing, 1498 // deal with them here (see below for part of the 'solution') 1499 // if (LineFramingFound(c)) { 1500 // trailer_lines_.push_back(make_pair(start_of_line_, 1501 // trailer_length_ - 1)); 1502 // start_of_line_ = trailer_length_; 1503 // } 1504 if (HeaderFramingFound(c)) { 1505 // ProcessTrailers(visitor_, &trailers_); 1506 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1507 visitor_->ProcessTrailerInput(on_entry, current - on_entry); 1508 visitor_->MessageDone(); 1509 goto bottom; 1510 } 1511 } 1512 visitor_->ProcessTrailerInput(on_entry, current - on_entry); 1513 break; // case BalsaFrameEnums::READING_TRAILER 1514 1515 // Note that there is no label: 1516 // 'label_reading_until_close' 1517 // here. This is because the state-machine exists immediately after 1518 // reading the headers instead of transitioning here (as it would 1519 // do if it was consuming all the data it could, all the time). 1520 case BalsaFrameEnums::READING_UNTIL_CLOSE: 1521 { 1522 const size_t bytes_remaining = end - current; 1523 if (bytes_remaining > 0) { 1524 visitor_->ProcessBodyInput(current, bytes_remaining); 1525 visitor_->ProcessBodyData(current, bytes_remaining); 1526 current += bytes_remaining; 1527 } 1528 } 1529 goto bottom; // case BalsaFrameEnums::READING_UNTIL_CLOSE 1530 1531 // label_reading_content: 1532 case BalsaFrameEnums::READING_CONTENT: 1533 #if DEBUGFRAMER 1534 LOG(INFO) << "ReadingContent: " << content_length_remaining_; 1535 #endif // DEBUGFRAMER 1536 while (content_length_remaining_ && current < end) { 1537 // read in the content 1538 const size_t bytes_remaining = end - current; 1539 const size_t consumed_bytes = 1540 (content_length_remaining_ < bytes_remaining) ? 1541 content_length_remaining_ : bytes_remaining; 1542 visitor_->ProcessBodyInput(current, consumed_bytes); 1543 visitor_->ProcessBodyData(current, consumed_bytes); 1544 current += consumed_bytes; 1545 content_length_remaining_ -= consumed_bytes; 1546 } 1547 if (content_length_remaining_ == 0) { 1548 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1549 visitor_->MessageDone(); 1550 } 1551 goto bottom; // case BalsaFrameEnums::READING_CONTENT 1552 1553 default: 1554 // The state-machine should never be in a state that isn't handled 1555 // above. This is a glaring logic error, and we should do something 1556 // drastic to ensure that this gets looked-at and fixed. 1557 LOG(FATAL) << "Unknown state: " << parse_state_ // COV_NF_LINE 1558 << " memory corruption?!"; // COV_NF_LINE 1559 } 1560 } 1561 bottom: 1562 #if DEBUGFRAMER 1563 LOG(INFO) << "\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n" 1564 << std::string(input, current) 1565 << "\n$$$$$$$$$$$$$$" 1566 << BalsaFrameEnums::ParseStateToString(parse_state_) 1567 << "$$$$$$$$$$$$$$$" 1568 << " consumed: " << (current - input); 1569 if (Error()) { 1570 LOG(INFO) << BalsaFrameEnums::ErrorCodeToString(ErrorCode()); 1571 } 1572 #endif // DEBUGFRAMER 1573 return current - input; 1574 } 1575 1576 } // namespace net 1577