// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "net/tools/balsa/balsa_frame.h"

#include <assert.h>
#if __SSE2__
#include <emmintrin.h>
#endif  // __SSE2__

#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "base/logging.h"
#include "base/port.h"
#include "base/strings/string_piece.h"
#include "net/tools/balsa/balsa_enums.h"
#include "net/tools/balsa/balsa_headers.h"
#include "net/tools/balsa/balsa_visitor_interface.h"
#include "net/tools/balsa/buffer_interface.h"
#include "net/tools/balsa/simple_buffer.h"
#include "net/tools/balsa/split.h"
#include "net/tools/balsa/string_piece_utils.h"

#if defined(COMPILER_MSVC)
#include <string.h>
#define strncasecmp _strnicmp
#else
#include <strings.h>
#endif

namespace net {

// Constants holding some header names for headers which can affect the way the
// HTTP message is framed, and so must be processed specially:
// (The *Size constants are the name lengths without the trailing NUL.)
static const char kContentLength[] = "content-length";
static const size_t kContentLengthSize = sizeof(kContentLength) - 1;
static const char kTransferEncoding[] = "transfer-encoding";
static const size_t kTransferEncodingSize = sizeof(kTransferEncoding) - 1;

// Constructs a framer in its initial state: it parses requests, is waiting
// for the header and first line, has no error recorded, has no headers object
// attached (headers_ is NULL), and sends visitor callbacks to
// do_nothing_visitor_. The default limits are 16 * 1024 for
// max_header_length_ and 2048 for max_request_uri_length_.
BalsaFrame::BalsaFrame()
    : last_char_was_slash_r_(false),
      saw_non_newline_char_(false),
      start_was_space_(true),
      chunk_length_character_extracted_(false),
      is_request_(true),
      request_was_head_(false),
      max_header_length_(16 * 1024),
      max_request_uri_length_(2048),
      visitor_(&do_nothing_visitor_),
      chunk_length_remaining_(0),
      content_length_remaining_(0),
      last_slash_n_loc_(NULL),
      last_recorded_slash_n_loc_(NULL),
      last_slash_n_idx_(0),
      term_chars_(0),
      parse_state_(BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE),
      last_error_(BalsaFrameEnums::NO_ERROR),
      headers_(NULL) {
}

// The destructor body is empty.
BalsaFrame::~BalsaFrame() {}

// Prepares the framer for the next message: the per-message parsing state is
// put back to the values the constructor assigns, lines_ is emptied, and the
// attached headers object (if any) is cleared.
//
// Configuration is deliberately left alone; the commented-out assignments
// below list the members that survive a Reset() together with their
// constructor defaults.
void BalsaFrame::Reset() {
  last_char_was_slash_r_ = false;
  saw_non_newline_char_ = false;
  start_was_space_ = true;
  chunk_length_character_extracted_ = false;
  // is_request_ = true;               // not reset between messages.
  // request_was_head_ = false;        // not reset between messages.
  // max_header_length_ = 16 * 1024;   // not reset between messages.
  // max_request_uri_length_ = 2048;   // not reset between messages.
  // visitor_ = &do_nothing_visitor_;  // not reset between messages.
  chunk_length_remaining_ = 0;
  content_length_remaining_ = 0;
  last_slash_n_loc_ = NULL;
  last_recorded_slash_n_loc_ = NULL;
  last_slash_n_idx_ = 0;
  term_chars_ = 0;
  parse_state_ = BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE;
  last_error_ = BalsaFrameEnums::NO_ERROR;
  lines_.clear();
  if (headers_ != NULL) {
    headers_->Clear();
  }
}

// Returns the enumerator name of the given parse state as a string literal.
// NUM_STATES, and any value that is not one of the listed enumerators, yields
// "UNKNOWN_STATE".
//
// Note: despite its name, |error_code| is a ParseState, not an ErrorCode.
const char* BalsaFrameEnums::ParseStateToString(
    BalsaFrameEnums::ParseState error_code) {
  switch (error_code) {
    case PARSE_ERROR:
      return "PARSE_ERROR";
    case READING_HEADER_AND_FIRSTLINE:
      return "READING_HEADER_AND_FIRSTLINE";
    case READING_CHUNK_LENGTH:
      return "READING_CHUNK_LENGTH";
    case READING_CHUNK_EXTENSION:
      return "READING_CHUNK_EXTENSION";
    case READING_CHUNK_DATA:
      return "READING_CHUNK_DATA";
    case READING_CHUNK_TERM:
      return "READING_CHUNK_TERM";
    case READING_LAST_CHUNK_TERM:
      return "READING_LAST_CHUNK_TERM";
    case READING_TRAILER:
      return "READING_TRAILER";
    case READING_UNTIL_CLOSE:
      return "READING_UNTIL_CLOSE";
    case READING_CONTENT:
      return "READING_CONTENT";
    case MESSAGE_FULLY_READ:
      return "MESSAGE_FULLY_READ";
    case NUM_STATES:
      return "UNKNOWN_STATE";
  }
  // Reached only for values outside the enumerators handled above.
  return "UNKNOWN_STATE";
}

// Returns the enumerator name of the given error code as a string literal.
// NUM_ERROR_CODES, and any value that is not one of the listed enumerators,
// yields "UNKNOWN_ERROR".
const char* BalsaFrameEnums::ErrorCodeToString(
    BalsaFrameEnums::ErrorCode error_code) {
  switch (error_code) {
    case NO_ERROR:
      return "NO_ERROR";
    case NO_STATUS_LINE_IN_RESPONSE:
      return "NO_STATUS_LINE_IN_RESPONSE";
    case NO_REQUEST_LINE_IN_REQUEST:
      return "NO_REQUEST_LINE_IN_REQUEST";
    case FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION:
      return "FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION";
    case FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD:
      return "FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD";
    case FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE:
      return "FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE";
    case FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI:
      return "FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI";
    case FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE:
      return "FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE";
    case FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION:
      return "FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION";
    case FAILED_CONVERTING_STATUS_CODE_TO_INT:
      return "FAILED_CONVERTING_STATUS_CODE_TO_INT";
    case REQUEST_URI_TOO_LONG:
      return "REQUEST_URI_TOO_LONG";
    case HEADERS_TOO_LONG:
      return "HEADERS_TOO_LONG";
    case UNPARSABLE_CONTENT_LENGTH:
      return "UNPARSABLE_CONTENT_LENGTH";
    case MAYBE_BODY_BUT_NO_CONTENT_LENGTH:
      return "MAYBE_BODY_BUT_NO_CONTENT_LENGTH";
    case REQUIRED_BODY_BUT_NO_CONTENT_LENGTH:
      return "REQUIRED_BODY_BUT_NO_CONTENT_LENGTH";
    case HEADER_MISSING_COLON:
      return "HEADER_MISSING_COLON";
    case INVALID_CHUNK_LENGTH:
      return "INVALID_CHUNK_LENGTH";
    case CHUNK_LENGTH_OVERFLOW:
      return "CHUNK_LENGTH_OVERFLOW";
    case CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO:
      return "CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO";
    case CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT:
      return "CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT";
    case MULTIPLE_CONTENT_LENGTH_KEYS:
      return "MULTIPLE_CONTENT_LENGTH_KEYS";
    case MULTIPLE_TRANSFER_ENCODING_KEYS:
      return "MULTIPLE_TRANSFER_ENCODING_KEYS";
    case UNKNOWN_TRANSFER_ENCODING:
      return "UNKNOWN_TRANSFER_ENCODING";
    case INVALID_HEADER_FORMAT:
      return "INVALID_HEADER_FORMAT";
    case INTERNAL_LOGIC_ERROR:
      return "INTERNAL_LOGIC_ERROR";
    case NUM_ERROR_CODES:
      return "UNKNOWN_ERROR";
  }
  // Reached only for values outside the enumerators handled above.
  return "UNKNOWN_ERROR";
}

// Summary:
//   Parses the first line of either a request or response.
//   Note that in the case of a detected warning, error_code will be set
//   but the function will not return false.
//   Exactly zero or one warning or error (but not both) may be detected
//   by this function.
//   Note that this function will not write the data of the first-line
//   into the header's buffer (that should already have been done elsewhere).
//
// Pre-conditions:
//   begin != end
//   *begin should be a character which is > ' '. This implies that there
//   is at least one non-whitespace character between [begin, end).
//   headers is a valid pointer to a BalsaHeaders class.
//   error_code is a valid pointer to a BalsaFrameEnums::ErrorCode value.
//   Entire first line must exist between [begin, end)
//   Exactly zero or one newlines -may- exist between [begin, end)
//   [begin, end) should exist in the header's buffer.
//
// Side-effects:
//   headers will be modified
//   error_code may be modified if either a warning or error is detected
//
// Returns:
//   True if no error (as opposed to warning) is detected.
//   False if an error (as opposed to warning) is detected.

//
// If there is indeed non-whitespace in the line, then the following
// will take care of this for you:
//  while (*begin <= ' ') ++begin;
//  ParseHTTPFirstLine(begin, end, is_request, max_request_uri_length,
//                     &headers, &error_code);
//
// In short: ParseHTTPFirstLine records, as offsets from |begin|, where the
// three fields of the first line start and stop (the whitespace_N_idx_ and
// non_whitespace_N_idx_ members of |headers|). For a request it rejects a
// request-URI longer than |max_request_uri_length|; for a response it
// converts the status-code field to a number and stores it in
// headers->parsed_response_code_.
bool ParseHTTPFirstLine(const char* begin,
                        const char* end,
                        bool is_request,
                        size_t max_request_uri_length,
                        BalsaHeaders* headers,
                        BalsaFrameEnums::ErrorCode* error_code) {
  const char* current = begin;
  // HTTP firstlines all have the following structure:
  //  LWS         NONWS  LWS    NONWS   LWS    NONWS   NOTCRLF  CRLF
  //  [\t \r\n]+ [^\t ]+ [\t ]+ [^\t ]+ [\t ]+ [^\t ]+ [^\r\n]+ "\r\n"
  //  ws1        nws1    ws2    nws2    ws3    nws3             ws4
  //  |          [-------)      [-------)      [----------------)
  //    REQ:     method         request_uri    version
  //   RESP:     version        statuscode     reason
  //
  // The first NONWS->LWS component we'll call firstline_a.
  // The second firstline_b, and the third firstline_c.
  //
  // firstline_a goes from nws1 to (but not including) ws2
  // firstline_b goes from nws2 to (but not including) ws3
  // firstline_c goes from nws3 to (but not including) ws4
  //
  // In the code:
  //    ws1 == whitespace_1_idx_
  //   nws1 == non_whitespace_1_idx_
  //    ws2 == whitespace_2_idx_
  //   nws2 == non_whitespace_2_idx_
  //    ws3 == whitespace_3_idx_
  //   nws3 == non_whitespace_3_idx_
  //    ws4 == whitespace_4_idx_

  // Kill all whitespace (including '\r\n') at the end of the line.
  // |end| is one past the last character, so step back onto the character
  // that is required to be the terminating '\n'.
  --end;
  if (*end != '\n') {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }
  // Walk backwards over every character <= ' ' (never moving before |begin|).
  while (begin < end && *end <= ' ') {
    --end;
  }
  // If |end| still sits on '\n' the loop stopped at |begin| without finding
  // a character > ' ', which the pre-conditions rule out.
  DCHECK(*end != '\n');
  if (*end == '\n') {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }
  // Make |end| exclusive again: one past the last non-whitespace character.
  ++end;

  // The two following statements should not be possible.
  if (end == begin) {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }

  // whitespace_1_idx_
  headers->whitespace_1_idx_ = current - begin;
  // This loop is commented out as it is never used in current code.  This is
  // true only because we don't begin parsing the headers at all until we've
  // encountered a non whitespace character at the beginning of the stream, at
  // which point we begin our demarcation of header-start.  If we did -not- do
  // this (for instance, only looked for [\r\n] instead of (< ' ')), this loop
  // would be necessary for the proper functioning of this parsing.
  // This is left here as this function may (in the future) be refactored out
  // of the BalsaFrame class so that it may be shared between code in
  // BalsaFrame and BalsaHeaders (where it would be used in some variant of the
  // set_first_line() function (at which point it would be necessary).
#if 0
  while (*current <= ' ') {
    ++current;
  }
#endif
  // non_whitespace_1_idx_
  headers->non_whitespace_1_idx_ = current - begin;
  do {
    // The first time through, we're guaranteed that the current character
    // won't be a whitespace (else the loop above wouldn't have terminated).
    // That implies that we're guaranteed to get at least one non-whitespace
    // character if we get into this loop at all.
    ++current;
    if (current == end) {
      // The line consists of a single field: every remaining index collapses
      // onto the end of the line so the later fields have zero length.
      headers->whitespace_2_idx_ = current - begin;
      headers->non_whitespace_2_idx_ = current - begin;
      headers->whitespace_3_idx_ = current - begin;
      headers->non_whitespace_3_idx_ = current - begin;
      headers->whitespace_4_idx_ = current - begin;
      // FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD   for request
      // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION for response
      // (Adding the bool selects between the two; this assumes the request
      // enumerator is declared immediately after the response one.)
      *error_code =
        static_cast<BalsaFrameEnums::ErrorCode>(
            BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION +
            is_request);
      if (!is_request) {  // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION
        return false;
      }
      goto output_exhausted;
    }
  } while (*current > ' ');
  // whitespace_2_idx_
  headers->whitespace_2_idx_ = current - begin;
  do {
    ++current;
    // Note that due to the loop which consumes all of the whitespace
    // at the end of the line, current can never == end while in this function.
  } while (*current <= ' ');
  // non_whitespace_2_idx_
  headers->non_whitespace_2_idx_ = current - begin;
  do {
    ++current;
    if (current == end) {
      // Only two fields are present: the third field is recorded as empty.
      headers->whitespace_3_idx_ = current - begin;
      headers->non_whitespace_3_idx_ = current - begin;
      headers->whitespace_4_idx_ = current - begin;
      // FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI for request
      // FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE for response
      // This is reported as a warning only: unlike the single-field case
      // above, the function does not return false here for either message
      // type.
      *error_code =
        static_cast<BalsaFrameEnums::ErrorCode>(
            BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE
            + is_request);
      goto output_exhausted;
    }
  } while (*current > ' ');
  // whitespace_3_idx_
  headers->whitespace_3_idx_ = current - begin;
  do {
    ++current;
    // Note that due to the loop which consumes all of the whitespace
    // at the end of the line, current can never == end while in this function.
  } while (*current <= ' ');
  // non_whitespace_3_idx_
  headers->non_whitespace_3_idx_ = current - begin;
  // The third field runs to the (whitespace-trimmed) end of the line, so it
  // may itself contain spaces, e.g. a multi-word reason phrase.
  headers->whitespace_4_idx_ = end - begin;

 output_exhausted:
  // Note that we don't fail the parse immediately when parsing of the
  // firstline fails.  Depending on the protocol type, we may want to accept
  // a firstline with only one or two elements, e.g., for HTTP/0.9:
  //   GET\r\n
  // or
  //   GET /\r\n
  // should be parsed without issue (though the visitor should know that
  // parsing the entire line was not exactly as it should be).
  //
  // Eventually, these errors may be removed alltogether, as the visitor can
  // detect them on its own by examining the size of the various fields.
  // headers->set_first_line(non_whitespace_1_idx_, current);

  if (is_request) {
    // The second field of a request is the request-URI; enforce its limit.
    if ((headers->whitespace_3_idx_ - headers->non_whitespace_2_idx_) >
        max_request_uri_length) {
      // For requests, we need at least the method.  We could assume that a
      // blank URI means "/".  If version isn't stated, it should be assumed
      // to be HTTP/0.9 by the visitor.
      *error_code = BalsaFrameEnums::REQUEST_URI_TOO_LONG;
      return false;
    }
  } else {
    // The second field of a response is the status code. An empty field
    // leaves parsed_response_code_ at 0.
    headers->parsed_response_code_ = 0;
    {
      const char* parsed_response_code_current =
        begin + headers->non_whitespace_2_idx_;
      const char* parsed_response_code_end = begin + headers->whitespace_3_idx_;
      const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;

      // Convert a string of [0-9]* into an int.
      // Note that this allows for the conversion of response codes which
      // are outside the bounds of normal HTTP response codes (no checking
      // is done to ensure that these are valid-- they're merely parsed)!
      while (parsed_response_code_current < parsed_response_code_end) {
        if (*parsed_response_code_current < '0' ||
            *parsed_response_code_current > '9') {
          *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
          return false;
        }
        size_t status_code_x_10 = headers->parsed_response_code_ * 10;
        uint8 c = *parsed_response_code_current - '0';
        // Reject the digit if either the multiplication by ten or the
        // addition of the digit would exceed the size_t range.
        if ((headers->parsed_response_code_ > kMaxDiv10) ||
            (std::numeric_limits<size_t>::max() - status_code_x_10) < c) {
          // overflow.
          *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
          return false;
        }
        headers->parsed_response_code_ = status_code_x_10 + c;
        ++parsed_response_code_current;
      }
    }
  }
  return true;
}

// begin - beginning of the firstline
// end - end of the firstline
//
// A precondition for this function is that there is non-whitespace between
// [begin, end). If this precondition is not met, the function will not perform
// as expected (and bad things may happen, and it will eat your first, second,
// and third unborn children!).
//
// Another precondition for this function is that [begin, end) includes
// at most one newline, which must be at the end of the line.
418 void BalsaFrame::ProcessFirstLine(const char* begin, const char* end) { 419 BalsaFrameEnums::ErrorCode previous_error = last_error_; 420 if (!ParseHTTPFirstLine(begin, 421 end, 422 is_request_, 423 max_request_uri_length_, 424 headers_, 425 &last_error_)) { 426 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 427 visitor_->HandleHeaderError(this); 428 return; 429 } 430 if (previous_error != last_error_) { 431 visitor_->HandleHeaderWarning(this); 432 } 433 434 if (is_request_) { 435 size_t version_length = 436 headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_; 437 visitor_->ProcessRequestFirstLine( 438 begin + headers_->non_whitespace_1_idx_, 439 headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_, 440 begin + headers_->non_whitespace_1_idx_, 441 headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_, 442 begin + headers_->non_whitespace_2_idx_, 443 headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_, 444 begin + headers_->non_whitespace_3_idx_, 445 version_length); 446 if (version_length == 0) 447 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 448 } else { 449 visitor_->ProcessResponseFirstLine( 450 begin + headers_->non_whitespace_1_idx_, 451 headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_, 452 begin + headers_->non_whitespace_1_idx_, 453 headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_, 454 begin + headers_->non_whitespace_2_idx_, 455 headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_, 456 begin + headers_->non_whitespace_3_idx_, 457 headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_); 458 } 459 } 460 461 // 'stream_begin' points to the first character of the headers buffer. 462 // 'line_begin' points to the first character of the line. 463 // 'current' points to a char which is ':'. 464 // 'line_end' points to the position of '\n' + 1. 465 // 'line_begin' points to the position of first character of line. 
466 void BalsaFrame::CleanUpKeyValueWhitespace( 467 const char* stream_begin, 468 const char* line_begin, 469 const char* current, 470 const char* line_end, 471 HeaderLineDescription* current_header_line) { 472 const char* colon_loc = current; 473 DCHECK_LT(colon_loc, line_end); 474 DCHECK_EQ(':', *colon_loc); 475 DCHECK_EQ(':', *current); 476 DCHECK_GE(' ', *line_end) 477 << "\"" << std::string(line_begin, line_end) << "\""; 478 479 // TODO(fenix): Investigate whether or not the bounds tests in the 480 // while loops here are redundant, and if so, remove them. 481 --current; 482 while (current > line_begin && *current <= ' ') --current; 483 current += (current != colon_loc); 484 current_header_line->key_end_idx = current - stream_begin; 485 486 current = colon_loc; 487 DCHECK_EQ(':', *current); 488 ++current; 489 while (current < line_end && *current <= ' ') ++current; 490 current_header_line->value_begin_idx = current - stream_begin; 491 492 DCHECK_GE(current_header_line->key_end_idx, 493 current_header_line->first_char_idx); 494 DCHECK_GE(current_header_line->value_begin_idx, 495 current_header_line->key_end_idx); 496 DCHECK_GE(current_header_line->last_char_idx, 497 current_header_line->value_begin_idx); 498 } 499 500 inline void BalsaFrame::FindColonsAndParseIntoKeyValue() { 501 DCHECK(!lines_.empty()); 502 const char* stream_begin = headers_->OriginalHeaderStreamBegin(); 503 // The last line is always just a newline (and is uninteresting). 504 const Lines::size_type lines_size_m1 = lines_.size() - 1; 505 #if __SSE2__ 506 const __v16qi colons = { ':', ':', ':', ':', ':', ':', ':', ':', 507 ':', ':', ':', ':', ':', ':', ':', ':'}; 508 const char* header_lines_end_m16 = headers_->OriginalHeaderStreamEnd() - 16; 509 #endif // __SSE2__ 510 const char* current = stream_begin + lines_[1].first; 511 // This code is a bit more subtle than it may appear at first glance. 512 // This code looks for a colon in the current line... 
but it also looks 513 // beyond the current line. If there is no colon in the current line, then 514 // for each subsequent line (until the colon which -has- been found is 515 // associated with a line), no searching for a colon will be performed. In 516 // this way, we minimize the amount of bytes we have scanned for a colon. 517 for (Lines::size_type i = 1; i < lines_size_m1;) { 518 const char* line_begin = stream_begin + lines_[i].first; 519 520 // Here we handle possible continuations. Note that we do not replace 521 // the '\n' in the line before a continuation (at least, as of now), 522 // which implies that any code which looks for a value must deal with 523 // "\r\n", etc -within- the line (and not just at the end of it). 524 for (++i; i < lines_size_m1; ++i) { 525 const char c = *(stream_begin + lines_[i].first); 526 if (c > ' ') { 527 // Not a continuation, so stop. Note that if the 'original' i = 1, 528 // and the next line is not a continuation, we'll end up with i = 2 529 // when we break. This handles the incrementing of i for the outer 530 // loop. 531 break; 532 } 533 } 534 const char* line_end = stream_begin + lines_[i - 1].second; 535 DCHECK_LT(line_begin - stream_begin, line_end - stream_begin); 536 537 // We cleanup the whitespace at the end of the line before doing anything 538 // else of interest as it allows us to do nothing when irregularly formatted 539 // headers are parsed (e.g. those with only keys, only values, or no colon). 540 // 541 // We're guaranteed to have *line_end > ' ' while line_end >= line_begin. 
542 --line_end; 543 DCHECK_EQ('\n', *line_end) 544 << "\"" << std::string(line_begin, line_end) << "\""; 545 while (*line_end <= ' ' && line_end > line_begin) { 546 --line_end; 547 } 548 ++line_end; 549 DCHECK_GE(' ', *line_end); 550 DCHECK_LT(line_begin, line_end); 551 552 // We use '0' for the block idx, because we're always writing to the first 553 // block from the framer (we do this because the framer requires that the 554 // entire header sequence be in a contiguous buffer). 555 headers_->header_lines_.push_back( 556 HeaderLineDescription(line_begin - stream_begin, 557 line_end - stream_begin, 558 line_end - stream_begin, 559 line_end - stream_begin, 560 0)); 561 if (current >= line_end) { 562 last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON; 563 visitor_->HandleHeaderWarning(this); 564 // Then the next colon will not be found within this header line-- time 565 // to try again with another header-line. 566 continue; 567 } else if (current < line_begin) { 568 // When this condition is true, the last detected colon was part of a 569 // previous line. We reset to the beginning of the line as we don't care 570 // about the presence of any colon before the beginning of the current 571 // line. 572 current = line_begin; 573 } 574 #if __SSE2__ 575 while (current < header_lines_end_m16) { 576 __m128i header_bytes = 577 _mm_loadu_si128(reinterpret_cast<const __m128i *>(current)); 578 __m128i colon_cmp = 579 _mm_cmpeq_epi8(header_bytes, reinterpret_cast<__m128i>(colons)); 580 int colon_msk = _mm_movemask_epi8(colon_cmp); 581 if (colon_msk == 0) { 582 current += 16; 583 continue; 584 } 585 current += (ffs(colon_msk) - 1); 586 if (current > line_end) { 587 break; 588 } 589 goto found_colon; 590 } 591 #endif // __SSE2__ 592 for (; current < line_end; ++current) { 593 if (*current != ':') { 594 continue; 595 } 596 goto found_colon; 597 } 598 // If we've gotten to here, then there was no colon 599 // in the line. 
The arguments we passed into the construction 600 // for the HeaderLineDescription object should be OK-- it assumes 601 // that the entire content is 'key' by default (which is true, as 602 // there was no colon, there can be no value). Note that this is a 603 // construct which is technically not allowed by the spec. 604 last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON; 605 visitor_->HandleHeaderWarning(this); 606 continue; 607 found_colon: 608 DCHECK_EQ(*current, ':'); 609 DCHECK_LE(current - stream_begin, line_end - stream_begin); 610 DCHECK_LE(stream_begin - stream_begin, current - stream_begin); 611 612 HeaderLineDescription& current_header_line = headers_->header_lines_.back(); 613 current_header_line.key_end_idx = current - stream_begin; 614 current_header_line.value_begin_idx = current_header_line.key_end_idx; 615 if (current < line_end) { 616 ++current_header_line.key_end_idx; 617 618 CleanUpKeyValueWhitespace(stream_begin, 619 line_begin, 620 current, 621 line_end, 622 ¤t_header_line); 623 } 624 } 625 } 626 627 void BalsaFrame::ProcessContentLengthLine( 628 HeaderLines::size_type line_idx, 629 BalsaHeadersEnums::ContentLengthStatus* status, 630 size_t* length) { 631 const HeaderLineDescription& header_line = headers_->header_lines_[line_idx]; 632 const char* stream_begin = headers_->OriginalHeaderStreamBegin(); 633 const char* line_end = stream_begin + header_line.last_char_idx; 634 const char* value_begin = (stream_begin + header_line.value_begin_idx); 635 636 if (value_begin >= line_end) { 637 // There is no non-whitespace value data. 638 #if DEBUGFRAMER 639 LOG(INFO) << "invalid content-length -- no non-whitespace value data"; 640 #endif 641 *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH; 642 return; 643 } 644 645 *length = 0; 646 while (value_begin < line_end) { 647 if (*value_begin < '0' || *value_begin > '9') { 648 // bad! content-length found, and couldn't parse all of it! 
649 *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH; 650 #if DEBUGFRAMER 651 LOG(INFO) << "invalid content-length - non numeric character detected"; 652 #endif // DEBUGFRAMER 653 return; 654 } 655 const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10; 656 size_t length_x_10 = *length * 10; 657 const unsigned char c = *value_begin - '0'; 658 if (*length > kMaxDiv10 || 659 (std::numeric_limits<size_t>::max() - length_x_10) < c) { 660 *status = BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW; 661 #if DEBUGFRAMER 662 LOG(INFO) << "content-length overflow"; 663 #endif // DEBUGFRAMER 664 return; 665 } 666 *length = length_x_10 + c; 667 ++value_begin; 668 } 669 #if DEBUGFRAMER 670 LOG(INFO) << "content_length parsed: " << *length; 671 #endif // DEBUGFRAMER 672 *status = BalsaHeadersEnums::VALID_CONTENT_LENGTH; 673 } 674 675 void BalsaFrame::ProcessTransferEncodingLine(HeaderLines::size_type line_idx) { 676 const HeaderLineDescription& header_line = headers_->header_lines_[line_idx]; 677 const char* stream_begin = headers_->OriginalHeaderStreamBegin(); 678 const char* line_end = stream_begin + header_line.last_char_idx; 679 const char* value_begin = stream_begin + header_line.value_begin_idx; 680 size_t value_length = line_end - value_begin; 681 682 if ((value_length == 7) && 683 !strncasecmp(value_begin, "chunked", 7)) { 684 headers_->transfer_encoding_is_chunked_ = true; 685 } else if ((value_length == 8) && 686 !strncasecmp(value_begin, "identity", 8)) { 687 headers_->transfer_encoding_is_chunked_ = false; 688 } else { 689 last_error_ = BalsaFrameEnums::UNKNOWN_TRANSFER_ENCODING; 690 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 691 visitor_->HandleHeaderError(this); 692 return; 693 } 694 } 695 696 namespace { 697 bool SplitStringPiece(base::StringPiece original, char delim, 698 base::StringPiece* before, base::StringPiece* after) { 699 const char* p = original.data(); 700 const char* end = p + original.size(); 701 702 while (p != end) { 703 if (*p == delim) { 
704 ++p; 705 } else { 706 const char* start = p; 707 while (++p != end && *p != delim) { 708 // Skip to the next occurence of the delimiter. 709 } 710 *before = base::StringPiece(start, p - start); 711 if (p != end) 712 *after = base::StringPiece(p + 1, end - (p + 1)); 713 else 714 *after = base::StringPiece(""); 715 StringPieceUtils::RemoveWhitespaceContext(before); 716 StringPieceUtils::RemoveWhitespaceContext(after); 717 return true; 718 } 719 } 720 721 *before = original; 722 *after = ""; 723 return false; 724 } 725 726 // TODO(phython): Fix this function to properly deal with quoted values. 727 // E.g. ";;foo", "\";;\"", or \"aa; 728 // The last example, the semi-colon is a separator between extensions. 729 void ProcessChunkExtensionsManual(base::StringPiece all_extensions, 730 BalsaHeaders* extensions) { 731 base::StringPiece extension; 732 base::StringPiece remaining; 733 StringPieceUtils::RemoveWhitespaceContext(&all_extensions); 734 SplitStringPiece(all_extensions, ';', &extension, &remaining); 735 while (!extension.empty()) { 736 base::StringPiece key; 737 base::StringPiece value; 738 SplitStringPiece(extension, '=', &key, &value); 739 if (!value.empty()) { 740 // Strip quotation marks if they exist. 
741 if (!value.empty() && value[0] == '"') 742 value.remove_prefix(1); 743 if (!value.empty() && value[value.length() - 1] == '"') 744 value.remove_suffix(1); 745 } 746 747 extensions->AppendHeader(key, value); 748 749 StringPieceUtils::RemoveWhitespaceContext(&remaining); 750 SplitStringPiece(remaining, ';', &extension, &remaining); 751 } 752 } 753 754 } // anonymous namespace 755 756 void BalsaFrame::ProcessChunkExtensions(const char* input, size_t size, 757 BalsaHeaders* extensions) { 758 ProcessChunkExtensionsManual(base::StringPiece(input, size), extensions); 759 } 760 761 void BalsaFrame::ProcessHeaderLines() { 762 HeaderLines::size_type content_length_idx = 0; 763 HeaderLines::size_type transfer_encoding_idx = 0; 764 765 DCHECK(!lines_.empty()); 766 #if DEBUGFRAMER 767 LOG(INFO) << "******@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@**********\n"; 768 #endif // DEBUGFRAMER 769 770 // There is no need to attempt to process headers if no header lines exist. 771 // There are at least two lines in the message which are not header lines. 772 // These two non-header lines are the first line of the message, and the 773 // last line of the message (which is an empty line). 774 // Thus, we test to see if we have more than two lines total before attempting 775 // to parse any header lines. 776 if (lines_.size() > 2) { 777 const char* stream_begin = headers_->OriginalHeaderStreamBegin(); 778 779 // Then, for the rest of the header data, we parse these into key-value 780 // pairs. 781 FindColonsAndParseIntoKeyValue(); 782 // At this point, we've parsed all of the headers. Time to look for those 783 // headers which we require for framing. 
784 const HeaderLines::size_type 785 header_lines_size = headers_->header_lines_.size(); 786 for (HeaderLines::size_type i = 0; i < header_lines_size; ++i) { 787 const HeaderLineDescription& current_header_line = 788 headers_->header_lines_[i]; 789 const char* key_begin = 790 (stream_begin + current_header_line.first_char_idx); 791 const char* key_end = (stream_begin + current_header_line.key_end_idx); 792 const size_t key_len = key_end - key_begin; 793 const char c = *key_begin; 794 #if DEBUGFRAMER 795 LOG(INFO) << "[" << i << "]: " << std::string(key_begin, key_len) 796 << " c: '" << c << "' key_len: " << key_len; 797 #endif // DEBUGFRAMER 798 // If a header begins with either lowercase or uppercase 'c' or 't', then 799 // the header may be one of content-length, connection, content-encoding 800 // or transfer-encoding. These headers are special, as they change the way 801 // that the message is framed, and so the framer is required to search 802 // for them. 803 804 805 if (c == 'c' || c == 'C') { 806 if ((key_len == kContentLengthSize) && 807 0 == strncasecmp(key_begin, kContentLength, kContentLengthSize)) { 808 BalsaHeadersEnums::ContentLengthStatus content_length_status = 809 BalsaHeadersEnums::NO_CONTENT_LENGTH; 810 size_t length = 0; 811 ProcessContentLengthLine(i, &content_length_status, &length); 812 if (content_length_idx != 0) { // then we've already seen one! 
813 if ((headers_->content_length_status_ != content_length_status) || 814 ((headers_->content_length_status_ == 815 BalsaHeadersEnums::VALID_CONTENT_LENGTH) && 816 length != headers_->content_length_)) { 817 last_error_ = BalsaFrameEnums::MULTIPLE_CONTENT_LENGTH_KEYS; 818 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 819 visitor_->HandleHeaderError(this); 820 return; 821 } 822 continue; 823 } else { 824 content_length_idx = i + 1; 825 headers_->content_length_status_ = content_length_status; 826 headers_->content_length_ = length; 827 content_length_remaining_ = length; 828 } 829 830 } 831 } else if (c == 't' || c == 'T') { 832 if ((key_len == kTransferEncodingSize) && 833 0 == strncasecmp(key_begin, kTransferEncoding, 834 kTransferEncodingSize)) { 835 if (transfer_encoding_idx != 0) { 836 last_error_ = BalsaFrameEnums::MULTIPLE_TRANSFER_ENCODING_KEYS; 837 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 838 visitor_->HandleHeaderError(this); 839 return; 840 } 841 transfer_encoding_idx = i + 1; 842 } 843 } else if (i == 0 && (key_len == 0 || c == ' ')) { 844 last_error_ = BalsaFrameEnums::INVALID_HEADER_FORMAT; 845 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 846 visitor_->HandleHeaderError(this); 847 return; 848 } 849 } 850 if (headers_->transfer_encoding_is_chunked_) { 851 headers_->content_length_ = 0; 852 headers_->content_length_status_ = BalsaHeadersEnums::NO_CONTENT_LENGTH; 853 content_length_remaining_ = 0; 854 } 855 if (transfer_encoding_idx != 0) { 856 ProcessTransferEncodingLine(transfer_encoding_idx - 1); 857 } 858 } 859 } 860 861 void BalsaFrame::AssignParseStateAfterHeadersHaveBeenParsed() { 862 // For responses, can't have a body if the request was a HEAD, or if it is 863 // one of these response-codes. 
rfc2616 section 4.3 864 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 865 if (is_request_ || 866 !(request_was_head_ || 867 (headers_->parsed_response_code_ >= 100 && 868 headers_->parsed_response_code_ < 200) || 869 (headers_->parsed_response_code_ == 204) || 870 (headers_->parsed_response_code_ == 304))) { 871 // Then we can have a body. 872 if (headers_->transfer_encoding_is_chunked_) { 873 // Note that 874 // if ( Transfer-Encoding: chunked && Content-length: ) 875 // then Transfer-Encoding: chunked trumps. 876 // This is as specified in the spec. 877 // rfc2616 section 4.4.3 878 parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH; 879 } else { 880 // Errors parsing content-length definitely can cause 881 // protocol errors/warnings 882 switch (headers_->content_length_status_) { 883 // If we have a content-length, and it is parsed 884 // properly, there are two options. 885 // 1) zero content, in which case the message is done, and 886 // 2) nonzero content, in which case we have to 887 // consume the body. 888 case BalsaHeadersEnums::VALID_CONTENT_LENGTH: 889 if (headers_->content_length_ == 0) { 890 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 891 } else { 892 parse_state_ = BalsaFrameEnums::READING_CONTENT; 893 } 894 break; 895 case BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW: 896 case BalsaHeadersEnums::INVALID_CONTENT_LENGTH: 897 // If there were characters left-over after parsing the 898 // content length, we should flag an error and stop. 899 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 900 last_error_ = BalsaFrameEnums::UNPARSABLE_CONTENT_LENGTH; 901 visitor_->HandleHeaderError(this); 902 break; 903 // We can have: no transfer-encoding, no content length, and no 904 // connection: close... 905 // Unfortunately, this case doesn't seem to be covered in the spec. 
906 // We'll assume that the safest thing to do here is what the google 907 // binaries before 2008 already do, which is to assume that 908 // everything until the connection is closed is body. 909 case BalsaHeadersEnums::NO_CONTENT_LENGTH: 910 if (is_request_) { 911 base::StringPiece method = headers_->request_method(); 912 // POSTs and PUTs should have a detectable body length. If they 913 // do not we consider it an error. 914 if ((method.size() == 4 && 915 strncmp(method.data(), "POST", 4) == 0) || 916 (method.size() == 3 && 917 strncmp(method.data(), "PUT", 3) == 0)) { 918 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 919 last_error_ = 920 BalsaFrameEnums::REQUIRED_BODY_BUT_NO_CONTENT_LENGTH; 921 visitor_->HandleHeaderError(this); 922 break; 923 } 924 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 925 } else { 926 parse_state_ = BalsaFrameEnums::READING_UNTIL_CLOSE; 927 last_error_ = BalsaFrameEnums::MAYBE_BODY_BUT_NO_CONTENT_LENGTH; 928 visitor_->HandleHeaderWarning(this); 929 } 930 break; 931 // The COV_NF_... statements here provide hints to the apparatus 932 // which computes coverage reports/ratios that this code is never 933 // intended to be executed, and should technically be impossible. 
934 // COV_NF_START 935 default: 936 LOG(FATAL) << "Saw a content_length_status: " 937 << headers_->content_length_status_ << " which is unknown."; 938 // COV_NF_END 939 } 940 } 941 } 942 } 943 944 size_t BalsaFrame::ProcessHeaders(const char* message_start, 945 size_t message_length) { 946 const char* const original_message_start = message_start; 947 const char* const message_end = message_start + message_length; 948 const char* message_current = message_start; 949 const char* checkpoint = message_start; 950 951 if (message_length == 0) { 952 goto bottom; 953 } 954 955 while (message_current < message_end) { 956 size_t base_idx = headers_->GetReadableBytesFromHeaderStream(); 957 958 // Yes, we could use strchr (assuming null termination), or 959 // memchr, but as it turns out that is slower than this tight loop 960 // for the input that we see. 961 if (!saw_non_newline_char_) { 962 do { 963 const char c = *message_current; 964 if (c != '\r' && c != '\n') { 965 if (c <= ' ') { 966 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 967 last_error_ = BalsaFrameEnums::NO_REQUEST_LINE_IN_REQUEST; 968 visitor_->HandleHeaderError(this); 969 goto bottom; 970 } else { 971 saw_non_newline_char_ = true; 972 checkpoint = message_start = message_current; 973 goto read_real_message; 974 } 975 } 976 ++message_current; 977 } while (message_current < message_end); 978 goto bottom; // this is necessary to skip 'last_char_was_slash_r' checks 979 } else { 980 read_real_message: 981 // Note that SSE2 can be enabled on certain piii platforms. 
982 #if __SSE2__ 983 { 984 const char* const message_end_m16 = message_end - 16; 985 __v16qi newlines = { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', 986 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' }; 987 while (message_current < message_end_m16) { 988 // What this does (using compiler intrinsics): 989 // 990 // Load 16 '\n's into an xmm register 991 // Load 16 bytes of currennt message into an xmm register 992 // Do byte-wise equals on those two xmm registers 993 // Take the first bit of each byte, and put that into the first 994 // 16 bits of a mask 995 // If the mask is zero, no '\n' found. increment by 16 and try again 996 // Else scan forward to find the first set bit. 997 // Increment current by the index of the first set bit 998 // (ffs returns index of first set bit + 1) 999 __m128i msg_bytes = 1000 _mm_loadu_si128(const_cast<__m128i *>( 1001 reinterpret_cast<const __m128i *>(message_current))); 1002 __m128i newline_cmp = 1003 _mm_cmpeq_epi8(msg_bytes, reinterpret_cast<__m128i>(newlines)); 1004 int newline_msk = _mm_movemask_epi8(newline_cmp); 1005 if (newline_msk == 0) { 1006 message_current += 16; 1007 continue; 1008 } 1009 message_current += (ffs(newline_msk) - 1); 1010 const size_t relative_idx = message_current - message_start; 1011 const size_t message_current_idx = 1 + base_idx + relative_idx; 1012 lines_.push_back(std::make_pair(last_slash_n_idx_, 1013 message_current_idx)); 1014 if (lines_.size() == 1) { 1015 headers_->WriteFromFramer(checkpoint, 1016 1 + message_current - checkpoint); 1017 checkpoint = message_current + 1; 1018 const char* begin = headers_->OriginalHeaderStreamBegin(); 1019 #if DEBUGFRAMER 1020 LOG(INFO) << "First line " << std::string(begin, lines_[0].second); 1021 LOG(INFO) << "is_request_: " << is_request_; 1022 #endif 1023 ProcessFirstLine(begin, begin + lines_[0].second); 1024 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) 1025 goto process_lines; 1026 else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) 
1027 goto bottom; 1028 } 1029 const size_t chars_since_last_slash_n = (message_current_idx - 1030 last_slash_n_idx_); 1031 last_slash_n_idx_ = message_current_idx; 1032 if (chars_since_last_slash_n > 2) { 1033 // We have a slash-n, but the last slash n was 1034 // more than 2 characters away from this. Thus, we know 1035 // that this cannot be an end-of-header. 1036 ++message_current; 1037 continue; 1038 } 1039 if ((chars_since_last_slash_n == 1) || 1040 (((message_current > message_start) && 1041 (*(message_current - 1) == '\r')) || 1042 (last_char_was_slash_r_))) { 1043 goto process_lines; 1044 } 1045 ++message_current; 1046 } 1047 } 1048 #endif // __SSE2__ 1049 while (message_current < message_end) { 1050 if (*message_current != '\n') { 1051 ++message_current; 1052 continue; 1053 } 1054 const size_t relative_idx = message_current - message_start; 1055 const size_t message_current_idx = 1 + base_idx + relative_idx; 1056 lines_.push_back(std::make_pair(last_slash_n_idx_, 1057 message_current_idx)); 1058 if (lines_.size() == 1) { 1059 headers_->WriteFromFramer(checkpoint, 1060 1 + message_current - checkpoint); 1061 checkpoint = message_current + 1; 1062 const char* begin = headers_->OriginalHeaderStreamBegin(); 1063 #if DEBUGFRAMER 1064 LOG(INFO) << "First line " << std::string(begin, lines_[0].second); 1065 LOG(INFO) << "is_request_: " << is_request_; 1066 #endif 1067 ProcessFirstLine(begin, begin + lines_[0].second); 1068 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) 1069 goto process_lines; 1070 else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) 1071 goto bottom; 1072 } 1073 const size_t chars_since_last_slash_n = (message_current_idx - 1074 last_slash_n_idx_); 1075 last_slash_n_idx_ = message_current_idx; 1076 if (chars_since_last_slash_n > 2) { 1077 // false positive. 
1078 ++message_current; 1079 continue; 1080 } 1081 if ((chars_since_last_slash_n == 1) || 1082 (((message_current > message_start) && 1083 (*(message_current - 1) == '\r')) || 1084 (last_char_was_slash_r_))) { 1085 goto process_lines; 1086 } 1087 ++message_current; 1088 } 1089 } 1090 continue; 1091 process_lines: 1092 ++message_current; 1093 DCHECK(message_current >= message_start); 1094 if (message_current > message_start) { 1095 headers_->WriteFromFramer(checkpoint, message_current - checkpoint); 1096 } 1097 1098 // Check if we have exceeded maximum headers length 1099 // Although we check for this limit before and after we call this function 1100 // we check it here as well to make sure that in case the visitor changed 1101 // the max_header_length_ (for example after processing the first line) 1102 // we handle it gracefully. 1103 if (headers_->GetReadableBytesFromHeaderStream() > max_header_length_) { 1104 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1105 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; 1106 visitor_->HandleHeaderError(this); 1107 goto bottom; 1108 } 1109 1110 // Since we know that we won't be writing any more bytes of the header, 1111 // we tell that to the headers object. The headers object may make 1112 // more efficient allocation decisions when this is signaled. 1113 headers_->DoneWritingFromFramer(); 1114 { 1115 const char* readable_ptr = NULL; 1116 size_t readable_size = 0; 1117 headers_->GetReadablePtrFromHeaderStream(&readable_ptr, &readable_size); 1118 visitor_->ProcessHeaderInput(readable_ptr, readable_size); 1119 } 1120 1121 // Ok, now that we've written everything into our header buffer, it is 1122 // time to process the header lines (extract proper values for headers 1123 // which are important for framing). 
1124 ProcessHeaderLines(); 1125 if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) { 1126 goto bottom; 1127 } 1128 AssignParseStateAfterHeadersHaveBeenParsed(); 1129 if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) { 1130 goto bottom; 1131 } 1132 visitor_->ProcessHeaders(*headers_); 1133 visitor_->HeaderDone(); 1134 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) { 1135 visitor_->MessageDone(); 1136 } 1137 goto bottom; 1138 } 1139 // If we've gotten to here, it means that we've consumed all of the 1140 // available input. We need to record whether or not the last character we 1141 // saw was a '\r' so that a subsequent call to ProcessInput correctly finds 1142 // a header framing that is split across the two calls. 1143 last_char_was_slash_r_ = (*(message_end - 1) == '\r'); 1144 DCHECK(message_current >= message_start); 1145 if (message_current > message_start) { 1146 headers_->WriteFromFramer(checkpoint, message_current - checkpoint); 1147 } 1148 bottom: 1149 return message_current - original_message_start; 1150 } 1151 1152 1153 size_t BalsaFrame::BytesSafeToSplice() const { 1154 switch (parse_state_) { 1155 case BalsaFrameEnums::READING_CHUNK_DATA: 1156 return chunk_length_remaining_; 1157 case BalsaFrameEnums::READING_UNTIL_CLOSE: 1158 return std::numeric_limits<size_t>::max(); 1159 case BalsaFrameEnums::READING_CONTENT: 1160 return content_length_remaining_; 1161 default: 1162 return 0; 1163 } 1164 } 1165 1166 void BalsaFrame::BytesSpliced(size_t bytes_spliced) { 1167 switch (parse_state_) { 1168 case BalsaFrameEnums::READING_CHUNK_DATA: 1169 if (chunk_length_remaining_ >= bytes_spliced) { 1170 chunk_length_remaining_ -= bytes_spliced; 1171 if (chunk_length_remaining_ == 0) { 1172 parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM; 1173 } 1174 return; 1175 } else { 1176 last_error_ = 1177 BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT; 1178 goto error_exit; 1179 } 1180 1181 case BalsaFrameEnums::READING_UNTIL_CLOSE: 1182 return; 
1183 1184 case BalsaFrameEnums::READING_CONTENT: 1185 if (content_length_remaining_ >= bytes_spliced) { 1186 content_length_remaining_ -= bytes_spliced; 1187 if (content_length_remaining_ == 0) { 1188 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1189 visitor_->MessageDone(); 1190 } 1191 return; 1192 } else { 1193 last_error_ = 1194 BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT; 1195 goto error_exit; 1196 } 1197 1198 default: 1199 last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO; 1200 goto error_exit; 1201 } 1202 1203 error_exit: 1204 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1205 visitor_->HandleBodyError(this); 1206 }; 1207 1208 // You may note that the state-machine contained within this function has both 1209 // switch and goto labels for nearly the same thing. For instance, the 1210 // following two labels refer to the same code block: 1211 // label_reading_chunk_data: 1212 // case BalsaFrameEnums::READING_CHUNK_DATA: 1213 // The 'case' statement is required for the switch statement which occurs when 1214 // ProcessInput is invoked. The goto label is required as the state-machine 1215 // does not use a computed goto in any subsequent operations. 1216 // 1217 // Since several states exit the state machine for various reasons, there is 1218 // also one label at the bottom of the function. When it is appropriate to 1219 // return from the function, that part of the state machine instead issues a 1220 // goto bottom; This results in less code duplication, and makes debugging 1221 // easier (as you can add a statement to a section of code which is guaranteed 1222 // to be invoked when the function is exiting. 
1223 size_t BalsaFrame::ProcessInput(const char* input, size_t size) { 1224 const char* current = input; 1225 const char* on_entry = current; 1226 const char* end = current + size; 1227 #if DEBUGFRAMER 1228 LOG(INFO) << "\n==============" 1229 << BalsaFrameEnums::ParseStateToString(parse_state_) 1230 << "===============\n"; 1231 #endif // DEBUGFRAMER 1232 1233 DCHECK(headers_ != NULL); 1234 if (headers_ == NULL) return 0; 1235 1236 if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) { 1237 const size_t header_length = headers_->GetReadableBytesFromHeaderStream(); 1238 // Yes, we still have to check this here as the user can change the 1239 // max_header_length amount! 1240 // Also it is possible that we have reached the maximum allowed header size, 1241 // and we have more to consume (remember we are still inside 1242 // READING_HEADER_AND_FIRSTLINE) in which case we directly declare an error. 1243 if (header_length > max_header_length_ || 1244 (header_length == max_header_length_ && size > 0)) { 1245 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1246 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; 1247 visitor_->HandleHeaderError(this); 1248 goto bottom; 1249 } 1250 size_t bytes_to_process = max_header_length_ - header_length; 1251 if (bytes_to_process > size) { 1252 bytes_to_process = size; 1253 } 1254 current += ProcessHeaders(input, bytes_to_process); 1255 // If we are still reading headers check if we have crossed the headers 1256 // limit. Note that we check for >= as opposed to >. This is because if 1257 // header_length_after equals max_header_length_ and we are still in the 1258 // parse_state_ BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE we know for 1259 // sure that the headers limit will be crossed later on 1260 if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) { 1261 // Note that headers_ is valid only if we are still reading headers. 
1262 const size_t header_length_after = 1263 headers_->GetReadableBytesFromHeaderStream(); 1264 if (header_length_after >= max_header_length_) { 1265 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1266 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; 1267 visitor_->HandleHeaderError(this); 1268 } 1269 } 1270 goto bottom; 1271 } else if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ || 1272 parse_state_ == BalsaFrameEnums::PARSE_ERROR) { 1273 // Can do nothing more 'till we're reset. 1274 goto bottom; 1275 } 1276 1277 while (current < end) { 1278 switch (parse_state_) { 1279 label_reading_chunk_length: 1280 case BalsaFrameEnums::READING_CHUNK_LENGTH: 1281 // In this state we read the chunk length. 1282 // Note that once we hit a character which is not in: 1283 // [0-9;A-Fa-f\n], we transition to a different state. 1284 // 1285 { 1286 // If we used strtol, etc, we'd have to buffer this line. 1287 // This is more annoying than simply doing the conversion 1288 // here. This code accounts for overflow. 
1289 static const signed char buf[] = { 1290 // %0 %1 %2 %3 %4 %5 %6 %7 %8 \t \n %b %c \r %e %f 1291 -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -1, -1, 1292 // %10 %11 %12 %13 %14 %15 %16 %17 %18 %19 %1a %1b %1c %1d %1e %1f 1293 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1294 // ' ' %21 %22 %23 %24 %25 %26 %27 %28 %29 %2a %2b %2c %2d %2e %2f 1295 -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1296 // %30 %31 %32 %33 %34 %35 %36 %37 %38 %39 %3a ';' %3c %3d %3e %3f 1297 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -2, -1, -1, -1, -1, 1298 // %40 'A' 'B' 'C' 'D' 'E' 'F' %47 %48 %49 %4a %4b %4c %4d %4e %4f 1299 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1300 // %50 %51 %52 %53 %54 %55 %56 %57 %58 %59 %5a %5b %5c %5d %5e %5f 1301 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1302 // %60 'a' 'b' 'c' 'd' 'e' 'f' %67 %68 %69 %6a %6b %6c %6d %6e %6f 1303 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1304 // %70 %71 %72 %73 %74 %75 %76 %77 %78 %79 %7a %7b %7c %7d %7e %7f 1305 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1306 }; 1307 // valid cases: 1308 // "09123\n" // -> 09123 1309 // "09123\r\n" // -> 09123 1310 // "09123 \n" // -> 09123 1311 // "09123 \r\n" // -> 09123 1312 // "09123 12312\n" // -> 09123 1313 // "09123 12312\r\n" // -> 09123 1314 // "09123; foo=bar\n" // -> 09123 1315 // "09123; foo=bar\r\n" // -> 09123 1316 // "FFFFFFFFFFFFFFFF\r\n" // -> FFFFFFFFFFFFFFFF 1317 // "FFFFFFFFFFFFFFFF 22\r\n" // -> FFFFFFFFFFFFFFFF 1318 // invalid cases: 1319 // "[ \t]+[^\n]*\n" 1320 // "FFFFFFFFFFFFFFFFF\r\n" (would overflow) 1321 // "\r\n" 1322 // "\n" 1323 while (current < end) { 1324 const char c = *current; 1325 ++current; 1326 const signed char addition = buf[static_cast<int>(c)]; 1327 if (addition >= 0) { 1328 chunk_length_character_extracted_ = true; 1329 size_t length_x_16 = chunk_length_remaining_ * 16; 1330 const size_t kMaxDiv16 = 
std::numeric_limits<size_t>::max() / 16; 1331 if ((chunk_length_remaining_ > kMaxDiv16) || 1332 ((std::numeric_limits<size_t>::max() - length_x_16) < 1333 static_cast<size_t>(addition))) { 1334 // overflow -- asked for a chunk-length greater than 2^64 - 1!! 1335 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1336 last_error_ = BalsaFrameEnums::CHUNK_LENGTH_OVERFLOW; 1337 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1338 visitor_->HandleChunkingError(this); 1339 goto bottom; 1340 } 1341 chunk_length_remaining_ = length_x_16 + addition; 1342 continue; 1343 } 1344 1345 if (!chunk_length_character_extracted_ || addition == -1) { 1346 // ^[0-9;A-Fa-f][ \t\n] -- was not matched, either because no 1347 // characters were converted, or an unexpected character was 1348 // seen. 1349 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1350 last_error_ = BalsaFrameEnums::INVALID_CHUNK_LENGTH; 1351 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1352 visitor_->HandleChunkingError(this); 1353 goto bottom; 1354 } 1355 1356 --current; 1357 parse_state_ = BalsaFrameEnums::READING_CHUNK_EXTENSION; 1358 visitor_->ProcessChunkLength(chunk_length_remaining_); 1359 goto label_reading_chunk_extension; 1360 } 1361 } 1362 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1363 goto bottom; // case BalsaFrameEnums::READING_CHUNK_LENGTH 1364 1365 label_reading_chunk_extension: 1366 case BalsaFrameEnums::READING_CHUNK_EXTENSION: 1367 { 1368 // TODO(phython): Convert this scanning to be 16 bytes at a time if 1369 // there is data to be read. 1370 const char* extensions_start = current; 1371 size_t extensions_length = 0; 1372 while (current < end) { 1373 const char c = *current; 1374 if (c == '\r' || c == '\n') { 1375 extensions_length = 1376 (extensions_start == current) ? 
1377 0 : 1378 current - extensions_start - 1; 1379 } 1380 1381 ++current; 1382 if (c == '\n') { 1383 chunk_length_character_extracted_ = false; 1384 visitor_->ProcessChunkExtensions( 1385 extensions_start, extensions_length); 1386 if (chunk_length_remaining_ != 0) { 1387 parse_state_ = BalsaFrameEnums::READING_CHUNK_DATA; 1388 goto label_reading_chunk_data; 1389 } 1390 HeaderFramingFound('\n'); 1391 parse_state_ = BalsaFrameEnums::READING_LAST_CHUNK_TERM; 1392 goto label_reading_last_chunk_term; 1393 } 1394 } 1395 visitor_->ProcessChunkExtensions( 1396 extensions_start, extensions_length); 1397 } 1398 1399 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1400 goto bottom; // case BalsaFrameEnums::READING_CHUNK_EXTENSION 1401 1402 label_reading_chunk_data: 1403 case BalsaFrameEnums::READING_CHUNK_DATA: 1404 while (current < end) { 1405 if (chunk_length_remaining_ == 0) { 1406 break; 1407 } 1408 // read in the chunk 1409 size_t bytes_remaining = end - current; 1410 size_t consumed_bytes = (chunk_length_remaining_ < bytes_remaining) ? 
1411 chunk_length_remaining_ : bytes_remaining; 1412 const char* tmp_current = current + consumed_bytes; 1413 visitor_->ProcessBodyInput(on_entry, tmp_current - on_entry); 1414 visitor_->ProcessBodyData(current, consumed_bytes); 1415 on_entry = current = tmp_current; 1416 chunk_length_remaining_ -= consumed_bytes; 1417 } 1418 if (chunk_length_remaining_ == 0) { 1419 parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM; 1420 goto label_reading_chunk_term; 1421 } 1422 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1423 goto bottom; // case BalsaFrameEnums::READING_CHUNK_DATA 1424 1425 label_reading_chunk_term: 1426 case BalsaFrameEnums::READING_CHUNK_TERM: 1427 while (current < end) { 1428 const char c = *current; 1429 ++current; 1430 1431 if (c == '\n') { 1432 parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH; 1433 goto label_reading_chunk_length; 1434 } 1435 } 1436 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1437 goto bottom; // case BalsaFrameEnums::READING_CHUNK_TERM 1438 1439 label_reading_last_chunk_term: 1440 case BalsaFrameEnums::READING_LAST_CHUNK_TERM: 1441 while (current < end) { 1442 const char c = *current; 1443 1444 if (!HeaderFramingFound(c)) { 1445 // If not, however, since the spec only suggests that the 1446 // client SHOULD indicate the presence of trailers, we get to 1447 // *test* that they did or didn't. 1448 // If all of the bytes we've seen since: 1449 // OPTIONAL_WS 0 OPTIONAL_STUFF CRLF 1450 // are either '\r', or '\n', then we can assume that we don't yet 1451 // know if we need to parse headers, or if the next byte will make 1452 // the HeaderFramingFound condition (above) true. 1453 if (HeaderFramingMayBeFound()) { 1454 // If true, then we have seen only characters '\r' or '\n'. 1455 ++current; 1456 1457 // Lets try again! There is no state change here. 1458 continue; 1459 } else { 1460 // If (!HeaderFramingMayBeFound()), then we know that we must be 1461 // reading the first non CRLF character of a trailer. 
1462 parse_state_ = BalsaFrameEnums::READING_TRAILER; 1463 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1464 on_entry = current; 1465 goto label_reading_trailer; 1466 } 1467 } else { 1468 // If we've found a "\r\n\r\n", then the message 1469 // is done. 1470 ++current; 1471 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1472 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1473 visitor_->MessageDone(); 1474 goto bottom; 1475 } 1476 break; // from while loop 1477 } 1478 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1479 goto bottom; // case BalsaFrameEnums::READING_LAST_CHUNK_TERM 1480 1481 label_reading_trailer: 1482 case BalsaFrameEnums::READING_TRAILER: 1483 while (current < end) { 1484 const char c = *current; 1485 ++current; 1486 // TODO(fenix): If we ever care about trailers as part of framing, 1487 // deal with them here (see below for part of the 'solution') 1488 // if (LineFramingFound(c)) { 1489 // trailer_lines_.push_back(make_pair(start_of_line_, 1490 // trailer_length_ - 1)); 1491 // start_of_line_ = trailer_length_; 1492 // } 1493 if (HeaderFramingFound(c)) { 1494 // ProcessTrailers(visitor_, &trailers_); 1495 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1496 visitor_->ProcessTrailerInput(on_entry, current - on_entry); 1497 visitor_->MessageDone(); 1498 goto bottom; 1499 } 1500 } 1501 visitor_->ProcessTrailerInput(on_entry, current - on_entry); 1502 break; // case BalsaFrameEnums::READING_TRAILER 1503 1504 // Note that there is no label: 1505 // 'label_reading_until_close' 1506 // here. This is because the state-machine exists immediately after 1507 // reading the headers instead of transitioning here (as it would 1508 // do if it was consuming all the data it could, all the time). 
1509 case BalsaFrameEnums::READING_UNTIL_CLOSE: 1510 { 1511 const size_t bytes_remaining = end - current; 1512 if (bytes_remaining > 0) { 1513 visitor_->ProcessBodyInput(current, bytes_remaining); 1514 visitor_->ProcessBodyData(current, bytes_remaining); 1515 current += bytes_remaining; 1516 } 1517 } 1518 goto bottom; // case BalsaFrameEnums::READING_UNTIL_CLOSE 1519 1520 // label_reading_content: 1521 case BalsaFrameEnums::READING_CONTENT: 1522 #if DEBUGFRAMER 1523 LOG(INFO) << "ReadingContent: " << content_length_remaining_; 1524 #endif // DEBUGFRAMER 1525 while (content_length_remaining_ && current < end) { 1526 // read in the content 1527 const size_t bytes_remaining = end - current; 1528 const size_t consumed_bytes = 1529 (content_length_remaining_ < bytes_remaining) ? 1530 content_length_remaining_ : bytes_remaining; 1531 visitor_->ProcessBodyInput(current, consumed_bytes); 1532 visitor_->ProcessBodyData(current, consumed_bytes); 1533 current += consumed_bytes; 1534 content_length_remaining_ -= consumed_bytes; 1535 } 1536 if (content_length_remaining_ == 0) { 1537 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1538 visitor_->MessageDone(); 1539 } 1540 goto bottom; // case BalsaFrameEnums::READING_CONTENT 1541 1542 default: 1543 // The state-machine should never be in a state that isn't handled 1544 // above. This is a glaring logic error, and we should do something 1545 // drastic to ensure that this gets looked-at and fixed. 
1546 LOG(FATAL) << "Unknown state: " << parse_state_ // COV_NF_LINE 1547 << " memory corruption?!"; // COV_NF_LINE 1548 } 1549 } 1550 bottom: 1551 #if DEBUGFRAMER 1552 LOG(INFO) << "\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n" 1553 << std::string(input, current) 1554 << "\n$$$$$$$$$$$$$$" 1555 << BalsaFrameEnums::ParseStateToString(parse_state_) 1556 << "$$$$$$$$$$$$$$$" 1557 << " consumed: " << (current - input); 1558 if (Error()) { 1559 LOG(INFO) << BalsaFrameEnums::ErrorCodeToString(ErrorCode()); 1560 } 1561 #endif // DEBUGFRAMER 1562 return current - input; 1563 } 1564 1565 } // namespace net 1566