1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "net/tools/flip_server/balsa_frame.h" 6 7 #include <assert.h> 8 #if __SSE2__ 9 #include <emmintrin.h> 10 #endif // __SSE2__ 11 #include <strings.h> 12 13 #include <limits> 14 #include <string> 15 #include <utility> 16 #include <vector> 17 18 #include "base/logging.h" 19 #include "base/port.h" 20 #include "base/string_piece.h" 21 #include "net/tools/flip_server/balsa_enums.h" 22 #include "net/tools/flip_server/balsa_headers.h" 23 #include "net/tools/flip_server/balsa_visitor_interface.h" 24 #include "net/tools/flip_server/buffer_interface.h" 25 #include "net/tools/flip_server/simple_buffer.h" 26 #include "net/tools/flip_server/split.h" 27 #include "net/tools/flip_server/string_piece_utils.h" 28 29 namespace net { 30 31 // Constants holding some header names for headers which can affect the way the 32 // HTTP message is framed, and so must be processed specially: 33 static const char kContentLength[] = "content-length"; 34 static const size_t kContentLengthSize = sizeof(kContentLength) - 1; 35 static const char kTransferEncoding[] = "transfer-encoding"; 36 static const size_t kTransferEncodingSize = sizeof(kTransferEncoding) - 1; 37 38 BalsaFrame::BalsaFrame() 39 : last_char_was_slash_r_(false), 40 saw_non_newline_char_(false), 41 start_was_space_(true), 42 chunk_length_character_extracted_(false), 43 is_request_(true), 44 request_was_head_(false), 45 max_header_length_(16 * 1024), 46 max_request_uri_length_(2048), 47 visitor_(&do_nothing_visitor_), 48 chunk_length_remaining_(0), 49 content_length_remaining_(0), 50 last_slash_n_loc_(NULL), 51 last_recorded_slash_n_loc_(NULL), 52 last_slash_n_idx_(0), 53 term_chars_(0), 54 parse_state_(BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE), 55 last_error_(BalsaFrameEnums::NO_ERROR), 56 headers_(NULL) { 57 } 58 59 BalsaFrame::~BalsaFrame() {} 60 61 void BalsaFrame::Reset() { 62 last_char_was_slash_r_ = false; 63 saw_non_newline_char_ = false; 64 start_was_space_ = true; 65 chunk_length_character_extracted_ = false; 66 // is_request_ = true; // not reset between messages. 67 // request_was_head_ = false; // not reset between messages. 68 // max_header_length_ = 4096; // not reset between messages. 69 // max_request_uri_length_ = 2048; // not reset between messages. 70 // visitor_ = &do_nothing_visitor_; // not reset between messages. 71 chunk_length_remaining_ = 0; 72 content_length_remaining_ = 0; 73 last_slash_n_loc_ = NULL; 74 last_recorded_slash_n_loc_ = NULL; 75 last_slash_n_idx_ = 0; 76 term_chars_ = 0; 77 parse_state_ = BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE; 78 last_error_ = BalsaFrameEnums::NO_ERROR; 79 lines_.clear(); 80 if (headers_ != NULL) { 81 headers_->Clear(); 82 } 83 } 84 85 const char* BalsaFrameEnums::ParseStateToString( 86 BalsaFrameEnums::ParseState error_code) { 87 switch (error_code) { 88 case ERROR: 89 return "ERROR"; 90 case READING_HEADER_AND_FIRSTLINE: 91 return "READING_HEADER_AND_FIRSTLINE"; 92 case READING_CHUNK_LENGTH: 93 return "READING_CHUNK_LENGTH"; 94 case READING_CHUNK_EXTENSION: 95 return "READING_CHUNK_EXTENSION"; 96 case READING_CHUNK_DATA: 97 return "READING_CHUNK_DATA"; 98 case READING_CHUNK_TERM: 99 return "READING_CHUNK_TERM"; 100 case READING_LAST_CHUNK_TERM: 101 return "READING_LAST_CHUNK_TERM"; 102 case READING_TRAILER: 103 return "READING_TRAILER"; 104 case READING_UNTIL_CLOSE: 105 return "READING_UNTIL_CLOSE"; 106 case READING_CONTENT: 107 return "READING_CONTENT"; 108 case MESSAGE_FULLY_READ: 109 return "MESSAGE_FULLY_READ"; 110 case NUM_STATES: 111 return "UNKNOWN_STATE"; 112 } 113 return "UNKNOWN_STATE"; 114 } 115 116 const char* BalsaFrameEnums::ErrorCodeToString( 117 BalsaFrameEnums::ErrorCode error_code) { 118 switch (error_code) { 119 case NO_ERROR: 120 return "NO_ERROR"; 121 case NO_STATUS_LINE_IN_RESPONSE: 122 return "NO_STATUS_LINE_IN_RESPONSE"; 123 case NO_REQUEST_LINE_IN_REQUEST: 124 return "NO_REQUEST_LINE_IN_REQUEST"; 125 case FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION: 126 return "FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION"; 127 case FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD: 128 return "FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD"; 129 case FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE: 130 return "FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE"; 131 case FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI: 132 return "FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI"; 133 case FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE: 134 return "FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE"; 135 case FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION: 136 return "FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION"; 137 case FAILED_CONVERTING_STATUS_CODE_TO_INT: 138 return "FAILED_CONVERTING_STATUS_CODE_TO_INT"; 139 case REQUEST_URI_TOO_LONG: 140 return "REQUEST_URI_TOO_LONG"; 141 case HEADERS_TOO_LONG: 142 return "HEADERS_TOO_LONG"; 143 case UNPARSABLE_CONTENT_LENGTH: 144 return "UNPARSABLE_CONTENT_LENGTH"; 145 case MAYBE_BODY_BUT_NO_CONTENT_LENGTH: 146 return "MAYBE_BODY_BUT_NO_CONTENT_LENGTH"; 147 case REQUIRED_BODY_BUT_NO_CONTENT_LENGTH: 148 return "REQUIRED_BODY_BUT_NO_CONTENT_LENGTH"; 149 case HEADER_MISSING_COLON: 150 return "HEADER_MISSING_COLON"; 151 case INVALID_CHUNK_LENGTH: 152 return "INVALID_CHUNK_LENGTH"; 153 case CHUNK_LENGTH_OVERFLOW: 154 return "CHUNK_LENGTH_OVERFLOW"; 155 case CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO: 156 return "CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO"; 157 case CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT: 158 return "CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT"; 159 case MULTIPLE_CONTENT_LENGTH_KEYS: 160 return "MULTIPLE_CONTENT_LENGTH_KEYS"; 161 case MULTIPLE_TRANSFER_ENCODING_KEYS: 162 return "MULTIPLE_TRANSFER_ENCODING_KEYS"; 163 case UNKNOWN_TRANSFER_ENCODING: 164 return "UNKNOWN_TRANSFER_ENCODING"; 165 case INVALID_HEADER_FORMAT: 166 return "INVALID_HEADER_FORMAT"; 167 case INTERNAL_LOGIC_ERROR: 168 return "INTERNAL_LOGIC_ERROR"; 169 case NUM_ERROR_CODES: 170 return "UNKNOWN_ERROR"; 171 } 172 return "UNKNOWN_ERROR"; 173 } 174 175 // Summary: 176 // Parses the first line of either a request or response. 177 // Note that in the case of a detected warning, error_code will be set 178 // but the function will not return false. 179 // Exactly zero or one warning or error (but not both) may be detected 180 // by this function. 181 // Note that this function will not write the data of the first-line 182 // into the header's buffer (that should already have been done elsewhere). 183 // 184 // Pre-conditions: 185 // begin != end 186 // *begin should be a character which is > ' '. This implies that there 187 // is at least one non-whitespace characters between [begin, end). 188 // headers is a valid pointer to a BalsaHeaders class. 189 // error_code is a valid pointer to a BalsaFrameEnums::ErrorCode value. 190 // Entire first line must exist between [begin, end) 191 // Exactly zero or one newlines -may- exist between [begin, end) 192 // [begin, end) should exist in the header's buffer. 193 // 194 // Side-effects: 195 // headers will be modified 196 // error_code may be modified if either a warning or error is detected 197 // 198 // Returns: 199 // True if no error (as opposed to warning) is detected. 200 // False if an error (as opposed to warning) is detected. 201 202 // 203 // If there is indeed non-whitespace in the line, then the following 204 // will take care of this for you: 205 // while (*begin <= ' ') ++begin; 206 // ProcessFirstLine(begin, end, is_request, &headers, &error_code); 207 // 208 bool ParseHTTPFirstLine(const char* begin, 209 const char* end, 210 bool is_request, 211 size_t max_request_uri_length, 212 BalsaHeaders* headers, 213 BalsaFrameEnums::ErrorCode* error_code) { 214 const char* current = begin; 215 // HTTP firstlines all have the following structure: 216 // LWS NONWS LWS NONWS LWS NONWS NOTCRLF CRLF 217 // [\t \r\n]+ [^\t ]+ [\t ]+ [^\t ]+ [\t ]+ [^\t ]+ [^\r\n]+ "\r\n" 218 // ws1 nws1 ws2 nws2 ws3 nws3 ws4 219 // | [-------) [-------) [----------------) 220 // REQ: method request_uri version 221 // RESP: version statuscode reason 222 // 223 // The first NONWS->LWS component we'll call firstline_a. 224 // The second firstline_b, and the third firstline_c. 225 // 226 // firstline_a goes from nws1 to (but not including) ws2 227 // firstline_b goes from nws2 to (but not including) ws3 228 // firstline_c goes from nws3 to (but not including) ws4 229 // 230 // In the code: 231 // ws1 == whitespace_1_idx_ 232 // nws1 == non_whitespace_1_idx_ 233 // ws2 == whitespace_2_idx_ 234 // nws2 == non_whitespace_2_idx_ 235 // ws3 == whitespace_3_idx_ 236 // nws3 == non_whitespace_3_idx_ 237 // ws4 == whitespace_4_idx_ 238 239 // Kill all whitespace (including '\r\n') at the end of the line. 240 --end; 241 if (*end != '\n') { 242 *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR; 243 LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n" 244 << headers->OriginalHeadersForDebugging(); 245 return false; 246 } 247 while (begin < end && *end <= ' ') { 248 --end; 249 } 250 DCHECK(*end != '\n'); 251 if (*end == '\n') { 252 *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR; 253 LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n" 254 << headers->OriginalHeadersForDebugging(); 255 return false; 256 } 257 ++end; 258 259 // The two following statements should not be possible. 260 if (end == begin) { 261 *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR; 262 LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n" 263 << headers->OriginalHeadersForDebugging(); 264 return false; 265 } 266 267 // whitespace_1_idx_ 268 headers->whitespace_1_idx_ = current - begin; 269 // This loop is commented out as it is never used in current code. This is 270 // true only because we don't begin parsing the headers at all until we've 271 // encountered a non whitespace character at the beginning of the stream, at 272 // which point we begin our demarcation of header-start. If we did -not- do 273 // this (for instance, only looked for [\r\n] instead of (< ' ')), this loop 274 // would be necessary for the proper functioning of this parsing. 275 // This is left here as this function may (in the future) be refactored out 276 // of the BalsaFrame class so that it may be shared between code in 277 // BalsaFrame and BalsaHeaders (where it would be used in some variant of the 278 // set_first_line() function (at which point it would be necessary). 279 #if 0 280 while (*current <= ' ') { 281 ++current; 282 } 283 #endif 284 // non_whitespace_1_idx_ 285 headers->non_whitespace_1_idx_ = current - begin; 286 do { 287 // The first time through, we're guaranteed that the current character 288 // won't be a whitespace (else the loop above wouldn't have terminated). 289 // That implies that we're guaranteed to get at least one non-whitespace 290 // character if we get into this loop at all. 291 ++current; 292 if (current == end) { 293 headers->whitespace_2_idx_ = current - begin; 294 headers->non_whitespace_2_idx_ = current - begin; 295 headers->whitespace_3_idx_ = current - begin; 296 headers->non_whitespace_3_idx_ = current - begin; 297 headers->whitespace_4_idx_ = current - begin; 298 // FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD for request 299 // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION for response 300 *error_code = 301 static_cast<BalsaFrameEnums::ErrorCode>( 302 BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION + 303 is_request); 304 if (!is_request) { // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION 305 return false; 306 } 307 goto output_exhausted; 308 } 309 } while (*current > ' '); 310 // whitespace_2_idx_ 311 headers->whitespace_2_idx_ = current - begin; 312 do { 313 ++current; 314 // Note that due to the loop which consumes all of the whitespace 315 // at the end of the line, current can never == end while in this function. 316 } while (*current <= ' '); 317 // non_whitespace_2_idx_ 318 headers->non_whitespace_2_idx_ = current - begin; 319 do { 320 ++current; 321 if (current == end) { 322 headers->whitespace_3_idx_ = current - begin; 323 headers->non_whitespace_3_idx_ = current - begin; 324 headers->whitespace_4_idx_ = current - begin; 325 // FAILED_TO_FIND_START_OF_REQUEST_REQUEST_URI for request 326 // FAILED_TO_FIND_START_OF_RESPONSE_STATUSCODE for response 327 *error_code = 328 static_cast<BalsaFrameEnums::ErrorCode>( 329 BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE 330 + is_request); 331 goto output_exhausted; 332 } 333 } while (*current > ' '); 334 // whitespace_3_idx_ 335 headers->whitespace_3_idx_ = current - begin; 336 do { 337 ++current; 338 // Note that due to the loop which consumes all of the whitespace 339 // at the end of the line, current can never == end while in this function. 340 } while (*current <= ' '); 341 // non_whitespace_3_idx_ 342 headers->non_whitespace_3_idx_ = current - begin; 343 headers->whitespace_4_idx_ = end - begin; 344 345 output_exhausted: 346 // Note that we don't fail the parse immediately when parsing of the 347 // firstline fails. Depending on the protocol type, we may want to accept 348 // a firstline with only one or two elements, e.g., for HTTP/0.9: 349 // GET\r\n 350 // or 351 // GET /\r\n 352 // should be parsed without issue (though the visitor should know that 353 // parsing the entire line was not exactly as it should be). 354 // 355 // Eventually, these errors may be removed alltogether, as the visitor can 356 // detect them on its own by examining the size of the various fields. 357 // headers->set_first_line(non_whitespace_1_idx_, current); 358 359 if (is_request) { 360 if ((headers->whitespace_3_idx_ - headers->non_whitespace_2_idx_) > 361 max_request_uri_length) { 362 // For requests, we need at least the method. We could assume that a 363 // blank URI means "/". If version isn't stated, it should be assumed 364 // to be HTTP/0.9 by the visitor. 365 *error_code = BalsaFrameEnums::REQUEST_URI_TOO_LONG; 366 return false; 367 } 368 } else { 369 headers->parsed_response_code_ = 0; 370 { 371 const char* parsed_response_code_current = 372 begin + headers->non_whitespace_2_idx_; 373 const char* parsed_response_code_end = begin + headers->whitespace_3_idx_; 374 const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10; 375 376 // Convert a string of [0-9]* into an int. 377 // Note that this allows for the conversion of response codes which 378 // are outside the bounds of normal HTTP response codes (no checking 379 // is done to ensure that these are valid-- they're merely parsed)! 380 while (parsed_response_code_current < parsed_response_code_end) { 381 if (*parsed_response_code_current < '0' || 382 *parsed_response_code_current > '9') { 383 *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT; 384 return false; 385 } 386 size_t status_code_x_10 = headers->parsed_response_code_ * 10; 387 uint8 c = *parsed_response_code_current - '0'; 388 if ((headers->parsed_response_code_ > kMaxDiv10) || 389 (std::numeric_limits<size_t>::max() - status_code_x_10) < c) { 390 // overflow. 391 *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT; 392 return false; 393 } 394 headers->parsed_response_code_ = status_code_x_10 + c; 395 ++parsed_response_code_current; 396 } 397 } 398 } 399 return true; 400 } 401 402 // begin - beginning of the firstline 403 // end - end of the firstline 404 // 405 // A precondition for this function is that there is non-whitespace between 406 // [begin, end). If this precondition is not met, the function will not perform 407 // as expected (and bad things may happen, and it will eat your first, second, 408 // and third unborn children!). 409 // 410 // Another precondition for this function is that [begin, end) includes 411 // at most one newline, which must be at the end of the line. 412 void BalsaFrame::ProcessFirstLine(const char* begin, const char* end) { 413 BalsaFrameEnums::ErrorCode previous_error = last_error_; 414 if (!ParseHTTPFirstLine(begin, 415 end, 416 is_request_, 417 max_request_uri_length_, 418 headers_, 419 &last_error_)) { 420 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 421 visitor_->HandleHeaderError(this); 422 return; 423 } 424 if (previous_error != last_error_) { 425 visitor_->HandleHeaderWarning(this); 426 } 427 428 if (is_request_) { 429 int version_length = 430 headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_; 431 visitor_->ProcessRequestFirstLine( 432 begin + headers_->non_whitespace_1_idx_, 433 headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_, 434 begin + headers_->non_whitespace_1_idx_, 435 headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_, 436 begin + headers_->non_whitespace_2_idx_, 437 headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_, 438 begin + headers_->non_whitespace_3_idx_, 439 version_length); 440 if (version_length == 0) 441 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 442 } else { 443 visitor_->ProcessResponseFirstLine( 444 begin + headers_->non_whitespace_1_idx_, 445 headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_, 446 begin + headers_->non_whitespace_1_idx_, 447 headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_, 448 begin + headers_->non_whitespace_2_idx_, 449 headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_, 450 begin + headers_->non_whitespace_3_idx_, 451 headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_); 452 } 453 } 454 455 // 'stream_begin' points to the first character of the headers buffer. 456 // 'line_begin' points to the first character of the line. 457 // 'current' points to a char which is ':'. 458 // 'line_end' points to the position of '\n' + 1. 459 // 'line_begin' points to the position of first character of line. 460 void BalsaFrame::CleanUpKeyValueWhitespace( 461 const char* stream_begin, 462 const char* line_begin, 463 const char* current, 464 const char* line_end, 465 HeaderLineDescription* current_header_line) { 466 const char* colon_loc = current; 467 DCHECK_LT(colon_loc, line_end); 468 DCHECK_EQ(':', *colon_loc); 469 DCHECK_EQ(':', *current); 470 DCHECK_GE(' ', *line_end) 471 << "\"" << std::string(line_begin, line_end) << "\""; 472 473 // TODO(fenix): Investigate whether or not the bounds tests in the 474 // while loops here are redundant, and if so, remove them. 475 --current; 476 while (current > line_begin && *current <= ' ') --current; 477 current += (current != colon_loc); 478 current_header_line->key_end_idx = current - stream_begin; 479 480 current = colon_loc; 481 DCHECK_EQ(':', *current); 482 ++current; 483 while (current < line_end && *current <= ' ') ++current; 484 current_header_line->value_begin_idx = current - stream_begin; 485 486 DCHECK_GE(current_header_line->key_end_idx, 487 current_header_line->first_char_idx); 488 DCHECK_GE(current_header_line->value_begin_idx, 489 current_header_line->key_end_idx); 490 DCHECK_GE(current_header_line->last_char_idx, 491 current_header_line->value_begin_idx); 492 } 493 494 inline void BalsaFrame::FindColonsAndParseIntoKeyValue() { 495 DCHECK(!lines_.empty()); 496 const char* stream_begin = headers_->OriginalHeaderStreamBegin(); 497 // The last line is always just a newline (and is uninteresting). 498 const Lines::size_type lines_size_m1 = lines_.size() - 1; 499 #if __SSE2__ 500 const __v16qi colons = { ':', ':', ':', ':', ':', ':', ':', ':', 501 ':', ':', ':', ':', ':', ':', ':', ':'}; 502 const char* header_lines_end_m16 = headers_->OriginalHeaderStreamEnd() - 16; 503 #endif // __SSE2__ 504 const char* current = stream_begin + lines_[1].first; 505 // This code is a bit more subtle than it may appear at first glance. 506 // This code looks for a colon in the current line... but it also looks 507 // beyond the current line. If there is no colon in the current line, then 508 // for each subsequent line (until the colon which -has- been found is 509 // associated with a line), no searching for a colon will be performed. In 510 // this way, we minimize the amount of bytes we have scanned for a colon. 511 for (Lines::size_type i = 1; i < lines_size_m1;) { 512 const char* line_begin = stream_begin + lines_[i].first; 513 514 // Here we handle possible continuations. Note that we do not replace 515 // the '\n' in the line before a continuation (at least, as of now), 516 // which implies that any code which looks for a value must deal with 517 // "\r\n", etc -within- the line (and not just at the end of it). 518 for (++i; i < lines_size_m1; ++i) { 519 const char c = *(stream_begin + lines_[i].first); 520 if (c > ' ') { 521 // Not a continuation, so stop. Note that if the 'original' i = 1, 522 // and the next line is not a continuation, we'll end up with i = 2 523 // when we break. This handles the incrementing of i for the outer 524 // loop. 525 break; 526 } 527 } 528 const char* line_end = stream_begin + lines_[i - 1].second; 529 DCHECK_LT(line_begin - stream_begin, line_end - stream_begin); 530 531 // We cleanup the whitespace at the end of the line before doing anything 532 // else of interest as it allows us to do nothing when irregularly formatted 533 // headers are parsed (e.g. those with only keys, only values, or no colon). 534 // 535 // We're guaranteed to have *line_end > ' ' while line_end >= line_begin. 536 --line_end; 537 DCHECK_EQ('\n', *line_end) 538 << "\"" << std::string(line_begin, line_end) << "\""; 539 while (*line_end <= ' ' && line_end > line_begin) { 540 --line_end; 541 } 542 ++line_end; 543 DCHECK_GE(' ', *line_end); 544 DCHECK_LT(line_begin, line_end); 545 546 // We use '0' for the block idx, because we're always writing to the first 547 // block from the framer (we do this because the framer requires that the 548 // entire header sequence be in a contiguous buffer). 549 headers_->header_lines_.push_back( 550 HeaderLineDescription(line_begin - stream_begin, 551 line_end - stream_begin, 552 line_end - stream_begin, 553 line_end - stream_begin, 554 0)); 555 if (current >= line_end) { 556 last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON; 557 visitor_->HandleHeaderWarning(this); 558 // Then the next colon will not be found within this header line-- time 559 // to try again with another header-line. 560 continue; 561 } else if (current < line_begin) { 562 // When this condition is true, the last detected colon was part of a 563 // previous line. We reset to the beginning of the line as we don't care 564 // about the presence of any colon before the beginning of the current 565 // line. 566 current = line_begin; 567 } 568 #if __SSE2__ 569 while (current < header_lines_end_m16) { 570 __m128i header_bytes = 571 _mm_loadu_si128(reinterpret_cast<const __m128i *>(current)); 572 __m128i colon_cmp = 573 _mm_cmpeq_epi8(header_bytes, reinterpret_cast<__m128i>(colons)); 574 int colon_msk = _mm_movemask_epi8(colon_cmp); 575 if (colon_msk == 0) { 576 current += 16; 577 continue; 578 } 579 current += (ffs(colon_msk) - 1); 580 if (current > line_end) { 581 break; 582 } 583 goto found_colon; 584 } 585 #endif // __SSE2__ 586 for (; current < line_end; ++current) { 587 if (*current != ':') { 588 continue; 589 } 590 goto found_colon; 591 } 592 // If we've gotten to here, then there was no colon 593 // in the line. The arguments we passed into the construction 594 // for the HeaderLineDescription object should be OK-- it assumes 595 // that the entire content is 'key' by default (which is true, as 596 // there was no colon, there can be no value). Note that this is a 597 // construct which is technically not allowed by the spec. 598 last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON; 599 visitor_->HandleHeaderWarning(this); 600 continue; 601 found_colon: 602 DCHECK_EQ(*current, ':'); 603 DCHECK_LE(current - stream_begin, line_end - stream_begin); 604 DCHECK_LE(stream_begin - stream_begin, current - stream_begin); 605 606 HeaderLineDescription& current_header_line = headers_->header_lines_.back(); 607 current_header_line.key_end_idx = current - stream_begin; 608 current_header_line.value_begin_idx = current_header_line.key_end_idx; 609 if (current < line_end) { 610 ++current_header_line.key_end_idx; 611 612 CleanUpKeyValueWhitespace(stream_begin, 613 line_begin, 614 current, 615 line_end, 616 ¤t_header_line); 617 } 618 } 619 } 620 621 void BalsaFrame::ProcessContentLengthLine( 622 HeaderLines::size_type line_idx, 623 BalsaHeadersEnums::ContentLengthStatus* status, 624 size_t* length) { 625 const HeaderLineDescription& header_line = headers_->header_lines_[line_idx]; 626 const char* stream_begin = headers_->OriginalHeaderStreamBegin(); 627 const char* line_end = stream_begin + header_line.last_char_idx; 628 const char* value_begin = (stream_begin + header_line.value_begin_idx); 629 630 if (value_begin >= line_end) { 631 // There is no non-whitespace value data. 632 #if DEBUGFRAMER 633 LOG(INFO) << "invalid content-length -- no non-whitespace value data"; 634 #endif 635 *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH; 636 return; 637 } 638 639 *length = 0; 640 while (value_begin < line_end) { 641 if (*value_begin < '0' || *value_begin > '9') { 642 // bad! content-length found, and couldn't parse all of it! 643 *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH; 644 #if DEBUGFRAMER 645 LOG(INFO) << "invalid content-length - non numeric character detected"; 646 #endif // DEBUGFRAMER 647 return; 648 } 649 const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10; 650 size_t length_x_10 = *length * 10; 651 const unsigned char c = *value_begin - '0'; 652 if (*length > kMaxDiv10 || 653 (std::numeric_limits<size_t>::max() - length_x_10) < c) { 654 *status = BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW; 655 #if DEBUGFRAMER 656 LOG(INFO) << "content-length overflow"; 657 #endif // DEBUGFRAMER 658 return; 659 } 660 *length = length_x_10 + c; 661 ++value_begin; 662 } 663 #if DEBUGFRAMER 664 LOG(INFO) << "content_length parsed: " << *length; 665 #endif // DEBUGFRAMER 666 *status = BalsaHeadersEnums::VALID_CONTENT_LENGTH; 667 } 668 669 void BalsaFrame::ProcessTransferEncodingLine(HeaderLines::size_type line_idx) { 670 const HeaderLineDescription& header_line = headers_->header_lines_[line_idx]; 671 const char* stream_begin = headers_->OriginalHeaderStreamBegin(); 672 const char* line_end = stream_begin + header_line.last_char_idx; 673 const char* value_begin = stream_begin + header_line.value_begin_idx; 674 size_t value_length = line_end - value_begin; 675 676 if ((value_length == 7) && 677 !strncasecmp(value_begin, "chunked", 7)) { 678 headers_->transfer_encoding_is_chunked_ = true; 679 } else if ((value_length == 8) && 680 !strncasecmp(value_begin, "identity", 8)) { 681 headers_->transfer_encoding_is_chunked_ = false; 682 } else { 683 last_error_ = BalsaFrameEnums::UNKNOWN_TRANSFER_ENCODING; 684 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 685 visitor_->HandleHeaderError(this); 686 return; 687 } 688 } 689 690 namespace { 691 bool SplitStringPiece(base::StringPiece original, char delim, 692 base::StringPiece* before, base::StringPiece* after) { 693 const char* p = original.data(); 694 const char* end = p + original.size(); 695 696 while (p != end) { 697 if (*p == delim) { 698 ++p; 699 } else { 700 const char* start = p; 701 while (++p != end && *p != delim) { 702 // Skip to the next occurence of the delimiter. 703 } 704 *before = base::StringPiece(start, p - start); 705 if (p != end) 706 *after = base::StringPiece(p + 1, end - (p + 1)); 707 else 708 *after = base::StringPiece(""); 709 StringPieceUtils::RemoveWhitespaceContext(before); 710 StringPieceUtils::RemoveWhitespaceContext(after); 711 return true; 712 } 713 } 714 715 *before = original; 716 *after = ""; 717 return false; 718 } 719 720 // TODO(phython): Fix this function to properly deal with quoted values. 721 // E.g. ";;foo", "\";;\"", or \"aa; 722 // The last example, the semi-colon is a separator between extensions. 723 void ProcessChunkExtensionsManual(base::StringPiece all_extensions, 724 BalsaHeaders* extensions) { 725 base::StringPiece extension; 726 base::StringPiece remaining; 727 StringPieceUtils::RemoveWhitespaceContext(&all_extensions); 728 SplitStringPiece(all_extensions, ';', &extension, &remaining); 729 while (!extension.empty()) { 730 base::StringPiece key; 731 base::StringPiece value; 732 SplitStringPiece(extension, '=', &key, &value); 733 if (!value.empty()) { 734 // Strip quotation marks if they exist. 735 if (!value.empty() && value[0] == '"') 736 value.remove_prefix(1); 737 if (!value.empty() && value[value.length() - 1] == '"') 738 value.remove_suffix(1); 739 } 740 741 extensions->AppendHeader(key, value); 742 743 StringPieceUtils::RemoveWhitespaceContext(&remaining); 744 SplitStringPiece(remaining, ';', &extension, &remaining); 745 } 746 } 747 748 // TODO(phython): Fix this function to properly deal with quoted values. 749 // E.g. ";;foo", "\";;\"", or \"aa; 750 // The last example, the semi-colon is a separator between extensions. 751 void ProcessChunkExtensionsGoogle3(const char* input, size_t size, 752 BalsaHeaders* extensions) { 753 std::vector<base::StringPiece> key_values; 754 SplitStringPieceToVector(base::StringPiece(input, size), ";", 755 &key_values, true); 756 for (unsigned int i = 0; i < key_values.size(); ++i) { 757 base::StringPiece key = key_values[i].substr(0, key_values[i].find('=')); 758 base::StringPiece value; 759 if (key.length() < key_values[i].length()) { 760 value = key_values[i].substr(key.length() + 1); 761 // Remove any leading and trailing whitespace. 762 StringPieceUtils::RemoveWhitespaceContext(&value); 763 764 // Strip quotation marks if they exist. 765 if (!value.empty() && value[0] == '"') 766 value.remove_prefix(1); 767 if (!value.empty() && value[value.length() - 1] == '"') 768 value.remove_suffix(1); 769 } 770 771 // Strip the key whitespace after checking that there is a value. 772 StringPieceUtils::RemoveWhitespaceContext(&key); 773 extensions->AppendHeader(key, value); 774 } 775 } 776 777 } // anonymous namespace 778 779 void BalsaFrame::ProcessChunkExtensions(const char* input, size_t size, 780 BalsaHeaders* extensions) { 781 #if 0 782 ProcessChunkExtensionsGoogle3(input, size, extensions); 783 #else 784 ProcessChunkExtensionsManual(base::StringPiece(input, size), extensions); 785 #endif 786 } 787 788 void BalsaFrame::ProcessHeaderLines() { 789 HeaderLines::size_type content_length_idx = 0; 790 HeaderLines::size_type transfer_encoding_idx = 0; 791 792 DCHECK(!lines_.empty()); 793 #if DEBUGFRAMER 794 LOG(INFO) << "******@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@**********\n"; 795 #endif // DEBUGFRAMER 796 797 // There is no need to attempt to process headers if no header lines exist. 798 // There are at least two lines in the message which are not header lines. 799 // These two non-header lines are the first line of the message, and the 800 // last line of the message (which is an empty line). 801 // Thus, we test to see if we have more than two lines total before attempting 802 // to parse any header lines. 803 if (lines_.size() > 2) { 804 const char* stream_begin = headers_->OriginalHeaderStreamBegin(); 805 806 // Then, for the rest of the header data, we parse these into key-value 807 // pairs. 808 FindColonsAndParseIntoKeyValue(); 809 // At this point, we've parsed all of the headers. Time to look for those 810 // headers which we require for framing. 811 const HeaderLines::size_type 812 header_lines_size = headers_->header_lines_.size(); 813 for (HeaderLines::size_type i = 0; i < header_lines_size; ++i) { 814 const HeaderLineDescription& current_header_line = 815 headers_->header_lines_[i]; 816 const char* key_begin = 817 (stream_begin + current_header_line.first_char_idx); 818 const char* key_end = (stream_begin + current_header_line.key_end_idx); 819 const size_t key_len = key_end - key_begin; 820 const char c = *key_begin; 821 #if DEBUGFRAMER 822 LOG(INFO) << "[" << i << "]: " << std::string(key_begin, key_len) 823 << " c: '" << c << "' key_len: " << key_len; 824 #endif // DEBUGFRAMER 825 // If a header begins with either lowercase or uppercase 'c' or 't', then 826 // the header may be one of content-length, connection, content-encoding 827 // or transfer-encoding. These headers are special, as they change the way 828 // that the message is framed, and so the framer is required to search 829 // for them. 830 831 832 if (c == 'c' || c == 'C') { 833 if ((key_len == kContentLengthSize) && 834 0 == strncasecmp(key_begin, kContentLength, kContentLengthSize)) { 835 BalsaHeadersEnums::ContentLengthStatus content_length_status = 836 BalsaHeadersEnums::NO_CONTENT_LENGTH; 837 size_t length = 0; 838 ProcessContentLengthLine(i, &content_length_status, &length); 839 if (content_length_idx != 0) { // then we've already seen one! 840 if ((headers_->content_length_status_ != content_length_status) || 841 ((headers_->content_length_status_ == 842 BalsaHeadersEnums::VALID_CONTENT_LENGTH) && 843 length != headers_->content_length_)) { 844 last_error_ = BalsaFrameEnums::MULTIPLE_CONTENT_LENGTH_KEYS; 845 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 846 visitor_->HandleHeaderError(this); 847 return; 848 } 849 continue; 850 } else { 851 content_length_idx = i + 1; 852 headers_->content_length_status_ = content_length_status; 853 headers_->content_length_ = length; 854 content_length_remaining_ = length; 855 } 856 857 } 858 } else if (c == 't' || c == 'T') { 859 if ((key_len == kTransferEncodingSize) && 860 0 == strncasecmp(key_begin, kTransferEncoding, 861 kTransferEncodingSize)) { 862 if (transfer_encoding_idx != 0) { 863 last_error_ = BalsaFrameEnums::MULTIPLE_TRANSFER_ENCODING_KEYS; 864 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 865 visitor_->HandleHeaderError(this); 866 return; 867 } 868 transfer_encoding_idx = i + 1; 869 } 870 } else if (i == 0 && (key_len == 0 || c == ' ')) { 871 last_error_ = BalsaFrameEnums::INVALID_HEADER_FORMAT; 872 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 873 visitor_->HandleHeaderError(this); 874 return; 875 } 876 } 877 if (headers_->transfer_encoding_is_chunked_) { 878 headers_->content_length_ = 0; 879 headers_->content_length_status_ = BalsaHeadersEnums::NO_CONTENT_LENGTH; 880 content_length_remaining_ = 0; 881 } 882 if (transfer_encoding_idx != 0) { 883 ProcessTransferEncodingLine(transfer_encoding_idx - 1); 884 } 885 } 886 } 887 888 void BalsaFrame::AssignParseStateAfterHeadersHaveBeenParsed() { 889 // For responses, can't have a body if the request was a HEAD, or if it is 890 // one of these response-codes. rfc2616 section 4.3 891 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 892 if (is_request_ || 893 !(request_was_head_ || 894 (headers_->parsed_response_code_ >= 100 && 895 headers_->parsed_response_code_ < 200) || 896 (headers_->parsed_response_code_ == 204) || 897 (headers_->parsed_response_code_ == 304))) { 898 // Then we can have a body. 899 if (headers_->transfer_encoding_is_chunked_) { 900 // Note that 901 // if ( Transfer-Encoding: chunked && Content-length: ) 902 // then Transfer-Encoding: chunked trumps. 903 // This is as specified in the spec. 904 // rfc2616 section 4.4.3 905 parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH; 906 } else { 907 // Errors parsing content-length definitely can cause 908 // protocol errors/warnings 909 switch (headers_->content_length_status_) { 910 // If we have a content-length, and it is parsed 911 // properly, there are two options. 912 // 1) zero content, in which case the message is done, and 913 // 2) nonzero content, in which case we have to 914 // consume the body. 915 case BalsaHeadersEnums::VALID_CONTENT_LENGTH: 916 if (headers_->content_length_ == 0) { 917 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 918 } else { 919 parse_state_ = BalsaFrameEnums::READING_CONTENT; 920 } 921 break; 922 case BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW: 923 case BalsaHeadersEnums::INVALID_CONTENT_LENGTH: 924 // If there were characters left-over after parsing the 925 // content length, we should flag an error and stop. 926 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 927 last_error_ = BalsaFrameEnums::UNPARSABLE_CONTENT_LENGTH; 928 visitor_->HandleHeaderError(this); 929 break; 930 // We can have: no transfer-encoding, no content length, and no 931 // connection: close... 932 // Unfortunately, this case doesn't seem to be covered in the spec. 933 // We'll assume that the safest thing to do here is what the google 934 // binaries before 2008 already do, which is to assume that 935 // everything until the connection is closed is body. 936 case BalsaHeadersEnums::NO_CONTENT_LENGTH: 937 if (is_request_) { 938 base::StringPiece method = headers_->request_method(); 939 // POSTs and PUTs should have a detectable body length. If they 940 // do not we consider it an error. 941 if ((method.size() == 4 && 942 strncmp(method.data(), "POST", 4) == 0) || 943 (method.size() == 3 && 944 strncmp(method.data(), "PUT", 3) == 0)) { 945 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 946 last_error_ = 947 BalsaFrameEnums::REQUIRED_BODY_BUT_NO_CONTENT_LENGTH; 948 visitor_->HandleHeaderError(this); 949 break; 950 } 951 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 952 } else { 953 parse_state_ = BalsaFrameEnums::READING_UNTIL_CLOSE; 954 last_error_ = BalsaFrameEnums::MAYBE_BODY_BUT_NO_CONTENT_LENGTH; 955 visitor_->HandleHeaderWarning(this); 956 } 957 break; 958 // The COV_NF_... statements here provide hints to the apparatus 959 // which computes coverage reports/ratios that this code is never 960 // intended to be executed, and should technically be impossible. 961 // COV_NF_START 962 default: 963 LOG(FATAL) << "Saw a content_length_status: " 964 << headers_->content_length_status_ << " which is unknown."; 965 // COV_NF_END 966 } 967 } 968 } 969 } 970 971 size_t BalsaFrame::ProcessHeaders(const char* message_start, 972 size_t message_length) { 973 const char* const original_message_start = message_start; 974 const char* const message_end = message_start + message_length; 975 const char* message_current = message_start; 976 const char* checkpoint = message_start; 977 978 if (message_length == 0) { 979 goto bottom; 980 } 981 982 while (message_current < message_end) { 983 size_t base_idx = headers_->GetReadableBytesFromHeaderStream(); 984 985 // Yes, we could use strchr (assuming null termination), or 986 // memchr, but as it turns out that is slower than this tight loop 987 // for the input that we see. 988 if (!saw_non_newline_char_) { 989 do { 990 const char c = *message_current; 991 if (c != '\r' && c != '\n') { 992 if (c <= ' ') { 993 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 994 last_error_ = BalsaFrameEnums::NO_REQUEST_LINE_IN_REQUEST; 995 visitor_->HandleHeaderError(this); 996 goto bottom; 997 } else { 998 saw_non_newline_char_ = true; 999 checkpoint = message_start = message_current; 1000 goto read_real_message; 1001 } 1002 } 1003 ++message_current; 1004 } while (message_current < message_end); 1005 goto bottom; // this is necessary to skip 'last_char_was_slash_r' checks 1006 } else { 1007 read_real_message: 1008 // Note that SSE2 can be enabled on certain piii platforms. 1009 #if __SSE2__ 1010 { 1011 const char* const message_end_m16 = message_end - 16; 1012 __v16qi newlines = { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', 1013 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' }; 1014 while (message_current < message_end_m16) { 1015 // What this does (using compiler intrinsics): 1016 // 1017 // Load 16 '\n's into an xmm register 1018 // Load 16 bytes of currennt message into an xmm register 1019 // Do byte-wise equals on those two xmm registers 1020 // Take the first bit of each byte, and put that into the first 1021 // 16 bits of a mask 1022 // If the mask is zero, no '\n' found. increment by 16 and try again 1023 // Else scan forward to find the first set bit. 1024 // Increment current by the index of the first set bit 1025 // (ffs returns index of first set bit + 1) 1026 __m128i msg_bytes = 1027 _mm_loadu_si128(const_cast<__m128i *>( 1028 reinterpret_cast<const __m128i *>(message_current))); 1029 __m128i newline_cmp = 1030 _mm_cmpeq_epi8(msg_bytes, reinterpret_cast<__m128i>(newlines)); 1031 int newline_msk = _mm_movemask_epi8(newline_cmp); 1032 if (newline_msk == 0) { 1033 message_current += 16; 1034 continue; 1035 } 1036 message_current += (ffs(newline_msk) - 1); 1037 const size_t relative_idx = message_current - message_start; 1038 const size_t message_current_idx = 1 + base_idx + relative_idx; 1039 lines_.push_back(std::make_pair(last_slash_n_idx_, 1040 message_current_idx)); 1041 if (lines_.size() == 1) { 1042 headers_->WriteFromFramer(checkpoint, 1043 1 + message_current - checkpoint); 1044 checkpoint = message_current + 1; 1045 const char* begin = headers_->OriginalHeaderStreamBegin(); 1046 #if DEBUGFRAMER 1047 LOG(INFO) << "First line " << std::string(begin, lines_[0].second); 1048 LOG(INFO) << "is_request_: " << is_request_; 1049 #endif 1050 ProcessFirstLine(begin, begin + lines_[0].second); 1051 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) 1052 goto process_lines; 1053 else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) 1054 goto bottom; 1055 } 1056 const size_t chars_since_last_slash_n = (message_current_idx - 1057 last_slash_n_idx_); 1058 last_slash_n_idx_ = message_current_idx; 1059 if (chars_since_last_slash_n > 2) { 1060 // We have a slash-n, but the last slash n was 1061 // more than 2 characters away from this. Thus, we know 1062 // that this cannot be an end-of-header. 1063 ++message_current; 1064 continue; 1065 } 1066 if ((chars_since_last_slash_n == 1) || 1067 (((message_current > message_start) && 1068 (*(message_current - 1) == '\r')) || 1069 (last_char_was_slash_r_))) { 1070 goto process_lines; 1071 } 1072 ++message_current; 1073 } 1074 } 1075 #endif // __SSE2__ 1076 while (message_current < message_end) { 1077 if (*message_current != '\n') { 1078 ++message_current; 1079 continue; 1080 } 1081 const size_t relative_idx = message_current - message_start; 1082 const size_t message_current_idx = 1 + base_idx + relative_idx; 1083 lines_.push_back(std::make_pair(last_slash_n_idx_, 1084 message_current_idx)); 1085 if (lines_.size() == 1) { 1086 headers_->WriteFromFramer(checkpoint, 1087 1 + message_current - checkpoint); 1088 checkpoint = message_current + 1; 1089 const char* begin = headers_->OriginalHeaderStreamBegin(); 1090 #if DEBUGFRAMER 1091 LOG(INFO) << "First line " << std::string(begin, lines_[0].second); 1092 LOG(INFO) << "is_request_: " << is_request_; 1093 #endif 1094 ProcessFirstLine(begin, begin + lines_[0].second); 1095 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) 1096 goto process_lines; 1097 else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) 1098 goto bottom; 1099 } 1100 const size_t chars_since_last_slash_n = (message_current_idx - 1101 last_slash_n_idx_); 1102 last_slash_n_idx_ = message_current_idx; 1103 if (chars_since_last_slash_n > 2) { 1104 // false positive. 1105 ++message_current; 1106 continue; 1107 } 1108 if ((chars_since_last_slash_n == 1) || 1109 (((message_current > message_start) && 1110 (*(message_current - 1) == '\r')) || 1111 (last_char_was_slash_r_))) { 1112 goto process_lines; 1113 } 1114 ++message_current; 1115 } 1116 } 1117 continue; 1118 process_lines: 1119 ++message_current; 1120 DCHECK(message_current >= message_start); 1121 if (message_current > message_start) { 1122 headers_->WriteFromFramer(checkpoint, message_current - checkpoint); 1123 } 1124 1125 // Check if we have exceeded maximum headers length 1126 // Although we check for this limit before and after we call this function 1127 // we check it here as well to make sure that in case the visitor changed 1128 // the max_header_length_ (for example after processing the first line) 1129 // we handle it gracefully. 1130 if (headers_->GetReadableBytesFromHeaderStream() > max_header_length_) { 1131 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1132 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; 1133 visitor_->HandleHeaderError(this); 1134 goto bottom; 1135 } 1136 1137 // Since we know that we won't be writing any more bytes of the header, 1138 // we tell that to the headers object. The headers object may make 1139 // more efficient allocation decisions when this is signaled. 1140 headers_->DoneWritingFromFramer(); 1141 { 1142 const char* readable_ptr = NULL; 1143 size_t readable_size = 0; 1144 headers_->GetReadablePtrFromHeaderStream(&readable_ptr, &readable_size); 1145 visitor_->ProcessHeaderInput(readable_ptr, readable_size); 1146 } 1147 1148 // Ok, now that we've written everything into our header buffer, it is 1149 // time to process the header lines (extract proper values for headers 1150 // which are important for framing). 1151 ProcessHeaderLines(); 1152 if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) { 1153 goto bottom; 1154 } 1155 AssignParseStateAfterHeadersHaveBeenParsed(); 1156 if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) { 1157 goto bottom; 1158 } 1159 visitor_->ProcessHeaders(*headers_); 1160 visitor_->HeaderDone(); 1161 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) { 1162 visitor_->MessageDone(); 1163 } 1164 goto bottom; 1165 } 1166 // If we've gotten to here, it means that we've consumed all of the 1167 // available input. We need to record whether or not the last character we 1168 // saw was a '\r' so that a subsequent call to ProcessInput correctly finds 1169 // a header framing that is split across the two calls. 1170 last_char_was_slash_r_ = (*(message_end - 1) == '\r'); 1171 DCHECK(message_current >= message_start); 1172 if (message_current > message_start) { 1173 headers_->WriteFromFramer(checkpoint, message_current - checkpoint); 1174 } 1175 bottom: 1176 return message_current - original_message_start; 1177 } 1178 1179 1180 size_t BalsaFrame::BytesSafeToSplice() const { 1181 switch (parse_state_) { 1182 case BalsaFrameEnums::READING_CHUNK_DATA: 1183 return chunk_length_remaining_; 1184 case BalsaFrameEnums::READING_UNTIL_CLOSE: 1185 return std::numeric_limits<size_t>::max(); 1186 case BalsaFrameEnums::READING_CONTENT: 1187 return content_length_remaining_; 1188 default: 1189 return 0; 1190 } 1191 } 1192 1193 void BalsaFrame::BytesSpliced(size_t bytes_spliced) { 1194 switch (parse_state_) { 1195 case BalsaFrameEnums::READING_CHUNK_DATA: 1196 if (chunk_length_remaining_ >= bytes_spliced) { 1197 chunk_length_remaining_ -= bytes_spliced; 1198 if (chunk_length_remaining_ == 0) { 1199 parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM; 1200 } 1201 return; 1202 } else { 1203 last_error_ = 1204 BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT; 1205 goto error_exit; 1206 } 1207 1208 case BalsaFrameEnums::READING_UNTIL_CLOSE: 1209 return; 1210 1211 case BalsaFrameEnums::READING_CONTENT: 1212 if (content_length_remaining_ >= bytes_spliced) { 1213 content_length_remaining_ -= bytes_spliced; 1214 if (content_length_remaining_ == 0) { 1215 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1216 visitor_->MessageDone(); 1217 } 1218 return; 1219 } else { 1220 last_error_ = 1221 BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT; 1222 goto error_exit; 1223 } 1224 1225 default: 1226 last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO; 1227 goto error_exit; 1228 } 1229 1230 error_exit: 1231 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1232 visitor_->HandleBodyError(this); 1233 }; 1234 1235 // You may note that the state-machine contained within this function has both 1236 // switch and goto labels for nearly the same thing. For instance, the 1237 // following two labels refer to the same code block: 1238 // label_reading_chunk_data: 1239 // case BalsaFrameEnums::READING_CHUNK_DATA: 1240 // The 'case' statement is required for the switch statement which occurs when 1241 // ProcessInput is invoked. The goto label is required as the state-machine 1242 // does not use a computed goto in any subsequent operations. 1243 // 1244 // Since several states exit the state machine for various reasons, there is 1245 // also one label at the bottom of the function. When it is appropriate to 1246 // return from the function, that part of the state machine instead issues a 1247 // goto bottom; This results in less code duplication, and makes debugging 1248 // easier (as you can add a statement to a section of code which is guaranteed 1249 // to be invoked when the function is exiting. 1250 size_t BalsaFrame::ProcessInput(const char* input, size_t size) { 1251 const char* current = input; 1252 const char* on_entry = current; 1253 const char* end = current + size; 1254 #if DEBUGFRAMER 1255 LOG(INFO) << "\n==============" 1256 << BalsaFrameEnums::ParseStateToString(parse_state_) 1257 << "===============\n"; 1258 #endif // DEBUGFRAMER 1259 1260 DCHECK(headers_ != NULL); 1261 if (headers_ == NULL) return 0; 1262 1263 if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) { 1264 const size_t header_length = headers_->GetReadableBytesFromHeaderStream(); 1265 // Yes, we still have to check this here as the user can change the 1266 // max_header_length amount! 1267 // Also it is possible that we have reached the maximum allowed header size, 1268 // and we have more to consume (remember we are still inside 1269 // READING_HEADER_AND_FIRSTLINE) in which case we directly declare an error. 1270 if (header_length > max_header_length_ || 1271 (header_length == max_header_length_ && size > 0)) { 1272 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1273 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; 1274 visitor_->HandleHeaderError(this); 1275 goto bottom; 1276 } 1277 size_t bytes_to_process = max_header_length_ - header_length; 1278 if (bytes_to_process > size) { 1279 bytes_to_process = size; 1280 } 1281 current += ProcessHeaders(input, bytes_to_process); 1282 // If we are still reading headers check if we have crossed the headers 1283 // limit. Note that we check for >= as opposed to >. This is because if 1284 // header_length_after equals max_header_length_ and we are still in the 1285 // parse_state_ BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE we know for 1286 // sure that the headers limit will be crossed later on 1287 if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) { 1288 // Note that headers_ is valid only if we are still reading headers. 1289 const size_t header_length_after = 1290 headers_->GetReadableBytesFromHeaderStream(); 1291 if (header_length_after >= max_header_length_) { 1292 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1293 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; 1294 visitor_->HandleHeaderError(this); 1295 } 1296 } 1297 goto bottom; 1298 } else if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ || 1299 parse_state_ == BalsaFrameEnums::PARSE_ERROR) { 1300 // Can do nothing more 'till we're reset. 1301 goto bottom; 1302 } 1303 1304 while (current < end) { 1305 switch (parse_state_) { 1306 label_reading_chunk_length: 1307 case BalsaFrameEnums::READING_CHUNK_LENGTH: 1308 // In this state we read the chunk length. 1309 // Note that once we hit a character which is not in: 1310 // [0-9;A-Fa-f\n], we transition to a different state. 1311 // 1312 { 1313 // If we used strtol, etc, we'd have to buffer this line. 1314 // This is more annoying than simply doing the conversion 1315 // here. This code accounts for overflow. 1316 static const signed char buf[] = { 1317 // %0 %1 %2 %3 %4 %5 %6 %7 %8 \t \n %b %c \r %e %f 1318 -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -1, -1, 1319 // %10 %11 %12 %13 %14 %15 %16 %17 %18 %19 %1a %1b %1c %1d %1e %1f 1320 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1321 // ' ' %21 %22 %23 %24 %25 %26 %27 %28 %29 %2a %2b %2c %2d %2e %2f 1322 -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1323 // %30 %31 %32 %33 %34 %35 %36 %37 %38 %39 %3a ';' %3c %3d %3e %3f 1324 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -2, -1, -1, -1, -1, 1325 // %40 'A' 'B' 'C' 'D' 'E' 'F' %47 %48 %49 %4a %4b %4c %4d %4e %4f 1326 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1327 // %50 %51 %52 %53 %54 %55 %56 %57 %58 %59 %5a %5b %5c %5d %5e %5f 1328 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1329 // %60 'a' 'b' 'c' 'd' 'e' 'f' %67 %68 %69 %6a %6b %6c %6d %6e %6f 1330 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1331 // %70 %71 %72 %73 %74 %75 %76 %77 %78 %79 %7a %7b %7c %7d %7e %7f 1332 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1333 }; 1334 // valid cases: 1335 // "09123\n" // -> 09123 1336 // "09123\r\n" // -> 09123 1337 // "09123 \n" // -> 09123 1338 // "09123 \r\n" // -> 09123 1339 // "09123 12312\n" // -> 09123 1340 // "09123 12312\r\n" // -> 09123 1341 // "09123; foo=bar\n" // -> 09123 1342 // "09123; foo=bar\r\n" // -> 09123 1343 // "FFFFFFFFFFFFFFFF\r\n" // -> FFFFFFFFFFFFFFFF 1344 // "FFFFFFFFFFFFFFFF 22\r\n" // -> FFFFFFFFFFFFFFFF 1345 // invalid cases: 1346 // "[ \t]+[^\n]*\n" 1347 // "FFFFFFFFFFFFFFFFF\r\n" (would overflow) 1348 // "\r\n" 1349 // "\n" 1350 while (current < end) { 1351 const char c = *current; 1352 ++current; 1353 const signed char addition = buf[static_cast<int>(c)]; 1354 if (addition >= 0) { 1355 chunk_length_character_extracted_ = true; 1356 size_t length_x_16 = chunk_length_remaining_ * 16; 1357 const size_t kMaxDiv16 = std::numeric_limits<size_t>::max() / 16; 1358 if ((chunk_length_remaining_ > kMaxDiv16) || 1359 ((std::numeric_limits<size_t>::max() - length_x_16) < 1360 static_cast<size_t>(addition))) { 1361 // overflow -- asked for a chunk-length greater than 2^64 - 1!! 1362 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1363 last_error_ = BalsaFrameEnums::CHUNK_LENGTH_OVERFLOW; 1364 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1365 visitor_->HandleChunkingError(this); 1366 goto bottom; 1367 } 1368 chunk_length_remaining_ = length_x_16 + addition; 1369 continue; 1370 } 1371 1372 if (!chunk_length_character_extracted_ || addition == -1) { 1373 // ^[0-9;A-Fa-f][ \t\n] -- was not matched, either because no 1374 // characters were converted, or an unexpected character was 1375 // seen. 1376 parse_state_ = BalsaFrameEnums::PARSE_ERROR; 1377 last_error_ = BalsaFrameEnums::INVALID_CHUNK_LENGTH; 1378 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1379 visitor_->HandleChunkingError(this); 1380 goto bottom; 1381 } 1382 1383 --current; 1384 parse_state_ = BalsaFrameEnums::READING_CHUNK_EXTENSION; 1385 visitor_->ProcessChunkLength(chunk_length_remaining_); 1386 goto label_reading_chunk_extension; 1387 } 1388 } 1389 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1390 goto bottom; // case BalsaFrameEnums::READING_CHUNK_LENGTH 1391 1392 label_reading_chunk_extension: 1393 case BalsaFrameEnums::READING_CHUNK_EXTENSION: 1394 { 1395 // TODO(phython): Convert this scanning to be 16 bytes at a time if 1396 // there is data to be read. 1397 const char* extensions_start = current; 1398 size_t extensions_length = 0; 1399 while (current < end) { 1400 const char c = *current; 1401 if (c == '\r' || c == '\n') { 1402 extensions_length = 1403 (extensions_start == current) ? 1404 0 : 1405 current - extensions_start - 1; 1406 } 1407 1408 ++current; 1409 if (c == '\n') { 1410 chunk_length_character_extracted_ = false; 1411 visitor_->ProcessChunkExtensions( 1412 extensions_start, extensions_length); 1413 if (chunk_length_remaining_ != 0) { 1414 parse_state_ = BalsaFrameEnums::READING_CHUNK_DATA; 1415 goto label_reading_chunk_data; 1416 } 1417 HeaderFramingFound('\n'); 1418 parse_state_ = BalsaFrameEnums::READING_LAST_CHUNK_TERM; 1419 goto label_reading_last_chunk_term; 1420 } 1421 } 1422 visitor_->ProcessChunkExtensions( 1423 extensions_start, extensions_length); 1424 } 1425 1426 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1427 goto bottom; // case BalsaFrameEnums::READING_CHUNK_EXTENSION 1428 1429 label_reading_chunk_data: 1430 case BalsaFrameEnums::READING_CHUNK_DATA: 1431 while (current < end) { 1432 if (chunk_length_remaining_ == 0) { 1433 break; 1434 } 1435 // read in the chunk 1436 size_t bytes_remaining = end - current; 1437 size_t consumed_bytes = (chunk_length_remaining_ < bytes_remaining) ? 1438 chunk_length_remaining_ : bytes_remaining; 1439 const char* tmp_current = current + consumed_bytes; 1440 visitor_->ProcessBodyInput(on_entry, tmp_current - on_entry); 1441 visitor_->ProcessBodyData(current, consumed_bytes); 1442 on_entry = current = tmp_current; 1443 chunk_length_remaining_ -= consumed_bytes; 1444 } 1445 if (chunk_length_remaining_ == 0) { 1446 parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM; 1447 goto label_reading_chunk_term; 1448 } 1449 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1450 goto bottom; // case BalsaFrameEnums::READING_CHUNK_DATA 1451 1452 label_reading_chunk_term: 1453 case BalsaFrameEnums::READING_CHUNK_TERM: 1454 while (current < end) { 1455 const char c = *current; 1456 ++current; 1457 1458 if (c == '\n') { 1459 parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH; 1460 goto label_reading_chunk_length; 1461 } 1462 } 1463 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1464 goto bottom; // case BalsaFrameEnums::READING_CHUNK_TERM 1465 1466 label_reading_last_chunk_term: 1467 case BalsaFrameEnums::READING_LAST_CHUNK_TERM: 1468 while (current < end) { 1469 const char c = *current; 1470 1471 if (!HeaderFramingFound(c)) { 1472 // If not, however, since the spec only suggests that the 1473 // client SHOULD indicate the presence of trailers, we get to 1474 // *test* that they did or didn't. 1475 // If all of the bytes we've seen since: 1476 // OPTIONAL_WS 0 OPTIONAL_STUFF CRLF 1477 // are either '\r', or '\n', then we can assume that we don't yet 1478 // know if we need to parse headers, or if the next byte will make 1479 // the HeaderFramingFound condition (above) true. 1480 if (HeaderFramingMayBeFound()) { 1481 // If true, then we have seen only characters '\r' or '\n'. 1482 ++current; 1483 1484 // Lets try again! There is no state change here. 1485 continue; 1486 } else { 1487 // If (!HeaderFramingMayBeFound()), then we know that we must be 1488 // reading the first non CRLF character of a trailer. 1489 parse_state_ = BalsaFrameEnums::READING_TRAILER; 1490 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1491 on_entry = current; 1492 goto label_reading_trailer; 1493 } 1494 } else { 1495 // If we've found a "\r\n\r\n", then the message 1496 // is done. 1497 ++current; 1498 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1499 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1500 visitor_->MessageDone(); 1501 goto bottom; 1502 } 1503 break; // from while loop 1504 } 1505 visitor_->ProcessBodyInput(on_entry, current - on_entry); 1506 goto bottom; // case BalsaFrameEnums::READING_LAST_CHUNK_TERM 1507 1508 label_reading_trailer: 1509 case BalsaFrameEnums::READING_TRAILER: 1510 while (current < end) { 1511 const char c = *current; 1512 ++current; 1513 // TODO(fenix): If we ever care about trailers as part of framing, 1514 // deal with them here (see below for part of the 'solution') 1515 // if (LineFramingFound(c)) { 1516 // trailer_lines_.push_back(make_pair(start_of_line_, 1517 // trailer_length_ - 1)); 1518 // start_of_line_ = trailer_length_; 1519 // } 1520 if (HeaderFramingFound(c)) { 1521 // ProcessTrailers(visitor_, &trailers_); 1522 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1523 visitor_->ProcessTrailerInput(on_entry, current - on_entry); 1524 visitor_->MessageDone(); 1525 goto bottom; 1526 } 1527 } 1528 visitor_->ProcessTrailerInput(on_entry, current - on_entry); 1529 break; // case BalsaFrameEnums::READING_TRAILER 1530 1531 // Note that there is no label: 1532 // 'label_reading_until_close' 1533 // here. This is because the state-machine exists immediately after 1534 // reading the headers instead of transitioning here (as it would 1535 // do if it was consuming all the data it could, all the time). 1536 case BalsaFrameEnums::READING_UNTIL_CLOSE: 1537 { 1538 const size_t bytes_remaining = end - current; 1539 if (bytes_remaining > 0) { 1540 visitor_->ProcessBodyInput(current, bytes_remaining); 1541 visitor_->ProcessBodyData(current, bytes_remaining); 1542 current += bytes_remaining; 1543 } 1544 } 1545 goto bottom; // case BalsaFrameEnums::READING_UNTIL_CLOSE 1546 1547 // label_reading_content: 1548 case BalsaFrameEnums::READING_CONTENT: 1549 #if DEBUGFRAMER 1550 LOG(INFO) << "ReadingContent: " << content_length_remaining_; 1551 #endif // DEBUGFRAMER 1552 while (content_length_remaining_ && current < end) { 1553 // read in the content 1554 const size_t bytes_remaining = end - current; 1555 const size_t consumed_bytes = 1556 (content_length_remaining_ < bytes_remaining) ? 1557 content_length_remaining_ : bytes_remaining; 1558 visitor_->ProcessBodyInput(current, consumed_bytes); 1559 visitor_->ProcessBodyData(current, consumed_bytes); 1560 current += consumed_bytes; 1561 content_length_remaining_ -= consumed_bytes; 1562 } 1563 if (content_length_remaining_ == 0) { 1564 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; 1565 visitor_->MessageDone(); 1566 } 1567 goto bottom; // case BalsaFrameEnums::READING_CONTENT 1568 1569 default: 1570 // The state-machine should never be in a state that isn't handled 1571 // above. This is a glaring logic error, and we should do something 1572 // drastic to ensure that this gets looked-at and fixed. 1573 LOG(FATAL) << "Unknown state: " << parse_state_ // COV_NF_LINE 1574 << " memory corruption?!"; // COV_NF_LINE 1575 } 1576 } 1577 bottom: 1578 #if DEBUGFRAMER 1579 LOG(INFO) << "\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n" 1580 << std::string(input, current) 1581 << "\n$$$$$$$$$$$$$$" 1582 << BalsaFrameEnums::ParseStateToString(parse_state_) 1583 << "$$$$$$$$$$$$$$$" 1584 << " consumed: " << (current - input); 1585 if (Error()) { 1586 LOG(INFO) << BalsaFrameEnums::ErrorCodeToString(ErrorCode()); 1587 } 1588 #endif // DEBUGFRAMER 1589 return current - input; 1590 } 1591 1592 const uint32 BalsaFrame::kValidTerm1; 1593 const uint32 BalsaFrame::kValidTerm1Mask; 1594 const uint32 BalsaFrame::kValidTerm2; 1595 const uint32 BalsaFrame::kValidTerm2Mask; 1596 1597 } // namespace net 1598