// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/extensions/api/web_request/form_data_parser.h"

#include <vector>

#include "base/lazy_instance.h"
#include "base/strings/string_util.h"
#include "base/values.h"
#include "net/base/escape.h"
#include "net/url_request/url_request.h"
#include "third_party/re2/re2/re2.h"

using base::DictionaryValue;
using base::ListValue;
using base::StringPiece;
using re2::RE2;

namespace extensions {

namespace {

// The header name (including the colon) that introduces a body part's
// content disposition. Kept as a macro so that it can be spliced into a regexp
// string literal and measured with sizeof() at compile time. It is #undef'd
// right after the Patterns constructor.
#define CONTENT_DISPOSITION "content-disposition:"

// As a regexp this matches the two-character sequence backslash + 'E', i.e.,
// the token which closes a "\\Q...\\E" quote. It is used both to build
// |Patterns::unquote_pattern| and as the text inserted into a boundary pattern
// by FormDataParserMultipart::CreateBoundaryPatternFromLiteral.
static const char g_escape_closing_quote[] = "\\\\E";
// Length of CONTENT_DISPOSITION without the terminating NUL. Used by
// FormDataParserMultipart::TryReadHeader as the offset at which the search for
// the "name=" and "filename=" fields starts.
static const size_t g_content_disposition_length =
    sizeof(CONTENT_DISPOSITION) - 1;

// A wrapper struct for static RE2 objects to be held as LazyInstance.
struct Patterns {
  Patterns();
  ~Patterns();
  // Any number of spaces or tabs followed by CRLF.
  const RE2 transfer_padding_pattern;
  // A single CRLF.
  const RE2 crlf_pattern;
  // Two dashes followed by any number of spaces or tabs; tested right after a
  // dash-boundary to recognise the close-delimiter.
  const RE2 closing_pattern;
  // Either the empty string, or CRLF followed by arbitrary characters
  // (including line breaks).
  const RE2 epilogue_pattern;
  // A possibly empty run of characters which contains no CRLF: each step is
  // either a non-CR character, or one or more CRs followed by a character
  // which is neither CR nor LF.
  const RE2 crlf_free_pattern;
  // At least one character, matched non-greedily. SetSource() applies it
  // repeatedly until the input starts with the dash-boundary.
  const RE2 preamble_pattern;
  // One header: a non-empty field name made of printable US-ASCII characters
  // other than ':', then ':', then the field body in which CRLF may only occur
  // when followed by a tab or a space (folding), and the terminating CRLF.
  const RE2 header_pattern;
  // Case-insensitive match of CONTENT_DISPOSITION.
  const RE2 content_disposition_pattern;
  // Captures the double-quoted value of the "name" field.
  const RE2 name_pattern;
  // Captures the double-quoted value of the "filename" field.
  const RE2 value_pattern;
  // Matches a literal backslash followed by 'E', see g_escape_closing_quote.
  const RE2 unquote_pattern;
  // A single URL-encoded "name=value" pair; name and value are the two
  // capturing groups and each may be empty.
  const RE2 url_encoded_pattern;
};

Patterns::Patterns()
    : transfer_padding_pattern("[ \\t]*\\r\\n"),
      crlf_pattern("\\r\\n"),
      closing_pattern("--[ \\t]*"),
      epilogue_pattern("|\\r\\n(?s:.)*"),
      crlf_free_pattern("(?:[^\\r]|\\r+[^\\r\\n])*"),
      preamble_pattern(".+?"),
      header_pattern("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"),
      content_disposition_pattern("(?i:" CONTENT_DISPOSITION ")"),
      name_pattern("\\bname=\"([^\"]*)\""),
      value_pattern("\\bfilename=\"([^\"]*)\""),
      unquote_pattern(g_escape_closing_quote),
      // CHARACTER is an allowed character in a URL encoding. Definition is from
      // RFC 1738, end of section 2.2.
#define CHARACTER "(?:[a-zA-Z0-9$_.+!*'(),]|-|(?:%[a-fA-F0-9]{2}))"
      url_encoded_pattern("(" CHARACTER "*)=(" CHARACTER "*)") {}
#undef CHARACTER

#undef CONTENT_DISPOSITION

Patterns::~Patterns() {}

// The single, lazily constructed and intentionally leaked set of patterns
// shared by all parsers in this file.
static base::LazyInstance<Patterns>::Leaky g_patterns =
    LAZY_INSTANCE_INITIALIZER;

}  // namespace

// Parses URLencoded forms, see
// http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
// Only a single source is accepted: a second call to SetSource() fails.
class FormDataParserUrlEncoded : public FormDataParser {
 public:
  FormDataParserUrlEncoded();
  virtual ~FormDataParserUrlEncoded();

  // Implementation of FormDataParser.
  virtual bool AllDataReadOK() OVERRIDE;
  virtual bool GetNextNameValue(Result* result) OVERRIDE;
  virtual bool SetSource(const base::StringPiece& source) OVERRIDE;

 private:
  // The pattern to match a single name-value pair. This could be even static,
  // but then we would have to spend more code on initializing the cached
  // pointer to g_patterns.Get().
  const RE2& pattern() const {
    return patterns_->url_encoded_pattern;
  }

  // Auxiliary constant for using RE2. Number of arguments for parsing
  // name-value pairs (one for name, one for value).
  static const size_t args_size_ = 2u;
  // Rules applied when unescaping both the parsed name and the parsed value.
  static const net::UnescapeRule::Type unescape_rules_;

  // The part of the input which has not been consumed yet.
  re2::StringPiece source_;
  // True once SetSource() has been called.
  bool source_set_;
  // True once a name-value pair was found to be followed by something else
  // than '&' or the end of the input.
  bool source_malformed_;

  // Auxiliary store for using RE2. |arg_name_| and |arg_value_| wrap pointers
  // to |name_| and |value_|, and |args_| lists them in the order of the
  // capturing groups of pattern().
  std::string name_;
  std::string value_;
  const RE2::Arg arg_name_;
  const RE2::Arg arg_value_;
  const RE2::Arg* args_[args_size_];

  // Caching the pointer to g_patterns.Get().
  const Patterns* patterns_;

  DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);
};

// The following class, FormDataParserMultipart, parses forms encoded as
// multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart
// encoding) and 5322 (MIME-headers).
//
// Implementation details
//
// The original grammar from RFC 2046 is this, "multipart-body" being the root
// non-terminal:
//
// boundary := 0*69<bchars> bcharsnospace
// bchars := bcharsnospace / " "
// bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","
//                  / "-" / "." / "/" / ":" / "=" / "?"
// dash-boundary := "--" boundary
// multipart-body := [preamble CRLF]
//                   dash-boundary transport-padding CRLF
//                   body-part *encapsulation
//                   close-delimiter transport-padding
//                   [CRLF epilogue]
// transport-padding := *LWSP-char
// encapsulation := delimiter transport-padding CRLF body-part
// delimiter := CRLF dash-boundary
// close-delimiter := delimiter "--"
// preamble := discard-text
// epilogue := discard-text
// discard-text := *(*text CRLF) *text
// body-part := MIME-part-headers [CRLF *OCTET]
// OCTET := <any 0-255 octet value>
//
// Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,
// DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the
// English alphabet, respectively.
// The non-terminal "text" is presumably just any text, excluding line breaks.
// The non-terminal "LWSP-char" is not directly defined in the original grammar
// but it means "linear whitespace", which is a space or a horizontal tab.
// The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use
// the syntax for "optional fields" from Section 3.6.8 of RFC 5322:
//
// MIME-part-headers := field-name ":" unstructured CRLF
// field-name := 1*ftext
// ftext := %d33-57 /          ; Printable US-ASCII
//          %d59-126           ; characters not including ":".
// Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which
// does not contain a CRLF sub-string, except for substrings "CRLF<space>" and
// "CRLF<horizontal tab>", which serve for "folding".
//
// The FormDataParserMultipart class reads the input source and tries to parse
// it according to the grammar above, rooted at the "multipart-body"
// non-terminal. This happens in stages:
//
// 1. The optional preamble and the initial dash-boundary with transport padding
//    and a CRLF are read and ignored.
//
// 2. Repeatedly each body part is read. The body parts can either serve to
//    upload a file, or just a string of bytes.
// 2.a. The headers of that part are searched for the "content-disposition"
//      header, which contains the name of the value represented by that body
//      part. If the body-part is for file upload, that header also contains a
//      filename.
// 2.b. The "*OCTET" part of the body part is then read and passed as the value
//      of the name-value pair for body parts representing a string of bytes.
//      For body parts for uploading a file the "*OCTET" part is just ignored
//      and the filename is used for value instead.
//
// 3. The final close-delimiter and epilogue are read and ignored.
//
// IMPORTANT NOTE
// This parser supports multiple sources, i.e., SetSource can be called multiple
// times if the input is spread over several byte blocks. However, the split
// may only occur inside a body part, right after the trailing CRLF of headers.
class FormDataParserMultipart : public FormDataParser {
 public:
  // |boundary_separator| is the boundary literal, without the leading "--".
  explicit FormDataParserMultipart(const std::string& boundary_separator);
  virtual ~FormDataParserMultipart();

  // Implementation of FormDataParser.
  virtual bool AllDataReadOK() OVERRIDE;
  virtual bool GetNextNameValue(Result* result) OVERRIDE;
  virtual bool SetSource(const base::StringPiece& source) OVERRIDE;

 private:
  // The parser starts in STATE_INIT, or in STATE_ERROR if the boundary pattern
  // could not be compiled. AllDataReadOK() reports success only in
  // STATE_FINISHED.
  enum State {
    STATE_INIT,      // No input read yet.
    STATE_READY,     // Ready to call GetNextNameValue.
    STATE_FINISHED,  // Read the input until the end.
    STATE_SUSPEND,   // Waiting until a new |source_| is set.
    STATE_ERROR
  };

  // Produces a regexp to match the string "--" + |literal|. The idea is to
  // represent "--" + |literal| as a "quoted pattern", a verbatim copy enclosed
  // in "\\Q" and "\\E". The only catch is to watch out for occurrences of "\\E"
  // inside |literal|. Those must be excluded from the quote and the backslash
  // doubly escaped. For example, for literal == "abc\\Edef" the result is
  // "\\Q--abc\\E\\\\E\\Qdef\\E".
  static std::string CreateBoundaryPatternFromLiteral(
      const std::string& literal);

  // Tests whether |input| has a prefix matching |pattern|.
  static bool StartsWithPattern(const re2::StringPiece& input,
                                const RE2& pattern);

  // If |source_| starts with a header, seeks |source_| beyond the header. If
  // the header is Content-Disposition, extracts |name| from "name=" and
  // possibly |value| from "filename=" fields of that header. Only if the
  // "name" or "filename" fields are found, then |name| or |value| are touched.
  // Returns true iff |source_| is seeked forward. Sets |value_assigned|
  // to true iff |value| has been assigned to.
  bool TryReadHeader(base::StringPiece* name,
                     base::StringPiece* value,
                     bool* value_assigned);

  // Helper to GetNextNameValue. Expects that the input starts with a data
  // portion of a body part. An attempt is made to read the input until the end
  // of that body part. If |data| is not NULL, it is set to contain the data
  // portion. Returns true iff the reading was successful.
  bool FinishReadingPart(base::StringPiece* data);

  // These methods could be even static, but then we would have to spend more
  // code on initializing the cached pointer to g_patterns.Get().
  // Note that the grammar above calls "transport-padding" what is named
  // "transfer padding" here.
  const RE2& transfer_padding_pattern() const {
    return patterns_->transfer_padding_pattern;
  }
  const RE2& crlf_pattern() const {
    return patterns_->crlf_pattern;
  }
  const RE2& closing_pattern() const {
    return patterns_->closing_pattern;
  }
  const RE2& epilogue_pattern() const {
    return patterns_->epilogue_pattern;
  }
  const RE2& crlf_free_pattern() const {
    return patterns_->crlf_free_pattern;
  }
  const RE2& preamble_pattern() const {
    return patterns_->preamble_pattern;
  }
  const RE2& header_pattern() const {
    return patterns_->header_pattern;
  }
  const RE2& content_disposition_pattern() const {
    return patterns_->content_disposition_pattern;
  }
  const RE2& name_pattern() const {
    return patterns_->name_pattern;
  }
  const RE2& value_pattern() const {
    return patterns_->value_pattern;
  }
  // However, this is used in a static method so it needs to be static.
  static const RE2& unquote_pattern() {
    return g_patterns.Get().unquote_pattern;  // No caching g_patterns here.
  }

  // Matches the dash-boundary, i.e., "--" followed by the boundary literal
  // passed to the constructor.
  const RE2 dash_boundary_pattern_;

  // Because of initialisation dependency, |state_| needs to be declared after
  // |dash_boundary_pattern_|.
  State state_;

  // The parsed message can be split into multiple sources which we read
  // sequentially. This is the not yet consumed rest of the current one.
  re2::StringPiece source_;

  // Caching the pointer to g_patterns.Get().
  const Patterns* patterns_;

  DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);
};

// Implementation of FormDataParser and FormDataParser::Result.
290 291 FormDataParser::Result::Result() {} 292 FormDataParser::Result::~Result() {} 293 294 FormDataParser::~FormDataParser() {} 295 296 // static 297 scoped_ptr<FormDataParser> FormDataParser::Create( 298 const net::URLRequest& request) { 299 std::string value; 300 const bool found = request.extra_request_headers().GetHeader( 301 net::HttpRequestHeaders::kContentType, &value); 302 return CreateFromContentTypeHeader(found ? &value : NULL); 303 } 304 305 // static 306 scoped_ptr<FormDataParser> FormDataParser::CreateFromContentTypeHeader( 307 const std::string* content_type_header) { 308 enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE}; 309 ParserChoice choice = ERROR_CHOICE; 310 std::string boundary; 311 312 if (content_type_header == NULL) { 313 choice = URL_ENCODED; 314 } else { 315 const std::string content_type( 316 content_type_header->substr(0, content_type_header->find(';'))); 317 318 if (base::strcasecmp( 319 content_type.c_str(), "application/x-www-form-urlencoded") == 0) { 320 choice = URL_ENCODED; 321 } else if (base::strcasecmp( 322 content_type.c_str(), "multipart/form-data") == 0) { 323 static const char kBoundaryString[] = "boundary="; 324 size_t offset = content_type_header->find(kBoundaryString); 325 if (offset == std::string::npos) { 326 // Malformed header. 327 return scoped_ptr<FormDataParser>(); 328 } 329 offset += sizeof(kBoundaryString) - 1; 330 boundary = content_type_header->substr( 331 offset, content_type_header->find(';', offset)); 332 if (!boundary.empty()) 333 choice = MULTIPART; 334 } 335 } 336 // Other cases are unparseable, including when |content_type| is "text/plain". 
337 338 switch (choice) { 339 case URL_ENCODED: 340 return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded()); 341 case MULTIPART: 342 return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary)); 343 default: // In other words, case ERROR_CHOICE: 344 return scoped_ptr<FormDataParser>(); 345 } 346 } 347 348 FormDataParser::FormDataParser() {} 349 350 // Implementation of FormDataParserUrlEncoded. 351 352 const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ = 353 net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS | 354 net::UnescapeRule::SPACES | net::UnescapeRule::REPLACE_PLUS_WITH_SPACE; 355 356 FormDataParserUrlEncoded::FormDataParserUrlEncoded() 357 : source_(NULL), 358 source_set_(false), 359 source_malformed_(false), 360 arg_name_(&name_), 361 arg_value_(&value_), 362 patterns_(&(g_patterns.Get())) { 363 args_[0] = &arg_name_; 364 args_[1] = &arg_value_; 365 } 366 367 FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {} 368 369 bool FormDataParserUrlEncoded::AllDataReadOK() { 370 // All OK means we read the whole source. 371 return source_set_ && source_.size() == 0 && !source_malformed_; 372 } 373 374 bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) { 375 if (!source_set_ || source_malformed_) 376 return false; 377 378 bool success = RE2::ConsumeN(&source_, pattern(), args_, args_size_); 379 if (success) { 380 result->set_name(net::UnescapeURLComponent(name_, unescape_rules_)); 381 result->set_value(net::UnescapeURLComponent(value_, unescape_rules_)); 382 } 383 if (source_.length() > 0) { 384 if (source_[0] == '&') 385 source_.remove_prefix(1); // Remove the trailing '&'. 386 else 387 source_malformed_ = true; // '&' missing between two name-value pairs. 388 } 389 return success && !source_malformed_; 390 } 391 392 bool FormDataParserUrlEncoded::SetSource(const base::StringPiece& source) { 393 if (source_set_) 394 return false; // We do not allow multiple sources for this parser. 
395 source_.set(source.data(), source.size()); 396 source_set_ = true; 397 source_malformed_ = false; 398 return true; 399 } 400 401 // Implementation of FormDataParserMultipart. 402 403 // static 404 std::string FormDataParserMultipart::CreateBoundaryPatternFromLiteral( 405 const std::string& literal) { 406 #define OPEN_QUOTE "\\Q" 407 static const char quote[] = OPEN_QUOTE; 408 static const char unquote[] = "\\E"; 409 410 // The result always starts with opening the qoute and then "--". 411 std::string result(OPEN_QUOTE "--"); 412 #undef OPEN_QUOTE 413 414 // This StringPiece is used below to record the next occurrence of "\\E" in 415 // |literal|. 416 re2::StringPiece seek_unquote(literal); 417 const char* copy_start = literal.data(); 418 size_t copy_length = literal.size(); 419 420 // Find all "\\E" in |literal| and exclude them from the \Q...\E quote. 421 while (RE2::FindAndConsume(&seek_unquote, unquote_pattern())) { 422 copy_length = seek_unquote.data() - copy_start; 423 result.append(copy_start, copy_length); 424 result.append(g_escape_closing_quote); 425 result.append(quote); 426 copy_start = seek_unquote.data(); 427 } 428 429 // Finish the last \Q...\E quote. 430 copy_length = (literal.data() + literal.size()) - copy_start; 431 result.append(copy_start, copy_length); 432 result.append(unquote); 433 return result; 434 } 435 436 // static 437 bool FormDataParserMultipart::StartsWithPattern(const re2::StringPiece& input, 438 const RE2& pattern) { 439 return pattern.Match(input, 0, input.size(), RE2::ANCHOR_START, NULL, 0); 440 } 441 442 FormDataParserMultipart::FormDataParserMultipart( 443 const std::string& boundary_separator) 444 : dash_boundary_pattern_( 445 CreateBoundaryPatternFromLiteral(boundary_separator)), 446 state_(dash_boundary_pattern_.ok() ? 
STATE_INIT : STATE_ERROR), 447 patterns_(&(g_patterns.Get())) {} 448 449 FormDataParserMultipart::~FormDataParserMultipart() {} 450 451 bool FormDataParserMultipart::AllDataReadOK() { 452 return state_ == STATE_FINISHED; 453 } 454 455 bool FormDataParserMultipart::FinishReadingPart(base::StringPiece* data) { 456 const char* data_start = source_.data(); 457 while (!StartsWithPattern(source_, dash_boundary_pattern_)) { 458 if (!RE2::Consume(&source_, crlf_free_pattern()) || 459 !RE2::Consume(&source_, crlf_pattern())) { 460 state_ = STATE_ERROR; 461 return false; 462 } 463 } 464 if (data != NULL) { 465 if (source_.data() == data_start) { 466 // No data in this body part. 467 state_ = STATE_ERROR; 468 return false; 469 } 470 // Subtract 2u for the trailing "\r\n". 471 data->set(data_start, source_.data() - data_start - 2u); 472 } 473 474 // Finally, read the dash-boundary and either skip to the next body part, or 475 // finish reading the source. 476 CHECK(RE2::Consume(&source_, dash_boundary_pattern_)); 477 if (StartsWithPattern(source_, closing_pattern())) { 478 CHECK(RE2::Consume(&source_, closing_pattern())); 479 if (RE2::Consume(&source_, epilogue_pattern())) 480 state_ = STATE_FINISHED; 481 else 482 state_ = STATE_ERROR; 483 } else { // Next body part ahead. 484 if (!RE2::Consume(&source_, transfer_padding_pattern())) 485 state_ = STATE_ERROR; 486 } 487 return state_ != STATE_ERROR; 488 } 489 490 bool FormDataParserMultipart::GetNextNameValue(Result* result) { 491 if (source_.size() == 0 || state_ != STATE_READY) 492 return false; 493 494 // 1. Read body-part headers. 495 base::StringPiece name; 496 base::StringPiece value; 497 bool value_assigned = false; 498 bool value_assigned_temp; 499 while (TryReadHeader(&name, &value, &value_assigned_temp)) 500 value_assigned |= value_assigned_temp; 501 if (name.size() == 0 || state_ == STATE_ERROR) { 502 state_ = STATE_ERROR; 503 return false; 504 } 505 506 // 2. Read the trailing CRLF after headers. 
507 if (!RE2::Consume(&source_, crlf_pattern())) { 508 state_ = STATE_ERROR; 509 return false; 510 } 511 512 // 3. Read the data of this body part, i.e., everything until the first 513 // dash-boundary. 514 bool return_value; 515 if (value_assigned && source_.size() == 0) { // Wait for a new source? 516 return_value = true; 517 state_ = STATE_SUSPEND; 518 } else { 519 return_value = FinishReadingPart(value_assigned ? NULL : &value); 520 } 521 522 std::string unescaped_name = net::UnescapeURLComponent( 523 name.as_string(), 524 net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS); 525 result->set_name(unescaped_name); 526 result->set_value(value); 527 528 return return_value; 529 } 530 531 bool FormDataParserMultipart::SetSource(const base::StringPiece& source) { 532 if (source.data() == NULL || source_.size() != 0) 533 return false; 534 source_.set(source.data(), source.size()); 535 536 switch (state_) { 537 case STATE_INIT: 538 // Seek behind the preamble. 539 while (!StartsWithPattern(source_, dash_boundary_pattern_)) { 540 if (!RE2::Consume(&source_, preamble_pattern())) { 541 state_ = STATE_ERROR; 542 break; 543 } 544 } 545 // Read dash-boundary, transfer padding, and CRLF. 546 if (state_ != STATE_ERROR) { 547 if (!RE2::Consume(&source_, dash_boundary_pattern_) || 548 !RE2::Consume(&source_, transfer_padding_pattern())) 549 state_ = STATE_ERROR; 550 else 551 state_ = STATE_READY; 552 } 553 break; 554 case STATE_READY: // Nothing to do. 555 break; 556 case STATE_SUSPEND: 557 state_ = FinishReadingPart(NULL) ? 
STATE_READY : STATE_ERROR; 558 break; 559 default: 560 state_ = STATE_ERROR; 561 } 562 return state_ != STATE_ERROR; 563 } 564 565 bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name, 566 base::StringPiece* value, 567 bool* value_assigned) { 568 *value_assigned = false; 569 const char* header_start = source_.data(); 570 if (!RE2::Consume(&source_, header_pattern())) 571 return false; 572 // (*) After this point we must return true, because we consumed one header. 573 574 // Subtract 2u for the trailing "\r\n". 575 re2::StringPiece header(header_start, source_.data() - header_start - 2u); 576 577 if (!StartsWithPattern(header, content_disposition_pattern())) 578 return true; // Skip headers that don't describe the content-disposition. 579 580 re2::StringPiece groups[2u]; 581 582 if (!name_pattern().Match(header, 583 g_content_disposition_length, header.size(), 584 RE2::UNANCHORED, groups, 2)) { 585 state_ = STATE_ERROR; 586 return true; // See (*) for why true. 587 } 588 name->set(groups[1].data(), groups[1].size()); 589 590 if (value_pattern().Match(header, 591 g_content_disposition_length, header.size(), 592 RE2::UNANCHORED, groups, 2)) { 593 value->set(groups[1].data(), groups[1].size()); 594 *value_assigned = true; 595 } 596 return true; 597 } 598 599 } // namespace extensions 600