1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "net/http/http_content_disposition.h" 6 7 #include "base/base64.h" 8 #include "base/i18n/icu_string_conversions.h" 9 #include "base/logging.h" 10 #include "base/strings/string_tokenizer.h" 11 #include "base/strings/string_util.h" 12 #include "base/strings/sys_string_conversions.h" 13 #include "base/strings/utf_string_conversions.h" 14 #include "net/base/net_util.h" 15 #include "net/http/http_util.h" 16 #include "third_party/icu/source/common/unicode/ucnv.h" 17 18 namespace { 19 20 enum RFC2047EncodingType { 21 Q_ENCODING, 22 B_ENCODING 23 }; 24 25 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to 26 // decoding a quoted-printable string. Returns true if the input was valid. 27 bool DecodeQEncoding(const std::string& input, std::string* output) { 28 std::string temp; 29 temp.reserve(input.size()); 30 for (std::string::const_iterator it = input.begin(); it != input.end(); 31 ++it) { 32 if (*it == '_') { 33 temp.push_back(' '); 34 } else if (*it == '=') { 35 if ((input.end() - it < 3) || 36 !IsHexDigit(static_cast<unsigned char>(*(it + 1))) || 37 !IsHexDigit(static_cast<unsigned char>(*(it + 2)))) 38 return false; 39 unsigned char ch = HexDigitToInt(*(it + 1)) * 16 + 40 HexDigitToInt(*(it + 2)); 41 temp.push_back(static_cast<char>(ch)); 42 ++it; 43 ++it; 44 } else if (0x20 < *it && *it < 0x7F && *it != '?') { 45 // In a Q-encoded word, only printable ASCII characters 46 // represent themselves. Besides, space, '=', '_' and '?' are 47 // not allowed, but they're already filtered out. 48 DCHECK_NE('=', *it); 49 DCHECK_NE('?', *it); 50 DCHECK_NE('_', *it); 51 temp.push_back(*it); 52 } else { 53 return false; 54 } 55 } 56 output->swap(temp); 57 return true; 58 } 59 60 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding 61 // type is specified in |enc_type|. 62 bool DecodeBQEncoding(const std::string& part, 63 RFC2047EncodingType enc_type, 64 const std::string& charset, 65 std::string* output) { 66 std::string decoded; 67 if (!((enc_type == B_ENCODING) ? 68 base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) 69 return false; 70 71 if (decoded.empty()) { 72 output->clear(); 73 return true; 74 } 75 76 UErrorCode err = U_ZERO_ERROR; 77 UConverter* converter(ucnv_open(charset.c_str(), &err)); 78 if (U_FAILURE(err)) 79 return false; 80 81 // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8. 82 // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes 83 // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a 84 // trailing '\0'. 85 size_t output_length = decoded.length() * 3 + 1; 86 char* buf = WriteInto(output, output_length); 87 output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length, 88 decoded.data(), decoded.length(), &err); 89 ucnv_close(converter); 90 if (U_FAILURE(err)) 91 return false; 92 output->resize(output_length); 93 return true; 94 } 95 96 bool DecodeWord(const std::string& encoded_word, 97 const std::string& referrer_charset, 98 bool* is_rfc2047, 99 std::string* output, 100 int* parse_result_flags) { 101 *is_rfc2047 = false; 102 output->clear(); 103 if (encoded_word.empty()) 104 return true; 105 106 if (!IsStringASCII(encoded_word)) { 107 // Try UTF-8, referrer_charset and the native OS default charset in turn. 108 if (IsStringUTF8(encoded_word)) { 109 *output = encoded_word; 110 } else { 111 base::string16 utf16_output; 112 if (!referrer_charset.empty() && 113 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), 114 base::OnStringConversionError::FAIL, 115 &utf16_output)) { 116 *output = UTF16ToUTF8(utf16_output); 117 } else { 118 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); 119 } 120 } 121 122 *parse_result_flags |= net::HttpContentDisposition::HAS_NON_ASCII_STRINGS; 123 return true; 124 } 125 126 // RFC 2047 : one of encoding methods supported by Firefox and relatively 127 // widely used by web servers. 128 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. 129 // We don't care about the length restriction (72 bytes) because 130 // many web servers generate encoded words longer than the limit. 131 std::string decoded_word; 132 *is_rfc2047 = true; 133 int part_index = 0; 134 std::string charset; 135 base::StringTokenizer t(encoded_word, "?"); 136 RFC2047EncodingType enc_type = Q_ENCODING; 137 while (*is_rfc2047 && t.GetNext()) { 138 std::string part = t.token(); 139 switch (part_index) { 140 case 0: 141 if (part != "=") { 142 *is_rfc2047 = false; 143 break; 144 } 145 ++part_index; 146 break; 147 case 1: 148 // Do we need charset validity check here? 149 charset = part; 150 ++part_index; 151 break; 152 case 2: 153 if (part.size() > 1 || 154 part.find_first_of("bBqQ") == std::string::npos) { 155 *is_rfc2047 = false; 156 break; 157 } 158 if (part[0] == 'b' || part[0] == 'B') { 159 enc_type = B_ENCODING; 160 } 161 ++part_index; 162 break; 163 case 3: 164 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word); 165 if (!*is_rfc2047) { 166 // Last minute failure. Invalid B/Q encoding. Rather than 167 // passing it through, return now. 168 return false; 169 } 170 ++part_index; 171 break; 172 case 4: 173 if (part != "=") { 174 // Another last minute failure ! 175 // Likely to be a case of two encoded-words in a row or 176 // an encoded word followed by a non-encoded word. We can be 177 // generous, but it does not help much in terms of compatibility, 178 // I believe. Return immediately. 179 *is_rfc2047 = false; 180 return false; 181 } 182 ++part_index; 183 break; 184 default: 185 *is_rfc2047 = false; 186 return false; 187 } 188 } 189 190 if (*is_rfc2047) { 191 if (*(encoded_word.end() - 1) == '=') { 192 output->swap(decoded_word); 193 *parse_result_flags |= 194 net::HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS; 195 return true; 196 } 197 // encoded_word ending prematurelly with '?' or extra '?' 198 *is_rfc2047 = false; 199 return false; 200 } 201 202 // We're not handling 'especial' characters quoted with '\', but 203 // it should be Ok because we're not an email client but a 204 // web browser. 205 206 // What IE6/7 does: %-escaped UTF-8. 207 decoded_word = net::UnescapeURLComponent(encoded_word, 208 net::UnescapeRule::SPACES); 209 if (decoded_word != encoded_word) 210 *parse_result_flags |= 211 net::HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS; 212 if (IsStringUTF8(decoded_word)) { 213 output->swap(decoded_word); 214 return true; 215 // We can try either the OS default charset or 'origin charset' here, 216 // As far as I can tell, IE does not support it. However, I've seen 217 // web servers emit %-escaped string in a legacy encoding (usually 218 // origin charset). 219 // TODO(jungshik) : Test IE further and consider adding a fallback here. 220 } 221 return false; 222 } 223 224 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The 225 // value is supposed to be of the form: 226 // 227 // value = token | quoted-string 228 // 229 // However we currently also allow RFC 2047 encoding and non-ASCII 230 // strings. Non-ASCII strings are interpreted based on |referrer_charset|. 231 bool DecodeFilenameValue(const std::string& input, 232 const std::string& referrer_charset, 233 std::string* output, 234 int* parse_result_flags) { 235 int current_parse_result_flags = 0; 236 std::string decoded_value; 237 bool is_previous_token_rfc2047 = true; 238 239 // Tokenize with whitespace characters. 240 base::StringTokenizer t(input, " \t\n\r"); 241 t.set_options(base::StringTokenizer::RETURN_DELIMS); 242 while (t.GetNext()) { 243 if (t.token_is_delim()) { 244 // If the previous non-delimeter token is not RFC2047-encoded, 245 // put in a space in its place. Otheriwse, skip over it. 246 if (!is_previous_token_rfc2047) 247 decoded_value.push_back(' '); 248 continue; 249 } 250 // We don't support a single multibyte character split into 251 // adjacent encoded words. Some broken mail clients emit headers 252 // with that problem, but most web servers usually encode a filename 253 // in a single encoded-word. Firefox/Thunderbird do not support 254 // it, either. 255 std::string decoded; 256 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, 257 &decoded, ¤t_parse_result_flags)) 258 return false; 259 decoded_value.append(decoded); 260 } 261 output->swap(decoded_value); 262 if (parse_result_flags && !output->empty()) 263 *parse_result_flags |= current_parse_result_flags; 264 return true; 265 } 266 267 // Parses the charset and value-chars out of an ext-value string. 268 // 269 // ext-value = charset "'" [ language ] "'" value-chars 270 bool ParseExtValueComponents(const std::string& input, 271 std::string* charset, 272 std::string* value_chars) { 273 base::StringTokenizer t(input, "'"); 274 t.set_options(base::StringTokenizer::RETURN_DELIMS); 275 std::string temp_charset; 276 std::string temp_value; 277 int numDelimsSeen = 0; 278 while (t.GetNext()) { 279 if (t.token_is_delim()) { 280 ++numDelimsSeen; 281 continue; 282 } else { 283 switch (numDelimsSeen) { 284 case 0: 285 temp_charset = t.token(); 286 break; 287 case 1: 288 // Language is ignored. 289 break; 290 case 2: 291 temp_value = t.token(); 292 break; 293 default: 294 return false; 295 } 296 } 297 } 298 if (numDelimsSeen != 2) 299 return false; 300 if (temp_charset.empty() || temp_value.empty()) 301 return false; 302 charset->swap(temp_charset); 303 value_chars->swap(temp_value); 304 return true; 305 } 306 307 // http://tools.ietf.org/html/rfc5987#section-3.2 308 // 309 // ext-value = charset "'" [ language ] "'" value-chars 310 // 311 // charset = "UTF-8" / "ISO-8859-1" / mime-charset 312 // 313 // mime-charset = 1*mime-charsetc 314 // mime-charsetc = ALPHA / DIGIT 315 // / "!" / "#" / "$" / "%" / "&" 316 // / "+" / "-" / "^" / "_" / "`" 317 // / "{" / "}" / "~" 318 // 319 // language = <Language-Tag, defined in [RFC5646], Section 2.1> 320 // 321 // value-chars = *( pct-encoded / attr-char ) 322 // 323 // pct-encoded = "%" HEXDIG HEXDIG 324 // 325 // attr-char = ALPHA / DIGIT 326 // / "!" / "#" / "$" / "&" / "+" / "-" / "." 327 // / "^" / "_" / "`" / "|" / "~" 328 bool DecodeExtValue(const std::string& param_value, std::string* decoded) { 329 if (param_value.find('"') != std::string::npos) 330 return false; 331 332 std::string charset; 333 std::string value; 334 if (!ParseExtValueComponents(param_value, &charset, &value)) 335 return false; 336 337 // RFC 5987 value should be ASCII-only. 338 if (!IsStringASCII(value)) { 339 decoded->clear(); 340 return true; 341 } 342 343 std::string unescaped = net::UnescapeURLComponent( 344 value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); 345 346 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); 347 } 348 349 } // namespace 350 351 namespace net { 352 353 HttpContentDisposition::HttpContentDisposition( 354 const std::string& header, const std::string& referrer_charset) 355 : type_(INLINE), 356 parse_result_flags_(INVALID) { 357 Parse(header, referrer_charset); 358 } 359 360 HttpContentDisposition::~HttpContentDisposition() { 361 } 362 363 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType( 364 std::string::const_iterator begin, std::string::const_iterator end) { 365 DCHECK(type_ == INLINE); 366 std::string::const_iterator delimiter = std::find(begin, end, ';'); 367 368 std::string::const_iterator type_begin = begin; 369 std::string::const_iterator type_end = delimiter; 370 HttpUtil::TrimLWS(&type_begin, &type_end); 371 372 // If the disposition-type isn't a valid token the then the 373 // Content-Disposition header is malformed, and we treat the first bytes as 374 // a parameter rather than a disposition-type. 375 if (!HttpUtil::IsToken(type_begin, type_end)) 376 return begin; 377 378 parse_result_flags_ |= HAS_DISPOSITION_TYPE; 379 380 DCHECK(std::find(type_begin, type_end, '=') == type_end); 381 382 if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) { 383 type_ = INLINE; 384 } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) { 385 type_ = ATTACHMENT; 386 } else { 387 parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE; 388 type_ = ATTACHMENT; 389 } 390 return delimiter; 391 } 392 393 // http://tools.ietf.org/html/rfc6266 394 // 395 // content-disposition = "Content-Disposition" ":" 396 // disposition-type *( ";" disposition-parm ) 397 // 398 // disposition-type = "inline" | "attachment" | disp-ext-type 399 // ; case-insensitive 400 // disp-ext-type = token 401 // 402 // disposition-parm = filename-parm | disp-ext-parm 403 // 404 // filename-parm = "filename" "=" value 405 // | "filename*" "=" ext-value 406 // 407 // disp-ext-parm = token "=" value 408 // | ext-token "=" ext-value 409 // ext-token = <the characters in token, followed by "*"> 410 // 411 void HttpContentDisposition::Parse(const std::string& header, 412 const std::string& referrer_charset) { 413 DCHECK(type_ == INLINE); 414 DCHECK(filename_.empty()); 415 416 std::string::const_iterator pos = header.begin(); 417 std::string::const_iterator end = header.end(); 418 pos = ConsumeDispositionType(pos, end); 419 420 std::string name; 421 std::string filename; 422 std::string ext_filename; 423 424 HttpUtil::NameValuePairsIterator iter(pos, end, ';'); 425 while (iter.GetNext()) { 426 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), 427 iter.name_end(), 428 "filename")) { 429 DecodeFilenameValue(iter.value(), referrer_charset, &filename, 430 &parse_result_flags_); 431 if (!filename.empty()) 432 parse_result_flags_ |= HAS_FILENAME; 433 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(), 434 iter.name_end(), 435 "name")) { 436 DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL); 437 if (!name.empty()) 438 parse_result_flags_ |= HAS_NAME; 439 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), 440 iter.name_end(), 441 "filename*")) { 442 DecodeExtValue(iter.raw_value(), &ext_filename); 443 if (!ext_filename.empty()) 444 parse_result_flags_ |= HAS_EXT_FILENAME; 445 } 446 } 447 448 if (!ext_filename.empty()) 449 filename_ = ext_filename; 450 else if (!filename.empty()) 451 filename_ = filename; 452 else 453 filename_ = name; 454 } 455 456 } // namespace net 457