Home | History | Annotate | Download | only in http
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "net/http/http_content_disposition.h"
      6 
      7 #include "base/base64.h"
      8 #include "base/i18n/icu_string_conversions.h"
      9 #include "base/logging.h"
     10 #include "base/strings/string_tokenizer.h"
     11 #include "base/strings/string_util.h"
     12 #include "base/strings/sys_string_conversions.h"
     13 #include "base/strings/utf_string_conversions.h"
     14 #include "net/base/net_util.h"
     15 #include "net/http/http_util.h"
     16 #include "third_party/icu/source/common/unicode/ucnv.h"
     17 
     18 namespace {
     19 
     20 enum RFC2047EncodingType {
     21   Q_ENCODING,
     22   B_ENCODING
     23 };
     24 
     25 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
     26 // decoding a quoted-printable string.  Returns true if the input was valid.
     27 bool DecodeQEncoding(const std::string& input, std::string* output) {
     28   std::string temp;
     29   temp.reserve(input.size());
     30   for (std::string::const_iterator it = input.begin(); it != input.end();
     31        ++it) {
     32     if (*it == '_') {
     33       temp.push_back(' ');
     34     } else if (*it == '=') {
     35       if ((input.end() - it < 3) ||
     36           !IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
     37           !IsHexDigit(static_cast<unsigned char>(*(it + 2))))
     38         return false;
     39       unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +
     40                          HexDigitToInt(*(it + 2));
     41       temp.push_back(static_cast<char>(ch));
     42       ++it;
     43       ++it;
     44     } else if (0x20 < *it && *it < 0x7F && *it != '?') {
     45       // In a Q-encoded word, only printable ASCII characters
     46       // represent themselves. Besides, space, '=', '_' and '?' are
     47       // not allowed, but they're already filtered out.
     48       DCHECK_NE('=', *it);
     49       DCHECK_NE('?', *it);
     50       DCHECK_NE('_', *it);
     51       temp.push_back(*it);
     52     } else {
     53       return false;
     54     }
     55   }
     56   output->swap(temp);
     57   return true;
     58 }
     59 
     60 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
     61 // type is specified in |enc_type|.
     62 bool DecodeBQEncoding(const std::string& part,
     63                       RFC2047EncodingType enc_type,
     64                       const std::string& charset,
     65                       std::string* output) {
     66   std::string decoded;
     67   if (!((enc_type == B_ENCODING) ?
     68         base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded)))
     69     return false;
     70 
     71   if (decoded.empty()) {
     72     output->clear();
     73     return true;
     74   }
     75 
     76   UErrorCode err = U_ZERO_ERROR;
     77   UConverter* converter(ucnv_open(charset.c_str(), &err));
     78   if (U_FAILURE(err))
     79     return false;
     80 
     81   // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.
     82   // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes
     83   // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a
     84   // trailing '\0'.
     85   size_t output_length = decoded.length() * 3 + 1;
     86   char* buf = WriteInto(output, output_length);
     87   output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length,
     88                                      decoded.data(), decoded.length(), &err);
     89   ucnv_close(converter);
     90   if (U_FAILURE(err))
     91     return false;
     92   output->resize(output_length);
     93   return true;
     94 }
     95 
     96 bool DecodeWord(const std::string& encoded_word,
     97                 const std::string& referrer_charset,
     98                 bool* is_rfc2047,
     99                 std::string* output,
    100                 int* parse_result_flags) {
    101   *is_rfc2047 = false;
    102   output->clear();
    103   if (encoded_word.empty())
    104     return true;
    105 
    106   if (!IsStringASCII(encoded_word)) {
    107     // Try UTF-8, referrer_charset and the native OS default charset in turn.
    108     if (IsStringUTF8(encoded_word)) {
    109       *output = encoded_word;
    110     } else {
    111       base::string16 utf16_output;
    112       if (!referrer_charset.empty() &&
    113           base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),
    114                                 base::OnStringConversionError::FAIL,
    115                                 &utf16_output)) {
    116         *output = UTF16ToUTF8(utf16_output);
    117       } else {
    118         *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
    119       }
    120     }
    121 
    122     *parse_result_flags |= net::HttpContentDisposition::HAS_NON_ASCII_STRINGS;
    123     return true;
    124   }
    125 
    126   // RFC 2047 : one of encoding methods supported by Firefox and relatively
    127   // widely used by web servers.
    128   // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
    129   // We don't care about the length restriction (72 bytes) because
    130   // many web servers generate encoded words longer than the limit.
    131   std::string decoded_word;
    132   *is_rfc2047 = true;
    133   int part_index = 0;
    134   std::string charset;
    135   base::StringTokenizer t(encoded_word, "?");
    136   RFC2047EncodingType enc_type = Q_ENCODING;
    137   while (*is_rfc2047 && t.GetNext()) {
    138     std::string part = t.token();
    139     switch (part_index) {
    140       case 0:
    141         if (part != "=") {
    142           *is_rfc2047 = false;
    143           break;
    144         }
    145         ++part_index;
    146         break;
    147       case 1:
    148         // Do we need charset validity check here?
    149         charset = part;
    150         ++part_index;
    151         break;
    152       case 2:
    153         if (part.size() > 1 ||
    154             part.find_first_of("bBqQ") == std::string::npos) {
    155           *is_rfc2047 = false;
    156           break;
    157         }
    158         if (part[0] == 'b' || part[0] == 'B') {
    159           enc_type = B_ENCODING;
    160         }
    161         ++part_index;
    162         break;
    163       case 3:
    164         *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
    165         if (!*is_rfc2047) {
    166           // Last minute failure. Invalid B/Q encoding. Rather than
    167           // passing it through, return now.
    168           return false;
    169         }
    170         ++part_index;
    171         break;
    172       case 4:
    173         if (part != "=") {
    174           // Another last minute failure !
    175           // Likely to be a case of two encoded-words in a row or
    176           // an encoded word followed by a non-encoded word. We can be
    177           // generous, but it does not help much in terms of compatibility,
    178           // I believe. Return immediately.
    179           *is_rfc2047 = false;
    180           return false;
    181         }
    182         ++part_index;
    183         break;
    184       default:
    185         *is_rfc2047 = false;
    186         return false;
    187     }
    188   }
    189 
    190   if (*is_rfc2047) {
    191     if (*(encoded_word.end() - 1) == '=') {
    192       output->swap(decoded_word);
    193       *parse_result_flags |=
    194           net::HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS;
    195       return true;
    196     }
    197     // encoded_word ending prematurelly with '?' or extra '?'
    198     *is_rfc2047 = false;
    199     return false;
    200   }
    201 
    202   // We're not handling 'especial' characters quoted with '\', but
    203   // it should be Ok because we're not an email client but a
    204   // web browser.
    205 
    206   // What IE6/7 does: %-escaped UTF-8.
    207   decoded_word = net::UnescapeURLComponent(encoded_word,
    208                                            net::UnescapeRule::SPACES);
    209   if (decoded_word != encoded_word)
    210     *parse_result_flags |=
    211         net::HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS;
    212   if (IsStringUTF8(decoded_word)) {
    213     output->swap(decoded_word);
    214     return true;
    215     // We can try either the OS default charset or 'origin charset' here,
    216     // As far as I can tell, IE does not support it. However, I've seen
    217     // web servers emit %-escaped string in a legacy encoding (usually
    218     // origin charset).
    219     // TODO(jungshik) : Test IE further and consider adding a fallback here.
    220   }
    221   return false;
    222 }
    223 
    224 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
    225 // value is supposed to be of the form:
    226 //
    227 //   value                   = token | quoted-string
    228 //
    229 // However we currently also allow RFC 2047 encoding and non-ASCII
    230 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
    231 bool DecodeFilenameValue(const std::string& input,
    232                          const std::string& referrer_charset,
    233                          std::string* output,
    234                          int* parse_result_flags) {
    235   int current_parse_result_flags = 0;
    236   std::string decoded_value;
    237   bool is_previous_token_rfc2047 = true;
    238 
    239   // Tokenize with whitespace characters.
    240   base::StringTokenizer t(input, " \t\n\r");
    241   t.set_options(base::StringTokenizer::RETURN_DELIMS);
    242   while (t.GetNext()) {
    243     if (t.token_is_delim()) {
    244       // If the previous non-delimeter token is not RFC2047-encoded,
    245       // put in a space in its place. Otheriwse, skip over it.
    246       if (!is_previous_token_rfc2047)
    247         decoded_value.push_back(' ');
    248       continue;
    249     }
    250     // We don't support a single multibyte character split into
    251     // adjacent encoded words. Some broken mail clients emit headers
    252     // with that problem, but most web servers usually encode a filename
    253     // in a single encoded-word. Firefox/Thunderbird do not support
    254     // it, either.
    255     std::string decoded;
    256     if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
    257                     &decoded, &current_parse_result_flags))
    258       return false;
    259     decoded_value.append(decoded);
    260   }
    261   output->swap(decoded_value);
    262   if (parse_result_flags && !output->empty())
    263     *parse_result_flags |= current_parse_result_flags;
    264   return true;
    265 }
    266 
    267 // Parses the charset and value-chars out of an ext-value string.
    268 //
    269 //  ext-value     = charset  "'" [ language ] "'" value-chars
    270 bool ParseExtValueComponents(const std::string& input,
    271                              std::string* charset,
    272                              std::string* value_chars) {
    273   base::StringTokenizer t(input, "'");
    274   t.set_options(base::StringTokenizer::RETURN_DELIMS);
    275   std::string temp_charset;
    276   std::string temp_value;
    277   int numDelimsSeen = 0;
    278   while (t.GetNext()) {
    279     if (t.token_is_delim()) {
    280       ++numDelimsSeen;
    281       continue;
    282     } else {
    283       switch (numDelimsSeen) {
    284         case 0:
    285           temp_charset = t.token();
    286           break;
    287         case 1:
    288           // Language is ignored.
    289           break;
    290         case 2:
    291           temp_value = t.token();
    292           break;
    293         default:
    294           return false;
    295       }
    296     }
    297   }
    298   if (numDelimsSeen != 2)
    299     return false;
    300   if (temp_charset.empty() || temp_value.empty())
    301     return false;
    302   charset->swap(temp_charset);
    303   value_chars->swap(temp_value);
    304   return true;
    305 }
    306 
    307 // http://tools.ietf.org/html/rfc5987#section-3.2
    308 //
    309 //  ext-value     = charset  "'" [ language ] "'" value-chars
    310 //
    311 //  charset       = "UTF-8" / "ISO-8859-1" / mime-charset
    312 //
    313 //  mime-charset  = 1*mime-charsetc
    314 //  mime-charsetc = ALPHA / DIGIT
    315 //                 / "!" / "#" / "$" / "%" / "&"
    316 //                 / "+" / "-" / "^" / "_" / "`"
    317 //                 / "{" / "}" / "~"
    318 //
    319 //  language      = <Language-Tag, defined in [RFC5646], Section 2.1>
    320 //
    321 //  value-chars   = *( pct-encoded / attr-char )
    322 //
    323 //  pct-encoded   = "%" HEXDIG HEXDIG
    324 //
    325 //  attr-char     = ALPHA / DIGIT
    326 //                 / "!" / "#" / "$" / "&" / "+" / "-" / "."
    327 //                 / "^" / "_" / "`" / "|" / "~"
    328 bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
    329   if (param_value.find('"') != std::string::npos)
    330     return false;
    331 
    332   std::string charset;
    333   std::string value;
    334   if (!ParseExtValueComponents(param_value, &charset, &value))
    335     return false;
    336 
    337   // RFC 5987 value should be ASCII-only.
    338   if (!IsStringASCII(value)) {
    339     decoded->clear();
    340     return true;
    341   }
    342 
    343   std::string unescaped = net::UnescapeURLComponent(
    344       value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
    345 
    346   return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);
    347 }
    348 
    349 } // namespace
    350 
    351 namespace net {
    352 
    353 HttpContentDisposition::HttpContentDisposition(
    354     const std::string& header, const std::string& referrer_charset)
    355   : type_(INLINE),
    356     parse_result_flags_(INVALID) {
    357   Parse(header, referrer_charset);
    358 }
    359 
    360 HttpContentDisposition::~HttpContentDisposition() {
    361 }
    362 
    363 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
    364     std::string::const_iterator begin, std::string::const_iterator end) {
    365   DCHECK(type_ == INLINE);
    366   std::string::const_iterator delimiter = std::find(begin, end, ';');
    367 
    368   std::string::const_iterator type_begin = begin;
    369   std::string::const_iterator type_end = delimiter;
    370   HttpUtil::TrimLWS(&type_begin, &type_end);
    371 
    372   // If the disposition-type isn't a valid token the then the
    373   // Content-Disposition header is malformed, and we treat the first bytes as
    374   // a parameter rather than a disposition-type.
    375   if (!HttpUtil::IsToken(type_begin, type_end))
    376     return begin;
    377 
    378   parse_result_flags_ |= HAS_DISPOSITION_TYPE;
    379 
    380   DCHECK(std::find(type_begin, type_end, '=') == type_end);
    381 
    382   if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) {
    383     type_ = INLINE;
    384   } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) {
    385     type_ = ATTACHMENT;
    386   } else {
    387     parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE;
    388     type_ = ATTACHMENT;
    389   }
    390   return delimiter;
    391 }
    392 
    393 // http://tools.ietf.org/html/rfc6266
    394 //
    395 //  content-disposition = "Content-Disposition" ":"
    396 //                         disposition-type *( ";" disposition-parm )
    397 //
    398 //  disposition-type    = "inline" | "attachment" | disp-ext-type
    399 //                      ; case-insensitive
    400 //  disp-ext-type       = token
    401 //
    402 //  disposition-parm    = filename-parm | disp-ext-parm
    403 //
    404 //  filename-parm       = "filename" "=" value
    405 //                      | "filename*" "=" ext-value
    406 //
    407 //  disp-ext-parm       = token "=" value
    408 //                      | ext-token "=" ext-value
    409 //  ext-token           = <the characters in token, followed by "*">
    410 //
    411 void HttpContentDisposition::Parse(const std::string& header,
    412                                    const std::string& referrer_charset) {
    413   DCHECK(type_ == INLINE);
    414   DCHECK(filename_.empty());
    415 
    416   std::string::const_iterator pos = header.begin();
    417   std::string::const_iterator end = header.end();
    418   pos = ConsumeDispositionType(pos, end);
    419 
    420   std::string name;
    421   std::string filename;
    422   std::string ext_filename;
    423 
    424   HttpUtil::NameValuePairsIterator iter(pos, end, ';');
    425   while (iter.GetNext()) {
    426     if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
    427                                                  iter.name_end(),
    428                                                  "filename")) {
    429       DecodeFilenameValue(iter.value(), referrer_charset, &filename,
    430                           &parse_result_flags_);
    431       if (!filename.empty())
    432         parse_result_flags_ |= HAS_FILENAME;
    433     } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(),
    434                                                     iter.name_end(),
    435                                                     "name")) {
    436       DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL);
    437       if (!name.empty())
    438         parse_result_flags_ |= HAS_NAME;
    439     } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
    440                                                             iter.name_end(),
    441                                                             "filename*")) {
    442       DecodeExtValue(iter.raw_value(), &ext_filename);
    443       if (!ext_filename.empty())
    444         parse_result_flags_ |= HAS_EXT_FILENAME;
    445     }
    446   }
    447 
    448   if (!ext_filename.empty())
    449     filename_ = ext_filename;
    450   else if (!filename.empty())
    451     filename_ = filename;
    452   else
    453     filename_ = name;
    454 }
    455 
    456 }  // namespace net
    457