Home | History | Annotate | Download | only in http
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "net/http/http_content_disposition.h"
      6 
      7 #include "base/base64.h"
      8 #include "base/logging.h"
      9 #include "base/strings/string_tokenizer.h"
     10 #include "base/strings/string_util.h"
     11 #include "base/strings/sys_string_conversions.h"
     12 #include "base/strings/utf_string_conversions.h"
     13 #include "net/base/net_string_util.h"
     14 #include "net/base/net_util.h"
     15 #include "net/http/http_util.h"
     16 
     17 namespace {
     18 
     19 enum RFC2047EncodingType {
     20   Q_ENCODING,
     21   B_ENCODING
     22 };
     23 
     24 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
     25 // decoding a quoted-printable string.  Returns true if the input was valid.
     26 bool DecodeQEncoding(const std::string& input, std::string* output) {
     27   std::string temp;
     28   temp.reserve(input.size());
     29   for (std::string::const_iterator it = input.begin(); it != input.end();
     30        ++it) {
     31     if (*it == '_') {
     32       temp.push_back(' ');
     33     } else if (*it == '=') {
     34       if ((input.end() - it < 3) ||
     35           !IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
     36           !IsHexDigit(static_cast<unsigned char>(*(it + 2))))
     37         return false;
     38       unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +
     39                          HexDigitToInt(*(it + 2));
     40       temp.push_back(static_cast<char>(ch));
     41       ++it;
     42       ++it;
     43     } else if (0x20 < *it && *it < 0x7F && *it != '?') {
     44       // In a Q-encoded word, only printable ASCII characters
     45       // represent themselves. Besides, space, '=', '_' and '?' are
     46       // not allowed, but they're already filtered out.
     47       DCHECK_NE('=', *it);
     48       DCHECK_NE('?', *it);
     49       DCHECK_NE('_', *it);
     50       temp.push_back(*it);
     51     } else {
     52       return false;
     53     }
     54   }
     55   output->swap(temp);
     56   return true;
     57 }
     58 
     59 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
     60 // type is specified in |enc_type|.
     61 bool DecodeBQEncoding(const std::string& part,
     62                       RFC2047EncodingType enc_type,
     63                       const std::string& charset,
     64                       std::string* output) {
     65   std::string decoded;
     66   if (!((enc_type == B_ENCODING) ?
     67         base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) {
     68     return false;
     69   }
     70 
     71   if (decoded.empty()) {
     72     output->clear();
     73     return true;
     74   }
     75 
     76   return net::ConvertToUtf8(decoded, charset.c_str(), output);
     77 }
     78 
     79 bool DecodeWord(const std::string& encoded_word,
     80                 const std::string& referrer_charset,
     81                 bool* is_rfc2047,
     82                 std::string* output,
     83                 int* parse_result_flags) {
     84   *is_rfc2047 = false;
     85   output->clear();
     86   if (encoded_word.empty())
     87     return true;
     88 
     89   if (!base::IsStringASCII(encoded_word)) {
     90     // Try UTF-8, referrer_charset and the native OS default charset in turn.
     91     if (base::IsStringUTF8(encoded_word)) {
     92       *output = encoded_word;
     93     } else {
     94       base::string16 utf16_output;
     95       if (!referrer_charset.empty() &&
     96           net::ConvertToUTF16(encoded_word, referrer_charset.c_str(),
     97                               &utf16_output)) {
     98         *output = base::UTF16ToUTF8(utf16_output);
     99       } else {
    100         *output = base::WideToUTF8(base::SysNativeMBToWide(encoded_word));
    101       }
    102     }
    103 
    104     *parse_result_flags |= net::HttpContentDisposition::HAS_NON_ASCII_STRINGS;
    105     return true;
    106   }
    107 
    108   // RFC 2047 : one of encoding methods supported by Firefox and relatively
    109   // widely used by web servers.
    110   // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
    111   // We don't care about the length restriction (72 bytes) because
    112   // many web servers generate encoded words longer than the limit.
    113   std::string decoded_word;
    114   *is_rfc2047 = true;
    115   int part_index = 0;
    116   std::string charset;
    117   base::StringTokenizer t(encoded_word, "?");
    118   RFC2047EncodingType enc_type = Q_ENCODING;
    119   while (*is_rfc2047 && t.GetNext()) {
    120     std::string part = t.token();
    121     switch (part_index) {
    122       case 0:
    123         if (part != "=") {
    124           *is_rfc2047 = false;
    125           break;
    126         }
    127         ++part_index;
    128         break;
    129       case 1:
    130         // Do we need charset validity check here?
    131         charset = part;
    132         ++part_index;
    133         break;
    134       case 2:
    135         if (part.size() > 1 ||
    136             part.find_first_of("bBqQ") == std::string::npos) {
    137           *is_rfc2047 = false;
    138           break;
    139         }
    140         if (part[0] == 'b' || part[0] == 'B') {
    141           enc_type = B_ENCODING;
    142         }
    143         ++part_index;
    144         break;
    145       case 3:
    146         *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
    147         if (!*is_rfc2047) {
    148           // Last minute failure. Invalid B/Q encoding. Rather than
    149           // passing it through, return now.
    150           return false;
    151         }
    152         ++part_index;
    153         break;
    154       case 4:
    155         if (part != "=") {
    156           // Another last minute failure !
    157           // Likely to be a case of two encoded-words in a row or
    158           // an encoded word followed by a non-encoded word. We can be
    159           // generous, but it does not help much in terms of compatibility,
    160           // I believe. Return immediately.
    161           *is_rfc2047 = false;
    162           return false;
    163         }
    164         ++part_index;
    165         break;
    166       default:
    167         *is_rfc2047 = false;
    168         return false;
    169     }
    170   }
    171 
    172   if (*is_rfc2047) {
    173     if (*(encoded_word.end() - 1) == '=') {
    174       output->swap(decoded_word);
    175       *parse_result_flags |=
    176           net::HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS;
    177       return true;
    178     }
    179     // encoded_word ending prematurelly with '?' or extra '?'
    180     *is_rfc2047 = false;
    181     return false;
    182   }
    183 
    184   // We're not handling 'especial' characters quoted with '\', but
    185   // it should be Ok because we're not an email client but a
    186   // web browser.
    187 
    188   // What IE6/7 does: %-escaped UTF-8.
    189   decoded_word = net::UnescapeURLComponent(encoded_word,
    190                                            net::UnescapeRule::SPACES);
    191   if (decoded_word != encoded_word)
    192     *parse_result_flags |=
    193         net::HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS;
    194   if (base::IsStringUTF8(decoded_word)) {
    195     output->swap(decoded_word);
    196     return true;
    197     // We can try either the OS default charset or 'origin charset' here,
    198     // As far as I can tell, IE does not support it. However, I've seen
    199     // web servers emit %-escaped string in a legacy encoding (usually
    200     // origin charset).
    201     // TODO(jungshik) : Test IE further and consider adding a fallback here.
    202   }
    203   return false;
    204 }
    205 
    206 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
    207 // value is supposed to be of the form:
    208 //
    209 //   value                   = token | quoted-string
    210 //
    211 // However we currently also allow RFC 2047 encoding and non-ASCII
    212 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
    213 bool DecodeFilenameValue(const std::string& input,
    214                          const std::string& referrer_charset,
    215                          std::string* output,
    216                          int* parse_result_flags) {
    217   int current_parse_result_flags = 0;
    218   std::string decoded_value;
    219   bool is_previous_token_rfc2047 = true;
    220 
    221   // Tokenize with whitespace characters.
    222   base::StringTokenizer t(input, " \t\n\r");
    223   t.set_options(base::StringTokenizer::RETURN_DELIMS);
    224   while (t.GetNext()) {
    225     if (t.token_is_delim()) {
    226       // If the previous non-delimeter token is not RFC2047-encoded,
    227       // put in a space in its place. Otheriwse, skip over it.
    228       if (!is_previous_token_rfc2047)
    229         decoded_value.push_back(' ');
    230       continue;
    231     }
    232     // We don't support a single multibyte character split into
    233     // adjacent encoded words. Some broken mail clients emit headers
    234     // with that problem, but most web servers usually encode a filename
    235     // in a single encoded-word. Firefox/Thunderbird do not support
    236     // it, either.
    237     std::string decoded;
    238     if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
    239                     &decoded, &current_parse_result_flags))
    240       return false;
    241     decoded_value.append(decoded);
    242   }
    243   output->swap(decoded_value);
    244   if (parse_result_flags && !output->empty())
    245     *parse_result_flags |= current_parse_result_flags;
    246   return true;
    247 }
    248 
    249 // Parses the charset and value-chars out of an ext-value string.
    250 //
    251 //  ext-value     = charset  "'" [ language ] "'" value-chars
    252 bool ParseExtValueComponents(const std::string& input,
    253                              std::string* charset,
    254                              std::string* value_chars) {
    255   base::StringTokenizer t(input, "'");
    256   t.set_options(base::StringTokenizer::RETURN_DELIMS);
    257   std::string temp_charset;
    258   std::string temp_value;
    259   int numDelimsSeen = 0;
    260   while (t.GetNext()) {
    261     if (t.token_is_delim()) {
    262       ++numDelimsSeen;
    263       continue;
    264     } else {
    265       switch (numDelimsSeen) {
    266         case 0:
    267           temp_charset = t.token();
    268           break;
    269         case 1:
    270           // Language is ignored.
    271           break;
    272         case 2:
    273           temp_value = t.token();
    274           break;
    275         default:
    276           return false;
    277       }
    278     }
    279   }
    280   if (numDelimsSeen != 2)
    281     return false;
    282   if (temp_charset.empty() || temp_value.empty())
    283     return false;
    284   charset->swap(temp_charset);
    285   value_chars->swap(temp_value);
    286   return true;
    287 }
    288 
    289 // http://tools.ietf.org/html/rfc5987#section-3.2
    290 //
    291 //  ext-value     = charset  "'" [ language ] "'" value-chars
    292 //
    293 //  charset       = "UTF-8" / "ISO-8859-1" / mime-charset
    294 //
    295 //  mime-charset  = 1*mime-charsetc
    296 //  mime-charsetc = ALPHA / DIGIT
    297 //                 / "!" / "#" / "$" / "%" / "&"
    298 //                 / "+" / "-" / "^" / "_" / "`"
    299 //                 / "{" / "}" / "~"
    300 //
    301 //  language      = <Language-Tag, defined in [RFC5646], Section 2.1>
    302 //
    303 //  value-chars   = *( pct-encoded / attr-char )
    304 //
    305 //  pct-encoded   = "%" HEXDIG HEXDIG
    306 //
    307 //  attr-char     = ALPHA / DIGIT
    308 //                 / "!" / "#" / "$" / "&" / "+" / "-" / "."
    309 //                 / "^" / "_" / "`" / "|" / "~"
    310 bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
    311   if (param_value.find('"') != std::string::npos)
    312     return false;
    313 
    314   std::string charset;
    315   std::string value;
    316   if (!ParseExtValueComponents(param_value, &charset, &value))
    317     return false;
    318 
    319   // RFC 5987 value should be ASCII-only.
    320   if (!base::IsStringASCII(value)) {
    321     decoded->clear();
    322     return true;
    323   }
    324 
    325   std::string unescaped = net::UnescapeURLComponent(
    326       value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
    327 
    328   return net::ConvertToUtf8AndNormalize(unescaped, charset.c_str(), decoded);
    329 }
    330 
    331 } // namespace
    332 
    333 namespace net {
    334 
    335 HttpContentDisposition::HttpContentDisposition(
    336     const std::string& header, const std::string& referrer_charset)
    337   : type_(INLINE),
    338     parse_result_flags_(INVALID) {
    339   Parse(header, referrer_charset);
    340 }
    341 
    342 HttpContentDisposition::~HttpContentDisposition() {
    343 }
    344 
    345 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
    346     std::string::const_iterator begin, std::string::const_iterator end) {
    347   DCHECK(type_ == INLINE);
    348   std::string::const_iterator delimiter = std::find(begin, end, ';');
    349 
    350   std::string::const_iterator type_begin = begin;
    351   std::string::const_iterator type_end = delimiter;
    352   HttpUtil::TrimLWS(&type_begin, &type_end);
    353 
    354   // If the disposition-type isn't a valid token the then the
    355   // Content-Disposition header is malformed, and we treat the first bytes as
    356   // a parameter rather than a disposition-type.
    357   if (!HttpUtil::IsToken(type_begin, type_end))
    358     return begin;
    359 
    360   parse_result_flags_ |= HAS_DISPOSITION_TYPE;
    361 
    362   DCHECK(std::find(type_begin, type_end, '=') == type_end);
    363 
    364   if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) {
    365     type_ = INLINE;
    366   } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) {
    367     type_ = ATTACHMENT;
    368   } else {
    369     parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE;
    370     type_ = ATTACHMENT;
    371   }
    372   return delimiter;
    373 }
    374 
    375 // http://tools.ietf.org/html/rfc6266
    376 //
    377 //  content-disposition = "Content-Disposition" ":"
    378 //                         disposition-type *( ";" disposition-parm )
    379 //
    380 //  disposition-type    = "inline" | "attachment" | disp-ext-type
    381 //                      ; case-insensitive
    382 //  disp-ext-type       = token
    383 //
    384 //  disposition-parm    = filename-parm | disp-ext-parm
    385 //
    386 //  filename-parm       = "filename" "=" value
    387 //                      | "filename*" "=" ext-value
    388 //
    389 //  disp-ext-parm       = token "=" value
    390 //                      | ext-token "=" ext-value
    391 //  ext-token           = <the characters in token, followed by "*">
    392 //
    393 void HttpContentDisposition::Parse(const std::string& header,
    394                                    const std::string& referrer_charset) {
    395   DCHECK(type_ == INLINE);
    396   DCHECK(filename_.empty());
    397 
    398   std::string::const_iterator pos = header.begin();
    399   std::string::const_iterator end = header.end();
    400   pos = ConsumeDispositionType(pos, end);
    401 
    402   std::string name;
    403   std::string filename;
    404   std::string ext_filename;
    405 
    406   HttpUtil::NameValuePairsIterator iter(pos, end, ';');
    407   while (iter.GetNext()) {
    408     if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
    409                                                  iter.name_end(),
    410                                                  "filename")) {
    411       DecodeFilenameValue(iter.value(), referrer_charset, &filename,
    412                           &parse_result_flags_);
    413       if (!filename.empty())
    414         parse_result_flags_ |= HAS_FILENAME;
    415     } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(),
    416                                                     iter.name_end(),
    417                                                     "name")) {
    418       DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL);
    419       if (!name.empty())
    420         parse_result_flags_ |= HAS_NAME;
    421     } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
    422                                                             iter.name_end(),
    423                                                             "filename*")) {
    424       DecodeExtValue(iter.raw_value(), &ext_filename);
    425       if (!ext_filename.empty())
    426         parse_result_flags_ |= HAS_EXT_FILENAME;
    427     }
    428   }
    429 
    430   if (!ext_filename.empty())
    431     filename_ = ext_filename;
    432   else if (!filename.empty())
    433     filename_ = filename;
    434   else
    435     filename_ = name;
    436 }
    437 
    438 }  // namespace net
    439