Home | History | Annotate | Download | only in base
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef NET_BASE_ESCAPE_H_
      6 #define NET_BASE_ESCAPE_H_
      7 #pragma once
      8 
      9 #include <string>
     10 #include <vector>
     11 
     12 #include "base/basictypes.h"
     13 #include "base/string16.h"
     14 
     15 // Escaping --------------------------------------------------------------------
     16 
     17 // Escape a file path.  The characters escaped are:
     18 // non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
     19 std::string EscapePath(const std::string& path);
     20 
     21 // Escape application/x-www-form-urlencoded content.  The characters escaped are:
     22 // non-printable, non-7bit, and (including space)  ?>=<;+'&%$#"![\]^`{|}
     23 // Space is escaped as + and other special characters as %XX (hex).
     24 std::string EscapeUrlEncodedData(const std::string& path);
     25 
     26 // Escape all non-ASCII input.
     27 std::string EscapeNonASCII(const std::string& input);
     28 
     29 // Escapes characters in text suitable for use as an external protocol handler
     30 // command.
     31 // We %XX-escape everything except alphanumerics, %-_.!~*'() and the
     32 // restricted characters (;/?:@&=+$,).
     33 std::string EscapeExternalHandlerValue(const std::string& text);
     34 
     35 // Append the given character to the output string, escaping the character if
     36 // the character would be interpreted as an HTML delimiter.
     37 void AppendEscapedCharForHTML(char c, std::string* output);
     38 
     39 // Escape chars that might cause this text to be interpreted as HTML tags.
     40 std::string EscapeForHTML(const std::string& text);
     41 string16 EscapeForHTML(const string16& text);
     42 
     43 // Unescaping ------------------------------------------------------------------
     44 
     45 class UnescapeRule {
     46  public:
     47   // Bitmask type: a combination of the flags below, passed to the unescaping
     48   // functions to select which categories of characters get unescaped.
     49   typedef uint32 Type;
     50 
     51   enum {
     52     // Don't unescape anything at all.
     53     NONE = 0,
     54 
     55     // Don't unescape anything special, but all normal unescaping will happen.
     56     // This is a placeholder and can't be combined with other flags (since it's
     57     // just the absence of them). All other unescape rules imply "normal" in
     58     // addition to their special meaning. Things like escaped letters, digits,
     59     // and most symbols will get unescaped with this mode.
     60     NORMAL = 1,
     61 
     62     // Convert %20 to spaces. In some places where we're showing URLs, we may
     63     // want this. In places where the URL may be copied and pasted out, you
     64     // wouldn't want this, since the URL might not be interpreted in one piece
     65     // by other applications.
     66     SPACES = 2,
     67 
     68     // Unescapes various characters that will change the meaning of URLs,
     69     // including '%', '+', '&', '/', '#'. If these characters are unescaped, the
     70     // resulting URL won't be the same as the source one. This flag is used when
     71     // generating final output like filenames for URLs where we won't be
     72     // interpreting as a URL and want to do as much unescaping as possible.
     73     URL_SPECIAL_CHARS = 4,
     74 
     75     // Unescapes control characters such as %01. This INCLUDES NULLs. This is
     76     // used for rare cases such as data: URL decoding where the result is binary
     77     // data. You should not use this for normal URLs!
     78     CONTROL_CHARS = 8,
     79 
     80     // URL queries use "+" for space. This flag controls that replacement.
     81     REPLACE_PLUS_WITH_SPACE = 16,
     82   };
     83 };
     84 
     85 // Unescapes |escaped_text| and returns the result.
     86 // Unescaping consists of looking for the exact pattern "%XX", where each X is
     87 // a hex digit, and converting to the character with the numerical value of
     88 // those digits. Thus "i%20=%203%3b" unescapes to "i = 3;".
     89 //
     90 // Watch out: this doesn't necessarily produce the correct final result,
     91 // because the encoding may be unknown. For example, the input might be ASCII,
     92 // which, after unescaping, is supposed to be interpreted as UTF-8, and then
     93 // converted into full wide chars. This function won't tell you if any
     94 // conversions need to take place, it only unescapes.
     95 std::string UnescapeURLComponent(const std::string& escaped_text,
     96                                  UnescapeRule::Type rules);
     97 string16 UnescapeURLComponent(const string16& escaped_text,
     98                               UnescapeRule::Type rules);
     99 
    100 // Unescapes the given substring as a URL, and then tries to interpret the
    101 // result as being encoded as UTF-8. If the result is convertible into UTF-8, it
    102 // will be returned as converted. If it is not, the original escaped string will
    103 // be converted into a string16 and returned. |offset[s]_for_adjustment|
    104 // specifies one or more offsets into the source strings; each offset will be
    105 // adjusted to point at the same logical place in the result strings during
    106 // decoding.  If this isn't possible because an offset points past the end of
    107 // the source strings or into the middle of a multibyte sequence, the offending
    108 // offset will be set to std::wstring::npos. |offset[s]_for_adjustment| may be
    109 // NULL.
    110 string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
    111                                            UnescapeRule::Type rules,
    112                                            size_t* offset_for_adjustment);
    113 string16 UnescapeAndDecodeUTF8URLComponentWithOffsets(
    114     const std::string& text,
    115     UnescapeRule::Type rules,
    116     std::vector<size_t>* offsets_for_adjustment);
    117 
    118 // Unescape the following ampersand character codes from |text|:
    119 // &lt; &gt; &amp; &quot; &#39;
    120 string16 UnescapeForHTML(const string16& text);
    121 
    122 // Deprecated ------------------------------------------------------------------
    123 
    124 // Escapes characters in text suitable for use as a query parameter value.
    125 // We %XX-escape everything except alphanumerics and -_.!~*'()
    126 // Spaces change to "+" unless you pass |use_plus| as false.
    127 // This is basically the same as encodeURIComponent in JavaScript.
    128 // For the string16 version, we do a conversion to charset before encoding the
    129 // string.  If the charset doesn't exist, we return false.
    130 std::string EscapeQueryParamValue(const std::string& text, bool use_plus);
    131 bool EscapeQueryParamValue(const string16& text, const char* codepage,
    132                            bool use_plus, string16* escaped);
    133 
    134 // A specialized version of EscapeQueryParamValue for string16s that
    135 // assumes the codepage is UTF8.  This is provided as a convenience.
    136 string16 EscapeQueryParamValueUTF8(const string16& text, bool use_plus);
    137 
    138 // Private Functions (Exposed for Unit Testing) --------------------------------
    139 
    140 // A function object meant to be called by std::for_each; it adjusts any offset
    141 // which occurs after one or more encoded characters.
    142 struct AdjustEncodingOffset {
    143   typedef std::vector<size_t> Adjustments;
    144 
    145   explicit AdjustEncodingOffset(const Adjustments& adjustments);
    146   void operator()(size_t& offset);  // |offset| is taken by non-const reference.
    147 
    148   const Adjustments& adjustments;  // Not owned; must outlive this object.
    149 };
    150 
    151 #endif  // NET_BASE_ESCAPE_H_
    152