Home | History | Annotate | Download | only in base
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef NET_BASE_ESCAPE_H_
      6 #define NET_BASE_ESCAPE_H_
      7 #pragma once
      8 
      9 #include <string>
     10 #include <vector>
     11 
     12 #include "base/basictypes.h"
     13 #include "base/string16.h"
     14 #include "net/base/net_export.h"
     15 
     16 // Escaping --------------------------------------------------------------------
     17 
     18 // Escapes a file path and returns the result.  The escaped characters are:
     19 // non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
     20 std::string EscapePath(const std::string& path);
     21 
     22 // Escapes application/x-www-form-urlencoded content.  The escaped characters
     23 // are non-printable, non-7bit, and (including space)  ?>=<;+'&%$#"![\]^`{|}
     24 // Space is escaped as "+" and every other escaped character as %XX (hex).
     25 std::string EscapeUrlEncodedData(const std::string& path);
     26 
     27 // Escapes all non-ASCII input in |input| and returns the escaped string.
     28 std::string EscapeNonASCII(const std::string& input);
     29 
     30 // Escapes characters in |text| so that it is suitable for use as an external
     31 // protocol handler command.
     32 // We %XX everything except alphanumerics and %-_.!~*'() and the restricted
     33 // characters (;/?:@&=+$,).
     34 std::string EscapeExternalHandlerValue(const std::string& text);
     35 
     36 // Appends |c| to |*output|, escaping the character if it would be interpreted
     37 // as an HTML delimiter.
     38 void AppendEscapedCharForHTML(char c, std::string* output);
     39 
     40 // Escapes chars that might cause |text| to be interpreted as HTML tags.
     41 std::string EscapeForHTML(const std::string& text);
     42 string16 EscapeForHTML(const string16& text);
     43 
     44 // Unescaping ------------------------------------------------------------------
     45 
     46 class UnescapeRule {
     47  public:
     48   // Bitmask type handed to the unescaping functions; holds a combination of
     49   // the flags declared below.
     50   typedef uint32 Type;
     51 
     52   enum {
     53     // Leave the input completely untouched.
     54     NONE = 0,
     55 
     56     // Perform only the ordinary unescaping, nothing special. Because this
     57     // just denotes the absence of the other flags it is a placeholder and
     58     // cannot be combined with them; every other rule implies "normal" on top
     59     // of its own special meaning. Escaped letters, digits, and most symbols
     60     // are unescaped in this mode.
     61     NORMAL = 1 << 0,
     62 
     63     // Turn %20 into a space. This may be desirable in some places where
     64     // URLs are displayed. Where the URL may be copied and pasted out it is
     65     // not wanted, since other applications might not interpret the result
     66     // in one piece.
     67     SPACES = 1 << 1,
     68 
     69     // Unescape characters that change the meaning of a URL, including '%',
     70     // '+', '&', '/', '#'. Unescaping these yields a URL that is no longer
     71     // the same as the source one, so this flag is for generating final
     72     // output (such as filenames for URLs) that will not be interpreted as a
     73     // URL and where as much unescaping as possible is wanted.
     74     URL_SPECIAL_CHARS = 1 << 2,
     75 
     76     // Unescape control characters such as %01, NULLs INCLUDED. Meant for
     77     // rare cases like data: URL decoding, where the result is binary data.
     78     // Do not use this for normal URLs!
     79     CONTROL_CHARS = 1 << 3,
     80 
     81     // Controls replacing "+" with a space, as used by URL queries.
     82     REPLACE_PLUS_WITH_SPACE = 1 << 4,
     83   };
     84 };
     85 
     86 // Unescapes |escaped_text| according to |rules| and returns the result.
     87 // Unescaping consists of looking for the exact pattern "%XX", where each X is
     88 // a hex digit, and converting to the character with the numerical value of
     89 // those digits. Thus "i%20=%203%3b" unescapes to "i = 3;" (when |rules|
     90 // allows those characters, e.g. UnescapeRule::SPACES for the "%20"s).
     91 // Watch out: this doesn't necessarily result in the correct final result,
     92 // because the encoding may be unknown. For example, the input might be ASCII,
     93 // which, after unescaping, is supposed to be interpreted as UTF-8, and then
     94 // converted into full wide chars. This function won't tell you if any
     95 // conversions need to take place, it only unescapes.
     96 std::string UnescapeURLComponent(const std::string& escaped_text,
     97                                  UnescapeRule::Type rules);
     98 string16 UnescapeURLComponent(const string16& escaped_text,
     99                               UnescapeRule::Type rules);
    100 
    101 // Unescapes the given substring as a URL, and then tries to interpret the
    102 // result as being encoded as UTF-8. If the result is convertible into UTF-8, it
    103 // will be returned as converted. If it is not, the original escaped string will
    104 // be converted into a string16 and returned. |offset[s]_for_adjustment|
    105 // specifies one or more offsets into the source strings; each offset will be
    106 // adjusted to point at the same logical place in the result strings during
    107 // decoding.  If this isn't possible because an offset points past the end of
    108 // the source strings or into the middle of a multibyte sequence, the offending
    109 // offset will be set to std::wstring::npos. |offset[s]_for_adjustment| may be
    110 // NULL.
    111 string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
    112                                            UnescapeRule::Type rules,
    113                                            size_t* offset_for_adjustment);
    114 string16 UnescapeAndDecodeUTF8URLComponentWithOffsets(
    115     const std::string& text,
    116     UnescapeRule::Type rules,
    117     std::vector<size_t>* offsets_for_adjustment);
    118 
    119 // Unescapes the following ampersand character codes in |text|:
    120 // &lt; &gt; &amp; &quot; &#39;
    121 string16 UnescapeForHTML(const string16& text);
    122 
    123 // Deprecated ------------------------------------------------------------------
    124 
    125 // Escapes characters in |text| suitable for use as a query parameter value.
    126 // We %XX everything except alphanumerics and -_.!~*'()
    127 // Spaces change to "+" unless you pass use_plus=false.
    128 // This is basically the same as encodeURIComponent in javascript.
    129 // For the string16 version, we do a conversion to |codepage| before encoding
    130 // the string.  If |codepage| doesn't exist, we return false.
    131 NET_EXPORT std::string EscapeQueryParamValue(const std::string& text, bool use_plus);
    132 bool EscapeQueryParamValue(const string16& text, const char* codepage,
    133                            bool use_plus, string16* escaped);
    134 
    135 // A specialized version of EscapeQueryParamValue for string16s that
    136 // assumes the codepage is UTF-8.  This is provided as a convenience.
    137 string16 EscapeQueryParamValueUTF8(const string16& text, bool use_plus);
    138 
    139 // Private Functions (Exposed for Unit Testing) --------------------------------
    140 
    141 // A functor for use with std::for_each that adjusts any offset which occurs
    142 // after one or more encoded characters.
    143 struct AdjustEncodingOffset {
    144   typedef std::vector<size_t> Adjustments;
    145 
    146   explicit AdjustEncodingOffset(const Adjustments& adjustments);
    147   void operator()(size_t& offset);
    148   // Reference member: the referenced vector must outlive this object.
    149   const Adjustments& adjustments;
    150 };
    151 
    152 #endif  // NET_BASE_ESCAPE_H_
    153