Home | History | Annotate | Download | only in chrome_frame
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // This file defines utility functions for working with html.
      6 
      7 #ifndef CHROME_FRAME_HTML_UTILS_H_
      8 #define CHROME_FRAME_HTML_UTILS_H_
      9 
     10 #include <string>
     11 #include <vector>
     12 
     13 #include "base/basictypes.h"
     14 #include "base/gtest_prod_util.h"
     15 #include "net/http/http_util.h"
     16 
     17 // Forward declarations
     18 class HtmlUtilUnittest;
     19 
     20 //
     21 // Class designed to take a string of HTML and extract from it named
     22 // attribute values from named tags.
     23 //
     24 // Caveat: this class currently doesn't handle multi-word UTF-16 encoded
     25 // characters. "Doesn't handle" means that any data following such a
     26 // character could possibly be misinterpreted.
     27 //
     28 class HTMLScanner {
     29  public:
          // Position within a std::wstring; used for all range bounds below.
     30   typedef std::wstring::const_iterator StrPos;
     31 
     32   // Structure maintaining const_iterators into html_string_.
     33   class StringRange {
     34     friend class HTMLScanner;
     35    public:
            // The two-argument form takes the iterators bounding the range
            // (presumably kept in start_/end_ -- confirm in the .cc file).
     36     StringRange();
     37     StringRange(StrPos start, StrPos end);
     38 
            // NOTE(review): comparison helpers. Judging by the names, the first
            // matches the lower-cased range against an ASCII string and the
            // second compares against a wide string exactly -- confirm in the
            // .cc file.
     39     bool LowerCaseEqualsASCII(const char* other) const;
     40     bool Equals(const wchar_t* other) const;
     41 
     42     // Copies the data described by StringRange into destination.
     43     std::wstring Copy() const;
     44 
     45     // If this StringRange represents a tag, this method extracts the name of
     46     // the tag and sticks it in tag_name.
     47     // Returns true if the tag name was successfully extracted.
     48     // Returns false if this string doesn't look like a valid tag.
     49     bool GetTagName(std::wstring* tag_name) const;
     50 
     51     // From a given string range, uses a string tokenizer to extract the value
     52     // of the named attribute if a simple scan finds that the attribute name is
     53     // present.
     54     //
     55     // Returns true if the named attribute can be located and it has a value
     56     // which has been placed in attribute_value.
     57     //
     58     // Note that the attribute value is unquoted here as well, so that
     59     // GetTagAttribute(*<foo bar="baz">*, L"bar", *out_value*) will stick
     60     // 'baz' in out_value and not '"baz"'.
     61     //
     62     // Returns false if the named attribute is not present in the tag or if it
     63     // did not have a value.
     64     //
     65     bool GetTagAttribute(const wchar_t* attribute_name,
     66                          StringRange* attribute_value) const;
     67 
     68     // Unquotes a StringRange by removing a matching pair of either ' or "
     69     // characters from the beginning and end of the string if present.
     70     // Returns true if string was modified, false otherwise.
     71     bool UnQuote();
     72    private:
             // Iterators delimiting the range within the scanned string
             // (presumably half-open, [start_, end_) -- confirm in the .cc file).
     73      StrPos start_;
     74      StrPos end_;
     75   };
     76 
     77   typedef std::vector<StringRange> StringRangeList;
     78 
     79   // html_string must be a null-terminated string containing the HTML
     80   // to be scanned.
     81   explicit HTMLScanner(const wchar_t* html_string);
     82 
     83   // Returns (via tag_list) the set of ranges denoting HTML tags that match
     84   // the given name. If stop_tag_name is given, then as soon as a tag with
     85   // this name is encountered this method will return.
     86   void GetTagsByName(const wchar_t* name, StringRangeList* tag_list,
     87                      const wchar_t* stop_tag_name);
     88 
     89  private:
          // Give the unit tests access to the private helpers below.
     90   friend class HtmlUtilUnittest;
     91   FRIEND_TEST_ALL_PREFIXES(HtmlUtilUnittest, BasicTest);
     92 
     93   // Given html_string which represents the remaining html range, this method
     94   // returns the next tag in tag and advances html_string to one character after
     95   // the end of tag. This method is intended to be called repeatedly to extract
     96   // all of the tags in sequence.
     97   //
     98   // Returns true if another tag was found and 'tag' was populated with a valid
     99   // range.
    100   // Returns false if we have reached the end of the html data.
    101   bool NextTag(StringRange* html_string, StringRange* tag);
    102 
    103   // Returns true if c can be found in quotes_, false otherwise.
    104   bool IsQuote(wchar_t c);
    105 
    106   // Returns true if pos refers to the last character in an HTML comment in a
    107   // string described by html_string, false otherwise.
    108   // For example with html_string describing <!-- foo> -->, pos must refer to
    109   // the last > for this method to return true.
    110   bool IsHTMLCommentClose(const StringRange* html_string, StrPos pos);
    111 
    112   // Returns true if pos refers to the last character in the terminator of the
    113   // opening tag of a downlevel-hidden conditional comment in IE as per
    114   // http://msdn.microsoft.com/en-us/library/ms537512(VS.85).aspx#syntax
    115   // For example with html_string describing <![if booga >wooga]>, pos must
    116   // refer to the last > for this method to return true.
    117   bool IsIEConditionalCommentClose(const StringRange* html_string, StrPos pos);
    118 
    119   // We store a (CollapsedWhitespace'd) copy of the html data.
    120   const std::wstring html_string_;
    121 
    122   // Store the string of quote characters to avoid repeated construction.
    123   const std::wstring quotes_;
    124 
    125   DISALLOW_COPY_AND_ASSIGN(HTMLScanner);
    126 };
    127 
    128 namespace http_utils {
        // Helpers for Chrome Frame's User-Agent tagging and for inspecting raw
        // HTTP header blocks.
    129 
    130 // Adds "chromeframe/a.b.c.d" to the User-Agent string (a.b.c.d is the version).
    131 // If the cf tag has already been added to the string, the original string is
    132 // returned.
    133 std::string AddChromeFrameToUserAgentValue(const std::string& value);
    134 
    135 // Removes "chromeframe/a.b.c.d" from the User-Agent string (a.b.c.d is the
    136 // version).  If the cf tag is not present in the string, the original string is
    137 // returned.
    138 std::string RemoveChromeFrameFromUserAgentValue(const std::string& value);
    139 
    140 // Fetches the user agent from urlmon and adds chrome frame to the
    141 // comment section.
    142 // NOTE: The returned string includes the "User-Agent: " header name.
    143 std::string GetDefaultUserAgentHeaderWithCFTag();
    144 
    145 // Returns the User-Agent header as would be used by Chrome itself.
        // NOTE(review): the lifetime of the returned pointer is not visible here;
        // presumably it refers to storage that outlives the call -- confirm.
    146 const char* GetChromeUserAgent();
    147 
    148 // Fetches the default user agent string from urlmon.
    149 // This value does not include the "User-Agent:" header name.
    150 std::string GetDefaultUserAgent();
    151 
    152 // Returns the Chrome Frame user agent.  E.g. "chromeframe/1.0".
    153 // Note that in unit tests this will be "chromeframe/0.0" due to the version
    154 // table not being present in the unit test executable.
        // NOTE(review): as with GetChromeUserAgent(), the lifetime of the returned
        // pointer is not visible here -- confirm before caching it.
    155 const char* GetChromeFrameUserAgent();
    156 
    157 // Returns true if there is a frame busting header (other than the do-nothing
    158 // "X-Frame-Options: ALLOWALL") in the provided header block.  Note that there
    159 // may be multiple X-Frame-Options values specified; if there is one anywhere in
    160 // the list with a value other than ALLOWALL, this returns true.
    161 bool HasFrameBustingHeader(const std::string& http_headers);
    162 
    163 // Looks up header_name in the headers list and returns the matching header.
        // NOTE(review): presumably only the header's value is returned, and an
        // empty string when the header is absent -- confirm in the .cc file.
    164 std::string GetHttpHeaderFromHeaderList(const std::string& header_name,
    165                                         const std::string& headers);
    166 }  // namespace http_utils
    167 
    168 #endif  // CHROME_FRAME_HTML_UTILS_H_
    169