1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // This file defines utility functions for working with html. 6 7 #ifndef CHROME_FRAME_HTML_UTILS_H_ 8 #define CHROME_FRAME_HTML_UTILS_H_ 9 10 #include <string> 11 #include <vector> 12 13 #include "base/basictypes.h" 14 #include "base/gtest_prod_util.h" 15 #include "net/http/http_util.h" 16 17 // Forward declarations 18 class HtmlUtilUnittest; 19 20 // 21 // Class designed to take a string of HTML and extract from it named 22 // attribute values from named tags. 23 // 24 // Caveat: this class currently doesn't handle multi-word UTF-16 encoded 25 // characters. Doesn't handle implies that any data following such a 26 // character could possibly be misinterpreted. 27 // 28 class HTMLScanner { 29 public: 30 typedef std::wstring::const_iterator StrPos; 31 32 // Structure maintaining const_iterators into html_string_. 33 class StringRange { 34 friend class HTMLScanner; 35 public: 36 StringRange(); 37 StringRange(StrPos start, StrPos end); 38 39 bool LowerCaseEqualsASCII(const char* other) const; 40 bool Equals(const wchar_t* other) const; 41 42 // Copies the data described by StringRange into destination. 43 std::wstring Copy() const; 44 45 // If this StringRange represents a tag, this method extracts the name of 46 // the tag and sticks it in tag_name. 47 // Returns true if the tag name was successfully extracted. 48 // Returns false if this string doesn't look like a valid tag. 49 bool GetTagName(std::wstring* tag_name) const; 50 51 // From a given string range, uses a string tokenizer to extract the value 52 // of the named attribute if a simple scan finds that the attribute name is 53 // present. 54 // 55 // Returns true if the named attribute can be located and it has a value 56 // which has been placed in attribute_value. 57 // 58 // Note that the attribute value is unquoted here as well, so that 59 // GetTagAttribute(*<foo bar="baz">*, L"bar", *out_value*) will stick 60 // 'bar' in out_value and not '"bar"'. 61 // 62 // Returns false if the named attribute is not present in the tag or if it 63 // did not have a value. 64 // 65 bool GetTagAttribute(const wchar_t* attribute_name, 66 StringRange* attribute_value) const; 67 68 // Unquotes a StringRange by removing a matching pair of either ' or " 69 // characters from the beginning and end of the string if present. 70 // Returns true if string was modified, false otherwise. 71 bool UnQuote(); 72 private: 73 StrPos start_; 74 StrPos end_; 75 }; 76 77 typedef std::vector<StringRange> StringRangeList; 78 79 // html_string must be a null-terminated string containing the HTML 80 // to be scanned. 81 explicit HTMLScanner(const wchar_t* html_string); 82 83 // Returns the set of ranges denoting HTML tags that match the given name. 84 // If stop_tag_name is given, then as soon as a tag with this name is 85 // encountered this method will return. 86 void GetTagsByName(const wchar_t* name, StringRangeList* tag_list, 87 const wchar_t* stop_tag_name); 88 89 private: 90 friend class HtmlUtilUnittest; 91 FRIEND_TEST_ALL_PREFIXES(HtmlUtilUnittest, BasicTest); 92 93 // Given html_string which represents the remaining html range, this method 94 // returns the next tag in tag and advances html_string to one character after 95 // the end of tag. This method is intended to be called repeatedly to extract 96 // all of the tags in sequence. 97 // 98 // Returns true if another tag was found and 'tag' was populated with a valid 99 // range. 100 // Returns false if we have reached the end of the html data. 101 bool NextTag(StringRange* html_string, StringRange* tag); 102 103 // Returns true if c can be found in quotes_, false otherwise 104 bool IsQuote(wchar_t c); 105 106 // Returns true if pos refers to the last character in an HTML comment in a 107 // string described by html_string, false otherwise. 108 // For example with html_string describing <!-- foo> -->, pos must refer to 109 // the last > for this method to return true. 110 bool IsHTMLCommentClose(const StringRange* html_string, StrPos pos); 111 112 // Returns true if pos refers to the last character in the terminator of the 113 // opening tag of a downlevel-hidden conditional comment in IE as per 114 // http://msdn.microsoft.com/en-us/library/ms537512(VS.85).aspx#syntax 115 // For example with html_string describing <![if booga >wooga]>, pos must 116 // refer to the last > for this method to return true. 117 bool IsIEConditionalCommentClose(const StringRange* html_string, StrPos pos); 118 119 // We store a (CollapsedWhitespace'd) copy of the html data. 120 const std::wstring html_string_; 121 122 // Store the string of quote characters to avoid repeated construction. 123 const std::wstring quotes_; 124 125 DISALLOW_COPY_AND_ASSIGN(HTMLScanner); 126 }; 127 128 namespace http_utils { 129 130 // Adds "chromeframe/a.b.c.d" to the User-Agent string (a.b.c.d is the version). 131 // If the cf tag has already been added to the string, the original string is 132 // returned. 133 std::string AddChromeFrameToUserAgentValue(const std::string& value); 134 135 // Removes "chromeframe/a.b.c.d" from the User-Agent string (a.b.c.d is the 136 // version). If the cf tag is not present in the string, the original string is 137 // returned. 138 std::string RemoveChromeFrameFromUserAgentValue(const std::string& value); 139 140 // Fetches the user agent from urlmon and adds chrome frame to the 141 // comment section. 142 // NOTE: The returned string includes the "User-Agent: " header name. 143 std::string GetDefaultUserAgentHeaderWithCFTag(); 144 145 // Returns the User-Agent header as would be used by Chrome itself. 146 const char* GetChromeUserAgent(); 147 148 // Fetches the default user agent string from urlmon. 149 // This value does not include the "User-Agent:" header name. 150 std::string GetDefaultUserAgent(); 151 152 // Returns the Chrome Frame user agent. E.g. "chromeframe/1.0". 153 // Note that in unit tests this will be "chromeframe/0.0" due to the version 154 // table not being present in the unit test executable. 155 const char* GetChromeFrameUserAgent(); 156 157 // Returns true if there is a frame busting header (other than the do-nothing 158 // "X-Frame-Options: ALLOWALL") in the provided header block. Note that there 159 // may be multiple X-Frame-Options values specified; if there is one anywhere in 160 // the list with a value other than ALLOWALL, this returns true. 161 bool HasFrameBustingHeader(const std::string& http_headers); 162 163 // Returns the header passed in from the headers list. 164 std::string GetHttpHeaderFromHeaderList(const std::string& header_name, 165 const std::string& headers); 166 } // namespace http_utils 167 168 #endif // CHROME_FRAME_HTML_UTILS_H_ 169