// Copyright 2007, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifndef GOOGLEURL_SRC_URL_PARSE_H__
#define GOOGLEURL_SRC_URL_PARSE_H__

#include <string>

#include "base/basictypes.h"
#include "base/string16.h"

namespace url_parse {

// Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and
// KURLGoogle.cpp still rely on this type.
42 typedef char16 UTF16Char; 43 44 // Component ------------------------------------------------------------------ 45 46 // Represents a substring for URL parsing. 47 struct Component { 48 Component() : begin(0), len(-1) {} 49 50 // Normal constructor: takes an offset and a length. 51 Component(int b, int l) : begin(b), len(l) {} 52 53 int end() const { 54 return begin + len; 55 } 56 57 // Returns true if this component is valid, meaning the length is given. Even 58 // valid components may be empty to record the fact that they exist. 59 bool is_valid() const { 60 return (len != -1); 61 } 62 63 // Returns true if the given component is specified on false, the component 64 // is either empty or invalid. 65 bool is_nonempty() const { 66 return (len > 0); 67 } 68 69 void reset() { 70 begin = 0; 71 len = -1; 72 } 73 74 bool operator==(const Component& other) const { 75 return begin == other.begin && len == other.len; 76 } 77 78 int begin; // Byte offset in the string of this component. 79 int len; // Will be -1 if the component is unspecified. 80 }; 81 82 // Helper that returns a component created with the given begin and ending 83 // points. The ending point is non-inclusive. 84 inline Component MakeRange(int begin, int end) { 85 return Component(begin, end - begin); 86 } 87 88 // Parsed --------------------------------------------------------------------- 89 90 // A structure that holds the identified parts of an input URL. This structure 91 // does NOT store the URL itself. The caller will have to store the URL text 92 // and its corresponding Parsed structure separately. 
93 // 94 // Typical usage would be: 95 // 96 // url_parse::Parsed parsed; 97 // url_parse::Component scheme; 98 // if (!url_parse::ExtractScheme(url, url_len, &scheme)) 99 // return I_CAN_NOT_FIND_THE_SCHEME_DUDE; 100 // 101 // if (IsStandardScheme(url, scheme)) // Not provided by this component 102 // url_parseParseStandardURL(url, url_len, &parsed); 103 // else if (IsFileURL(url, scheme)) // Not provided by this component 104 // url_parse::ParseFileURL(url, url_len, &parsed); 105 // else 106 // url_parse::ParsePathURL(url, url_len, &parsed); 107 // 108 struct Parsed { 109 // Identifies different components. 110 enum ComponentType { 111 SCHEME, 112 USERNAME, 113 PASSWORD, 114 HOST, 115 PORT, 116 PATH, 117 QUERY, 118 REF, 119 }; 120 121 // The default constructor is sufficient for the components. 122 Parsed() {} 123 124 // Returns the length of the URL (the end of the last component). 125 // 126 // Note that for some invalid, non-canonical URLs, this may not be the length 127 // of the string. For example "http://": the parsed structure will only 128 // contain an entry for the four-character scheme, and it doesn't know about 129 // the "://". For all other last-components, it will return the real length. 130 int Length() const; 131 132 // Returns the number of characters before the given component if it exists, 133 // or where the component would be if it did exist. This will return the 134 // string length if the component would be appended to the end. 135 // 136 // Note that this can get a little funny for the port, query, and ref 137 // components which have a delimiter that is not counted as part of the 138 // component. The |include_delimiter| flag controls if you want this counted 139 // as part of the component or not when the component exists. 140 // 141 // This example shows the difference between the two flags for two of these 142 // delimited components that is present (the port and query) and one that 143 // isn't (the reference). 
The components that this flag affects are marked 144 // with a *. 145 // 0 1 2 146 // 012345678901234567890 147 // Example input: http://foo:80/?query 148 // include_delim=true, ...=false ("<-" indicates different) 149 // SCHEME: 0 0 150 // USERNAME: 5 5 151 // PASSWORD: 5 5 152 // HOST: 7 7 153 // *PORT: 10 11 <- 154 // PATH: 13 13 155 // *QUERY: 14 15 <- 156 // *REF: 20 20 157 // 158 int CountCharactersBefore(ComponentType type, bool include_delimiter) const; 159 160 // Scheme without the colon: "http://foo"/ would have a scheme of "http". 161 // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there 162 // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed 163 // to start at the beginning of the string if there are preceeding whitespace 164 // or control characters. 165 Component scheme; 166 167 // Username. Specified in URLs with an @ sign before the host. See |password| 168 Component username; 169 170 // Password. The length will be -1 if unspecified, 0 if specified but empty. 171 // Not all URLs with a username have a password, as in "http://me@host/". 172 // The password is separated form the username with a colon, as in 173 // "http://me:secret@host/" 174 Component password; 175 176 // Host name. 177 Component host; 178 179 // Port number. 180 Component port; 181 182 // Path, this is everything following the host name. Length will be -1 if 183 // unspecified. This includes the preceeding slash, so the path on 184 // http://www.google.com/asdf" is "/asdf". As a result, it is impossible to 185 // have a 0 length path, it will be -1 in cases like "http://host?foo". 186 // Note that we treat backslashes the same as slashes. 187 Component path; 188 189 // Stuff between the ? and the # after the path. This does not include the 190 // preceeding ? character. Length will be -1 if unspecified, 0 if there is 191 // a question mark but no query string. 
192 Component query; 193 194 // Indicated by a #, this is everything following the hash sign (not 195 // including it). If there are multiple hash signs, we'll use the last one. 196 // Length will be -1 if there is no hash sign, or 0 if there is one but 197 // nothing follows it. 198 Component ref; 199 }; 200 201 // Initialization functions --------------------------------------------------- 202 // 203 // These functions parse the given URL, filling in all of the structure's 204 // components. These functions can not fail, they will always do their best 205 // at interpreting the input given. 206 // 207 // The string length of the URL MUST be specified, we do not check for NULLs 208 // at any point in the process, and will actually handle embedded NULLs. 209 // 210 // IMPORTANT: These functions do NOT hang on to the given pointer or copy it 211 // in any way. See the comment above the struct. 212 // 213 // The 8-bit versions require UTF-8 encoding. 214 215 // StandardURL is for when the scheme is known to be one that has an 216 // authority (host) like "http". This function will not handle weird ones 217 // like "about:" and "javascript:", or do the right thing for "file:" URLs. 218 void ParseStandardURL(const char* url, int url_len, Parsed* parsed); 219 void ParseStandardURL(const char16* url, int url_len, Parsed* parsed); 220 221 // PathURL is for when the scheme is known not to have an authority (host) 222 // section but that aren't file URLs either. The scheme is parsed, and 223 // everything after the scheme is considered as the path. This is used for 224 // things like "about:" and "javascript:" 225 void ParsePathURL(const char* url, int url_len, Parsed* parsed); 226 void ParsePathURL(const char16* url, int url_len, Parsed* parsed); 227 228 // FileURL is for file URLs. There are some special rules for interpreting 229 // these. 
230 void ParseFileURL(const char* url, int url_len, Parsed* parsed); 231 void ParseFileURL(const char16* url, int url_len, Parsed* parsed); 232 233 // MailtoURL is for mailto: urls. They are made up scheme,path,query 234 void ParseMailtoURL(const char* url, int url_len, Parsed* parsed); 235 void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed); 236 237 // Helper functions ----------------------------------------------------------- 238 239 // Locates the scheme according to the URL parser's rules. This function is 240 // designed so the caller can find the scheme and call the correct Init* 241 // function according to their known scheme types. 242 // 243 // It also does not perform any validation on the scheme. 244 // 245 // This function will return true if the scheme is found and will put the 246 // scheme's range into *scheme. False means no scheme could be found. Note 247 // that a URL beginning with a colon has a scheme, but it is empty, so this 248 // function will return true but *scheme will = (0,0). 249 // 250 // The scheme is found by skipping spaces and control characters at the 251 // beginning, and taking everything from there to the first colon to be the 252 // scheme. The character at scheme.end() will be the colon (we may enhance 253 // this to handle full width colons or something, so don't count on the 254 // actual character value). The character at scheme.end()+1 will be the 255 // beginning of the rest of the URL, be it the authority or the path (or the 256 // end of the string). 257 // 258 // The 8-bit version requires UTF-8 encoding. 259 bool ExtractScheme(const char* url, int url_len, Component* scheme); 260 bool ExtractScheme(const char16* url, int url_len, Component* scheme); 261 262 // Returns true if ch is a character that terminates the authority segment 263 // of a URL. 264 bool IsAuthorityTerminator(char16 ch); 265 266 // Does a best effort parse of input |spec|, in range |auth|. 
If a particular 267 // component is not found, it will be set to invalid. 268 void ParseAuthority(const char* spec, 269 const Component& auth, 270 Component* username, 271 Component* password, 272 Component* hostname, 273 Component* port_num); 274 void ParseAuthority(char16* spec, 275 const Component& auth, 276 Component* username, 277 Component* password, 278 Component* hostname, 279 Component* port_num); 280 281 // Computes the integer port value from the given port component. The port 282 // component should have been identified by one of the init functions on 283 // |Parsed| for the given input url. 284 // 285 // The return value will be a positive integer between 0 and 64K, or one of 286 // the two special values below. 287 enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 }; 288 int ParsePort(const char* url, const Component& port); 289 int ParsePort(const char16* url, const Component& port); 290 291 // Extracts the range of the file name in the given url. The path must 292 // already have been computed by the parse function, and the matching URL 293 // and extracted path are provided to this function. The filename is 294 // defined as being everything from the last slash/backslash of the path 295 // to the end of the path. 296 // 297 // The file name will be empty if the path is empty or there is nothing 298 // following the last slash. 299 // 300 // The 8-bit version requires UTF-8 encoding. 301 void ExtractFileName(const char* url, 302 const Component& path, 303 Component* file_name); 304 void ExtractFileName(const char16* url, 305 const Component& path, 306 Component* file_name); 307 308 // Extract the first key/value from the range defined by |*query|. Updates 309 // |*query| to start at the end of the extracted key/value pair. This is 310 // designed for use in a loop: you can keep calling it with the same query 311 // object and it will iterate over all items in the query. 
312 // 313 // Some key/value pairs may have the key, the value, or both be empty (for 314 // example, the query string "?&"). These will be returned. Note that an empty 315 // last parameter "foo.com?" or foo.com?a&" will not be returned, this case 316 // is the same as "done." 317 // 318 // The initial query component should not include the '?' (this is the default 319 // for parsed URLs). 320 // 321 // If no key/value are found |*key| and |*value| will be unchanged and it will 322 // return false. 323 bool ExtractQueryKeyValue(const char* url, 324 Component* query, 325 Component* key, 326 Component* value); 327 bool ExtractQueryKeyValue(const char16* url, 328 Component* query, 329 Component* key, 330 Component* value); 331 332 } // namespace url_parse 333 334 #endif // GOOGLEURL_SRC_URL_PARSE_H__ 335