1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ 6 #define URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ 7 8 #include <string> 9 10 #include "base/basictypes.h" 11 #include "base/strings/string16.h" 12 #include "url/url_export.h" 13 14 namespace url { 15 16 // Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and 17 // KURLGoogle.cpp still rely on this type. 18 typedef base::char16 UTF16Char; 19 20 // Component ------------------------------------------------------------------ 21 22 // Represents a substring for URL parsing. 23 struct Component { 24 Component() : begin(0), len(-1) {} 25 26 // Normal constructor: takes an offset and a length. 27 Component(int b, int l) : begin(b), len(l) {} 28 29 int end() const { 30 return begin + len; 31 } 32 33 // Returns true if this component is valid, meaning the length is given. Even 34 // valid components may be empty to record the fact that they exist. 35 bool is_valid() const { 36 return (len != -1); 37 } 38 39 // Returns true if the given component is specified on false, the component 40 // is either empty or invalid. 41 bool is_nonempty() const { 42 return (len > 0); 43 } 44 45 void reset() { 46 begin = 0; 47 len = -1; 48 } 49 50 bool operator==(const Component& other) const { 51 return begin == other.begin && len == other.len; 52 } 53 54 int begin; // Byte offset in the string of this component. 55 int len; // Will be -1 if the component is unspecified. 56 }; 57 58 // Helper that returns a component created with the given begin and ending 59 // points. The ending point is non-inclusive. 60 inline Component MakeRange(int begin, int end) { 61 return Component(begin, end - begin); 62 } 63 64 // Parsed --------------------------------------------------------------------- 65 66 // A structure that holds the identified parts of an input URL. This structure 67 // does NOT store the URL itself. The caller will have to store the URL text 68 // and its corresponding Parsed structure separately. 69 // 70 // Typical usage would be: 71 // 72 // Parsed parsed; 73 // Component scheme; 74 // if (!ExtractScheme(url, url_len, &scheme)) 75 // return I_CAN_NOT_FIND_THE_SCHEME_DUDE; 76 // 77 // if (IsStandardScheme(url, scheme)) // Not provided by this component 78 // ParseStandardURL(url, url_len, &parsed); 79 // else if (IsFileURL(url, scheme)) // Not provided by this component 80 // ParseFileURL(url, url_len, &parsed); 81 // else 82 // ParsePathURL(url, url_len, &parsed); 83 // 84 struct URL_EXPORT Parsed { 85 // Identifies different components. 86 enum ComponentType { 87 SCHEME, 88 USERNAME, 89 PASSWORD, 90 HOST, 91 PORT, 92 PATH, 93 QUERY, 94 REF, 95 }; 96 97 // The default constructor is sufficient for the components, but inner_parsed_ 98 // requires special handling. 99 Parsed(); 100 Parsed(const Parsed&); 101 Parsed& operator=(const Parsed&); 102 ~Parsed(); 103 104 // Returns the length of the URL (the end of the last component). 105 // 106 // Note that for some invalid, non-canonical URLs, this may not be the length 107 // of the string. For example "http://": the parsed structure will only 108 // contain an entry for the four-character scheme, and it doesn't know about 109 // the "://". For all other last-components, it will return the real length. 110 int Length() const; 111 112 // Returns the number of characters before the given component if it exists, 113 // or where the component would be if it did exist. This will return the 114 // string length if the component would be appended to the end. 115 // 116 // Note that this can get a little funny for the port, query, and ref 117 // components which have a delimiter that is not counted as part of the 118 // component. The |include_delimiter| flag controls if you want this counted 119 // as part of the component or not when the component exists. 120 // 121 // This example shows the difference between the two flags for two of these 122 // delimited components that is present (the port and query) and one that 123 // isn't (the reference). The components that this flag affects are marked 124 // with a *. 125 // 0 1 2 126 // 012345678901234567890 127 // Example input: http://foo:80/?query 128 // include_delim=true, ...=false ("<-" indicates different) 129 // SCHEME: 0 0 130 // USERNAME: 5 5 131 // PASSWORD: 5 5 132 // HOST: 7 7 133 // *PORT: 10 11 <- 134 // PATH: 13 13 135 // *QUERY: 14 15 <- 136 // *REF: 20 20 137 // 138 int CountCharactersBefore(ComponentType type, bool include_delimiter) const; 139 140 // Scheme without the colon: "http://foo"/ would have a scheme of "http". 141 // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there 142 // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed 143 // to start at the beginning of the string if there are preceeding whitespace 144 // or control characters. 145 Component scheme; 146 147 // Username. Specified in URLs with an @ sign before the host. See |password| 148 Component username; 149 150 // Password. The length will be -1 if unspecified, 0 if specified but empty. 151 // Not all URLs with a username have a password, as in "http://me@host/". 152 // The password is separated form the username with a colon, as in 153 // "http://me:secret@host/" 154 Component password; 155 156 // Host name. 157 Component host; 158 159 // Port number. 160 Component port; 161 162 // Path, this is everything following the host name, stopping at the query of 163 // ref delimiter (if any). Length will be -1 if unspecified. This includes 164 // the preceeding slash, so the path on http://www.google.com/asdf" is 165 // "/asdf". As a result, it is impossible to have a 0 length path, it will 166 // be -1 in cases like "http://host?foo". 167 // Note that we treat backslashes the same as slashes. 168 Component path; 169 170 // Stuff between the ? and the # after the path. This does not include the 171 // preceeding ? character. Length will be -1 if unspecified, 0 if there is 172 // a question mark but no query string. 173 Component query; 174 175 // Indicated by a #, this is everything following the hash sign (not 176 // including it). If there are multiple hash signs, we'll use the last one. 177 // Length will be -1 if there is no hash sign, or 0 if there is one but 178 // nothing follows it. 179 Component ref; 180 181 // The URL spec from the character after the scheme: until the end of the 182 // URL, regardless of the scheme. This is mostly useful for 'opaque' non- 183 // hierarchical schemes like data: and javascript: as a convient way to get 184 // the string with the scheme stripped off. 185 Component GetContent() const; 186 187 // This is used for nested URL types, currently only filesystem. If you 188 // parse a filesystem URL, the resulting Parsed will have a nested 189 // inner_parsed_ to hold the parsed inner URL's component information. 190 // For all other url types [including the inner URL], it will be NULL. 191 Parsed* inner_parsed() const { 192 return inner_parsed_; 193 } 194 195 void set_inner_parsed(const Parsed& inner_parsed) { 196 if (!inner_parsed_) 197 inner_parsed_ = new Parsed(inner_parsed); 198 else 199 *inner_parsed_ = inner_parsed; 200 } 201 202 void clear_inner_parsed() { 203 if (inner_parsed_) { 204 delete inner_parsed_; 205 inner_parsed_ = NULL; 206 } 207 } 208 209 private: 210 Parsed* inner_parsed_; // This object is owned and managed by this struct. 211 }; 212 213 // Initialization functions --------------------------------------------------- 214 // 215 // These functions parse the given URL, filling in all of the structure's 216 // components. These functions can not fail, they will always do their best 217 // at interpreting the input given. 218 // 219 // The string length of the URL MUST be specified, we do not check for NULLs 220 // at any point in the process, and will actually handle embedded NULLs. 221 // 222 // IMPORTANT: These functions do NOT hang on to the given pointer or copy it 223 // in any way. See the comment above the struct. 224 // 225 // The 8-bit versions require UTF-8 encoding. 226 227 // StandardURL is for when the scheme is known to be one that has an 228 // authority (host) like "http". This function will not handle weird ones 229 // like "about:" and "javascript:", or do the right thing for "file:" URLs. 230 URL_EXPORT void ParseStandardURL(const char* url, 231 int url_len, 232 Parsed* parsed); 233 URL_EXPORT void ParseStandardURL(const base::char16* url, 234 int url_len, 235 Parsed* parsed); 236 237 // PathURL is for when the scheme is known not to have an authority (host) 238 // section but that aren't file URLs either. The scheme is parsed, and 239 // everything after the scheme is considered as the path. This is used for 240 // things like "about:" and "javascript:" 241 URL_EXPORT void ParsePathURL(const char* url, 242 int url_len, 243 bool trim_path_end, 244 Parsed* parsed); 245 URL_EXPORT void ParsePathURL(const base::char16* url, 246 int url_len, 247 bool trim_path_end, 248 Parsed* parsed); 249 250 // FileURL is for file URLs. There are some special rules for interpreting 251 // these. 252 URL_EXPORT void ParseFileURL(const char* url, int url_len, Parsed* parsed); 253 URL_EXPORT void ParseFileURL(const base::char16* url, 254 int url_len, 255 Parsed* parsed); 256 257 // Filesystem URLs are structured differently than other URLs. 258 URL_EXPORT void ParseFileSystemURL(const char* url, 259 int url_len, 260 Parsed* parsed); 261 URL_EXPORT void ParseFileSystemURL(const base::char16* url, 262 int url_len, 263 Parsed* parsed); 264 265 // MailtoURL is for mailto: urls. They are made up scheme,path,query 266 URL_EXPORT void ParseMailtoURL(const char* url, int url_len, Parsed* parsed); 267 URL_EXPORT void ParseMailtoURL(const base::char16* url, 268 int url_len, 269 Parsed* parsed); 270 271 // Helper functions ----------------------------------------------------------- 272 273 // Locates the scheme according to the URL parser's rules. This function is 274 // designed so the caller can find the scheme and call the correct Init* 275 // function according to their known scheme types. 276 // 277 // It also does not perform any validation on the scheme. 278 // 279 // This function will return true if the scheme is found and will put the 280 // scheme's range into *scheme. False means no scheme could be found. Note 281 // that a URL beginning with a colon has a scheme, but it is empty, so this 282 // function will return true but *scheme will = (0,0). 283 // 284 // The scheme is found by skipping spaces and control characters at the 285 // beginning, and taking everything from there to the first colon to be the 286 // scheme. The character at scheme.end() will be the colon (we may enhance 287 // this to handle full width colons or something, so don't count on the 288 // actual character value). The character at scheme.end()+1 will be the 289 // beginning of the rest of the URL, be it the authority or the path (or the 290 // end of the string). 291 // 292 // The 8-bit version requires UTF-8 encoding. 293 URL_EXPORT bool ExtractScheme(const char* url, 294 int url_len, 295 Component* scheme); 296 URL_EXPORT bool ExtractScheme(const base::char16* url, 297 int url_len, 298 Component* scheme); 299 300 // Returns true if ch is a character that terminates the authority segment 301 // of a URL. 302 URL_EXPORT bool IsAuthorityTerminator(base::char16 ch); 303 304 // Does a best effort parse of input |spec|, in range |auth|. If a particular 305 // component is not found, it will be set to invalid. 306 URL_EXPORT void ParseAuthority(const char* spec, 307 const Component& auth, 308 Component* username, 309 Component* password, 310 Component* hostname, 311 Component* port_num); 312 URL_EXPORT void ParseAuthority(const base::char16* spec, 313 const Component& auth, 314 Component* username, 315 Component* password, 316 Component* hostname, 317 Component* port_num); 318 319 // Computes the integer port value from the given port component. The port 320 // component should have been identified by one of the init functions on 321 // |Parsed| for the given input url. 322 // 323 // The return value will be a positive integer between 0 and 64K, or one of 324 // the two special values below. 325 enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 }; 326 URL_EXPORT int ParsePort(const char* url, const Component& port); 327 URL_EXPORT int ParsePort(const base::char16* url, const Component& port); 328 329 // Extracts the range of the file name in the given url. The path must 330 // already have been computed by the parse function, and the matching URL 331 // and extracted path are provided to this function. The filename is 332 // defined as being everything from the last slash/backslash of the path 333 // to the end of the path. 334 // 335 // The file name will be empty if the path is empty or there is nothing 336 // following the last slash. 337 // 338 // The 8-bit version requires UTF-8 encoding. 339 URL_EXPORT void ExtractFileName(const char* url, 340 const Component& path, 341 Component* file_name); 342 URL_EXPORT void ExtractFileName(const base::char16* url, 343 const Component& path, 344 Component* file_name); 345 346 // Extract the first key/value from the range defined by |*query|. Updates 347 // |*query| to start at the end of the extracted key/value pair. This is 348 // designed for use in a loop: you can keep calling it with the same query 349 // object and it will iterate over all items in the query. 350 // 351 // Some key/value pairs may have the key, the value, or both be empty (for 352 // example, the query string "?&"). These will be returned. Note that an empty 353 // last parameter "foo.com?" or foo.com?a&" will not be returned, this case 354 // is the same as "done." 355 // 356 // The initial query component should not include the '?' (this is the default 357 // for parsed URLs). 358 // 359 // If no key/value are found |*key| and |*value| will be unchanged and it will 360 // return false. 361 URL_EXPORT bool ExtractQueryKeyValue(const char* url, 362 Component* query, 363 Component* key, 364 Component* value); 365 URL_EXPORT bool ExtractQueryKeyValue(const base::char16* url, 366 Component* query, 367 Component* key, 368 Component* value); 369 370 } // namespace url 371 372 #endif // URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ 373