1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 // 5 // This is a port of ManifestParser.cc from WebKit/WebCore/loader/appcache. 6 7 /* 8 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 27 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include "webkit/browser/appcache/manifest_parser.h" 33 34 #include "base/command_line.h" 35 #include "base/i18n/icu_string_conversions.h" 36 #include "base/logging.h" 37 #include "base/strings/utf_string_conversions.h" 38 #include "url/gurl.h" 39 40 namespace appcache { 41 42 namespace { 43 44 // Helper function used to identify 'isPattern' annotations. 45 bool HasPatternMatchingAnnotation(const wchar_t* line_p, 46 const wchar_t* line_end) { 47 // Skip whitespace separating the resource url from the annotation. 48 // Note: trailing whitespace has already been trimmed from the line. 49 while (line_p < line_end && (*line_p == '\t' || *line_p == ' ')) 50 ++line_p; 51 if (line_p == line_end) 52 return false; 53 std::wstring annotation(line_p, line_end - line_p); 54 return annotation == L"isPattern"; 55 } 56 57 } 58 59 enum Mode { 60 EXPLICIT, 61 INTERCEPT, 62 FALLBACK, 63 ONLINE_WHITELIST, 64 UNKNOWN_MODE, 65 }; 66 67 enum InterceptVerb { 68 RETURN, 69 EXECUTE, 70 UNKNOWN_VERB, 71 }; 72 73 Manifest::Manifest() : online_whitelist_all(false) {} 74 75 Manifest::~Manifest() {} 76 77 bool ParseManifest(const GURL& manifest_url, const char* data, int length, 78 Manifest& manifest) { 79 // This is an implementation of the parsing algorithm specified in 80 // the HTML5 offline web application docs: 81 // http://www.w3.org/TR/html5/offline.html 82 // Do not modify it without consulting those docs. 83 // Though you might be tempted to convert these wstrings to UTF-8 or 84 // base::string16, this implementation seems simpler given the constraints. 85 86 const wchar_t kSignature[] = L"CACHE MANIFEST"; 87 const size_t kSignatureLength = arraysize(kSignature) - 1; 88 const wchar_t kChromiumSignature[] = L"CHROMIUM CACHE MANIFEST"; 89 const size_t kChromiumSignatureLength = arraysize(kChromiumSignature) - 1; 90 91 DCHECK(manifest.explicit_urls.empty()); 92 DCHECK(manifest.fallback_namespaces.empty()); 93 DCHECK(manifest.online_whitelist_namespaces.empty()); 94 DCHECK(!manifest.online_whitelist_all); 95 96 Mode mode = EXPLICIT; 97 98 std::wstring data_string; 99 // TODO(jennb): cannot do UTF8ToWide(data, length, &data_string); 100 // until UTF8ToWide uses 0xFFFD Unicode replacement character. 101 base::CodepageToWide(std::string(data, length), base::kCodepageUTF8, 102 base::OnStringConversionError::SUBSTITUTE, &data_string); 103 const wchar_t* p = data_string.c_str(); 104 const wchar_t* end = p + data_string.length(); 105 106 // Look for the magic signature: "^\xFEFF?CACHE MANIFEST[ \t]?" 107 // Example: "CACHE MANIFEST #comment" is a valid signature. 108 // Example: "CACHE MANIFEST;V2" is not. 109 110 // When the input data starts with a UTF-8 Byte-Order-Mark 111 // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a 112 // Unicode BOM (U+FEFF). Skip a converted Unicode BOM if it exists. 113 int bom_offset = 0; 114 if (!data_string.empty() && data_string[0] == 0xFEFF) { 115 bom_offset = 1; 116 ++p; 117 } 118 119 if (p >= end) 120 return false; 121 122 // Check for a supported signature and skip p past it. 123 if (0 == data_string.compare(bom_offset, kSignatureLength, 124 kSignature)) { 125 p += kSignatureLength; 126 } else if (0 == data_string.compare(bom_offset, kChromiumSignatureLength, 127 kChromiumSignature)) { 128 p += kChromiumSignatureLength; 129 } else { 130 return false; 131 } 132 133 // Character after "CACHE MANIFEST" must be whitespace. 134 if (p < end && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r') 135 return false; 136 137 // Skip to the end of the line. 138 while (p < end && *p != '\r' && *p != '\n') 139 ++p; 140 141 while (1) { 142 // Skip whitespace 143 while (p < end && (*p == '\n' || *p == '\r' || *p == ' ' || *p == '\t')) 144 ++p; 145 146 if (p == end) 147 break; 148 149 const wchar_t* line_start = p; 150 151 // Find the end of the line 152 while (p < end && *p != '\r' && *p != '\n') 153 ++p; 154 155 // Check if we have a comment 156 if (*line_start == '#') 157 continue; 158 159 // Get rid of trailing whitespace 160 const wchar_t* tmp = p - 1; 161 while (tmp > line_start && (*tmp == ' ' || *tmp == '\t')) 162 --tmp; 163 164 std::wstring line(line_start, tmp - line_start + 1); 165 166 if (line == L"CACHE:") { 167 mode = EXPLICIT; 168 } else if (line == L"FALLBACK:") { 169 mode = FALLBACK; 170 } else if (line == L"NETWORK:") { 171 mode = ONLINE_WHITELIST; 172 } else if (line == L"CHROMIUM-INTERCEPT:") { 173 mode = INTERCEPT; 174 } else if (*(line.end() - 1) == ':') { 175 mode = UNKNOWN_MODE; 176 } else if (mode == UNKNOWN_MODE) { 177 continue; 178 } else if (line == L"*" && mode == ONLINE_WHITELIST) { 179 manifest.online_whitelist_all = true; 180 continue; 181 } else if (mode == EXPLICIT || mode == ONLINE_WHITELIST) { 182 const wchar_t *line_p = line.c_str(); 183 const wchar_t *line_end = line_p + line.length(); 184 185 // Look for whitespace separating the URL from subsequent ignored tokens. 186 while (line_p < line_end && *line_p != '\t' && *line_p != ' ') 187 ++line_p; 188 189 base::string16 url16; 190 WideToUTF16(line.c_str(), line_p - line.c_str(), &url16); 191 GURL url = manifest_url.Resolve(url16); 192 if (!url.is_valid()) 193 continue; 194 if (url.has_ref()) { 195 GURL::Replacements replacements; 196 replacements.ClearRef(); 197 url = url.ReplaceComponents(replacements); 198 } 199 200 // Scheme component must be the same as the manifest URL's. 201 if (url.scheme() != manifest_url.scheme()) { 202 continue; 203 } 204 205 // See http://code.google.com/p/chromium/issues/detail?id=69594 206 // We willfully violate the HTML5 spec at this point in order 207 // to support the appcaching of cross-origin HTTPS resources. 208 // Per the spec, EXPLICIT cross-origin HTTS resources should be 209 // ignored here. We've opted for a milder constraint and allow 210 // caching unless the resource has a "no-store" header. That 211 // condition is enforced in AppCacheUpdateJob. 212 213 if (mode == EXPLICIT) { 214 manifest.explicit_urls.insert(url.spec()); 215 } else { 216 bool is_pattern = HasPatternMatchingAnnotation(line_p, line_end); 217 manifest.online_whitelist_namespaces.push_back( 218 Namespace(NETWORK_NAMESPACE, url, GURL(), is_pattern)); 219 } 220 } else if (mode == INTERCEPT) { 221 // Lines of the form, 222 // <urlnamespace> <intercept_type> <targeturl> 223 const wchar_t* line_p = line.c_str(); 224 const wchar_t* line_end = line_p + line.length(); 225 226 // Look for first whitespace separating the url namespace from 227 // the intercept type. 228 while (line_p < line_end && *line_p != '\t' && *line_p != ' ') 229 ++line_p; 230 231 if (line_p == line_end) 232 continue; // There was no whitespace separating the URLs. 233 234 base::string16 namespace_url16; 235 WideToUTF16(line.c_str(), line_p - line.c_str(), &namespace_url16); 236 GURL namespace_url = manifest_url.Resolve(namespace_url16); 237 if (!namespace_url.is_valid()) 238 continue; 239 if (namespace_url.has_ref()) { 240 GURL::Replacements replacements; 241 replacements.ClearRef(); 242 namespace_url = namespace_url.ReplaceComponents(replacements); 243 } 244 245 // The namespace URL must have the same scheme, host and port 246 // as the manifest's URL. 247 if (manifest_url.GetOrigin() != namespace_url.GetOrigin()) 248 continue; 249 250 // Skip whitespace separating namespace from the type. 251 while (line_p < line_end && (*line_p == '\t' || *line_p == ' ')) 252 ++line_p; 253 254 // Look for whitespace separating the type from the target url. 255 const wchar_t* type_start = line_p; 256 while (line_p < line_end && *line_p != '\t' && *line_p != ' ') 257 ++line_p; 258 259 // Look for a type value we understand, otherwise skip the line. 260 InterceptVerb verb = UNKNOWN_VERB; 261 std::wstring type(type_start, line_p - type_start); 262 if (type == L"return") { 263 verb = RETURN; 264 } else if (type == L"execute" && 265 CommandLine::ForCurrentProcess()->HasSwitch( 266 kEnableExecutableHandlers)) { 267 verb = EXECUTE; 268 } 269 if (verb == UNKNOWN_VERB) 270 continue; 271 272 // Skip whitespace separating type from the target_url. 273 while (line_p < line_end && (*line_p == '\t' || *line_p == ' ')) 274 ++line_p; 275 276 // Look for whitespace separating the URL from subsequent ignored tokens. 277 const wchar_t* target_url_start = line_p; 278 while (line_p < line_end && *line_p != '\t' && *line_p != ' ') 279 ++line_p; 280 281 base::string16 target_url16; 282 WideToUTF16(target_url_start, line_p - target_url_start, &target_url16); 283 GURL target_url = manifest_url.Resolve(target_url16); 284 if (!target_url.is_valid()) 285 continue; 286 287 if (target_url.has_ref()) { 288 GURL::Replacements replacements; 289 replacements.ClearRef(); 290 target_url = target_url.ReplaceComponents(replacements); 291 } 292 if (manifest_url.GetOrigin() != target_url.GetOrigin()) 293 continue; 294 295 bool is_pattern = HasPatternMatchingAnnotation(line_p, line_end); 296 manifest.intercept_namespaces.push_back( 297 Namespace(INTERCEPT_NAMESPACE, namespace_url, 298 target_url, is_pattern, verb == EXECUTE)); 299 } else if (mode == FALLBACK) { 300 const wchar_t* line_p = line.c_str(); 301 const wchar_t* line_end = line_p + line.length(); 302 303 // Look for whitespace separating the two URLs 304 while (line_p < line_end && *line_p != '\t' && *line_p != ' ') 305 ++line_p; 306 307 if (line_p == line_end) { 308 // There was no whitespace separating the URLs. 309 continue; 310 } 311 312 base::string16 namespace_url16; 313 WideToUTF16(line.c_str(), line_p - line.c_str(), &namespace_url16); 314 GURL namespace_url = manifest_url.Resolve(namespace_url16); 315 if (!namespace_url.is_valid()) 316 continue; 317 if (namespace_url.has_ref()) { 318 GURL::Replacements replacements; 319 replacements.ClearRef(); 320 namespace_url = namespace_url.ReplaceComponents(replacements); 321 } 322 323 // Fallback namespace URL must have the same scheme, host and port 324 // as the manifest's URL. 325 if (manifest_url.GetOrigin() != namespace_url.GetOrigin()) { 326 continue; 327 } 328 329 // Skip whitespace separating fallback namespace from URL. 330 while (line_p < line_end && (*line_p == '\t' || *line_p == ' ')) 331 ++line_p; 332 333 // Look for whitespace separating the URL from subsequent ignored tokens. 334 const wchar_t* fallback_start = line_p; 335 while (line_p < line_end && *line_p != '\t' && *line_p != ' ') 336 ++line_p; 337 338 base::string16 fallback_url16; 339 WideToUTF16(fallback_start, line_p - fallback_start, &fallback_url16); 340 GURL fallback_url = manifest_url.Resolve(fallback_url16); 341 if (!fallback_url.is_valid()) 342 continue; 343 if (fallback_url.has_ref()) { 344 GURL::Replacements replacements; 345 replacements.ClearRef(); 346 fallback_url = fallback_url.ReplaceComponents(replacements); 347 } 348 349 // Fallback entry URL must have the same scheme, host and port 350 // as the manifest's URL. 351 if (manifest_url.GetOrigin() != fallback_url.GetOrigin()) { 352 continue; 353 } 354 355 bool is_pattern = HasPatternMatchingAnnotation(line_p, line_end); 356 357 // Store regardless of duplicate namespace URL. Only first match 358 // will ever be used. 359 manifest.fallback_namespaces.push_back( 360 Namespace(FALLBACK_NAMESPACE, namespace_url, 361 fallback_url, is_pattern)); 362 } else { 363 NOTREACHED(); 364 } 365 } 366 367 return true; 368 } 369 370 } // namespace appcache 371