Home | History | Annotate | Download | only in appcache
      1 // Copyright 2014 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 //
      5 // This is a port of ManifestParser.cc from WebKit/WebCore/loader/appcache.
      6 
      7 /*
      8  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     20  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     23  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     26  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     27  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include "content/browser/appcache/appcache_manifest_parser.h"
     33 
     34 #include "base/command_line.h"
     35 #include "base/i18n/icu_string_conversions.h"
     36 #include "base/logging.h"
     37 #include "base/strings/utf_string_conversions.h"
     38 #include "url/gurl.h"
     39 
     40 namespace content {
     41 
     42 namespace {
     43 
     44 // Helper function used to identify 'isPattern' annotations.
     45 bool HasPatternMatchingAnnotation(const wchar_t* line_p,
     46                                   const wchar_t* line_end) {
     47   // Skip whitespace separating the resource url from the annotation.
     48   // Note: trailing whitespace has already been trimmed from the line.
     49   while (line_p < line_end && (*line_p == '\t' || *line_p == ' '))
     50     ++line_p;
     51   if (line_p == line_end)
     52     return false;
     53   std::wstring annotation(line_p, line_end - line_p);
     54   return annotation == L"isPattern";
     55 }
     56 
     57 }
     58 
     59 enum Mode {
     60   EXPLICIT,
     61   INTERCEPT,
     62   FALLBACK,
     63   ONLINE_WHITELIST,
     64   UNKNOWN_MODE,
     65 };
     66 
     67 enum InterceptVerb {
     68   RETURN,
     69   EXECUTE,
     70   UNKNOWN_VERB,
     71 };
     72 
     73 AppCacheManifest::AppCacheManifest()
     74     : online_whitelist_all(false),
     75       did_ignore_intercept_namespaces(false) {
     76 }
     77 
     78 AppCacheManifest::~AppCacheManifest() {}
     79 
     80 bool ParseManifest(const GURL& manifest_url, const char* data, int length,
     81                    ParseMode parse_mode, AppCacheManifest& manifest) {
     82   // This is an implementation of the parsing algorithm specified in
     83   // the HTML5 offline web application docs:
     84   //   http://www.w3.org/TR/html5/offline.html
     85   // Do not modify it without consulting those docs.
     86   // Though you might be tempted to convert these wstrings to UTF-8 or
     87   // base::string16, this implementation seems simpler given the constraints.
     88 
     89   const wchar_t kSignature[] = L"CACHE MANIFEST";
     90   const size_t kSignatureLength = arraysize(kSignature) - 1;
     91   const wchar_t kChromiumSignature[] = L"CHROMIUM CACHE MANIFEST";
     92   const size_t kChromiumSignatureLength = arraysize(kChromiumSignature) - 1;
     93 
     94   DCHECK(manifest.explicit_urls.empty());
     95   DCHECK(manifest.fallback_namespaces.empty());
     96   DCHECK(manifest.online_whitelist_namespaces.empty());
     97   DCHECK(!manifest.online_whitelist_all);
     98   DCHECK(!manifest.did_ignore_intercept_namespaces);
     99 
    100   Mode mode = EXPLICIT;
    101 
    102   std::wstring data_string;
    103   // TODO(jennb): cannot do UTF8ToWide(data, length, &data_string);
    104   // until UTF8ToWide uses 0xFFFD Unicode replacement character.
    105   base::CodepageToWide(std::string(data, length), base::kCodepageUTF8,
    106                        base::OnStringConversionError::SUBSTITUTE, &data_string);
    107   const wchar_t* p = data_string.c_str();
    108   const wchar_t* end = p + data_string.length();
    109 
    110   // Look for the magic signature: "^\xFEFF?CACHE MANIFEST[ \t]?"
    111   // Example: "CACHE MANIFEST #comment" is a valid signature.
    112   // Example: "CACHE MANIFEST;V2" is not.
    113 
    114   // When the input data starts with a UTF-8 Byte-Order-Mark
    115   // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a
    116   // Unicode BOM (U+FEFF). Skip a converted Unicode BOM if it exists.
    117   int bom_offset = 0;
    118   if (!data_string.empty() && data_string[0] == 0xFEFF) {
    119     bom_offset = 1;
    120     ++p;
    121   }
    122 
    123   if (p >= end)
    124     return false;
    125 
    126   // Check for a supported signature and skip p past it.
    127   if (0 == data_string.compare(bom_offset, kSignatureLength,
    128                                kSignature)) {
    129     p += kSignatureLength;
    130   } else if (0 == data_string.compare(bom_offset, kChromiumSignatureLength,
    131                                       kChromiumSignature)) {
    132     p += kChromiumSignatureLength;
    133   } else {
    134     return false;
    135   }
    136 
    137   // Character after "CACHE MANIFEST" must be whitespace.
    138   if (p < end && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r')
    139     return false;
    140 
    141   // Skip to the end of the line.
    142   while (p < end && *p != '\r' && *p != '\n')
    143     ++p;
    144 
    145   while (1) {
    146     // Skip whitespace
    147     while (p < end && (*p == '\n' || *p == '\r' || *p == ' ' || *p == '\t'))
    148       ++p;
    149 
    150     if (p == end)
    151       break;
    152 
    153     const wchar_t* line_start = p;
    154 
    155     // Find the end of the line
    156     while (p < end && *p != '\r' && *p != '\n')
    157       ++p;
    158 
    159     // Check if we have a comment
    160     if (*line_start == '#')
    161       continue;
    162 
    163     // Get rid of trailing whitespace
    164     const wchar_t* tmp = p - 1;
    165     while (tmp > line_start && (*tmp == ' ' || *tmp == '\t'))
    166       --tmp;
    167 
    168     std::wstring line(line_start, tmp - line_start + 1);
    169 
    170     if (line == L"CACHE:") {
    171       mode = EXPLICIT;
    172     } else if (line == L"FALLBACK:") {
    173       mode = FALLBACK;
    174     } else if (line == L"NETWORK:") {
    175       mode = ONLINE_WHITELIST;
    176     } else if (line == L"CHROMIUM-INTERCEPT:") {
    177       mode = INTERCEPT;
    178     } else if (*(line.end() - 1) == ':') {
    179       mode = UNKNOWN_MODE;
    180     } else if (mode == UNKNOWN_MODE) {
    181       continue;
    182     } else if (line == L"*" && mode == ONLINE_WHITELIST) {
    183       manifest.online_whitelist_all = true;
    184       continue;
    185     } else if (mode == EXPLICIT || mode == ONLINE_WHITELIST) {
    186       const wchar_t *line_p = line.c_str();
    187       const wchar_t *line_end = line_p + line.length();
    188 
    189       // Look for whitespace separating the URL from subsequent ignored tokens.
    190       while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
    191         ++line_p;
    192 
    193       base::string16 url16;
    194       base::WideToUTF16(line.c_str(), line_p - line.c_str(), &url16);
    195       GURL url = manifest_url.Resolve(url16);
    196       if (!url.is_valid())
    197         continue;
    198       if (url.has_ref()) {
    199         GURL::Replacements replacements;
    200         replacements.ClearRef();
    201         url = url.ReplaceComponents(replacements);
    202       }
    203 
    204       // Scheme component must be the same as the manifest URL's.
    205       if (url.scheme() != manifest_url.scheme()) {
    206         continue;
    207       }
    208 
    209       // See http://code.google.com/p/chromium/issues/detail?id=69594
    210       // We willfully violate the HTML5 spec at this point in order
    211       // to support the appcaching of cross-origin HTTPS resources.
    212       // Per the spec, EXPLICIT cross-origin HTTS resources should be
    213       // ignored here. We've opted for a milder constraint and allow
    214       // caching unless the resource has a "no-store" header. That
    215       // condition is enforced in AppCacheUpdateJob.
    216 
    217       if (mode == EXPLICIT) {
    218         manifest.explicit_urls.insert(url.spec());
    219       } else {
    220         bool is_pattern = HasPatternMatchingAnnotation(line_p, line_end);
    221         manifest.online_whitelist_namespaces.push_back(
    222             AppCacheNamespace(APPCACHE_NETWORK_NAMESPACE, url, GURL(),
    223                 is_pattern));
    224       }
    225     } else if (mode == INTERCEPT) {
    226       if (parse_mode != PARSE_MANIFEST_ALLOWING_INTERCEPTS) {
    227         manifest.did_ignore_intercept_namespaces = true;
    228         continue;
    229       }
    230 
    231       // Lines of the form,
    232       // <urlnamespace> <intercept_type> <targeturl>
    233       const wchar_t* line_p = line.c_str();
    234       const wchar_t* line_end = line_p + line.length();
    235 
    236       // Look for first whitespace separating the url namespace from
    237       // the intercept type.
    238       while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
    239         ++line_p;
    240 
    241       if (line_p == line_end)
    242         continue;  // There was no whitespace separating the URLs.
    243 
    244       base::string16 namespace_url16;
    245       base::WideToUTF16(line.c_str(), line_p - line.c_str(), &namespace_url16);
    246       GURL namespace_url = manifest_url.Resolve(namespace_url16);
    247       if (!namespace_url.is_valid())
    248         continue;
    249       if (namespace_url.has_ref()) {
    250         GURL::Replacements replacements;
    251         replacements.ClearRef();
    252         namespace_url = namespace_url.ReplaceComponents(replacements);
    253       }
    254 
    255       // The namespace URL must have the same scheme, host and port
    256       // as the manifest's URL.
    257       if (manifest_url.GetOrigin() != namespace_url.GetOrigin())
    258         continue;
    259 
    260       // Skip whitespace separating namespace from the type.
    261       while (line_p < line_end && (*line_p == '\t' || *line_p == ' '))
    262         ++line_p;
    263 
    264       // Look for whitespace separating the type from the target url.
    265       const wchar_t* type_start = line_p;
    266       while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
    267         ++line_p;
    268 
    269       // Look for a type value we understand, otherwise skip the line.
    270       InterceptVerb verb = UNKNOWN_VERB;
    271       std::wstring type(type_start, line_p - type_start);
    272       if (type == L"return") {
    273         verb = RETURN;
    274       } else if (type == L"execute" &&
    275                  base::CommandLine::ForCurrentProcess()->HasSwitch(
    276                     kEnableExecutableHandlers)) {
    277         verb = EXECUTE;
    278       }
    279       if (verb == UNKNOWN_VERB)
    280         continue;
    281 
    282       // Skip whitespace separating type from the target_url.
    283       while (line_p < line_end && (*line_p == '\t' || *line_p == ' '))
    284         ++line_p;
    285 
    286       // Look for whitespace separating the URL from subsequent ignored tokens.
    287       const wchar_t* target_url_start = line_p;
    288       while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
    289         ++line_p;
    290 
    291       base::string16 target_url16;
    292       base::WideToUTF16(target_url_start, line_p - target_url_start,
    293                         &target_url16);
    294       GURL target_url = manifest_url.Resolve(target_url16);
    295       if (!target_url.is_valid())
    296         continue;
    297 
    298       if (target_url.has_ref()) {
    299         GURL::Replacements replacements;
    300         replacements.ClearRef();
    301         target_url = target_url.ReplaceComponents(replacements);
    302       }
    303       if (manifest_url.GetOrigin() != target_url.GetOrigin())
    304         continue;
    305 
    306       bool is_pattern = HasPatternMatchingAnnotation(line_p, line_end);
    307       manifest.intercept_namespaces.push_back(
    308           AppCacheNamespace(APPCACHE_INTERCEPT_NAMESPACE, namespace_url,
    309                     target_url, is_pattern, verb == EXECUTE));
    310     } else if (mode == FALLBACK) {
    311       const wchar_t* line_p = line.c_str();
    312       const wchar_t* line_end = line_p + line.length();
    313 
    314       // Look for whitespace separating the two URLs
    315       while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
    316         ++line_p;
    317 
    318       if (line_p == line_end) {
    319         // There was no whitespace separating the URLs.
    320         continue;
    321       }
    322 
    323       base::string16 namespace_url16;
    324       base::WideToUTF16(line.c_str(), line_p - line.c_str(), &namespace_url16);
    325       GURL namespace_url = manifest_url.Resolve(namespace_url16);
    326       if (!namespace_url.is_valid())
    327         continue;
    328       if (namespace_url.has_ref()) {
    329         GURL::Replacements replacements;
    330         replacements.ClearRef();
    331         namespace_url = namespace_url.ReplaceComponents(replacements);
    332       }
    333 
    334       // Fallback namespace URL must have the same scheme, host and port
    335       // as the manifest's URL.
    336       if (manifest_url.GetOrigin() != namespace_url.GetOrigin()) {
    337         continue;
    338       }
    339 
    340       // Skip whitespace separating fallback namespace from URL.
    341       while (line_p < line_end && (*line_p == '\t' || *line_p == ' '))
    342         ++line_p;
    343 
    344       // Look for whitespace separating the URL from subsequent ignored tokens.
    345       const wchar_t* fallback_start = line_p;
    346       while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
    347         ++line_p;
    348 
    349       base::string16 fallback_url16;
    350       base::WideToUTF16(fallback_start, line_p - fallback_start,
    351                         &fallback_url16);
    352       GURL fallback_url = manifest_url.Resolve(fallback_url16);
    353       if (!fallback_url.is_valid())
    354         continue;
    355       if (fallback_url.has_ref()) {
    356         GURL::Replacements replacements;
    357         replacements.ClearRef();
    358         fallback_url = fallback_url.ReplaceComponents(replacements);
    359       }
    360 
    361       // Fallback entry URL must have the same scheme, host and port
    362       // as the manifest's URL.
    363       if (manifest_url.GetOrigin() != fallback_url.GetOrigin()) {
    364         continue;
    365       }
    366 
    367       bool is_pattern = HasPatternMatchingAnnotation(line_p, line_end);
    368 
    369       // Store regardless of duplicate namespace URL. Only first match
    370       // will ever be used.
    371       manifest.fallback_namespaces.push_back(
    372           AppCacheNamespace(APPCACHE_FALLBACK_NAMESPACE, namespace_url,
    373                     fallback_url, is_pattern));
    374     } else {
    375       NOTREACHED();
    376     }
    377   }
    378 
    379   return true;
    380 }
    381 
    382 }  // namespace content
    383