Home | History | Annotate | Download | only in app_engine_server
      1 #!/usr/bin/env python
      2 #
      3 # Copyright 2009 Google Inc.
      4 #
      5 # Licensed under the Apache License, Version 2.0 (the "License");
      6 # you may not use this file except in compliance with the License.
      7 # You may obtain a copy of the License at
      8 #
      9 #   http://www.apache.org/licenses/LICENSE-2.0
     10 #
     11 # Unless required by applicable law or agreed to in writing, software
     12 # distributed under the License is distributed on an "AS IS" BASIS,
     13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 # See the License for the specific language governing permissions and
     15 # limitations under the License.
     16 #
     17 
     18 """A class to serve pages from zip files and use memcache for performance.
     19 
     20 This contains a class and a function to create an anonymous instance of the
     21 class to serve HTTP GET requests. Memcache is used to increase response speed
     22 and lower processing cycles used in serving. Credit to Guido van Rossum and
     23 his implementation of zipserve which served as a reference as I wrote this.
     24 
     25   MemcachedZipHandler: Class that serves request
     26   create_handler: method to create instance of MemcachedZipHandler
     27 """
     28 
     29 __author__ = 'jmatt (at] google.com (Justin Mattson)'
     30 
     31 import email.Utils
     32 import logging
     33 import mimetypes
     34 import time
     35 import zipfile
     36 
     37 from google.appengine.api import memcache
     38 from google.appengine.ext import webapp
     39 from google.appengine.ext.webapp import util
     40 from time import localtime, strftime
     41 
     42 def create_handler(zip_files, max_age=None, public=None):
     43   """Factory method to create a MemcachedZipHandler instance.
     44 
     45   Args:
     46     zip_files: A list of file names, or a list of lists of file name, first
     47         member of file mappings. See MemcachedZipHandler documentation for
     48         more information about using the list of lists format
     49     max_age: The maximum client-side cache lifetime
     50     public: Whether this should be declared public in the client-side cache
     51   Returns:
     52     A MemcachedZipHandler wrapped in a pretty, anonymous bow for use with App
     53     Engine
     54 
     55   Raises:
     56     ValueError: if the zip_files argument is not a list
     57   """
     58   # verify argument integrity. If the argument is passed in list format,
     59   # convert it to list of lists format
     60   if zip_files and type(zip_files).__name__ == 'list':
     61     num_items = len(zip_files)
     62     while num_items > 0:
     63       if type(zip_files[num_items - 1]).__name__ != 'list':
     64         zip_files[num_items - 1] = [zip_files[num_items-1]]
     65       num_items -= 1
     66   else:
     67     raise ValueError('File name arguments must be a list')
     68 
     69   class HandlerWrapper(MemcachedZipHandler):
     70     """Simple wrapper for an instance of MemcachedZipHandler.
     71 
     72     I'm still not sure why this is needed
     73     """
     74     def get(self, name):
     75       self.zipfilenames = zip_files
     76       self.TrueGet(name)
     77       if max_age is not None:
     78         MAX_AGE = max_age
     79       if public is not None:
     80         PUBLIC = public
     81 
     82   return HandlerWrapper
     83 
     84 
     85 class MemcachedZipHandler(webapp.RequestHandler):
     86   """Handles get requests for a given URL.
     87 
     88   Serves a GET request from a series of zip files. As files are served they are
     89   put into memcache, which is much faster than retreiving them from the zip
     90   source file again. It also uses considerably fewer CPU cycles.
     91   """
     92   zipfile_cache = {}                # class cache of source zip files
     93   MAX_AGE = 600                     # max client-side cache lifetime
     94   PUBLIC = True                     # public cache setting
     95   CACHE_PREFIX = 'cache://'         # memcache key prefix for actual URLs
     96   NEG_CACHE_PREFIX = 'noncache://'  # memcache key prefix for non-existant URL
     97   intlString = 'intl/'
     98   validLangs = ['en', 'de', 'es', 'fr','it','ja','zh-CN','zh-TW']
     99   
    100   def TrueGet(self, reqUri):
    101     """The top-level entry point to serving requests.
    102 
    103     Called 'True' get because it does the work when called from the wrapper
    104     class' get method. Some logic is applied to the request to serve files
    105     from an intl/<lang>/... directory or fall through to the default language.
    106 
    107     Args:
    108       name: URL requested
    109 
    110     Returns:
    111       None
    112     """
    113     langName = 'en'
    114     resetLangCookie = False
    115     urlLangName = None
    116     retry = False
    117     isValidIntl = False
    118     isStripped = False
    119 
    120     # Try to retrieve the user's lang pref from the cookie. If there is no
    121     # lang pref cookie in the request, add set-cookie to the response with the 
    122     # default value of 'en'.
    123     try:
    124       langName = self.request.cookies['android_developer_pref_lang']
    125     except KeyError:
    126       resetLangCookie = True
    127       #logging.info('==========================EXCEPTION: NO LANG COOKIE FOUND, USING [%s]', langName)
    128     logging.info('==========================REQ INIT name [%s] langName [%s] resetLangCookie [%s]', reqUri, langName, resetLangCookie)
    129 
    130     # Preprocess the req url. If it references a directory or the domain itself,
    131     # append '/index.html' to the url and 302 redirect. Otherwise, continue
    132     # processing the request below.
    133     name = self.PreprocessUrl(reqUri, langName)
    134     if name:
    135       # Do some prep for handling intl requests. Parse the url and validate
    136       # the intl/lang substring, extract the url lang code (urlLangName) and the
    137       # the uri that follows the intl/lang substring(contentUri)
    138       sections = name.split("/", 2)
    139       contentUri = 0
    140       isIntl = len(sections) > 1 and (sections[0] == "intl")
    141       if isIntl:
    142         isValidIntl = sections[1] in self.validLangs
    143         if isValidIntl:
    144           urlLangName = sections[1]
    145           contentUri = sections[2]
    146           logging.info('  Content URI is [%s]...', contentUri)
    147           if (urlLangName != langName) or (langName == 'en'):
    148             # if the lang code in the request is different from that in 
    149             # the cookie, or if the target lang is en, strip the 
    150             # intl/nn substring. It will later be redirected to
    151             # the user's preferred language url. 
    152             # logging.info('  Handling a MISMATCHED intl request')
    153             name = contentUri
    154             isStripped = True
    155             isValidIntl = False
    156             isIntl = False
    157 
    158       # Send for processing
    159       if self.isCleanUrl(name, langName, isValidIntl, isStripped):
    160         # handle a 'clean' request.
    161         # Try to form a response using the actual request url.
    162         # logging.info('  Request being handled as clean: [%s]', name)
    163         if not self.CreateResponse(name, langName, isValidIntl, resetLangCookie):
    164           # If CreateResponse returns False, there was no such document
    165           # in the intl/lang tree. Before going to 404, see if there is an
    166           # English-language version of the doc in the default
    167           # default tree and return it, else go to 404.
    168           self.CreateResponse(contentUri, langName, False, resetLangCookie)
    169 
    170       elif isIntl:
    171         # handle the case where we need to pass through an invalid intl req 
    172         # for processing (so as to get 404 as appropriate). This is needed
    173         # because intl urls are passed through clean and retried in English,
    174         # if necessary.
    175         # logging.info('  Handling an invalid intl request...')
    176         self.CreateResponse(name, langName, isValidIntl, resetLangCookie)
    177 
    178       else:
    179         # handle the case where we have a non-clean url (usually a non-intl
    180         # url) that we need to interpret in the context of any lang pref
    181         # that is set. Prepend an intl/lang string to the request url and
    182         # send it as a 302 redirect. After the redirect, the subsequent
    183         # request will be handled as a clean url.
    184         self.RedirToIntl(name, self.intlString, langName)
    185 
    186   def isCleanUrl(self, name, langName, isValidIntl, isStripped):
    187     """Determine whether to pass an incoming url straight to processing. 
    188 
    189        Args:
    190          name: The incoming URL
    191 
    192        Returns:
    193          boolean: Whether the URL should be sent straight to processing
    194     """
    195     # logging.info('  >>>> isCleanUrl name [%s] langName [%s] isValidIntl [%s]', name, langName, isValidIntl)
    196     if (langName == 'en' and not isStripped) or isValidIntl or not ('.html' in name) or (not isValidIntl and not langName):
    197       return True
    198 
    199   def PreprocessUrl(self, name, langName):
    200     """Any preprocessing work on the URL when it comes in.
    201 
    202     Put any work related to interpreting the incoming URL here. For example,
    203     this is used to redirect requests for a directory to the index.html file
    204     in that directory. Subclasses should override this method to do different
    205     preprocessing.
    206 
    207     Args:
    208       name: The incoming URL
    209 
    210     Returns:
    211       False if the request was redirected to '/index.html', or
    212       The processed URL, otherwise
    213     """
    214     # determine if this is a request for a directory
    215     final_path_segment = name
    216     final_slash_offset = name.rfind('/')
    217     if final_slash_offset != len(name) - 1:
    218       final_path_segment = name[final_slash_offset + 1:]
    219       if final_path_segment.find('.') == -1:
    220         name = ''.join([name, '/'])
    221 
    222     # if this is a directory or the domain itself, redirect to /index.html
    223     if not name or (name[len(name) - 1:] == '/'):
    224       uri = ''.join(['/', name, 'index.html'])
    225       # logging.info('--->PREPROCESSING REDIRECT [%s] to [%s] with langName [%s]', name, uri, langName)
    226       self.redirect(uri, False)
    227       return False
    228     else:
    229       return name
    230 
    231   def RedirToIntl(self, name, intlString, langName):
    232     """Redirect an incoming request to the appropriate intl uri.
    233 
    234        For non-en langName, builds the intl/lang string from a
    235        base (en) string and redirects (302) the request to look for 
    236        a version of the file in langName. For en langName, simply 
    237        redirects a stripped uri string (intl/nn removed).
    238 
    239     Args:
    240       name: The incoming, preprocessed URL
    241 
    242     Returns:
    243       The lang-specific URL
    244     """
    245     if not (langName == 'en'):
    246       builtIntlLangUri = ''.join([intlString, langName, '/', name, '?', self.request.query_string])
    247     else:
    248       builtIntlLangUri = name
    249     uri = ''.join(['/', builtIntlLangUri])
    250     logging.info('-->>REDIRECTING %s to  %s', name, uri)
    251     self.redirect(uri, False)
    252     return uri
    253 
    254   def CreateResponse(self, name, langName, isValidIntl, resetLangCookie):
    255     """Process the url and form a response, if appropriate.
    256 
    257        Attempts to retrieve the requested file (name) from cache, 
    258        negative cache, or store (zip) and form the response. 
    259        For intl requests that are not found (in the localized tree), 
    260        returns False rather than forming a response, so that
    261        the request can be retried with the base url (this is the 
    262        fallthrough to default language). 
    263 
    264        For requests that are found, forms the headers and
    265        adds the content to the response entity. If the request was
    266        for an intl (localized) url, also resets the language cookie 
    267        to the language specified in the url if needed, to ensure that 
    268        the client language and response data remain harmonious. 
    269 
    270     Args:
    271       name: The incoming, preprocessed URL
    272       langName: The language id. Used as necessary to reset the
    273                 language cookie in the response.
    274       isValidIntl: If present, indicates whether the request is
    275                    for a language-specific url
    276       resetLangCookie: Whether the response should reset the
    277                        language cookie to 'langName'
    278 
    279     Returns:
    280       True: A response was successfully created for the request
    281       False: No response was created.
    282     """
    283     # see if we have the page in the memcache
    284     logging.info('PROCESSING %s langName [%s] isValidIntl [%s] resetLang [%s]', 
    285       name, langName, isValidIntl, resetLangCookie)
    286     resp_data = self.GetFromCache(name)
    287     if resp_data is None:
    288       logging.info('  Cache miss for %s', name)
    289       resp_data = self.GetFromNegativeCache(name)
    290       if resp_data is None:
    291         resp_data = self.GetFromStore(name)
    292 
    293         # IF we have the file, put it in the memcache
    294         # ELSE put it in the negative cache
    295         if resp_data is not None:
    296           self.StoreOrUpdateInCache(name, resp_data)
    297         elif isValidIntl:
    298           # couldn't find the intl doc. Try to fall through to English.
    299           #logging.info('  Retrying with base uri...')
    300           return False
    301         else:
    302           logging.info('  Adding %s to negative cache, serving 404', name)
    303           self.StoreInNegativeCache(name)
    304           self.Write404Error()
    305           return True
    306       else:
    307         # found it in negative cache
    308         self.Write404Error()
    309         return True
    310 
    311     # found content from cache or store
    312     logging.info('FOUND CLEAN')
    313     if resetLangCookie:
    314       logging.info('  Resetting android_developer_pref_lang cookie to [%s]',
    315       langName)
    316       expireDate = time.mktime(localtime()) + 60 * 60 * 24 * 365 * 10
    317       self.response.headers.add_header('Set-Cookie', 
    318       'android_developer_pref_lang=%s; path=/; expires=%s' % 
    319       (langName, strftime("%a, %d %b %Y %H:%M:%S", localtime(expireDate))))
    320     mustRevalidate = False
    321     if ('.html' in name):
    322       # revalidate html files -- workaround for cache inconsistencies for 
    323       # negotiated responses
    324       mustRevalidate = True
    325       #logging.info('  Adding [Vary: Cookie] to response...')
    326       self.response.headers.add_header('Vary', 'Cookie')
    327     content_type, encoding = mimetypes.guess_type(name)
    328     if content_type:
    329       self.response.headers['Content-Type'] = content_type
    330       self.SetCachingHeaders(mustRevalidate)
    331       self.response.out.write(resp_data)
    332     elif (name == 'favicon.ico'):
    333       self.response.headers['Content-Type'] = 'image/x-icon'
    334       self.SetCachingHeaders(mustRevalidate)
    335       self.response.out.write(resp_data)
    336     elif name.endswith('.psd'):
    337       self.response.headers['Content-Type'] = 'application/octet-stream'
    338       self.SetCachingHeaders(mustRevalidate)
    339       self.response.out.write(resp_data)
    340     return True
    341 
    342   def GetFromStore(self, file_path):
    343     """Retrieve file from zip files.
    344 
    345     Get the file from the source, it must not have been in the memcache. If
    346     possible, we'll use the zip file index to quickly locate where the file
    347     should be found. (See MapToFileArchive documentation for assumptions about
    348     file ordering.) If we don't have an index or don't find the file where the
    349     index says we should, look through all the zip files to find it.
    350 
    351     Args:
    352       file_path: the file that we're looking for
    353 
    354     Returns:
    355       The contents of the requested file
    356     """
    357     resp_data = None
    358     file_itr = iter(self.zipfilenames)
    359 
    360     # check the index, if we have one, to see what archive the file is in
    361     archive_name = self.MapFileToArchive(file_path)
    362     if not archive_name:
    363       archive_name = file_itr.next()[0]
    364 
    365     while resp_data is None and archive_name:
    366       zip_archive = self.LoadZipFile(archive_name)
    367       if zip_archive:
    368 
    369         # we expect some lookups will fail, and that's okay, 404s will deal
    370         # with that
    371         try:
    372           resp_data = zip_archive.read(file_path)
    373         except (KeyError, RuntimeError), err:
    374           # no op
    375           x = False
    376         if resp_data is not None:
    377           logging.info('%s read from %s', file_path, archive_name)
    378           
    379       try:
    380         archive_name = file_itr.next()[0]
    381       except (StopIteration), err:
    382         archive_name = False
    383 
    384     return resp_data
    385 
    386   def LoadZipFile(self, zipfilename):
    387     """Convenience method to load zip file.
    388 
    389     Just a convenience method to load the zip file from the data store. This is
    390     useful if we ever want to change data stores and also as a means of
    391     dependency injection for testing. This method will look at our file cache
    392     first, and then load and cache the file if there's a cache miss
    393 
    394     Args:
    395       zipfilename: the name of the zip file to load
    396 
    397     Returns:
    398       The zip file requested, or None if there is an I/O error
    399     """
    400     zip_archive = None
    401     zip_archive = self.zipfile_cache.get(zipfilename)
    402     if zip_archive is None:
    403       try:
    404         zip_archive = zipfile.ZipFile(zipfilename)
    405         self.zipfile_cache[zipfilename] = zip_archive
    406       except (IOError, RuntimeError), err:
    407         logging.error('Can\'t open zipfile %s, cause: %s' % (zipfilename,
    408                                                              err))
    409     return zip_archive
    410 
    411   def MapFileToArchive(self, file_path):
    412     """Given a file name, determine what archive it should be in.
    413 
    414     This method makes two critical assumptions.
    415     (1) The zip files passed as an argument to the handler, if concatenated
    416         in that same order, would result in a total ordering
    417         of all the files. See (2) for ordering type.
    418     (2) Upper case letters before lower case letters. The traversal of a
    419         directory tree is depth first. A parent directory's files are added
    420         before the files of any child directories
    421 
    422     Args:
    423       file_path: the file to be mapped to an archive
    424 
    425     Returns:
    426       The name of the archive where we expect the file to be
    427     """
    428     num_archives = len(self.zipfilenames)
    429     while num_archives > 0:
    430       target = self.zipfilenames[num_archives - 1]
    431       if len(target) > 1:
    432         if self.CompareFilenames(target[1], file_path) >= 0:
    433           return target[0]
    434       num_archives -= 1
    435 
    436     return None
    437 
    438   def CompareFilenames(self, file1, file2):
    439     """Determines whether file1 is lexigraphically 'before' file2.
    440 
    441     WARNING: This method assumes that paths are output in a depth-first,
    442     with parent directories' files stored before childs'
    443 
    444     We say that file1 is lexigraphically before file2 if the last non-matching
    445     path segment of file1 is alphabetically before file2.
    446     
    447     Args:
    448       file1: the first file path
    449       file2: the second file path
    450 
    451     Returns:
    452       A positive number if file1 is before file2
    453       A negative number if file2 is before file1
    454       0 if filenames are the same
    455     """
    456     f1_segments = file1.split('/')
    457     f2_segments = file2.split('/')
    458 
    459     segment_ptr = 0
    460     while (segment_ptr < len(f1_segments) and
    461            segment_ptr < len(f2_segments) and
    462            f1_segments[segment_ptr] == f2_segments[segment_ptr]):
    463       segment_ptr += 1
    464 
    465     if len(f1_segments) == len(f2_segments):
    466 
    467       # we fell off the end, the paths much be the same
    468       if segment_ptr == len(f1_segments):
    469         return 0
    470 
    471       # we didn't fall of the end, compare the segments where they differ
    472       if f1_segments[segment_ptr] < f2_segments[segment_ptr]:
    473         return 1
    474       elif f1_segments[segment_ptr] > f2_segments[segment_ptr]:
    475         return -1
    476       else:
    477         return 0
    478 
    479       # the number of segments differs, we either mismatched comparing
    480       # directories, or comparing a file to a directory
    481     else:
    482 
    483       # IF we were looking at the last segment of one of the paths,
    484       # the one with fewer segments is first because files come before
    485       # directories
    486       # ELSE we just need to compare directory names
    487       if (segment_ptr + 1 == len(f1_segments) or
    488           segment_ptr + 1 == len(f2_segments)):
    489         return len(f2_segments) - len(f1_segments)
    490       else:
    491         if f1_segments[segment_ptr] < f2_segments[segment_ptr]:
    492           return 1
    493         elif f1_segments[segment_ptr] > f2_segments[segment_ptr]:
    494           return -1
    495         else:
    496           return 0
    497 
    498   def SetCachingHeaders(self, revalidate):
    499     """Set caching headers for the request."""
    500     max_age = self.MAX_AGE
    501     #self.response.headers['Expires'] = email.Utils.formatdate(
    502     #    time.time() + max_age, usegmt=True)
    503     cache_control = []
    504     if self.PUBLIC:
    505       cache_control.append('public')
    506     cache_control.append('max-age=%d' % max_age)
    507     if revalidate:
    508       cache_control.append('must-revalidate')
    509     self.response.headers['Cache-Control'] = ', '.join(cache_control)
    510 
    511   def GetFromCache(self, filename):
    512     """Get file from memcache, if available.
    513 
    514     Args:
    515       filename: The URL of the file to return
    516 
    517     Returns:
    518       The content of the file
    519     """
    520     return memcache.get('%s%s' % (self.CACHE_PREFIX, filename))
    521 
    522   def StoreOrUpdateInCache(self, filename, data):
    523     """Store data in the cache.
    524 
    525     Store a piece of data in the memcache. Memcache has a maximum item size of
    526     1*10^6 bytes. If the data is too large, fail, but log the failure. Future
    527     work will consider compressing the data before storing or chunking it
    528 
    529     Args:
    530       filename: the name of the file to store
    531       data: the data of the file
    532 
    533     Returns:
    534       None
    535     """
    536     try:
    537       if not memcache.add('%s%s' % (self.CACHE_PREFIX, filename), data):
    538         memcache.replace('%s%s' % (self.CACHE_PREFIX, filename), data)
    539     except (ValueError), err:
    540       logging.warning('Data size too large to cache\n%s' % err)
    541 
    542   def Write404Error(self):
    543     """Ouptut a simple 404 response."""
    544     self.error(404)
    545     self.response.out.write(
    546         ''.join(['<html><head><title>404: Not Found</title></head>',
    547                  '<body><b><h2>Error 404</h2><br/>',
    548                  'File not found</b></body></html>']))
    549 
    550   def StoreInNegativeCache(self, filename):
    551     """If a non-existant URL is accessed, cache this result as well.
    552 
    553     Future work should consider setting a maximum negative cache size to
    554     prevent it from from negatively impacting the real cache.
    555 
    556     Args:
    557       filename: URL to add ot negative cache
    558 
    559     Returns:
    560       None
    561     """
    562     memcache.add('%s%s' % (self.NEG_CACHE_PREFIX, filename), -1)
    563 
    564   def GetFromNegativeCache(self, filename):
    565     """Retrieve from negative cache.
    566 
    567     Args:
    568       filename: URL to retreive
    569 
    570     Returns:
    571       The file contents if present in the negative cache.
    572     """
    573     return memcache.get('%s%s' % (self.NEG_CACHE_PREFIX, filename))
    574 
    575 def main():
    576   application = webapp.WSGIApplication([('/([^/]+)/(.*)',
    577                                          MemcachedZipHandler)])
    578   util.run_wsgi_app(application)
    579 
    580 
# Start serving only when this file is run as a script, not when imported.
if __name__ == '__main__':
  main()
    583