#!/usr/bin/env python
#
# Copyright 2009 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A class to serve pages from zip files and use memcache for performance.

This contains a class and a function to create an anonymous instance of the
class to serve HTTP GET requests. Memcache is used to increase response speed
and lower processing cycles used in serving. Credit to Guido van Rossum and
his implementation of zipserve which served as a reference as I wrote this.

  MemcachedZipHandler: Class that serves request
  create_handler: method to create instance of MemcachedZipHandler
"""

__author__ = 'jmatt (at] google.com (Justin Mattson)'

import email.utils
import logging
import mimetypes
import time
import zipfile

from google.appengine.api import memcache
from google.appengine.ext import webapp
from google.appengine.ext.webapp import util
from time import localtime, strftime

# Name of the cookie that carries the visitor's preferred language.
_LANG_COOKIE_NAME = 'android_developer_pref_lang'
# Lifetime of the language cookie, in seconds (ten 365-day years).
_LANG_COOKIE_LIFETIME = 60 * 60 * 24 * 365 * 10


def create_handler(zip_files, max_age=None, public=None):
    """Factory method to create a MemcachedZipHandler subclass.

    Args:
      zip_files: A non-empty list of file names, or a list of lists of
          [file name, first member of that archive]. See
          MemcachedZipHandler.MapFileToArchive for how the second form is
          used. The list passed in is not modified.
      max_age: The maximum client-side cache lifetime in seconds; None keeps
          the MemcachedZipHandler default.
      public: Whether responses should be declared public in the client-side
          cache; None keeps the MemcachedZipHandler default.

    Returns:
      A MemcachedZipHandler subclass bound to zip_files, suitable for use in
      a webapp URL mapping.

    Raises:
      ValueError: if the zip_files argument is empty or not a list
    """
    if not zip_files or not isinstance(zip_files, list):
        raise ValueError('File name arguments must be a list')

    # Normalise the plain-list form to the list-of-lists form so the handler
    # only ever deals with one shape. A new list is built rather than
    # rewriting the caller's list in place.
    archive_specs = [
        entry if isinstance(entry, list) else [entry] for entry in zip_files
    ]

    class HandlerWrapper(MemcachedZipHandler):
        """MemcachedZipHandler bound to the archives given to the factory."""

        # The overrides are class attributes so that SetCachingHeaders, which
        # reads self.MAX_AGE / self.PUBLIC, actually sees them.
        if max_age is not None:
            MAX_AGE = max_age
        if public is not None:
            PUBLIC = public

        def get(self, name):
            self.zipfilenames = archive_specs
            self.TrueGet(name)

    return HandlerWrapper


class MemcachedZipHandler(webapp.RequestHandler):
    """Handles get requests for a given URL.

    Serves a GET request from a series of zip files. As files are served they
    are put into memcache, which is much faster than retrieving them from the
    zip source file again. It also uses considerably fewer CPU cycles.

    Instances expect a `zipfilenames` attribute (a list of
    [archive name, optional first member] lists); create_handler() supplies
    it through the wrapper subclass it returns.
    """
    zipfile_cache = {}                # class cache of source zip files
    MAX_AGE = 600                     # max client-side cache lifetime
    PUBLIC = True                     # public cache setting
    CACHE_PREFIX = 'cache://'         # memcache key prefix for actual URLs
    NEG_CACHE_PREFIX = 'noncache://'  # memcache key prefix for missing URLs
    intlString = 'intl/'
    validLangs = ['en', 'de', 'es', 'fr', 'it', 'ja', 'zh-CN', 'zh-TW']

    def TrueGet(self, reqUri):
        """The top-level entry point to serving requests.

        Called 'True' get because it does the work when called from the
        wrapper class' get method. Some logic is applied to the request to
        serve files from an intl/<lang>/... directory or fall through to the
        default language.

        Args:
          reqUri: URL requested

        Returns:
          None
        """
        langName = 'en'
        resetLangCookie = False
        isValidIntl = False
        isStripped = False

        # Try to retrieve the user's lang pref from the cookie. If there is
        # no lang pref cookie in the request, add set-cookie to the response
        # with the default value of 'en'.
        try:
            langName = self.request.cookies[_LANG_COOKIE_NAME]
        except KeyError:
            resetLangCookie = True

        # The cookie is client-controlled and ends up in redirect targets and
        # in Set-Cookie. An unsupported value would redirect every .html
        # request to an intl/<lang>/ tree that does not exist, so treat it
        # like a missing cookie.
        if langName not in self.validLangs:
            langName = 'en'
            resetLangCookie = True

        logging.info(
            '==========================REQ INIT name [%s] langName [%s] '
            'resetLangCookie [%s]', reqUri, langName, resetLangCookie)

        # Preprocess the req url. If it references a directory or the domain
        # itself, append '/index.html' to the url and 302 redirect.
        # Otherwise, continue processing the request below.
        name = self.PreprocessUrl(reqUri, langName)
        if not name:
            return

        # Do some prep for handling intl requests. Parse the url and validate
        # the intl/lang substring, extract the url lang code and the uri that
        # follows the intl/lang substring (contentUri).
        sections = name.split('/', 2)
        contentUri = 0
        isIntl = len(sections) > 1 and sections[0] == 'intl'
        if isIntl:
            # Require the third section too, so indexing it below is safe
            # even if a subclass changes PreprocessUrl.
            isValidIntl = len(sections) > 2 and sections[1] in self.validLangs
            if isValidIntl:
                urlLangName = sections[1]
                contentUri = sections[2]
                logging.info(' Content URI is [%s]...', contentUri)
                if urlLangName != langName or langName == 'en':
                    # If the lang code in the request is different from that
                    # in the cookie, or if the target lang is en, strip the
                    # intl/nn substring. It will later be redirected to the
                    # user's preferred language url.
                    name = contentUri
                    isStripped = True
                    isValidIntl = False
                    isIntl = False

        # Send for processing
        if self.isCleanUrl(name, langName, isValidIntl, isStripped):
            # Handle a 'clean' request: try to form a response using the
            # actual request url.
            if not self.CreateResponse(name, langName, isValidIntl,
                                       resetLangCookie):
                # CreateResponse returned False: there was no such document
                # in the intl/lang tree. Before going to 404, see if there is
                # an English-language version of the doc in the default tree
                # and return it, else go to 404.
                self.CreateResponse(contentUri, langName, False,
                                    resetLangCookie)
        elif isIntl:
            # Pass an invalid intl request through for processing (so as to
            # get 404 as appropriate). This is needed because intl urls are
            # passed through clean and retried in English, if necessary.
            self.CreateResponse(name, langName, isValidIntl, resetLangCookie)
        else:
            # A non-clean url (usually a non-intl url) that must be
            # interpreted in the context of the lang pref that is set.
            # Prepend an intl/lang string to the request url and send it as a
            # 302 redirect. After the redirect, the subsequent request will
            # be handled as a clean url.
            self.RedirToIntl(name, self.intlString, langName)

    def isCleanUrl(self, name, langName, isValidIntl, isStripped):
        """Determine whether to pass an incoming url straight to processing.

        A url is clean when any of these holds: the language is 'en' and the
        url was not stripped of an intl/<lang>/ prefix; it is a valid intl
        url; it does not contain '.html'; or no language is set.

        Args:
          name: The incoming URL
          langName: The language id in effect for the request
          isValidIntl: Whether the URL is a valid intl/<lang>/ URL
          isStripped: Whether an intl/<lang>/ prefix was removed from name

        Returns:
          boolean: Whether the URL should be sent straight to processing
        """
        return bool((langName == 'en' and not isStripped)
                    or isValidIntl
                    or '.html' not in name
                    or not langName)

    def PreprocessUrl(self, name, langName):
        """Any preprocessing work on the URL when it comes in.

        Put any work related to interpreting the incoming URL here. For
        example, this is used to redirect requests for a directory to the
        index.html file in that directory. Subclasses should override this
        method to do different preprocessing.

        Args:
          name: The incoming URL
          langName: The language id in effect (unused here)

        Returns:
          False if the request was redirected to '/index.html', or
          The processed URL, otherwise
        """
        # A final path segment without a '.' is taken to be a directory.
        # The empty name (the domain itself) is handled by the check below.
        if name and not name.endswith('/'):
            final_path_segment = name[name.rfind('/') + 1:]
            if '.' not in final_path_segment:
                name = ''.join([name, '/'])

        # if this is a directory or the domain itself, redirect to
        # /index.html
        if not name or name.endswith('/'):
            uri = ''.join(['/', name, 'index.html'])
            self.redirect(uri, False)
            return False
        return name

    def RedirToIntl(self, name, intlString, langName):
        """Redirect an incoming request to the appropriate intl uri.

        For non-en langName, builds the intl/lang string from a base (en)
        string and redirects (302) the request to look for a version of the
        file in langName, carrying the query string along when there is one.
        For en langName, simply redirects a stripped uri string (intl/nn
        removed).

        Args:
          name: The incoming, preprocessed URL
          intlString: The path prefix for localized trees (e.g. 'intl/')
          langName: The language id to redirect to

        Returns:
          The lang-specific URL
        """
        if langName == 'en':
            builtIntlLangUri = name
        else:
            builtIntlLangUri = ''.join([intlString, langName, '/', name])
            # Only add '?' when there is a query; otherwise the redirect
            # target would end in a dangling '?'.
            query_string = self.request.query_string
            if query_string:
                builtIntlLangUri = '?'.join([builtIntlLangUri, query_string])
        uri = ''.join(['/', builtIntlLangUri])
        logging.info('-->>REDIRECTING %s to %s', name, uri)
        self.redirect(uri, False)
        return uri

    def CreateResponse(self, name, langName, isValidIntl, resetLangCookie):
        """Process the url and form a response, if appropriate.

        Attempts to retrieve the requested file (name) from cache, negative
        cache, or store (zip) and form the response. For intl requests that
        are not found (in the localized tree), returns False rather than
        forming a response, so that the request can be retried with the base
        url (this is the fallthrough to default language).

        For requests that are found, forms the headers and adds the content
        to the response entity. When resetLangCookie is set, the language
        cookie is (re)set to langName.

        Args:
          name: The incoming, preprocessed URL
          langName: The language id. Used as necessary to reset the
              language cookie in the response.
          isValidIntl: Whether the request is for a language-specific url
          resetLangCookie: Whether the response should reset the
              language cookie to 'langName'

        Returns:
          True: A response (content or 404) was created for the request
          False: No response was created.
        """
        logging.info(
            'PROCESSING %s langName [%s] isValidIntl [%s] resetLang [%s]',
            name, langName, isValidIntl, resetLangCookie)

        # see if we have the page in the memcache
        resp_data = self.GetFromCache(name)
        if resp_data is None:
            logging.info(' Cache miss for %s', name)
            if self.GetFromNegativeCache(name) is not None:
                # found it in negative cache
                self.Write404Error()
                return True

            resp_data = self.GetFromStore(name)
            if resp_data is None:
                if isValidIntl:
                    # couldn't find the intl doc; let the caller fall
                    # through to English. Not negative-cached, because a
                    # negative hit answers 404 instead of falling through.
                    return False
                logging.info(' Adding %s to negative cache, serving 404',
                             name)
                self.StoreInNegativeCache(name)
                self.Write404Error()
                return True
            self.StoreOrUpdateInCache(name, resp_data)

        # found content from cache or store
        logging.info('FOUND CLEAN')
        if resetLangCookie:
            logging.info(
                ' Resetting android_developer_pref_lang cookie to [%s]',
                langName)
            # Cookie expiry dates are interpreted as UTC, so format the
            # timestamp from gmtime and say so explicitly.
            expireDate = time.gmtime(time.time() + _LANG_COOKIE_LIFETIME)
            self.response.headers.add_header(
                'Set-Cookie',
                '%s=%s; path=/; expires=%s GMT' % (
                    _LANG_COOKIE_NAME, langName,
                    strftime('%a, %d %b %Y %H:%M:%S', expireDate)))

        # revalidate html files -- workaround for cache inconsistencies for
        # negotiated responses
        mustRevalidate = '.html' in name
        if mustRevalidate:
            self.response.headers.add_header('Vary', 'Cookie')

        content_type, _ = mimetypes.guess_type(name)
        if not content_type:
            # Unknown extension: still send the body. Without this fallback
            # such files would be answered with an empty 200.
            if name == 'favicon.ico':
                content_type = 'image/x-icon'
            else:
                content_type = 'application/octet-stream'
        self.response.headers['Content-Type'] = content_type
        self.SetCachingHeaders(mustRevalidate)
        self.response.out.write(resp_data)
        return True

    def GetFromStore(self, file_path):
        """Retrieve file from zip files.

        Get the file from the source, it must not have been in the memcache.
        If possible, we'll use the zip file index to quickly locate where the
        file should be found. (See MapFileToArchive documentation for
        assumptions about file ordering.) If we don't have an index or don't
        find the file where the index says we should, look through all the
        zip files to find it.

        Args:
          file_path: the file that we're looking for

        Returns:
          The contents of the requested file, or None if no archive has it
        """
        # check the index, if we have one, to see what archive the file is
        # in; try that archive first and every other archive after it.
        indexed_archive = self.MapFileToArchive(file_path)
        candidates = [indexed_archive] if indexed_archive else []
        for entry in self.zipfilenames:
            archive_name = entry[0]
            if archive_name and archive_name != indexed_archive:
                candidates.append(archive_name)

        for archive_name in candidates:
            zip_archive = self.LoadZipFile(archive_name)
            if zip_archive is None:
                continue
            try:
                resp_data = zip_archive.read(file_path)
            except (KeyError, RuntimeError):
                # we expect some lookups will fail, and that's okay, 404s
                # will deal with that
                continue
            logging.info('%s read from %s', file_path, archive_name)
            return resp_data

        return None

    def LoadZipFile(self, zipfilename):
        """Convenience method to load zip file.

        Just a convenience method to load the zip file from the data store.
        This is useful if we ever want to change data stores and also as a
        means of dependency injection for testing. This method will look at
        our file cache first, and then load and cache the file if there's a
        cache miss.

        Args:
          zipfilename: the name of the zip file to load

        Returns:
          The zip file requested, or None if it cannot be opened (I/O error
          or not a valid zip archive)
        """
        zip_archive = self.zipfile_cache.get(zipfilename)
        if zip_archive is None:
            try:
                zip_archive = zipfile.ZipFile(zipfilename)
            except (IOError, RuntimeError, zipfile.BadZipfile) as err:
                logging.error('Can\'t open zipfile %s, cause: %s',
                              zipfilename, err)
            else:
                self.zipfile_cache[zipfilename] = zip_archive
        return zip_archive

    def MapFileToArchive(self, file_path):
        """Given a file name, determine what archive it should be in.

        This method makes two critical assumptions.
        (1) The zip files passed as an argument to the handler, if
            concatenated in that same order, would result in a total ordering
            of all the files. See (2) for ordering type.
        (2) Upper case letters before lower case letters. The traversal of a
            directory tree is depth first. A parent directory's files are
            added before the files of any child directories.

        Args:
          file_path: the file to be mapped to an archive

        Returns:
          The name of the archive where we expect the file to be, or None if
          no archive entry carries a first-member name that is at or before
          file_path
        """
        # Walk the archives last to first; the first one whose first member
        # is at or before file_path is where the file should live.
        for target in reversed(self.zipfilenames):
            if (len(target) > 1 and
                    self.CompareFilenames(target[1], file_path) >= 0):
                return target[0]
        return None

    def CompareFilenames(self, file1, file2):
        """Determines whether file1 is lexicographically 'before' file2.

        WARNING: This method assumes that paths are output in a depth-first,
        with parent directories' files stored before childs'

        We say that file1 is lexicographically before file2 if the last
        non-matching path segment of file1 is alphabetically before file2.

        Args:
          file1: the first file path
          file2: the second file path

        Returns:
          A positive number if file1 is before file2
          A negative number if file2 is before file1
          0 if filenames are the same
        """
        f1_segments = file1.split('/')
        f2_segments = file2.split('/')
        shared_length = min(len(f1_segments), len(f2_segments))

        segment_ptr = 0
        while (segment_ptr < shared_length and
               f1_segments[segment_ptr] == f2_segments[segment_ptr]):
            segment_ptr += 1

        # We ran off the end of at least one path: the paths are identical
        # (difference 0) or one is a prefix of the other, in which case the
        # shorter one comes first. Returning here also avoids indexing past
        # the end of the shorter path below.
        if segment_ptr == shared_length:
            return len(f2_segments) - len(f1_segments)

        # The paths differ at segment_ptr. If that is the last segment of
        # either path and the depths differ, a file is being compared to a
        # directory: the path with fewer segments is first because files come
        # before directories.
        if (len(f1_segments) != len(f2_segments) and
                (segment_ptr + 1 == len(f1_segments) or
                 segment_ptr + 1 == len(f2_segments))):
            return len(f2_segments) - len(f1_segments)

        # Otherwise compare the names of the segments where they differ.
        if f1_segments[segment_ptr] < f2_segments[segment_ptr]:
            return 1
        return -1

    def SetCachingHeaders(self, revalidate):
        """Set the Cache-Control header on the response.

        Only Cache-Control is written; no Expires header is sent.

        Args:
          revalidate: Whether to add 'must-revalidate' to the header

        Returns:
          None
        """
        cache_control = []
        if self.PUBLIC:
            cache_control.append('public')
        cache_control.append('max-age=%d' % self.MAX_AGE)
        if revalidate:
            cache_control.append('must-revalidate')
        self.response.headers['Cache-Control'] = ', '.join(cache_control)

    def GetFromCache(self, filename):
        """Get file from memcache, if available.

        Args:
          filename: The URL of the file to return

        Returns:
          The content of the file, or None on a cache miss
        """
        return memcache.get('%s%s' % (self.CACHE_PREFIX, filename))

    def StoreOrUpdateInCache(self, filename, data):
        """Store data in the cache.

        Store a piece of data in the memcache. Memcache has a maximum item
        size of 1*10^6 bytes. If the data is too large, fail, but log the
        failure. Future work will consider compressing the data before
        storing or chunking it.

        Args:
          filename: the name of the file to store
          data: the data of the file

        Returns:
          None
        """
        try:
            # set() writes whether or not the key already exists, in one
            # call instead of add() followed by replace().
            memcache.set('%s%s' % (self.CACHE_PREFIX, filename), data)
        except ValueError as err:
            logging.warning('Data size too large to cache\n%s', err)

    def Write404Error(self):
        """Output a simple 404 response."""
        self.error(404)
        self.response.out.write(
            ''.join(['<html><head><title>404: Not Found</title></head>',
                     '<body><b><h2>Error 404</h2><br/>',
                     'File not found</b></body></html>']))

    def StoreInNegativeCache(self, filename):
        """If a non-existent URL is accessed, cache this result as well.

        Future work should consider setting a maximum negative cache size to
        prevent it from negatively impacting the real cache.

        Args:
          filename: URL to add to negative cache

        Returns:
          None
        """
        memcache.add('%s%s' % (self.NEG_CACHE_PREFIX, filename), -1)

    def GetFromNegativeCache(self, filename):
        """Retrieve from negative cache.

        Args:
          filename: URL to retrieve

        Returns:
          The marker stored by StoreInNegativeCache if the URL is in the
          negative cache, otherwise None.
        """
        return memcache.get('%s%s' % (self.NEG_CACHE_PREFIX, filename))


def main():
    """Run a WSGI application that maps URLs to MemcachedZipHandler.

    NOTE(review): the bare MemcachedZipHandler defines no get() and no
    zipfilenames; real deployments presumably map URLs to the class returned
    by create_handler() instead -- confirm before relying on this entry point.
    """
    application = webapp.WSGIApplication([('/([^/]+)/(.*)',
                                           MemcachedZipHandler)])
    util.run_wsgi_app(application)


if __name__ == '__main__':
    main()