Home | History | Annotate | Download | only in cloudstorage
      1 # Copyright 2012 Google Inc. All Rights Reserved.
      2 #
      3 # Licensed under the Apache License, Version 2.0 (the "License");
      4 # you may not use this file except in compliance with the License.
      5 # You may obtain a copy of the License at
      6 #
      7 #    http://www.apache.org/licenses/LICENSE-2.0
      8 #
      9 # Unless required by applicable law or agreed to in writing,
     10 # software distributed under the License is distributed on an
     11 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
     12 # either express or implied. See the License for the specific
     13 # language governing permissions and limitations under the License.
     14 
     15 """Helpers shared by cloudstorage_stub and cloudstorage_api."""
     16 
     17 
     18 
     19 
     20 
# Names exported by a star-import of this module. Helpers and constants whose
# names start with an underscore are intentionally left out.
__all__ = ['CS_XML_NS',
           'CSFileStat',
           'dt_str_to_posix',
           'local_api_url',
           'LOCAL_GCS_ENDPOINT',
           'local_run',
           'get_access_token',
           'get_metadata',
           'GCSFileStat',
           'http_time_to_posix',
           'memory_usage',
           'posix_time_to_http',
           'posix_to_dt_str',
           'set_access_token',
           'validate_options',
           'validate_bucket_name',
           'validate_bucket_path',
           'validate_file_path',
          ]
     40 
     41 
import calendar
import datetime
from email import utils as email_utils
import functools
import logging
import os
import re
     48 
     49 try:
     50   from google.appengine.api import runtime
     51 except ImportError:
     52   from google.appengine.api import runtime
     53 
     54 
     55 _GCS_BUCKET_REGEX_BASE = r'[a-z0-9\.\-_]{3,63}'
     56 _GCS_BUCKET_REGEX = re.compile(_GCS_BUCKET_REGEX_BASE + r'$')
     57 _GCS_BUCKET_PATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'$')
     58 _GCS_PATH_PREFIX_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'.*')
     59 _GCS_FULLPATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'/.*')
     60 _GCS_METADATA = ['x-goog-meta-',
     61                  'content-disposition',
     62                  'cache-control',
     63                  'content-encoding']
     64 _GCS_OPTIONS = _GCS_METADATA + ['x-goog-acl']
     65 CS_XML_NS = 'http://doc.s3.amazonaws.com/2006-03-01'
     66 LOCAL_GCS_ENDPOINT = '/_ah/gcs'
     67 _access_token = ''
     68 
     69 
     70 _MAX_GET_BUCKET_RESULT = 1000
     71 
     72 
     73 def set_access_token(access_token):
     74   """Set the shared access token to authenticate with Google Cloud Storage.
     75 
     76   When set, the library will always attempt to communicate with the
     77   real Google Cloud Storage with this token even when running on dev appserver.
     78   Note the token could expire so it's up to you to renew it.
     79 
     80   When absent, the library will automatically request and refresh a token
     81   on appserver, or when on dev appserver, talk to a Google Cloud Storage
     82   stub.
     83 
     84   Args:
     85     access_token: you can get one by run 'gsutil -d ls' and copy the
     86       str after 'Bearer'.
     87   """
     88   global _access_token
     89   _access_token = access_token
     90 
     91 
     92 def get_access_token():
     93   """Returns the shared access token."""
     94   return _access_token
     95 
     96 
     97 class GCSFileStat(object):
     98   """Container for GCS file stat."""
     99 
    100   def __init__(self,
    101                filename,
    102                st_size,
    103                etag,
    104                st_ctime,
    105                content_type=None,
    106                metadata=None,
    107                is_dir=False):
    108     """Initialize.
    109 
    110     For files, the non optional arguments are always set.
    111     For directories, only filename and is_dir is set.
    112 
    113     Args:
    114       filename: a Google Cloud Storage filename of form '/bucket/filename'.
    115       st_size: file size in bytes. long compatible.
    116       etag: hex digest of the md5 hash of the file's content. str.
    117       st_ctime: posix file creation time. float compatible.
    118       content_type: content type. str.
    119       metadata: a str->str dict of user specified options when creating
    120         the file. Possible keys are x-goog-meta-, content-disposition,
    121         content-encoding, and cache-control.
    122       is_dir: True if this represents a directory. False if this is a real file.
    123     """
    124     self.filename = filename
    125     self.is_dir = is_dir
    126     self.st_size = None
    127     self.st_ctime = None
    128     self.etag = None
    129     self.content_type = content_type
    130     self.metadata = metadata
    131 
    132     if not is_dir:
    133       self.st_size = long(st_size)
    134       self.st_ctime = float(st_ctime)
    135       if etag[0] == '"' and etag[-1] == '"':
    136         etag = etag[1:-1]
    137       self.etag = etag
    138 
    139   def __repr__(self):
    140     if self.is_dir:
    141       return '(directory: %s)' % self.filename
    142 
    143     return (
    144         '(filename: %(filename)s, st_size: %(st_size)s, '
    145         'st_ctime: %(st_ctime)s, etag: %(etag)s, '
    146         'content_type: %(content_type)s, '
    147         'metadata: %(metadata)s)' %
    148         dict(filename=self.filename,
    149              st_size=self.st_size,
    150              st_ctime=self.st_ctime,
    151              etag=self.etag,
    152              content_type=self.content_type,
    153              metadata=self.metadata))
    154 
    155   def __cmp__(self, other):
    156     if not isinstance(other, self.__class__):
    157       raise ValueError('Argument to cmp must have the same type. '
    158                        'Expect %s, got %s', self.__class__.__name__,
    159                        other.__class__.__name__)
    160     if self.filename > other.filename:
    161       return 1
    162     elif self.filename < other.filename:
    163       return -1
    164     return 0
    165 
    166   def __hash__(self):
    167     if self.etag:
    168       return hash(self.etag)
    169     return hash(self.filename)
    170 
    171 
    172 CSFileStat = GCSFileStat
    173 
    174 
    175 def get_metadata(headers):
    176   """Get user defined options from HTTP response headers."""
    177   return dict((k, v) for k, v in headers.iteritems()
    178               if any(k.lower().startswith(valid) for valid in _GCS_METADATA))
    179 
    180 
    181 def validate_bucket_name(name):
    182   """Validate a Google Storage bucket name.
    183 
    184   Args:
    185     name: a Google Storage bucket name with no prefix or suffix.
    186 
    187   Raises:
    188     ValueError: if name is invalid.
    189   """
    190   _validate_path(name)
    191   if not _GCS_BUCKET_REGEX.match(name):
    192     raise ValueError('Bucket should be 3-63 characters long using only a-z,'
    193                      '0-9, underscore, dash or dot but got %s' % name)
    194 
    195 
    196 def validate_bucket_path(path):
    197   """Validate a Google Cloud Storage bucket path.
    198 
    199   Args:
    200     path: a Google Storage bucket path. It should have form '/bucket'.
    201 
    202   Raises:
    203     ValueError: if path is invalid.
    204   """
    205   _validate_path(path)
    206   if not _GCS_BUCKET_PATH_REGEX.match(path):
    207     raise ValueError('Bucket should have format /bucket '
    208                      'but got %s' % path)
    209 
    210 
    211 def validate_file_path(path):
    212   """Validate a Google Cloud Storage file path.
    213 
    214   Args:
    215     path: a Google Storage file path. It should have form '/bucket/filename'.
    216 
    217   Raises:
    218     ValueError: if path is invalid.
    219   """
    220   _validate_path(path)
    221   if not _GCS_FULLPATH_REGEX.match(path):
    222     raise ValueError('Path should have format /bucket/filename '
    223                      'but got %s' % path)
    224 
    225 
    226 def _process_path_prefix(path_prefix):
    227   """Validate and process a Google Cloud Stoarge path prefix.
    228 
    229   Args:
    230     path_prefix: a Google Cloud Storage path prefix of format '/bucket/prefix'
    231       or '/bucket/' or '/bucket'.
    232 
    233   Raises:
    234     ValueError: if path is invalid.
    235 
    236   Returns:
    237     a tuple of /bucket and prefix. prefix can be None.
    238   """
    239   _validate_path(path_prefix)
    240   if not _GCS_PATH_PREFIX_REGEX.match(path_prefix):
    241     raise ValueError('Path prefix should have format /bucket, /bucket/, '
    242                      'or /bucket/prefix but got %s.' % path_prefix)
    243   bucket_name_end = path_prefix.find('/', 1)
    244   bucket = path_prefix
    245   prefix = None
    246   if bucket_name_end != -1:
    247     bucket = path_prefix[:bucket_name_end]
    248     prefix = path_prefix[bucket_name_end + 1:] or None
    249   return bucket, prefix
    250 
    251 
    252 def _validate_path(path):
    253   """Basic validation of Google Storage paths.
    254 
    255   Args:
    256     path: a Google Storage path. It should have form '/bucket/filename'
    257       or '/bucket'.
    258 
    259   Raises:
    260     ValueError: if path is invalid.
    261     TypeError: if path is not of type basestring.
    262   """
    263   if not path:
    264     raise ValueError('Path is empty')
    265   if not isinstance(path, basestring):
    266     raise TypeError('Path should be a string but is %s (%s).' %
    267                     (path.__class__, path))
    268 
    269 
    270 def validate_options(options):
    271   """Validate Google Cloud Storage options.
    272 
    273   Args:
    274     options: a str->basestring dict of options to pass to Google Cloud Storage.
    275 
    276   Raises:
    277     ValueError: if option is not supported.
    278     TypeError: if option is not of type str or value of an option
    279       is not of type basestring.
    280   """
    281   if not options:
    282     return
    283 
    284   for k, v in options.iteritems():
    285     if not isinstance(k, str):
    286       raise TypeError('option %r should be a str.' % k)
    287     if not any(k.lower().startswith(valid) for valid in _GCS_OPTIONS):
    288       raise ValueError('option %s is not supported.' % k)
    289     if not isinstance(v, basestring):
    290       raise TypeError('value %r for option %s should be of type basestring.' %
    291                       (v, k))
    292 
    293 
    294 def http_time_to_posix(http_time):
    295   """Convert HTTP time format to posix time.
    296 
    297   See http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1
    298   for http time format.
    299 
    300   Args:
    301     http_time: time in RFC 2616 format. e.g.
    302       "Mon, 20 Nov 1995 19:12:08 GMT".
    303 
    304   Returns:
    305     A float of secs from unix epoch.
    306   """
    307   if http_time is not None:
    308     return email_utils.mktime_tz(email_utils.parsedate_tz(http_time))
    309 
    310 
    311 def posix_time_to_http(posix_time):
    312   """Convert posix time to HTML header time format.
    313 
    314   Args:
    315     posix_time: unix time.
    316 
    317   Returns:
    318     A datatime str in RFC 2616 format.
    319   """
    320   if posix_time:
    321     return email_utils.formatdate(posix_time, usegmt=True)
    322 
    323 
    324 _DT_FORMAT = '%Y-%m-%dT%H:%M:%S'
    325 
    326 
    327 def dt_str_to_posix(dt_str):
    328   """format str to posix.
    329 
    330   datetime str is of format %Y-%m-%dT%H:%M:%S.%fZ,
    331   e.g. 2013-04-12T00:22:27.978Z. According to ISO 8601, T is a separator
    332   between date and time when they are on the same line.
    333   Z indicates UTC (zero meridian).
    334 
    335   A pointer: http://www.cl.cam.ac.uk/~mgk25/iso-time.html
    336 
    337   This is used to parse LastModified node from GCS's GET bucket XML response.
    338 
    339   Args:
    340     dt_str: A datetime str.
    341 
    342   Returns:
    343     A float of secs from unix epoch. By posix definition, epoch is midnight
    344     1970/1/1 UTC.
    345   """
    346   parsable, _ = dt_str.split('.')
    347   dt = datetime.datetime.strptime(parsable, _DT_FORMAT)
    348   return calendar.timegm(dt.utctimetuple())
    349 
    350 
    351 def posix_to_dt_str(posix):
    352   """Reverse of str_to_datetime.
    353 
    354   This is used by GCS stub to generate GET bucket XML response.
    355 
    356   Args:
    357     posix: A float of secs from unix epoch.
    358 
    359   Returns:
    360     A datetime str.
    361   """
    362   dt = datetime.datetime.utcfromtimestamp(posix)
    363   dt_str = dt.strftime(_DT_FORMAT)
    364   return dt_str + '.000Z'
    365 
    366 
    367 def local_run():
    368   """Whether we should hit GCS dev appserver stub."""
    369   server_software = os.environ.get('SERVER_SOFTWARE')
    370   if server_software is None:
    371     return True
    372   if 'remote_api' in server_software:
    373     return False
    374   if server_software.startswith(('Development', 'testutil')):
    375     return True
    376   return False
    377 
    378 
    379 def local_api_url():
    380   """Return URL for GCS emulation on dev appserver."""
    381   return 'http://%s%s' % (os.environ.get('HTTP_HOST'), LOCAL_GCS_ENDPOINT)
    382 
    383 
    384 def memory_usage(method):
    385   """Log memory usage before and after a method."""
    386   def wrapper(*args, **kwargs):
    387     logging.info('Memory before method %s is %s.',
    388                  method.__name__, runtime.memory_usage().current())
    389     result = method(*args, **kwargs)
    390     logging.info('Memory after method %s is %s',
    391                  method.__name__, runtime.memory_usage().current())
    392     return result
    393   return wrapper
    394 
    395 
    396 def _add_ns(tagname):
    397   return '{%(ns)s}%(tag)s' % {'ns': CS_XML_NS,
    398                               'tag': tagname}
    399 
    400 
# Namespace-qualified ('{CS_XML_NS}tag') names of XML elements.
# NOTE(review): presumably the elements of the GET bucket XML response that
# callers look up -- none of them is used in this module, confirm there.
_T_CONTENTS = _add_ns('Contents')
_T_LAST_MODIFIED = _add_ns('LastModified')
_T_ETAG = _add_ns('ETag')
_T_KEY = _add_ns('Key')
_T_SIZE = _add_ns('Size')
_T_PREFIX = _add_ns('Prefix')
_T_COMMON_PREFIXES = _add_ns('CommonPrefixes')
_T_NEXT_MARKER = _add_ns('NextMarker')
_T_IS_TRUNCATED = _add_ns('IsTruncated')
    410