Home | History | Annotate | Download | only in cloudstorage
      1 # Copyright 2012 Google Inc. All Rights Reserved.
      2 #
      3 # Licensed under the Apache License, Version 2.0 (the "License");
      4 # you may not use this file except in compliance with the License.
      5 # You may obtain a copy of the License at
      6 #
      7 #    http://www.apache.org/licenses/LICENSE-2.0
      8 #
      9 # Unless required by applicable law or agreed to in writing,
     10 # software distributed under the License is distributed on an
     11 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
     12 # either express or implied. See the License for the specific
     13 # language governing permissions and limitations under the License.
     14 
     15 """Helpers shared by cloudstorage_stub and cloudstorage_api."""
     16 
     17 
     18 
     19 
     20 
# Names exported by this module for `from ... import *`.
__all__ = ['CS_XML_NS',
           'CSFileStat',
           'dt_str_to_posix',
           'local_api_url',
           'LOCAL_GCS_ENDPOINT',
           'local_run',
           'get_access_token',
           'get_stored_content_length',
           'get_metadata',
           'GCSFileStat',
           'http_time_to_posix',
           'memory_usage',
           'posix_time_to_http',
           'posix_to_dt_str',
           'set_access_token',
           'validate_options',
           'validate_bucket_name',
           'validate_bucket_path',
           'validate_file_path',
          ]
     41 
     42 
     43 import calendar
     44 import datetime
     45 from email import utils as email_utils
     46 import logging
     47 import os
     48 import re
     49 
     50 try:
     51   from google.appengine.api import runtime
     52 except ImportError:
     53   from google.appengine.api import runtime
     54 
     55 
# A bucket name: 3 to 63 characters drawn from lowercase letters, digits,
# dot, dash and underscore.
_GCS_BUCKET_REGEX_BASE = r'[a-z0-9\.\-_]{3,63}'
# The patterns below are used with .match(), so they are anchored at the
# start of the string.
# A bare bucket name, e.g. 'bucket'.
_GCS_BUCKET_REGEX = re.compile(_GCS_BUCKET_REGEX_BASE + r'$')
# A bucket path, e.g. '/bucket'.
_GCS_BUCKET_PATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'$')
# '/' plus bucket characters followed by anything (no '/' required after the
# bucket part).
_GCS_PATH_PREFIX_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'.*')
# '/bucket/' followed by anything, e.g. '/bucket/filename'.
_GCS_FULLPATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'/.*')
# Lowercase header-name prefixes that get_metadata() keeps as user metadata.
_GCS_METADATA = ['x-goog-meta-',
                 'content-disposition',
                 'cache-control',
                 'content-encoding']
# Option-name prefixes accepted by validate_options().
_GCS_OPTIONS = _GCS_METADATA + ['x-goog-acl']
# XML namespace used by _add_ns() to qualify tag names.
CS_XML_NS = 'http://doc.s3.amazonaws.com/2006-03-01'
# URL path that local_api_url() appends to the host.
LOCAL_GCS_ENDPOINT = '/_ah/gcs'
# Shared access token; written by set_access_token(), read by
# get_access_token(). Empty string means no token has been set.
_access_token = ''


# NOTE(review): not referenced in the visible part of this file; presumably
# the maximum number of results per GET bucket request -- confirm with users.
_MAX_GET_BUCKET_RESULT = 1000
     72 
     73 
def set_access_token(access_token):
  """Set the shared access token to authenticate with Google Cloud Storage.

  When set, the library will always attempt to communicate with the
  real Google Cloud Storage with this token even when running on dev appserver.
  Note the token could expire so it's up to you to renew it.

  When absent, the library will automatically request and refresh a token
  on appserver, or when on dev appserver, talk to a Google Cloud Storage
  stub.

  This function only stores the value in the module-level _access_token;
  the behavior described above is implemented by the code that reads it.

  Args:
    access_token: the token str. You can get one by running 'gsutil -d ls'
      and copying the str after 'Bearer'.
  """
  # Rebind the module-level variable rather than creating a local one.
  global _access_token
  _access_token = access_token
     91 
     92 
def get_access_token():
  """Returns the shared access token.

  Returns:
    The value last passed to set_access_token, or the module's initial
    value (an empty str) if it was never called.
  """
  return _access_token
     96 
     97 
     98 class GCSFileStat(object):
     99   """Container for GCS file stat."""
    100 
    101   def __init__(self,
    102                filename,
    103                st_size,
    104                etag,
    105                st_ctime,
    106                content_type=None,
    107                metadata=None,
    108                is_dir=False):
    109     """Initialize.
    110 
    111     For files, the non optional arguments are always set.
    112     For directories, only filename and is_dir is set.
    113 
    114     Args:
    115       filename: a Google Cloud Storage filename of form '/bucket/filename'.
    116       st_size: file size in bytes. long compatible.
    117       etag: hex digest of the md5 hash of the file's content. str.
    118       st_ctime: posix file creation time. float compatible.
    119       content_type: content type. str.
    120       metadata: a str->str dict of user specified options when creating
    121         the file. Possible keys are x-goog-meta-, content-disposition,
    122         content-encoding, and cache-control.
    123       is_dir: True if this represents a directory. False if this is a real file.
    124     """
    125     self.filename = filename
    126     self.is_dir = is_dir
    127     self.st_size = None
    128     self.st_ctime = None
    129     self.etag = None
    130     self.content_type = content_type
    131     self.metadata = metadata
    132 
    133     if not is_dir:
    134       self.st_size = long(st_size)
    135       self.st_ctime = float(st_ctime)
    136       if etag[0] == '"' and etag[-1] == '"':
    137         etag = etag[1:-1]
    138       self.etag = etag
    139 
    140   def __repr__(self):
    141     if self.is_dir:
    142       return '(directory: %s)' % self.filename
    143 
    144     return (
    145         '(filename: %(filename)s, st_size: %(st_size)s, '
    146         'st_ctime: %(st_ctime)s, etag: %(etag)s, '
    147         'content_type: %(content_type)s, '
    148         'metadata: %(metadata)s)' %
    149         dict(filename=self.filename,
    150              st_size=self.st_size,
    151              st_ctime=self.st_ctime,
    152              etag=self.etag,
    153              content_type=self.content_type,
    154              metadata=self.metadata))
    155 
    156   def __cmp__(self, other):
    157     if not isinstance(other, self.__class__):
    158       raise ValueError('Argument to cmp must have the same type. '
    159                        'Expect %s, got %s', self.__class__.__name__,
    160                        other.__class__.__name__)
    161     if self.filename > other.filename:
    162       return 1
    163     elif self.filename < other.filename:
    164       return -1
    165     return 0
    166 
    167   def __hash__(self):
    168     if self.etag:
    169       return hash(self.etag)
    170     return hash(self.filename)
    171 
    172 
# Second exported name for the same class (both are listed in __all__);
# presumably kept for callers that use the older name.
CSFileStat = GCSFileStat
    174 
    175 
    176 def get_stored_content_length(headers):
    177   """Return the content length (in bytes) of the object as stored in GCS.
    178 
    179   x-goog-stored-content-length should always be present except when called via
    180   the local dev_appserver. Therefore if it is not present we default to the
    181   standard content-length header.
    182 
    183   Args:
    184     headers: a dict of headers from the http response.
    185 
    186   Returns:
    187     the stored content length.
    188   """
    189   length = headers.get('x-goog-stored-content-length')
    190   if length is None:
    191     length = headers.get('content-length')
    192   return length
    193 
    194 
    195 def get_metadata(headers):
    196   """Get user defined options from HTTP response headers."""
    197   return dict((k, v) for k, v in headers.iteritems()
    198               if any(k.lower().startswith(valid) for valid in _GCS_METADATA))
    199 
    200 
    201 def validate_bucket_name(name):
    202   """Validate a Google Storage bucket name.
    203 
    204   Args:
    205     name: a Google Storage bucket name with no prefix or suffix.
    206 
    207   Raises:
    208     ValueError: if name is invalid.
    209   """
    210   _validate_path(name)
    211   if not _GCS_BUCKET_REGEX.match(name):
    212     raise ValueError('Bucket should be 3-63 characters long using only a-z,'
    213                      '0-9, underscore, dash or dot but got %s' % name)
    214 
    215 
    216 def validate_bucket_path(path):
    217   """Validate a Google Cloud Storage bucket path.
    218 
    219   Args:
    220     path: a Google Storage bucket path. It should have form '/bucket'.
    221 
    222   Raises:
    223     ValueError: if path is invalid.
    224   """
    225   _validate_path(path)
    226   if not _GCS_BUCKET_PATH_REGEX.match(path):
    227     raise ValueError('Bucket should have format /bucket '
    228                      'but got %s' % path)
    229 
    230 
    231 def validate_file_path(path):
    232   """Validate a Google Cloud Storage file path.
    233 
    234   Args:
    235     path: a Google Storage file path. It should have form '/bucket/filename'.
    236 
    237   Raises:
    238     ValueError: if path is invalid.
    239   """
    240   _validate_path(path)
    241   if not _GCS_FULLPATH_REGEX.match(path):
    242     raise ValueError('Path should have format /bucket/filename '
    243                      'but got %s' % path)
    244 
    245 
    246 def _process_path_prefix(path_prefix):
    247   """Validate and process a Google Cloud Stoarge path prefix.
    248 
    249   Args:
    250     path_prefix: a Google Cloud Storage path prefix of format '/bucket/prefix'
    251       or '/bucket/' or '/bucket'.
    252 
    253   Raises:
    254     ValueError: if path is invalid.
    255 
    256   Returns:
    257     a tuple of /bucket and prefix. prefix can be None.
    258   """
    259   _validate_path(path_prefix)
    260   if not _GCS_PATH_PREFIX_REGEX.match(path_prefix):
    261     raise ValueError('Path prefix should have format /bucket, /bucket/, '
    262                      'or /bucket/prefix but got %s.' % path_prefix)
    263   bucket_name_end = path_prefix.find('/', 1)
    264   bucket = path_prefix
    265   prefix = None
    266   if bucket_name_end != -1:
    267     bucket = path_prefix[:bucket_name_end]
    268     prefix = path_prefix[bucket_name_end + 1:] or None
    269   return bucket, prefix
    270 
    271 
    272 def _validate_path(path):
    273   """Basic validation of Google Storage paths.
    274 
    275   Args:
    276     path: a Google Storage path. It should have form '/bucket/filename'
    277       or '/bucket'.
    278 
    279   Raises:
    280     ValueError: if path is invalid.
    281     TypeError: if path is not of type basestring.
    282   """
    283   if not path:
    284     raise ValueError('Path is empty')
    285   if not isinstance(path, basestring):
    286     raise TypeError('Path should be a string but is %s (%s).' %
    287                     (path.__class__, path))
    288 
    289 
    290 def validate_options(options):
    291   """Validate Google Cloud Storage options.
    292 
    293   Args:
    294     options: a str->basestring dict of options to pass to Google Cloud Storage.
    295 
    296   Raises:
    297     ValueError: if option is not supported.
    298     TypeError: if option is not of type str or value of an option
    299       is not of type basestring.
    300   """
    301   if not options:
    302     return
    303 
    304   for k, v in options.iteritems():
    305     if not isinstance(k, str):
    306       raise TypeError('option %r should be a str.' % k)
    307     if not any(k.lower().startswith(valid) for valid in _GCS_OPTIONS):
    308       raise ValueError('option %s is not supported.' % k)
    309     if not isinstance(v, basestring):
    310       raise TypeError('value %r for option %s should be of type basestring.' %
    311                       (v, k))
    312 
    313 
    314 def http_time_to_posix(http_time):
    315   """Convert HTTP time format to posix time.
    316 
    317   See http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1
    318   for http time format.
    319 
    320   Args:
    321     http_time: time in RFC 2616 format. e.g.
    322       "Mon, 20 Nov 1995 19:12:08 GMT".
    323 
    324   Returns:
    325     A float of secs from unix epoch.
    326   """
    327   if http_time is not None:
    328     return email_utils.mktime_tz(email_utils.parsedate_tz(http_time))
    329 
    330 
    331 def posix_time_to_http(posix_time):
    332   """Convert posix time to HTML header time format.
    333 
    334   Args:
    335     posix_time: unix time.
    336 
    337   Returns:
    338     A datatime str in RFC 2616 format.
    339   """
    340   if posix_time:
    341     return email_utils.formatdate(posix_time, usegmt=True)
    342 
    343 
    344 _DT_FORMAT = '%Y-%m-%dT%H:%M:%S'
    345 
    346 
    347 def dt_str_to_posix(dt_str):
    348   """format str to posix.
    349 
    350   datetime str is of format %Y-%m-%dT%H:%M:%S.%fZ,
    351   e.g. 2013-04-12T00:22:27.978Z. According to ISO 8601, T is a separator
    352   between date and time when they are on the same line.
    353   Z indicates UTC (zero meridian).
    354 
    355   A pointer: http://www.cl.cam.ac.uk/~mgk25/iso-time.html
    356 
    357   This is used to parse LastModified node from GCS's GET bucket XML response.
    358 
    359   Args:
    360     dt_str: A datetime str.
    361 
    362   Returns:
    363     A float of secs from unix epoch. By posix definition, epoch is midnight
    364     1970/1/1 UTC.
    365   """
    366   parsable, _ = dt_str.split('.')
    367   dt = datetime.datetime.strptime(parsable, _DT_FORMAT)
    368   return calendar.timegm(dt.utctimetuple())
    369 
    370 
    371 def posix_to_dt_str(posix):
    372   """Reverse of str_to_datetime.
    373 
    374   This is used by GCS stub to generate GET bucket XML response.
    375 
    376   Args:
    377     posix: A float of secs from unix epoch.
    378 
    379   Returns:
    380     A datetime str.
    381   """
    382   dt = datetime.datetime.utcfromtimestamp(posix)
    383   dt_str = dt.strftime(_DT_FORMAT)
    384   return dt_str + '.000Z'
    385 
    386 
    387 def local_run():
    388   """Whether we should hit GCS dev appserver stub."""
    389   server_software = os.environ.get('SERVER_SOFTWARE')
    390   if server_software is None:
    391     return True
    392   if 'remote_api' in server_software:
    393     return False
    394   if server_software.startswith(('Development', 'testutil')):
    395     return True
    396   return False
    397 
    398 
    399 def local_api_url():
    400   """Return URL for GCS emulation on dev appserver."""
    401   return 'http://%s%s' % (os.environ.get('HTTP_HOST'), LOCAL_GCS_ENDPOINT)
    402 
    403 
    404 def memory_usage(method):
    405   """Log memory usage before and after a method."""
    406   def wrapper(*args, **kwargs):
    407     logging.info('Memory before method %s is %s.',
    408                  method.__name__, runtime.memory_usage().current())
    409     result = method(*args, **kwargs)
    410     logging.info('Memory after method %s is %s',
    411                  method.__name__, runtime.memory_usage().current())
    412     return result
    413   return wrapper
    414 
    415 
    416 def _add_ns(tagname):
    417   return '{%(ns)s}%(tag)s' % {'ns': CS_XML_NS,
    418                               'tag': tagname}
    419 
    420 
# Namespace-qualified XML tag names, built once at import time.
# NOTE(review): presumably the elements of the GET bucket XML response that
# other modules parse or generate -- not used in the visible part of this
# file.
_T_CONTENTS = _add_ns('Contents')
_T_LAST_MODIFIED = _add_ns('LastModified')
_T_ETAG = _add_ns('ETag')
_T_KEY = _add_ns('Key')
_T_SIZE = _add_ns('Size')
_T_PREFIX = _add_ns('Prefix')
_T_COMMON_PREFIXES = _add_ns('CommonPrefixes')
_T_NEXT_MARKER = _add_ns('NextMarker')
_T_IS_TRUNCATED = _add_ns('IsTruncated')
    430