1 # Copyright 2012 Google Inc. All Rights Reserved. 2 # 3 # Licensed under the Apache License, Version 2.0 (the "License"); 4 # you may not use this file except in compliance with the License. 5 # You may obtain a copy of the License at 6 # 7 # http://www.apache.org/licenses/LICENSE-2.0 8 # 9 # Unless required by applicable law or agreed to in writing, 10 # software distributed under the License is distributed on an 11 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 12 # either express or implied. See the License for the specific 13 # language governing permissions and limitations under the License. 14 15 """Helpers shared by cloudstorage_stub and cloudstorage_api.""" 16 17 18 19 20 21 __all__ = ['CS_XML_NS', 22 'CSFileStat', 23 'dt_str_to_posix', 24 'local_api_url', 25 'LOCAL_GCS_ENDPOINT', 26 'local_run', 27 'get_access_token', 28 'get_stored_content_length', 29 'get_metadata', 30 'GCSFileStat', 31 'http_time_to_posix', 32 'memory_usage', 33 'posix_time_to_http', 34 'posix_to_dt_str', 35 'set_access_token', 36 'validate_options', 37 'validate_bucket_name', 38 'validate_bucket_path', 39 'validate_file_path', 40 ] 41 42 43 import calendar 44 import datetime 45 from email import utils as email_utils 46 import logging 47 import os 48 import re 49 50 try: 51 from google.appengine.api import runtime 52 except ImportError: 53 from google.appengine.api import runtime 54 55 56 _GCS_BUCKET_REGEX_BASE = r'[a-z0-9\.\-_]{3,63}' 57 _GCS_BUCKET_REGEX = re.compile(_GCS_BUCKET_REGEX_BASE + r'$') 58 _GCS_BUCKET_PATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'$') 59 _GCS_PATH_PREFIX_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'.*') 60 _GCS_FULLPATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'/.*') 61 _GCS_METADATA = ['x-goog-meta-', 62 'content-disposition', 63 'cache-control', 64 'content-encoding'] 65 _GCS_OPTIONS = _GCS_METADATA + ['x-goog-acl'] 66 CS_XML_NS = 'http://doc.s3.amazonaws.com/2006-03-01' 67 LOCAL_GCS_ENDPOINT = '/_ah/gcs' 68 
# Module-wide access token shared by all callers. The empty string is the
# initial "no token set" value.
_access_token = ''


# NOTE(review): not referenced in this module; presumably the maximum number
# of entries requested per GET bucket call -- confirm against callers.
_MAX_GET_BUCKET_RESULT = 1000


def set_access_token(access_token):
    """Set the shared access token to authenticate with Google Cloud Storage.

    When set, the library will always attempt to communicate with the
    real Google Cloud Storage with this token even when running on dev
    appserver. Note the token could expire so it's up to you to renew it.

    When absent, the library will automatically request and refresh a token
    on appserver, or when on dev appserver, talk to a Google Cloud Storage
    stub.

    Args:
      access_token: you can get one by running 'gsutil -d ls' and copying
        the str after 'Bearer'.
    """
    global _access_token
    _access_token = access_token


def get_access_token():
    """Returns the shared access token."""
    return _access_token


class GCSFileStat(object):
    """Container for GCS file stat."""

    def __init__(self,
                 filename,
                 st_size,
                 etag,
                 st_ctime,
                 content_type=None,
                 metadata=None,
                 is_dir=False):
        """Initialize.

        For files, the non optional arguments are always set.
        For directories, only filename and is_dir are set; st_size,
        st_ctime and etag are left as None.

        Args:
          filename: a Google Cloud Storage filename of form
            '/bucket/filename'.
          st_size: file size in bytes. long compatible.
          etag: hex digest of the md5 hash of the file's content. str.
            One pair of surrounding double quotes is stripped.
          st_ctime: posix file creation time. float compatible.
          content_type: content type. str.
          metadata: a str->str dict of user specified options when creating
            the file. Possible keys are x-goog-meta-, content-disposition,
            content-encoding, and cache-control.
          is_dir: True if this represents a directory. False if this is a
            real file.
        """
        self.filename = filename
        self.is_dir = is_dir
        self.st_size = None
        self.st_ctime = None
        self.etag = None
        self.content_type = content_type
        self.metadata = metadata

        if not is_dir:
            self.st_size = long(st_size)
            self.st_ctime = float(st_ctime)
            # startswith/endswith rather than etag[0]/etag[-1] so that an
            # empty etag is stored as-is instead of raising IndexError.
            if etag.startswith('"') and etag.endswith('"'):
                etag = etag[1:-1]
            self.etag = etag

    def __repr__(self):
        if self.is_dir:
            return '(directory: %s)' % self.filename

        return (
            '(filename: %(filename)s, st_size: %(st_size)s, '
            'st_ctime: %(st_ctime)s, etag: %(etag)s, '
            'content_type: %(content_type)s, '
            'metadata: %(metadata)s)' %
            dict(filename=self.filename,
                 st_size=self.st_size,
                 st_ctime=self.st_ctime,
                 etag=self.etag,
                 content_type=self.content_type,
                 metadata=self.metadata))

    def __cmp__(self, other):
        """Order stats by filename.

        Raises:
          ValueError: if other is not an instance of this class.
        """
        if not isinstance(other, self.__class__):
            # Apply the format arguments with '%'; passing them as extra
            # positional arguments would leave the message unformatted.
            raise ValueError('Argument to cmp must have the same type. '
                             'Expect %s, got %s' %
                             (self.__class__.__name__,
                              other.__class__.__name__))
        if self.filename > other.filename:
            return 1
        elif self.filename < other.filename:
            return -1
        return 0

    def __hash__(self):
        # NOTE(review): the hash prefers etag while __cmp__ compares only
        # filename, so two stats that compare equal can hash differently.
        # Confirm this is intended before relying on set/dict semantics.
        if self.etag:
            return hash(self.etag)
        return hash(self.filename)


# Alias for GCSFileStat; both names are exported in __all__.
CSFileStat = GCSFileStat


def get_stored_content_length(headers):
    """Return the content length (in bytes) of the object as stored in GCS.

    x-goog-stored-content-length should always be present except when called
    via the local dev_appserver. Therefore if it is not present we default to
    the standard content-length header.

    Args:
      headers: a dict of headers from the http response.

    Returns:
      the stored content length exactly as found in the headers, or None if
      neither header is present.
    """
    length = headers.get('x-goog-stored-content-length')
    if length is None:
        length = headers.get('content-length')
    return length


def get_metadata(headers):
    """Get user defined options from HTTP response headers.

    Args:
      headers: a dict of headers from the http response.

    Returns:
      A dict holding the headers whose lowercased name starts with one of
      the prefixes in _GCS_METADATA. Keys keep their original case.
    """
    prefixes = tuple(_GCS_METADATA)
    return dict((k, v) for k, v in headers.iteritems()
                if k.lower().startswith(prefixes))


def validate_bucket_name(name):
    """Validate a Google Storage bucket name.

    Args:
      name: a Google Storage bucket name with no prefix or suffix.

    Raises:
      ValueError: if name is invalid.
      TypeError: if name is not of type basestring.
    """
    _validate_path(name)
    if not _GCS_BUCKET_REGEX.match(name):
        raise ValueError('Bucket should be 3-63 characters long using only '
                         'a-z,'
                         '0-9, underscore, dash or dot but got %s' % name)


def validate_bucket_path(path):
    """Validate a Google Cloud Storage bucket path.

    Args:
      path: a Google Storage bucket path. It should have form '/bucket'.

    Raises:
      ValueError: if path is invalid.
      TypeError: if path is not of type basestring.
    """
    _validate_path(path)
    if not _GCS_BUCKET_PATH_REGEX.match(path):
        raise ValueError('Bucket should have format /bucket '
                         'but got %s' % path)


def validate_file_path(path):
    """Validate a Google Cloud Storage file path.

    Args:
      path: a Google Storage file path. It should have form
        '/bucket/filename'.

    Raises:
      ValueError: if path is invalid.
      TypeError: if path is not of type basestring.
    """
    _validate_path(path)
    if not _GCS_FULLPATH_REGEX.match(path):
        raise ValueError('Path should have format /bucket/filename '
                         'but got %s' % path)


def _process_path_prefix(path_prefix):
    """Validate and process a Google Cloud Storage path prefix.

    Args:
      path_prefix: a Google Cloud Storage path prefix of format
        '/bucket/prefix' or '/bucket/' or '/bucket'.

    Raises:
      ValueError: if path is invalid.
      TypeError: if path_prefix is not of type basestring.

    Returns:
      a tuple of /bucket and prefix. prefix can be None.
    """
    _validate_path(path_prefix)
    if not _GCS_PATH_PREFIX_REGEX.match(path_prefix):
        raise ValueError('Path prefix should have format /bucket, /bucket/, '
                         'or /bucket/prefix but got %s.' % path_prefix)
    # Search from index 1 to skip the leading '/' of '/bucket'.
    bucket_name_end = path_prefix.find('/', 1)
    bucket = path_prefix
    prefix = None
    if bucket_name_end != -1:
        bucket = path_prefix[:bucket_name_end]
        # '/bucket/' yields an empty remainder, reported as None.
        prefix = path_prefix[bucket_name_end + 1:] or None
    return bucket, prefix


def _validate_path(path):
    """Basic validation of Google Storage paths.

    Args:
      path: a Google Storage path. It should have form '/bucket/filename'
        or '/bucket'.

    Raises:
      ValueError: if path is empty (or otherwise falsy).
      TypeError: if path is not of type basestring.
    """
    # The emptiness check runs first, so falsy non-strings (None, 0, [])
    # raise ValueError rather than TypeError.
    if not path:
        raise ValueError('Path is empty')
    if not isinstance(path, basestring):
        raise TypeError('Path should be a string but is %s (%s).' %
                        (path.__class__, path))


def validate_options(options):
    """Validate Google Cloud Storage options.

    Args:
      options: a str->basestring dict of options to pass to Google Cloud
        Storage. A falsy value (None or an empty dict) is accepted.

    Raises:
      ValueError: if option is not supported.
      TypeError: if option is not of type str or value of an option
        is not of type basestring.
    """
    if not options:
        return

    prefixes = tuple(_GCS_OPTIONS)
    for k, v in options.iteritems():
        if not isinstance(k, str):
            raise TypeError('option %r should be a str.' % k)
        if not k.lower().startswith(prefixes):
            raise ValueError('option %s is not supported.' % k)
        if not isinstance(v, basestring):
            raise TypeError(
                'value %r for option %s should be of type basestring.' %
                (v, k))


def http_time_to_posix(http_time):
    """Convert HTTP time format to posix time.

    See http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1
    for http time format.

    Args:
      http_time: time in RFC 2616 format. e.g.
        "Mon, 20 Nov 1995 19:12:08 GMT". May be None.

    Returns:
      Seconds from unix epoch, or None if http_time is None.
    """
    if http_time is not None:
        return email_utils.mktime_tz(email_utils.parsedate_tz(http_time))


def posix_time_to_http(posix_time):
    """Convert posix time to HTTP header time format.

    Args:
      posix_time: unix time.

    Returns:
      A datetime str in RFC 2616 format, or None if posix_time is falsy
      (None or 0).
    """
    if posix_time:
        return email_utils.formatdate(posix_time, usegmt=True)


# strptime/strftime format for the part of a datetime str that precedes the
# fractional seconds.
_DT_FORMAT = '%Y-%m-%dT%H:%M:%S'


def dt_str_to_posix(dt_str):
    """format str to posix.

    datetime str is of format %Y-%m-%dT%H:%M:%S.%fZ,
    e.g. 2013-04-12T00:22:27.978Z. According to ISO 8601, T is a separator
    between date and time when they are on the same line.
    Z indicates UTC (zero meridian).

    A str without fractional seconds, e.g. 2013-04-12T00:22:27Z, is also
    accepted. The fractional part is discarded either way.

    A pointer: http://www.cl.cam.ac.uk/~mgk25/iso-time.html

    This is used to parse LastModified node from GCS's GET bucket XML
    response.

    Args:
      dt_str: A datetime str.

    Returns:
      Whole seconds from unix epoch. By posix definition, epoch is midnight
      1970/1/1 UTC.

    Raises:
      ValueError: if dt_str does not match the expected format.
    """
    parsable, dot, fraction = dt_str.partition('.')
    if not dot:
        # No fractional seconds: the trailing 'Z' is still attached to the
        # seconds field and has to be removed before parsing.
        if parsable.endswith('Z'):
            parsable = parsable[:-1]
    elif '.' in fraction:
        # More than one '.' has always been rejected; keep it that way.
        raise ValueError('Invalid datetime str: %s' % dt_str)
    dt = datetime.datetime.strptime(parsable, _DT_FORMAT)
    return calendar.timegm(dt.utctimetuple())


def posix_to_dt_str(posix):
    """Reverse of dt_str_to_posix.

    This is used by GCS stub to generate GET bucket XML response.

    Args:
      posix: A float of secs from unix epoch.

    Returns:
      A datetime str. Sub-second precision is dropped; the fractional part
      of the result is always '.000'.
    """
    dt = datetime.datetime.utcfromtimestamp(posix)
    dt_str = dt.strftime(_DT_FORMAT)
    return dt_str + '.000Z'


def local_run():
    """Whether we should hit GCS dev appserver stub.

    Returns:
      True if the SERVER_SOFTWARE environment variable is unset or starts
      with 'Development' or 'testutil'; False if it contains 'remote_api'
      or is anything else.
    """
    server_software = os.environ.get('SERVER_SOFTWARE')
    if server_software is None:
        return True
    # Checked before the prefix test, so 'remote_api' always wins.
    if 'remote_api' in server_software:
        return False
    if server_software.startswith(('Development', 'testutil')):
        return True
    return False


def local_api_url():
    """Return URL for GCS emulation on dev appserver."""
    return 'http://%s%s' % (os.environ.get('HTTP_HOST'), LOCAL_GCS_ENDPOINT)


def memory_usage(method):
    """Log memory usage before and after a method.

    Args:
      method: the callable to decorate.

    Returns:
      A wrapper that logs runtime.memory_usage().current() at INFO level
      before and after calling method, and returns method's result.
    """
    import functools

    # Preserve the decorated callable's name and docstring on the wrapper.
    @functools.wraps(method)
    def wrapper(*args, **kwargs):
        logging.info('Memory before method %s is %s.',
                     method.__name__, runtime.memory_usage().current())
        result = method(*args, **kwargs)
        logging.info('Memory after method %s is %s',
                     method.__name__, runtime.memory_usage().current())
        return result
    return wrapper


def _add_ns(tagname):
    """Return tagname qualified with CS_XML_NS in '{namespace}tag' form."""
    return '{%(ns)s}%(tag)s' % {'ns': CS_XML_NS,
                                'tag': tagname}


# Namespace-qualified tag names built with _add_ns().
_T_CONTENTS = _add_ns('Contents')
_T_LAST_MODIFIED = _add_ns('LastModified')
_T_ETAG = _add_ns('ETag')
_T_KEY = _add_ns('Key')
_T_SIZE = _add_ns('Size')
_T_PREFIX = _add_ns('Prefix')
_T_COMMON_PREFIXES = _add_ns('CommonPrefixes')
_T_NEXT_MARKER = _add_ns('NextMarker')
_T_IS_TRUNCATED = _add_ns('IsTruncated')