1 # Copyright 2012 Google Inc. All Rights Reserved. 2 # 3 # Licensed under the Apache License, Version 2.0 (the "License"); 4 # you may not use this file except in compliance with the License. 5 # You may obtain a copy of the License at 6 # 7 # http://www.apache.org/licenses/LICENSE-2.0 8 # 9 # Unless required by applicable law or agreed to in writing, 10 # software distributed under the License is distributed on an 11 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 12 # either express or implied. See the License for the specific 13 # language governing permissions and limitations under the License. 14 15 """Helpers shared by cloudstorage_stub and cloudstorage_api.""" 16 17 18 19 20 21 __all__ = ['CS_XML_NS', 22 'CSFileStat', 23 'dt_str_to_posix', 24 'local_api_url', 25 'LOCAL_GCS_ENDPOINT', 26 'local_run', 27 'get_access_token', 28 'get_metadata', 29 'GCSFileStat', 30 'http_time_to_posix', 31 'memory_usage', 32 'posix_time_to_http', 33 'posix_to_dt_str', 34 'set_access_token', 35 'validate_options', 36 'validate_bucket_name', 37 'validate_bucket_path', 38 'validate_file_path', 39 ] 40 41 42 import calendar 43 import datetime 44 from email import utils as email_utils 45 import logging 46 import os 47 import re 48 49 try: 50 from google.appengine.api import runtime 51 except ImportError: 52 from google.appengine.api import runtime 53 54 55 _GCS_BUCKET_REGEX_BASE = r'[a-z0-9\.\-_]{3,63}' 56 _GCS_BUCKET_REGEX = re.compile(_GCS_BUCKET_REGEX_BASE + r'$') 57 _GCS_BUCKET_PATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'$') 58 _GCS_PATH_PREFIX_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'.*') 59 _GCS_FULLPATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'/.*') 60 _GCS_METADATA = ['x-goog-meta-', 61 'content-disposition', 62 'cache-control', 63 'content-encoding'] 64 _GCS_OPTIONS = _GCS_METADATA + ['x-goog-acl'] 65 CS_XML_NS = 'http://doc.s3.amazonaws.com/2006-03-01' 66 LOCAL_GCS_ENDPOINT = '/_ah/gcs' 67 _access_token = '' 68 69 70 _MAX_GET_BUCKET_RESULT = 1000 71 72 73 def set_access_token(access_token): 74 """Set the shared access token to authenticate with Google Cloud Storage. 75 76 When set, the library will always attempt to communicate with the 77 real Google Cloud Storage with this token even when running on dev appserver. 78 Note the token could expire so it's up to you to renew it. 79 80 When absent, the library will automatically request and refresh a token 81 on appserver, or when on dev appserver, talk to a Google Cloud Storage 82 stub. 83 84 Args: 85 access_token: you can get one by run 'gsutil -d ls' and copy the 86 str after 'Bearer'. 87 """ 88 global _access_token 89 _access_token = access_token 90 91 92 def get_access_token(): 93 """Returns the shared access token.""" 94 return _access_token 95 96 97 class GCSFileStat(object): 98 """Container for GCS file stat.""" 99 100 def __init__(self, 101 filename, 102 st_size, 103 etag, 104 st_ctime, 105 content_type=None, 106 metadata=None, 107 is_dir=False): 108 """Initialize. 109 110 For files, the non optional arguments are always set. 111 For directories, only filename and is_dir is set. 112 113 Args: 114 filename: a Google Cloud Storage filename of form '/bucket/filename'. 115 st_size: file size in bytes. long compatible. 116 etag: hex digest of the md5 hash of the file's content. str. 117 st_ctime: posix file creation time. float compatible. 118 content_type: content type. str. 119 metadata: a str->str dict of user specified options when creating 120 the file. Possible keys are x-goog-meta-, content-disposition, 121 content-encoding, and cache-control. 122 is_dir: True if this represents a directory. False if this is a real file. 123 """ 124 self.filename = filename 125 self.is_dir = is_dir 126 self.st_size = None 127 self.st_ctime = None 128 self.etag = None 129 self.content_type = content_type 130 self.metadata = metadata 131 132 if not is_dir: 133 self.st_size = long(st_size) 134 self.st_ctime = float(st_ctime) 135 if etag[0] == '"' and etag[-1] == '"': 136 etag = etag[1:-1] 137 self.etag = etag 138 139 def __repr__(self): 140 if self.is_dir: 141 return '(directory: %s)' % self.filename 142 143 return ( 144 '(filename: %(filename)s, st_size: %(st_size)s, ' 145 'st_ctime: %(st_ctime)s, etag: %(etag)s, ' 146 'content_type: %(content_type)s, ' 147 'metadata: %(metadata)s)' % 148 dict(filename=self.filename, 149 st_size=self.st_size, 150 st_ctime=self.st_ctime, 151 etag=self.etag, 152 content_type=self.content_type, 153 metadata=self.metadata)) 154 155 def __cmp__(self, other): 156 if not isinstance(other, self.__class__): 157 raise ValueError('Argument to cmp must have the same type. ' 158 'Expect %s, got %s', self.__class__.__name__, 159 other.__class__.__name__) 160 if self.filename > other.filename: 161 return 1 162 elif self.filename < other.filename: 163 return -1 164 return 0 165 166 def __hash__(self): 167 if self.etag: 168 return hash(self.etag) 169 return hash(self.filename) 170 171 172 CSFileStat = GCSFileStat 173 174 175 def get_metadata(headers): 176 """Get user defined options from HTTP response headers.""" 177 return dict((k, v) for k, v in headers.iteritems() 178 if any(k.lower().startswith(valid) for valid in _GCS_METADATA)) 179 180 181 def validate_bucket_name(name): 182 """Validate a Google Storage bucket name. 183 184 Args: 185 name: a Google Storage bucket name with no prefix or suffix. 186 187 Raises: 188 ValueError: if name is invalid. 189 """ 190 _validate_path(name) 191 if not _GCS_BUCKET_REGEX.match(name): 192 raise ValueError('Bucket should be 3-63 characters long using only a-z,' 193 '0-9, underscore, dash or dot but got %s' % name) 194 195 196 def validate_bucket_path(path): 197 """Validate a Google Cloud Storage bucket path. 198 199 Args: 200 path: a Google Storage bucket path. It should have form '/bucket'. 201 202 Raises: 203 ValueError: if path is invalid. 204 """ 205 _validate_path(path) 206 if not _GCS_BUCKET_PATH_REGEX.match(path): 207 raise ValueError('Bucket should have format /bucket ' 208 'but got %s' % path) 209 210 211 def validate_file_path(path): 212 """Validate a Google Cloud Storage file path. 213 214 Args: 215 path: a Google Storage file path. It should have form '/bucket/filename'. 216 217 Raises: 218 ValueError: if path is invalid. 219 """ 220 _validate_path(path) 221 if not _GCS_FULLPATH_REGEX.match(path): 222 raise ValueError('Path should have format /bucket/filename ' 223 'but got %s' % path) 224 225 226 def _process_path_prefix(path_prefix): 227 """Validate and process a Google Cloud Stoarge path prefix. 228 229 Args: 230 path_prefix: a Google Cloud Storage path prefix of format '/bucket/prefix' 231 or '/bucket/' or '/bucket'. 232 233 Raises: 234 ValueError: if path is invalid. 235 236 Returns: 237 a tuple of /bucket and prefix. prefix can be None. 238 """ 239 _validate_path(path_prefix) 240 if not _GCS_PATH_PREFIX_REGEX.match(path_prefix): 241 raise ValueError('Path prefix should have format /bucket, /bucket/, ' 242 'or /bucket/prefix but got %s.' % path_prefix) 243 bucket_name_end = path_prefix.find('/', 1) 244 bucket = path_prefix 245 prefix = None 246 if bucket_name_end != -1: 247 bucket = path_prefix[:bucket_name_end] 248 prefix = path_prefix[bucket_name_end + 1:] or None 249 return bucket, prefix 250 251 252 def _validate_path(path): 253 """Basic validation of Google Storage paths. 254 255 Args: 256 path: a Google Storage path. It should have form '/bucket/filename' 257 or '/bucket'. 258 259 Raises: 260 ValueError: if path is invalid. 261 TypeError: if path is not of type basestring. 262 """ 263 if not path: 264 raise ValueError('Path is empty') 265 if not isinstance(path, basestring): 266 raise TypeError('Path should be a string but is %s (%s).' % 267 (path.__class__, path)) 268 269 270 def validate_options(options): 271 """Validate Google Cloud Storage options. 272 273 Args: 274 options: a str->basestring dict of options to pass to Google Cloud Storage. 275 276 Raises: 277 ValueError: if option is not supported. 278 TypeError: if option is not of type str or value of an option 279 is not of type basestring. 280 """ 281 if not options: 282 return 283 284 for k, v in options.iteritems(): 285 if not isinstance(k, str): 286 raise TypeError('option %r should be a str.' % k) 287 if not any(k.lower().startswith(valid) for valid in _GCS_OPTIONS): 288 raise ValueError('option %s is not supported.' % k) 289 if not isinstance(v, basestring): 290 raise TypeError('value %r for option %s should be of type basestring.' % 291 (v, k)) 292 293 294 def http_time_to_posix(http_time): 295 """Convert HTTP time format to posix time. 296 297 See http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1 298 for http time format. 299 300 Args: 301 http_time: time in RFC 2616 format. e.g. 302 "Mon, 20 Nov 1995 19:12:08 GMT". 303 304 Returns: 305 A float of secs from unix epoch. 306 """ 307 if http_time is not None: 308 return email_utils.mktime_tz(email_utils.parsedate_tz(http_time)) 309 310 311 def posix_time_to_http(posix_time): 312 """Convert posix time to HTML header time format. 313 314 Args: 315 posix_time: unix time. 316 317 Returns: 318 A datatime str in RFC 2616 format. 319 """ 320 if posix_time: 321 return email_utils.formatdate(posix_time, usegmt=True) 322 323 324 _DT_FORMAT = '%Y-%m-%dT%H:%M:%S' 325 326 327 def dt_str_to_posix(dt_str): 328 """format str to posix. 329 330 datetime str is of format %Y-%m-%dT%H:%M:%S.%fZ, 331 e.g. 2013-04-12T00:22:27.978Z. According to ISO 8601, T is a separator 332 between date and time when they are on the same line. 333 Z indicates UTC (zero meridian). 334 335 A pointer: http://www.cl.cam.ac.uk/~mgk25/iso-time.html 336 337 This is used to parse LastModified node from GCS's GET bucket XML response. 338 339 Args: 340 dt_str: A datetime str. 341 342 Returns: 343 A float of secs from unix epoch. By posix definition, epoch is midnight 344 1970/1/1 UTC. 345 """ 346 parsable, _ = dt_str.split('.') 347 dt = datetime.datetime.strptime(parsable, _DT_FORMAT) 348 return calendar.timegm(dt.utctimetuple()) 349 350 351 def posix_to_dt_str(posix): 352 """Reverse of str_to_datetime. 353 354 This is used by GCS stub to generate GET bucket XML response. 355 356 Args: 357 posix: A float of secs from unix epoch. 358 359 Returns: 360 A datetime str. 361 """ 362 dt = datetime.datetime.utcfromtimestamp(posix) 363 dt_str = dt.strftime(_DT_FORMAT) 364 return dt_str + '.000Z' 365 366 367 def local_run(): 368 """Whether we should hit GCS dev appserver stub.""" 369 server_software = os.environ.get('SERVER_SOFTWARE') 370 if server_software is None: 371 return True 372 if 'remote_api' in server_software: 373 return False 374 if server_software.startswith(('Development', 'testutil')): 375 return True 376 return False 377 378 379 def local_api_url(): 380 """Return URL for GCS emulation on dev appserver.""" 381 return 'http://%s%s' % (os.environ.get('HTTP_HOST'), LOCAL_GCS_ENDPOINT) 382 383 384 def memory_usage(method): 385 """Log memory usage before and after a method.""" 386 def wrapper(*args, **kwargs): 387 logging.info('Memory before method %s is %s.', 388 method.__name__, runtime.memory_usage().current()) 389 result = method(*args, **kwargs) 390 logging.info('Memory after method %s is %s', 391 method.__name__, runtime.memory_usage().current()) 392 return result 393 return wrapper 394 395 396 def _add_ns(tagname): 397 return '{%(ns)s}%(tag)s' % {'ns': CS_XML_NS, 398 'tag': tagname} 399 400 401 _T_CONTENTS = _add_ns('Contents') 402 _T_LAST_MODIFIED = _add_ns('LastModified') 403 _T_ETAG = _add_ns('ETag') 404 _T_KEY = _add_ns('Key') 405 _T_SIZE = _add_ns('Size') 406 _T_PREFIX = _add_ns('Prefix') 407 _T_COMMON_PREFIXES = _add_ns('CommonPrefixes') 408 _T_NEXT_MARKER = _add_ns('NextMarker') 409 _T_IS_TRUNCATED = _add_ns('IsTruncated') 410