# -*- coding: utf-8 -*-
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Static data and helper functions."""

from __future__ import absolute_import

import collections
import errno
import logging
import math
import multiprocessing
import os
import pkgutil
import re
import struct
import sys
import tempfile
import textwrap
import threading
import traceback
import xml.etree.ElementTree as ElementTree

import boto
from boto import config
import boto.auth
from boto.exception import NoAuthHandlerFound
from boto.gs.connection import GSConnection
from boto.provider import Provider
from boto.pyami.config import BotoConfigLocations
import httplib2
from oauth2client.client import HAS_CRYPTO
from retry_decorator import retry_decorator

import gslib
from gslib.exception import CommandException
from gslib.storage_url import StorageUrlFromString
from gslib.translation_helper import AclTranslation
from gslib.translation_helper import GenerationFromUrlAndString
from gslib.translation_helper import S3_ACL_MARKER_GUID
from gslib.translation_helper import S3_DELETE_MARKER_GUID
from gslib.translation_helper import S3_MARKER_GUIDS

# Detect platform types.
PLATFORM = str(sys.platform).lower()
IS_WINDOWS = 'win32' in PLATFORM
IS_CYGWIN = 'cygwin' in PLATFORM
IS_LINUX = 'linux' in PLATFORM
IS_OSX = 'darwin' in PLATFORM

# pylint: disable=g-import-not-at-top
if IS_WINDOWS:
  from ctypes import c_int
  from ctypes import c_uint64
  from ctypes import c_char_p
  from ctypes import c_wchar_p
  from ctypes import windll
  from ctypes import POINTER
  from ctypes import WINFUNCTYPE
  from ctypes import WinError

# pylint: disable=g-import-not-at-top
try:
  # This module doesn't necessarily exist on Windows.
  import resource
  HAS_RESOURCE_MODULE = True
except ImportError:
  HAS_RESOURCE_MODULE = False

ONE_KIB = 1024
ONE_MIB = 1024 * 1024
TWO_MIB = 2 * ONE_MIB
EIGHT_MIB = 8 * ONE_MIB
TEN_MIB = 10 * ONE_MIB
DEFAULT_FILE_BUFFER_SIZE = 8 * ONE_KIB
_DEFAULT_LINES = 25

# By default, the timeout for SSL read errors is infinite. This could
# cause gsutil to hang on network disconnect, so pick a more reasonable
# timeout.
SSL_TIMEOUT = 60

# Start with a progress callback every 64 KiB during uploads/downloads (JSON
# API). Callback implementation should back off until it hits the maximum size
# so that callbacks do not create huge amounts of log output.
START_CALLBACK_PER_BYTES = 1024*64
MAX_CALLBACK_PER_BYTES = 1024*1024*100

# Upload/download files in 8 KiB chunks over the HTTP connection.
TRANSFER_BUFFER_SIZE = 1024*8

# Default number of progress callbacks during transfer (XML API).
XML_PROGRESS_CALLBACKS = 10

# For files >= this size, output a message indicating that we're running an
# operation on the file (like hashing or gzipping) so it does not appear to the
# user that the command is hanging.
MIN_SIZE_COMPUTE_LOGGING = 100*1024*1024  # 100 MiB

NO_MAX = sys.maxint

UTF8 = 'utf-8'

VERSION_MATCHER = re.compile(r'^(?P<maj>\d+)(\.(?P<min>\d+)(?P<suffix>.*))?')

RELEASE_NOTES_URL = 'https://pub.storage.googleapis.com/gsutil_ReleaseNotes.txt'

# Binary exponentiation strings; each tuple is
# (power of 2, byte suffix, bit suffix[, single-letter abbreviation]).
_EXP_STRINGS = [
    (0, 'B', 'bit'),
    (10, 'KiB', 'Kibit', 'K'),
    (20, 'MiB', 'Mibit', 'M'),
    (30, 'GiB', 'Gibit', 'G'),
    (40, 'TiB', 'Tibit', 'T'),
    (50, 'PiB', 'Pibit', 'P'),
    (60, 'EiB', 'Eibit', 'E'),
]


global manager  # pylint: disable=global-at-module-level
certs_file_lock = threading.Lock()
configured_certs_files = []


def _GenerateSuffixRegex():
  """Creates a suffix regex for human-readable byte counts."""
  human_bytes_re = r'(?P<num>\d*\.\d+|\d+)\s*(?P<suffix>%s)?'
  suffixes = []
  suffix_to_si = {}
  for i, si in enumerate(_EXP_STRINGS):
    si_suffixes = [s.lower() for s in list(si)[1:]]
    for suffix in si_suffixes:
      suffix_to_si[suffix] = i
    suffixes.extend(si_suffixes)
  human_bytes_re %= '|'.join(suffixes)
  matcher = re.compile(human_bytes_re)
  return suffix_to_si, matcher

SUFFIX_TO_SI, MATCH_HUMAN_BYTES = _GenerateSuffixRegex()
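# Example: after lowercasing, MATCH_HUMAN_BYTES matches '2.5 gib' with
# groups num='2.5' and suffix='gib', and SUFFIX_TO_SI['gib'] == 3 (the index
# of the GiB entry in _EXP_STRINGS).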

SECONDS_PER_DAY = 3600 * 24

# On Unix-like systems, we will try to raise the soft limit on the maximum
# number of open files to at least this value, to avoid hitting the limit
# imposed by the OS. This number was obtained experimentally.
MIN_ACCEPTABLE_OPEN_FILES_LIMIT = 1000

GSUTIL_PUB_TARBALL = 'gs://pub/gsutil.tar.gz'

Retry = retry_decorator.retry  # pylint: disable=invalid-name

# Cache the values from this check such that they're available to all callers
# without needing to run all the checks again (some of these, such as calling
# multiprocessing.Manager(), are expensive operations).
cached_multiprocessing_is_available = None
cached_multiprocessing_check_stack_trace = None
cached_multiprocessing_is_available_message = None


# Enum class for specifying listing style.
class ListingStyle(object):
  SHORT = 'SHORT'
  LONG = 'LONG'
  LONG_LONG = 'LONG_LONG'


def UsingCrcmodExtension(crcmod):
  return (boto.config.get('GSUtil', 'test_assume_fast_crcmod', None) or
          (getattr(crcmod, 'crcmod', None) and
           getattr(crcmod.crcmod, '_usingExtension', None)))


def CheckFreeSpace(path):
  """Return path/drive free space (in bytes)."""
  if IS_WINDOWS:
    try:
      # pylint: disable=invalid-name
      get_disk_free_space_ex = WINFUNCTYPE(c_int, c_wchar_p,
                                           POINTER(c_uint64),
                                           POINTER(c_uint64),
                                           POINTER(c_uint64))
      get_disk_free_space_ex = get_disk_free_space_ex(
          ('GetDiskFreeSpaceExW', windll.kernel32), (
              (1, 'lpszPathName'),
              (2, 'lpFreeUserSpace'),
              (2, 'lpTotalSpace'),
              (2, 'lpFreeSpace'),))
    except AttributeError:
      get_disk_free_space_ex = WINFUNCTYPE(c_int, c_char_p,
                                           POINTER(c_uint64),
                                           POINTER(c_uint64),
                                           POINTER(c_uint64))
      get_disk_free_space_ex = get_disk_free_space_ex(
          ('GetDiskFreeSpaceExA', windll.kernel32), (
              (1, 'lpszPathName'),
              (2, 'lpFreeUserSpace'),
              (2, 'lpTotalSpace'),
              (2, 'lpFreeSpace'),))

    def GetDiskFreeSpaceExErrCheck(result, unused_func, args):
      if not result:
        raise WinError()
      return args[1].value
    get_disk_free_space_ex.errcheck = GetDiskFreeSpaceExErrCheck

    return get_disk_free_space_ex(os.getenv('SystemDrive'))
  else:
    (_, f_frsize, _, _, f_bavail, _, _, _, _, _) = os.statvfs(path)
    return f_frsize * f_bavail


def CreateDirIfNeeded(dir_path, mode=0777):
  """Creates a directory, suppressing already-exists errors."""
  if not os.path.exists(dir_path):
    try:
      os.makedirs(dir_path, mode)
    # Ignore 'already exists' in case user tried to start up several
    # resumable uploads concurrently from a machine where no tracker dir had
    # yet been created; checking os.path.exists above and then calling
    # makedirs is inherently racy, so EEXIST is expected here.
    except OSError as e:
      if e.errno != errno.EEXIST:
        raise


def DivideAndCeil(dividend, divisor):
  """Returns ceil(dividend / divisor).

  Takes care to avoid the pitfalls of floating point arithmetic that could
  otherwise yield the wrong result for large numbers.

  Args:
    dividend: Dividend for the operation.
    divisor: Divisor for the operation.

  Returns:
    Quotient.
  """
  quotient = dividend // divisor
  if (dividend % divisor) != 0:
    quotient += 1
  return quotient
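# Example: DivideAndCeil(102, 10) returns 11, while DivideAndCeil(100, 10)
# returns 10 (no remainder, so no rounding up).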


def GetGsutilStateDir():
  """Returns the location of the directory for gsutil state files.

  Certain operations, such as cross-process credential sharing and
  resumable transfer tracking, need a known location for state files which
  are created by gsutil as-needed.

  This location should only be used for storing data that is required to be in
  a static location.

  Returns:
    Path to directory for gsutil static state files.
  """
  config_file_dir = config.get(
      'GSUtil', 'state_dir',
      os.path.expanduser(os.path.join('~', '.gsutil')))
  CreateDirIfNeeded(config_file_dir)
  return config_file_dir
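# Example: with no 'state_dir' entry in the [GSUtil] section of the .boto
# config file, this resolves to ~/.gsutil; a config line like
#   state_dir = /var/tmp/gsutil-state
# would override it (the path shown is illustrative).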


def GetCredentialStoreFilename():
  return os.path.join(GetGsutilStateDir(), 'credstore')


def GetGceCredentialCacheFilename():
  return os.path.join(GetGsutilStateDir(), 'gcecredcache')


def GetTabCompletionLogFilename():
  return os.path.join(GetGsutilStateDir(), 'tab-completion-logs')


def GetTabCompletionCacheFilename():
  tab_completion_dir = os.path.join(GetGsutilStateDir(), 'tab-completion')
  # Limit read permissions on the directory to owner for privacy.
  CreateDirIfNeeded(tab_completion_dir, mode=0700)
  return os.path.join(tab_completion_dir, 'cache')


def PrintTrackerDirDeprecationWarningIfNeeded():
  # TODO: Remove this along with the tracker_dir config value 1 year after
  # 4.6 release date. Use state_dir instead.
  if config.has_option('GSUtil', 'resumable_tracker_dir'):
    sys.stderr.write('Warning: you have set resumable_tracker_dir in your '
                     '.boto configuration file. This configuration option is '
                     'deprecated; please use the state_dir configuration '
                     'option instead.\n')


PrintTrackerDirDeprecationWarningIfNeeded()
CreateDirIfNeeded(GetGsutilStateDir())

# Name of file where we keep the timestamp for the last time we checked whether
# a new version of gsutil is available.
LAST_CHECKED_FOR_GSUTIL_UPDATE_TIMESTAMP_FILE = (
    os.path.join(GetGsutilStateDir(), '.last_software_update_check'))


def HasConfiguredCredentials():
  """Determines if boto credentials are configured.

  Returns:
    A truthy value (True, or a valid boto auth handler) if credentials are
    configured, else a falsy value.
  """
  has_goog_creds = (config.has_option('Credentials', 'gs_access_key_id') and
                    config.has_option('Credentials', 'gs_secret_access_key'))
  has_amzn_creds = (config.has_option('Credentials', 'aws_access_key_id') and
                    config.has_option('Credentials', 'aws_secret_access_key'))
  has_oauth_creds = (
      config.has_option('Credentials', 'gs_oauth2_refresh_token'))
  has_service_account_creds = (
      HAS_CRYPTO and
      config.has_option('Credentials', 'gs_service_client_id') and
      config.has_option('Credentials', 'gs_service_key_file'))

  if (has_goog_creds or has_amzn_creds or has_oauth_creds or
      has_service_account_creds):
    return True

  valid_auth_handler = None
  try:
    valid_auth_handler = boto.auth.get_auth_handler(
        GSConnection.DefaultHost, config, Provider('google'),
        requested_capability=['s3'])
    # Exclude the no-op auth handler as indicating credentials are configured.
    # Note we can't use isinstance() here because the no-op module may not be
    # imported so we can't get a reference to the class type.
    if getattr(getattr(valid_auth_handler, '__class__', None),
               '__name__', None) == 'NoOpAuth':
      valid_auth_handler = None
  except NoAuthHandlerFound:
    pass

  return valid_auth_handler


def ConfigureNoOpAuthIfNeeded():
  """Sets up no-op auth handler if no boto credentials are configured."""
  if not HasConfiguredCredentials():
    if (config.has_option('Credentials', 'gs_service_client_id')
        and not HAS_CRYPTO):
      if os.environ.get('CLOUDSDK_WRAPPER') == '1':
        raise CommandException('\n'.join(textwrap.wrap(
            'Your gsutil is configured with an OAuth2 service account, but '
            'you do not have PyOpenSSL or PyCrypto 2.6 or later installed. '
            'Service account authentication requires one of these libraries; '
            'please reactivate your service account via the gcloud auth '
            'command and ensure any gcloud packages necessary for '
            'service accounts are present.')))
      else:
        raise CommandException('\n'.join(textwrap.wrap(
            'Your gsutil is configured with an OAuth2 service account, but '
            'you do not have PyOpenSSL or PyCrypto 2.6 or later installed. '
            'Service account authentication requires one of these libraries; '
            'please install either of them to proceed, or configure a '
            'different type of credentials with "gsutil config".')))
    else:
      # With no boto config file the user can still access publicly readable
      # buckets and objects.
      from gslib import no_op_auth_plugin  # pylint: disable=unused-variable


def GetConfigFilePath():
  config_path = 'no config found'
  for path in BotoConfigLocations:
    try:
      with open(path, 'r'):
        config_path = path
      break
    except IOError:
      pass
  return config_path


def GetBotoConfigFileList():
  """Returns list of boto config files that exist."""
  # Copy the list so we don't mutate the shared BotoConfigLocations global.
  config_paths = list(BotoConfigLocations)
  if 'AWS_CREDENTIAL_FILE' in os.environ:
    config_paths.append(os.environ['AWS_CREDENTIAL_FILE'])
  cf_list = []
  for config_path in config_paths:
    if os.path.exists(config_path) and config_path not in cf_list:
      cf_list.append(config_path)
  return cf_list


def GetCertsFile():
  """Configures and returns the CA Certificates file.

  If one is already configured, use it. Otherwise, amend the configuration
  (in boto.config) to use the cert roots distributed with gsutil.

  Returns:
    string filename of the certs file to use.
  """
  certs_file = boto.config.get('Boto', 'ca_certificates_file', None)
  if not certs_file:
    with certs_file_lock:
      if configured_certs_files:
        disk_certs_file = configured_certs_files[0]
      else:
        disk_certs_file = os.path.abspath(
            os.path.join(gslib.GSLIB_DIR, 'data', 'cacerts.txt'))
        if not os.path.exists(disk_certs_file):
          # If the file is not present on disk, this means the gslib module
          # doesn't actually exist on disk anywhere. This can happen if it's
          # being imported from a zip file. Unfortunately, we have to copy the
          # certs file to a local temp file on disk because the underlying SSL
          # socket requires it to be a filesystem path.
          certs_data = pkgutil.get_data('gslib', 'data/cacerts.txt')
          if not certs_data:
            raise CommandException('Certificates file not found. Please '
                                   'reinstall gsutil from scratch')
          fd, fname = tempfile.mkstemp(suffix='.txt', prefix='gsutil-cacerts')
          f = os.fdopen(fd, 'w')
          f.write(certs_data)
          f.close()
          configured_certs_files.append(fname)
          disk_certs_file = fname
      certs_file = disk_certs_file
  return certs_file


def GetCleanupFiles():
  """Returns a list of temp files to delete (if possible) when program exits."""
  cleanup_files = []
  if configured_certs_files:
    cleanup_files += configured_certs_files
  return cleanup_files


def ProxyInfoFromEnvironmentVar(proxy_env_var):
  """Reads proxy info from the environment and converts to httplib2.ProxyInfo.

  Args:
    proxy_env_var: Environment variable string to read, such as http_proxy or
       https_proxy.

  Returns:
    httplib2.ProxyInfo constructed from the environment string.
  """
  proxy_url = os.environ.get(proxy_env_var)
  if not proxy_url or not proxy_env_var.lower().startswith('http'):
    return httplib2.ProxyInfo(httplib2.socks.PROXY_TYPE_HTTP, None, 0)
  proxy_protocol = proxy_env_var.lower().split('_')[0]
  if not proxy_url.lower().startswith('http'):
    # proxy_info_from_url requires a protocol, which is always http or https.
    proxy_url = proxy_protocol + '://' + proxy_url
  return httplib2.proxy_info_from_url(proxy_url, method=proxy_protocol)
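# Example (illustrative): with os.environ['http_proxy'] set to
# 'proxy.example.com:3128', the URL is normalized to
# 'http://proxy.example.com:3128' before being handed to
# httplib2.proxy_info_from_url.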


def GetNewHttp(http_class=httplib2.Http, **kwargs):
  """Creates and returns a new httplib2.Http instance.

  Args:
    http_class: Optional custom Http class to use.
    **kwargs: Arguments to pass to http_class constructor.

  Returns:
    An initialized httplib2.Http instance.
  """
  proxy_info = httplib2.ProxyInfo(
      proxy_type=3,  # socks.PROXY_TYPE_HTTP
      proxy_host=boto.config.get('Boto', 'proxy', None),
      proxy_port=boto.config.getint('Boto', 'proxy_port', 0),
      proxy_user=boto.config.get('Boto', 'proxy_user', None),
      proxy_pass=boto.config.get('Boto', 'proxy_pass', None),
      proxy_rdns=boto.config.get('Boto', 'proxy_rdns', False))

  if not (proxy_info.proxy_host and proxy_info.proxy_port):
    # Fall back to using the environment variable.
    for proxy_env_var in ['http_proxy', 'https_proxy', 'HTTPS_PROXY']:
      if proxy_env_var in os.environ and os.environ[proxy_env_var]:
        proxy_info = ProxyInfoFromEnvironmentVar(proxy_env_var)
        # Assume proxy_rdns is True if a proxy environment variable exists.
        proxy_info.proxy_rdns = boto.config.get('Boto', 'proxy_rdns', True)
        break

  # Some installers don't package a certs file with httplib2, so use the
  # one included with gsutil.
  kwargs['ca_certs'] = GetCertsFile()
  # Use a non-infinite SSL timeout to avoid hangs during network flakiness.
  kwargs['timeout'] = SSL_TIMEOUT
  http = http_class(proxy_info=proxy_info, **kwargs)
  http.disable_ssl_certificate_validation = (not config.getbool(
      'Boto', 'https_validate_certificates'))
  return http
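# Example usage (illustrative sketch; httplib2.Http.request returns a
# (response, content) pair):
#   http = GetNewHttp()
#   response, content = http.request('https://storage.googleapis.com/')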


# Retry for 10 minutes with exponential backoff, which corresponds to
# the maximum Downtime Period specified in the GCS SLA
# (https://cloud.google.com/storage/sla)
def GetNumRetries():
  return config.getint('Boto', 'num_retries', 23)


def GetMaxRetryDelay():
  return config.getint('Boto', 'max_retry_delay', 32)


# Resumable downloads and uploads make one HTTP call per chunk (and must be
# in multiples of 256 KiB). Overridable for testing.
def GetJsonResumableChunkSize():
  chunk_size = config.getint('GSUtil', 'json_resumable_chunk_size',
                             1024*1024*100L)
  if chunk_size == 0:
    chunk_size = 1024*256L
  elif chunk_size % (1024*256L) != 0:
    chunk_size += (1024*256L - (chunk_size % (1024*256L)))
  return chunk_size
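# Example: the 100 MiB default is already a multiple of 256 KiB and is
# returned unchanged; a configured value of 300000 bytes would be rounded up
# to the next multiple, 524288 (2 * 256 KiB).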


def _RoundToNearestExponent(num):
  i = 0
  while i+1 < len(_EXP_STRINGS) and num >= (2 ** _EXP_STRINGS[i+1][0]):
    i += 1
  return i, round(float(num) / 2 ** _EXP_STRINGS[i][0], 2)


def MakeHumanReadable(num):
  """Generates human readable string for a number of bytes.

  Args:
    num: The number, in bytes.

  Returns:
    A string form of the number using size abbreviations (KiB, MiB, etc.).
  """
  i, rounded_val = _RoundToNearestExponent(num)
  return '%g %s' % (rounded_val, _EXP_STRINGS[i][1])


def MakeBitsHumanReadable(num):
  """Generates human readable string for a number of bits.

  Args:
    num: The number, in bits.

  Returns:
    A string form of the number using bit size abbreviations (Kibit, Mibit,
    etc.).
  """
  i, rounded_val = _RoundToNearestExponent(num)
  return '%g %s' % (rounded_val, _EXP_STRINGS[i][2])
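# Examples: MakeHumanReadable(2621440) returns '2.5 MiB', and
# MakeBitsHumanReadable(1048576) returns '1 Mibit'.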


def HumanReadableToBytes(human_string):
  """Tries to convert a human-readable string to a number of bytes.

  Args:
    human_string: A string supplied by user, e.g. '1M', '3 GiB'.
  Returns:
    An integer containing the number of bytes.
  Raises:
    ValueError: on an invalid string.
  """
  human_string = human_string.lower()
  m = MATCH_HUMAN_BYTES.match(human_string)
  if m:
    num = float(m.group('num'))
    if m.group('suffix'):
      power = _EXP_STRINGS[SUFFIX_TO_SI[m.group('suffix')]][0]
      num *= (2.0 ** power)
    num = int(round(num))
    return num
  raise ValueError('Invalid byte string specified: %s' % human_string)
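# Examples: HumanReadableToBytes('1M') == 1048576;
# HumanReadableToBytes('3 GiB') == 3221225472.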


def Percentile(values, percent, key=lambda x: x):
  """Find the percentile of a list of values.

  Taken from: http://code.activestate.com/recipes/511478/

  Args:
    values: a list of numeric values. Note that the values MUST BE already
            sorted.
    percent: a float value from 0.0 to 1.0.
    key: optional key function to compute value from each element of the list
         of values.

  Returns:
    The percentile of the values.
  """
  if not values:
    return None
  k = (len(values) - 1) * percent
  f = math.floor(k)
  c = math.ceil(k)
  if f == c:
    return key(values[int(k)])
  d0 = key(values[int(f)]) * (c-k)
  d1 = key(values[int(c)]) * (k-f)
  return d0 + d1
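# Example: Percentile([1, 2, 3, 4], 0.5) returns 2.5; the median falls between
# the two middle values, so it is linearly interpolated.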


def RemoveCRLFFromString(input_str):
  """Returns the input string with all \\n and \\r removed."""
  return re.sub(r'[\r\n]', '', input_str)


def UnaryDictToXml(message):
  """Generates XML representation of a nested dict.

  This dict contains exactly one top-level entry and an arbitrary number of
  2nd-level entries, e.g. capturing a WebsiteConfiguration message.

  Args:
    message: The dict encoding the message.

  Returns:
    XML string representation of the input dict.

  Raises:
    Exception: if dict contains more than one top-level entry.
  """
  if len(message) != 1:
    raise Exception('Expected dict of size 1, got size %d' % len(message))

  name, content = message.items()[0]
  element_type = ElementTree.Element(name)
  for element_property, value in sorted(content.items()):
    node = ElementTree.SubElement(element_type, element_property)
    node.text = value
  return ElementTree.tostring(element_type)
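# Example (illustrative): UnaryDictToXml({'WebsiteConfiguration':
# {'MainPageSuffix': 'index.html'}}) produces a <WebsiteConfiguration> element
# containing a single <MainPageSuffix>index.html</MainPageSuffix> child.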


def LookUpGsutilVersion(gsutil_api, url_str):
  """Looks up the gsutil version of the specified gsutil tarball URL.

  Version is specified in the metadata field set on that object.

  Args:
    gsutil_api: gsutil Cloud API to use when retrieving gsutil tarball.
    url_str: tarball URL to retrieve (such as 'gs://pub/gsutil.tar.gz').

  Returns:
    Version string if URL is a cloud URL containing x-goog-meta-gsutil-version
    metadata, else None.
  """
  url = StorageUrlFromString(url_str)
  if url.IsCloudUrl():
    obj = gsutil_api.GetObjectMetadata(url.bucket_name, url.object_name,
                                       provider=url.scheme,
                                       fields=['metadata'])
    if obj.metadata and obj.metadata.additionalProperties:
      for prop in obj.metadata.additionalProperties:
        if prop.key == 'gsutil_version':
          return prop.value


def GetGsutilVersionModifiedTime():
  """Returns unix timestamp of when the VERSION file was last modified."""
  if not gslib.VERSION_FILE:
    return 0
  return int(os.path.getmtime(gslib.VERSION_FILE))


def IsRunningInteractively():
  """Returns True if currently running interactively on a TTY."""
  return sys.stdout.isatty() and sys.stderr.isatty() and sys.stdin.isatty()


def _HttpsValidateCertificatesEnabled():
  return config.getbool('Boto', 'https_validate_certificates', True)

CERTIFICATE_VALIDATION_ENABLED = _HttpsValidateCertificatesEnabled()


def _BotoIsSecure():
  return config.getbool('Boto', 'is_secure', True)

BOTO_IS_SECURE = _BotoIsSecure()


def ResumableThreshold():
  return config.getint('GSUtil', 'resumable_threshold', EIGHT_MIB)


def AddAcceptEncoding(headers):
  """Adds accept-encoding:gzip to the dictionary of headers."""
  # If Accept-Encoding is not already set, set it to enable gzip.
  if 'accept-encoding' not in headers:
    headers['accept-encoding'] = 'gzip'


# pylint: disable=too-many-statements
def PrintFullInfoAboutObject(bucket_listing_ref, incl_acl=True):
  """Print full info for given object (like what displays for gsutil ls -L).

  Args:
    bucket_listing_ref: BucketListingRef being listed.
                        Must have ref_type OBJECT and a populated root_object
                        with the desired fields.
    incl_acl: True if ACL info should be output.

  Returns:
    Tuple (number of objects, object_length)

  Raises:
    Exception: if calling bug encountered.
  """
  url_str = bucket_listing_ref.url_string
  storage_url = StorageUrlFromString(url_str)
  obj = bucket_listing_ref.root_object

  if (obj.metadata and S3_DELETE_MARKER_GUID in
      obj.metadata.additionalProperties):
    num_bytes = 0
    num_objs = 0
    url_str += '<DeleteMarker>'
  else:
    num_bytes = obj.size
    num_objs = 1

  print '%s:' % url_str.encode(UTF8)
  if obj.updated:
    print '\tCreation time:\t\t%s' % obj.updated.strftime(
        '%a, %d %b %Y %H:%M:%S GMT')
  if obj.cacheControl:
    print '\tCache-Control:\t\t%s' % obj.cacheControl
  if obj.contentDisposition:
    print '\tContent-Disposition:\t\t%s' % obj.contentDisposition
  if obj.contentEncoding:
    print '\tContent-Encoding:\t\t%s' % obj.contentEncoding
  if obj.contentLanguage:
    print '\tContent-Language:\t%s' % obj.contentLanguage
  print '\tContent-Length:\t\t%s' % obj.size
  print '\tContent-Type:\t\t%s' % obj.contentType
  if obj.componentCount:
    print '\tComponent-Count:\t%d' % obj.componentCount
  marker_props = {}
  if obj.metadata and obj.metadata.additionalProperties:
    non_marker_props = []
    for add_prop in obj.metadata.additionalProperties:
      if add_prop.key not in S3_MARKER_GUIDS:
        non_marker_props.append(add_prop)
      else:
        marker_props[add_prop.key] = add_prop.value
    if non_marker_props:
      print '\tMetadata:'
      for ap in non_marker_props:
        meta_string = '\t\t%s:\t\t%s' % (ap.key, ap.value)
        print meta_string.encode(UTF8)
  if obj.crc32c: print '\tHash (crc32c):\t\t%s' % obj.crc32c
  if obj.md5Hash: print '\tHash (md5):\t\t%s' % obj.md5Hash
  print '\tETag:\t\t\t%s' % obj.etag.strip('"\'')
  if obj.generation:
    generation_str = GenerationFromUrlAndString(storage_url, obj.generation)
    print '\tGeneration:\t\t%s' % generation_str
  if obj.metageneration:
    print '\tMetageneration:\t\t%s' % obj.metageneration
  if incl_acl:
    # JSON API won't return acls as part of the response unless we have
    # full control scope.
    if obj.acl:
      print '\tACL:\t\t%s' % AclTranslation.JsonFromMessage(obj.acl)
    elif S3_ACL_MARKER_GUID in marker_props:
      print '\tACL:\t\t%s' % marker_props[S3_ACL_MARKER_GUID]
    else:
      print ('\tACL:\t\t\tACCESS DENIED. Note: you need OWNER '
             'permission\n\t\t\t\ton the object to read its ACL.')

  return (num_objs, num_bytes)


def CompareVersions(first, second):
  """Compares the first and second gsutil version strings.

  For example, 3.33 > 3.7, and 4.1 is a greater major version than 3.33.
  Does not handle multiple periods (e.g. 3.3.4) or complicated suffixes
  (e.g., 3.3RC4 vs. 3.3RC5). A version string with a suffix is treated as
  less than its non-suffix counterpart (e.g. 3.32 > 3.32pre).

  Args:
    first: First gsutil version string.
    second: Second gsutil version string.

  Returns:
    (g, m):
       g is True if first known to be greater than second, else False.
       m is True if first known to be greater by at least 1 major version,
         else False.
  """
  m1 = VERSION_MATCHER.match(str(first))
  m2 = VERSION_MATCHER.match(str(second))

  # If passed strings we don't know how to handle, be conservative.
  if not m1 or not m2:
    return (False, False)

  major_ver1 = int(m1.group('maj'))
  minor_ver1 = int(m1.group('min')) if m1.group('min') else 0
  suffix_ver1 = m1.group('suffix')
  major_ver2 = int(m2.group('maj'))
  minor_ver2 = int(m2.group('min')) if m2.group('min') else 0
  suffix_ver2 = m2.group('suffix')

  if major_ver1 > major_ver2:
    return (True, True)
  elif major_ver1 == major_ver2:
    if minor_ver1 > minor_ver2:
      return (True, False)
    elif minor_ver1 == minor_ver2:
      return (bool(suffix_ver2) and not suffix_ver1, False)
  return (False, False)
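# Examples: CompareVersions('4.2', '4.1') returns (True, False);
# CompareVersions('5.0', '4.1') returns (True, True);
# CompareVersions('3.32', '3.32pre') returns (True, False).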


def _IncreaseSoftLimitForResource(resource_name, fallback_value):
  """Sets a new soft limit for the maximum number of open files.

  The soft limit is used for this process (and its children), but the
  hard limit is set by the system and cannot be exceeded.

  We will first try to set the soft limit to the hard limit's value; if that
  fails, we will try to set the soft limit to the fallback_value iff this would
  increase the soft limit.

  Args:
    resource_name: Name of the resource to increase the soft limit for.
    fallback_value: Fallback value to be used if we couldn't set the
                    soft value to the hard value (e.g., if the hard value
                    is "unlimited").

  Returns:
    Current soft limit for the resource (after any changes we were able to
    make), or -1 if the resource doesn't exist.
  """

  # Get the value of the resource.
  try:
    (soft_limit, hard_limit) = resource.getrlimit(resource_name)
  except (resource.error, ValueError):
    # The resource wasn't present, so we can't do anything here.
    return -1

  # Try to set the value of the soft limit to the value of the hard limit.
  if hard_limit > soft_limit:  # Some OS's report 0 for "unlimited".
    try:
      resource.setrlimit(resource_name, (hard_limit, hard_limit))
      return hard_limit
    except (resource.error, ValueError):
      # We'll ignore this and try the fallback value.
      pass

  # Try to set the value of the soft limit to the fallback value.
  if soft_limit < fallback_value:
    try:
      resource.setrlimit(resource_name, (fallback_value, hard_limit))
      return fallback_value
    except (resource.error, ValueError):
      # We couldn't change the soft limit, so just report the current
      # value of the soft limit.
      return soft_limit
  else:
    return soft_limit


def GetCloudApiInstance(cls, thread_state=None):
  """Gets a gsutil Cloud API instance.

  Since Cloud API implementations are not guaranteed to be thread-safe, each
  thread needs its own instance. These instances are passed to each thread
  via the thread pool logic in command.

  Args:
    cls: Command class to be used for single-threaded case.
    thread_state: Per thread state from this thread containing a gsutil
                  Cloud API instance.

  Returns:
    gsutil Cloud API instance.
  """
  return thread_state or cls.gsutil_api


def GetFileSize(fp, position_to_eof=False):
  """Returns size of file, optionally leaving fp positioned at EOF."""
  if not position_to_eof:
    cur_pos = fp.tell()
  fp.seek(0, os.SEEK_END)
  cur_file_size = fp.tell()
  if not position_to_eof:
    fp.seek(cur_pos)
  return cur_file_size


def GetStreamFromFileUrl(storage_url):
  if storage_url.IsStream():
    return sys.stdin
  else:
    return open(storage_url.object_name, 'rb')


def UrlsAreForSingleProvider(url_args):
  """Tests whether the URLs are all for a single provider.

  Args:
    url_args: Strings to check.

  Returns:
    True if URLs are for single provider, False otherwise.
  """
  provider = None
  for url_str in url_args:
    url = StorageUrlFromString(url_str)
    if not provider:
      provider = url.scheme
    elif url.scheme != provider:
      return False
  return provider is not None


def HaveFileUrls(args_to_check):
  """Checks whether args_to_check contain any file URLs.

  Args:
    args_to_check: Command-line argument subset to check.

  Returns:
    True if args_to_check contains any file URLs.
  """
  for url_str in args_to_check:
    storage_url = StorageUrlFromString(url_str)
    if storage_url.IsFileUrl():
      return True
  return False


def HaveProviderUrls(args_to_check):
  """Checks whether args_to_check contains any provider URLs (like 'gs://').

  Args:
    args_to_check: Command-line argument subset to check.

  Returns:
    True if args_to_check contains any provider URLs.
  """
  for url_str in args_to_check:
    storage_url = StorageUrlFromString(url_str)
    if storage_url.IsCloudUrl() and storage_url.IsProvider():
      return True
  return False

# This must be defined at the module level for pickling across processes.
MultiprocessingIsAvailableResult = collections.namedtuple(
    'MultiprocessingIsAvailableResult', ['is_available', 'stack_trace'])


def CheckMultiprocessingAvailableAndInit(logger=None):
  """Checks if multiprocessing is available.

  There are some environments in which there is no way to use multiprocessing
  logic that's built into Python (e.g., if /dev/shm is not available, then
  we can't create semaphores). This simply tries out a few things that will be
  needed to make sure the environment can support the pieces of the
  multiprocessing module that we need.

  If multiprocessing is available, this performs necessary initialization for
  multiprocessing.  See gslib.command.InitializeMultiprocessingVariables for
  an explanation of why this is necessary.

  Args:
    logger: logging.logger to use for debug output.

  Returns:
    (multiprocessing_is_available, stack_trace):
      multiprocessing_is_available: True iff the multiprocessing module is
                                    available for use.
      stack_trace: The stack trace generated by the call we tried that failed.
  """
  global cached_multiprocessing_is_available
  global cached_multiprocessing_check_stack_trace
  global cached_multiprocessing_is_available_message
  if cached_multiprocessing_is_available is not None:
    # Only log on the cached path if the check failed; the stack trace and
    # message are only meaningful in that case.
    if logger and not cached_multiprocessing_is_available:
      logger.debug(cached_multiprocessing_check_stack_trace)
      logger.warn(cached_multiprocessing_is_available_message)
    return MultiprocessingIsAvailableResult(
        is_available=cached_multiprocessing_is_available,
        stack_trace=cached_multiprocessing_check_stack_trace)

  if IS_WINDOWS:
    message = """
Multiple processes are not supported on Windows. Operations requesting
parallelism will be executed with multiple threads in a single process only.
"""
    if logger:
      logger.warn(message)
    return MultiprocessingIsAvailableResult(is_available=False,
                                            stack_trace=None)

  stack_trace = None
  multiprocessing_is_available = True
  message = """
You have requested multiple processes for an operation, but the
required functionality of Python's multiprocessing module is not available.
Operations requesting parallelism will be executed with multiple threads in a
single process only.
"""
  try:
    # Fails if /dev/shm (or some equivalent thereof) is not available for use
    # (e.g., there's no implementation, or we can't write to it, etc.).
    try:
      multiprocessing.Value('i', 0)
    except:
      message += """
Please ensure that you have write access to both /dev/shm and /run/shm.
"""
      raise  # We'll handle this in one place below.

    # Manager objects and Windows are generally a pain to work with, so try it
    # out as a sanity check. This definitely works on some versions of Windows,
    # but it's certainly possible that there is some unknown configuration for
    # which it won't.
    global manager  # pylint: disable=global-variable-undefined

    manager = multiprocessing.Manager()

    # Check that the max number of open files is reasonable. Always check this
    # after we're sure that the basic multiprocessing functionality is
    # available, since this won't matter unless that's true.
    limit = -1
    if HAS_RESOURCE_MODULE:
      # Try to set this with both resource names - RLIMIT_NOFILE for most Unix
      # platforms, and RLIMIT_OFILE for BSD. Ignore AttributeError because the
      # "resource" module is not guaranteed to know about these names.
      try:
        limit = max(limit,
                    _IncreaseSoftLimitForResource(
                        resource.RLIMIT_NOFILE,
                        MIN_ACCEPTABLE_OPEN_FILES_LIMIT))
      except AttributeError:
        pass
      try:
        limit = max(limit,
                    _IncreaseSoftLimitForResource(
                        resource.RLIMIT_OFILE, MIN_ACCEPTABLE_OPEN_FILES_LIMIT))
      except AttributeError:
        pass

    if limit < MIN_ACCEPTABLE_OPEN_FILES_LIMIT:
      message += ("""
Your max number of open files, %s, is too low to allow safe multiprocessing.
On Linux you can fix this by adding something like "ulimit -n 10000" to your
~/.bashrc or equivalent file and opening a new terminal.

On MacOS, you may also need to run a command like this once (in addition to the
above instructions), which might require a restart of your system to take
effect:
  launchctl limit maxfiles 10000

Alternatively, edit /etc/launchd.conf with something like:
  limit maxfiles 10000 10000

""" % limit)
      raise Exception('Max number of open files, %s, is too low.' % limit)
  except:  # pylint: disable=bare-except
    stack_trace = traceback.format_exc()
    multiprocessing_is_available = False
    if logger is not None:
      logger.debug(stack_trace)
      logger.warn(message)

  # Set the cached values so that we never need to do this check again.
  cached_multiprocessing_is_available = multiprocessing_is_available
  cached_multiprocessing_check_stack_trace = stack_trace
  cached_multiprocessing_is_available_message = message
  return MultiprocessingIsAvailableResult(
      is_available=cached_multiprocessing_is_available,
      stack_trace=cached_multiprocessing_check_stack_trace)


def CreateLock():
  """Returns either a multiprocessing lock or a threading lock.

  Use Multiprocessing lock iff we have access to the parts of the
  multiprocessing module that are necessary to enable parallelism in operations.

  Returns:
    Multiprocessing or threading lock.
  """
  if CheckMultiprocessingAvailableAndInit().is_available:
    return manager.Lock()
  else:
    return threading.Lock()


def IsCloudSubdirPlaceholder(url, blr=None):
  """Determines if URL is a cloud subdir placeholder.

  This function is needed because GUI tools (like the GCS cloud console) allow
  users to create empty "folders" by creating a placeholder object; and parts
  of gsutil need to treat those placeholder objects specially. For example,
  gsutil rsync needs to avoid downloading those objects because they can cause
  conflicts (see comments in rsync command for details).

  We currently detect two cases:
    - Cloud objects whose name ends with '_$folder$'
    - Cloud objects whose name ends with '/'

  Args:
    url: The URL to be checked.
    blr: BucketListingRef to check, or None if not available.
         If None, size won't be checked.

  Returns:
    True/False.
  """
  if not url.IsCloudUrl():
    return False
  url_str = url.url_string
  if url_str.endswith('_$folder$'):
    return True
  if blr and blr.IsObject():
    size = blr.root_object.size
  else:
    size = 0
  return size == 0 and url_str.endswith('/')
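# Example: an object named 'gs://bucket/dir_$folder$' is always treated as a
# placeholder, and 'gs://bucket/dir/' is treated as one when its size is 0.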


def GetTermLines():
  """Returns number of terminal lines."""
  # fcntl isn't supported in Windows.
  try:
    import fcntl    # pylint: disable=g-import-not-at-top
    import termios  # pylint: disable=g-import-not-at-top
  except ImportError:
    return _DEFAULT_LINES
  def ioctl_GWINSZ(fd):  # pylint: disable=invalid-name
    try:
      return struct.unpack(
          'hh', fcntl.ioctl(fd, termios.TIOCGWINSZ, '1234'))[0]
    except:  # pylint: disable=bare-except
      return 0  # Failure (so will retry on different file descriptor below).
  # Try to find a valid number of lines from termio for stdin, stdout,
  # or stderr, in that order.
  ioc = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2)
  if not ioc:
    try:
      fd = os.open(os.ctermid(), os.O_RDONLY)
      ioc = ioctl_GWINSZ(fd)
      os.close(fd)
    except:  # pylint: disable=bare-except
      pass
  if not ioc:
    ioc = os.environ.get('LINES', _DEFAULT_LINES)
  return int(ioc)


class GsutilStreamHandler(logging.StreamHandler):
  """A subclass of StreamHandler for use in gsutil."""

  def flush(self):
    # Note: we override the flush method here due to a python 2.6 bug. The
    # python logging module will try to flush all stream handlers at exit.
    # If the StreamHandler is pointing to a file that is already closed, the
    # method throws an exception. Our unit tests temporarily redirect stderr,
    # which causes the default StreamHandler to open its stream against a
    # temporary file. By the time the process shuts down, the underlying file
    # is closed, causing an exception. This was fixed in Python 2.7, but to
    # remove the flake from Python 2.6, we maintain this here.
    try:
      logging.StreamHandler.flush(self)
    except ValueError:
      pass


def StdinIterator():
  """A generator function that returns lines from stdin."""
  for line in sys.stdin:
    # Strip CRLF (and any other trailing whitespace).
    yield line.rstrip()
   1187