      1 # -*- coding: utf-8 -*-
      2 # Copyright 2011 Google Inc. All Rights Reserved.
      3 # Copyright 2011, Nexenta Systems Inc.
      4 #
      5 # Licensed under the Apache License, Version 2.0 (the "License");
      6 # you may not use this file except in compliance with the License.
      7 # You may obtain a copy of the License at
      8 #
      9 #     http://www.apache.org/licenses/LICENSE-2.0
     10 #
     11 # Unless required by applicable law or agreed to in writing, software
     12 # distributed under the License is distributed on an "AS IS" BASIS,
     13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 # See the License for the specific language governing permissions and
     15 # limitations under the License.
     16 """Implementation of Unix-like cp command for cloud storage providers."""
     17 
     18 from __future__ import absolute_import
     19 
     20 import os
     21 import time
     22 import traceback
     23 
     24 from gslib import copy_helper
     25 from gslib.cat_helper import CatHelper
     26 from gslib.command import Command
     27 from gslib.command_argument import CommandArgument
     28 from gslib.commands.compose import MAX_COMPONENT_COUNT
     29 from gslib.copy_helper import CreateCopyHelperOpts
     30 from gslib.copy_helper import ItemExistsError
     31 from gslib.copy_helper import Manifest
     32 from gslib.copy_helper import PARALLEL_UPLOAD_TEMP_NAMESPACE
     33 from gslib.copy_helper import SkipUnsupportedObjectError
     34 from gslib.cs_api_map import ApiSelector
     35 from gslib.exception import CommandException
     36 from gslib.name_expansion import NameExpansionIterator
     37 from gslib.storage_url import ContainsWildcard
     38 from gslib.util import CreateLock
     39 from gslib.util import GetCloudApiInstance
     40 from gslib.util import IsCloudSubdirPlaceholder
     41 from gslib.util import MakeHumanReadable
     42 from gslib.util import NO_MAX
     43 from gslib.util import RemoveCRLFFromString
     44 from gslib.util import StdinIterator
     45 
     46 _SYNOPSIS = """
     47   gsutil cp [OPTION]... src_url dst_url
     48   gsutil cp [OPTION]... src_url... dst_url
     49   gsutil cp [OPTION]... -I dst_url
     50 """
     51 
     52 _SYNOPSIS_TEXT = """
     53 <B>SYNOPSIS</B>
     54 """ + _SYNOPSIS
     55 
     56 _DESCRIPTION_TEXT = """
     57 <B>DESCRIPTION</B>
     58   The gsutil cp command allows you to copy data between your local file
     59   system and the cloud, copy data within the cloud, and copy data between
     60   cloud storage providers. For example, to copy all text files from the
     61   local directory to a bucket you could do:
     62 
     63     gsutil cp *.txt gs://my_bucket
     64 
     65   Similarly, you can download text files from a bucket by doing:
     66 
     67     gsutil cp gs://my_bucket/*.txt .
     68 
     69   If you want to copy an entire directory tree you need to use the -r option:
     70 
     71     gsutil cp -r dir gs://my_bucket
     72 
     73   If you have a large number of files to upload you might want to use the
     74   gsutil -m option, to perform a parallel (multi-threaded/multi-processing)
     75   copy:
     76 
     77     gsutil -m cp -r dir gs://my_bucket
     78 
     79   You can pass a list of URLs (one per line) to copy on stdin instead of as
     80   command line arguments by using the -I option. This allows you to use gsutil
     81   in a pipeline to upload or download files / objects as generated by a program,
     82   such as:
     83 
     84     some_program | gsutil -m cp -I gs://my_bucket
     85 
     86   or:
     87 
     88     some_program | gsutil -m cp -I ./download_dir
     89 
     90   The contents of stdin can name files, cloud URLs, and wildcards of files
     91   and cloud URLs.
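
           For example, if you have a file (say, object_list.txt, an illustrative
           name) containing a list of object URLs to download, one per line, you
           could run:

             cat object_list.txt | gsutil -m cp -I ./download_dir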
     92 """
     93 
     94 _NAME_CONSTRUCTION_TEXT = """
     95 <B>HOW NAMES ARE CONSTRUCTED</B>
     96   The gsutil cp command strives to name objects in a way consistent with how
     97   Linux cp works, which causes names to be constructed in varying ways depending
     98   on whether you're performing a recursive directory copy or copying
     99   individually named objects; and whether you're copying to an existing or
    100   non-existent directory.
    101 
    102   When performing recursive directory copies, object names are constructed
    103   that mirror the source directory structure starting at the point of
    104   recursive processing. For example, the command:
    105 
    106     gsutil cp -r dir1/dir2 gs://my_bucket
    107 
    108   will create objects named like gs://my_bucket/dir2/a/b/c, assuming
    109   dir1/dir2 contains the file a/b/c.
    110 
    111   In contrast, copying individually named files will result in objects named
    112   by the final path component of the source files. For example, the command:
    113 
    114     gsutil cp dir1/dir2/** gs://my_bucket
    115 
    116   will create objects named like gs://my_bucket/c.
    117 
    118   The same rules apply for downloads: recursive copies of buckets and
    119   bucket subdirectories produce a mirrored filename structure, while copying
     120   individually (or wildcard) named objects produces flatly named files.
    121 
    122   Note that in the above example the '**' wildcard matches all names
     123   anywhere under dir2. The wildcard '*' will match names just one level deep. For
    124   more details see 'gsutil help wildcards'.
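
           For example, if dir1/dir2 contains a file x.txt and a subdirectory sub
           containing y.txt (illustrative names), then:

             gsutil cp dir1/dir2/* gs://my_bucket

           uploads only x.txt (without -r, directories are skipped), whereas using
           dir1/dir2/** would also upload y.txt, named gs://my_bucket/y.txt.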
    125 
    126   There's an additional wrinkle when working with subdirectories: the resulting
    127   names depend on whether the destination subdirectory exists. For example,
    128   if gs://my_bucket/subdir exists as a subdirectory, the command:
    129 
    130     gsutil cp -r dir1/dir2 gs://my_bucket/subdir
    131 
    132   will create objects named like gs://my_bucket/subdir/dir2/a/b/c. In contrast,
    133   if gs://my_bucket/subdir does not exist, this same gsutil cp command will
    134   create objects named like gs://my_bucket/subdir/a/b/c.
    135 
    136   Note: If you use the
    137   `Google Developers Console <https://console.developers.google.com>`_
    138   to create folders, it does so by creating a "placeholder" object that ends
    139   with a "/" character. gsutil skips these objects when downloading from the
    140   cloud to the local file system, because attempting to create a file that
    141   ends with a "/" is not allowed on Linux and MacOS. Because of this, it is
    142   recommended that you not create objects that end with "/" (unless you don't
    143   need to be able to download such objects using gsutil).
    144 """
    145 
    146 _SUBDIRECTORIES_TEXT = """
    147 <B>COPYING TO/FROM SUBDIRECTORIES; DISTRIBUTING TRANSFERS ACROSS MACHINES</B>
    148   You can use gsutil to copy to and from subdirectories by using a command
    149   like:
    150 
    151     gsutil cp -r dir gs://my_bucket/data
    152 
    153   This will cause dir and all of its files and nested subdirectories to be
    154   copied under the specified destination, resulting in objects with names like
    155   gs://my_bucket/data/dir/a/b/c. Similarly you can download from bucket
    156   subdirectories by using a command like:
    157 
    158     gsutil cp -r gs://my_bucket/data dir
    159 
    160   This will cause everything nested under gs://my_bucket/data to be downloaded
    161   into dir, resulting in files with names like dir/data/a/b/c.
    162 
    163   Copying subdirectories is useful if you want to add data to an existing
    164   bucket directory structure over time. It's also useful if you want
    165   to parallelize uploads and downloads across multiple machines (often
    166   reducing overall transfer time compared with simply running gsutil -m
    167   cp on one machine). For example, if your bucket contains this structure:
    168 
    169     gs://my_bucket/data/result_set_01/
    170     gs://my_bucket/data/result_set_02/
    171     ...
    172     gs://my_bucket/data/result_set_99/
    173 
    174   you could perform concurrent downloads across 3 machines by running these
    175   commands on each machine, respectively:
    176 
    177     gsutil -m cp -r gs://my_bucket/data/result_set_[0-3]* dir
    178     gsutil -m cp -r gs://my_bucket/data/result_set_[4-6]* dir
    179     gsutil -m cp -r gs://my_bucket/data/result_set_[7-9]* dir
    180 
    181   Note that dir could be a local directory on each machine, or it could
    182   be a directory mounted off of a shared file server; whether the latter
    183   performs acceptably may depend on a number of things, so we recommend
    184   you experiment and find out what works best for you.
    185 """
    186 
    187 _COPY_IN_CLOUD_TEXT = """
    188 <B>COPYING IN THE CLOUD AND METADATA PRESERVATION</B>
    189   If both the source and destination URL are cloud URLs from the same
    190   provider, gsutil copies data "in the cloud" (i.e., without downloading
    191   to and uploading from the machine where you run gsutil). In addition to
    192   the performance and cost advantages of doing this, copying in the cloud
    193   preserves metadata (like Content-Type and Cache-Control). In contrast,
    194   when you download data from the cloud it ends up in a file, which has
    195   no associated metadata. Thus, unless you have some way to hold on to
    196   or re-create that metadata, downloading to a file will not retain the
    197   metadata.
    198 
    199   Copies spanning locations and/or storage classes cause data to be rewritten
    200   in the cloud, which may take some time. Such operations can be resumed with
    201   the same command if they are interrupted, so long as the command parameters
    202   are identical. 
    203 
    204   Note that by default, the gsutil cp command does not copy the object
    205   ACL to the new object, and instead will use the default bucket ACL (see
    206   "gsutil help defacl").  You can override this behavior with the -p
    207   option (see OPTIONS below).
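
           For example, to copy an object within the cloud while preserving its
           ACL (assuming you have OWNER access to the source object), you could
           run:

             gsutil cp -p gs://bucket1/obj gs://bucket2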
    208 
    209   One additional note about copying in the cloud: If the destination bucket has
    210   versioning enabled, gsutil cp will by default copy only live versions of the
    211   source object(s). For example:
    212 
    213     gsutil cp gs://bucket1/obj gs://bucket2
    214 
     215   will cause only the single live version of gs://bucket1/obj to be copied
    216   to gs://bucket2, even if there are archived versions of gs://bucket1/obj. To
    217   also copy archived versions, use the -A flag:
    218 
    219     gsutil cp -A gs://bucket1/obj gs://bucket2
    220 
    221   The gsutil -m flag is disallowed when using the cp -A flag, to ensure that
    222   version ordering is preserved.
    223 """
    224 
    225 _CHECKSUM_VALIDATION_TEXT = """
    226 <B>CHECKSUM VALIDATION</B>
    227   At the end of every upload or download the gsutil cp command validates that
    228   the checksum it computes for the source file/object matches the checksum
    229   the service computes. If the checksums do not match, gsutil will delete the
    230   corrupted object and print a warning message. This very rarely happens, but
     231   if it does, please contact gs-team@google.com.
    232 
    233   If you know the MD5 of a file before uploading you can specify it in the
    234   Content-MD5 header, which will cause the cloud storage service to reject the
    235   upload if the MD5 doesn't match the value computed by the service. For
    236   example:
    237 
    238     % gsutil hash obj
    239     Hashing     obj:
    240     Hashes [base64] for obj:
    241             Hash (crc32c):          lIMoIw==
    242             Hash (md5):             VgyllJgiiaRAbyUUIqDMmw==
    243 
    244     % gsutil -h Content-MD5:VgyllJgiiaRAbyUUIqDMmw== cp obj gs://your-bucket/obj
    245     Copying file://obj [Content-Type=text/plain]...
    246     Uploading   gs://your-bucket/obj:                                182 b/182 B
    247 
     248   If the checksum didn't match, the service would instead reject the upload
     249   and gsutil would print a message like:
    250 
    251     BadRequestException: 400 Provided MD5 hash "VgyllJgiiaRAbyUUIqDMmw=="
    252     doesn't match calculated MD5 hash "7gyllJgiiaRAbyUUIqDMmw==".
    253 
    254   Even if you don't do this gsutil will delete the object if the computed
    255   checksum mismatches, but specifying the Content-MD5 header has three
    256   advantages:
    257 
    258       1. It prevents the corrupted object from becoming visible at all, whereas
    259       otherwise it would be visible for 1-3 seconds before gsutil deletes it.
    260 
    261       2. It will definitively prevent the corrupted object from being left in
    262       the cloud, whereas the gsutil approach of deleting after the upload
    263       completes could fail if (for example) the gsutil process gets ^C'd
    264       between upload and deletion request.
    265 
    266       3. It supports a customer-to-service integrity check handoff. For example,
    267       if you have a content production pipeline that generates data to be
    268       uploaded to the cloud along with checksums of that data, specifying the
    269       MD5 computed by your content pipeline when you run gsutil cp will ensure
    270       that the checksums match all the way through the process (e.g., detecting
    271       if data gets corrupted on your local disk between the time it was written
    272       by your content pipeline and the time it was uploaded to GCS).
    273 
    274   Note: The Content-MD5 header is ignored for composite objects, because such
    275   objects only have a CRC32C checksum.
    276 """
    277 
    278 _RETRY_HANDLING_TEXT = """
    279 <B>RETRY HANDLING</B>
    280   The cp command will retry when failures occur, but if enough failures happen
    281   during a particular copy or delete operation the command will skip that object
     282   and move on. At the end of the copy run, if any failures were not
     283   successfully retried, the cp command will report the count of failures and
     284   exit with non-zero status.
    285 
    286   Note that there are cases where retrying will never succeed, such as if you
    287   don't have write permission to the destination bucket or if the destination
    288   path for some objects is longer than the maximum allowed length.
    289 
    290   For more details about gsutil's retry handling, please see
    291   "gsutil help retries".
    292 """
    293 
    294 _RESUMABLE_TRANSFERS_TEXT = """
    295 <B>RESUMABLE TRANSFERS</B>
    296   gsutil automatically uses the Google Cloud Storage resumable upload feature
    297   whenever you use the cp command to upload an object that is larger than 2
    298   MiB. You do not need to specify any special command line options to make this
    299   happen. If your upload is interrupted you can restart the upload by running
    300   the same cp command that you ran to start the upload. Until the upload
    301   has completed successfully, it will not be visible at the destination object
    302   and will not replace any existing object the upload is intended to overwrite.
    303   (However, see the section on PARALLEL COMPOSITE UPLOADS, which may leave
    304   temporary component objects in place during the upload process.)
    305 
    306   Similarly, gsutil automatically performs resumable downloads (using HTTP
    307   standard Range GET operations) whenever you use the cp command, unless the
    308   destination is a stream or null. In this case, a partially downloaded
    309   temporary file will be visible in the destination directory. Upon completion,
    310   the original file is deleted and overwritten with the downloaded contents.
    311 
     312   Resumable uploads and downloads store some state information in a file
    313   in ~/.gsutil named by the destination object or file. If you attempt to
    314   resume a transfer from a machine with a different directory, the transfer
    315   will start over from scratch.
    316 
    317   See also "gsutil help prod" for details on using resumable transfers
    318   in production.
    319 """
    320 
    321 _STREAMING_TRANSFERS_TEXT = """
    322 <B>STREAMING TRANSFERS</B>
    323   Use '-' in place of src_url or dst_url to perform a streaming
    324   transfer. For example:
    325 
    326     long_running_computation | gsutil cp - gs://my_bucket/obj
    327 
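           Because '-' can also be used as the destination, you can similarly
           stream a download from the cloud into another program, for example:

             gsutil cp gs://my_bucket/obj - | some_program
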
    328   Streaming uploads using the JSON API (see "gsutil help apis") are buffered in
    329   memory and can retry in the event of network flakiness or service errors.
    330 
    331   Streaming transfers (other than uploads using the JSON API) do not support
    332   resumable uploads/downloads. If you have a large amount of data to upload
    333   (say, more than 100 MiB) it is recommended to write the data to a local file
    334   and then copy that file to the cloud rather than streaming it (and similarly
    335   for large downloads).
    336 
    337   WARNING: When performing streaming transfers gsutil does not compute a
    338   checksum of the uploaded or downloaded data.  Therefore, we recommend that
    339   users either perform their own validation of the data or use non-streaming
    340   transfers (which perform integrity checking automatically).
    341 """
    342 
    343 _SLICED_OBJECT_DOWNLOADS_TEXT = """
    344 <B>SLICED OBJECT DOWNLOADS</B>
    345   gsutil automatically uses HTTP Range GET requests to perform "sliced"
    346   downloads in parallel for downloads of large objects. This means that, if
    347   enabled, disk space for the temporary download destination file will be
    348   pre-allocated and byte ranges (slices) within the file will be downloaded in
    349   parallel. Once all slices have completed downloading, the temporary file will
    350   be renamed to the destination file. No additional local disk space is
    351   required for this operation.
    352 
    353   This feature is only available for Google Cloud Storage objects because it
    354   requires a fast composable checksum that can be used to verify the data
    355   integrity of the slices. Thus, using sliced object downloads also requires a
    356   compiled crcmod (see "gsutil help crcmod") on the machine performing the
    357   download. If compiled crcmod is not available, normal download will instead
    358   be used.
    359 
    360   Note: since sliced object downloads cause multiple writes to occur at various
    361   locations on disk, this can degrade performance for disks with slow seek
    362   times, especially for large numbers of slices. While the default number of
    363   slices is small to avoid this, sliced object download can be completely
    364   disabled by setting the "sliced_object_download_threshold" variable in the
    365   .boto config file to 0.
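
           For example, to disable sliced object downloads you could add the
           following to the "GSUtil" section of your .boto file (assuming the
           standard .boto layout):

             [GSUtil]
             sliced_object_download_threshold = 0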
    366 """
    367 
    368 _PARALLEL_COMPOSITE_UPLOADS_TEXT = """
    369 <B>PARALLEL COMPOSITE UPLOADS</B>
    370   gsutil can automatically use
    371   `object composition <https://developers.google.com/storage/docs/composite-objects>`_
    372   to perform uploads in parallel for large, local files being uploaded to Google
    373   Cloud Storage. This means that, if enabled (see next paragraph), a large file
    374   will be split into component pieces that will be uploaded in parallel. Those
    375   components will then be composed in the cloud, and the temporary components in
    376   the cloud will be deleted after successful composition. No additional local
    377   disk space is required for this operation.
    378 
    379   Using parallel composite uploads presents a tradeoff between upload
    380   performance and download configuration: If you enable parallel composite
    381   uploads your uploads will run faster, but someone will need to install a
    382   compiled crcmod (see "gsutil help crcmod") on every machine where objects are
    383   downloaded by gsutil or other Python applications. For some distributions this
    384   is easy (e.g., it comes pre-installed on MacOS), but in some cases users have
     385   found it difficult. Because of this, parallel composite uploads are currently
    386   disabled by default. Google is actively working with a number of the Linux
    387   distributions to get crcmod included with the stock distribution. Once that is
    388   done we will re-enable parallel composite uploads by default in gsutil.
    389 
    390   Parallel composite uploads should not be used with NEARLINE storage
    391   class buckets, as doing this would incur an early deletion charge for each
    392   component object.
    393 
    394   To try parallel composite uploads you can run the command:
    395 
    396     gsutil -o GSUtil:parallel_composite_upload_threshold=150M cp bigfile gs://your-bucket
    397 
     398   where bigfile is larger than 150 MiB. When you do this, notice that the upload
    399   progress indicator continuously updates for several different uploads at once
    400   (corresponding to each of the sections of the file being uploaded in
    401   parallel), until the parallel upload completes. If you then want to enable
    402   parallel composite uploads for all of your future uploads (notwithstanding the
    403   caveats mentioned earlier), you can uncomment and set the
    404   "parallel_composite_upload_threshold" config value in your .boto configuration
    405   file to this value.
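
           For example, the corresponding .boto entry (again assuming the standard
           "GSUtil" section of that file) would look like:

             [GSUtil]
             parallel_composite_upload_threshold = 150M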
    406 
    407   Note that the crcmod problem only impacts downloads via Python applications
     408   (such as gsutil). If all users who need to download the data using gsutil or
    409   other Python applications can install crcmod, it makes sense to enable
    410   parallel composite uploads (see above). For example, if you use gsutil to
    411   upload video assets and those assets will only ever be served via a Java
    412   application (there are efficient crc32c implementations available in Java), it
    413   would make sense to enable parallel composite uploads on your machine.
    414 
    415   If a parallel composite upload fails prior to composition, re-running the
    416   gsutil command will take advantage of resumable uploads for those components
    417   that failed, and the component objects will be deleted after the first
    418   successful attempt. Any temporary objects that were uploaded successfully
    419   before gsutil failed will still exist until the upload is completed
    420   successfully. The temporary objects will be named in the following fashion:
    421 
    422     <random ID>%s<hash>
    423 
    424   where <random ID> is some numerical value, and <hash> is an MD5 hash (not
    425   related to the hash of the contents of the file or object).
    426 
    427   To avoid leaving temporary objects around, you should make sure to check the
    428   exit status from the gsutil command.  This can be done in a bash script, for
    429   example, by doing:
    430 
    431      gsutil cp ./local-file gs://your-bucket/your-object
     432      if [ "$?" -ne "0" ] ; then
    433        << Code that handles failures >>
    434      fi
    435 
    436   Or, for copying a directory, use this instead:
    437 
    438      gsutil cp -c -L cp.log -r ./dir gs://bucket
     439      if [ "$?" -ne "0" ] ; then
    440        << Code that handles failures >>
    441      fi
    442 
    443   One important caveat is that files uploaded in this fashion are still subject
    444   to the maximum number of components limit. For example, if you upload a large
    445   file that gets split into %d components, and try to compose it with another
    446   object with %d components, the operation will fail because it exceeds the %d
    447   component limit. If you wish to compose an object later and the component
    448   limit is a concern, it is recommended that you disable parallel composite
    449   uploads for that transfer.
    450 
    451   Also note that an object uploaded using this feature will have a CRC32C hash,
    452   but it will not have an MD5 hash (and because of that, requires users who
    453   download the object to have crcmod installed, as noted earlier). For details
    454   see 'gsutil help crc32c'.
    455 
    456   Note that this feature can be completely disabled by setting the
    457   "parallel_composite_upload_threshold" variable in the .boto config file to 0.
    458 """ % (PARALLEL_UPLOAD_TEMP_NAMESPACE, 10, MAX_COMPONENT_COUNT - 9,
    459        MAX_COMPONENT_COUNT)
    460 
    461 
    462 _CHANGING_TEMP_DIRECTORIES_TEXT = """
    463 <B>CHANGING TEMP DIRECTORIES</B>
    464   gsutil writes data to a temporary directory in several cases:
    465 
    466   - when compressing data to be uploaded (see the -z option)
    467   - when decompressing data being downloaded (when the data has
    468     Content-Encoding:gzip, e.g., as happens when uploaded using gsutil cp -z)
    469   - when running integration tests (using the gsutil test command)
    470 
    471   In these cases it's possible the temp file location on your system that
    472   gsutil selects by default may not have enough space. If you find that
    473   gsutil runs out of space during one of these operations (e.g., raising
    474   "CommandException: Inadequate temp space available to compress <your file>"
    475   during a gsutil cp -z operation), you can change where it writes these
    476   temp files by setting the TMPDIR environment variable. On Linux and MacOS
    477   you can do this either by running gsutil this way:
    478 
    479     TMPDIR=/some/directory gsutil cp ...
    480 
    481   or by adding this line to your ~/.bashrc file and then restarting the shell
    482   before running gsutil:
    483 
    484     export TMPDIR=/some/directory
    485 
    486   On Windows 7 you can change the TMPDIR environment variable from Start ->
    487   Computer -> System -> Advanced System Settings -> Environment Variables.
    488   You need to reboot after making this change for it to take effect. (Rebooting
    489   is not necessary after running the export command on Linux and MacOS.)
    490 """
    491 
    492 _OPTIONS_TEXT = """
    493 <B>OPTIONS</B>
     494   -a canned_acl  Applies the specified canned ACL to uploaded objects. See
    495                  'gsutil help acls' for further details.
    496 
     497   -A             Copy all source versions from the source bucket or folder.
    498                  If not set, only the live version of each source object is
    499                  copied. Note: this option is only useful when the destination
    500                  bucket has versioning enabled.
    501 
    502   -c             If an error occurs, continue to attempt to copy the remaining
    503                  files. If any copies were unsuccessful, gsutil's exit status
    504                  will be non-zero even if this flag is set. This option is
    505                  implicitly set when running "gsutil -m cp...". Note: -c only
    506                  applies to the actual copying operation. If an error occurs
    507                  while iterating over the files in the local directory (e.g.,
    508                  invalid Unicode file name) gsutil will print an error message
    509                  and abort.
    510 
    511   -D             Copy in "daisy chain" mode, i.e., copying between two buckets
    512                  by hooking a download to an upload, via the machine where
    513                  gsutil is run. By default, data are copied between two buckets
    514                  "in the cloud", i.e., without needing to copy via the machine
    515                  where gsutil runs.
    516 
    517                  By default, a "copy in the cloud" when the source is a
    518                  composite object will retain the composite nature of the
    519                  object. However, Daisy chain mode can be used to change a
    520                  composite object into a non-composite object. For example:
    521 
    522                      gsutil cp -D -p gs://bucket/obj gs://bucket/obj_tmp
    523                      gsutil mv -p gs://bucket/obj_tmp gs://bucket/obj
    524 
    525                  Note: Daisy chain mode is automatically used when copying
    526                  between providers (e.g., to copy data from Google Cloud Storage
    527                  to another provider).
    528 
    529   -e             Exclude symlinks. When specified, symbolic links will not be
    530                  copied.
    531 
    532   -I             Causes gsutil to read the list of files or objects to copy from
    533                  stdin. This allows you to run a program that generates the list
    534                  of files to upload/download.
    535 
    536   -L <file>      Outputs a manifest log file with detailed information about
    537                  each item that was copied. This manifest contains the following
    538                  information for each item:
    539 
    540                  - Source path.
    541                  - Destination path.
    542                  - Source size.
    543                  - Bytes transferred.
    544                  - MD5 hash.
    545                  - UTC date and time transfer was started in ISO 8601 format.
    546                  - UTC date and time transfer was completed in ISO 8601 format.
    547                  - Upload id, if a resumable upload was performed.
    548                  - Final result of the attempted transfer, success or failure.
    549                  - Failure details, if any.
    550 
    551                  If the log file already exists, gsutil will use the file as an
    552                  input to the copy process, and will also append log items to
    553                  the existing file. Files/objects that are marked in the
    554                  existing log file as having been successfully copied (or
    555                  skipped) will be ignored. Files/objects without entries will be
    556                  copied and ones previously marked as unsuccessful will be
    557                  retried. This can be used in conjunction with the -c option to
    558                  build a script that copies a large number of objects reliably,
    559                  using a bash script like the following:
    560 
    561                    until gsutil cp -c -L cp.log -r ./dir gs://bucket; do
    562                      sleep 1
    563                    done
    564 
    565                  The -c option will cause copying to continue after failures
    566                  occur, and the -L option will allow gsutil to pick up where it
    567                  left off without duplicating work. The loop will continue
    568                  running as long as gsutil exits with a non-zero status (such a
    569                  status indicates there was at least one failure during the
    570                  gsutil run).
    571 
    572                  Note: If you're trying to synchronize the contents of a
    573                  directory and a bucket (or two buckets), see
    574                  'gsutil help rsync'.
    575 
    576   -n             No-clobber. When specified, existing files or objects at the
    577                  destination will not be overwritten. Any items that are skipped
    578                  by this option will be reported as being skipped. This option
    579                  will perform an additional GET request to check if an item
    580                  exists before attempting to upload the data. This will save
    581                  retransmitting data, but the additional HTTP requests may make
    582                  small object transfers slower and more expensive.
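
                            For example, to upload a directory while skipping any files
                            that already exist at the destination, you could run:

                              gsutil cp -n -r dir gs://my_bucket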
    583 
    584   -p             Causes ACLs to be preserved when copying in the cloud. Note
    585                  that this option has performance and cost implications when
     586                  using the XML API, as it requires separate HTTP calls for
    587                  interacting with ACLs. The performance issue can be mitigated
    588                  to some degree by using gsutil -m cp to cause parallel copying.
    589                  Also, this option only works if you have OWNER access to all of
    590                  the objects that are copied.
    591 
    592                  You can avoid the additional performance and cost of using
    593                  cp -p if you want all objects in the destination bucket to end
    594                  up with the same ACL by setting a default object ACL on that
     595                  bucket instead of using cp -p. See "gsutil help defacl".
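
                            For example, to make newly written objects in a bucket
                            publicly readable by default (syntax per "gsutil help
                            defacl"), you could run:

                              gsutil defacl set public-read gs://my_bucket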
    596 
    597                  Note that it's not valid to specify both the -a and -p options
    598                  together.
    599 
    600   -R, -r         Causes directories, buckets, and bucket subdirectories to be
    601                  copied recursively. If you neglect to use this option for
    602                  an upload, gsutil will copy any files it finds and skip any
    603                  directories. Similarly, neglecting to specify -r for a download
    604                  will cause gsutil to copy any objects at the current bucket
    605                  directory level, and skip any subdirectories.
    606 
    607   -U             Skip objects with unsupported object types instead of failing.
    608                  Unsupported object types are Amazon S3 Objects in the GLACIER
    609                  storage class.
    610 
    611   -v             Requests that the version-specific URL for each uploaded object
    612                  be printed. Given this URL you can make future upload requests
    613                  that are safe in the face of concurrent updates, because Google
    614                  Cloud Storage will refuse to perform the update if the current
    615                  object version doesn't match the version-specific URL. See
    616                  'gsutil help versions' for more details.
    617 
    618   -z <ext,...>   Applies gzip content-encoding to file uploads with the given
    619                  extensions. This is useful when uploading files with
    620                  compressible content (such as .js, .css, or .html files)
    621                  because it saves network bandwidth and space in Google Cloud
    622                  Storage, which in turn reduces storage costs.
    623 
    624                  When you specify the -z option, the data from your files is
    625                  compressed before it is uploaded, but your actual files are
    626                  left uncompressed on the local disk. The uploaded objects
    627                  retain the Content-Type and name of the original files but are
    628                  given a Content-Encoding header with the value "gzip" to
     629                  indicate that the object data are stored compressed on the
    630                  Google Cloud Storage servers.
    631 
    632                  For example, the following command:
    633 
    634                    gsutil cp -z html -a public-read cattypes.html gs://mycats
    635 
    636                  will do all of the following:
    637 
    638                  - Upload as the object gs://mycats/cattypes.html (cp command)
    639                  - Set the Content-Type to text/html (based on file extension)
    640                  - Compress the data in the file cattypes.html (-z option)
    641                  - Set the Content-Encoding to gzip (-z option)
    642                  - Set the ACL to public-read (-a option)
    643                  - If a user tries to view cattypes.html in a browser, the
    644                    browser will know to uncompress the data based on the
    645                    Content-Encoding header, and to render it as HTML based on
    646                    the Content-Type header.
    647 
    648                  Note that if you download an object with Content-Encoding:gzip
    649                  gsutil will decompress the content before writing the local
    650                  file.
    651 """
    652 
    653 _DETAILED_HELP_TEXT = '\n\n'.join([_SYNOPSIS_TEXT,
    654                                    _DESCRIPTION_TEXT,
    655                                    _NAME_CONSTRUCTION_TEXT,
    656                                    _SUBDIRECTORIES_TEXT,
    657                                    _COPY_IN_CLOUD_TEXT,
    658                                    _CHECKSUM_VALIDATION_TEXT,
    659                                    _RETRY_HANDLING_TEXT,
    660                                    _RESUMABLE_TRANSFERS_TEXT,
    661                                    _STREAMING_TRANSFERS_TEXT,
    662                                    _SLICED_OBJECT_DOWNLOADS_TEXT,
    663                                    _PARALLEL_COMPOSITE_UPLOADS_TEXT,
    664                                    _CHANGING_TEMP_DIRECTORIES_TEXT,
    665                                    _OPTIONS_TEXT])
    666 
    667 
    668 CP_SUB_ARGS = 'a:AcDeIL:MNnprRtUvz:'
    669 
    670 
    671 def _CopyFuncWrapper(cls, args, thread_state=None):
    672   cls.CopyFunc(args, thread_state=thread_state)
    673 
    674 
    675 def _CopyExceptionHandler(cls, e):
    676   """Simple exception handler to allow post-completion status."""
    677   cls.logger.error(str(e))
    678   cls.op_failure_count += 1
    679   cls.logger.debug('\n\nEncountered exception while copying:\n%s\n',
    680                    traceback.format_exc())
    681 
    682 
    683 def _RmExceptionHandler(cls, e):
    684   """Simple exception handler to allow post-completion status."""
    685   cls.logger.error(str(e))
    686 
    687 
    688 class CpCommand(Command):
    689   """Implementation of gsutil cp command.
    690 
    691   Note that CpCommand is run for both gsutil cp and gsutil mv. The latter
    692   happens by MvCommand calling CpCommand and passing the hidden (undocumented)
    693   -M option. This allows the copy and remove needed for each mv to run
    694   together (rather than first running all the cp's and then all the rm's, as
    695   we originally had implemented), which in turn avoids the following problem
    696   with removing the wrong objects: starting with a bucket containing only
    697   the object gs://bucket/obj, say the user does:
    698     gsutil mv gs://bucket/* gs://bucket/d.txt
    699   If we ran all the cp's and then all the rm's and we didn't expand the wildcard
    700   first, the cp command would first copy gs://bucket/obj to gs://bucket/d.txt,
    701   and the rm command would then remove that object. In the implementation
    702   prior to gsutil release 3.12 we avoided this by building a list of objects
    703   to process and then running the copies and then the removes; but building
    704   the list up front limits scalability (compared with the current approach
    705   of processing the bucket listing iterator on the fly).
    706   """
    707 
    708   # Command specification. See base class for documentation.
    709   command_spec = Command.CreateCommandSpec(
    710       'cp',
    711       command_name_aliases=['copy'],
    712       usage_synopsis=_SYNOPSIS,
    713       min_args=1,
    714       max_args=NO_MAX,
    715       # -t is deprecated but leave intact for now to avoid breakage.
    716       supported_sub_args=CP_SUB_ARGS,
    717       file_url_ok=True,
    718       provider_url_ok=False,
    719       urls_start_arg=0,
    720       gs_api_support=[ApiSelector.XML, ApiSelector.JSON],
    721       gs_default_api=ApiSelector.JSON,
    722       supported_private_args=['testcallbackfile='],
    723       argparse_arguments=[
    724           CommandArgument.MakeZeroOrMoreCloudOrFileURLsArgument()
    725       ]
    726   )
    727   # Help specification. See help_provider.py for documentation.
    728   help_spec = Command.HelpSpec(
    729       help_name='cp',
    730       help_name_aliases=['copy'],
    731       help_type='command_help',
    732       help_one_line_summary='Copy files and objects',
    733       help_text=_DETAILED_HELP_TEXT,
    734       subcommand_help_text={},
    735   )
    736 
    737   # pylint: disable=too-many-statements
    738   def CopyFunc(self, name_expansion_result, thread_state=None):
    739     """Worker function for performing the actual copy (and rm, for mv)."""
    740     gsutil_api = GetCloudApiInstance(self, thread_state=thread_state)
    741 
    742     copy_helper_opts = copy_helper.GetCopyHelperOpts()
    743     if copy_helper_opts.perform_mv:
    744       cmd_name = 'mv'
    745     else:
    746       cmd_name = self.command_name
    747     src_url = name_expansion_result.source_storage_url
    748     exp_src_url = name_expansion_result.expanded_storage_url
    749     src_url_names_container = name_expansion_result.names_container
    750     have_multiple_srcs = name_expansion_result.is_multi_source_request
    751 
    752     if src_url.IsCloudUrl() and src_url.IsProvider():
    753       raise CommandException(
    754           'The %s command does not allow provider-only source URLs (%s)' %
    755           (cmd_name, src_url))
    756     if have_multiple_srcs:
    757       copy_helper.InsistDstUrlNamesContainer(
    758           self.exp_dst_url, self.have_existing_dst_container, cmd_name)
    759 
    760     # Various GUI tools (like the GCS web console) create placeholder objects
    761     # ending with '/' when the user creates an empty directory. Normally these
    762     # tools should delete those placeholders once objects have been written
    763     # "under" the directory, but sometimes the placeholders are left around. We
    764     # need to filter them out here, otherwise if the user tries to rsync from
    765     # GCS to a local directory it will result in a directory/file conflict
    766     # (e.g., trying to download an object called "mydata/" where the local
    767     # directory "mydata" exists).
    768     if IsCloudSubdirPlaceholder(exp_src_url):
    769       # We used to output the message 'Skipping cloud sub-directory placeholder
    770       # object...' but we no longer do so because it caused customer confusion.
    771       return
    772 
    773     if copy_helper_opts.use_manifest and self.manifest.WasSuccessful(
    774         exp_src_url.url_string):
    775       return
    776 
    777     if copy_helper_opts.perform_mv:
    778       if name_expansion_result.names_container:
    779         # Use recursion_requested when performing name expansion for the
    780         # directory mv case so we can determine if any of the source URLs are
    781         # directories (and then use cp -r and rm -r to perform the move, to
    782         # match the behavior of Linux mv (which when moving a directory moves
     783         # all the contained files)).
    784         self.recursion_requested = True
    785         # Disallow wildcard src URLs when moving directories, as supporting it
    786         # would make the name transformation too complex and would also be
    787         # dangerous (e.g., someone could accidentally move many objects to the
    788         # wrong name, or accidentally overwrite many objects).
    789         if ContainsWildcard(src_url.url_string):
    790           raise CommandException('The mv command disallows naming source '
    791                                  'directories using wildcards')
    792 
    793     if (self.exp_dst_url.IsFileUrl()
    794         and not os.path.exists(self.exp_dst_url.object_name)
    795         and have_multiple_srcs):
    796       os.makedirs(self.exp_dst_url.object_name)
    797 
    798     dst_url = copy_helper.ConstructDstUrl(
    799         src_url, exp_src_url, src_url_names_container, have_multiple_srcs,
    800         self.exp_dst_url, self.have_existing_dst_container,
    801         self.recursion_requested)
    802     dst_url = copy_helper.FixWindowsNaming(src_url, dst_url)
    803 
    804     copy_helper.CheckForDirFileConflict(exp_src_url, dst_url)
    805     if copy_helper.SrcDstSame(exp_src_url, dst_url):
    806       raise CommandException('%s: "%s" and "%s" are the same file - '
    807                              'abort.' % (cmd_name, exp_src_url, dst_url))
    808 
    809     if dst_url.IsCloudUrl() and dst_url.HasGeneration():
    810       raise CommandException('%s: a version-specific URL\n(%s)\ncannot be '
    811                              'the destination for gsutil cp - abort.'
    812                              % (cmd_name, dst_url))
    813 
    814     elapsed_time = bytes_transferred = 0
    815     try:
    816       if copy_helper_opts.use_manifest:
    817         self.manifest.Initialize(
    818             exp_src_url.url_string, dst_url.url_string)
    819       (elapsed_time, bytes_transferred, result_url, md5) = (
    820           copy_helper.PerformCopy(
    821               self.logger, exp_src_url, dst_url, gsutil_api,
    822               self, _CopyExceptionHandler, allow_splitting=True,
    823               headers=self.headers, manifest=self.manifest,
    824               gzip_exts=self.gzip_exts))
    825       if copy_helper_opts.use_manifest:
    826         if md5:
    827           self.manifest.Set(exp_src_url.url_string, 'md5', md5)
    828         self.manifest.SetResult(
    829             exp_src_url.url_string, bytes_transferred, 'OK')
    830       if copy_helper_opts.print_ver:
    831         # Some cases don't return a version-specific URL (e.g., if destination
    832         # is a file).
    833         self.logger.info('Created: %s', result_url)
    834     except ItemExistsError:
    835       message = 'Skipping existing item: %s' % dst_url
    836       self.logger.info(message)
    837       if copy_helper_opts.use_manifest:
    838         self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message)
    839     except SkipUnsupportedObjectError, e:
    840       message = ('Skipping item %s with unsupported object type %s' %
    841                  (exp_src_url.url_string, e.unsupported_type))
    842       self.logger.info(message)
    843       if copy_helper_opts.use_manifest:
    844         self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message)
    845     except copy_helper.FileConcurrencySkipError, e:
    846       self.logger.warn('Skipping copy of source URL %s because destination URL '
    847                        '%s is already being copied by another gsutil process '
    848                        'or thread (did you specify the same source URL twice?) '
    849                        % (src_url, dst_url))
    850     except Exception, e:
    851       if (copy_helper_opts.no_clobber and
    852           copy_helper.IsNoClobberServerException(e)):
    853         message = 'Rejected (noclobber): %s' % dst_url
    854         self.logger.info(message)
    855         if copy_helper_opts.use_manifest:
    856           self.manifest.SetResult(
    857               exp_src_url.url_string, 0, 'skip', message)
    858       elif self.continue_on_error:
    859         message = 'Error copying %s: %s' % (src_url, str(e))
    860         self.op_failure_count += 1
    861         self.logger.error(message)
    862         if copy_helper_opts.use_manifest:
    863           self.manifest.SetResult(
    864               exp_src_url.url_string, 0, 'error',
    865               RemoveCRLFFromString(message))
    866       else:
    867         if copy_helper_opts.use_manifest:
    868           self.manifest.SetResult(
    869               exp_src_url.url_string, 0, 'error', str(e))
    870         raise
    871     else:
    872       if copy_helper_opts.perform_mv:
    873         self.logger.info('Removing %s...', exp_src_url)
    874         if exp_src_url.IsCloudUrl():
    875           gsutil_api.DeleteObject(exp_src_url.bucket_name,
    876                                   exp_src_url.object_name,
    877                                   generation=exp_src_url.generation,
    878                                   provider=exp_src_url.scheme)
    879         else:
    880           os.unlink(exp_src_url.object_name)
    881 
    882     with self.stats_lock:
    883       self.total_elapsed_time += elapsed_time
    884       self.total_bytes_transferred += bytes_transferred
    885 
    886   # Command entry point.
    887   def RunCommand(self):
    888     copy_helper_opts = self._ParseOpts()
    889 
    890     self.total_elapsed_time = self.total_bytes_transferred = 0
    891     if self.args[-1] == '-' or self.args[-1] == 'file://-':
    892       return CatHelper(self).CatUrlStrings(self.args[:-1])
    893 
    894     if copy_helper_opts.read_args_from_stdin:
    895       if len(self.args) != 1:
    896         raise CommandException('Source URLs cannot be specified with -I option')
    897       url_strs = StdinIterator()
    898     else:
    899       if len(self.args) < 2:
    900         raise CommandException('Wrong number of arguments for "cp" command.')
    901       url_strs = self.args[:-1]
    902 
    903     (self.exp_dst_url, self.have_existing_dst_container) = (
    904         copy_helper.ExpandUrlToSingleBlr(self.args[-1], self.gsutil_api,
    905                                          self.debug, self.project_id))
    906 
    907     name_expansion_iterator = NameExpansionIterator(
    908         self.command_name, self.debug,
    909         self.logger, self.gsutil_api, url_strs,
    910         self.recursion_requested or copy_helper_opts.perform_mv,
    911         project_id=self.project_id, all_versions=self.all_versions,
    912         continue_on_error=self.continue_on_error or self.parallel_operations)
    913 
    914     # Use a lock to ensure accurate statistics in the face of
    915     # multi-threading/multi-processing.
    916     self.stats_lock = CreateLock()
    917 
    918     # Tracks if any copies failed.
    919     self.op_failure_count = 0
    920 
    921     # Start the clock.
    922     start_time = time.time()
    923 
    924     # Tuple of attributes to share/manage across multiple processes in
    925     # parallel (-m) mode.
    926     shared_attrs = ('op_failure_count', 'total_bytes_transferred')
    927 
    928     # Perform copy requests in parallel (-m) mode, if requested, using
    929     # configured number of parallel processes and threads. Otherwise,
    930     # perform requests with sequential function calls in current process.
    931     self.Apply(_CopyFuncWrapper, name_expansion_iterator,
    932                _CopyExceptionHandler, shared_attrs,
    933                fail_on_error=(not self.continue_on_error))
    934     self.logger.debug(
    935         'total_bytes_transferred: %d', self.total_bytes_transferred)
    936 
    937     end_time = time.time()
    938     self.total_elapsed_time = end_time - start_time
    939 
    940     # Sometimes, particularly when running unit tests, the total elapsed time
     941     # is really small. On Windows, the timer resolution is too coarse and
    942     # causes total_elapsed_time to be zero.
    943     try:
    944       float(self.total_bytes_transferred) / float(self.total_elapsed_time)
    945     except ZeroDivisionError:
    946       self.total_elapsed_time = 0.01
    947 
    948     self.total_bytes_per_second = (float(self.total_bytes_transferred) /
    949                                    float(self.total_elapsed_time))
    950 
    951     if self.debug == 3:
    952       # Note that this only counts the actual GET and PUT bytes for the copy
    953       # - not any transfers for doing wildcard expansion, the initial
    954       # HEAD/GET request performed to get the object metadata, etc.
    955       if self.total_bytes_transferred != 0:
    956         self.logger.info(
    957             'Total bytes copied=%d, total elapsed time=%5.3f secs (%sps)',
    958             self.total_bytes_transferred, self.total_elapsed_time,
    959             MakeHumanReadable(self.total_bytes_per_second))
    960     if self.op_failure_count:
    961       plural_str = 's' if self.op_failure_count > 1 else ''
    962       raise CommandException('%d file%s/object%s could not be transferred.' % (
    963           self.op_failure_count, plural_str, plural_str))
    964 
    965     return 0
    966 
    967   def _ParseOpts(self):
    968     perform_mv = False
    969     # exclude_symlinks is handled by Command parent class, so save in Command
    970     # state rather than CopyHelperOpts.
    971     self.exclude_symlinks = False
    972     no_clobber = False
    973     # continue_on_error is handled by Command parent class, so save in Command
    974     # state rather than CopyHelperOpts.
    975     self.continue_on_error = False
    976     daisy_chain = False
    977     read_args_from_stdin = False
    978     print_ver = False
    979     use_manifest = False
    980     preserve_acl = False
    981     canned_acl = None
    982     # canned_acl is handled by a helper function in parent
    983     # Command class, so save in Command state rather than CopyHelperOpts.
    984     self.canned = None
    985 
    986     self.all_versions = False
    987 
    988     self.skip_unsupported_objects = False
    989 
    990     # Files matching these extensions should be gzipped before uploading.
    991     self.gzip_exts = []
    992 
    993     test_callback_file = None
    994 
    995     # self.recursion_requested initialized in command.py (so can be checked
    996     # in parent class for all commands).
    997     self.manifest = None
    998     if self.sub_opts:
    999       for o, a in self.sub_opts:
   1000         if o == '-a':
   1001           canned_acl = a
   1002           self.canned = True
   1003         if o == '-A':
   1004           self.all_versions = True
   1005         if o == '-c':
   1006           self.continue_on_error = True
   1007         elif o == '-D':
   1008           daisy_chain = True
   1009         elif o == '-e':
   1010           self.exclude_symlinks = True
   1011         elif o == '--testcallbackfile':
   1012           # File path of a pickled class that implements ProgressCallback.call.
   1013           # Used for testing transfer interruptions and resumes.
   1014           test_callback_file = a
   1015         elif o == '-I':
   1016           read_args_from_stdin = True
   1017         elif o == '-L':
   1018           use_manifest = True
   1019           self.manifest = Manifest(a)
   1020         elif o == '-M':
   1021           # Note that we signal to the cp command to perform a move (copy
   1022           # followed by remove) and use directory-move naming rules by passing
   1023           # the undocumented (for internal use) -M option when running the cp
   1024           # command from mv.py.
   1025           perform_mv = True
   1026         elif o == '-n':
   1027           no_clobber = True
   1028         elif o == '-p':
   1029           preserve_acl = True
   1030         elif o == '-r' or o == '-R':
   1031           self.recursion_requested = True
   1032         elif o == '-U':
   1033           self.skip_unsupported_objects = True
   1034         elif o == '-v':
   1035           print_ver = True
   1036         elif o == '-z':
   1037           self.gzip_exts = [x.strip() for x in a.split(',')]
   1038     if preserve_acl and canned_acl:
   1039       raise CommandException(
   1040           'Specifying both the -p and -a options together is invalid.')
   1041     if self.all_versions and self.parallel_operations:
   1042       raise CommandException(
   1043           'The gsutil -m option is not supported with the cp -A flag, to '
   1044           'ensure that object version ordering is preserved. Please re-run '
   1045           'the command without the -m option.')
   1046     return CreateCopyHelperOpts(
   1047         perform_mv=perform_mv,
   1048         no_clobber=no_clobber,
   1049         daisy_chain=daisy_chain,
   1050         read_args_from_stdin=read_args_from_stdin,
   1051         print_ver=print_ver,
   1052         use_manifest=use_manifest,
   1053         preserve_acl=preserve_acl,
   1054         canned_acl=canned_acl,
   1055         skip_unsupported_objects=self.skip_unsupported_objects,
   1056         test_callback_file=test_callback_file)
   1057