# -*- coding: utf-8 -*-
# Copyright 2011 Google Inc. All Rights Reserved.
# Copyright 2011, Nexenta Systems Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of Unix-like cp command for cloud storage providers."""

from __future__ import absolute_import

import os
import time
import traceback

from gslib import copy_helper
from gslib.cat_helper import CatHelper
from gslib.command import Command
from gslib.command_argument import CommandArgument
from gslib.commands.compose import MAX_COMPONENT_COUNT
from gslib.copy_helper import CreateCopyHelperOpts
from gslib.copy_helper import ItemExistsError
from gslib.copy_helper import Manifest
from gslib.copy_helper import PARALLEL_UPLOAD_TEMP_NAMESPACE
from gslib.copy_helper import SkipUnsupportedObjectError
from gslib.cs_api_map import ApiSelector
from gslib.exception import CommandException
from gslib.name_expansion import NameExpansionIterator
from gslib.storage_url import ContainsWildcard
from gslib.util import CreateLock
from gslib.util import GetCloudApiInstance
from gslib.util import IsCloudSubdirPlaceholder
from gslib.util import MakeHumanReadable
from gslib.util import NO_MAX
from gslib.util import RemoveCRLFFromString
from gslib.util import StdinIterator

_SYNOPSIS = """
  gsutil cp [OPTION]... src_url dst_url
  gsutil cp [OPTION]... src_url... dst_url
  gsutil cp [OPTION]... -I dst_url
"""

_SYNOPSIS_TEXT = """
<B>SYNOPSIS</B>
""" + _SYNOPSIS

_DESCRIPTION_TEXT = """
<B>DESCRIPTION</B>
  The gsutil cp command allows you to copy data between your local file
  system and the cloud, copy data within the cloud, and copy data between
  cloud storage providers. For example, to copy all text files from the
  local directory to a bucket you could do:

    gsutil cp *.txt gs://my_bucket

  Similarly, you can download text files from a bucket by doing:

    gsutil cp gs://my_bucket/*.txt .

  If you want to copy an entire directory tree you need to use the -r option:

    gsutil cp -r dir gs://my_bucket

  If you have a large number of files to upload you might want to use the
  gsutil -m option, to perform a parallel (multi-threaded/multi-processing)
  copy:

    gsutil -m cp -r dir gs://my_bucket

  You can pass a list of URLs (one per line) to copy on stdin instead of as
  command line arguments by using the -I option. This allows you to use
  gsutil in a pipeline to upload or download files/objects as generated by a
  program, such as:

    some_program | gsutil -m cp -I gs://my_bucket

  or:

    some_program | gsutil -m cp -I ./download_dir

  The contents of stdin can name files, cloud URLs, and wildcards of files
  and cloud URLs.
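
  For example, a single stdin list can mix all three kinds of names (the
  file names here are hypothetical):

    report.txt
    logs/*.csv
    gs://other_bucket/archive.tar.gz

  Assuming those lines are stored in a file named filelist.txt, you could
  copy everything it names with:

    cat filelist.txt | gsutil -m cp -I gs://my_bucket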
"""

_NAME_CONSTRUCTION_TEXT = """
<B>HOW NAMES ARE CONSTRUCTED</B>
  The gsutil cp command strives to name objects in a way that is consistent
  with how Linux cp works, which causes names to be constructed in varying
  ways depending on whether you're performing a recursive directory copy or
  copying individually named objects; and whether you're copying to an
  existing or non-existent directory.

  When performing recursive directory copies, object names are constructed
  that mirror the source directory structure starting at the point of
  recursive processing. For example, the command:

    gsutil cp -r dir1/dir2 gs://my_bucket

  will create objects named like gs://my_bucket/dir2/a/b/c, assuming
  dir1/dir2 contains the file a/b/c.

  In contrast, copying individually named files will result in objects named
  by the final path component of the source files. For example, the command:

    gsutil cp dir1/dir2/** gs://my_bucket

  will create objects named like gs://my_bucket/c.

  The same rules apply for downloads: recursive copies of buckets and
  bucket subdirectories produce a mirrored filename structure, while copying
  individually (or wildcard) named objects produces flatly named files.

  Note that in the above example the '**' wildcard matches all names
  anywhere under dir1/dir2. The wildcard '*' will match names just one level
  deep. For more details see 'gsutil help wildcards'.

  There's an additional wrinkle when working with subdirectories: the
  resulting names depend on whether the destination subdirectory exists. For
  example, if gs://my_bucket/subdir exists as a subdirectory, the command:

    gsutil cp -r dir1/dir2 gs://my_bucket/subdir

  will create objects named like gs://my_bucket/subdir/dir2/a/b/c. In
  contrast, if gs://my_bucket/subdir does not exist, this same gsutil cp
  command will create objects named like gs://my_bucket/subdir/a/b/c.

  Note: If you use the
  `Google Developers Console <https://console.developers.google.com>`_
  to create folders, it does so by creating a "placeholder" object that ends
  with a "/" character. gsutil skips these objects when downloading from the
  cloud to the local file system, because attempting to create a file that
  ends with a "/" is not allowed on Linux and MacOS. Because of this, it is
  recommended that you not create objects that end with "/" (unless you don't
  need to be able to download such objects using gsutil).
"""

_SUBDIRECTORIES_TEXT = """
<B>COPYING TO/FROM SUBDIRECTORIES; DISTRIBUTING TRANSFERS ACROSS MACHINES</B>
  You can use gsutil to copy to and from subdirectories by using a command
  like:

    gsutil cp -r dir gs://my_bucket/data

  This will cause dir and all of its files and nested subdirectories to be
  copied under the specified destination, resulting in objects with names
  like gs://my_bucket/data/dir/a/b/c. Similarly you can download from bucket
  subdirectories by using a command like:

    gsutil cp -r gs://my_bucket/data dir

  This will cause everything nested under gs://my_bucket/data to be
  downloaded into dir, resulting in files with names like dir/data/a/b/c.

  Copying subdirectories is useful if you want to add data to an existing
  bucket directory structure over time.
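
  For example, a nightly job could add each day's output as a new
  subdirectory under the same destination (the local directory names here
  are hypothetical):

    gsutil cp -r results/2014-01-15 gs://my_bucket/data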

  It's also useful if you want to parallelize uploads and downloads across
  multiple machines (often reducing overall transfer time compared with
  simply running gsutil -m cp on one machine). For example, if your bucket
  contains this structure:

    gs://my_bucket/data/result_set_01/
    gs://my_bucket/data/result_set_02/
    ...
    gs://my_bucket/data/result_set_99/

  you could perform concurrent downloads across 3 machines by running these
  commands on each machine, respectively:

    gsutil -m cp -r gs://my_bucket/data/result_set_[0-3]* dir
    gsutil -m cp -r gs://my_bucket/data/result_set_[4-6]* dir
    gsutil -m cp -r gs://my_bucket/data/result_set_[7-9]* dir

  Note that dir could be a local directory on each machine, or it could
  be a directory mounted off a shared file server; whether the latter
  performs acceptably may depend on a number of things, so we recommend
  you experiment and find out what works best for you.
"""

_COPY_IN_CLOUD_TEXT = """
<B>COPYING IN THE CLOUD AND METADATA PRESERVATION</B>
  If both the source and destination URL are cloud URLs from the same
  provider, gsutil copies data "in the cloud" (i.e., without downloading
  to and uploading from the machine where you run gsutil). In addition to
  the performance and cost advantages of doing this, copying in the cloud
  preserves metadata (like Content-Type and Cache-Control). In contrast,
  when you download data from the cloud it ends up in a file, which has
  no associated metadata. Thus, unless you have some way to hold on to
  or re-create that metadata, downloading to a file will not retain the
  metadata.

  Copies spanning locations and/or storage classes cause data to be rewritten
  in the cloud, which may take some time. Such operations can be resumed with
  the same command if they are interrupted, so long as the command parameters
  are identical.

  Note that by default, the gsutil cp command does not copy the object
  ACL to the new object, and instead will use the default bucket ACL (see
  "gsutil help defacl"). You can override this behavior with the -p
  option (see OPTIONS below).

  One additional note about copying in the cloud: If the destination bucket
  has versioning enabled, gsutil cp will by default copy only live versions
  of the source object(s). For example:

    gsutil cp gs://bucket1/obj gs://bucket2

  will cause only the single live version of gs://bucket1/obj to be copied
  to gs://bucket2, even if there are archived versions of gs://bucket1/obj.
  To also copy archived versions, use the -A flag:

    gsutil cp -A gs://bucket1/obj gs://bucket2

  The gsutil -m flag is disallowed when using the cp -A flag, to ensure that
  version ordering is preserved.
"""

_CHECKSUM_VALIDATION_TEXT = """
<B>CHECKSUM VALIDATION</B>
  At the end of every upload or download the gsutil cp command validates that
  the checksum it computes for the source file/object matches the checksum
  the service computes. If the checksums do not match, gsutil will delete the
  corrupted object and print a warning message. This very rarely happens, but
  if it does, please contact gs-team@google.com.

  If you know the MD5 of a file before uploading you can specify it in the
  Content-MD5 header, which will cause the cloud storage service to reject
  the upload if the MD5 doesn't match the value computed by the service.
  For example:

    % gsutil hash obj
    Hashing     obj:
    Hashes [base64] for obj:
            Hash (crc32c):          lIMoIw==
            Hash (md5):             VgyllJgiiaRAbyUUIqDMmw==

    % gsutil -h Content-MD5:VgyllJgiiaRAbyUUIqDMmw== cp obj gs://your-bucket/obj
    Copying file://obj [Content-Type=text/plain]...
    Uploading   gs://your-bucket/obj:               182 b/182 B

  If the checksum didn't match, the service would instead reject the upload
  and gsutil would print a message like:

    BadRequestException: 400 Provided MD5 hash "VgyllJgiiaRAbyUUIqDMmw=="
    doesn't match calculated MD5 hash "7gyllJgiiaRAbyUUIqDMmw==".

  Even if you don't do this, gsutil will delete the object if the computed
  checksum doesn't match, but specifying the Content-MD5 header has three
  advantages:

  1. It prevents the corrupted object from becoming visible at all, whereas
     otherwise it would be visible for 1-3 seconds before gsutil deletes it.

  2. It will definitively prevent the corrupted object from being left in
     the cloud, whereas the gsutil approach of deleting after the upload
     completes could fail if (for example) the gsutil process gets ^C'd
     between upload and deletion request.

  3. It supports a customer-to-service integrity check handoff. For example,
     if you have a content production pipeline that generates data to be
     uploaded to the cloud along with checksums of that data, specifying the
     MD5 computed by your content pipeline when you run gsutil cp will ensure
     that the checksums match all the way through the process (e.g.,
     detecting if data gets corrupted on your local disk between the time it
     was written by your content pipeline and the time it was uploaded to
     GCS).

  Note: The Content-MD5 header is ignored for composite objects, because such
  objects only have a CRC32C checksum.
"""

_RETRY_HANDLING_TEXT = """
<B>RETRY HANDLING</B>
  The cp command will retry when failures occur, but if enough failures
  happen during a particular copy or delete operation the command will skip
  that object and move on. At the end of the copy run, if any failures were
  not successfully retried, the cp command will report the count of failures
  and exit with non-zero status.

  Note that there are cases where retrying will never succeed, such as if you
  don't have write permission to the destination bucket or if the destination
  path for some objects is longer than the maximum allowed length.

  For more details about gsutil's retry handling, please see
  "gsutil help retries".
"""

_RESUMABLE_TRANSFERS_TEXT = """
<B>RESUMABLE TRANSFERS</B>
  gsutil automatically uses the Google Cloud Storage resumable upload feature
  whenever you use the cp command to upload an object that is larger than 2
  MiB. You do not need to specify any special command line options to make
  this happen. If your upload is interrupted you can restart the upload by
  running the same cp command that you ran to start the upload. Until the
  upload has completed successfully, it will not be visible at the
  destination object and will not replace any existing object the upload is
  intended to overwrite. (However, see the section on PARALLEL COMPOSITE
  UPLOADS, which may leave temporary component objects in place during the
  upload process.)
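
  For example, if the following upload of a large file is interrupted
  partway through (the file name here is hypothetical):

    gsutil cp big-video.mov gs://my_bucket

  re-running the identical command later resumes the transfer from where it
  stopped rather than starting over.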

  Similarly, gsutil automatically performs resumable downloads (using HTTP
  standard Range GET operations) whenever you use the cp command, unless the
  destination is a stream or null. While a resumable download is in progress,
  a partially downloaded temporary file will be visible in the destination
  directory. Upon completion, the original file is deleted and overwritten
  with the downloaded contents.

  Resumable uploads and downloads store some state information in a file in
  ~/.gsutil named by the destination object or file. If you attempt to
  resume a transfer from a machine with a different directory, the transfer
  will start over from scratch.

  See also "gsutil help prod" for details on using resumable transfers
  in production.
"""

_STREAMING_TRANSFERS_TEXT = """
<B>STREAMING TRANSFERS</B>
  Use '-' in place of src_url or dst_url to perform a streaming
  transfer. For example:

    long_running_computation | gsutil cp - gs://my_bucket/obj

  Streaming uploads using the JSON API (see "gsutil help apis") are buffered
  in memory and can retry in the event of network flakiness or service
  errors.

  Streaming transfers (other than uploads using the JSON API) do not support
  resumable uploads/downloads. If you have a large amount of data to upload
  (say, more than 100 MiB) it is recommended to write the data to a local
  file and then copy that file to the cloud rather than streaming it (and
  similarly for large downloads).

  WARNING: When performing streaming transfers gsutil does not compute a
  checksum of the uploaded or downloaded data. Therefore, we recommend that
  users either perform their own validation of the data or use non-streaming
  transfers (which perform integrity checking automatically).
"""

_SLICED_OBJECT_DOWNLOADS_TEXT = """
<B>SLICED OBJECT DOWNLOADS</B>
  gsutil automatically uses HTTP Range GET requests to perform "sliced"
  downloads in parallel for downloads of large objects. This means that, if
  enabled, disk space for the temporary download destination file will be
  pre-allocated and byte ranges (slices) within the file will be downloaded
  in parallel. Once all slices have completed downloading, the temporary file
  will be renamed to the destination file. No additional local disk space is
  required for this operation.

  This feature is only available for Google Cloud Storage objects because it
  requires a fast composable checksum that can be used to verify the data
  integrity of the slices. Thus, using sliced object downloads also requires
  a compiled crcmod (see "gsutil help crcmod") on the machine performing the
  download. If compiled crcmod is not available, normal download will instead
  be used.

  Note: Since sliced object downloads cause multiple writes to occur at
  various locations on disk, this can degrade performance for disks with slow
  seek times, especially for large numbers of slices. While the default
  number of slices is small to avoid this, sliced object download can be
  completely disabled by setting the "sliced_object_download_threshold"
  variable in the .boto config file to 0.
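
  For example, to disable sliced downloads you could add the following to
  your .boto config file (a sketch; the [GSUtil] section is where gsutil's
  other performance settings live):

    [GSUtil]
    sliced_object_download_threshold = 0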
"""

_PARALLEL_COMPOSITE_UPLOADS_TEXT = """
<B>PARALLEL COMPOSITE UPLOADS</B>
  gsutil can automatically use
  `object composition <https://developers.google.com/storage/docs/composite-objects>`_
  to perform uploads in parallel for large, local files being uploaded to
  Google Cloud Storage. This means that, if enabled (see next paragraph), a
  large file will be split into component pieces that will be uploaded in
  parallel. Those components will then be composed in the cloud, and the
  temporary components in the cloud will be deleted after successful
  composition. No additional local disk space is required for this operation.

  Using parallel composite uploads presents a tradeoff between upload
  performance and download configuration: If you enable parallel composite
  uploads your uploads will run faster, but someone will need to install a
  compiled crcmod (see "gsutil help crcmod") on every machine where objects
  are downloaded by gsutil or other Python applications. For some
  distributions this is easy (e.g., it comes pre-installed on MacOS), but in
  some cases users have found it difficult. Because of this, parallel
  composite uploads are at present disabled by default. Google is actively
  working with a number of Linux distributions to get crcmod included with
  the stock distribution. Once that is done we will re-enable parallel
  composite uploads by default in gsutil.

  Parallel composite uploads should not be used with NEARLINE storage
  class buckets, as doing this would incur an early deletion charge for each
  component object.

  To try parallel composite uploads you can run the command:

    gsutil -o GSUtil:parallel_composite_upload_threshold=150M cp bigfile gs://your-bucket

  where bigfile is larger than 150 MiB. When you do this, notice that the
  upload progress indicator continuously updates for several different
  uploads at once (corresponding to each of the sections of the file being
  uploaded in parallel), until the parallel upload completes. If you then
  want to enable parallel composite uploads for all of your future uploads
  (notwithstanding the caveats mentioned earlier), you can uncomment and set
  the "parallel_composite_upload_threshold" config value in your .boto
  configuration file to this value.
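
  For example, the corresponding lines in the .boto configuration file would
  look like this (the GSUtil section name matches the -o example above):

    [GSUtil]
    parallel_composite_upload_threshold = 150M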

  Note that the crcmod problem only impacts downloads via Python
  applications (such as gsutil). If all users who need to download the data
  using gsutil or other Python applications can install crcmod, it makes
  sense to enable parallel composite uploads (see above). For example, if you
  use gsutil to upload video assets and those assets will only ever be served
  via a Java application (there are efficient CRC32C implementations
  available in Java), it would make sense to enable parallel composite
  uploads on your machine.

  If a parallel composite upload fails prior to composition, re-running the
  gsutil command will take advantage of resumable uploads for those
  components that failed, and the component objects will be deleted after the
  first successful attempt. Any temporary objects that were uploaded
  successfully before gsutil failed will still exist until the upload is
  completed successfully. The temporary objects will be named in the
  following fashion:

    <random ID>%s<hash>

  where <random ID> is some numerical value, and <hash> is an MD5 hash (not
  related to the hash of the contents of the file or object).

  To avoid leaving temporary objects around, you should make sure to check
  the exit status from the gsutil command. This can be done in a bash script,
  for example, by doing:

    gsutil cp ./local-file gs://your-bucket/your-object
    if [ "$?" -ne "0" ] ; then
      << Code that handles failures >>
    fi

  Or, for copying a directory, use this instead:

    gsutil cp -c -L cp.log -r ./dir gs://bucket
    if [ "$?" -ne "0" ] ; then
      << Code that handles failures >>
    fi

  One important caveat is that files uploaded in this fashion are still
  subject to the maximum number of components limit. For example, if you
  upload a large file that gets split into %d components, and try to compose
  it with another object with %d components, the operation will fail because
  it exceeds the %d component limit. If you wish to compose an object later
  and the component limit is a concern, it is recommended that you disable
  parallel composite uploads for that transfer.

  Also note that an object uploaded using this feature will have a CRC32C
  hash, but it will not have an MD5 hash (and because of that, requires users
  who download the object to have crcmod installed, as noted earlier). For
  details see 'gsutil help crc32c'.

  Note that this feature can be completely disabled by setting the
  "parallel_composite_upload_threshold" variable in the .boto config file to
  0.
""" % (PARALLEL_UPLOAD_TEMP_NAMESPACE, 10, MAX_COMPONENT_COUNT - 9,
       MAX_COMPONENT_COUNT)


_CHANGING_TEMP_DIRECTORIES_TEXT = """
<B>CHANGING TEMP DIRECTORIES</B>
  gsutil writes data to a temporary directory in several cases:

  - when compressing data to be uploaded (see the -z option)
  - when decompressing data being downloaded (when the data has
    Content-Encoding:gzip, e.g., as happens when uploaded using gsutil cp -z)
  - when running integration tests (using the gsutil test command)

  In these cases it's possible the temp file location on your system that
  gsutil selects by default may not have enough space. If you find that
  gsutil runs out of space during one of these operations (e.g., raising
  "CommandException: Inadequate temp space available to compress <your file>"
  during a gsutil cp -z operation), you can change where it writes these
  temp files by setting the TMPDIR environment variable. On Linux and MacOS
  you can do this either by running gsutil this way:

    TMPDIR=/some/directory gsutil cp ...

  or by adding this line to your ~/.bashrc file and then restarting the shell
  before running gsutil:

    export TMPDIR=/some/directory

  On Windows 7 you can change the TMPDIR environment variable from Start ->
  Computer -> System -> Advanced System Settings -> Environment Variables.
  You need to reboot after making this change for it to take effect.
  (Rebooting is not necessary after running the export command on Linux and
  MacOS.)
"""

_OPTIONS_TEXT = """
<B>OPTIONS</B>
  -a canned_acl  Sets the named canned ACL on uploaded objects. See
                 'gsutil help acls' for further details.

  -A             Copy all source versions from source buckets/folders.
                 If not set, only the live version of each source object is
                 copied. Note: this option is only useful when the
                 destination bucket has versioning enabled.

  -c             If an error occurs, continue to attempt to copy the
                 remaining files. If any copies were unsuccessful, gsutil's
                 exit status will be non-zero even if this flag is set. This
                 option is implicitly set when running "gsutil -m cp...".
                 Note: -c only applies to the actual copying operation. If an
                 error occurs while iterating over the files in the local
                 directory (e.g., an invalid Unicode file name) gsutil will
                 print an error message and abort.

  -D             Copy in "daisy chain" mode, i.e., copying between two
                 buckets by hooking a download to an upload, via the machine
                 where gsutil is run. By default, data are copied between two
                 buckets "in the cloud", i.e., without needing to copy via
                 the machine where gsutil runs.

                 By default, a "copy in the cloud" when the source is a
                 composite object will retain the composite nature of the
                 object. However, daisy chain mode can be used to change a
                 composite object into a non-composite object. For example:

                   gsutil cp -D -p gs://bucket/obj gs://bucket/obj_tmp
                   gsutil mv -p gs://bucket/obj_tmp gs://bucket/obj

                 Note: Daisy chain mode is automatically used when copying
                 between providers (e.g., to copy data from Google Cloud
                 Storage to another provider).

  -e             Exclude symlinks. When specified, symbolic links will not be
                 copied.

  -I             Causes gsutil to read the list of files or objects to copy
                 from stdin. This allows you to run a program that generates
                 the list of files to upload/download.

  -L <file>      Outputs a manifest log file with detailed information about
                 each item that was copied. This manifest contains the
                 following information for each item:

                 - Source path.
                 - Destination path.
                 - Source size.
                 - Bytes transferred.
                 - MD5 hash.
                 - UTC date and time transfer was started in ISO 8601 format.
                 - UTC date and time transfer was completed in ISO 8601
                   format.
                 - Upload id, if a resumable upload was performed.
                 - Final result of the attempted transfer, success or
                   failure.
                 - Failure details, if any.

                 If the log file already exists, gsutil will use the file as
                 an input to the copy process, and will also append log items
                 to the existing file. Files/objects that are marked in the
                 existing log file as having been successfully copied (or
                 skipped) will be ignored. Files/objects without entries will
                 be copied and ones previously marked as unsuccessful will be
                 retried. This can be used in conjunction with the -c option
                 to build a script that copies a large number of objects
                 reliably, using a bash script like the following:

                   until gsutil cp -c -L cp.log -r ./dir gs://bucket; do
                     sleep 1
                   done

                 The -c option will cause copying to continue after failures
                 occur, and the -L option will allow gsutil to pick up where
                 it left off without duplicating work. The loop will continue
                 running as long as gsutil exits with a non-zero status (such
                 a status indicates there was at least one failure during the
                 gsutil run).

                 Note: If you're trying to synchronize the contents of a
                 directory and a bucket (or two buckets), see
                 'gsutil help rsync'.

  -n             No-clobber. When specified, existing files or objects at the
                 destination will not be overwritten. Any items that are
                 skipped by this option will be reported as being skipped.
                 This option will perform an additional GET request to check
                 if an item exists before attempting to upload the data. This
                 will save retransmitting data, but the additional HTTP
                 requests may make small object transfers slower and more
                 expensive.

  -p             Causes ACLs to be preserved when copying in the cloud. Note
                 that this option has performance and cost implications when
                 using the XML API, as it requires separate HTTP calls for
                 interacting with ACLs. The performance issue can be
                 mitigated to some degree by using gsutil -m cp to cause
                 parallel copying. Also, this option only works if you have
                 OWNER access to all of the objects that are copied.

                 You can avoid the additional performance and cost of using
                 cp -p if you want all objects in the destination bucket to
                 end up with the same ACL by setting a default object ACL on
                 that bucket instead of using cp -p. See
                 "gsutil help defacl".

                 Note that it's not valid to specify both the -a and -p
                 options together.

  -R, -r         Causes directories, buckets, and bucket subdirectories to be
                 copied recursively. If you neglect to use this option for
                 an upload, gsutil will copy any files it finds and skip any
                 directories. Similarly, neglecting to specify -r for a
                 download will cause gsutil to copy any objects at the
                 current bucket directory level, and skip any subdirectories.

  -U             Skip objects with unsupported object types instead of
                 failing. Unsupported object types are Amazon S3 Objects in
                 the GLACIER storage class.

  -v             Requests that the version-specific URL for each uploaded
                 object be printed. Given this URL you can make future upload
                 requests that are safe in the face of concurrent updates,
                 because Google Cloud Storage will refuse to perform the
                 update if the current object version doesn't match the
                 version-specific URL. See 'gsutil help versions' for more
                 details.

  -z <ext,...>   Applies gzip content-encoding to file uploads with the given
                 extensions. This is useful when uploading files with
                 compressible content (such as .js, .css, or .html files)
                 because it saves network bandwidth and space in Google Cloud
                 Storage, which in turn reduces storage costs.

                 When you specify the -z option, the data from your files is
                 compressed before it is uploaded, but your actual files are
                 left uncompressed on the local disk. The uploaded objects
                 retain the Content-Type and name of the original files but
                 are given a Content-Encoding header with the value "gzip" to
                 indicate that the object data stored are compressed on the
                 Google Cloud Storage servers.

                 For example, the following command:

                   gsutil cp -z html -a public-read cattypes.html gs://mycats

                 will do all of the following:

                 - Upload as the object gs://mycats/cattypes.html (cp
                   command)
                 - Set the Content-Type to text/html (based on file
                   extension)
                 - Compress the data in the file cattypes.html (-z option)
                 - Set the Content-Encoding to gzip (-z option)
                 - Set the ACL to public-read (-a option)
                 - If a user tries to view cattypes.html in a browser, the
                   browser will know to uncompress the data based on the
                   Content-Encoding header, and to render it as HTML based on
                   the Content-Type header.

                 Note that if you download an object with
                 Content-Encoding:gzip gsutil will decompress the content
                 before writing the local file.
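
                 For example, after the upload above you could verify the
                 stored metadata with a command like:

                   gsutil ls -L gs://mycats/cattypes.html

                 which lists the object's metadata, including the stored
                 Content-Type and Content-Encoding.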
"""

_DETAILED_HELP_TEXT = '\n\n'.join([_SYNOPSIS_TEXT,
                                   _DESCRIPTION_TEXT,
                                   _NAME_CONSTRUCTION_TEXT,
                                   _SUBDIRECTORIES_TEXT,
                                   _COPY_IN_CLOUD_TEXT,
                                   _CHECKSUM_VALIDATION_TEXT,
                                   _RETRY_HANDLING_TEXT,
                                   _RESUMABLE_TRANSFERS_TEXT,
                                   _STREAMING_TRANSFERS_TEXT,
                                   _SLICED_OBJECT_DOWNLOADS_TEXT,
                                   _PARALLEL_COMPOSITE_UPLOADS_TEXT,
                                   _CHANGING_TEMP_DIRECTORIES_TEXT,
                                   _OPTIONS_TEXT])


CP_SUB_ARGS = 'a:AcDeIL:MNnprRtUvz:'


def _CopyFuncWrapper(cls, args, thread_state=None):
  cls.CopyFunc(args, thread_state=thread_state)


def _CopyExceptionHandler(cls, e):
  """Simple exception handler to allow post-completion status."""
  cls.logger.error(str(e))
  cls.op_failure_count += 1
  cls.logger.debug('\n\nEncountered exception while copying:\n%s\n',
                   traceback.format_exc())


def _RmExceptionHandler(cls, e):
  """Simple exception handler to allow post-completion status."""
  cls.logger.error(str(e))


class CpCommand(Command):
  """Implementation of gsutil cp command.

  Note that CpCommand is run for both gsutil cp and gsutil mv. The latter
  happens by MvCommand calling CpCommand and passing the hidden (undocumented)
  -M option. This allows the copy and remove needed for each mv to run
  together (rather than first running all the cp's and then all the rm's, as
  we originally had implemented), which in turn avoids the following problem
  with removing the wrong objects: starting with a bucket containing only
  the object gs://bucket/obj, say the user does:
    gsutil mv gs://bucket/* gs://bucket/d.txt
  If we ran all the cp's and then all the rm's and we didn't expand the
  wildcard first, the cp command would first copy gs://bucket/obj to
  gs://bucket/d.txt, and the rm command would then remove that object. In the
  implementation prior to gsutil release 3.12 we avoided this by building a
  list of objects to process and then running the copies and then the
  removes; but building the list up front limits scalability (compared with
  the current approach of processing the bucket listing iterator on the fly).
  """

  # Command specification. See base class for documentation.
  command_spec = Command.CreateCommandSpec(
      'cp',
      command_name_aliases=['copy'],
      usage_synopsis=_SYNOPSIS,
      min_args=1,
      max_args=NO_MAX,
      # -t is deprecated but leave intact for now to avoid breakage.
      supported_sub_args=CP_SUB_ARGS,
      file_url_ok=True,
      provider_url_ok=False,
      urls_start_arg=0,
      gs_api_support=[ApiSelector.XML, ApiSelector.JSON],
      gs_default_api=ApiSelector.JSON,
      supported_private_args=['testcallbackfile='],
      argparse_arguments=[
          CommandArgument.MakeZeroOrMoreCloudOrFileURLsArgument()
      ]
  )
  # Help specification. See help_provider.py for documentation.
  help_spec = Command.HelpSpec(
      help_name='cp',
      help_name_aliases=['copy'],
      help_type='command_help',
      help_one_line_summary='Copy files and objects',
      help_text=_DETAILED_HELP_TEXT,
      subcommand_help_text={},
  )

  # pylint: disable=too-many-statements
  def CopyFunc(self, name_expansion_result, thread_state=None):
    """Worker function for performing the actual copy (and rm, for mv)."""
    gsutil_api = GetCloudApiInstance(self, thread_state=thread_state)

    copy_helper_opts = copy_helper.GetCopyHelperOpts()
    if copy_helper_opts.perform_mv:
      cmd_name = 'mv'
    else:
      cmd_name = self.command_name
    src_url = name_expansion_result.source_storage_url
    exp_src_url = name_expansion_result.expanded_storage_url
    src_url_names_container = name_expansion_result.names_container
    have_multiple_srcs = name_expansion_result.is_multi_source_request

    if src_url.IsCloudUrl() and src_url.IsProvider():
      raise CommandException(
          'The %s command does not allow provider-only source URLs (%s)' %
          (cmd_name, src_url))
    if have_multiple_srcs:
      copy_helper.InsistDstUrlNamesContainer(
          self.exp_dst_url, self.have_existing_dst_container, cmd_name)

    # Various GUI tools (like the GCS web console) create placeholder objects
    # ending with '/' when the user creates an empty directory. Normally
    # these tools should delete those placeholders once objects have been
    # written "under" the directory, but sometimes the placeholders are left
    # around. We need to filter them out here, otherwise if the user tries to
    # rsync from GCS to a local directory it will result in a directory/file
    # conflict (e.g., trying to download an object called "mydata/" where the
    # local directory "mydata" exists).
    if IsCloudSubdirPlaceholder(exp_src_url):
      # We used to output the message 'Skipping cloud sub-directory
      # placeholder object...' but we no longer do so because it caused
      # customer confusion.
      return

    if copy_helper_opts.use_manifest and self.manifest.WasSuccessful(
        exp_src_url.url_string):
      return

    if copy_helper_opts.perform_mv:
      if name_expansion_result.names_container:
        # Use recursion_requested when performing name expansion for the
        # directory mv case so we can determine if any of the source URLs are
        # directories (and then use cp -r and rm -r to perform the move, to
        # match the behavior of Linux mv, which when moving a directory moves
        # all the contained files).
        self.recursion_requested = True
        # Disallow wildcard src URLs when moving directories, as supporting
        # it would make the name transformation too complex and would also be
        # dangerous (e.g., someone could accidentally move many objects to
        # the wrong name, or accidentally overwrite many objects).
        if ContainsWildcard(src_url.url_string):
          raise CommandException('The mv command disallows naming source '
                                 'directories using wildcards')

    if (self.exp_dst_url.IsFileUrl()
        and not os.path.exists(self.exp_dst_url.object_name)
        and have_multiple_srcs):
      os.makedirs(self.exp_dst_url.object_name)

    dst_url = copy_helper.ConstructDstUrl(
        src_url, exp_src_url, src_url_names_container, have_multiple_srcs,
        self.exp_dst_url, self.have_existing_dst_container,
        self.recursion_requested)
    dst_url = copy_helper.FixWindowsNaming(src_url, dst_url)

    copy_helper.CheckForDirFileConflict(exp_src_url, dst_url)
    if copy_helper.SrcDstSame(exp_src_url, dst_url):
      raise CommandException('%s: "%s" and "%s" are the same file - '
                             'abort.' % (cmd_name, exp_src_url, dst_url))

    if dst_url.IsCloudUrl() and dst_url.HasGeneration():
      raise CommandException('%s: a version-specific URL\n(%s)\ncannot be '
                             'the destination for gsutil cp - abort.'
                             % (cmd_name, dst_url))

    elapsed_time = bytes_transferred = 0
    try:
      if copy_helper_opts.use_manifest:
        self.manifest.Initialize(
            exp_src_url.url_string, dst_url.url_string)
      (elapsed_time, bytes_transferred, result_url, md5) = (
          copy_helper.PerformCopy(
              self.logger, exp_src_url, dst_url, gsutil_api,
              self, _CopyExceptionHandler, allow_splitting=True,
              headers=self.headers, manifest=self.manifest,
              gzip_exts=self.gzip_exts))
      if copy_helper_opts.use_manifest:
        if md5:
          self.manifest.Set(exp_src_url.url_string, 'md5', md5)
        self.manifest.SetResult(
            exp_src_url.url_string, bytes_transferred, 'OK')
      if copy_helper_opts.print_ver:
        # Some cases don't return a version-specific URL (e.g., if
        # destination is a file).
        self.logger.info('Created: %s', result_url)
    except ItemExistsError:
      message = 'Skipping existing item: %s' % dst_url
      self.logger.info(message)
      if copy_helper_opts.use_manifest:
        self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message)
    except SkipUnsupportedObjectError, e:
      message = ('Skipping item %s with unsupported object type %s' %
                 (exp_src_url.url_string, e.unsupported_type))
      self.logger.info(message)
      if copy_helper_opts.use_manifest:
        self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message)
    except copy_helper.FileConcurrencySkipError, e:
      self.logger.warn('Skipping copy of source URL %s because destination '
                       'URL %s is already being copied by another gsutil '
                       'process or thread (did you specify the same source '
                       'URL twice?)' % (src_url, dst_url))
    except Exception, e:
      if (copy_helper_opts.no_clobber and
          copy_helper.IsNoClobberServerException(e)):
        message = 'Rejected (noclobber): %s' % dst_url
        self.logger.info(message)
        if copy_helper_opts.use_manifest:
          self.manifest.SetResult(
              exp_src_url.url_string, 0, 'skip', message)
      elif self.continue_on_error:
        message = 'Error copying %s: %s' % (src_url, str(e))
        self.op_failure_count += 1
        self.logger.error(message)
        if copy_helper_opts.use_manifest:
          self.manifest.SetResult(
              exp_src_url.url_string, 0, 'error',
              RemoveCRLFFromString(message))
      else:
        if copy_helper_opts.use_manifest:
          self.manifest.SetResult(
              exp_src_url.url_string, 0, 'error', str(e))
        raise
    else:
      if copy_helper_opts.perform_mv:
        self.logger.info('Removing %s...', exp_src_url)
        if exp_src_url.IsCloudUrl():
          gsutil_api.DeleteObject(exp_src_url.bucket_name,
                                  exp_src_url.object_name,
                                  generation=exp_src_url.generation,
                                  provider=exp_src_url.scheme)
        else:
          os.unlink(exp_src_url.object_name)

    with self.stats_lock:
      self.total_elapsed_time += elapsed_time
      self.total_bytes_transferred += bytes_transferred

  # Command entry point.
  def RunCommand(self):
    copy_helper_opts = self._ParseOpts()

    self.total_elapsed_time = self.total_bytes_transferred = 0
    if self.args[-1] == '-' or self.args[-1] == 'file://-':
      return CatHelper(self).CatUrlStrings(self.args[:-1])

    if copy_helper_opts.read_args_from_stdin:
      if len(self.args) != 1:
        raise CommandException(
            'Source URLs cannot be specified with -I option')
      url_strs = StdinIterator()
    else:
      if len(self.args) < 2:
        raise CommandException('Wrong number of arguments for "cp" command.')
      url_strs = self.args[:-1]

    (self.exp_dst_url, self.have_existing_dst_container) = (
        copy_helper.ExpandUrlToSingleBlr(self.args[-1], self.gsutil_api,
                                         self.debug, self.project_id))

    name_expansion_iterator = NameExpansionIterator(
        self.command_name, self.debug,
        self.logger, self.gsutil_api, url_strs,
        self.recursion_requested or copy_helper_opts.perform_mv,
        project_id=self.project_id, all_versions=self.all_versions,
        continue_on_error=self.continue_on_error or self.parallel_operations)

    # Use a lock to ensure accurate statistics in the face of
    # multi-threading/multi-processing.
    self.stats_lock = CreateLock()

    # Tracks if any copies failed.
    self.op_failure_count = 0

    # Start the clock.
    start_time = time.time()

    # Tuple of attributes to share/manage across multiple processes in
    # parallel (-m) mode.
    shared_attrs = ('op_failure_count', 'total_bytes_transferred')

    # Perform copy requests in parallel (-m) mode, if requested, using
    # configured number of parallel processes and threads. Otherwise,
    # perform requests with sequential function calls in current process.
    self.Apply(_CopyFuncWrapper, name_expansion_iterator,
               _CopyExceptionHandler, shared_attrs,
               fail_on_error=(not self.continue_on_error))
    self.logger.debug(
        'total_bytes_transferred: %d', self.total_bytes_transferred)

    end_time = time.time()
    self.total_elapsed_time = end_time - start_time

    # Sometimes, particularly when running unit tests, the total elapsed time
    # is really small. On Windows, the timer resolution is too small and
    # causes total_elapsed_time to be zero.
    try:
      float(self.total_bytes_transferred) / float(self.total_elapsed_time)
    except ZeroDivisionError:
      self.total_elapsed_time = 0.01

    self.total_bytes_per_second = (float(self.total_bytes_transferred) /
                                   float(self.total_elapsed_time))

    if self.debug == 3:
      # Note that this only counts the actual GET and PUT bytes for the copy
      # - not any transfers for doing wildcard expansion, the initial
      # HEAD/GET request performed to get the object metadata, etc.
      if self.total_bytes_transferred != 0:
        self.logger.info(
            'Total bytes copied=%d, total elapsed time=%5.3f secs (%sps)',
            self.total_bytes_transferred, self.total_elapsed_time,
            MakeHumanReadable(self.total_bytes_per_second))
    if self.op_failure_count:
      plural_str = 's' if self.op_failure_count > 1 else ''
      raise CommandException('%d file%s/object%s could not be transferred.' % (
          self.op_failure_count, plural_str, plural_str))

    return 0

  def _ParseOpts(self):
    perform_mv = False
    # exclude_symlinks is handled by Command parent class, so save in Command
    # state rather than CopyHelperOpts.
    self.exclude_symlinks = False
    no_clobber = False
    # continue_on_error is handled by Command parent class, so save in
    # Command state rather than CopyHelperOpts.
    self.continue_on_error = False
    daisy_chain = False
    read_args_from_stdin = False
    print_ver = False
    use_manifest = False
    preserve_acl = False
    canned_acl = None
    # canned_acl is handled by a helper function in parent
    # Command class, so save in Command state rather than CopyHelperOpts.
    self.canned = None

    self.all_versions = False

    self.skip_unsupported_objects = False

    # Files matching these extensions should be gzipped before uploading.
    self.gzip_exts = []

    test_callback_file = None

    # self.recursion_requested initialized in command.py (so can be checked
    # in parent class for all commands).
    self.manifest = None
    if self.sub_opts:
      for o, a in self.sub_opts:
        if o == '-a':
          canned_acl = a
          self.canned = True
        elif o == '-A':
          self.all_versions = True
        elif o == '-c':
          self.continue_on_error = True
        elif o == '-D':
          daisy_chain = True
        elif o == '-e':
          self.exclude_symlinks = True
        elif o == '--testcallbackfile':
          # File path of a pickled class that implements
          # ProgressCallback.call. Used for testing transfer interruptions
          # and resumes.
          test_callback_file = a
        elif o == '-I':
          read_args_from_stdin = True
        elif o == '-L':
          use_manifest = True
          self.manifest = Manifest(a)
        elif o == '-M':
          # Note that we signal to the cp command to perform a move (copy
          # followed by remove) and use directory-move naming rules by
          # passing the undocumented (for internal use) -M option when
          # running the cp command from mv.py.
          perform_mv = True
        elif o == '-n':
          no_clobber = True
        elif o == '-p':
          preserve_acl = True
        elif o == '-r' or o == '-R':
          self.recursion_requested = True
        elif o == '-U':
          self.skip_unsupported_objects = True
        elif o == '-v':
          print_ver = True
        elif o == '-z':
          self.gzip_exts = [x.strip() for x in a.split(',')]
    if preserve_acl and canned_acl:
      raise CommandException(
          'Specifying both the -p and -a options together is invalid.')
    if self.all_versions and self.parallel_operations:
      raise CommandException(
          'The gsutil -m option is not supported with the cp -A flag, to '
          'ensure that object version ordering is preserved. Please re-run '
          'the command without the -m option.')
    return CreateCopyHelperOpts(
        perform_mv=perform_mv,
        no_clobber=no_clobber,
        daisy_chain=daisy_chain,
        read_args_from_stdin=read_args_from_stdin,
        print_ver=print_ver,
        use_manifest=use_manifest,
        preserve_acl=preserve_acl,
        canned_acl=canned_acl,
        skip_unsupported_objects=self.skip_unsupported_objects,
        test_callback_file=test_callback_file)