Home | History | Annotate | Download | only in commands
      1 # -*- coding: utf-8 -*-
      2 # Copyright 2013 Google Inc. All Rights Reserved.
      3 #
      4 # Licensed under the Apache License, Version 2.0 (the "License");
      5 # you may not use this file except in compliance with the License.
      6 # You may obtain a copy of the License at
      7 #
      8 #     http://www.apache.org/licenses/LICENSE-2.0
      9 #
     10 # Unless required by applicable law or agreed to in writing, software
     11 # distributed under the License is distributed on an "AS IS" BASIS,
     12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 # See the License for the specific language governing permissions and
     14 # limitations under the License.
     15 """Implementation of Unix-like du command for cloud storage providers."""
     16 
     17 from __future__ import absolute_import
     18 
     19 import sys
     20 
     21 from gslib.boto_translation import S3_DELETE_MARKER_GUID
     22 from gslib.bucket_listing_ref import BucketListingObject
     23 from gslib.command import Command
     24 from gslib.command_argument import CommandArgument
     25 from gslib.cs_api_map import ApiSelector
     26 from gslib.exception import CommandException
     27 from gslib.ls_helper import LsHelper
     28 from gslib.storage_url import ContainsWildcard
     29 from gslib.storage_url import StorageUrlFromString
     30 from gslib.util import MakeHumanReadable
     31 from gslib.util import NO_MAX
     32 from gslib.util import UTF8
     33 
# One-line usage synopsis. It is embedded in _DETAILED_HELP_TEXT below and is
# also handed to the command spec as usage_synopsis.
_SYNOPSIS = """
  gsutil du url...
"""

# Full help text for the du command (synopsis, description, options and
# examples). It is handed to the help spec as help_text. The <B>...</B> tags
# mark section headings.
_DETAILED_HELP_TEXT = ("""
<B>SYNOPSIS</B>
""" + _SYNOPSIS + """


<B>DESCRIPTION</B>
  The du command displays the amount of space (in bytes) being used by the
  objects in the file or object hierarchy under a given URL. The syntax emulates
  the Linux du command (which stands for disk usage). For example, the command:

  gsutil du -s gs://your-bucket/dir

  will report the total space used by all objects under gs://your-bucket/dir and
  any sub-directories.


<B>OPTIONS</B>
  -0          Ends each output line with a 0 byte rather than a newline. This
              can be useful to make the output more easily machine-readable.

  -a          Includes non-current object versions / generations in the listing
              (only useful with a versioning-enabled bucket). Also prints
              generation and metageneration for each listed object.

  -c          Produce a grand total.

  -e          A pattern to exclude from reporting. Example: -e "*.o" would
              exclude any object that ends in ".o". Can be specified multiple
              times.

  -h          Prints object sizes in human-readable format (e.g., 1 KiB,
              234 MiB, 2GiB, etc.)

  -s          Display only a summary total for each argument.

  -X          Similar to -e, but excludes patterns from the given file. The
              patterns to exclude should be one per line.


<B>EXAMPLES</B>
  To list the size of all objects in a bucket:

    gsutil du gs://bucketname

  To list the size of all objects underneath a prefix:

    gsutil du gs://bucketname/prefix/*

  To print the total number of bytes in a bucket, in human-readable form:

    gsutil du -ch gs://bucketname

  To see a summary of the total bytes in the two given buckets:

    gsutil du -s gs://bucket1 gs://bucket2

  To list the size of all objects in a versioned bucket, including objects that
  are not the latest:

    gsutil du -a gs://bucketname

  To list all objects in a bucket, except objects that end in ".bak",
  with each object printed ending in a null byte:

    gsutil du -e "*.bak" -0 gs://bucketname

  To get a total of all buckets in a project with a grand total for an entire
  project:

      gsutil -o GSUtil:default_project_id=project-name du -shc
""")
    109 
    110 
    111 class DuCommand(Command):
    112   """Implementation of gsutil du command."""
    113 
    114   # Command specification. See base class for documentation.
    115   command_spec = Command.CreateCommandSpec(
    116       'du',
    117       command_name_aliases=[],
    118       usage_synopsis=_SYNOPSIS,
    119       min_args=0,
    120       max_args=NO_MAX,
    121       supported_sub_args='0ace:hsX:',
    122       file_url_ok=False,
    123       provider_url_ok=True,
    124       urls_start_arg=0,
    125       gs_api_support=[ApiSelector.XML, ApiSelector.JSON],
    126       gs_default_api=ApiSelector.JSON,
    127       argparse_arguments=[
    128           CommandArgument.MakeZeroOrMoreCloudURLsArgument()
    129       ]
    130   )
    131   # Help specification. See help_provider.py for documentation.
    132   help_spec = Command.HelpSpec(
    133       help_name='du',
    134       help_name_aliases=[],
    135       help_type='command_help',
    136       help_one_line_summary='Display object size usage',
    137       help_text=_DETAILED_HELP_TEXT,
    138       subcommand_help_text={},
    139   )
    140 
    141   def _PrintSummaryLine(self, num_bytes, name):
    142     size_string = (MakeHumanReadable(num_bytes)
    143                    if self.human_readable else str(num_bytes))
    144     sys.stdout.write('%(size)-10s  %(name)s%(ending)s' % {
    145         'size': size_string, 'name': name, 'ending': self.line_ending})
    146 
    147   def _PrintInfoAboutBucketListingRef(self, bucket_listing_ref):
    148     """Print listing info for given bucket_listing_ref.
    149 
    150     Args:
    151       bucket_listing_ref: BucketListing being listed.
    152 
    153     Returns:
    154       Tuple (number of objects, object size)
    155 
    156     Raises:
    157       Exception: if calling bug encountered.
    158     """
    159     obj = bucket_listing_ref.root_object
    160     url_str = bucket_listing_ref.url_string
    161     if (obj.metadata and S3_DELETE_MARKER_GUID in
    162         obj.metadata.additionalProperties):
    163       size_string = '0'
    164       num_bytes = 0
    165       num_objs = 0
    166       url_str += '<DeleteMarker>'
    167     else:
    168       size_string = (MakeHumanReadable(obj.size)
    169                      if self.human_readable else str(obj.size))
    170       num_bytes = obj.size
    171       num_objs = 1
    172 
    173     if not self.summary_only:
    174       sys.stdout.write('%(size)-10s  %(url)s%(ending)s' % {
    175           'size': size_string,
    176           'url': url_str.encode(UTF8),
    177           'ending': self.line_ending})
    178 
    179     return (num_objs, num_bytes)
    180 
    181   def RunCommand(self):
    182     """Command entry point for the du command."""
    183     self.line_ending = '\n'
    184     self.all_versions = False
    185     self.produce_total = False
    186     self.human_readable = False
    187     self.summary_only = False
    188     self.exclude_patterns = []
    189     if self.sub_opts:
    190       for o, a in self.sub_opts:
    191         if o == '-0':
    192           self.line_ending = '\0'
    193         elif o == '-a':
    194           self.all_versions = True
    195         elif o == '-c':
    196           self.produce_total = True
    197         elif o == '-e':
    198           self.exclude_patterns.append(a)
    199         elif o == '-h':
    200           self.human_readable = True
    201         elif o == '-s':
    202           self.summary_only = True
    203         elif o == '-X':
    204           if a == '-':
    205             f = sys.stdin
    206           else:
    207             f = open(a, 'r')
    208           try:
    209             for line in f:
    210               line = line.strip()
    211               if line:
    212                 self.exclude_patterns.append(line)
    213           finally:
    214             f.close()
    215 
    216     if not self.args:
    217       # Default to listing all gs buckets.
    218       self.args = ['gs://']
    219 
    220     total_bytes = 0
    221     got_nomatch_errors = False
    222 
    223     def _PrintObjectLong(blr):
    224       return self._PrintInfoAboutBucketListingRef(blr)
    225 
    226     def _PrintNothing(unused_blr=None):
    227       pass
    228 
    229     def _PrintDirectory(num_bytes, name):
    230       if not self.summary_only:
    231         self._PrintSummaryLine(num_bytes, name)
    232 
    233     for url_arg in self.args:
    234       top_level_storage_url = StorageUrlFromString(url_arg)
    235       if top_level_storage_url.IsFileUrl():
    236         raise CommandException('Only cloud URLs are supported for %s'
    237                                % self.command_name)
    238       bucket_listing_fields = ['size']
    239 
    240       ls_helper = LsHelper(
    241           self.WildcardIterator, self.logger,
    242           print_object_func=_PrintObjectLong, print_dir_func=_PrintNothing,
    243           print_dir_header_func=_PrintNothing,
    244           print_dir_summary_func=_PrintDirectory,
    245           print_newline_func=_PrintNothing, all_versions=self.all_versions,
    246           should_recurse=True, exclude_patterns=self.exclude_patterns,
    247           fields=bucket_listing_fields)
    248 
    249       # ls_helper expands to objects and prefixes, so perform a top-level
    250       # expansion first.
    251       if top_level_storage_url.IsProvider():
    252         # Provider URL: use bucket wildcard to iterate over all buckets.
    253         top_level_iter = self.WildcardIterator(
    254             '%s://*' % top_level_storage_url.scheme).IterBuckets(
    255                 bucket_fields=['id'])
    256       elif top_level_storage_url.IsBucket():
    257         top_level_iter = self.WildcardIterator(
    258             '%s://%s' % (top_level_storage_url.scheme,
    259                          top_level_storage_url.bucket_name)).IterBuckets(
    260                              bucket_fields=['id'])
    261       else:
    262         top_level_iter = [BucketListingObject(top_level_storage_url)]
    263 
    264       for blr in top_level_iter:
    265         storage_url = blr.storage_url
    266         if storage_url.IsBucket() and self.summary_only:
    267           storage_url = StorageUrlFromString(
    268               storage_url.CreatePrefixUrl(wildcard_suffix='**'))
    269         _, exp_objs, exp_bytes = ls_helper.ExpandUrlAndPrint(storage_url)
    270         if (storage_url.IsObject() and exp_objs == 0 and
    271             ContainsWildcard(url_arg) and not self.exclude_patterns):
    272           got_nomatch_errors = True
    273         total_bytes += exp_bytes
    274 
    275         if self.summary_only:
    276           self._PrintSummaryLine(exp_bytes, blr.url_string.rstrip('/'))
    277 
    278     if self.produce_total:
    279       self._PrintSummaryLine(total_bytes, 'total')
    280 
    281     if got_nomatch_errors:
    282       raise CommandException('One or more URLs matched no objects.')
    283 
    284     return 0
    285