Home | History | Annotate | Download | only in contrib
      1 #!/usr/bin/python
      2 # Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 import argparse
      7 import datetime
      8 import logging
      9 import os
     10 import re
     11 import sys
     12 import time
     13 
     14 os.environ['DJANGO_SETTINGS_MODULE'] = 'frontend.settings'
     15 
     16 import common
     17 from autotest_lib.server import utils
     18 from django.db import connections, transaction
     19 
     20 
     21 # Format Appears as: [Date] [Time] - [Msg Level] - [Message]
     22 LOGGING_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
     23 # This regex makes sure the input is in the format of YYYY-MM-DD (2012-02-01)
     24 DATE_FORMAT_REGEX = ('^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]'
     25                      '|3[01])$')
     26 SELECT_CMD_FORMAT = """
     27 SELECT %(table)s.%(primary_key)s FROM %(table)s
     28 WHERE %(table)s.%(time_column)s <= "%(date)s"
     29 """
     30 SELECT_JOIN_CMD_FORMAT = """
     31 SELECT %(table)s.%(primary_key)s FROM %(table)s
     32 INNER JOIN %(related_table)s
     33   ON %(table)s.%(foreign_key)s=%(related_table)s.%(related_primary_key)s
     34 WHERE %(related_table)s.%(time_column)s <= "%(date)s"
     35 """
     36 SELECT_WITH_INDIRECTION_FORMAT = """
     37 SELECT %(table)s.%(primary_key)s FROM %(table)s
     38 INNER JOIN %(indirection_table)s
     39   ON %(table)s.%(foreign_key)s =
     40      %(indirection_table)s.%(indirection_primary_key)s
     41 INNER JOIN %(related_table)s
     42   ON %(indirection_table)s.%(indirection_foreign_key)s =
     43   %(related_table)s.%(related_primary_key)s
     44 WHERE %(related_table)s.%(time_column)s <= "%(date)s"
     45 """
     46 DELETE_ROWS_FORMAT = """
     47 DELETE FROM %(table)s
     48 WHERE %(table)s.%(primary_key)s IN (%(rows)s)
     49 """
     50 
     51 
     52 AFE_JOB_ID = 'afe_job_id'
     53 JOB_ID = 'job_id'
     54 JOB_IDX = 'job_idx'
     55 TEST_IDX = 'test_idx'
     56 
     57 # CAUTION: Make sure only the 'default' connection is used. Otherwise
     58 # db_cleanup may delete stuff from the global database, which is generally not
     59 # intended.
     60 cursor = connections['default'].cursor()
     61 
     62 # Globals for command line flag constants, for convenience.
     63 DRY_RUN = False
     64 STEP_SIZE = None
     65 LOAD_RATIO = 1.0
     66 
     67 class ProgressBar(object):
     68     TEXT = "{:<40s} [{:<20s}] ({:>9d}/{:>9d})"
     69 
     70     def __init__(self, name, amount):
     71         self._name = name
     72         self._amount = amount
     73         self._cur = 0
     74 
     75     def __enter__(self):
     76         return self
     77 
     78     def __exit__(self, a, b, c):
     79         sys.stdout.write('\n')
     80         sys.stdout.flush()
     81 
     82     def update(self, x):
     83         """
     84         Advance the counter by `x`.
     85 
     86         @param x: An integer of how many more elements were processed.
     87         """
     88         self._cur += x
     89 
     90     def show(self):
     91         """
     92         Display the progress bar on the current line.  Repeated invocations
     93         "update" the display.
     94         """
     95         if self._amount == 0:
     96             barlen = 20
     97         else:
     98             barlen = int(20 * self._cur / float(self._amount))
     99         if barlen:
    100             bartext = '=' * (barlen-1) + '>'
    101         else:
    102             bartext = ''
    103         text = self.TEXT.format(self._name, bartext, self._cur, self._amount)
    104         sys.stdout.write('\r')
    105         sys.stdout.write(text)
    106         sys.stdout.flush()
    107 
    108 
    109 def grouper(iterable, n):
    110     """
    111     Group the elements of `iterable` into groups of maximum size `n`.
    112 
    113     @param iterable: An iterable.
    114     @param n: Max size of returned groups.
    115     @returns: Yields iterables of size <= n.
    116 
    117     >>> grouper('ABCDEFG', 3)
    118     [['A', 'B', C'], ['D', 'E', 'F'], ['G']]
    119     """
    120     args = [iter(iterable)] * n
    121     while True:
    122         lst = []
    123         try:
    124             for itr in args:
    125                 lst.append(next(itr))
    126             yield lst
    127         except StopIteration:
    128             if lst:
    129                 yield lst
    130             break
    131 
    132 
    133 def _delete_table_data_before_date(table_to_delete_from, primary_key,
    134                                    related_table, related_primary_key,
    135                                    date, foreign_key=None,
    136                                    time_column="started_time",
    137                                    indirection_table=None,
    138                                    indirection_primary_key=None,
    139                                    indirection_foreign_key=None):
    140     """
    141     We want a delete statement that will only delete from one table while
    142     using a related table to find the rows to delete.
    143 
    144     An example mysql command:
    145     DELETE FROM tko_iteration_result USING tko_iteration_result INNER JOIN
    146     tko_tests WHERE tko_iteration_result.test_idx=tko_tests.test_idx AND
    147     tko_tests.started_time <= '2012-02-01';
    148 
    149     There are also tables that require 2 joins to determine which rows we want
    150     to delete and we determine these rows by joining the table we want to
    151     delete from with an indirection table to the actual jobs table.
    152 
    153     @param table_to_delete_from: Table whose rows we want to delete.
    154     @param related_table: Table with the date information we are selecting by.
    155     @param foreign_key: Foreign key used in table_to_delete_from to reference
    156                         the related table. If None, the primary_key is used.
    157     @param primary_key: Primary key in the related table.
    158     @param date: End date of the information we are trying to delete.
    159     @param time_column: Column that we want to use to compare the date to.
    160     @param indirection_table: Table we use to link the data we are trying to
    161                               delete with the table with the date information.
    162     @param indirection_primary_key: Key we use to connect the indirection table
    163                                     to the table we are trying to delete rows
    164                                     from.
    165     @param indirection_foreign_key: Key we use to connect the indirection table
    166                                     to the table with the date information.
    167     """
    168     if not foreign_key:
    169         foreign_key = primary_key
    170 
    171     if not related_table:
    172         # Deleting from a table directly.
    173         variables = dict(table=table_to_delete_from, primary_key=primary_key,
    174                          time_column=time_column, date=date)
    175         sql = SELECT_CMD_FORMAT % variables
    176     elif not indirection_table:
    177         # Deleting using a single JOIN to get the date information.
    178         variables = dict(primary_key=primary_key, table=table_to_delete_from,
    179                          foreign_key=foreign_key, related_table=related_table,
    180                          related_primary_key=related_primary_key,
    181                          time_column=time_column, date=date)
    182         sql = SELECT_JOIN_CMD_FORMAT % variables
    183     else:
    184         # There are cases where we need to JOIN 3 TABLES to determine the rows
    185         # we want to delete.
    186         variables = dict(primary_key=primary_key, table=table_to_delete_from,
    187                          indirection_table=indirection_table,
    188                          foreign_key=foreign_key,
    189                          indirection_primary_key=indirection_primary_key,
    190                          related_table=related_table,
    191                          related_primary_key=related_primary_key,
    192                          indirection_foreign_key=indirection_foreign_key,
    193                          time_column=time_column, date=date)
    194         sql = SELECT_WITH_INDIRECTION_FORMAT % variables
    195 
    196     logging.debug('SQL: %s', sql)
    197     cursor.execute(sql, [])
    198     rows = [x[0] for x in cursor.fetchall()]
    199     logging.debug(rows)
    200 
    201     if not rows or rows == [None]:
    202         with ProgressBar(table_to_delete_from, 0) as pb:
    203             pb.show()
    204         logging.debug('Nothing to delete for %s', table_to_delete_from)
    205         return
    206 
    207     with ProgressBar(table_to_delete_from, len(rows)) as pb:
    208         for row_keys in grouper(rows, STEP_SIZE):
    209             variables['rows'] = ','.join([str(x) for x in row_keys])
    210             sql = DELETE_ROWS_FORMAT % variables
    211 
    212             start = time.time()
    213             logging.debug('SQL: %s', sql)
    214             if not DRY_RUN:
    215                 cursor.execute(sql, [])
    216                 transaction.commit_unless_managed(using='default')
    217             end = time.time()
    218 
    219             pb.update(len(row_keys))
    220             pb.show()
    221 
    222             if LOAD_RATIO != 1.0:
    223                 assert 0 < LOAD_RATIO <= 1, (
    224                         'Load ratio must be a fraction between 0 and 1.')
    225                 time.sleep((end - start) / LOAD_RATIO)
    226 
    227 
    228 def _subtract_days(date, days_to_subtract):
    229     """
    230     Return a date (string) that is 'days' before 'date'
    231 
    232     @param date: date (string) we are subtracting from.
    233     @param days_to_subtract: days (int) we are subtracting.
    234     """
    235     date_obj = datetime.datetime.strptime(date, '%Y-%m-%d')
    236     difference = date_obj - datetime.timedelta(days=days_to_subtract)
    237     return difference.strftime('%Y-%m-%d')
    238 
    239 
    240 def _delete_all_data_before_date(date):
    241     """
    242     Delete all the database data before a given date.
    243 
    244     This function focuses predominately on the data for jobs in tko_jobs.
    245     However not all jobs in afe_jobs are also in tko_jobs.
    246 
    247     Therefore we delete all the afe_job and foreign key relations prior to two
    248     days before date. Then we do the queries using tko_jobs and these
    249     tables to ensure all the related information is gone. Even though we are
    250     repeating deletes on these tables, the second delete will be quick and
    251     completely thorough in ensuring we clean up all the foreign key
    252     dependencies correctly.
    253 
    254     @param date: End date of the information we are trying to delete.
    255     @param step: Rows to delete per SQL query.
    256     """
    257     # First cleanup all afe_job related data (prior to 2 days before date).
    258     # The reason for this is not all afe_jobs may be in tko_jobs.
    259     afe_date = _subtract_days(date, 2)
    260     logging.info('Cleaning up all afe_job data prior to %s.', afe_date)
    261     _delete_table_data_before_date('afe_aborted_host_queue_entries',
    262                                    'queue_entry_id',
    263                                    'afe_jobs', 'id', afe_date,
    264                                    time_column= 'created_on',
    265                                    foreign_key='queue_entry_id',
    266                                    indirection_table='afe_host_queue_entries',
    267                                    indirection_primary_key='id',
    268                                    indirection_foreign_key='job_id')
    269     _delete_table_data_before_date('afe_special_tasks', 'id',
    270                                    'afe_jobs', 'id',
    271                                    afe_date, time_column='created_on',
    272                                    foreign_key='queue_entry_id',
    273                                    indirection_table='afe_host_queue_entries',
    274                                    indirection_primary_key='id',
    275                                    indirection_foreign_key='job_id')
    276     _delete_table_data_before_date('afe_host_queue_entries', 'id',
    277                                    'afe_jobs', 'id',
    278                                    afe_date, time_column='created_on',
    279                                    foreign_key=JOB_ID)
    280     _delete_table_data_before_date('afe_job_keyvals', 'id',
    281                                    'afe_jobs', 'id',
    282                                    afe_date, time_column='created_on',
    283                                    foreign_key=JOB_ID)
    284     _delete_table_data_before_date('afe_jobs_dependency_labels', 'id',
    285                                    'afe_jobs', 'id',
    286                                    afe_date, time_column='created_on',
    287                                    foreign_key=JOB_ID)
    288     _delete_table_data_before_date('afe_jobs', 'id',
    289                                    None, None,
    290                                    afe_date, time_column='created_on')
    291     # Special tasks that aren't associated with an HQE
    292     # Since we don't do the queue_entry_id=NULL check, we might wipe out a bit
    293     # more than we should, but I doubt anyone will notice or care.
    294     _delete_table_data_before_date('afe_special_tasks', 'id',
    295                                    None, None,
    296                                    afe_date, time_column='time_requested')
    297 
    298     # Now go through and clean up all the rows related to tko_jobs prior to
    299     # date.
    300     logging.info('Cleaning up all data related to tko_jobs prior to %s.',
    301                   date)
    302     _delete_table_data_before_date('tko_test_attributes', 'id',
    303                                    'tko_tests', TEST_IDX,
    304                                    date, foreign_key=TEST_IDX)
    305     _delete_table_data_before_date('tko_test_labels_tests', 'id',
    306                                    'tko_tests', TEST_IDX,
    307                                    date, foreign_key= 'test_id')
    308     _delete_table_data_before_date('tko_iteration_result', TEST_IDX,
    309                                    'tko_tests', TEST_IDX,
    310                                    date)
    311     _delete_table_data_before_date('tko_iteration_perf_value', TEST_IDX,
    312                                    'tko_tests', TEST_IDX,
    313                                    date)
    314     _delete_table_data_before_date('tko_iteration_attributes', TEST_IDX,
    315                                    'tko_tests', TEST_IDX,
    316                                    date)
    317     _delete_table_data_before_date('tko_job_keyvals', 'id',
    318                                    'tko_jobs', JOB_IDX,
    319                                    date, foreign_key='job_id')
    320     _delete_table_data_before_date('afe_aborted_host_queue_entries',
    321                                    'queue_entry_id',
    322                                    'tko_jobs', AFE_JOB_ID, date,
    323                                    foreign_key='queue_entry_id',
    324                                    indirection_table='afe_host_queue_entries',
    325                                    indirection_primary_key='id',
    326                                    indirection_foreign_key='job_id')
    327     _delete_table_data_before_date('afe_special_tasks', 'id',
    328                                    'tko_jobs', AFE_JOB_ID,
    329                                    date, foreign_key='queue_entry_id',
    330                                    indirection_table='afe_host_queue_entries',
    331                                    indirection_primary_key='id',
    332                                    indirection_foreign_key='job_id')
    333     _delete_table_data_before_date('afe_host_queue_entries', 'id',
    334                                    'tko_jobs', AFE_JOB_ID,
    335                                    date, foreign_key='job_id')
    336     _delete_table_data_before_date('afe_job_keyvals', 'id',
    337                                    'tko_jobs', AFE_JOB_ID,
    338                                    date, foreign_key='job_id')
    339     _delete_table_data_before_date('afe_jobs_dependency_labels', 'id',
    340                                    'tko_jobs', AFE_JOB_ID,
    341                                    date, foreign_key='job_id')
    342     _delete_table_data_before_date('afe_jobs', 'id',
    343                                    'tko_jobs', AFE_JOB_ID,
    344                                    date, foreign_key='id')
    345     _delete_table_data_before_date('tko_tests', TEST_IDX,
    346                                    'tko_jobs', JOB_IDX,
    347                                    date, foreign_key=JOB_IDX)
    348     _delete_table_data_before_date('tko_jobs', JOB_IDX,
    349                                    None, None, date)
    350 
    351 
    352 def parse_args():
    353     """Parse command line arguments"""
    354     parser = argparse.ArgumentParser()
    355     parser.add_argument('-v', '--verbose', action='store_true',
    356                         help='Print SQL commands and results')
    357     parser.add_argument('--step', type=int, action='store',
    358                         default=1000,
    359                         help='Number of rows to delete at once')
    360     parser.add_argument('--dry_run', action='store_true',
    361                         help='Print SQL queries instead of executing them.')
    362     parser.add_argument('--load_ratio', type=float, action='store', default=0.2,
    363                         help=('The fraction of time the script should be '
    364                               'performing deletes. For example --load_ratio=.2 '
    365                               'will cause the script to sleep 80% of the time, '
    366                               'and perform work for the other 20%.'))
    367     parser.add_argument('date', help='Keep results newer than')
    368     return parser.parse_args()
    369 
    370 
    371 def main():
    372     args = parse_args()
    373 
    374     verbose = args.verbose or args.dry_run
    375     level = logging.DEBUG if verbose else logging.INFO
    376     logging.basicConfig(level=level, format=LOGGING_FORMAT)
    377     logging.info('Calling: %s', sys.argv)
    378 
    379     if not re.match(DATE_FORMAT_REGEX, args.date):
    380         print 'DATE must be in yyyy-mm-dd format!'
    381         return
    382 
    383     global STEP_SIZE, DRY_RUN, LOAD_RATIO
    384     STEP_SIZE = args.step
    385     DRY_RUN = args.dry_run
    386     LOAD_RATIO = args.load_ratio
    387 
    388     _delete_all_data_before_date(args.date)
    389 
    390 
    391 if __name__ == '__main__':
    392     main()
    393