Home | History | Annotate | Download | only in contrib
      1 #!/usr/bin/python
      2 
      3 import sys, optparse, pwd
      4 import common
      5 from autotest_lib.cli import rpc, host
      6 from autotest_lib.client.common_lib import host_queue_entry_states
      7 
      8 parser = optparse.OptionParser(
      9     usage='Usage: %prog [options] <job id> [<hostname>]\n\n'
     10           'Describes why the given job on the given host has not started.')
     11 parser.add_option('-w', '--web',
     12                   help='Autotest server to use (i.e. "autotest")')
     13 options, args = parser.parse_args()
     14 
     15 if len(args) < 1:
     16     parser.print_help()
     17     sys.exit(1)
     18 
     19 job_id = int(args[0])
     20 
     21 autotest_host = rpc.get_autotest_server(options.web)
     22 proxy = rpc.afe_comm(autotest_host)
     23 
     24 # job exists?
     25 jobs = proxy.run('get_jobs', id=job_id)
     26 if not jobs:
     27     print 'No such job', job_id
     28     sys.exit(1)
     29 job = jobs[0]
     30 owner = job['owner']
     31 
     32 RUNNING_HQE_STATUSES = host_queue_entry_states.ACTIVE_STATUSES
     33 
     34 # any entry eligible for this host?
     35 queue_entries = proxy.run('get_host_queue_entries', job__id=job_id)
     36 
     37 ### Divine why an atomic group job is or is not running.
     38 if queue_entries and queue_entries[0]['atomic_group']:
     39     if queue_entries[0]['status'] in RUNNING_HQE_STATUSES:
     40         print 'Job %d appears to have started (status: %s).' % (
     41                 job_id, queue_entries[0]['status'])
     42         sys.exit(0)
     43     # Hosts in Repairing or Repair Failed will have Queued queue entries.
     44     # We shouldn't consider those queue entries as a multi-group job.
     45     repair_hostnames = []
     46     for queue_entry in queue_entries:
     47         if queue_entry['host'] and queue_entry['host']['status']:
     48             if queue_entry['host']['status'].startswith('Repair'):
     49                 repair_hostnames.append(queue_entry['host']['hostname'])
     50         if queue_entry['status'] in ('Completed', 'Stopped'):
     51             print 'This job has already finished.'
     52             sys.exit(0)
     53     queue_entries_with_hosts = [queue_entry for queue_entry in queue_entries
     54                                 if queue_entry['host']]
     55     all_queue_entries_have_hosts = (len(queue_entries) ==
     56                                     len(queue_entries_with_hosts))
     57     if (not all_queue_entries_have_hosts and len(queue_entries) > 1 and
     58         not repair_hostnames):
     59         # We test repair_hostnames so that this message is not printed when
     60         # the script is run on an atomic group job which has hosts assigned
     61         # but is not running because too many of them are in Repairing or will
     62         # never run because hosts have exited Repairing into the Repair Failed
     63         # dead end.
     64         print 'This script does not support multi-group atomic group jobs.'
     65         print
     66         print 'Jobs scheduled in that state are typically unintentional.'
     67         print
     68         print 'Did you perhaps schedule the job via the web frontend and ask'
     69         print 'that it run on more than 1 (atomic group) of hosts via the '
     70         print '"Run on any" box?  If so, always enter 1 there when scheduling'
     71         print 'jobs on anything marked "(atomic group)".'
     72         print
     73         print len(queue_entries), 'non-started atomic group HostQueueEntries',
     74         print 'found for job', job_id
     75         sys.exit(1)
     76     atomic_group_name = queue_entries[0]['atomic_group']['name']
     77     # Get the list of labels associated with this atomic group.
     78     atomic_labels = proxy.run('get_labels',
     79                               atomic_group__name=atomic_group_name)
     80     if len(atomic_labels) < 1:
     81         print 'Job requests atomic group %s but no labels' % atomic_group_name
     82         print '(and thus no hosts) are associated with that atomic group.'
     83 
     84     job_sync_count = job['synch_count']
     85     # Ugh! This is returned as a comma separated str of label names.
     86     if job.get('dependencies'):
     87         job_dependency_label_names = job['dependencies'].split(',')
     88     else:
     89         job_dependency_label_names = []
     90 
     91     meta_host_name = queue_entries[0]['meta_host']
     92     if meta_host_name:
     93         meta_host = proxy.run('get_labels', atomic_group__name=meta_host_name)[0]
     94     else:
     95         meta_host = None
     96 
     97     # A mapping from label name -> a list of hostnames usable for this job.
     98     runnable_atomic_label_names = {}
     99 
    100     # A mapping from label name -> a host_exclude_reasons map as described
    101     # within the loop below.  Any atomic group labels in this map are not
    102     # ready to run the job for the reasons contained within.
    103     atomic_label_exclude_reasons = {}
    104 
    105     for label in atomic_labels:
    106         label_name = label['name']
    107         if meta_host and meta_host_name != label_name:
    108             print 'Cannot run on atomic label %s due to meta_host %s.' % (
    109                     label_name, meta_host_name)
    110             continue
    111         for dep_name in job_dependency_label_names:
    112             if dep_name != label_name:
    113                 print 'Not checking hosts in atomic label %s against' % (
    114                         label_name,)
    115                 print 'job dependency label %s.  There may be less hosts' % (
    116                         dep_name,)
    117                 print 'than examined below available to run this job.'
    118 
    119         # Get the list of hosts associated with this atomic group label.
    120         atomic_hosts = proxy.run('get_hosts', multiple_labels=[label_name])
    121 
    122         # A map of hostname -> A list of reasons it can't be used.
    123         host_exclude_reasons = {}
    124 
    125         atomic_hostnames = [h['hostname'] for h in atomic_hosts]
    126 
    127         # Map hostnames to a list of ACL names on that host.
    128         acl_groups = proxy.run('get_acl_groups',
    129                                hosts__hostname__in=atomic_hostnames)
    130         hostname_to_acl_name_list = {}
    131         for acl in acl_groups:
    132             for hostname in acl['hosts']:
    133                 hostname_to_acl_name_list.setdefault(hostname, []).append(
    134                         acl['name'])
    135 
    136         # Exclude any hosts that ACLs deny us access to.
    137         accessible_hosts = proxy.run('get_hosts', hostname__in=atomic_hostnames,
    138                                      aclgroup__users__login=owner)
    139         assert len(accessible_hosts) <= len(atomic_hosts)
    140         if len(accessible_hosts) != len(atomic_hosts):
    141             accessible_hostnames = set(h['hostname'] for h in accessible_hosts)
    142             acl_excluded_hostnames = (set(atomic_hostnames) -
    143                                       accessible_hostnames)
    144             for hostname in acl_excluded_hostnames:
    145                 acls = ','.join(hostname_to_acl_name_list[hostname])
    146                 host_exclude_reasons.setdefault(hostname, []).append(
    147                         'User %s does not have ACL access. ACLs: %s' % (
    148                                 owner, acls))
    149 
    150         # Check for locked hosts.
    151         locked_hosts = [h for h in atomic_hosts if h['locked']]
    152         for host in locked_hosts:
    153             locker = host.get('locked_by') or 'UNKNOWN'
    154             msg = 'Locked by user %s on %s.  No jobs will schedule on it.' % (
    155                     locker, host.get('lock_time'))
    156             host_exclude_reasons.setdefault(host['hostname'], []).append(msg)
    157 
    158         # Exclude hosts that are not Ready.
    159         for host in atomic_hosts:
    160             hostname = host['hostname']
    161             if host['status'] != 'Ready':
    162                 message = 'Status is %s' % host['status']
    163                 if host['status'] in ('Verifying', 'Pending', 'Running'):
    164                     running_hqes = proxy.run(
    165                             'get_host_queue_entries', host__hostname=hostname,
    166                             status__in=RUNNING_HQE_STATUSES)
    167                     if not running_hqes:
    168                         message += ' (unknown job)'
    169                     else:
    170                         message += ' (job %d)' % running_hqes[0]['job']['id']
    171                 host_exclude_reasons.setdefault(hostname, []).append(message)
    172 
    173         # If we don't have enough usable hosts, this group cannot run the job.
    174         usable_hostnames = [host['hostname'] for host in atomic_hosts
    175                             if host['hostname'] not in host_exclude_reasons]
    176         if len(usable_hostnames) < job_sync_count:
    177             message = ('%d hosts are required but only %d available.' %
    178                        (job_sync_count, len(usable_hostnames)))
    179             atomic_label_exclude_reasons[label_name] = (message,
    180                                                         host_exclude_reasons)
    181         else:
    182             runnable_atomic_label_names[label_name] = usable_hostnames
    183 
    184     for label_name, reason_tuple in atomic_label_exclude_reasons.iteritems():
    185         job_reason, hosts_reasons = reason_tuple
    186         print 'Atomic group "%s" via label "%s" CANNOT run job %d because:' % (
    187                 atomic_group_name, label_name, job_id)
    188         print job_reason
    189         for hostname in sorted(hosts_reasons.keys()):
    190             for reason in hosts_reasons[hostname]:
    191                 print '%s\t%s' % (hostname, reason)
    192         print
    193 
    194     for label_name, host_list in runnable_atomic_label_names.iteritems():
    195         print 'Atomic group "%s" via label "%s" is READY to run job %d on:' % (
    196                 atomic_group_name, label_name, job_id)
    197         print ', '.join(host_list)
    198         print 'Is the job scheduler healthy?'
    199         print
    200 
    201     sys.exit(0)
    202 
    203 
    204 ### Not an atomic group synchronous job:
    205 
    206 if len(args) != 2:
    207     if len(queue_entries) == 1 and queue_entries[0]['host']:
    208         hostname = queue_entries[0]['host']['hostname']
    209     else:
    210         parser.print_help()
    211         print '\nERROR: A hostname associated with the job is required.'
    212         sys.exit(1)
    213 else:
    214     hostname = args[1]
    215 
    216 # host exists?
    217 hosts = proxy.run('get_hosts', hostname=hostname)
    218 if not hosts:
    219     print 'No such host', hostname
    220     sys.exit(1)
    221 host = hosts[0]
    222 
    223 # Boolean to track our findings.  We want to list all reasons it won't run,
    224 # not just the first.
    225 job_will_run = True
    226 
    227 entries_for_this_host = [entry for entry in queue_entries
    228                          if entry['host']
    229                          and entry['host']['hostname'] == hostname]
    230 host_label_names = set(host['labels'])
    231 eligible_metahost_entries = [entry for entry in queue_entries
    232                              if entry['meta_host'] and not entry['host']
    233                              and entry['meta_host'] in host_label_names
    234                              and not entry['complete']]
    235 
    236 if entries_for_this_host:
    237     assert len(entries_for_this_host) == 1, (
    238         'Multiple entries for this job assigned to this host!')
    239     entry = entries_for_this_host[0]
    240     if entry['active'] or entry['complete']:
    241         print ('Job already ran or is running on this host! (status: %s)' %
    242                entry['full_status'])
    243         sys.exit(0)
    244     is_metahost = False
    245 else:
    246     # no entry for this host -- maybe an eligible metahost entry?
    247     if not eligible_metahost_entries:
    248         print ("Host isn't scheduled for this job, and no eligible metahost "
    249                "entry exists")
    250         sys.exit(0)
    251     is_metahost = True
    252 
    253 # meets atomic group requirements?
    254 host_labels = proxy.run('get_labels', name__in=list(host_label_names))
    255 host_atomic_group_labels = [label for label in host_labels
    256                             if label['atomic_group']]
    257 host_atomic_group_name = None
    258 if host_atomic_group_labels:
    259     atomic_groups = set()
    260     for label in host_atomic_group_labels:
    261         atomic_groups.add(label['atomic_group']['name'])
    262     if len(atomic_groups) != 1:
    263         print 'Host has more than one atomic group!'
    264         print list(atomic_groups)
    265         sys.exit(1)
    266     host_atomic_group_label = host_atomic_group_labels[0]
    267     host_atomic_group_name = host_atomic_group_label['atomic_group']['name']
    268 
    269 job_atomic_groups = set(entry['atomic_group'] for entry in queue_entries)
    270 assert len(job_atomic_groups) == 1, 'Job has more than one atomic group value!'
    271 job_atomic_group = job_atomic_groups.pop() # might be None
    272 job_atomic_group_name = None
    273 if job_atomic_group:
    274     job_atomic_group_name = job_atomic_group['name']
    275 
    276 if host_atomic_group_name != job_atomic_group_name:
    277     print ('Job is for atomic group %s, but host is in atomic group %s '
    278            '(label %s)' %
    279            (job_atomic_group_name, host_atomic_group_name,
    280             host_atomic_group_label['name']))
    281     job_will_run = False
    282 
    283 # host locked?
    284 if host['locked']:
    285     print 'Host is locked by', host['locked_by'], 'no jobs will schedule on it.'
    286     job_will_run = False
    287 
    288 # acl accessible?
    289 accessible = proxy.run('get_hosts', hostname=hostname,
    290                        aclgroup__users__login=owner)
    291 if not accessible:
    292     host_acls = ', '.join(group['name'] for group in
    293                           proxy.run('get_acl_groups', hosts__hostname=hostname))
    294     owner_acls = ', '.join(group['name'] for group in
    295                            proxy.run('get_acl_groups', users__login=owner))
    296     print 'Host not ACL-accessible to job owner', owner
    297     print ' Host ACLs:', host_acls
    298     print ' Owner Acls:', owner_acls
    299     job_will_run = False
    300 
    301 # meets dependencies?
    302 job_deps_list = job['dependencies'].split(',')
    303 job_deps = set()
    304 if job_deps_list != ['']:
    305     job_deps = set(job_deps_list)
    306 unmet = job_deps - host_label_names
    307 if unmet:
    308     print ("Host labels (%s) don't satisfy job dependencies: %s" %
    309            (', '.join(host_label_names), ', '.join(unmet)))
    310     job_will_run = False
    311 
    312 # at this point, if the job is for an unassigned atomic group, things are too
    313 # complicated to proceed
    314 unassigned_atomic_group_entries = [entry for entry in queue_entries
    315                                    if entry['atomic_group']
    316                                    and not entry['host']]
    317 if unassigned_atomic_group_entries:
    318     print ("Job is for an unassigned atomic group.  That's too complicated, I "
    319            "can't give you any definite answers.  Sorry.")
    320     sys.exit(1)
    321 
    322 # meets only_if_needed labels?
    323 if is_metahost:
    324     metahost_names = set(entry['meta_host']
    325                          for entry in eligible_metahost_entries)
    326     job_deps_and_metahosts = job_deps.union(metahost_names)
    327     for label in host_labels:
    328         unmet_exclusive_label = (label['only_if_needed'] and
    329                                  label['name'] not in job_deps_and_metahosts)
    330         if unmet_exclusive_label:
    331             print ('Host contains "only if needed" label %s, unused by job '
    332                    'dependencies and metahosts' % label['name'])
    333             job_will_run = False
    334 
    335 # host ready?
    336 if host['status'] != 'Ready':
    337     if host['status'] == 'Pending':
    338         active = proxy.run('get_host_queue_entries',
    339                            host=host['id'], active=True)
    340         if not active:
    341             print ('Host %s seems to be in "Pending" state incorrectly; please '
    342                    'report this to the Autotest team' % hostname)
    343             sys.exit(1)
    344     print 'Host not in "Ready" status (status="%s")' % host['status']
    345     job_will_run = False
    346 
    347 if job_will_run:
    348     print ("Job %s should run on host %s; if you've already waited about ten "
    349            "minutes or longer, it's probably a server issue or a bug." %
    350            (job_id, hostname))
    351     sys.exit(1)
    352 else:
    353     print "All of the reasons this job is not running are listed above."
    354     sys.exit(0)
    355