# Copyright 2018 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import contextlib
import logging
import os
import random
import re

from autotest_lib.client.bin import utils as client_utils
from autotest_lib.client.common_lib import utils as common_utils
from autotest_lib.client.common_lib import error
from autotest_lib.server import utils
from autotest_lib.server.cros import lockfile


@contextlib.contextmanager
def lock(filename):
    """Prevents other autotest/tradefed instances from accessing cache.

    Repeatedly tries to acquire a file lock, using a random timeout whose
    upper bound doubles with every attempt. If the lock still cannot be
    acquired after more than 11 attempts, the lock is broken and the test is
    failed. The lock is released when the context exits.

    @param filename: The file to be locked.
    @raises error.TestFail: if the lock could not be acquired and had to be
                            broken.
    """
    filelock = lockfile.FileLock(filename)
    # It is tempting just to call filelock.acquire(3600). But the implementation
    # has very poor temporal granularity (timeout/10), which is unsuitable for
    # our needs. See /usr/lib64/python2.7/site-packages/lockfile/
    attempts = 0
    while not filelock.i_am_locking():
        try:
            attempts += 1
            logging.info('Waiting for cache lock...')
            # We must not use a random integer as the filelock implementations
            # may underflow an integer division.
            filelock.acquire(random.uniform(0.0, pow(2.0, attempts)))
        except (lockfile.AlreadyLocked, lockfile.LockTimeout):
            # Our goal is to wait long enough to be sure something very bad
            # happened to the locking thread. 11 attempts is between 15 and
            # 30 minutes.
            if attempts > 11:
                # Normally we should acquire the lock immediately. Once we
                # wait on the order of 10 minutes either the dev server IO is
                # overloaded or a lock didn't get cleaned up. Take one for the
                # team, break the lock and report a failure. This should fix
                # the lock for following tests. If the failure affects more than
                # one job look for a deadlock or dev server overload.
                logging.error('Permanent lock failure. Trying to break lock.')
                # TODO(ihf): Think how to do this cleaner without having a
                # recursive lock breaking problem. We may have to kill every
                # job that is currently waiting. The main goal though really is
                # to have a cache that does not corrupt. And cache updates
                # only happen once a month or so, everything else are reads.
                filelock.break_lock()
                raise error.TestFail('Error: permanent cache lock failure.')
    # The loop above is only left normally (it contains no break), so reaching
    # this point means the lock is held by us.
    logging.info('Acquired cache lock after %d attempts.', attempts)
    try:
        yield
    finally:
        filelock.release()
        logging.info('Released cache lock.')


@contextlib.contextmanager
def adb_keepalive(targets, extra_paths):
    """A context manager that keeps the adb connection alive.

    AdbKeepalive will spin off a new process per target that will continuously
    poll for adb's connected state, and will attempt to reconnect if it ever
    goes down. This is the only way we can currently recover safely from
    (intentional) reboots.

    @param targets: an iterable of targets (the hostname and port of a DUT);
                    one keepalive process is started for each of them.
    @param extra_paths: any additional components to the PATH environment
                        variable.
    """
    from autotest_lib.client.common_lib.cros import adb_keepalive as module
    # |__file__| may be the path of the compiled bytecode of the module. We
    # want to run the original .py file, so we need to change the extension
    # back. Only the suffix is touched so that a '.pyc' substring elsewhere in
    # the path is left alone.
    script_filename = module.__file__
    if script_filename.endswith(('.pyc', '.pyo')):
        script_filename = script_filename[:-1]

    jobs = []
    try:
        # Jobs are started inside the try block so that the ones already
        # running are still cleaned up if starting a later one fails.
        for target in targets:
            jobs.append(common_utils.BgJob(
                [script_filename, target],
                nickname='adb_keepalive',
                stderr_level=logging.DEBUG,
                stdout_tee=common_utils.TEE_TO_LOGS,
                stderr_tee=common_utils.TEE_TO_LOGS,
                extra_paths=extra_paths))
        yield
    finally:
        # The adb_keepalive.py script runs forever until SIGTERM is sent.
        for job in jobs:
            common_utils.nuke_subprocess(job.sp)
        common_utils.join_bg_jobs(jobs)


@contextlib.contextmanager
def pushd(d):
    """Defines pushd.

    Changes the current working directory to |d| for the duration of the
    context and restores the previous working directory on exit, even if the
    body raises.

    @param d: the directory to change to.
    """
    current = os.getcwd()
    os.chdir(d)
    try:
        yield
    finally:
        os.chdir(current)


# Regular expressions for start/end messages of each test-run chunk.
_ABI_RE = r'arm\S*|x86\S*'
# TODO(kinaba): use the current running module name.
_MODULE_RE = r'\S+'
_START_RE = re.compile(r'(?:Start|Continu)ing (%s) %s with'
                       r' (\d+(?:,\d+)?) test' % (_ABI_RE, _MODULE_RE))
_END_RE = re.compile(r'(%s) %s (?:complet|fail)ed in .*\.'
                     r' (\d+) passed, (\d+) failed, (\d+) not executed' %
                     (_ABI_RE, _MODULE_RE))
_FAIL_RE = re.compile(r'I/ConsoleReporter.* (\S+) fail:')
_INACCURATE_RE = re.compile(r'IMPORTANT: Some modules failed to run to '
                            'completion, tests counts may be inaccurate')


def parse_tradefed_result(result, waivers=None):
    """Check the result from the tradefed output.

    @param result: The result stdout string from the tradefed command.
    @param waivers: a set() of tests which are permitted to fail.
    @return A tuple (waived, accurate). |waived| is the list of the waived
            tests, each name repeated once per waived failure (capped at the
            number of ABIs seen). |accurate| is False if the output contains
            the warning that test counts may be inaccurate, True otherwise.
    """
    abis = set()
    waived_count = {}
    failed_tests = set()
    accurate = True
    for line in result.splitlines():
        match = _START_RE.search(line)
        if match:
            abis.add(match.group(1))
            continue
        match = _END_RE.search(line)
        if match:
            abi = match.group(1)
            if abi not in abis:
                logging.error('Trunk end with %s abi but have not seen '
                              'any trunk start with this abi.(%s)', abi, line)
            continue
        match = _FAIL_RE.search(line)
        if match:
            testname = match.group(1)
            if waivers and testname in waivers:
                waived_count[testname] = waived_count.get(testname, 0) + 1
            else:
                failed_tests.add(testname)
            continue
        # b/66899135, tradefed may report inaccurately with `list results`.
        # Add warning if summary section shows that the result is inaccurate.
        if _INACCURATE_RE.search(line):
            accurate = False

    logging.info('Total ABIs: %s', abis)
    if failed_tests:
        logging.error('Failed (but not waived) tests:\n%s',
                      '\n'.join(sorted(failed_tests)))

    # TODO(dhaddock): Find a more robust way to apply waivers.
    waived = []
    for testname, fail_count in waived_count.items():
        if fail_count > len(abis):
            # This should be an error.TestFail, but unfortunately
            # tradefed has a bug that emits "fail" twice when a
            # test failed during teardown. It will anyway cause
            # a test count inconsistency and be visible on the dashboard.
            logging.error('Found %d failures for %s but there are only %d '
                          'abis: %s', fail_count, testname, len(abis), abis)
            fail_count = len(abis)
        waived += [testname] * fail_count
        logging.info('Waived failure for %s %d time(s)', testname, fail_count)
    logging.info('Total waived = %s', waived)
    return waived, accurate


def select_32bit_java():
    """Switches to 32 bit java if installed (like in lab lxc images) to save
    about 30-40% server/shard memory during the run.

    When running in a container that is not moblab and the 32 bit JVM
    directory exists, sets JAVA_HOME to it and prepends its bin directory to
    PATH. Otherwise the environment is left untouched.
    """
    if utils.is_in_container() and not client_utils.is_moblab():
        java = '/usr/lib/jvm/java-8-openjdk-i386'
        if os.path.exists(java):
            logging.info('Found 32 bit java, switching to use it.')
            os.environ['JAVA_HOME'] = java
            java_bin = os.path.join(java, 'bin')
            # PATH may be unset or empty; do not fail on a missing key and do
            # not leave a trailing separator behind (which would implicitly
            # add the current directory to PATH).
            current_path = os.environ.get('PATH')
            if current_path:
                os.environ['PATH'] = java_bin + os.pathsep + current_path
            else:
                os.environ['PATH'] = java_bin