# Copyright 2007-2010 Google Inc.  Released under the GPL v2
__author__ = "duanes (Duane Sand), pdahl (Peter Dahl)"

# A basic cpuset/cgroup container manager for limiting memory use during tests
#   for use on kernels not running some site-specific container manager

import os, sys, re, glob, fcntl, logging
from autotest_lib.client.bin import utils
from autotest_lib.client.common_lib import error

SUPER_ROOT = ''      # root of all containers or cgroups
NO_LIMIT = (1 << 63) - 1   # containername/memory.limit_in_bytes if no limit

# propio service classes:
PROPIO_PRIO = 1
PROPIO_NORMAL = 2
PROPIO_IDLE = 3

super_root_path = ''    # usually '/dev/cgroup'; '/dev/cpuset' on 2.6.18
cpuset_prefix   = None  # usually 'cpuset.'; '' on 2.6.18
fake_numa_containers = False # container mem via numa=fake mem nodes, else pages
mem_isolation_on = False
node_mbytes = 0         # mbytes in one typical mem node
root_container_bytes = 0  # squishy limit on effective size of root container


def discover_container_style():
    global super_root_path, cpuset_prefix
    global mem_isolation_on, fake_numa_containers
    global node_mbytes, root_container_bytes
    if super_root_path != '':
        return  # already looked up
    if os.path.exists('/dev/cgroup/tasks'):
        # running on 2.6.26 or later kernel with containers on:
        super_root_path = '/dev/cgroup'
        cpuset_prefix = 'cpuset.'
        if get_boot_numa():
            mem_isolation_on = fake_numa_containers = True
        else:  # memcg containers IFF compiled-in & mounted & non-fakenuma boot
            fake_numa_containers = False
            mem_isolation_on = os.path.exists(
                    '/dev/cgroup/memory.limit_in_bytes')
            # TODO: handle possibility of where memcg is mounted as its own
            #       cgroup hierarchy, separate from cpuset??
    elif os.path.exists('/dev/cpuset/tasks'):
        # running on 2.6.18 kernel with containers on:
        super_root_path = '/dev/cpuset'
        cpuset_prefix = ''
        mem_isolation_on = fake_numa_containers = get_boot_numa() != ''
    else:
        # neither cpuset nor cgroup filesystem active:
        super_root_path = None
        cpuset_prefix = 'no_cpusets_or_cgroups_exist'
        mem_isolation_on = fake_numa_containers = False

    logging.debug('mem_isolation: %s', mem_isolation_on)
    logging.debug('fake_numa_containers: %s', fake_numa_containers)
    if fake_numa_containers:
        node_mbytes = int(mbytes_per_mem_node())
    elif mem_isolation_on:  # memcg-style containers
        # For now, limit total of all containers to using just 98% of system's
        #   visible total ram, to avoid oom events at system level, and avoid
        #   page reclaim overhead from going above kswapd highwater mark.
        system_visible_pages = utils.memtotal() >> 2
        usable_pages = int(system_visible_pages * 0.98)
        root_container_bytes = usable_pages << 12
        logging.debug('root_container_bytes: %s',
                      utils.human_format(root_container_bytes))

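# Illustrative summary (comments only; paths per the code above): on a 2.6.26+
# kernel booted with fake NUMA, discover_container_style() is expected to leave
#   super_root_path == '/dev/cgroup', cpuset_prefix == 'cpuset.'
#   mem_isolation_on == fake_numa_containers == True
# while on a 2.6.18 kernel with /dev/cpuset mounted it leaves
#   super_root_path == '/dev/cpuset', cpuset_prefix == ''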

def need_mem_containers():
    discover_container_style()
    if not mem_isolation_on:
        raise error.AutotestError('Mem-isolation containers not enabled '
                                  'by latest reboot')

def need_fake_numa():
    discover_container_style()
    if not fake_numa_containers:
        raise error.AutotestError('numa=fake not enabled by latest reboot')


def full_path(container_name):
    discover_container_style()
    return os.path.join(super_root_path, container_name)


def unpath(container_path):
    return container_path[len(super_root_path)+1:]


def cpuset_attr(container_name, attr):
    discover_container_style()
    return os.path.join(super_root_path, container_name, cpuset_prefix+attr)


def io_attr(container_name, attr):
    discover_container_style()
    # current version assumes shared cgroup hierarchy
    return os.path.join(super_root_path, container_name, 'io.'+attr)


def tasks_path(container_name):
    return os.path.join(full_path(container_name), 'tasks')


def mems_path(container_name):
    return cpuset_attr(container_name, 'mems')


def memory_path(container_name):
    return os.path.join(super_root_path, container_name, 'memory')


def cpus_path(container_name):
    return cpuset_attr(container_name, 'cpus')

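# Illustrative sketch (hypothetical container name 'mytest'): with the cgroup
# layout above mounted at /dev/cgroup, the path helpers resolve to
#   full_path('mytest')   -> '/dev/cgroup/mytest'
#   tasks_path('mytest')  -> '/dev/cgroup/mytest/tasks'
#   mems_path('mytest')   -> '/dev/cgroup/mytest/cpuset.mems'
#   memory_path('mytest') -> '/dev/cgroup/mytest/memory'  (+'.limit_in_bytes')
# On a 2.6.18 /dev/cpuset mount, cpuset_prefix is '' and mems_path() would
# give '/dev/cpuset/mytest/mems' instead.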

def container_exists(name):
    return name is not None and os.path.exists(tasks_path(name))


def move_tasks_into_container(name, tasks):
    task_file = tasks_path(name)
    for task in tasks:
        try:
            logging.debug('moving task %s into container "%s"', task, name)
            utils.write_one_line(task_file, task)
        except Exception:
            if utils.pid_is_alive(task):
                raise   # task exists but couldn't move it
            # task is gone or zombie so ignore this exception


def move_self_into_container(name):
    me = str(os.getpid())
    move_tasks_into_container(name, [me])
    logging.debug('running self (pid %s) in container "%s"', me, name)


def _avail_mbytes_via_nodes(parent):
    # total mbytes of mem nodes available for new containers in parent
    free_nodes = available_exclusive_mem_nodes(parent)
    mbytes = nodes_avail_mbytes(free_nodes)
    # don't have exact model for how container mgr measures mem space
    # better here to underestimate than overestimate
    mbytes = max(mbytes - node_mbytes//2, 0)
    return mbytes


def _avail_bytes_via_pages(parent):
    # Get memory bytes available to parent container which could
    #  be allocated exclusively to new child containers.
    # This excludes mem previously allocated to existing children.
    available = container_bytes(parent)
    mem_files_pattern = os.path.join(full_path(parent),
                                     '*', 'memory.limit_in_bytes')
    for mem_file in glob.glob(mem_files_pattern):
        child_container = unpath(os.path.dirname(mem_file))
        available -= container_bytes(child_container)
    return available


def avail_mbytes(parent=SUPER_ROOT):
    # total mbytes available in parent, for exclusive use in new containers
    if fake_numa_containers:
        return _avail_mbytes_via_nodes(parent)
    else:
        return _avail_bytes_via_pages(parent) >> 20

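# Rough worked example (hypothetical numbers) for the two paths in
# avail_mbytes(): under fake NUMA with eight free 128-mbyte nodes, the node
# path yields max(8*128 - 128//2, 0) == 960 mbytes; under memcg-style
# containers, the page path subtracts each child's memory.limit_in_bytes from
# the parent's byte budget and shifts the remainder right by 20 to get mbytes.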

def delete_leftover_test_containers():
    # recover mems and cores tied up by containers of prior failed tests:
    for child in inner_containers_of(SUPER_ROOT):
        _release_container_nest(child)


def my_lock(lockname):
    # lockname is 'inner'
    lockdir = os.environ['AUTODIR']
    lockname = os.path.join(lockdir, '.cpuset.lock.'+lockname)
    lockfile = open(lockname, 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    return lockfile


def my_unlock(lockfile):
    fcntl.flock(lockfile, fcntl.LOCK_UN)
    lockfile.close()


# Convert '1-3,7,9-12' to set(1,2,3,7,9,10,11,12)
def rangelist_to_set(rangelist):
    result = set()
    if not rangelist:
        return result
    for x in rangelist.split(','):
        if re.match(r'^(\d+)$', x):
            result.add(int(x))
            continue
        m = re.match(r'^(\d+)-(\d+)$', x)
        if m:
            start = int(m.group(1))
            end = int(m.group(2))
            result.update(set(range(start, end+1)))
            continue
        msg = 'Cannot understand data input: %s %s' % (x, rangelist)
        raise ValueError(msg)
    return result


def my_container_name():
    # Get current process's inherited or self-built container name
    #   within /dev/cpuset or /dev/cgroup.  Is '' for root container.
    name = utils.read_one_line('/proc/%i/cpuset' % os.getpid())
    return name[1:]   # strip leading /


def get_mem_nodes(container_name):
    # all mem nodes now available to a container, both exclusive & shared
    file_name = mems_path(container_name)
    if os.path.exists(file_name):
        return rangelist_to_set(utils.read_one_line(file_name))
    else:
        return set()


def _busy_mem_nodes(parent_container):
    # Get set of numa memory nodes now used (exclusively or shared)
    #   by existing children of parent container
    busy = set()
    mem_files_pattern = os.path.join(full_path(parent_container),
                                     '*', cpuset_prefix+'mems')
    for mem_file in glob.glob(mem_files_pattern):
        child_container = os.path.dirname(mem_file)
        busy |= get_mem_nodes(child_container)
    return busy


def available_exclusive_mem_nodes(parent_container):
    # Get subset of numa memory nodes of parent container which could
    #  be allocated exclusively to new child containers.
    # This excludes nodes now allocated to existing children.
    need_fake_numa()
    available = get_mem_nodes(parent_container)
    available -= _busy_mem_nodes(parent_container)
    return available

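# Illustrative sketch (hypothetical values): if the parent's mems file reads
# '0-7' and its existing children together hold nodes {4, 5, 6, 7}, then
# available_exclusive_mem_nodes(parent) returns set([0, 1, 2, 3]).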

def my_mem_nodes():
    # Get set of numa memory nodes owned by current process's container.
    discover_container_style()
    if not mem_isolation_on:
        return set()    # as expected by vmstress
    return get_mem_nodes(my_container_name())


def my_available_exclusive_mem_nodes():
    # Get subset of numa memory nodes owned by current process's
    # container, which could be allocated exclusively to new child
    # containers.  This excludes any nodes now allocated
    # to existing children.
    return available_exclusive_mem_nodes(my_container_name())


def node_avail_kbytes(node):
    return node_mbytes << 10  # crude; fixed numa node size


def nodes_avail_mbytes(nodes):
    # nodes' combined user+avail size, in Mbytes
    return sum(node_avail_kbytes(n) for n in nodes) // 1024


def container_bytes(name):
    if fake_numa_containers:
        return nodes_avail_mbytes(get_mem_nodes(name)) << 20
    else:
        while True:
            file = memory_path(name) + '.limit_in_bytes'
            limit = int(utils.read_one_line(file))
            if limit < NO_LIMIT:
                return limit
            if name == SUPER_ROOT:
                return root_container_bytes
            name = os.path.dirname(name)

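# Illustrative memcg walk-up (hypothetical layout): for a container
# 'outer/inner' whose own memory.limit_in_bytes still reads NO_LIMIT,
# container_bytes() retries with 'outer'; if every ancestor up to SUPER_ROOT
# is also unlimited, the loop finally returns root_container_bytes.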

def container_mbytes(name):
    return container_bytes(name) >> 20


def mbytes_per_mem_node():
    # Get mbyte size of standard numa mem node, as float
    #  (some nodes are bigger than this)
    # Replaces utils.node_size().
    numa = get_boot_numa()
    if numa.endswith('M'):
        return float(numa[:-1])  # mbyte size of fake nodes
    elif numa:
        nodecnt = int(numa)  # fake numa mem nodes for container isolation
    else:
        nodecnt = len(utils.numa_nodes())  # phys mem-controller nodes
    # Use guessed total physical mem size, not kernel's
    #   lesser 'available memory' after various system tables.
    return utils.rounded_memtotal() / (nodecnt * 1024.0)

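# Worked example (hypothetical boot, taking rounded_memtotal() as kbytes):
# with numa=fake=128M this returns 128.0 directly; with numa=fake=16 on an
# 8 GB machine (rounded_memtotal() == 8388608), it returns
# 8388608 / (16 * 1024.0) == 512.0 mbytes per fake node.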

def get_cpus(container_name):
    file_name = cpus_path(container_name)
    if os.path.exists(file_name):
        return rangelist_to_set(utils.read_one_line(file_name))
    else:
        return set()


def get_tasks(container_name):
    file_name = tasks_path(container_name)
    try:
        tasks = [x.rstrip() for x in open(file_name).readlines()]
    except IOError:
        if os.path.exists(file_name):
            raise
        tasks = []   # container doesn't exist anymore
    return tasks


def inner_containers_of(parent):
    pattern = os.path.join(full_path(parent), '*/tasks')
    return [unpath(os.path.dirname(task_file))
            for task_file in glob.glob(pattern)]


def _release_container_nest(nest):
    # Destroy a container, and any nested sub-containers
    nest_path = full_path(nest)
    if os.path.exists(nest_path):

        # bottom-up walk of tree, releasing all nested sub-containers
        for child in inner_containers_of(nest):
            _release_container_nest(child)

        logging.debug("releasing container %s", nest)

        # Transfer any survivor tasks (e.g. self) to parent container
        parent = os.path.dirname(nest)
        move_tasks_into_container(parent, get_tasks(nest))

        # remove the now-empty outermost container of this nest
        if os.path.exists(nest_path):
            os.rmdir(nest_path)  # nested, or dead manager


def release_container(container_name=None):
    # Destroy a container
    my_container = my_container_name()
    if container_name is None:
        container_name = my_container
    _release_container_nest(container_name)
    displaced = my_container_name()
    if displaced != my_container:
        logging.debug('now running self (pid %d) in container "%s"',
                      os.getpid(), displaced)


def remove_empty_prio_classes(prios):
    # remove prio classes whose set of allowed priorities is empty
    #    e.g  'no:3;rt:;be:3;id:'  -->  'no:3;be:3'
    return ';'.join(p for p in prios.split(';') if p.split(':')[1])


def all_drive_names():
    # list of all disk drives sda,sdb,...
    paths = glob.glob('/sys/block/sd*')
    if not paths:
        paths = glob.glob('/sys/block/hd*')
    return [os.path.basename(path) for path in paths]


def set_io_controls(container_name, disks=[], ioprio_classes=[PROPIO_NORMAL],
                    io_shares=[95], io_limits=[0]):
    # set the propio controls for one container, for selected disks
    # writing directly to /dev/cgroup/container_name/io.io_service_level
    #    without using containerd or container.py
    # See wiki ProportionalIOScheduler for definitions
    # ioprio_classes: list of service classes, one per disk
    #    using numeric propio service classes as used by kernel API, namely
    #       1: RT, Real Time, aka PROPIO_PRIO
    #       2: BE, Best Effort, aka PROPIO_NORMAL
    #       3: PROPIO_IDLE
    # io_shares: list of disk-time-fractions, one per disk,
    #       as percentage integer 0..100
    # io_limits: list of limit on/off, one per disk
    #       0: no limit, shares use of other containers' unused disk time
    #       1: limited, container's use of disk time is capped to given DTF
    # ioprio_classes defaults to best-effort
    # io_limit defaults to no limit, use slack time
    if not disks:  # defaults to all drives
        disks = all_drive_names()
        io_shares      = [io_shares     [0]] * len(disks)
        ioprio_classes = [ioprio_classes[0]] * len(disks)
        io_limits      = [io_limits     [0]] * len(disks)
    if not (len(disks) == len(ioprio_classes) and len(disks) == len(io_shares)
                                              and len(disks) == len(io_limits)):
        raise error.AutotestError('Unequal number of values for io controls')
    service_level = io_attr(container_name, 'io_service_level')
    if not os.path.exists(service_level):
        return  # kernel predates propio features
            # or io cgroup is mounted separately from cpusets
    disk_infos = []
    for disk,ioclass,limit,share in zip(disks, ioprio_classes,
                                        io_limits, io_shares):
        parts = (disk, str(ioclass), str(limit), str(share))
        disk_info = ' '.join(parts)
        utils.write_one_line(service_level, disk_info)
        disk_infos.append(disk_info)
    logging.debug('set_io_controls of %s to %s',
                  container_name, ', '.join(disk_infos))

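# Illustrative call (hypothetical container and disk names): to give one
# container a best-effort 50% share of sda's disk time, uncapped:
#   set_io_controls('mytest', disks=['sda'], ioprio_classes=[PROPIO_NORMAL],
#                   io_shares=[50], io_limits=[0])
# If the io.io_service_level file exists, that writes the line 'sda 2 0 50'.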

def abbrev_list(vals):
    """Condense unsigned (0,4,5,6,7,10) to '0,4-7,10'."""
    ranges = []
    lower = 0
    upper = -2
    for val in sorted(vals)+[-1]:
        if val != upper+1:
            if lower == upper:
                ranges.append(str(lower))
            elif lower <= upper:
                ranges.append('%d-%d' % (lower, upper))
            lower = val
        upper = val
    return ','.join(ranges)


def create_container_with_specific_mems_cpus(name, mems, cpus):
    need_fake_numa()
    os.mkdir(full_path(name))
    utils.write_one_line(cpuset_attr(name, 'mem_hardwall'), '1')
    utils.write_one_line(mems_path(name), ','.join(map(str, mems)))
    utils.write_one_line(cpus_path(name), ','.join(map(str, cpus)))
    logging.debug('container %s has %d cpus and %d nodes totalling %s bytes',
                  name, len(cpus), len(get_mem_nodes(name)),
                  utils.human_format(container_bytes(name)) )


def create_container_via_memcg(name, parent, bytes, cpus):
    # create container via direct memcg cgroup writes
    os.mkdir(full_path(name))
    nodes = utils.read_one_line(mems_path(parent))
    utils.write_one_line(mems_path(name), nodes)  # inherit parent's nodes
    utils.write_one_line(memory_path(name)+'.limit_in_bytes', str(bytes))
    utils.write_one_line(cpus_path(name), ','.join(map(str, cpus)))
    logging.debug('Created container %s directly via memcg,'
                  ' has %d cpus and %s bytes',
                  name, len(cpus), utils.human_format(container_bytes(name)))


def _create_fake_numa_container_directly(name, parent, mbytes, cpus):
    need_fake_numa()
    lockfile = my_lock('inner')   # serialize race between parallel tests
    try:
        # Pick specific mem nodes for new cpuset's exclusive use
        # For now, arbitrarily pick highest available node numbers
        needed_kbytes = mbytes * 1024
        nodes = sorted(list(available_exclusive_mem_nodes(parent)))
        kbytes = 0
        nodecnt = 0
        while kbytes < needed_kbytes and nodecnt < len(nodes):
            nodecnt += 1
            kbytes += node_avail_kbytes(nodes[-nodecnt])
        if kbytes < needed_kbytes:
            parent_mbytes = container_mbytes(parent)
            if mbytes > parent_mbytes:
                raise error.AutotestError(
                      "New container's %d Mbytes exceeds "
                      "parent container's %d Mbyte size"
                      % (mbytes, parent_mbytes) )
            else:
                raise error.AutotestError(
                      "Existing sibling containers hold "
                      "%d Mbytes needed by new container"
                      % ((needed_kbytes - kbytes)//1024) )
        mems = nodes[-nodecnt:]

        create_container_with_specific_mems_cpus(name, mems, cpus)
    finally:
        my_unlock(lockfile)

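# Illustrative node pick (hypothetical sizes): requesting mbytes=300 when the
# free nodes are [0, 1, 2, 3] at 128 mbytes each makes the loop above take the
# three highest-numbered nodes (384 mbytes >= 300), so mems == [1, 2, 3]; if
# even all free nodes cannot cover the request, one of the two AutotestError
# messages reports whether the parent is too small or siblings hold the memory.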

def create_container_directly(name, mbytes, cpus):
    parent = os.path.dirname(name)
    if fake_numa_containers:
        _create_fake_numa_container_directly(name, parent, mbytes, cpus)
    else:
        create_container_via_memcg(name, parent, mbytes<<20, cpus)


def create_container_with_mbytes_and_specific_cpus(name, mbytes,
                cpus=None, root=SUPER_ROOT, io={}, move_in=True, timeout=0):
    """\
    Create a cpuset container and move job's current pid into it
    Allocate the list "cpus" of cpus to that container

            name = arbitrary string tag
            mbytes = requested memory for job in megabytes
            cpus = list of cpu indices to associate with the cpuset
                  defaults to all cpus avail with given root
            root = the parent cpuset to nest this new set within
                   '': unnested top-level container
            io = arguments for proportional IO containers
            move_in = True: Move current process into the new container now.
            timeout = must be 0: persist until explicitly deleted.
    """
    need_mem_containers()
    if not container_exists(root):
        raise error.AutotestError('Parent container "%s" does not exist'
                                   % root)
    if cpus is None:
        # default to biggest container we can make under root
        cpus = get_cpus(root)
    else:
        cpus = set(cpus)  # interface uses list
    if not cpus:
        raise error.AutotestError('Creating container with no cpus')
    name = os.path.join(root, name)  # path relative to super_root
    if os.path.exists(full_path(name)):
        raise error.AutotestError('Container %s already exists' % name)
    create_container_directly(name, mbytes, cpus)
    set_io_controls(name, **io)
    if move_in:
        move_self_into_container(name)
    return name

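# Typical usage sketch (hypothetical names and sizes) from a test wanting its
# own 512-mbyte, two-cpu sandbox directly under the root container:
#   box = create_container_with_mbytes_and_specific_cpus(
#           'my_test_box', mbytes=512, cpus=[0, 1])
#   try:
#       pass  # run the memory-limited workload here
#   finally:
#       release_container(box)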

def get_boot_numa():
    # get boot-time numa=fake=xyz option for current boot
    #   eg  numa=fake=nnn,  numa=fake=nnnM, or nothing
    label = 'numa=fake='
    for arg in utils.read_one_line('/proc/cmdline').split():
        if arg.startswith(label):
            return arg[len(label):]
    return ''
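# Illustrative parse (hypothetical cmdline): booting with
#   root=/dev/sda1 numa=fake=128M ro
# makes get_boot_numa() return '128M', which discover_container_style() treats
# as fake-NUMA mode and mbytes_per_mem_node() reads as 128.0 mbytes per node.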
    545