/* SPDX-License-Identifier: GPL-2.0 */
      2 /*
      3  * Copyright (C) 2017 Google, Inc.
      4  *
      5  */
      6 
      7 #ifndef _UAPI_LINUX_VSOC_SHM_H
      8 #define _UAPI_LINUX_VSOC_SHM_H
      9 
#include <linux/ioctl.h>
#include <linux/types.h>
     11 
/**
 * A permission is a token that permits a receiver to read and/or write an area
 * of memory within a Vsoc region.
 *
 * An fd_scoped permission grants both read and write access, and can be
 * attached to a file description (see open(2)).
 * Ownership of the area can then be shared by passing a file descriptor
 * among processes.
 *
 * begin_offset and end_offset define the area of memory that is controlled by
 * the permission. owner_offset points to a word, also in shared memory, that
 * controls ownership of the area.
 *
 * Ownership of the area expires when the associated file description is
 * released.
 *
 * At most one permission can be attached to each file description.
 *
 * This is useful when implementing HALs like gralloc that scope and pass
 * ownership of shared resources via file descriptors.
 *
 * The caller is responsible for doing any fencing.
 *
 * The calling process will normally identify a currently free area of
 * memory. It will construct a proposed fd_scoped_permission_arg structure:
 *
 *   begin_offset and end_offset describe the area being claimed
 *
 *   owner_offset points to the location in shared memory that indicates the
 *   owner of the area.
 *
 *   owned_value is the value that will be stored in owner_offset iff the
 *   permission can be granted. It must be different from VSOC_REGION_FREE.
 *
 * Two fd_scoped_permission structures are compatible if they vary only by
 * their owned_value fields.
 *
 * The driver ensures that, for any group of simultaneous callers proposing
 * compatible fd_scoped_permissions, it will accept exactly one of the
 * proposals. The other callers will get a failure with errno of EAGAIN.
 *
 * A process receiving a file descriptor can identify the region being
 * granted using the VSOC_GET_FD_SCOPED_PERMISSION ioctl.
 */
struct fd_scoped_permission {
	/* Start of the area of memory controlled by the permission */
	__u32 begin_offset;
	/* End of the area of memory controlled by the permission */
	__u32 end_offset;
	/* Location, in shared memory, of the word that records the owner */
	__u32 owner_offset;
	/* Value stored at owner_offset iff the permission is granted */
	__u32 owned_value;
};
     62 
/*
 * This value represents a free area of memory. The driver expects to find
 * this value at owner_offset when creating a permission; otherwise it will
 * not create the permission. The driver writes this value back once the
 * permission is no longer needed.
 */
#define VSOC_REGION_FREE ((__u32)0)
     69 
/**
 * ioctl argument for VSOC_CREATE_FD_SCOPED_PERMISSION
 */
struct fd_scoped_permission_arg {
	/* The proposed permission */
	struct fd_scoped_permission perm;
	/*
	 * NOTE(review): presumably a file descriptor opened on the managed
	 * region that the permission should be attached to - confirm against
	 * the driver.
	 */
	__s32 managed_region_fd;
};
     77 
/*
 * NOTE(review): presumably the value of an unused signal table entry (the
 * signal table protocol treats 0 as "no futex offset posted") - confirm
 * against the driver.
 */
#define VSOC_NODE_FREE ((__u32)0)
     79 
/*
 * Describes a signal table in shared memory. Each non-zero entry in the
 * table indicates that the receiver should signal the futex at the given
 * offset. Offsets are relative to the region, not the shared memory window.
 *
 * interrupt_signalled_offset is used to reliably signal interrupts across the
 * vmm boundary. There are two roles: transmitter and receiver. For example,
 * in the host_to_guest_signal_table the host is the transmitter and the
 * guest is the receiver. The protocol is as follows:
 *
 * 1. The transmitter should convert the offset of the futex to an offset
 *    in the signal table [0, (1 << num_nodes_lg2))
 *    The transmitter can choose any appropriate hashing algorithm, including
 *    hash = futex_offset & ((1 << num_nodes_lg2) - 1)
 *
 * 2. The transmitter should atomically compare and swap futex_offset with 0
 *    at hash. There are 3 possible outcomes:
 *      a. The swap fails because the futex_offset is already in the table.
 *         The transmitter should stop.
 *      b. Some other offset is in the table. This is a hash collision. The
 *         transmitter should move to another table slot and try again. One
 *         possible algorithm:
 *         hash = (hash + 1) & ((1 << num_nodes_lg2) - 1)
 *      c. The swap worked. Continue below.
 *
 * 3. The transmitter atomically swaps 1 with the value at the
 *    interrupt_signalled_offset. There are two outcomes:
 *      a. The prior value was 1. In this case an interrupt has already been
 *         posted. The transmitter is done.
 *      b. The prior value was 0, indicating that the receiver may be sleeping.
 *         The transmitter will issue an interrupt.
 *
 * 4. On waking the receiver immediately exchanges a 0 with the
 *    interrupt_signalled_offset. If it receives a 0 then this is a spurious
 *    interrupt. That may occasionally happen in the current protocol, but
 *    should be rare.
 *
 * 5. The receiver scans the signal table by atomically exchanging 0 at each
 *    location. If a non-zero offset is returned from the exchange the
 *    receiver wakes all sleepers at the given offset:
 *      futex((int*)(region_base + old_value), FUTEX_WAKE, INT_MAX);
 *
 * 6. The receiver thread then does a conditional wait, waking immediately
 *    if the value at interrupt_signalled_offset is non-zero. This catches
 *    cases where additional signals were posted while the table was being
 *    scanned. On the guest the wait is handled via the
 *    VSOC_WAIT_FOR_INCOMING_INTERRUPT ioctl.
 */
struct vsoc_signal_table_layout {
	/* log_2(Number of signal table entries) */
	__u32 num_nodes_lg2;
	/*
	 * Offset to the first signal table entry relative to the start of the
	 * region
	 */
	__u32 futex_uaddr_table_offset;
	/*
	 * Offset to an atomic_t / atomic uint32_t. A non-zero value indicates
	 * that one or more offsets are currently posted in the table.
	 *
	 * NOTE(review): the original comment ended with the dangling fragment
	 * "semi-unique access to an entry in the table"; its intended meaning
	 * is unclear.
	 */
	__u32 interrupt_signalled_offset;
};
    143 
/*
 * Value of vsoc_device_region.managed_by for a region that is not subdivided:
 * any process that can open the region's device node gains complete access.
 * NOTE(review): this constant is __s32 while managed_by is __u32.
 */
#define VSOC_REGION_WHOLE ((__s32)0)
/* Size of vsoc_device_region.device_name, including the terminating '\0' */
#define VSOC_DEVICE_NAME_SZ 16
    146 
/**
 * Each HAL would (usually) talk to a single device region.
 * Multiple entities care about these regions:
 * - The ivshmem_server will populate the regions in shared memory
 * - The guest kernel will read the region, create minor device nodes, and
 *   allow interested parties to register for FUTEX_WAKE events in the region
 * - HALs will access via the minor device nodes published by the guest kernel
 * - Host side processes will access the region via the ivshmem_server:
 *   1. Pass name to ivshmem_server at a UNIX socket
 *   2. ivshmem_server will reply with:
 *     - host->guest doorbell fd
 *     - guest->host doorbell fd
 *     - fd for the shared memory region
 *     - region offset
 *   3. Start a futex receiver thread on the doorbell fd pointed at the
 *      signal_nodes
 */
struct vsoc_device_region {
	/*
	 * Per-region versioning, left for the HALs and host processes; the
	 * driver does not consult these two fields.
	 */
	__u16 current_version;
	__u16 min_compatible_version;
	/*
	 * NOTE(review): presumably the bounds of the region within the shared
	 * memory window, and the offset of the region's payload - confirm
	 * what each offset is relative to.
	 */
	__u32 region_begin_offset;
	__u32 region_end_offset;
	__u32 offset_of_region_data;
	/* Signal tables for each direction; see vsoc_signal_table_layout */
	struct vsoc_signal_table_layout guest_to_host_signal_table;
	struct vsoc_signal_table_layout host_to_guest_signal_table;
	/* Name of the device. Must always be terminated with a '\0', so
	 * the longest supported device name is 15 characters.
	 */
	char device_name[VSOC_DEVICE_NAME_SZ];
	/* There are two ways that permissions to access regions are handled:
	 *   - When managed_by is VSOC_REGION_WHOLE, any process that can
	 *     open the device node for the region gains complete access to it.
	 *   - When managed_by references another region, processes that open
	 *     this region cannot access it. Access to a sub-region must be
	 *     established by invoking the VSOC_CREATE_FD_SCOPED_PERMISSION
	 *     ioctl on the region referenced in managed_by, providing a file
	 *     instance (represented by a fd) opened on this region.
	 */
	__u32 managed_by;
};
    187 
/*
 * The vsoc layout descriptor.
 * The first 4K should be reserved for the shm header and region descriptors.
 * The regions should be page aligned.
 */

struct vsoc_shm_layout_descriptor {
	/*
	 * Layout version; see CURRENT_VSOC_LAYOUT_MAJOR_VERSION and
	 * CURRENT_VSOC_LAYOUT_MINOR_VERSION.
	 */
	__u16 major_version;
	__u16 minor_version;

	/* Size of the shm. This may be redundant, but is nice to have. */
	__u32 size;

	/* Number of shared memory regions */
	__u32 region_count;

	/* The offset to the start of the region descriptors */
	__u32 vsoc_region_desc_offset;
};
    207 
/*
 * This specifies the current version that should be stored in
 * vsoc_shm_layout_descriptor.major_version and
 * vsoc_shm_layout_descriptor.minor_version.
 * It should be updated only if the vsoc_device_region or
 * vsoc_shm_layout_descriptor structures have changed.
 *
 * Versioning within each region is conveyed separately, via the
 * min_compatible_version and current_version fields in vsoc_device_region.
 * The driver does not consult those fields: they are left for the HALs and
 * host processes and will change independently of the layout version.
 */
#define CURRENT_VSOC_LAYOUT_MAJOR_VERSION 2
#define CURRENT_VSOC_LAYOUT_MINOR_VERSION 0
    222 
/*
 * Create, or query, the fd_scoped_permission attached to a file description.
 *
 * NOTE(review): struct fd_scoped_permission_arg is documented as the argument
 * of VSOC_CREATE_FD_SCOPED_PERMISSION, but the request number below encodes
 * sizeof(struct fd_scoped_permission). Changing it would change the ioctl
 * number (an ABI break), so confirm against the driver before touching it.
 */
#define VSOC_CREATE_FD_SCOPED_PERMISSION \
	_IOW(0xF5, 0, struct fd_scoped_permission)
#define VSOC_GET_FD_SCOPED_PERMISSION _IOR(0xF5, 1, struct fd_scoped_permission)
    226 
/*
 * This is used to signal the host to scan the guest_to_host_signal_table
 * for new futexes to wake. This sends an interrupt only if one is not
 * already in flight.
 */
#define VSOC_MAYBE_SEND_INTERRUPT_TO_HOST _IO(0xF5, 2)

/*
 * This is the guest's wait in the signal table protocol. When this returns
 * the guest will scan host_to_guest_signal_table to check for new futexes
 * to wake.
 */
/* TODO(ghartman): Consider moving this to the bottom half */
#define VSOC_WAIT_FOR_INCOMING_INTERRUPT _IO(0xF5, 3)

/*
 * Guest HALs will use this to retrieve the region description after
 * opening their device node.
 */
#define VSOC_DESCRIBE_REGION _IOR(0xF5, 4, struct vsoc_device_region)

/*
 * Wake any threads that may be waiting for a host interrupt on this region.
 * This is mostly used during shutdown.
 */
#define VSOC_SELF_INTERRUPT _IO(0xF5, 5)

/*
 * This is used to signal the host to scan the guest_to_host_signal_table
 * for new futexes to wake. Unlike VSOC_MAYBE_SEND_INTERRUPT_TO_HOST, this
 * sends an interrupt unconditionally.
 */
#define VSOC_SEND_INTERRUPT_TO_HOST _IO(0xF5, 6)
    258 
/*
 * Kinds of conditional wait.
 * NOTE(review): presumably the valid values of vsoc_cond_wait.wait_type -
 * confirm against the driver.
 */
enum wait_types {
	VSOC_WAIT_UNDEFINED = 0,
	/* presumably: wait while the word at offset equals value - confirm */
	VSOC_WAIT_IF_EQUAL = 1,
	/* presumably: as above, but also bounded by the wake time - confirm */
	VSOC_WAIT_IF_EQUAL_TIMEOUT = 2
};
    264 
/*
 * Argument for VSOC_COND_WAIT: wait for a condition to be true.
 *
 * Note, this is sized and aligned so the 32 bit and 64 bit layouts are
 * identical.
 */
struct vsoc_cond_wait {
	/* Input: Offset of the 32 bit word to check */
	__u32 offset;
	/* Input: Value that will be compared with the word at offset */
	__u32 value;
	/* Input: Monotonic time to wake at in seconds */
	__u64 wake_time_sec;
	/*
	 * Input: Monotonic time to wait in nanoseconds.
	 * NOTE(review): presumably the nanoseconds part of the wake time
	 * given in wake_time_sec rather than a relative delay - confirm.
	 */
	__u32 wake_time_nsec;
	/* Input: Type of wait */
	__u32 wait_type;
	/* Output: Number of times the thread woke before returning. */
	__u32 wakes;
	/* Ensure that we're 8-byte aligned and 8 byte length for 32/64 bit
	 * compatibility.
	 */
	__u32 reserved_1;
};
    289 
/*
 * Perform the wait described by the struct vsoc_cond_wait given in arg.
 * The structure carries both input and output fields.
 */
#define VSOC_COND_WAIT _IOWR(0xF5, 7, struct vsoc_cond_wait)

/* Wake any local threads waiting at the offset given in arg */
#define VSOC_COND_WAKE _IO(0xF5, 8)
    294 
    295 #endif /* _UAPI_LINUX_VSOC_SHM_H */
    296