Home | History | Annotate | Download | only in services
      1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "sandbox/linux/services/credentials.h"
      6 
      7 #include <errno.h>
      8 #include <limits.h>
      9 #include <signal.h>
     10 #include <stddef.h>
     11 #include <stdint.h>
     12 #include <stdio.h>
     13 #include <sys/syscall.h>
     14 #include <sys/types.h>
     15 #include <sys/wait.h>
     16 #include <unistd.h>
     17 
     18 #include "base/bind.h"
     19 #include "base/compiler_specific.h"
     20 #include "base/files/file_path.h"
     21 #include "base/files/file_util.h"
     22 #include "base/logging.h"
     23 #include "base/macros.h"
     24 #include "base/posix/eintr_wrapper.h"
     25 #include "base/process/launch.h"
     26 #include "base/third_party/valgrind/valgrind.h"
     27 #include "build/build_config.h"
     28 #include "sandbox/linux/services/namespace_utils.h"
     29 #include "sandbox/linux/services/proc_util.h"
     30 #include "sandbox/linux/services/syscall_wrappers.h"
     31 #include "sandbox/linux/services/thread_helpers.h"
     32 #include "sandbox/linux/system_headers/capability.h"
     33 #include "sandbox/linux/system_headers/linux_signal.h"
     34 
     35 namespace sandbox {
     36 
     37 namespace {
     38 
     39 bool IsRunningOnValgrind() { return RUNNING_ON_VALGRIND; }
     40 
     41 // Checks that the set of RES-uids and the set of RES-gids have
     42 // one element each and return that element in |resuid| and |resgid|
     43 // respectively. It's ok to pass NULL as one or both of the ids.
     44 bool GetRESIds(uid_t* resuid, gid_t* resgid) {
     45   uid_t ruid, euid, suid;
     46   gid_t rgid, egid, sgid;
     47   PCHECK(sys_getresuid(&ruid, &euid, &suid) == 0);
     48   PCHECK(sys_getresgid(&rgid, &egid, &sgid) == 0);
     49   const bool uids_are_equal = (ruid == euid) && (ruid == suid);
     50   const bool gids_are_equal = (rgid == egid) && (rgid == sgid);
     51   if (!uids_are_equal || !gids_are_equal) return false;
     52   if (resuid) *resuid = euid;
     53   if (resgid) *resgid = egid;
     54   return true;
     55 }
     56 
     57 const int kExitSuccess = 0;
     58 
// Child entry point handed to clone() by ChrootToSafeEmptyDir(). It chroots
// to /proc/self/fdinfo/, changes the working directory to the new root and
// then terminates with kExitSuccess; it never returns to its caller. The
// void* argument is unused.
#if defined(__clang__)
// Disable sanitizers that rely on TLS and may write to non-stack memory.
__attribute__((no_sanitize_address))
__attribute__((no_sanitize_thread))
__attribute__((no_sanitize_memory))
#endif
int ChrootToSelfFdinfo(void*) {
  // This function can be run from a vforked child, so it should not write to
  // any memory other than the stack or errno. Reads from TLS may be different
  // from in the parent process.
  RAW_CHECK(sys_chroot("/proc/self/fdinfo/") == 0);

  // CWD is essentially an implicit file descriptor, so be careful to not
  // leave it behind.
  RAW_CHECK(chdir("/") == 0);
  // Use _exit() so the child terminates immediately with the success status
  // that the parent checks for.
  _exit(kExitSuccess);
}
     76 
     77 // chroot() to an empty dir that is "safe". To be safe, it must not contain
     78 // any subdirectory (chroot-ing there would allow a chroot escape) and it must
     79 // be impossible to create an empty directory there.
     80 // We achieve this by doing the following:
     81 // 1. We create a new process sharing file system information.
     82 // 2. In the child, we chroot to /proc/self/fdinfo/
     83 // This is already "safe", since fdinfo/ does not contain another directory and
     84 // one cannot create another directory there.
     85 // 3. The process dies
     86 // After (3) happens, the directory is not available anymore in /proc.
     87 bool ChrootToSafeEmptyDir() {
     88   // We need to chroot to a fdinfo that is unique to a process and have that
     89   // process die.
     90   // 1. We don't want to simply fork() because duplicating the page tables is
     91   // slow with a big address space.
     92   // 2. We do not use a regular thread (that would unshare CLONE_FILES) because
     93   // when we are in a PID namespace, we cannot easily get a handle to the
     94   // /proc/tid directory for the thread (since /proc may not be aware of the
     95   // PID namespace). With a process, we can just use /proc/self.
     96   pid_t pid = -1;
     97   char stack_buf[PTHREAD_STACK_MIN] ALIGNAS(16);
     98 #if defined(ARCH_CPU_X86_FAMILY) || defined(ARCH_CPU_ARM_FAMILY) || \
     99     defined(ARCH_CPU_MIPS_FAMILY)
    100   // The stack grows downward.
    101   void* stack = stack_buf + sizeof(stack_buf);
    102 #else
    103 #error "Unsupported architecture"
    104 #endif
    105 
    106   int clone_flags = CLONE_FS | LINUX_SIGCHLD;
    107   void* tls = nullptr;
    108 #if defined(ARCH_CPU_X86_64) || defined(ARCH_CPU_ARM_FAMILY)
    109   // Use CLONE_VM | CLONE_VFORK as an optimization to avoid copying page tables.
    110   // Since clone writes to the new child's TLS before returning, we must set a
    111   // new TLS to avoid corrupting the current process's TLS. On ARCH_CPU_X86,
    112   // glibc performs syscalls by calling a function pointer in TLS, so we do not
    113   // attempt this optimization.
    114   clone_flags |= CLONE_VM | CLONE_VFORK | CLONE_SETTLS;
    115 
    116   char tls_buf[PTHREAD_STACK_MIN] = {0};
    117   tls = tls_buf;
    118 #endif
    119 
    120   pid = clone(ChrootToSelfFdinfo, stack, clone_flags, nullptr, nullptr, tls,
    121               nullptr);
    122   PCHECK(pid != -1);
    123 
    124   int status = -1;
    125   PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid);
    126 
    127   return WIFEXITED(status) && WEXITSTATUS(status) == kExitSuccess;
    128 }
    129 
    130 // CHECK() that an attempt to move to a new user namespace raised an expected
    131 // errno.
    132 void CheckCloneNewUserErrno(int error) {
    133   // EPERM can happen if already in a chroot. EUSERS if too many nested
    134   // namespaces are used. EINVAL for kernels that don't support the feature.
    135   // Valgrind will ENOSYS unshare().
    136   PCHECK(error == EPERM || error == EUSERS || error == EINVAL ||
    137          error == ENOSYS);
    138 }
    139 
    140 // Converts a Capability to the corresponding Linux CAP_XXX value.
    141 int CapabilityToKernelValue(Credentials::Capability cap) {
    142   switch (cap) {
    143     case Credentials::Capability::SYS_CHROOT:
    144       return CAP_SYS_CHROOT;
    145     case Credentials::Capability::SYS_ADMIN:
    146       return CAP_SYS_ADMIN;
    147   }
    148 
    149   LOG(FATAL) << "Invalid Capability: " << static_cast<int>(cap);
    150   return 0;
    151 }
    152 
    153 }  // namespace.
    154 
    155 // static
    156 bool Credentials::DropAllCapabilities(int proc_fd) {
    157   if (!SetCapabilities(proc_fd, std::vector<Capability>())) {
    158     return false;
    159   }
    160 
    161   CHECK(!HasAnyCapability());
    162   return true;
    163 }
    164 
    165 // static
    166 bool Credentials::DropAllCapabilities() {
    167   base::ScopedFD proc_fd(ProcUtil::OpenProc());
    168   return Credentials::DropAllCapabilities(proc_fd.get());
    169 }
    170 
    171 // static
    172 bool Credentials::DropAllCapabilitiesOnCurrentThread() {
    173   return SetCapabilitiesOnCurrentThread(std::vector<Capability>());
    174 }
    175 
    176 // static
    177 bool Credentials::SetCapabilitiesOnCurrentThread(
    178     const std::vector<Capability>& caps) {
    179   struct cap_hdr hdr = {};
    180   hdr.version = _LINUX_CAPABILITY_VERSION_3;
    181   struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};
    182 
    183   // Initially, cap has no capability flags set. Enable the effective and
    184   // permitted flags only for the requested capabilities.
    185   for (const Capability cap : caps) {
    186     const int cap_num = CapabilityToKernelValue(cap);
    187     const size_t index = CAP_TO_INDEX(cap_num);
    188     const uint32_t mask = CAP_TO_MASK(cap_num);
    189     data[index].effective |= mask;
    190     data[index].permitted |= mask;
    191   }
    192 
    193   return sys_capset(&hdr, data) == 0;
    194 }
    195 
    196 // static
    197 bool Credentials::SetCapabilities(int proc_fd,
    198                                   const std::vector<Capability>& caps) {
    199   DCHECK_LE(0, proc_fd);
    200 
    201 #if !defined(THREAD_SANITIZER)
    202   // With TSAN, accept to break the security model as it is a testing
    203   // configuration.
    204   CHECK(ThreadHelpers::IsSingleThreaded(proc_fd));
    205 #endif
    206 
    207   return SetCapabilitiesOnCurrentThread(caps);
    208 }
    209 
    210 bool Credentials::HasAnyCapability() {
    211   struct cap_hdr hdr = {};
    212   hdr.version = _LINUX_CAPABILITY_VERSION_3;
    213   struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};
    214 
    215   PCHECK(sys_capget(&hdr, data) == 0);
    216 
    217   for (size_t i = 0; i < arraysize(data); ++i) {
    218     if (data[i].effective || data[i].permitted || data[i].inheritable) {
    219       return true;
    220     }
    221   }
    222 
    223   return false;
    224 }
    225 
    226 bool Credentials::HasCapability(Capability cap) {
    227   struct cap_hdr hdr = {};
    228   hdr.version = _LINUX_CAPABILITY_VERSION_3;
    229   struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};
    230 
    231   PCHECK(sys_capget(&hdr, data) == 0);
    232 
    233   const int cap_num = CapabilityToKernelValue(cap);
    234   const size_t index = CAP_TO_INDEX(cap_num);
    235   const uint32_t mask = CAP_TO_MASK(cap_num);
    236 
    237   return (data[index].effective | data[index].permitted |
    238           data[index].inheritable) &
    239          mask;
    240 }
    241 
    242 // static
    243 bool Credentials::CanCreateProcessInNewUserNS() {
    244   // Valgrind will let clone(2) pass-through, but doesn't support unshare(),
    245   // so always consider UserNS unsupported there.
    246   if (IsRunningOnValgrind()) {
    247     return false;
    248   }
    249 
    250 #if defined(THREAD_SANITIZER)
    251   // With TSAN, processes will always have threads running and can never
    252   // enter a new user namespace with MoveToNewUserNS().
    253   return false;
    254 #endif
    255 
    256   // This is roughly a fork().
    257   const pid_t pid = sys_clone(CLONE_NEWUSER | SIGCHLD, 0, 0, 0, 0);
    258 
    259   if (pid == -1) {
    260     CheckCloneNewUserErrno(errno);
    261     return false;
    262   }
    263 
    264   // The parent process could have had threads. In the child, these threads
    265   // have disappeared. Make sure to not do anything in the child, as this is a
    266   // fragile execution environment.
    267   if (pid == 0) {
    268     _exit(kExitSuccess);
    269   }
    270 
    271   // Always reap the child.
    272   int status = -1;
    273   PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid);
    274   CHECK(WIFEXITED(status));
    275   CHECK_EQ(kExitSuccess, WEXITSTATUS(status));
    276 
    277   // clone(2) succeeded, we can use CLONE_NEWUSER.
    278   return true;
    279 }
    280 
    281 bool Credentials::MoveToNewUserNS() {
    282   uid_t uid;
    283   gid_t gid;
    284   if (!GetRESIds(&uid, &gid)) {
    285     // If all the uids (or gids) are not equal to each other, the security
    286     // model will most likely confuse the caller, abort.
    287     DVLOG(1) << "uids or gids differ!";
    288     return false;
    289   }
    290   int ret = sys_unshare(CLONE_NEWUSER);
    291   if (ret) {
    292     const int unshare_errno = errno;
    293     VLOG(1) << "Looks like unprivileged CLONE_NEWUSER may not be available "
    294             << "on this kernel.";
    295     CheckCloneNewUserErrno(unshare_errno);
    296     return false;
    297   }
    298 
    299   if (NamespaceUtils::KernelSupportsDenySetgroups()) {
    300     PCHECK(NamespaceUtils::DenySetgroups());
    301   }
    302 
    303   // The current {r,e,s}{u,g}id is now an overflow id (c.f.
    304   // /proc/sys/kernel/overflowuid). Setup the uid and gid maps.
    305   DCHECK(GetRESIds(NULL, NULL));
    306   const char kGidMapFile[] = "/proc/self/gid_map";
    307   const char kUidMapFile[] = "/proc/self/uid_map";
    308   PCHECK(NamespaceUtils::WriteToIdMapFile(kGidMapFile, gid));
    309   PCHECK(NamespaceUtils::WriteToIdMapFile(kUidMapFile, uid));
    310   DCHECK(GetRESIds(NULL, NULL));
    311   return true;
    312 }
    313 
// Removes the caller's access to the file system by chroot-ing to a safe
// empty directory via ChrootToSafeEmptyDir(). |proc_fd| must be non-negative.
// Every step is enforced with CHECK(), so any failure is fatal and the
// function only ever returns true.
bool Credentials::DropFileSystemAccess(int proc_fd) {
  CHECK_LE(0, proc_fd);

  CHECK(ChrootToSafeEmptyDir());
  // After the chroot, /proc must no longer be visible as a directory.
  CHECK(!base::DirectoryExists(base::FilePath("/proc")));
  // ProcUtil::HasOpenDirectory(proc_fd) must report false at this point.
  CHECK(!ProcUtil::HasOpenDirectory(proc_fd));
  // We never let this function fail.
  return true;
}
    323 
    324 pid_t Credentials::ForkAndDropCapabilitiesInChild() {
    325   pid_t pid = fork();
    326   if (pid != 0) {
    327     return pid;
    328   }
    329 
    330   // Since we just forked, we are single threaded.
    331   PCHECK(DropAllCapabilitiesOnCurrentThread());
    332   return 0;
    333 }
    334 
    335 }  // namespace sandbox.
    336