1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "sandbox/linux/services/credentials.h" 6 7 #include <errno.h> 8 #include <limits.h> 9 #include <signal.h> 10 #include <stddef.h> 11 #include <stdint.h> 12 #include <stdio.h> 13 #include <sys/syscall.h> 14 #include <sys/types.h> 15 #include <sys/wait.h> 16 #include <unistd.h> 17 18 #include "base/bind.h" 19 #include "base/files/file_path.h" 20 #include "base/files/file_util.h" 21 #include "base/logging.h" 22 #include "base/macros.h" 23 #include "base/posix/eintr_wrapper.h" 24 #include "base/process/launch.h" 25 #include "base/template_util.h" 26 #include "build/build_config.h" 27 #include "sandbox/linux/services/namespace_utils.h" 28 #include "sandbox/linux/services/proc_util.h" 29 #include "sandbox/linux/services/syscall_wrappers.h" 30 #include "sandbox/linux/services/thread_helpers.h" 31 #include "sandbox/linux/system_headers/capability.h" 32 #include "sandbox/linux/system_headers/linux_signal.h" 33 #include "third_party/valgrind/valgrind.h" 34 35 namespace sandbox { 36 37 namespace { 38 39 bool IsRunningOnValgrind() { return RUNNING_ON_VALGRIND; } 40 41 // Checks that the set of RES-uids and the set of RES-gids have 42 // one element each and return that element in |resuid| and |resgid| 43 // respectively. It's ok to pass NULL as one or both of the ids. 44 bool GetRESIds(uid_t* resuid, gid_t* resgid) { 45 uid_t ruid, euid, suid; 46 gid_t rgid, egid, sgid; 47 PCHECK(sys_getresuid(&ruid, &euid, &suid) == 0); 48 PCHECK(sys_getresgid(&rgid, &egid, &sgid) == 0); 49 const bool uids_are_equal = (ruid == euid) && (ruid == suid); 50 const bool gids_are_equal = (rgid == egid) && (rgid == sgid); 51 if (!uids_are_equal || !gids_are_equal) return false; 52 if (resuid) *resuid = euid; 53 if (resgid) *resgid = egid; 54 return true; 55 } 56 57 const int kExitSuccess = 0; 58 59 #if defined(__clang__) 60 // Disable sanitizers that rely on TLS and may write to non-stack memory. 61 __attribute__((no_sanitize_address)) 62 __attribute__((no_sanitize_thread)) 63 __attribute__((no_sanitize_memory)) 64 #endif 65 int ChrootToSelfFdinfo(void*) { 66 // This function can be run from a vforked child, so it should not write to 67 // any memory other than the stack or errno. Reads from TLS may be different 68 // from in the parent process. 69 RAW_CHECK(sys_chroot("/proc/self/fdinfo/") == 0); 70 71 // CWD is essentially an implicit file descriptor, so be careful to not 72 // leave it behind. 73 RAW_CHECK(chdir("/") == 0); 74 _exit(kExitSuccess); 75 } 76 77 // chroot() to an empty dir that is "safe". To be safe, it must not contain 78 // any subdirectory (chroot-ing there would allow a chroot escape) and it must 79 // be impossible to create an empty directory there. 80 // We achieve this by doing the following: 81 // 1. We create a new process sharing file system information. 82 // 2. In the child, we chroot to /proc/self/fdinfo/ 83 // This is already "safe", since fdinfo/ does not contain another directory and 84 // one cannot create another directory there. 85 // 3. The process dies 86 // After (3) happens, the directory is not available anymore in /proc. 87 bool ChrootToSafeEmptyDir() { 88 // We need to chroot to a fdinfo that is unique to a process and have that 89 // process die. 90 // 1. We don't want to simply fork() because duplicating the page tables is 91 // slow with a big address space. 92 // 2. We do not use a regular thread (that would unshare CLONE_FILES) because 93 // when we are in a PID namespace, we cannot easily get a handle to the 94 // /proc/tid directory for the thread (since /proc may not be aware of the 95 // PID namespace). With a process, we can just use /proc/self. 96 pid_t pid = -1; 97 char stack_buf[PTHREAD_STACK_MIN]; 98 #if defined(ARCH_CPU_X86_FAMILY) || defined(ARCH_CPU_ARM_FAMILY) || \ 99 defined(ARCH_CPU_MIPS64_FAMILY) || defined(ARCH_CPU_MIPS_FAMILY) 100 // The stack grows downward. 101 void* stack = stack_buf + sizeof(stack_buf); 102 #else 103 #error "Unsupported architecture" 104 #endif 105 106 int clone_flags = CLONE_FS | LINUX_SIGCHLD; 107 void* tls = nullptr; 108 #if defined(ARCH_CPU_X86_64) || defined(ARCH_CPU_ARM_FAMILY) 109 // Use CLONE_VM | CLONE_VFORK as an optimization to avoid copying page tables. 110 // Since clone writes to the new child's TLS before returning, we must set a 111 // new TLS to avoid corrupting the current process's TLS. On ARCH_CPU_X86, 112 // glibc performs syscalls by calling a function pointer in TLS, so we do not 113 // attempt this optimization. 114 clone_flags |= CLONE_VM | CLONE_VFORK | CLONE_SETTLS; 115 116 char tls_buf[PTHREAD_STACK_MIN] = {0}; 117 tls = tls_buf; 118 #endif 119 120 pid = clone(ChrootToSelfFdinfo, stack, clone_flags, nullptr, nullptr, tls, 121 nullptr); 122 PCHECK(pid != -1); 123 124 int status = -1; 125 PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid); 126 127 return WIFEXITED(status) && WEXITSTATUS(status) == kExitSuccess; 128 } 129 130 // CHECK() that an attempt to move to a new user namespace raised an expected 131 // errno. 132 void CheckCloneNewUserErrno(int error) { 133 // EPERM can happen if already in a chroot. EUSERS if too many nested 134 // namespaces are used. EINVAL for kernels that don't support the feature. 135 // Valgrind will ENOSYS unshare(). 136 PCHECK(error == EPERM || error == EUSERS || error == EINVAL || 137 error == ENOSYS); 138 } 139 140 // Converts a Capability to the corresponding Linux CAP_XXX value. 141 int CapabilityToKernelValue(Credentials::Capability cap) { 142 switch (cap) { 143 case Credentials::Capability::SYS_CHROOT: 144 return CAP_SYS_CHROOT; 145 case Credentials::Capability::SYS_ADMIN: 146 return CAP_SYS_ADMIN; 147 } 148 149 LOG(FATAL) << "Invalid Capability: " << static_cast<int>(cap); 150 return 0; 151 } 152 153 } // namespace. 154 155 // static 156 bool Credentials::DropAllCapabilities(int proc_fd) { 157 if (!SetCapabilities(proc_fd, std::vector<Capability>())) { 158 return false; 159 } 160 161 CHECK(!HasAnyCapability()); 162 return true; 163 } 164 165 // static 166 bool Credentials::DropAllCapabilities() { 167 base::ScopedFD proc_fd(ProcUtil::OpenProc()); 168 return Credentials::DropAllCapabilities(proc_fd.get()); 169 } 170 171 // static 172 bool Credentials::DropAllCapabilitiesOnCurrentThread() { 173 return SetCapabilitiesOnCurrentThread(std::vector<Capability>()); 174 } 175 176 // static 177 bool Credentials::SetCapabilitiesOnCurrentThread( 178 const std::vector<Capability>& caps) { 179 struct cap_hdr hdr = {}; 180 hdr.version = _LINUX_CAPABILITY_VERSION_3; 181 struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}}; 182 183 // Initially, cap has no capability flags set. Enable the effective and 184 // permitted flags only for the requested capabilities. 185 for (const Capability cap : caps) { 186 const int cap_num = CapabilityToKernelValue(cap); 187 const size_t index = CAP_TO_INDEX(cap_num); 188 const uint32_t mask = CAP_TO_MASK(cap_num); 189 data[index].effective |= mask; 190 data[index].permitted |= mask; 191 } 192 193 return sys_capset(&hdr, data) == 0; 194 } 195 196 // static 197 bool Credentials::SetCapabilities(int proc_fd, 198 const std::vector<Capability>& caps) { 199 DCHECK_LE(0, proc_fd); 200 201 #if !defined(THREAD_SANITIZER) 202 // With TSAN, accept to break the security model as it is a testing 203 // configuration. 204 CHECK(ThreadHelpers::IsSingleThreaded(proc_fd)); 205 #endif 206 207 return SetCapabilitiesOnCurrentThread(caps); 208 } 209 210 bool Credentials::HasAnyCapability() { 211 struct cap_hdr hdr = {}; 212 hdr.version = _LINUX_CAPABILITY_VERSION_3; 213 struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}}; 214 215 PCHECK(sys_capget(&hdr, data) == 0); 216 217 for (size_t i = 0; i < arraysize(data); ++i) { 218 if (data[i].effective || data[i].permitted || data[i].inheritable) { 219 return true; 220 } 221 } 222 223 return false; 224 } 225 226 bool Credentials::HasCapability(Capability cap) { 227 struct cap_hdr hdr = {}; 228 hdr.version = _LINUX_CAPABILITY_VERSION_3; 229 struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}}; 230 231 PCHECK(sys_capget(&hdr, data) == 0); 232 233 const int cap_num = CapabilityToKernelValue(cap); 234 const size_t index = CAP_TO_INDEX(cap_num); 235 const uint32_t mask = CAP_TO_MASK(cap_num); 236 237 return (data[index].effective | data[index].permitted | 238 data[index].inheritable) & 239 mask; 240 } 241 242 // static 243 bool Credentials::CanCreateProcessInNewUserNS() { 244 // Valgrind will let clone(2) pass-through, but doesn't support unshare(), 245 // so always consider UserNS unsupported there. 246 if (IsRunningOnValgrind()) { 247 return false; 248 } 249 250 #if defined(THREAD_SANITIZER) 251 // With TSAN, processes will always have threads running and can never 252 // enter a new user namespace with MoveToNewUserNS(). 253 return false; 254 #endif 255 256 // This is roughly a fork(). 257 const pid_t pid = sys_clone(CLONE_NEWUSER | SIGCHLD, 0, 0, 0, 0); 258 259 if (pid == -1) { 260 CheckCloneNewUserErrno(errno); 261 return false; 262 } 263 264 // The parent process could have had threads. In the child, these threads 265 // have disappeared. Make sure to not do anything in the child, as this is a 266 // fragile execution environment. 267 if (pid == 0) { 268 _exit(kExitSuccess); 269 } 270 271 // Always reap the child. 272 int status = -1; 273 PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid); 274 CHECK(WIFEXITED(status)); 275 CHECK_EQ(kExitSuccess, WEXITSTATUS(status)); 276 277 // clone(2) succeeded, we can use CLONE_NEWUSER. 278 return true; 279 } 280 281 bool Credentials::MoveToNewUserNS() { 282 uid_t uid; 283 gid_t gid; 284 if (!GetRESIds(&uid, &gid)) { 285 // If all the uids (or gids) are not equal to each other, the security 286 // model will most likely confuse the caller, abort. 287 DVLOG(1) << "uids or gids differ!"; 288 return false; 289 } 290 int ret = sys_unshare(CLONE_NEWUSER); 291 if (ret) { 292 const int unshare_errno = errno; 293 VLOG(1) << "Looks like unprivileged CLONE_NEWUSER may not be available " 294 << "on this kernel."; 295 CheckCloneNewUserErrno(unshare_errno); 296 return false; 297 } 298 299 if (NamespaceUtils::KernelSupportsDenySetgroups()) { 300 PCHECK(NamespaceUtils::DenySetgroups()); 301 } 302 303 // The current {r,e,s}{u,g}id is now an overflow id (c.f. 304 // /proc/sys/kernel/overflowuid). Setup the uid and gid maps. 305 DCHECK(GetRESIds(NULL, NULL)); 306 const char kGidMapFile[] = "/proc/self/gid_map"; 307 const char kUidMapFile[] = "/proc/self/uid_map"; 308 PCHECK(NamespaceUtils::WriteToIdMapFile(kGidMapFile, gid)); 309 PCHECK(NamespaceUtils::WriteToIdMapFile(kUidMapFile, uid)); 310 DCHECK(GetRESIds(NULL, NULL)); 311 return true; 312 } 313 314 bool Credentials::DropFileSystemAccess(int proc_fd) { 315 CHECK_LE(0, proc_fd); 316 317 CHECK(ChrootToSafeEmptyDir()); 318 CHECK(!base::DirectoryExists(base::FilePath("/proc"))); 319 CHECK(!ProcUtil::HasOpenDirectory(proc_fd)); 320 // We never let this function fail. 321 return true; 322 } 323 324 pid_t Credentials::ForkAndDropCapabilitiesInChild() { 325 pid_t pid = fork(); 326 if (pid != 0) { 327 return pid; 328 } 329 330 // Since we just forked, we are single threaded. 331 PCHECK(DropAllCapabilitiesOnCurrentThread()); 332 return 0; 333 } 334 335 } // namespace sandbox. 336