Home | History | Annotate | Download | only in seccomp-bpf
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Some headers on Android are missing cdefs: crbug.com/172337.
      6 // (We can't use OS_ANDROID here since build_config.h is not included).
      7 #if defined(ANDROID)
      8 #include <sys/cdefs.h>
      9 #endif
     10 
     11 #include <errno.h>
     12 #include <fcntl.h>
     13 #include <string.h>
     14 #include <sys/prctl.h>
     15 #include <sys/stat.h>
     16 #include <sys/syscall.h>
     17 #include <sys/types.h>
     18 #include <time.h>
     19 #include <unistd.h>
     20 
     21 #ifndef SECCOMP_BPF_STANDALONE
     22 #include "base/logging.h"
     23 #include "base/posix/eintr_wrapper.h"
     24 #endif
     25 
     26 #include "sandbox/linux/seccomp-bpf/codegen.h"
     27 #include "sandbox/linux/seccomp-bpf/sandbox_bpf.h"
     28 #include "sandbox/linux/seccomp-bpf/syscall.h"
     29 #include "sandbox/linux/seccomp-bpf/syscall_iterator.h"
     30 #include "sandbox/linux/seccomp-bpf/verifier.h"
     31 
     32 namespace {
     33 
     34 using playground2::ErrorCode;
     35 using playground2::Instruction;
     36 using playground2::Sandbox;
     37 using playground2::Trap;
     38 using playground2::arch_seccomp_data;
     39 
     40 const int kExpectedExitCode = 100;
     41 
     42 template<class T> int popcount(T x);
     43 template<> int popcount<unsigned int>(unsigned int x) {
     44   return __builtin_popcount(x);
     45 }
     46 template<> int popcount<unsigned long>(unsigned long x) {
     47   return __builtin_popcountl(x);
     48 }
     49 template<> int popcount<unsigned long long>(unsigned long long x) {
     50   return __builtin_popcountll(x);
     51 }
     52 
     53 void WriteFailedStderrSetupMessage(int out_fd) {
     54   const char* error_string = strerror(errno);
     55   static const char msg[] = "You have reproduced a puzzling issue.\n"
     56                             "Please, report to crbug.com/152530!\n"
     57                             "Failed to set up stderr: ";
     58   if (HANDLE_EINTR(write(out_fd, msg, sizeof(msg)-1)) > 0 && error_string &&
     59       HANDLE_EINTR(write(out_fd, error_string, strlen(error_string))) > 0 &&
     60       HANDLE_EINTR(write(out_fd, "\n", 1))) {
     61   }
     62 }
     63 
     64 // We define a really simple sandbox policy. It is just good enough for us
     65 // to tell that the sandbox has actually been activated.
     66 ErrorCode ProbeEvaluator(Sandbox *, int sysnum, void *) __attribute__((const));
     67 ErrorCode ProbeEvaluator(Sandbox *, int sysnum, void *) {
     68   switch (sysnum) {
     69   case __NR_getpid:
     70     // Return EPERM so that we can check that the filter actually ran.
     71     return ErrorCode(EPERM);
     72   case __NR_exit_group:
     73     // Allow exit() with a non-default return code.
     74     return ErrorCode(ErrorCode::ERR_ALLOWED);
     75   default:
     76     // Make everything else fail in an easily recognizable way.
     77     return ErrorCode(EINVAL);
     78   }
     79 }
     80 
     81 void ProbeProcess(void) {
     82   if (syscall(__NR_getpid) < 0 && errno == EPERM) {
     83     syscall(__NR_exit_group, static_cast<intptr_t>(kExpectedExitCode));
     84   }
     85 }
     86 
     87 ErrorCode AllowAllEvaluator(Sandbox *, int sysnum, void *) {
     88   if (!Sandbox::IsValidSyscallNumber(sysnum)) {
     89     return ErrorCode(ENOSYS);
     90   }
     91   return ErrorCode(ErrorCode::ERR_ALLOWED);
     92 }
     93 
     94 void TryVsyscallProcess(void) {
     95   time_t current_time;
     96   // time() is implemented as a vsyscall. With an older glibc, with
     97   // vsyscall=emulate and some versions of the seccomp BPF patch
     98   // we may get SIGKILL-ed. Detect this!
     99   if (time(&current_time) != static_cast<time_t>(-1)) {
    100     syscall(__NR_exit_group, static_cast<intptr_t>(kExpectedExitCode));
    101   }
    102 }
    103 
    104 bool IsSingleThreaded(int proc_fd) {
    105   if (proc_fd < 0) {
    106     // Cannot determine whether program is single-threaded. Hope for
    107     // the best...
    108     return true;
    109   }
    110 
    111   struct stat sb;
    112   int task = -1;
    113   if ((task = openat(proc_fd, "self/task", O_RDONLY|O_DIRECTORY)) < 0 ||
    114       fstat(task, &sb) != 0 ||
    115       sb.st_nlink != 3 ||
    116       HANDLE_EINTR(close(task))) {
    117     if (task >= 0) {
    118       if (HANDLE_EINTR(close(task))) { }
    119     }
    120     return false;
    121   }
    122   return true;
    123 }
    124 
    125 bool IsDenied(const ErrorCode& code) {
    126   return (code.err() & SECCOMP_RET_ACTION) == SECCOMP_RET_TRAP ||
    127          (code.err() >= (SECCOMP_RET_ERRNO + ErrorCode::ERR_MIN_ERRNO) &&
    128           code.err() <= (SECCOMP_RET_ERRNO + ErrorCode::ERR_MAX_ERRNO));
    129 }
    130 
    131 // Function that can be passed as a callback function to CodeGen::Traverse().
    132 // Checks whether the "insn" returns an UnsafeTrap() ErrorCode. If so, it
    133 // sets the "bool" variable pointed to by "aux".
    134 void CheckForUnsafeErrorCodes(Instruction *insn, void *aux) {
    135   bool *is_unsafe = static_cast<bool *>(aux);
    136   if (!*is_unsafe) {
    137     if (BPF_CLASS(insn->code) == BPF_RET &&
    138         insn->k > SECCOMP_RET_TRAP &&
    139         insn->k - SECCOMP_RET_TRAP <= SECCOMP_RET_DATA) {
    140       const ErrorCode& err =
    141         Trap::ErrorCodeFromTrapId(insn->k & SECCOMP_RET_DATA);
    142       if (err.error_type() != ErrorCode::ET_INVALID && !err.safe()) {
    143         *is_unsafe = true;
    144       }
    145     }
    146   }
    147 }
    148 
    149 // A Trap() handler that returns an "errno" value. The value is encoded
    150 // in the "aux" parameter.
    151 intptr_t ReturnErrno(const struct arch_seccomp_data&, void *aux) {
    152   // TrapFnc functions report error by following the native kernel convention
    153   // of returning an exit code in the range of -1..-4096. They do not try to
    154   // set errno themselves. The glibc wrapper that triggered the SIGSYS will
    155   // ultimately do so for us.
    156   int err = reinterpret_cast<intptr_t>(aux) & SECCOMP_RET_DATA;
    157   return -err;
    158 }
    159 
// Function that can be passed as a callback function to CodeGen::Traverse().
// Checks whether the "insn" returns an errno value from a BPF filter. If so,
// it rewrites the instruction to instead call a Trap() handler that does
// the same thing. "aux" must point to the Sandbox object that is used to
// register the Trap() handler.
    164 void RedirectToUserspace(Instruction *insn, void *aux) {
    165   // When inside an UnsafeTrap() callback, we want to allow all system calls.
    166   // This means, we must conditionally disable the sandbox -- and that's not
    167   // something that kernel-side BPF filters can do, as they cannot inspect
    168   // any state other than the syscall arguments.
    169   // But if we redirect all error handlers to user-space, then we can easily
    170   // make this decision.
    171   // The performance penalty for this extra round-trip to user-space is not
    172   // actually that bad, as we only ever pay it for denied system calls; and a
    173   // typical program has very few of these.
    174   Sandbox *sandbox = static_cast<Sandbox *>(aux);
    175   if (BPF_CLASS(insn->code) == BPF_RET &&
    176       (insn->k & SECCOMP_RET_ACTION) == SECCOMP_RET_ERRNO) {
    177     insn->k = sandbox->Trap(ReturnErrno,
    178                    reinterpret_cast<void *>(insn->k & SECCOMP_RET_DATA)).err();
    179   }
    180 }
    181 
    182 // Stackable wrapper around an Evaluators handler. Changes ErrorCodes
    183 // returned by a system call evaluator to match the changes made by
    184 // RedirectToUserspace(). "aux" should be pointer to wrapped system call
    185 // evaluator.
    186 ErrorCode RedirectToUserspaceEvalWrapper(Sandbox *sandbox, int sysnum,
    187                                          void *aux) {
    188   // We need to replicate the behavior of RedirectToUserspace(), so that our
    189   // Verifier can still work correctly.
    190   Sandbox::Evaluators *evaluators =
    191     reinterpret_cast<Sandbox::Evaluators *>(aux);
    192   const std::pair<Sandbox::EvaluateSyscall, void *>& evaluator =
    193     *evaluators->begin();
    194 
    195   ErrorCode err = evaluator.first(sandbox, sysnum, evaluator.second);
    196   if ((err.err() & SECCOMP_RET_ACTION) == SECCOMP_RET_ERRNO) {
    197     return sandbox->Trap(ReturnErrno,
    198                        reinterpret_cast<void *>(err.err() & SECCOMP_RET_DATA));
    199   }
    200   return err;
    201 }
    202 
    203 intptr_t BpfFailure(const struct arch_seccomp_data&, void *aux) {
    204   SANDBOX_DIE(static_cast<char *>(aux));
    205 }
    206 
    207 }  // namespace
    208 
    209 // The kernel gives us a sandbox, we turn it into a playground :-)
    210 // This is version 2 of the playground; version 1 was built on top of
    211 // pre-BPF seccomp mode.
    212 namespace playground2 {
    213 
    214 Sandbox::Sandbox()
    215     : quiet_(false),
    216       proc_fd_(-1),
    217       evaluators_(new Evaluators),
    218       conds_(new Conds) {
    219 }
    220 
    221 Sandbox::~Sandbox() {
    222   // It is generally unsafe to call any memory allocator operations or to even
    223   // call arbitrary destructors after having installed a new policy. We just
    224   // have no way to tell whether this policy would allow the system calls that
    225   // the constructors can trigger.
    226   // So, we normally destroy all of our complex state prior to starting the
    227   // sandbox. But this won't happen, if the Sandbox object was created and
    228   // never actually used to set up a sandbox. So, just in case, we are
    229   // destroying any remaining state.
    230   // The "if ()" statements are technically superfluous. But let's be explicit
    231   // that we really don't want to run any code, when we already destroyed
    232   // objects before setting up the sandbox.
    233   if (evaluators_) {
    234     delete evaluators_;
    235   }
    236   if (conds_) {
    237     delete conds_;
    238   }
    239 }
    240 
    241 bool Sandbox::IsValidSyscallNumber(int sysnum) {
    242   return SyscallIterator::IsValid(sysnum);
    243 }
    244 
    245 
    246 bool Sandbox::RunFunctionInPolicy(void (*code_in_sandbox)(),
    247                                   Sandbox::EvaluateSyscall syscall_evaluator,
    248                                   void *aux) {
    249   // Block all signals before forking a child process. This prevents an
    250   // attacker from manipulating our test by sending us an unexpected signal.
    251   sigset_t old_mask, new_mask;
    252   if (sigfillset(&new_mask) ||
    253       sigprocmask(SIG_BLOCK, &new_mask, &old_mask)) {
    254     SANDBOX_DIE("sigprocmask() failed");
    255   }
    256   int fds[2];
    257   if (pipe2(fds, O_NONBLOCK|O_CLOEXEC)) {
    258     SANDBOX_DIE("pipe() failed");
    259   }
    260 
    261   if (fds[0] <= 2 || fds[1] <= 2) {
    262     SANDBOX_DIE("Process started without standard file descriptors");
    263   }
    264 
    265   pid_t pid = fork();
    266   if (pid < 0) {
    267     // Die if we cannot fork(). We would probably fail a little later
    268     // anyway, as the machine is likely very close to running out of
    269     // memory.
    270     // But what we don't want to do is return "false", as a crafty
    271     // attacker might cause fork() to fail at will and could trick us
    272     // into running without a sandbox.
    273     sigprocmask(SIG_SETMASK, &old_mask, NULL);  // OK, if it fails
    274     SANDBOX_DIE("fork() failed unexpectedly");
    275   }
    276 
    277   // In the child process
    278   if (!pid) {
    279     // Test a very simple sandbox policy to verify that we can
    280     // successfully turn on sandboxing.
    281     Die::EnableSimpleExit();
    282 
    283     errno = 0;
    284     if (HANDLE_EINTR(close(fds[0]))) {
    285       // This call to close() has been failing in strange ways. See
    286       // crbug.com/152530. So we only fail in debug mode now.
    287 #if !defined(NDEBUG)
    288       WriteFailedStderrSetupMessage(fds[1]);
    289       SANDBOX_DIE(NULL);
    290 #endif
    291     }
    292     if (HANDLE_EINTR(dup2(fds[1], 2)) != 2) {
    293       // Stderr could very well be a file descriptor to .xsession-errors, or
    294       // another file, which could be backed by a file system that could cause
    295       // dup2 to fail while trying to close stderr. It's important that we do
    296       // not fail on trying to close stderr.
    297       // If dup2 fails here, we will continue normally, this means that our
    298       // parent won't cause a fatal failure if something writes to stderr in
    299       // this child.
    300 #if !defined(NDEBUG)
    301       // In DEBUG builds, we still want to get a report.
    302       WriteFailedStderrSetupMessage(fds[1]);
    303       SANDBOX_DIE(NULL);
    304 #endif
    305     }
    306     if (HANDLE_EINTR(close(fds[1]))) {
    307       // This call to close() has been failing in strange ways. See
    308       // crbug.com/152530. So we only fail in debug mode now.
    309 #if !defined(NDEBUG)
    310       WriteFailedStderrSetupMessage(fds[1]);
    311       SANDBOX_DIE(NULL);
    312 #endif
    313     }
    314 
    315     SetSandboxPolicy(syscall_evaluator, aux);
    316     StartSandbox();
    317 
    318     // Run our code in the sandbox.
    319     code_in_sandbox();
    320 
    321     // code_in_sandbox() is not supposed to return here.
    322     SANDBOX_DIE(NULL);
    323   }
    324 
    325   // In the parent process.
    326   if (HANDLE_EINTR(close(fds[1]))) {
    327     SANDBOX_DIE("close() failed");
    328   }
    329   if (sigprocmask(SIG_SETMASK, &old_mask, NULL)) {
    330     SANDBOX_DIE("sigprocmask() failed");
    331   }
    332   int status;
    333   if (HANDLE_EINTR(waitpid(pid, &status, 0)) != pid) {
    334     SANDBOX_DIE("waitpid() failed unexpectedly");
    335   }
    336   bool rc = WIFEXITED(status) && WEXITSTATUS(status) == kExpectedExitCode;
    337 
    338   // If we fail to support sandboxing, there might be an additional
    339   // error message. If so, this was an entirely unexpected and fatal
    340   // failure. We should report the failure and somebody must fix
    341   // things. This is probably a security-critical bug in the sandboxing
    342   // code.
    343   if (!rc) {
    344     char buf[4096];
    345     ssize_t len = HANDLE_EINTR(read(fds[0], buf, sizeof(buf) - 1));
    346     if (len > 0) {
    347       while (len > 1 && buf[len-1] == '\n') {
    348         --len;
    349       }
    350       buf[len] = '\000';
    351       SANDBOX_DIE(buf);
    352     }
    353   }
    354   if (HANDLE_EINTR(close(fds[0]))) {
    355     SANDBOX_DIE("close() failed");
    356   }
    357 
    358   return rc;
    359 }
    360 
    361 bool Sandbox::KernelSupportSeccompBPF() {
    362   return
    363     RunFunctionInPolicy(ProbeProcess, ProbeEvaluator, 0) &&
    364     RunFunctionInPolicy(TryVsyscallProcess, AllowAllEvaluator, 0);
    365 }
    366 
    367 Sandbox::SandboxStatus Sandbox::SupportsSeccompSandbox(int proc_fd) {
    368   // It the sandbox is currently active, we clearly must have support for
    369   // sandboxing.
    370   if (status_ == STATUS_ENABLED) {
    371     return status_;
    372   }
    373 
    374   // Even if the sandbox was previously available, something might have
    375   // changed in our run-time environment. Check one more time.
    376   if (status_ == STATUS_AVAILABLE) {
    377     if (!IsSingleThreaded(proc_fd)) {
    378       status_ = STATUS_UNAVAILABLE;
    379     }
    380     return status_;
    381   }
    382 
    383   if (status_ == STATUS_UNAVAILABLE && IsSingleThreaded(proc_fd)) {
    384     // All state transitions resulting in STATUS_UNAVAILABLE are immediately
    385     // preceded by STATUS_AVAILABLE. Furthermore, these transitions all
    386     // happen, if and only if they are triggered by the process being multi-
    387     // threaded.
    388     // In other words, if a single-threaded process is currently in the
    389     // STATUS_UNAVAILABLE state, it is safe to assume that sandboxing is
    390     // actually available.
    391     status_ = STATUS_AVAILABLE;
    392     return status_;
    393   }
    394 
    395   // If we have not previously checked for availability of the sandbox or if
    396   // we otherwise don't believe to have a good cached value, we have to
    397   // perform a thorough check now.
    398   if (status_ == STATUS_UNKNOWN) {
    399     // We create our own private copy of a "Sandbox" object. This ensures that
    400     // the object does not have any policies configured, that might interfere
    401     // with the tests done by "KernelSupportSeccompBPF()".
    402     Sandbox sandbox;
    403 
    404     // By setting "quiet_ = true" we suppress messages for expected and benign
    405     // failures (e.g. if the current kernel lacks support for BPF filters).
    406     sandbox.quiet_ = true;
    407     sandbox.set_proc_fd(proc_fd);
    408     status_ = sandbox.KernelSupportSeccompBPF()
    409       ? STATUS_AVAILABLE : STATUS_UNSUPPORTED;
    410 
    411     // As we are performing our tests from a child process, the run-time
    412     // environment that is visible to the sandbox is always guaranteed to be
    413     // single-threaded. Let's check here whether the caller is single-
    414     // threaded. Otherwise, we mark the sandbox as temporarily unavailable.
    415     if (status_ == STATUS_AVAILABLE && !IsSingleThreaded(proc_fd)) {
    416       status_ = STATUS_UNAVAILABLE;
    417     }
    418   }
    419   return status_;
    420 }
    421 
    422 void Sandbox::set_proc_fd(int proc_fd) {
    423   proc_fd_ = proc_fd;
    424 }
    425 
    426 void Sandbox::StartSandbox() {
    427   if (status_ == STATUS_UNSUPPORTED || status_ == STATUS_UNAVAILABLE) {
    428     SANDBOX_DIE("Trying to start sandbox, even though it is known to be "
    429                 "unavailable");
    430   } else if (!evaluators_ || !conds_) {
    431     SANDBOX_DIE("Cannot repeatedly start sandbox. Create a separate Sandbox "
    432                 "object instead.");
    433   }
    434   if (proc_fd_ < 0) {
    435     proc_fd_ = open("/proc", O_RDONLY|O_DIRECTORY);
    436   }
    437   if (proc_fd_ < 0) {
    438     // For now, continue in degraded mode, if we can't access /proc.
    439     // In the future, we might want to tighten this requirement.
    440   }
    441   if (!IsSingleThreaded(proc_fd_)) {
    442     SANDBOX_DIE("Cannot start sandbox, if process is already multi-threaded");
    443   }
    444 
    445   // We no longer need access to any files in /proc. We want to do this
    446   // before installing the filters, just in case that our policy denies
    447   // close().
    448   if (proc_fd_ >= 0) {
    449     if (HANDLE_EINTR(close(proc_fd_))) {
    450       SANDBOX_DIE("Failed to close file descriptor for /proc");
    451     }
    452     proc_fd_ = -1;
    453   }
    454 
    455   // Install the filters.
    456   InstallFilter();
    457 
    458   // We are now inside the sandbox.
    459   status_ = STATUS_ENABLED;
    460 }
    461 
    462 void Sandbox::PolicySanityChecks(EvaluateSyscall syscall_evaluator,
    463                                  void *aux) {
    464   for (SyscallIterator iter(true); !iter.Done(); ) {
    465     uint32_t sysnum = iter.Next();
    466     if (!IsDenied(syscall_evaluator(this, sysnum, aux))) {
    467       SANDBOX_DIE("Policies should deny system calls that are outside the "
    468                   "expected range (typically MIN_SYSCALL..MAX_SYSCALL)");
    469     }
    470   }
    471   return;
    472 }
    473 
    474 void Sandbox::SetSandboxPolicy(EvaluateSyscall syscall_evaluator, void *aux) {
    475   if (!evaluators_ || !conds_) {
    476     SANDBOX_DIE("Cannot change policy after sandbox has started");
    477   }
    478   PolicySanityChecks(syscall_evaluator, aux);
    479   evaluators_->push_back(std::make_pair(syscall_evaluator, aux));
    480 }
    481 
    482 void Sandbox::InstallFilter() {
    483   // We want to be very careful in not imposing any requirements on the
    484   // policies that are set with SetSandboxPolicy(). This means, as soon as
    485   // the sandbox is active, we shouldn't be relying on libraries that could
    486   // be making system calls. This, for example, means we should avoid
    487   // using the heap and we should avoid using STL functions.
    488   // Temporarily copy the contents of the "program" vector into a
    489   // stack-allocated array; and then explicitly destroy that object.
    490   // This makes sure we don't ex- or implicitly call new/delete after we
    491   // installed the BPF filter program in the kernel. Depending on the
    492   // system memory allocator that is in effect, these operators can result
    493   // in system calls to things like munmap() or brk().
    494   Program *program = AssembleFilter(false /* force_verification */);
    495 
    496   struct sock_filter bpf[program->size()];
    497   const struct sock_fprog prog = {
    498     static_cast<unsigned short>(program->size()), bpf };
    499   memcpy(bpf, &(*program)[0], sizeof(bpf));
    500   delete program;
    501 
    502   // Release memory that is no longer needed
    503   delete evaluators_;
    504   delete conds_;
    505   evaluators_ = NULL;
    506   conds_      = NULL;
    507 
    508   // Install BPF filter program
    509   if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
    510     SANDBOX_DIE(quiet_ ? NULL : "Kernel refuses to enable no-new-privs");
    511   } else {
    512     if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
    513       SANDBOX_DIE(quiet_ ? NULL : "Kernel refuses to turn on BPF filters");
    514     }
    515   }
    516 
    517   return;
    518 }
    519 
    520 Sandbox::Program *Sandbox::AssembleFilter(bool force_verification) {
    521 #if !defined(NDEBUG)
    522   force_verification = true;
    523 #endif
    524 
    525   // Verify that the user pushed a policy.
    526   if (evaluators_->empty()) {
    527     SANDBOX_DIE("Failed to configure system call filters");
    528   }
    529 
    530   // We can't handle stacked evaluators, yet. We'll get there eventually
    531   // though. Hang tight.
    532   if (evaluators_->size() != 1) {
    533     SANDBOX_DIE("Not implemented");
    534   }
    535 
    536   // Assemble the BPF filter program.
    537   CodeGen *gen = new CodeGen();
    538   if (!gen) {
    539     SANDBOX_DIE("Out of memory");
    540   }
    541 
    542   // If the architecture doesn't match SECCOMP_ARCH, disallow the
    543   // system call.
    544   Instruction *tail;
    545   Instruction *head =
    546     gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, SECCOMP_ARCH_IDX,
    547   tail =
    548     gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH,
    549                          NULL,
    550     gen->MakeInstruction(BPF_RET+BPF_K,
    551                          Kill("Invalid audit architecture in BPF filter"))));
    552 
    553   bool has_unsafe_traps = false;
    554   {
    555     // Evaluate all possible system calls and group their ErrorCodes into
    556     // ranges of identical codes.
    557     Ranges ranges;
    558     FindRanges(&ranges);
    559 
    560     // Compile the system call ranges to an optimized BPF jumptable
    561     Instruction *jumptable =
    562       AssembleJumpTable(gen, ranges.begin(), ranges.end());
    563 
    564     // If there is at least one UnsafeTrap() in our program, the entire sandbox
    565     // is unsafe. We need to modify the program so that all non-
    566     // SECCOMP_RET_ALLOW ErrorCodes are handled in user-space. This will then
    567     // allow us to temporarily disable sandboxing rules inside of callbacks to
    568     // UnsafeTrap().
    569     gen->Traverse(jumptable, CheckForUnsafeErrorCodes, &has_unsafe_traps);
    570 
    571     // Grab the system call number, so that we can implement jump tables.
    572     Instruction *load_nr =
    573       gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, SECCOMP_NR_IDX);
    574 
    575     // If our BPF program has unsafe jumps, enable support for them. This
    576     // test happens very early in the BPF filter program. Even before we
    577     // consider looking at system call numbers.
    578     // As support for unsafe jumps essentially defeats all the security
    579     // measures that the sandbox provides, we print a big warning message --
    580     // and of course, we make sure to only ever enable this feature if it
    581     // is actually requested by the sandbox policy.
    582     if (has_unsafe_traps) {
    583       if (SandboxSyscall(-1) == -1 && errno == ENOSYS) {
    584         SANDBOX_DIE("Support for UnsafeTrap() has not yet been ported to this "
    585                     "architecture");
    586       }
    587 
    588       EvaluateSyscall evaluateSyscall = evaluators_->begin()->first;
    589       void *aux                       = evaluators_->begin()->second;
    590       if (!evaluateSyscall(this, __NR_rt_sigprocmask, aux).
    591             Equals(ErrorCode(ErrorCode::ERR_ALLOWED)) ||
    592           !evaluateSyscall(this, __NR_rt_sigreturn, aux).
    593             Equals(ErrorCode(ErrorCode::ERR_ALLOWED))
    594 #if defined(__NR_sigprocmask)
    595        || !evaluateSyscall(this, __NR_sigprocmask, aux).
    596             Equals(ErrorCode(ErrorCode::ERR_ALLOWED))
    597 #endif
    598 #if defined(__NR_sigreturn)
    599        || !evaluateSyscall(this, __NR_sigreturn, aux).
    600             Equals(ErrorCode(ErrorCode::ERR_ALLOWED))
    601 #endif
    602           ) {
    603         SANDBOX_DIE("Invalid seccomp policy; if using UnsafeTrap(), you must "
    604                     "unconditionally allow sigreturn() and sigprocmask()");
    605       }
    606 
    607       if (!Trap::EnableUnsafeTrapsInSigSysHandler()) {
    608         // We should never be able to get here, as UnsafeTrap() should never
    609         // actually return a valid ErrorCode object unless the user set the
    610         // CHROME_SANDBOX_DEBUGGING environment variable; and therefore,
    611         // "has_unsafe_traps" would always be false. But better double-check
    612         // than enabling dangerous code.
    613         SANDBOX_DIE("We'd rather die than enable unsafe traps");
    614       }
    615       gen->Traverse(jumptable, RedirectToUserspace, this);
    616 
    617       // Allow system calls, if they originate from our magic return address
    618       // (which we can query by calling SandboxSyscall(-1)).
    619       uintptr_t syscall_entry_point =
    620         static_cast<uintptr_t>(SandboxSyscall(-1));
    621       uint32_t low = static_cast<uint32_t>(syscall_entry_point);
    622 #if __SIZEOF_POINTER__ > 4
    623       uint32_t hi  = static_cast<uint32_t>(syscall_entry_point >> 32);
    624 #endif
    625 
    626       // BPF cannot do native 64bit comparisons. On 64bit architectures, we
    627       // have to compare both 32bit halves of the instruction pointer. If they
    628       // match what we expect, we return ERR_ALLOWED. If either or both don't
    629       // match, we continue evalutating the rest of the sandbox policy.
    630       Instruction *escape_hatch =
    631         gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, SECCOMP_IP_LSB_IDX,
    632         gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, low,
    633 #if __SIZEOF_POINTER__ > 4
    634         gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, SECCOMP_IP_MSB_IDX,
    635         gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, hi,
    636 #endif
    637         gen->MakeInstruction(BPF_RET+BPF_K, ErrorCode(ErrorCode::ERR_ALLOWED)),
    638 #if __SIZEOF_POINTER__ > 4
    639                              load_nr)),
    640 #endif
    641                              load_nr));
    642       gen->JoinInstructions(tail, escape_hatch);
    643     } else {
    644       gen->JoinInstructions(tail, load_nr);
    645     }
    646     tail = load_nr;
    647 
    648     // On Intel architectures, verify that system call numbers are in the
    649     // expected number range. The older i386 and x86-64 APIs clear bit 30
    650     // on all system calls. The newer x32 API always sets bit 30.
    651 #if defined(__i386__) || defined(__x86_64__)
    652     Instruction *invalidX32 =
    653       gen->MakeInstruction(BPF_RET+BPF_K,
    654                            Kill("Illegal mixing of system call ABIs").err_);
    655     Instruction *checkX32 =
    656 #if defined(__x86_64__) && defined(__ILP32__)
    657       gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, invalidX32);
    658 #else
    659       gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, invalidX32, 0);
    660 #endif
    661       gen->JoinInstructions(tail, checkX32);
    662       tail = checkX32;
    663 #endif
    664 
    665     // Append jump table to our pre-amble
    666     gen->JoinInstructions(tail, jumptable);
    667   }
    668 
    669   // Turn the DAG into a vector of instructions.
    670   Program *program = new Program();
    671   gen->Compile(head, program);
    672   delete gen;
    673 
    674   // Make sure compilation resulted in BPF program that executes
    675   // correctly. Otherwise, there is an internal error in our BPF compiler.
    676   // There is really nothing the caller can do until the bug is fixed.
    677   if (force_verification) {
    678     // Verification is expensive. We only perform this step, if we are
    679     // compiled in debug mode, or if the caller explicitly requested
    680     // verification.
    681     VerifyProgram(*program, has_unsafe_traps);
    682   }
    683 
    684   return program;
    685 }
    686 
    687 void Sandbox::VerifyProgram(const Program& program, bool has_unsafe_traps) {
    688   // If we previously rewrote the BPF program so that it calls user-space
    689   // whenever we return an "errno" value from the filter, then we have to
    690   // wrap our system call evaluator to perform the same operation. Otherwise,
    691   // the verifier would also report a mismatch in return codes.
    692   Evaluators redirected_evaluators;
    693   redirected_evaluators.push_back(
    694       std::make_pair(RedirectToUserspaceEvalWrapper, evaluators_));
    695 
    696   const char *err = NULL;
    697   if (!Verifier::VerifyBPF(
    698                        this,
    699                        program,
    700                        has_unsafe_traps ? redirected_evaluators : *evaluators_,
    701                        &err)) {
    702     CodeGen::PrintProgram(program);
    703     SANDBOX_DIE(err);
    704   }
    705 }
    706 
    707 void Sandbox::FindRanges(Ranges *ranges) {
    708   // Please note that "struct seccomp_data" defines system calls as a signed
    709   // int32_t, but BPF instructions always operate on unsigned quantities. We
    710   // deal with this disparity by enumerating from MIN_SYSCALL to MAX_SYSCALL,
    711   // and then verifying that the rest of the number range (both positive and
    712   // negative) all return the same ErrorCode.
    713   EvaluateSyscall evaluate_syscall = evaluators_->begin()->first;
    714   void *aux                        = evaluators_->begin()->second;
    715   uint32_t old_sysnum              = 0;
    716   ErrorCode old_err                = evaluate_syscall(this, old_sysnum, aux);
    717   ErrorCode invalid_err            = evaluate_syscall(this, MIN_SYSCALL - 1,
    718                                                       aux);
    719   for (SyscallIterator iter(false); !iter.Done(); ) {
    720     uint32_t sysnum = iter.Next();
    721     ErrorCode err = evaluate_syscall(this, static_cast<int>(sysnum), aux);
    722     if (!iter.IsValid(sysnum) && !invalid_err.Equals(err)) {
    723       // A proper sandbox policy should always treat system calls outside of
    724       // the range MIN_SYSCALL..MAX_SYSCALL (i.e. anything that returns
    725       // "false" for SyscallIterator::IsValid()) identically. Typically, all
    726       // of these system calls would be denied with the same ErrorCode.
    727       SANDBOX_DIE("Invalid seccomp policy");
    728     }
    729     if (!err.Equals(old_err) || iter.Done()) {
    730       ranges->push_back(Range(old_sysnum, sysnum - 1, old_err));
    731       old_sysnum = sysnum;
    732       old_err    = err;
    733     }
    734   }
    735 }
    736 
    737 Instruction *Sandbox::AssembleJumpTable(CodeGen *gen,
    738                                         Ranges::const_iterator start,
    739                                         Ranges::const_iterator stop) {
    740   // We convert the list of system call ranges into jump table that performs
    741   // a binary search over the ranges.
    742   // As a sanity check, we need to have at least one distinct ranges for us
    743   // to be able to build a jump table.
    744   if (stop - start <= 0) {
    745     SANDBOX_DIE("Invalid set of system call ranges");
    746   } else if (stop - start == 1) {
    747     // If we have narrowed things down to a single range object, we can
    748     // return from the BPF filter program.
    749     return RetExpression(gen, start->err);
    750   }
    751 
    752   // Pick the range object that is located at the mid point of our list.
    753   // We compare our system call number against the lowest valid system call
    754   // number in this range object. If our number is lower, it is outside of
    755   // this range object. If it is greater or equal, it might be inside.
    756   Ranges::const_iterator mid = start + (stop - start)/2;
    757 
    758   // Sub-divide the list of ranges and continue recursively.
    759   Instruction *jf = AssembleJumpTable(gen, start, mid);
    760   Instruction *jt = AssembleJumpTable(gen, mid, stop);
    761   return gen->MakeInstruction(BPF_JMP+BPF_JGE+BPF_K, mid->from, jt, jf);
    762 }
    763 
    764 Instruction *Sandbox::RetExpression(CodeGen *gen, const ErrorCode& err) {
    765   if (err.error_type_ == ErrorCode::ET_COND) {
    766     return CondExpression(gen, err);
    767   } else {
    768     return gen->MakeInstruction(BPF_RET+BPF_K, err);
    769   }
    770 }
    771 
    772 Instruction *Sandbox::CondExpression(CodeGen *gen, const ErrorCode& cond) {
          // Generates the BPF instructions that evaluate the conditional ErrorCode
          // |cond|: the system call argument selected by cond.argno_ is tested
          // against cond.value_ using cond.op_, and evaluation continues with the
          // expression for *cond.passed_ or *cond.failed_ accordingly. Returns the
          // first instruction of the generated sequence. Unsupported operators,
          // invalid argument numbers and 32bit tests against constants with bits
          // set above bit 31 are reported through SANDBOX_DIE().
    773   // We can only inspect the six system call arguments that are passed in
    774   // CPU registers.
    775   if (cond.argno_ < 0 || cond.argno_ >= 6) {
    776     SANDBOX_DIE("Internal compiler error; invalid argument number "
    777                 "encountered");
    778   }
    779 
    780   // BPF programs operate on 32bit entities. Load both halves of the 64bit
    781   // system call argument and then generate suitable conditional statements.
          // Each half starts out as a bare load instruction; the *_head pointers
          // mark where evaluation of that half begins and the *_tail pointers
          // track the instruction that further instructions get joined to.
    782   Instruction *msb_head =
    783     gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
    784                          SECCOMP_ARG_MSB_IDX(cond.argno_));
    785   Instruction *msb_tail = msb_head;
    786   Instruction *lsb_head =
    787     gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
    788                          SECCOMP_ARG_LSB_IDX(cond.argno_));
    789   Instruction *lsb_tail = lsb_head;
    790 
    791   // Emit a suitable comparison statement.
    792   switch (cond.op_) {
    793   case ErrorCode::OP_EQUAL:
    794     // Compare the least significant bits for equality
    795     lsb_tail = gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K,
    796                                     static_cast<uint32_t>(cond.value_),
    797                                     RetExpression(gen, *cond.passed_),
    798                                     RetExpression(gen, *cond.failed_));
    799     gen->JoinInstructions(lsb_head, lsb_tail);
    800 
    801     // If we are looking at a 64bit argument, we need to also compare the
    802     // most significant bits.
            // A match in the MSB half continues with the LSB comparison; a
            // mismatch goes straight to the "failed" expression.
    803     if (cond.width_ == ErrorCode::TP_64BIT) {
    804       msb_tail = gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K,
    805                                       static_cast<uint32_t>(cond.value_ >> 32),
    806                                       lsb_head,
    807                                       RetExpression(gen, *cond.failed_));
    808       gen->JoinInstructions(msb_head, msb_tail);
    809     }
    810     break;
    811   case ErrorCode::OP_HAS_ALL_BITS:
    812     // Check the bits in the LSB half of the system call argument. Our
    813     // OP_HAS_ALL_BITS operator passes, iff all of the bits are set. This is
    814     // different from the kernel's BPF_JSET operation which passes, if any of
    815     // the bits are set.
    816     // Of course, if there is only a single set bit (or none at all), then
    817     // things get easier.
    818     {
    819       uint32_t lsb_bits = static_cast<uint32_t>(cond.value_);
    820       int lsb_bit_count = popcount(lsb_bits);
    821       if (lsb_bit_count == 0) {
    822         // No bits are set in the LSB half. The test will always pass.
                // The LSB load is not needed; the LSB half consists solely of the
                // "passed" expression.
    823         lsb_head = RetExpression(gen, *cond.passed_);
    824         lsb_tail = NULL;
    825       } else if (lsb_bit_count == 1) {
    826         // Exactly one bit is set in the LSB half. We can use the BPF_JSET
    827         // operator.
    828         lsb_tail = gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K,
    829                                         lsb_bits,
    830                                         RetExpression(gen, *cond.passed_),
    831                                         RetExpression(gen, *cond.failed_));
    832         gen->JoinInstructions(lsb_head, lsb_tail);
    833       } else {
    834         // More than one bit is set in the LSB half. We need to combine
    835         // BPF_AND and BPF_JEQ to test whether all of these bits are in fact
    836         // set in the system call argument.
                // Note that lsb_tail is assigned inside the argument list below.
    837         gen->JoinInstructions(lsb_head,
    838                      gen->MakeInstruction(BPF_ALU+BPF_AND+BPF_K,
    839                                           lsb_bits,
    840           lsb_tail = gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K,
    841                                           lsb_bits,
    842                                           RetExpression(gen, *cond.passed_),
    843                                           RetExpression(gen, *cond.failed_))));
    844       }
    845     }
    846 
    847     // If we are looking at a 64bit argument, we need to also check the bits
    848     // in the MSB half of the system call argument.
            // In every MSB variant below, success continues with lsb_head and
            // failure goes to the "failed" expression.
    849     if (cond.width_ == ErrorCode::TP_64BIT) {
    850       uint32_t msb_bits = static_cast<uint32_t>(cond.value_ >> 32);
    851       int msb_bit_count = popcount(msb_bits);
    852       if (msb_bit_count == 0) {
    853         // No bits are set in the MSB half. The test will always pass.
                // Skip the MSB load entirely and start with the LSB half.
    854         msb_head = lsb_head;
    855       } else if (msb_bit_count == 1) {
    856         // Exactly one bit is set in the MSB half. We can use the BPF_JSET
    857         // operator.
    858         msb_tail = gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K,
    859                                         msb_bits,
    860                                         lsb_head,
    861                                         RetExpression(gen, *cond.failed_));
    862         gen->JoinInstructions(msb_head, msb_tail);
    863       } else {
    864         // More than one bit is set in the MSB half. We need to combine
    865         // BPF_AND and BPF_JEQ to test whether all of these bits are in fact
    866         // set in the system call argument.
    867         gen->JoinInstructions(msb_head,
    868           gen->MakeInstruction(BPF_ALU+BPF_AND+BPF_K,
    869                                msb_bits,
    870           gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K,
    871                                msb_bits,
    872                                lsb_head,
    873                                RetExpression(gen, *cond.failed_))));
    874       }
    875     }
    876     break;
    877   case ErrorCode::OP_HAS_ANY_BITS:
    878     // Check the bits in the LSB half of the system call argument. Our
    879     // OP_HAS_ANY_BITS operator passes, iff any of the bits are set. This maps
    880     // nicely to the kernel's BPF_JSET operation.
    881     {
    882       uint32_t lsb_bits = static_cast<uint32_t>(cond.value_);
    883       if (!lsb_bits) {
    884         // No bits are set in the LSB half. The test will always fail.
                // The LSB load is not needed; the LSB half consists solely of the
                // "failed" expression.
    885         lsb_head = RetExpression(gen, *cond.failed_);
    886         lsb_tail = NULL;
    887       } else {
    888         lsb_tail = gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K,
    889                                         lsb_bits,
    890                                         RetExpression(gen, *cond.passed_),
    891                                         RetExpression(gen, *cond.failed_));
    892         gen->JoinInstructions(lsb_head, lsb_tail);
    893       }
    894     }
    895 
    896     // If we are looking at a 64bit argument, we need to also check the bits
    897     // in the MSB half of the system call argument.
    898     if (cond.width_ == ErrorCode::TP_64BIT) {
    899       uint32_t msb_bits = static_cast<uint32_t>(cond.value_ >> 32);
    900       if (!msb_bits) {
    901         // No bits are set in the MSB half, so the MSB half can never make
                // the test pass. The outcome is decided by the LSB half alone.
    902         msb_head = lsb_head;
    903       } else {
                // Any requested bit in the MSB half passes right away; otherwise
                // the decision falls through to the LSB half.
    904         msb_tail = gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K,
    905                                         msb_bits,
    906                                         RetExpression(gen, *cond.passed_),
    907                                         lsb_head);
    908         gen->JoinInstructions(msb_head, msb_tail);
    909       }
    910     }
    911     break;
    912   default:
    913     // TODO(markus): Need to add support for OP_GREATER
    914     SANDBOX_DIE("Not implemented");
    915     break;
    916   }
    917 
    918   // Ensure that we never pass a 64bit value, when we only expect a 32bit
    919   // value. This is somewhat complicated by the fact that on 64bit systems,
    920   // callers could legitimately pass in a non-zero value in the MSB, iff the
    921   // LSB has been sign-extended into the MSB.
          // For TP_32BIT none of the cases above modify msb_head or msb_tail, so
          // msb_tail is still the bare load of the MSB half at this point.
    922   if (cond.width_ == ErrorCode::TP_32BIT) {
    923     if (cond.value_ >> 32) {
    924       SANDBOX_DIE("Invalid comparison of a 32bit system call argument "
    925                   "against a 64bit constant; this test is always false.");
    926     }
    927 
    928     Instruction *invalid_64bit = RetExpression(gen, Unexpected64bitArgument());
            // Only where pointers are wider than 4 bytes: an MSB half equal to
            // 0xFFFFFFFF is tolerated if the LSB half is >= 0x80000000 (the
            // sign-extended case); evaluation then continues with lsb_head. All
            // other combinations end in the original invalid_64bit expression.
    929     #if __SIZEOF_POINTER__ > 4
    930     invalid_64bit =
    931       gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, 0xFFFFFFFF,
    932       gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
    933                            SECCOMP_ARG_LSB_IDX(cond.argno_),
    934       gen->MakeInstruction(BPF_JMP+BPF_JGE+BPF_K, 0x80000000,
    935                            lsb_head,
    936                            invalid_64bit)),
    937                            invalid_64bit);
    938     #endif
            // An MSB half of zero proceeds with the LSB checks; anything else is
            // handled by invalid_64bit.
    939     gen->JoinInstructions(
    940       msb_tail,
    941       gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, 0,
    942                            lsb_head,
    943                            invalid_64bit));
    944   }
    945 
    946   return msb_head;
    947 }
    948 
    949 ErrorCode Sandbox::Unexpected64bitArgument() {
    950   return Kill("Unexpected 64bit argument detected");
    951 }
    952 
    953 ErrorCode Sandbox::Trap(Trap::TrapFnc fnc, const void *aux) {
    954   return Trap::MakeTrap(fnc, aux, true /* Safe Trap */);
    955 }
    956 
    957 ErrorCode Sandbox::UnsafeTrap(Trap::TrapFnc fnc, const void *aux) {
    958   return Trap::MakeTrap(fnc, aux, false /* Unsafe Trap */);
    959 }
    960 
    961 intptr_t Sandbox::ForwardSyscall(const struct arch_seccomp_data& args) {
    962   return SandboxSyscall(args.nr,
    963                         static_cast<intptr_t>(args.args[0]),
    964                         static_cast<intptr_t>(args.args[1]),
    965                         static_cast<intptr_t>(args.args[2]),
    966                         static_cast<intptr_t>(args.args[3]),
    967                         static_cast<intptr_t>(args.args[4]),
    968                         static_cast<intptr_t>(args.args[5]));
    969 }
    970 
    971 ErrorCode Sandbox::Cond(int argno, ErrorCode::ArgType width,
    972                         ErrorCode::Operation op, uint64_t value,
    973                         const ErrorCode& passed, const ErrorCode& failed) {
    974   return ErrorCode(argno, width, op, value,
    975                    &*conds_->insert(passed).first,
    976                    &*conds_->insert(failed).first);
    977 }
    978 
    979 ErrorCode Sandbox::Kill(const char *msg) {
    980   return Trap(BpfFailure, const_cast<char *>(msg));
    981 }
    982 
    // Out-of-class definition of the static member Sandbox::status_; its initial
    // value is STATUS_UNKNOWN.
    983 Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN;
    984 
    985 }  // namespace
    986