// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include <errno.h> #include <fcntl.h> #include <linux/unistd.h> #include <netinet/in.h> #include <netinet/tcp.h> #include <netinet/udp.h> #include <pthread.h> #include <signal.h> #include <stdarg.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/ioctl.h> #include <sys/ipc.h> #include <sys/mman.h> #include <sys/prctl.h> #include <sys/resource.h> #include <sys/shm.h> #include <sys/socket.h> #include <sys/time.h> #include <sys/types.h> #include <time.h> #include <unistd.h> #include "base/macros.h" #include "base/posix/eintr_wrapper.h" #include "sandbox/linux/seccomp-bpf/sandbox_bpf.h" #include "sandbox/linux/seccomp-bpf/sandbox_bpf_policy.h" #include "sandbox/linux/services/linux_syscalls.h" using sandbox::ErrorCode; using sandbox::SandboxBPF; using sandbox::SandboxBPFPolicy; using sandbox::arch_seccomp_data; #define ERR EPERM // We don't expect our sandbox to do anything useful yet. So, we will fail // almost immediately. For now, force the code to continue running. The // following line should be removed as soon as the sandbox is starting to // actually enforce restrictions in a meaningful way: #define _exit(x) do { } while (0) namespace { bool SendFds(int transport, const void *buf, size_t len, ...) { int count = 0; va_list ap; va_start(ap, len); while (va_arg(ap, int) >= 0) { ++count; } va_end(ap); if (!count) { return false; } char cmsg_buf[CMSG_SPACE(count*sizeof(int))]; memset(cmsg_buf, 0, sizeof(cmsg_buf)); struct iovec iov[2] = { { 0 } }; struct msghdr msg = { 0 }; int dummy = 0; iov[0].iov_base = &dummy; iov[0].iov_len = sizeof(dummy); if (buf && len > 0) { iov[1].iov_base = const_cast<void *>(buf); iov[1].iov_len = len; } msg.msg_iov = iov; msg.msg_iovlen = (buf && len > 0) ? 2 : 1; msg.msg_control = cmsg_buf; msg.msg_controllen = CMSG_LEN(count*sizeof(int)); struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_RIGHTS; cmsg->cmsg_len = CMSG_LEN(count*sizeof(int)); va_start(ap, len); for (int i = 0, fd; (fd = va_arg(ap, int)) >= 0; ++i) { (reinterpret_cast<int *>(CMSG_DATA(cmsg)))[i] = fd; } return sendmsg(transport, &msg, 0) == static_cast<ssize_t>(sizeof(dummy) + ((buf && len > 0) ? len : 0)); } bool GetFds(int transport, void *buf, size_t *len, ...) { int count = 0; va_list ap; va_start(ap, len); for (int *fd; (fd = va_arg(ap, int *)) != NULL; ++count) { *fd = -1; } va_end(ap); if (!count) { return false; } char cmsg_buf[CMSG_SPACE(count*sizeof(int))]; memset(cmsg_buf, 0, sizeof(cmsg_buf)); struct iovec iov[2] = { { 0 } }; struct msghdr msg = { 0 }; int err; iov[0].iov_base = &err; iov[0].iov_len = sizeof(int); if (buf && len && *len > 0) { iov[1].iov_base = buf; iov[1].iov_len = *len; } msg.msg_iov = iov; msg.msg_iovlen = (buf && len && *len > 0) ? 2 : 1; msg.msg_control = cmsg_buf; msg.msg_controllen = CMSG_LEN(count*sizeof(int)); ssize_t bytes = recvmsg(transport, &msg, 0); if (len) { *len = bytes > static_cast<int>(sizeof(int)) ? bytes - sizeof(int) : 0; } if (bytes != static_cast<ssize_t>(sizeof(int) + iov[1].iov_len)) { if (bytes >= 0) { errno = 0; } return false; } if (err) { // "err" is the first four bytes of the payload. If these are non-zero, // the sender on the other side of the socketpair sent us an errno value. // We don't expect to get any file handles in this case. errno = err; return false; } struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); if ((msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) || !cmsg || cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS || cmsg->cmsg_len != CMSG_LEN(count*sizeof(int))) { errno = EBADF; return false; } va_start(ap, len); for (int *fd, i = 0; (fd = va_arg(ap, int *)) != NULL; ++i) { *fd = (reinterpret_cast<int *>(CMSG_DATA(cmsg)))[i]; } va_end(ap); return true; } // POSIX doesn't define any async-signal safe function for converting // an integer to ASCII. We'll have to define our own version. // itoa_r() converts a (signed) integer to ASCII. It returns "buf", if the // conversion was successful or NULL otherwise. It never writes more than "sz" // bytes. Output will be truncated as needed, and a NUL character is always // appended. char *itoa_r(int i, char *buf, size_t sz) { // Make sure we can write at least one NUL byte. size_t n = 1; if (n > sz) { return NULL; } // Handle negative numbers. char *start = buf; int minint = 0; if (i < 0) { // Make sure we can write the '-' character. if (++n > sz) { *start = '\000'; return NULL; } *start++ = '-'; // Turn our number positive. if (i == -i) { // The lowest-most negative integer needs special treatment. minint = 1; i = -(i + 1); } else { // "Normal" negative numbers are easy. i = -i; } } // Loop until we have converted the entire number. Output at least one // character (i.e. '0'). char *ptr = start; do { // Make sure there is still enough space left in our output buffer. if (++n > sz) { buf = NULL; goto truncate; } // Output the next digit and (if necessary) compensate for the lowest-most // negative integer needing special treatment. This works because, no // matter the bit width of the integer, the lowest-most integer always ends // in 2, 4, 6, or 8. *ptr++ = i%10 + '0' + minint; minint = 0; i /= 10; } while (i); truncate: // Terminate the output with a NUL character. *ptr = '\000'; // Conversion to ASCII actually resulted in the digits being in reverse // order. We can't easily generate them in forward order, as we can't tell // the number of characters needed until we are done converting. // So, now, we reverse the string (except for the possible "-" sign). while (--ptr > start) { char ch = *ptr; *ptr = *start; *start++ = ch; } return buf; } // This handler gets called, whenever we encounter a system call that we // don't recognize explicitly. For the purposes of this program, we just // log the system call and then deny it. More elaborate sandbox policies // might try to evaluate the system call in user-space, instead. // The only notable complication is that this function must be async-signal // safe. This restricts the libary functions that we can call. intptr_t DefaultHandler(const struct arch_seccomp_data& data, void *) { static const char msg0[] = "Disallowed system call #"; static const char msg1[] = "\n"; char buf[sizeof(msg0) - 1 + 25 + sizeof(msg1)]; *buf = '\000'; strncat(buf, msg0, sizeof(buf) - 1); char *ptr = strrchr(buf, '\000'); itoa_r(data.nr, ptr, sizeof(buf) - (ptr - buf)); ptr = strrchr(ptr, '\000'); strncat(ptr, msg1, sizeof(buf) - (ptr - buf)); ptr = strrchr(ptr, '\000'); if (HANDLE_EINTR(write(2, buf, ptr - buf))) { } return -ERR; } class DemoPolicy : public SandboxBPFPolicy { public: DemoPolicy() {} virtual ErrorCode EvaluateSyscall(SandboxBPF* sandbox, int sysno) const OVERRIDE; private: DISALLOW_COPY_AND_ASSIGN(DemoPolicy); }; ErrorCode DemoPolicy::EvaluateSyscall(SandboxBPF* sandbox, int sysno) const { switch (sysno) { #if defined(__NR_accept) case __NR_accept: case __NR_accept4: #endif case __NR_alarm: case __NR_brk: case __NR_clock_gettime: case __NR_close: case __NR_dup: case __NR_dup2: case __NR_epoll_create: case __NR_epoll_ctl: case __NR_epoll_wait: case __NR_exit: case __NR_exit_group: case __NR_fcntl: #if defined(__NR_fcntl64) case __NR_fcntl64: #endif case __NR_fdatasync: case __NR_fstat: #if defined(__NR_fstat64) case __NR_fstat64: #endif case __NR_ftruncate: case __NR_futex: case __NR_getdents: case __NR_getdents64: case __NR_getegid: #if defined(__NR_getegid32) case __NR_getegid32: #endif case __NR_geteuid: #if defined(__NR_geteuid32) case __NR_geteuid32: #endif case __NR_getgid: #if defined(__NR_getgid32) case __NR_getgid32: #endif case __NR_getitimer: case __NR_setitimer: #if defined(__NR_getpeername) case __NR_getpeername: #endif case __NR_getpid: case __NR_gettid: #if defined(__NR_getsockname) case __NR_getsockname: #endif case __NR_gettimeofday: case __NR_getuid: #if defined(__NR_getuid32) case __NR_getuid32: #endif #if defined(__NR__llseek) case __NR__llseek: #endif case __NR_lseek: case __NR_nanosleep: case __NR_pipe: case __NR_pipe2: case __NR_poll: case __NR_pread64: case __NR_preadv: case __NR_pwrite64: case __NR_pwritev: case __NR_read: case __NR_readv: case __NR_restart_syscall: case __NR_set_robust_list: case __NR_rt_sigaction: #if defined(__NR_sigaction) case __NR_sigaction: #endif #if defined(__NR_signal) case __NR_signal: #endif case __NR_rt_sigprocmask: #if defined(__NR_sigprocmask) case __NR_sigprocmask: #endif #if defined(__NR_shutdown) case __NR_shutdown: #endif case __NR_rt_sigreturn: #if defined(__NR_sigreturn) case __NR_sigreturn: #endif #if defined(__NR_socketpair) case __NR_socketpair: #endif case __NR_time: case __NR_uname: case __NR_write: case __NR_writev: return ErrorCode(ErrorCode::ERR_ALLOWED); case __NR_prctl: // Allow PR_SET_DUMPABLE and PR_GET_DUMPABLE. Do not allow anything else. return sandbox->Cond(1, ErrorCode::TP_32BIT, ErrorCode::OP_EQUAL, PR_SET_DUMPABLE, ErrorCode(ErrorCode::ERR_ALLOWED), sandbox->Cond(1, ErrorCode::TP_32BIT, ErrorCode::OP_EQUAL, PR_GET_DUMPABLE, ErrorCode(ErrorCode::ERR_ALLOWED), sandbox->Trap(DefaultHandler, NULL))); // The following system calls are temporarily permitted. This must be // tightened later. But we currently don't implement enough of the sandboxing // API to do so. // As is, this sandbox isn't exactly safe :-/ #if defined(__NR_sendmsg) case __NR_sendmsg: case __NR_sendto: case __NR_recvmsg: case __NR_recvfrom: case __NR_getsockopt: case __NR_setsockopt: #elif defined(__NR_socketcall) case __NR_socketcall: #endif #if defined(__NR_shmat) case __NR_shmat: case __NR_shmctl: case __NR_shmdt: case __NR_shmget: #elif defined(__NR_ipc) case __NR_ipc: #endif #if defined(__NR_mmap2) case __NR_mmap2: #else case __NR_mmap: #endif #if defined(__NR_ugetrlimit) case __NR_ugetrlimit: #endif case __NR_getrlimit: case __NR_ioctl: case __NR_clone: case __NR_munmap: case __NR_mprotect: case __NR_madvise: case __NR_remap_file_pages: return ErrorCode(ErrorCode::ERR_ALLOWED); // Everything that isn't explicitly allowed is denied. default: return sandbox->Trap(DefaultHandler, NULL); } } void *ThreadFnc(void *arg) { return arg; } void *SendmsgStressThreadFnc(void *arg) { if (arg) { } static const int repetitions = 100; static const int kNumFds = 3; for (int rep = 0; rep < repetitions; ++rep) { int fds[2 + kNumFds]; if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds)) { perror("socketpair()"); _exit(1); } size_t len = 4; char buf[4]; if (!SendFds(fds[0], "test", 4, fds[1], fds[1], fds[1], -1) || !GetFds(fds[1], buf, &len, fds+2, fds+3, fds+4, NULL) || len != 4 || memcmp(buf, "test", len) || write(fds[2], "demo", 4) != 4 || read(fds[0], buf, 4) != 4 || memcmp(buf, "demo", 4)) { perror("sending/receiving of fds"); _exit(1); } for (int i = 0; i < 2+kNumFds; ++i) { if (close(fds[i])) { perror("close"); _exit(1); } } } return NULL; } } // namespace int main(int argc, char *argv[]) { if (argc) { } if (argv) { } int proc_fd = open("/proc", O_RDONLY|O_DIRECTORY); if (SandboxBPF::SupportsSeccompSandbox(proc_fd) != SandboxBPF::STATUS_AVAILABLE) { perror("sandbox"); _exit(1); } SandboxBPF sandbox; sandbox.set_proc_fd(proc_fd); sandbox.SetSandboxPolicy(new DemoPolicy()); if (!sandbox.StartSandbox(SandboxBPF::PROCESS_SINGLE_THREADED)) { fprintf(stderr, "StartSandbox() failed"); _exit(1); } // Check that we can create threads pthread_t thr; if (!pthread_create(&thr, NULL, ThreadFnc, reinterpret_cast<void *>(0x1234))) { void *ret; pthread_join(thr, &ret); if (ret != reinterpret_cast<void *>(0x1234)) { perror("clone() failed"); _exit(1); } } else { perror("clone() failed"); _exit(1); } // Check that we handle restart_syscall() without dieing. This is a little // tricky to trigger. And I can't think of a good way to verify whether it // actually executed. signal(SIGALRM, SIG_IGN); const struct itimerval tv = { { 0, 0 }, { 0, 5*1000 } }; const struct timespec tmo = { 0, 100*1000*1000 }; setitimer(ITIMER_REAL, &tv, NULL); nanosleep(&tmo, NULL); // Check that we can query the size of the stack, but that all other // calls to getrlimit() fail. if (((errno = 0), !getrlimit(RLIMIT_STACK, NULL)) || errno != EFAULT || ((errno = 0), !getrlimit(RLIMIT_CORE, NULL)) || errno != ERR) { perror("getrlimit()"); _exit(1); } // Check that we can query TCGETS and TIOCGWINSZ, but no other ioctls(). if (((errno = 0), !ioctl(2, TCGETS, NULL)) || errno != EFAULT || ((errno = 0), !ioctl(2, TIOCGWINSZ, NULL)) || errno != EFAULT || ((errno = 0), !ioctl(2, TCSETS, NULL)) || errno != ERR) { perror("ioctl()"); _exit(1); } // Check that prctl() can manipulate the dumpable flag, but nothing else. if (((errno = 0), !prctl(PR_GET_DUMPABLE)) || errno || ((errno = 0), prctl(PR_SET_DUMPABLE, 1)) || errno || ((errno = 0), !prctl(PR_SET_SECCOMP, 0)) || errno != ERR) { perror("prctl()"); _exit(1); } // Check that we can send and receive file handles. int fds[3]; if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds)) { perror("socketpair()"); _exit(1); } size_t len = 4; char buf[4]; if (!SendFds(fds[0], "test", 4, fds[1], -1) || !GetFds(fds[1], buf, &len, fds+2, NULL) || len != 4 || memcmp(buf, "test", len) || write(fds[2], "demo", 4) != 4 || read(fds[0], buf, 4) != 4 || memcmp(buf, "demo", 4) || close(fds[0]) || close(fds[1]) || close(fds[2])) { perror("sending/receiving of fds"); _exit(1); } // Check whether SysV IPC works. int shmid; void *addr; if ((shmid = shmget(IPC_PRIVATE, 4096, IPC_CREAT|0600)) < 0 || (addr = shmat(shmid, NULL, 0)) == reinterpret_cast<void *>(-1) || shmdt(addr) || shmctl(shmid, IPC_RMID, NULL)) { perror("sysv IPC"); _exit(1); } // Print a message so that the user can see the sandbox is activated. time_t tm = time(NULL); printf("Sandbox has been started at %s", ctime(&tm)); // Stress-test the sendmsg() code static const int kSendmsgStressNumThreads = 10; pthread_t sendmsgStressThreads[kSendmsgStressNumThreads]; for (int i = 0; i < kSendmsgStressNumThreads; ++i) { if (pthread_create(sendmsgStressThreads + i, NULL, SendmsgStressThreadFnc, NULL)) { perror("pthread_create"); _exit(1); } } for (int i = 0; i < kSendmsgStressNumThreads; ++i) { pthread_join(sendmsgStressThreads[i], NULL); } return 0; }