// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/process/launch.h"

#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdlib.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include <iterator>
#include <limits>
#include <set>

#include "base/allocator/type_profiler_control.h"
#include "base/command_line.h"
#include "base/compiler_specific.h"
#include "base/debug/debugger.h"
#include "base/debug/stack_trace.h"
#include "base/files/dir_reader_posix.h"
#include "base/files/file_util.h"
#include "base/files/scoped_file.h"
#include "base/logging.h"
#include "base/memory/scoped_ptr.h"
#include "base/posix/eintr_wrapper.h"
#include "base/process/kill.h"
#include "base/process/process_metrics.h"
#include "base/strings/stringprintf.h"
#include "base/synchronization/waitable_event.h"
#include "base/third_party/dynamic_annotations/dynamic_annotations.h"
#include "base/threading/platform_thread.h"
#include "base/threading/thread_restrictions.h"

#if defined(OS_LINUX)
#include <sys/prctl.h>
#endif

#if defined(OS_CHROMEOS)
#include <sys/ioctl.h>
#endif

#if defined(OS_FREEBSD)
#include <sys/event.h>
#include <sys/ucontext.h>
#endif

#if defined(OS_MACOSX)
#include <crt_externs.h>
#include <sys/event.h>
#else
extern char** environ;
#endif

namespace base {

namespace {

// Get the process's "environment" (i.e. the thing that setenv/getenv
// work with).
char** GetEnvironment() {
#if defined(OS_MACOSX)
  return *_NSGetEnviron();
#else
  return environ;
#endif
}

// Set the process's "environment" (i.e. the thing that setenv/getenv
// work with).
void SetEnvironment(char** env) {
#if defined(OS_MACOSX)
  *_NSGetEnviron() = env;
#else
  environ = env;
#endif
}

// Set the calling thread's signal mask to new_sigmask and return
// the previous signal mask.
sigset_t SetSignalMask(const sigset_t& new_sigmask) {
  sigset_t old_sigmask;
#if defined(OS_ANDROID)
  // POSIX says pthread_sigmask() must be used in multi-threaded processes,
  // but Android's pthread_sigmask() was broken until 4.1:
  // https://code.google.com/p/android/issues/detail?id=15337
  // http://stackoverflow.com/questions/13777109/pthread-sigmask-on-android-not-working
  RAW_CHECK(sigprocmask(SIG_SETMASK, &new_sigmask, &old_sigmask) == 0);
#else
  RAW_CHECK(pthread_sigmask(SIG_SETMASK, &new_sigmask, &old_sigmask) == 0);
#endif
  return old_sigmask;
}

#if !defined(OS_LINUX) || \
    (!defined(__i386__) && !defined(__x86_64__) && !defined(__arm__))
void ResetChildSignalHandlersToDefaults() {
  // The previous signal handlers are likely to be meaningless in the child's
  // context so we reset them to the defaults for now. http://crbug.com/44953
  // These signal handlers are set up at least in browser_main_posix.cc:
  // BrowserMainPartsPosix::PreEarlyInitialization and stack_trace_posix.cc:
  // EnableInProcessStackDumping.
  signal(SIGHUP, SIG_DFL);
  signal(SIGINT, SIG_DFL);
  signal(SIGILL, SIG_DFL);
  signal(SIGABRT, SIG_DFL);
  signal(SIGFPE, SIG_DFL);
  signal(SIGBUS, SIG_DFL);
  signal(SIGSEGV, SIG_DFL);
  signal(SIGSYS, SIG_DFL);
  signal(SIGTERM, SIG_DFL);
}

#else

// TODO(jln): remove the Linux special case once kernels are fixed.

// Internally the kernel makes sigset_t an array of long large enough to have
// one bit per signal.
typedef uint64_t kernel_sigset_t;

// This is what struct sigaction looks like to the kernel at least on X86 and
// ARM. MIPS, for instance, is very different.
struct kernel_sigaction {
  void* k_sa_handler;  // For this usage it only needs to be a generic pointer.
  unsigned long k_sa_flags;
  void* k_sa_restorer;  // For this usage it only needs to be a generic pointer.
  kernel_sigset_t k_sa_mask;
};

// glibc's sigaction() will prevent access to sa_restorer, so we need to roll
// our own.
int sys_rt_sigaction(int sig, const struct kernel_sigaction* act,
                     struct kernel_sigaction* oact) {
  return syscall(SYS_rt_sigaction, sig, act, oact, sizeof(kernel_sigset_t));
}

// This function is intended to be used in between fork() and execve() and will
// reset all signal handlers to the default.
// The motivation for going through all of them is that sa_restorer can leak
// from parents and help defeat ASLR on buggy kernels.  We reset it to NULL.
// See crbug.com/177956.
void ResetChildSignalHandlersToDefaults(void) {
  for (int signum = 1; ; ++signum) {
    struct kernel_sigaction act = {0};
    int sigaction_get_ret = sys_rt_sigaction(signum, NULL, &act);
    if (sigaction_get_ret && errno == EINVAL) {
#if !defined(NDEBUG)
      // Linux supports 32 real-time signals from 33 to 64.
      // If the number of signals in the Linux kernel changes, someone should
      // look at this code.
      const int kNumberOfSignals = 64;
      RAW_CHECK(signum == kNumberOfSignals + 1);
#endif  // !defined(NDEBUG)
      break;
    }
    // All other failures are fatal.
    if (sigaction_get_ret) {
      RAW_LOG(FATAL, "sigaction (get) failed.");
    }

    // The kernel won't allow to re-set SIGKILL or SIGSTOP.
    if (signum != SIGSTOP && signum != SIGKILL) {
      act.k_sa_handler = reinterpret_cast<void*>(SIG_DFL);
      act.k_sa_restorer = NULL;
      if (sys_rt_sigaction(signum, &act, NULL)) {
        RAW_LOG(FATAL, "sigaction (set) failed.");
      }
    }
#if !defined(NDEBUG)
    // Now ask the kernel again and check that no restorer will leak.
    if (sys_rt_sigaction(signum, NULL, &act) || act.k_sa_restorer) {
      RAW_LOG(FATAL, "Cound not fix sa_restorer.");
    }
#endif  // !defined(NDEBUG)
  }
}
#endif  // !defined(OS_LINUX) ||
        // (!defined(__i386__) && !defined(__x86_64__) && !defined(__arm__))

}  // anonymous namespace

// Functor for |ScopedDIR| (below).
struct ScopedDIRClose {
  inline void operator()(DIR* x) const {
    if (x)
      closedir(x);
  }
};

// Automatically closes |DIR*|s.
typedef scoped_ptr<DIR, ScopedDIRClose> ScopedDIR;

#if defined(OS_LINUX)
static const char kFDDir[] = "/proc/self/fd";
#elif defined(OS_MACOSX)
static const char kFDDir[] = "/dev/fd";
#elif defined(OS_SOLARIS)
static const char kFDDir[] = "/dev/fd";
#elif defined(OS_FREEBSD)
static const char kFDDir[] = "/dev/fd";
#elif defined(OS_OPENBSD)
static const char kFDDir[] = "/dev/fd";
#elif defined(OS_ANDROID)
static const char kFDDir[] = "/proc/self/fd";
#endif

void CloseSuperfluousFds(const base::InjectiveMultimap& saved_mapping) {
  // DANGER: no calls to malloc or locks are allowed from now on:
  // http://crbug.com/36678

  // Get the maximum number of FDs possible.
  size_t max_fds = GetMaxFds();

  DirReaderPosix fd_dir(kFDDir);
  if (!fd_dir.IsValid()) {
    // Fallback case: Try every possible fd.
    for (size_t i = 0; i < max_fds; ++i) {
      const int fd = static_cast<int>(i);
      if (fd == STDIN_FILENO || fd == STDOUT_FILENO || fd == STDERR_FILENO)
        continue;
      // Cannot use STL iterators here, since debug iterators use locks.
      size_t j;
      for (j = 0; j < saved_mapping.size(); j++) {
        if (fd == saved_mapping[j].dest)
          break;
      }
      if (j < saved_mapping.size())
        continue;

      // Since we're just trying to close anything we can find,
      // ignore any error return values of close().
      close(fd);
    }
    return;
  }

  const int dir_fd = fd_dir.fd();

  for ( ; fd_dir.Next(); ) {
    // Skip . and .. entries.
    if (fd_dir.name()[0] == '.')
      continue;

    char *endptr;
    errno = 0;
    const long int fd = strtol(fd_dir.name(), &endptr, 10);
    if (fd_dir.name()[0] == 0 || *endptr || fd < 0 || errno)
      continue;
    if (fd == STDIN_FILENO || fd == STDOUT_FILENO || fd == STDERR_FILENO)
      continue;
    // Cannot use STL iterators here, since debug iterators use locks.
    size_t i;
    for (i = 0; i < saved_mapping.size(); i++) {
      if (fd == saved_mapping[i].dest)
        break;
    }
    if (i < saved_mapping.size())
      continue;
    if (fd == dir_fd)
      continue;

    // When running under Valgrind, Valgrind opens several FDs for its
    // own use and will complain if we try to close them.  All of
    // these FDs are >= |max_fds|, so we can check against that here
    // before closing.  See https://bugs.kde.org/show_bug.cgi?id=191758
    if (fd < static_cast<int>(max_fds)) {
      int ret = IGNORE_EINTR(close(fd));
      DPCHECK(ret == 0);
    }
  }
}

bool LaunchProcess(const std::vector<std::string>& argv,
                   const LaunchOptions& options,
                   ProcessHandle* process_handle) {
  size_t fd_shuffle_size = 0;
  if (options.fds_to_remap) {
    fd_shuffle_size = options.fds_to_remap->size();
  }

  InjectiveMultimap fd_shuffle1;
  InjectiveMultimap fd_shuffle2;
  fd_shuffle1.reserve(fd_shuffle_size);
  fd_shuffle2.reserve(fd_shuffle_size);

  scoped_ptr<char*[]> argv_cstr(new char*[argv.size() + 1]);
  scoped_ptr<char*[]> new_environ;
  char* const empty_environ = NULL;
  char* const* old_environ = GetEnvironment();
  if (options.clear_environ)
    old_environ = &empty_environ;
  if (!options.environ.empty())
    new_environ = AlterEnvironment(old_environ, options.environ);

  sigset_t full_sigset;
  sigfillset(&full_sigset);
  const sigset_t orig_sigmask = SetSignalMask(full_sigset);

  pid_t pid;
#if defined(OS_LINUX)
  if (options.clone_flags) {
    // Signal handling in this function assumes the creation of a new
    // process, so we check that a thread is not being created by mistake
    // and that signal handling follows the process-creation rules.
    RAW_CHECK(
        !(options.clone_flags & (CLONE_SIGHAND | CLONE_THREAD | CLONE_VM)));
    pid = syscall(__NR_clone, options.clone_flags, 0, 0, 0);
  } else
#endif
  {
    pid = fork();
  }

  // Always restore the original signal mask in the parent.
  if (pid != 0) {
    SetSignalMask(orig_sigmask);
  }

  if (pid < 0) {
    DPLOG(ERROR) << "fork";
    return false;
  } else if (pid == 0) {
    // Child process

    // DANGER: no calls to malloc or locks are allowed from now on:
    // http://crbug.com/36678

    // DANGER: fork() rule: in the child, if you don't end up doing exec*(),
    // you call _exit() instead of exit(). This is because _exit() does not
    // call any previously-registered (in the parent) exit handlers, which
    // might do things like block waiting for threads that don't even exist
    // in the child.

    // If a child process uses the readline library, the process block forever.
    // In BSD like OSes including OS X it is safe to assign /dev/null as stdin.
    // See http://crbug.com/56596.
    base::ScopedFD null_fd(HANDLE_EINTR(open("/dev/null", O_RDONLY)));
    if (!null_fd.is_valid()) {
      RAW_LOG(ERROR, "Failed to open /dev/null");
      _exit(127);
    }

    int new_fd = HANDLE_EINTR(dup2(null_fd.get(), STDIN_FILENO));
    if (new_fd != STDIN_FILENO) {
      RAW_LOG(ERROR, "Failed to dup /dev/null for stdin");
      _exit(127);
    }

    if (options.new_process_group) {
      // Instead of inheriting the process group ID of the parent, the child
      // starts off a new process group with pgid equal to its process ID.
      if (setpgid(0, 0) < 0) {
        RAW_LOG(ERROR, "setpgid failed");
        _exit(127);
      }
    }

    // Stop type-profiler.
    // The profiler should be stopped between fork and exec since it inserts
    // locks at new/delete expressions.  See http://crbug.com/36678.
    base::type_profiler::Controller::Stop();

    if (options.maximize_rlimits) {
      // Some resource limits need to be maximal in this child.
      for (size_t i = 0; i < options.maximize_rlimits->size(); ++i) {
        const int resource = (*options.maximize_rlimits)[i];
        struct rlimit limit;
        if (getrlimit(resource, &limit) < 0) {
          RAW_LOG(WARNING, "getrlimit failed");
        } else if (limit.rlim_cur < limit.rlim_max) {
          limit.rlim_cur = limit.rlim_max;
          if (setrlimit(resource, &limit) < 0) {
            RAW_LOG(WARNING, "setrlimit failed");
          }
        }
      }
    }

#if defined(OS_MACOSX)
    RestoreDefaultExceptionHandler();
    if (!options.replacement_bootstrap_name.empty())
      ReplaceBootstrapPort(options.replacement_bootstrap_name);
#endif  // defined(OS_MACOSX)

    ResetChildSignalHandlersToDefaults();
    SetSignalMask(orig_sigmask);

#if 0
    // When debugging it can be helpful to check that we really aren't making
    // any hidden calls to malloc.
    void *malloc_thunk =
        reinterpret_cast<void*>(reinterpret_cast<intptr_t>(malloc) & ~4095);
    mprotect(malloc_thunk, 4096, PROT_READ | PROT_WRITE | PROT_EXEC);
    memset(reinterpret_cast<void*>(malloc), 0xff, 8);
#endif  // 0

#if defined(OS_CHROMEOS)
    if (options.ctrl_terminal_fd >= 0) {
      // Set process' controlling terminal.
      if (HANDLE_EINTR(setsid()) != -1) {
        if (HANDLE_EINTR(
                ioctl(options.ctrl_terminal_fd, TIOCSCTTY, NULL)) == -1) {
          RAW_LOG(WARNING, "ioctl(TIOCSCTTY), ctrl terminal not set");
        }
      } else {
        RAW_LOG(WARNING, "setsid failed, ctrl terminal not set");
      }
    }
#endif  // defined(OS_CHROMEOS)

    if (options.fds_to_remap) {
      // Cannot use STL iterators here, since debug iterators use locks.
      for (size_t i = 0; i < options.fds_to_remap->size(); ++i) {
        const FileHandleMappingVector::value_type& value =
            (*options.fds_to_remap)[i];
        fd_shuffle1.push_back(InjectionArc(value.first, value.second, false));
        fd_shuffle2.push_back(InjectionArc(value.first, value.second, false));
      }
    }

    if (!options.environ.empty() || options.clear_environ)
      SetEnvironment(new_environ.get());

    // fd_shuffle1 is mutated by this call because it cannot malloc.
    if (!ShuffleFileDescriptors(&fd_shuffle1))
      _exit(127);

    CloseSuperfluousFds(fd_shuffle2);

    // Set NO_NEW_PRIVS by default. Since NO_NEW_PRIVS only exists in kernel
    // 3.5+, do not check the return value of prctl here.
#if defined(OS_LINUX)
#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38
#endif
    if (!options.allow_new_privs) {
      if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) && errno != EINVAL) {
        // Only log if the error is not EINVAL (i.e. not supported).
        RAW_LOG(FATAL, "prctl(PR_SET_NO_NEW_PRIVS) failed");
      }
    }
#endif

    for (size_t i = 0; i < argv.size(); i++)
      argv_cstr[i] = const_cast<char*>(argv[i].c_str());
    argv_cstr[argv.size()] = NULL;
    execvp(argv_cstr[0], argv_cstr.get());

    RAW_LOG(ERROR, "LaunchProcess: failed to execvp:");
    RAW_LOG(ERROR, argv_cstr[0]);
    _exit(127);
  } else {
    // Parent process
    if (options.wait) {
      // While this isn't strictly disk IO, waiting for another process to
      // finish is the sort of thing ThreadRestrictions is trying to prevent.
      base::ThreadRestrictions::AssertIOAllowed();
      pid_t ret = HANDLE_EINTR(waitpid(pid, 0, 0));
      DPCHECK(ret > 0);
    }

    if (process_handle)
      *process_handle = pid;
  }

  return true;
}


bool LaunchProcess(const CommandLine& cmdline,
                   const LaunchOptions& options,
                   ProcessHandle* process_handle) {
  return LaunchProcess(cmdline.argv(), options, process_handle);
}

void RaiseProcessToHighPriority() {
  // On POSIX, we don't actually do anything here.  We could try to nice() or
  // setpriority() or sched_getscheduler, but these all require extra rights.
}

// Return value used by GetAppOutputInternal to encapsulate the various exit
// scenarios from the function.
enum GetAppOutputInternalResult {
  EXECUTE_FAILURE,
  EXECUTE_SUCCESS,
  GOT_MAX_OUTPUT,
};

// Executes the application specified by |argv| and wait for it to exit. Stores
// the output (stdout) in |output|. If |do_search_path| is set, it searches the
// path for the application; in that case, |envp| must be null, and it will use
// the current environment. If |do_search_path| is false, |argv[0]| should fully
// specify the path of the application, and |envp| will be used as the
// environment. Redirects stderr to /dev/null.
// If we successfully start the application and get all requested output, we
// return GOT_MAX_OUTPUT, or if there is a problem starting or exiting
// the application we return RUN_FAILURE. Otherwise we return EXECUTE_SUCCESS.
// The GOT_MAX_OUTPUT return value exists so a caller that asks for limited
// output can treat this as a success, despite having an exit code of SIG_PIPE
// due to us closing the output pipe.
// In the case of EXECUTE_SUCCESS, the application exit code will be returned
// in |*exit_code|, which should be checked to determine if the application
// ran successfully.
static GetAppOutputInternalResult GetAppOutputInternal(
    const std::vector<std::string>& argv,
    char* const envp[],
    std::string* output,
    size_t max_output,
    bool do_search_path,
    int* exit_code) {
  // Doing a blocking wait for another command to finish counts as IO.
  base::ThreadRestrictions::AssertIOAllowed();
  // exit_code must be supplied so calling function can determine success.
  DCHECK(exit_code);
  *exit_code = EXIT_FAILURE;

  int pipe_fd[2];
  pid_t pid;
  InjectiveMultimap fd_shuffle1, fd_shuffle2;
  scoped_ptr<char*[]> argv_cstr(new char*[argv.size() + 1]);

  fd_shuffle1.reserve(3);
  fd_shuffle2.reserve(3);

  // Either |do_search_path| should be false or |envp| should be null, but not
  // both.
  DCHECK(!do_search_path ^ !envp);

  if (pipe(pipe_fd) < 0)
    return EXECUTE_FAILURE;

  switch (pid = fork()) {
    case -1:  // error
      close(pipe_fd[0]);
      close(pipe_fd[1]);
      return EXECUTE_FAILURE;
    case 0:  // child
      {
        // DANGER: no calls to malloc or locks are allowed from now on:
        // http://crbug.com/36678

#if defined(OS_MACOSX)
        RestoreDefaultExceptionHandler();
#endif

        // Obscure fork() rule: in the child, if you don't end up doing exec*(),
        // you call _exit() instead of exit(). This is because _exit() does not
        // call any previously-registered (in the parent) exit handlers, which
        // might do things like block waiting for threads that don't even exist
        // in the child.
        int dev_null = open("/dev/null", O_WRONLY);
        if (dev_null < 0)
          _exit(127);

        // Stop type-profiler.
        // The profiler should be stopped between fork and exec since it inserts
        // locks at new/delete expressions.  See http://crbug.com/36678.
        base::type_profiler::Controller::Stop();

        fd_shuffle1.push_back(InjectionArc(pipe_fd[1], STDOUT_FILENO, true));
        fd_shuffle1.push_back(InjectionArc(dev_null, STDERR_FILENO, true));
        fd_shuffle1.push_back(InjectionArc(dev_null, STDIN_FILENO, true));
        // Adding another element here? Remeber to increase the argument to
        // reserve(), above.

        for (size_t i = 0; i < fd_shuffle1.size(); ++i)
          fd_shuffle2.push_back(fd_shuffle1[i]);

        if (!ShuffleFileDescriptors(&fd_shuffle1))
          _exit(127);

        CloseSuperfluousFds(fd_shuffle2);

        for (size_t i = 0; i < argv.size(); i++)
          argv_cstr[i] = const_cast<char*>(argv[i].c_str());
        argv_cstr[argv.size()] = NULL;
        if (do_search_path)
          execvp(argv_cstr[0], argv_cstr.get());
        else
          execve(argv_cstr[0], argv_cstr.get(), envp);
        _exit(127);
      }
    default:  // parent
      {
        // Close our writing end of pipe now. Otherwise later read would not
        // be able to detect end of child's output (in theory we could still
        // write to the pipe).
        close(pipe_fd[1]);

        output->clear();
        char buffer[256];
        size_t output_buf_left = max_output;
        ssize_t bytes_read = 1;  // A lie to properly handle |max_output == 0|
                                 // case in the logic below.

        while (output_buf_left > 0) {
          bytes_read = HANDLE_EINTR(read(pipe_fd[0], buffer,
                                    std::min(output_buf_left, sizeof(buffer))));
          if (bytes_read <= 0)
            break;
          output->append(buffer, bytes_read);
          output_buf_left -= static_cast<size_t>(bytes_read);
        }
        close(pipe_fd[0]);

        // Always wait for exit code (even if we know we'll declare
        // GOT_MAX_OUTPUT).
        bool success = WaitForExitCode(pid, exit_code);

        // If we stopped because we read as much as we wanted, we return
        // GOT_MAX_OUTPUT (because the child may exit due to |SIGPIPE|).
        if (!output_buf_left && bytes_read > 0)
          return GOT_MAX_OUTPUT;
        else if (success)
          return EXECUTE_SUCCESS;
        return EXECUTE_FAILURE;
      }
  }
}

bool GetAppOutput(const CommandLine& cl, std::string* output) {
  return GetAppOutput(cl.argv(), output);
}

bool GetAppOutput(const std::vector<std::string>& argv, std::string* output) {
  // Run |execve()| with the current environment and store "unlimited" data.
  int exit_code;
  GetAppOutputInternalResult result = GetAppOutputInternal(
      argv, NULL, output, std::numeric_limits<std::size_t>::max(), true,
      &exit_code);
  return result == EXECUTE_SUCCESS && exit_code == EXIT_SUCCESS;
}

// TODO(viettrungluu): Conceivably, we should have a timeout as well, so we
// don't hang if what we're calling hangs.
bool GetAppOutputRestricted(const CommandLine& cl,
                            std::string* output, size_t max_output) {
  // Run |execve()| with the empty environment.
  char* const empty_environ = NULL;
  int exit_code;
  GetAppOutputInternalResult result = GetAppOutputInternal(
      cl.argv(), &empty_environ, output, max_output, false, &exit_code);
  return result == GOT_MAX_OUTPUT || (result == EXECUTE_SUCCESS &&
                                      exit_code == EXIT_SUCCESS);
}

bool GetAppOutputWithExitCode(const CommandLine& cl,
                              std::string* output,
                              int* exit_code) {
  // Run |execve()| with the current environment and store "unlimited" data.
  GetAppOutputInternalResult result = GetAppOutputInternal(
      cl.argv(), NULL, output, std::numeric_limits<std::size_t>::max(), true,
      exit_code);
  return result == EXECUTE_SUCCESS;
}

}  // namespace base