/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "IptablesRestoreController.h"

#include <poll.h>
#include <signal.h>
#include <sys/wait.h>
#include <unistd.h>

#define LOG_TAG "IptablesRestoreController"
#include <android-base/logging.h>
#include <android-base/file.h>
#include <netdutils/Syscalls.h>

#include "Controllers.h"

using android::netdutils::StatusOr;
using android::netdutils::sSyscalls;

constexpr char IPTABLES_RESTORE_PATH[] = "/system/bin/iptables-restore";
constexpr char IP6TABLES_RESTORE_PATH[] = "/system/bin/ip6tables-restore";

constexpr char PING[] = "#PING\n";

constexpr size_t PING_SIZE = sizeof(PING) - 1;

// Not compile-time constants because they are changed by the unit tests.
int IptablesRestoreController::MAX_RETRIES = 50;
int IptablesRestoreController::POLL_TIMEOUT_MS = 100;

class IptablesProcess {
public:
    IptablesProcess(pid_t pid, int stdIn, int stdOut, int stdErr) :
        pid(pid),
        stdIn(stdIn),
        processTerminated(false) {

        pollFds[STDOUT_IDX] = { .fd = stdOut, .events = POLLIN };
        pollFds[STDERR_IDX] = { .fd = stdErr, .events = POLLIN };
    }

    ~IptablesProcess() {
        close(stdIn);
        close(pollFds[STDOUT_IDX].fd);
        close(pollFds[STDERR_IDX].fd);
    }

    bool outputReady() {
        struct pollfd pollfd = { .fd = stdIn, .events = POLLOUT };
        int ret = poll(&pollfd, 1, 0);
        if (ret == -1) {
            ALOGE("outputReady poll failed: %s", strerror(errno));
            return false;
        }
        return (ret == 1) && !(pollfd.revents & POLLERR);
    }

    void stop() {
        if (processTerminated) return;

        // This can be called by drainAndWaitForAck (after a POLLHUP) or by sendCommand (if the
        // process was killed by something else on the system). In both cases, it's safe to send the
        // PID a SIGTERM, because the PID continues to exist until its parent (i.e., us) calls
        // waitpid on it, so there's no risk that the PID is reused.
        int err = kill(pid, SIGTERM);
        if (err) {
            err = errno;
        }

        if (err == ESRCH) {
            // This means that someone else inside netd but outside this class called waitpid(),
            // which is a programming error. There's no point in calling waitpid() here since we
            // know that the process is gone.
            ALOGE("iptables child process %d unexpectedly disappeared", pid);
            processTerminated = true;
            return;
        }

        if (err) {
            ALOGE("Error killing iptables child process %d: %s", pid, strerror(err));
        }

        int status;
        if (waitpid(pid, &status, 0) == -1) {
            ALOGE("Error waiting for iptables child process %d: %s", pid, strerror(errno));
        } else {
            ALOGW("iptables-restore process %d terminated status=%d", pid, status);
        }

        processTerminated = true;
    }

    const pid_t pid;
    const int stdIn;

    struct pollfd pollFds[2];
    std::string errBuf;

    std::atomic_bool processTerminated;

    static constexpr size_t STDOUT_IDX = 0;
    static constexpr size_t STDERR_IDX = 1;
};

IptablesRestoreController::IptablesRestoreController() {
    Init();
}

IptablesRestoreController::~IptablesRestoreController() {
}

void IptablesRestoreController::Init() {
    // We cannot fork these in parallel or a child process could inherit the pipe fds intended for
    // use by the other child process. see https://android-review.googlesource.com/469559 for what
    // breaks. This does not cause a latency hit, because the parent only has to wait for
    // forkAndExec, which is sub-millisecond, and the child processes then call exec() in parallel.
    mIpRestore.reset(forkAndExec(IPTABLES_PROCESS));
    mIp6Restore.reset(forkAndExec(IP6TABLES_PROCESS));
}

/* static */
IptablesProcess* IptablesRestoreController::forkAndExec(const IptablesProcessType type) {
    const char* const cmd = (type == IPTABLES_PROCESS) ?
        IPTABLES_RESTORE_PATH : IP6TABLES_RESTORE_PATH;

    // Create the pipes we'll use for communication with the child
    // process. One each for the child's in, out and err files.
    int stdin_pipe[2];
    int stdout_pipe[2];
    int stderr_pipe[2];

    if (pipe2(stdin_pipe,  O_CLOEXEC) == -1 ||
        pipe2(stdout_pipe, O_NONBLOCK | O_CLOEXEC) == -1 ||
        pipe2(stderr_pipe, O_NONBLOCK | O_CLOEXEC) == -1) {

        ALOGE("pipe2() failed: %s", strerror(errno));
        return nullptr;
    }

    const auto& sys = sSyscalls.get();
    StatusOr<pid_t> child_pid = sys.fork();
    if (!isOk(child_pid)) {
        ALOGE("fork() failed: %s", strerror(child_pid.status().code()));
        return nullptr;
    }

    if (child_pid.value() == 0) {
        // The child process. Reads from stdin, writes to stderr and stdout.

        // stdin_pipe[0] : The read end of the stdin pipe.
        // stdout_pipe[1] : The write end of the stdout pipe.
        // stderr_pipe[1] : The write end of the stderr pipe.
        if (dup2(stdin_pipe[0], 0) == -1 ||
            dup2(stdout_pipe[1], 1) == -1 ||
            dup2(stderr_pipe[1], 2) == -1) {
            ALOGE("dup2() failed: %s", strerror(errno));
            abort();
        }

        if (execl(cmd,
                  cmd,
                  "--noflush",  // Don't flush the whole table.
                  "-w",         // Wait instead of failing if the lock is held.
                  "-v",         // Verbose mode, to make sure our ping is echoed
                                // back to us.
                  nullptr) == -1) {
            ALOGE("execl(%s, ...) failed: %s", cmd, strerror(errno));
            abort();
        }

        // This statement is unreachable. We abort() upon error, and execl
        // if everything goes well.
        return nullptr;
    }

    // The parent process. Writes to stdout and stderr and reads from stdin.
    // stdin_pipe[0] : The read end of the stdin pipe.
    // stdout_pipe[1] : The write end of the stdout pipe.
    // stderr_pipe[1] : The write end of the stderr pipe.
    if (close(stdin_pipe[0]) == -1 ||
        close(stdout_pipe[1]) == -1 ||
        close(stderr_pipe[1]) == -1) {
        ALOGW("close() failed: %s", strerror(errno));
    }

    return new IptablesProcess(child_pid.value(), stdin_pipe[1], stdout_pipe[0], stderr_pipe[0]);
}

// TODO: Return -errno on failure instead of -1.
// TODO: Maybe we should keep a rotating buffer of the last N commands
// so that they can be dumped on dumpsys.
int IptablesRestoreController::sendCommand(const IptablesProcessType type,
                                           const std::string& command,
                                           std::string *output) {
   std::unique_ptr<IptablesProcess> *process =
           (type == IPTABLES_PROCESS) ? &mIpRestore : &mIp6Restore;


    // We might need to fork a new process if we haven't forked one yet, or
    // if the forked process terminated.
    //
    // NOTE: For a given command, this is the last point at which we try to
    // recover from a child death. If the child dies at some later point during
    // the execution of this method, we will receive an EPIPE and return an
    // error. The command will then need to be retried at a higher level.
    IptablesProcess *existingProcess = process->get();
    if (existingProcess != nullptr && !existingProcess->outputReady()) {
        existingProcess->stop();
        existingProcess = nullptr;
    }

    if (existingProcess == nullptr) {
        // Fork a new iptables[6]-restore process.
        IptablesProcess *newProcess = IptablesRestoreController::forkAndExec(type);
        if (newProcess == nullptr) {
            LOG(ERROR) << "Unable to fork ip[6]tables-restore, type: " << type;
            return -1;
        }

        process->reset(newProcess);
    }

    if (!android::base::WriteFully((*process)->stdIn, command.data(), command.length())) {
        ALOGE("Unable to send command: %s", strerror(errno));
        return -1;
    }

    if (!android::base::WriteFully((*process)->stdIn, PING, PING_SIZE)) {
        ALOGE("Unable to send ping command: %s", strerror(errno));
        return -1;
    }

    if (!drainAndWaitForAck(*process, command, output)) {
        // drainAndWaitForAck has already logged an error.
        return -1;
    }

    return 0;
}

void IptablesRestoreController::maybeLogStderr(const std::unique_ptr<IptablesProcess> &process,
                                               const std::string& command) {
    if (process->errBuf.empty()) {
        return;
    }

    ALOGE("iptables error:\n"
          "------- COMMAND -------\n"
          "%s\n"
          "-------  ERROR -------\n"
          "%s"
          "----------------------\n",
          command.c_str(), process->errBuf.c_str());
    process->errBuf.clear();
}

/* static */
bool IptablesRestoreController::drainAndWaitForAck(const std::unique_ptr<IptablesProcess> &process,
                                                   const std::string& command,
                                                   std::string *output) {
    bool receivedAck = false;
    int timeout = 0;
    while (!receivedAck && (timeout++ < MAX_RETRIES)) {
        int numEvents = TEMP_FAILURE_RETRY(
            poll(process->pollFds, ARRAY_SIZE(process->pollFds), POLL_TIMEOUT_MS));
        if (numEvents == -1) {
            ALOGE("Poll failed: %s", strerror(errno));
            return false;
        }

        // We've timed out, which means something has gone wrong - we know that stdout should have
        // become available to read with the ACK message, or that stderr should have been available
        // to read with an error message.
        if (numEvents == 0) {
            continue;
        }

        char buffer[PIPE_BUF];
        for (size_t i = 0; i < ARRAY_SIZE(process->pollFds); ++i) {
            const struct pollfd &pollfd = process->pollFds[i];
            if (pollfd.revents & POLLIN) {
                ssize_t size;
                do {
                    size = TEMP_FAILURE_RETRY(read(pollfd.fd, buffer, sizeof(buffer)));

                    if (size == -1) {
                        if (errno != EAGAIN) {
                            ALOGE("Unable to read from descriptor: %s", strerror(errno));
                        }
                        break;
                    }

                    if (i == IptablesProcess::STDOUT_IDX) {
                        // i == STDOUT_IDX: accumulate stdout into *output, and look
                        // for the ping response.
                        output->append(buffer, size);
                        size_t pos = output->find(PING);
                        if (pos != std::string::npos) {
                            if (output->size() > pos + PING_SIZE) {
                                size_t extra = output->size() - (pos + PING_SIZE);
                                ALOGW("%zd extra characters after iptables response: '%s...'",
                                      extra, output->substr(pos + PING_SIZE, 128).c_str());
                            }
                            output->resize(pos);
                            receivedAck = true;
                        }
                    } else {
                        // i == STDERR_IDX: accumulate stderr into errBuf.
                        process->errBuf.append(buffer, size);
                    }
                } while (size > 0);
            }
            if (pollfd.revents & POLLHUP) {
                // The pipe was closed. This likely means the subprocess is exiting, since
                // iptables-restore only closes stdin on error.
                process->stop();
                break;
            }
        }
    }

    if (!receivedAck && !process->processTerminated) {
        ALOGE("Timed out waiting for response from iptables process %d", process->pid);
        // Kill the process so that if it eventually recovers, we don't misinterpret the ping
        // response (or any output) of the command we just sent as coming from future commands.
        process->stop();
    }

    maybeLogStderr(process, command);

    return receivedAck;
}

int IptablesRestoreController::execute(const IptablesTarget target, const std::string& command,
                                       std::string *output) {
    std::lock_guard<std::mutex> lock(mLock);

    std::string buffer;
    if (output == nullptr) {
        output = &buffer;
    } else {
        output->clear();
    }

    int res = 0;
    if (target == V4 || target == V4V6) {
        res |= sendCommand(IPTABLES_PROCESS, command, output);
    }
    if (target == V6 || target == V4V6) {
        res |= sendCommand(IP6TABLES_PROCESS, command, output);
    }
    return res;
}

int IptablesRestoreController::getIpRestorePid(const IptablesProcessType type) {
    return type == IPTABLES_PROCESS ? mIpRestore->pid : mIp6Restore->pid;
}