/* SPDX-License-Identifier: MIT */
/* SPDX-FileCopyrightText: (c) Copyright 2024,2025 Andrew Bower <andrew@bower.uk> */

/* xchpst: eXtended Change Process State
 * A tool that is backwards compatible with chpst(8) from runit(8),
 * offering additional options to harden process with namespace isolation
 * and more. */

#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <getopt.h>
#include <grp.h>
#include <errno.h>
#include <poll.h>
#include <sched.h>
#include <signal.h>
#include <stdarg.h>
#include <strings.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <libgen.h>
#include <linux/ioprio.h>
#include <sys/file.h>
#include <sys/dir.h>
#include <sys/eventfd.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/prctl.h>
#include <sys/signalfd.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/syscall.h>

#include "xchpst.h"
#include "caps.h"
#include "join.h"
#include "env.h"
#include "options.h"
#include "rootfs.h"
#include "mount.h"
#include "precreate.h"
#include "dirs.h"

/* Version text printed by version(); PROG_VERSION is presumably supplied by
 * the build system -- TODO confirm. */
static const char *version_str = STRINGIFY(PROG_VERSION);
/* Name of the personality main() falls back to when the invocation name does
 * not match any entry in apps[]. PROG_DEFAULT overrides it when defined. */
#ifdef PROG_DEFAULT
static const char *default_app = STRINGIFY(PROG_DEFAULT);
#else
static const char *default_app = "xchpst";
#endif

/* Table of the personalities this binary can assume, looked up by name in
 * find_app(). Each entry starts with a COMPAT_* identifier and the command
 * name, followed by whether long options are accepted. The entries written
 * positionally as `false, 1, { OPT_x }` appear to give a count and list of
 * options that are taken from positional arguments -- NOTE(review): confirm
 * against the definition of struct app in xchpst.h. */
const struct app apps[] = {
  { COMPAT_CHPST,       "chpst",     .long_opts = false },
  { COMPAT_XCHPST,      "xchpst",    .long_opts = true },
  { COMPAT_SOFTLIMIT,   "softlimit", .long_opts = false },
  { COMPAT_ENVDIR,      "envdir",    false, 1, { OPT_ENVDIR } },
  { COMPAT_PGRPHACK,    "pgrphack",  false, 1, { OPT_PGRPHACK } },
  { COMPAT_FGHACK,      "fghack",    false, 1, { OPT_FGHACK } },
  { COMPAT_SETUIDGID,   "setuidgid", false, 1, { OPT_SETUIDGID } },
  { COMPAT_ENVUIDGID,   "envuidgid", false, 1, { OPT_ENVUIDGID } },
  { COMPAT_SETLOCK,     "setlock",   false, 1, { OPT_LOCK_WAIT } },
  { COMPAT_APPLYUIDGID, "applyuidgid", .long_opts = false },
  { COMPAT_SETUIDGID_FROMENV,
                        "setuidgid-fromenv", false, 1, { OPT_UGFROMENV } },
};
/* Number of entries in apps[], as a signed value so that it can be compared
 * directly with a pointer difference such as `app - apps`. */
#define max_apps ((ssize_t) (sizeof apps / (sizeof *apps)))

/* NULL-terminated list of tool-suite prefixes. find_app() strips the first
 * matching prefix from the invocation name before looking the name up in
 * apps[]. */
static const char *suite_prefixes[] = {
  "s6-",
  NULL
};

/* Cached descriptor for the /proc/self directory, or -1 when it is not open.
 * Managed by open_proc_self() and close_proc_self(). */
static int proc_self_fd = -1;
/* State of the optional namespace helper process forked by main(). */
static struct {
  int needed;   /* non-zero when main() must fork a helper */
  int sync_fd;  /* eventfd the helper blocks on until ns_helper() signals it */
  pid_t pid;    /* helper's pid in the parent; 0 in the helper or if unused */
} nshelper = { 0 };

static struct {
  char uid[16];
  char gid[16];
  char *gidlist;
} extra_env = { 0 };

/* Print the one-line program banner, including the version, to `out`. */
static void version(FILE *out) {
  const char *const release = version_str;

  fprintf(out,
          "xchpst-%s (c) Copyright 2024,2025 Andrew Bower <andrew@bower.uk>\n",
          release);
}

/* Print the full usage text to `out`: the version banner, a synopsis line
 * built from the invocation name and the positional options, and then the
 * option descriptions produced by the options module. */
static void usage(FILE *out) {
  const char *const self = program_invocation_short_name;

  version(out);

  /* Synopsis line; the positional options are inserted in the middle. */
  fprintf(out, "\nusage: %s OPTIONS [--]", self);
  options_print_positional(out);
  fputs(" PROG...    launch PROG with changed process state\n", out);

  options_explain_positional(out);
  options_print(out);
}

/* Return a descriptor for the /proc/self directory, opening it on first use
 * and caching it in proc_self_fd for later calls.
 *
 * Returns the descriptor, or -1 if the directory could not be opened, in
 * which case the error is reported on stderr. A later call retries the open. */
static int open_proc_self(void) {
  if (proc_self_fd == -1) {
    proc_self_fd = open("/proc/self", O_RDONLY | O_DIRECTORY | O_CLOEXEC);
    /* Report the failure at the point it happens; previously this check sat
     * in an unreachable else-branch and the error was never shown. */
    if (proc_self_fd == -1)
      perror("opening /proc/self");
  }
  return proc_self_fd;
}

/* Release the cached /proc/self descriptor, if one is open. A failure to
 * close is reported but the cache is reset regardless. */
static void close_proc_self(void) {
  if (proc_self_fd == -1)
    return;

  if (close(proc_self_fd) == -1)
    perror("closing /proc/self");
  proc_self_fd = -1;
}

/* Format a string and write it with a single write() call to a file below
 * /proc/self.
 *
 * file: path relative to /proc/self, e.g. "uid_map".
 * fmt:  printf-style format for the text to write, followed by its arguments.
 *
 * Returns 0 on success, 1 if the file could not be opened and -1 for any
 * other failure (including /proc/self itself being unavailable and short
 * writes). Every failure is reported on stderr. */
static int write_proc_self_once(const char *file, const char *fmt, ...) {
  int fd;
  char *text = NULL;
  ssize_t len;
  ssize_t written;
  int saved_errno = 0;
  int rc = 0;
  va_list args;

  if (open_proc_self() == -1) {
    /* Must count as a failure: nothing was written. */
    saved_errno = errno;
    rc = -1;
    goto report;
  }

  fd = openat(proc_self_fd, file, O_WRONLY);
  if (fd == -1) {
    saved_errno = errno;
    rc = 1;
    goto report;
  }

  va_start(args, fmt);
  len = vasprintf(&text, fmt, args);
  va_end(args);
  if (len < 0) {
    /* The contents of `text` are not defined after a failure, so it must
     * be neither inspected nor freed. */
    saved_errno = errno;
    rc = -1;
  } else {
    written = write(fd, text, len);
    if (written != len) {
      /* A short write leaves errno untouched; report it as an I/O error. */
      saved_errno = written == -1 ? errno : EIO;
      rc = -1;
    }
    free(text);
  }

  /* errno was captured above, so close() cannot disturb the report. */
  close(fd);

report:
  if (rc != 0)
    fprintf(stderr, "writing to %s: %s\n", file, strerror(saved_errno));
  return rc;
}

/* Write a user-namespace id mapping to the given file below /proc/self.
 *
 * path: "uid_map" or "gid_map".
 * map:  which mapping to install:
 *         IDMAP_SELF     - map id 0 inside the namespace to `id` outside;
 *         IDMAP_IDENTITY - map the first 0x10000 ids onto themselves;
 *         IDMAP_FULL     - map 0xFFFFFFFF ids onto themselves.
 * id:   the outside id, used only for IDMAP_SELF.
 *
 * Failures are reported on stderr by write_proc_self_once() but are not
 * returned to the caller. */
void set_idmap(const char *path, enum idmap map, long id) {
  /* Every value is passed as unsigned long to match the %lu conversions;
   * passing plain int constants through varargs would be undefined. */
  switch (map) {
  case IDMAP_SELF:
    write_proc_self_once(path, "%lu %lu %lu\n", 0UL, (unsigned long) id, 1UL);
    break;
  case IDMAP_IDENTITY:
    write_proc_self_once(path, "%lu %lu %lu\n", 0UL, 0UL, 0x00010000UL);
    break;
  case IDMAP_FULL:
    write_proc_self_once(path, "%lu %lu %lu\n", 0UL, 0UL, 0xFFFFFFFFUL);
    break;
  }
}

/* Install the gid and uid maps for the user namespace.
 *
 * When a helper process has been forked (nshelper.pid > 0) this wakes it
 * through the eventfd and waits for it to finish. Otherwise, which includes
 * the case of running inside the helper itself, the maps are written
 * directly by this process.
 *
 * Returns 0 on success and 1 if the helper could not be signalled or waited
 * for, or if it did not exit successfully. */
int ns_helper(gid_t gid, uid_t uid) {
  if (nshelper.pid > 0) {
    const uint64_t e = 1;
    ssize_t written;
    pid_t waited;
    int status;

    do written = write(nshelper.sync_fd, &e, sizeof e);
    while (written == -1 && errno == EINTR);
    if (written == -1) {
      perror("ns_helper: write()");
      return 1;
    }
    do waited = waitpid(nshelper.pid, &status, 0);
    while (waited == -1 && errno == EINTR);
    if (waited == -1) {
      perror("ns_helper: waitpid()");
      return 1;
    }
    /* A helper that was killed by a signal has not done its job either. */
    if (!WIFEXITED(status) || WEXITSTATUS(status) != EXIT_SUCCESS)
      return 1;
  } else {
    set_idmap("gid_map", opt.idmap, gid);
    set_idmap("uid_map", opt.idmap, uid);
  }
  return 0;
}

/* Apply one requested resource limit to the current process.
 *
 * resource: RLIMIT_* identifier.
 * option:   the requested soft and/or hard values; nothing is done unless at
 *           least one of them was specified.
 *
 * The current limits are read first so that a value that was not specified
 * keeps its present setting. A finite soft limit above the resulting hard
 * limit is capped with a warning. Failures only produce warnings. */
void set_rlimit(int resource, struct limit *option) {
  struct rlimit lim;

  if (!option->soft_specified && !option->hard_specified)
    return;

  if (getrlimit(resource, &lim) != 0) {
    fprintf(stderr, "warning: resource type %d cannot be controlled on this kernel\n", resource);
    return;
  }

  if (option->hard_specified)
    lim.rlim_max = option->limits.rlim_max;

  if (option->soft_specified) {
    const int exceeds_hard =
      option->limits.rlim_cur != RLIM_INFINITY &&
      option->limits.rlim_max != RLIM_INFINITY &&
      option->limits.rlim_cur > lim.rlim_max;

    if (exceeds_hard)
      fprintf(stderr, "warning capping requested %d soft limit from %lld to maximum %lld\n",
              resource, (long long) option->limits.rlim_cur, (long long) lim.rlim_max);

    lim.rlim_cur = exceeds_hard ? lim.rlim_max : option->limits.rlim_cur;
  }

  if (setrlimit(resource, &lim) != 0)
    fprintf(stderr, "warning: failed to set type %d soft limit\n", resource);
}

/* Apply every resource limit requested in the global options, one resource
 * at a time, in a fixed order. */
void set_resource_limits(void) {
  const struct {
    int resource;
    struct limit *option;
  } requests[] = {
    { RLIMIT_DATA,       &opt.rlimit_data },
    { RLIMIT_AS,         &opt.rlimit_as },
    { RLIMIT_STACK,      &opt.rlimit_stack },
    { RLIMIT_MEMLOCK,    &opt.rlimit_memlock },
    { RLIMIT_RSS,        &opt.rlimit_rss },
    { RLIMIT_NOFILE,     &opt.rlimit_nofile },
    { RLIMIT_NPROC,      &opt.rlimit_nproc },
    { RLIMIT_FSIZE,      &opt.rlimit_fsize },
    { RLIMIT_CORE,       &opt.rlimit_core },
    { RLIMIT_CPU,        &opt.rlimit_cpu },
    { RLIMIT_MSGQUEUE,   &opt.rlimit_msgqueue },
    { RLIMIT_NICE,       &opt.rlimit_nice },
    { RLIMIT_RTPRIO,     &opt.rlimit_rtprio },
    { RLIMIT_RTTIME,     &opt.rlimit_rttime },
    { RLIMIT_SIGPENDING, &opt.rlimit_sigpending },
    { RLIMIT_LOCKS,      &opt.rlimit_locks },
  };

  for (size_t n = 0; n < sizeof requests / sizeof *requests; n++)
    set_rlimit(requests[n].resource, requests[n].option);
}

/* Switch the process to the user and groups described by `ug`.
 *
 * The supplementary groups are always replaced by the list in `ug` (which
 * may be empty); the primary group and the user are changed only when they
 * were resolved. The order matters: groups must be changed while the
 * process still has the privilege to do so, i.e. before the user.
 *
 * Returns true on success. On failure a message is printed and false is
 * returned; the process may then be left partially switched. */
bool drop_user(struct users_groups *ug) {
  int rc;
  int i;
  uid_t uid = ug->user.uid;
  gid_t gid = ug->group.gid;
  gid_t *groups;

#ifndef NO_CAP
  /* Postpone the loss of capabilities until after the user switch
   * and drop them when ready. */
  if (opt.caps_op != CAP_OP_NONE &&
      prctl(PR_SET_KEEPCAPS, 1) == -1)
    perror("warning: could not keep capabilities across user switch");
#endif

  /* You do this backwards: supplemental groups first */
  groups = malloc(sizeof *groups * ug->num_supplemental);
  if (!groups && ug->num_supplemental) {
    perror("allocating supplementary group list");
    return false;
  }

  for (i = 0; i < ug->num_supplemental; i++)
    groups[i] = ug->supplemental[i].gid;
  rc = (setgroups(i, groups) == -1 ? errno : 0);
  free(groups);
  if (rc) {
    /* free() may have disturbed errno, so restore the saved value. */
    errno = rc;
    perror("setgroups");
    return false;
  }

  /* Then main group */
  if (ug->group.resolved &&
      (rc = setresgid(gid, gid, gid))) {
    perror("setresgid");
    return false;
  }

  /* Then the actual user */
  if (ug->user.resolved &&
      (rc = setresuid(uid, uid, uid))) {
    perror("setresuid");
    return false;
  }
  return true;
}

/* Look up the personality matching an invocation name.
 *
 * A leading suite prefix from suite_prefixes[] is skipped, and anything from
 * the first '.' onwards is ignored, so that for example "s6-envdir" and
 * "xchpst.static" are recognised. What remains must match an app name
 * exactly; a mere prefix of an app name, or an empty name, does not match.
 *
 * Returns the matching entry of apps[], or `apps + max_apps` (one past the
 * last entry) when there is no match. */
static const struct app *find_app(const char *name) {
  const struct app *app;
  const char **suite;
  size_t stem_len;

  for (suite = suite_prefixes; *suite && strncmp(name, *suite, strlen(*suite)); suite++);
  if (*suite)
    name += strlen(*suite);

  /* Length of the name up to, but not including, the first '.' */
  stem_len = strcspn(name, ".");

  for (app = apps; app - apps < max_apps; app++)
    if (strlen(app->name) == stem_len &&
        memcmp(name, app->name, stem_len) == 0)
      break;

  return app;
}

/* Program entry point.
 *
 * Determines which personality the program was invoked as, parses the
 * options, applies each requested change of process state in turn and
 * finally executes the target program. With fork-join the target is run in
 * a child process, which the parent waits for through join().
 *
 * Returns CHPST_OK after printing help or version information,
 * CHPST_ERROR_OPTIONS for option errors, and otherwise whatever `ret` holds
 * on reaching the end: CHPST_ERROR_CHANGING_STATE by default, or the value
 * stored by join(). On a successful exec in this process the function does
 * not return. */
int main(int argc, char *argv[]) {
  sigset_t newmask;
  sigset_t oldmask;
  char **sub_argv;
  char *executable;
  struct pivot_state pivot = pivot_init();
  int sub_argc;
  pid_t child;
  int optind; /* NOTE: deliberately local; shadows the getopt global */
  int rc = 0;
  int ret = CHPST_ERROR_CHANGING_STATE;
  int lock_fd = -1;
  uid_t uid;
  gid_t gid;
  char *pwd = NULL;
  int fd;

  /* As which application were we invoked? */
  opt.app = find_app(program_invocation_short_name);
  if (opt.app - apps == max_apps)
    opt.app = find_app(default_app);
  assert(opt.app - apps != max_apps);

  if (!options_init())
    return CHPST_ERROR_OPTIONS;
  optind = options_parse(argc, argv);

  if (is_verbose())
    fprintf(stderr, "invoked as %s(%s)\n", opt.app->name, program_invocation_short_name);

  if (!set(OPT_FORK_JOIN) &&
      set(OPT_FGHACK)) {
    if (is_verbose())
      fprintf(stderr, "also going to do fork-join since fghack requested\n");
    enable(OPT_FORK_JOIN);
  }

  if (!set(OPT_FORK_JOIN) &&
      (opt.new_ns & CLONE_NEWPID)) {
    if (is_verbose())
      fprintf(stderr, "also going to do fork-join since new PID namespace requested\n");
    enable(OPT_FORK_JOIN);
  }

  if (!(opt.new_ns & CLONE_NEWNS) &&
      (set(OPT_NET_NS) || set(OPT_NET_ADOPT) || set(OPT_PRIVATE_RUN) || set(OPT_PRIVATE_TMP) ||
       set(OPT_RO_SYS) || set(OPT_RO_HOME) || set(OPT_RO_ETC) ||
       set(OPT_NEW_ROOT) || opt.new_ns & CLONE_NEWPID)) {
    if (is_verbose())
      fprintf(stderr, "also creating mount namespace implicitly due to other options\n");
    opt.new_ns |= CLONE_NEWNS;
  }

  if (opt.exit) {
    ret = opt.error ? CHPST_ERROR_OPTIONS : opt.retcode;
    goto finish0;
  }

  if (opt.help)
    usage(stdout);
  else if (opt.version)
    version(stdout);

  if (opt.help || opt.version) {
    ret = CHPST_OK;
    goto finish0;
  }

  /* A program to run is mandatory unless a login shell was asked for */
  if (optind == argc && !set(OPT_LOGIN))
    opt.error = true;

  if (opt.error) {
    const struct option_info *help_option = find_option(OPT_HELP, NULL);
    if (help_option && opt.app->long_opts && help_option->long_name &&
        is_compatible(help_option))
      fprintf(stderr, "%s: error in options. Run %s --%s for usage\n",
              program_invocation_short_name,
              program_invocation_short_name,
              help_option->long_name);
    else
      usage(stderr);
    ret = CHPST_ERROR_OPTIONS;
    goto finish0;
  }

  /* Do xchpsty-type things now! */
  sub_argc = argc - optind;
  if (sub_argc == 0)
    sub_argc++; /* leave room for the login shell filled in further down */
  if ((sub_argv = malloc((sub_argc + 1) * sizeof *sub_argv)) == NULL)
    goto finish0;

  memcpy(sub_argv, argv + optind, (argc - optind) * sizeof *sub_argv);
  sub_argv[sub_argc] = NULL;

  if (set(OPT_UMASK))
    umask(opt.umask);

  if (set(OPT_OOM))
    write_proc_self_once("oom_score_adj", "%ld", opt.oom_adjust);

  if (opt.lock_file) {
    if (opt.lock_nowait_override)
      opt.lock_wait = false;
    lock_fd = open(opt.lock_file, O_WRONLY | O_NDELAY | O_APPEND | O_CREAT, 0600);
    if (lock_fd != -1 && flock(lock_fd, LOCK_EX | (opt.lock_wait ? 0 : LOCK_NB)) == -1) {
      /* Keep flock()'s errno for the message below; close() may change it */
      int saved_errno = errno;
      close(lock_fd);
      lock_fd = -1;
      errno = saved_errno;
    }
    if (lock_fd == -1) {
      if (opt.lock_quiet)
        ret = CHPST_ERROR_EXIT;
      else
        fprintf(stderr, "error obtaining lock, %s\n", strerror(errno));
      goto finish;
    }
  }

  if (set(OPT_PGRPHACK)) {
    rc = setsid();
    if (rc == -1) {
      perror("setsid");
      goto finish;
    } else if (is_verbose()) {
      fprintf(stderr, "new session id: %d\n", rc);
    }
  }

  if (opt.env_dir && !read_env_dir(opt.env_dir))
    goto finish;

  if (set(OPT_ENVUIDGID)) {
    struct users_groups *ug = &opt.env_users_groups;
    struct sys_entry *entry;
    const ssize_t idsz = 12; /* 10 digits, a separator and a terminator */
    char *s;
    int i;
    /* Ids are formatted as unsigned so that values above INT_MAX do not
     * come out negative. */
    if ((entry = &ug->user)->resolved) {
      rc = snprintf(extra_env.uid, sizeof extra_env.uid, "UID=%u", (unsigned int) entry->uid);
      if (rc < 0 || rc >= (int) sizeof extra_env.uid) {
        perror("creating UID env");
        goto finish;
      }
      putenv(extra_env.uid);
    } else {
      unsetenv("UID");
      fprintf(stderr, "envuidgid: no user resolved\n");
      goto finish;
    }
    if ((entry = &ug->group)->resolved) {
      rc = snprintf(extra_env.gid, sizeof extra_env.gid, "GID=%u", (unsigned int) entry->gid);
      if (rc < 0 || rc >= (int) sizeof extra_env.gid) {
        perror("creating GID env");
        goto finish;
      }
      putenv(extra_env.gid);
    } else {
      unsetenv("GID");
    }
    if ((s = extra_env.gidlist = malloc(1 + idsz * ug->num_supplemental)) == NULL)
      goto finish;
    for (i = 0; i < ug->num_supplemental; i++) {
      rc = snprintf(s, idsz, "%u,", (unsigned int) ug->supplemental[i].gid);
      if (rc < 0 || rc >= idsz) {
        perror("creating GIDLIST env");
        goto finish;
      }
      s += rc;
    }
    /* Overwrite the trailing comma, or terminate an empty list */
    s[i ? -1 : 0] = '\0';
    setenv("GIDLIST", extra_env.gidlist, 1);
  }

  if (set(OPT_UGCLEARENV)) {
    unsetenv("UID");
    unsetenv("GID");
    unsetenv("GIDLIST");
  }

  if (set(OPT_NICE)) {
    int newnice;

    /* nice() can legitimately return -1, so errno is the only error signal */
    errno = 0;
    newnice = nice(opt.niceness);
    if (errno) {
      fprintf(stderr, "could not change niceness, %s\n", strerror(errno));
      goto finish;
    }
    if (is_verbose()) {
      fprintf(stderr, "now at niceness %d\n", newnice);
    }
  }

  if (set(OPT_IO_SCHED)) {
    if (syscall(SYS_ioprio_set,IOPRIO_WHO_PROCESS, 0, opt.ionice_prio) == -1) {
      fprintf(stderr, "warning: failed to set I/O scheduling class\n");
    } else if (is_verbose()) {
      fprintf(stderr, "set IO class to %d:%ld\n",
              IOPRIO_PRIO_CLASS(opt.ionice_prio),
              IOPRIO_PRIO_DATA(opt.ionice_prio));
    }
  }

  if (opt.cpu_affinity.size &&
      sched_setaffinity(0, opt.cpu_affinity.size,
                        opt.cpu_affinity.mask) == -1)
    perror("could not set CPU affinity");

  if (set(OPT_CPU_SCHED) &&
      sched_setscheduler(0, opt.sched_policy, &((struct sched_param) {})) == -1)
    perror("could not change scheduler policy");

#ifndef NO_CAP
  if ((opt.cap_bounds_op != CAP_OP_NONE ||
       opt.caps_op != CAP_OP_NONE) &&
      runtime.absent.caps) {
    fprintf(stderr, "ignoring capabilities as not supported on system\n");
    opt.cap_bounds_op = opt.caps_op = CAP_OP_NONE;
  }

  if (opt.cap_bounds_op != CAP_OP_NONE)
    if (!set_capabilities_bounding_set())
      goto finish;
#endif

  if (set(OPT_SETUIDGID)) {
    if (opt.users_groups.user.resolved) {
      uid = opt.users_groups.user.uid;
    } else {
      fprintf(stderr, "setuidgid: no user resolved\n");
      goto finish;
    }
  } else {
    uid = getuid();
  }

  if (set(OPT_SETUIDGID) && opt.users_groups.group.resolved) {
    gid = opt.users_groups.group.gid;
  } else {
    gid = getgid();
  }

  /* Set a login environment */
  if (set(OPT_LOGIN)) {
    struct users_groups *ug = NULL;
    struct users_groups current = { 0 };

    /* Prefer -u, -U then current user, but if the preferred choice
     * is not resolved then we don't set a login environment, rather
     * than picking an unpredictable one. */
    if (set(OPT_SETUIDGID))
      ug = &opt.users_groups;
    else if (set(OPT_ENVUIDGID))
      ug = &opt.env_users_groups;
    else
      usrgrp_resolve_uid(ug = &current, getuid(), true);

    if (ug->user.resolved) {
      if (ug->username && *ug->username) {
        setenv("USER", ug->username, 1);
        setenv("LOGNAME", ug->username, 1);
      }
      if (ug->home && *ug->home) {
        setenv("HOME", ug->home, 1);
      }
      if (ug->shell && *ug->shell) {
        setenv("SHELL", ug->shell, 1);
      }
    }
  }

  /* With no program given, run the shell named by the environment */
  if (argc == optind) {
    sub_argv[0] = getenv("SHELL");
    if (sub_argv[0] == NULL) {
      /* There is nothing to execute; do not hand NULL to execvp() */
      fprintf(stderr, "%s: no program given and SHELL is not set\n",
              program_invocation_short_name);
      goto finish;
    }
  }

  executable = sub_argv[0];
  if (opt.argv0)
    sub_argv[0] = opt.argv0;

  if (opt.app_name == NULL)
    opt.app_name = basename(sub_argv[0]);

  {
    /* Ownership for the pre-created directories; -1 leaves it unchanged */
    uid_t o = set(OPT_SETUIDGID) ? uid : (uid_t) -1;
    gid_t g = set(OPT_SETUIDGID) ? gid : (gid_t) -1;

    if (set(OPT_RUN_DIR) &&
        precreate_dir("/run", 0755, o, g) == -1)
      goto finish;

    if (set(OPT_STATE_DIR) &&
        precreate_dir("/var/lib", 0755, o, g) == -1)
      goto finish;

    if (set(OPT_CACHE_DIR) &&
        precreate_dir("/var/cache", 0755, o, g) == -1)
      goto finish;

    if (set(OPT_LOG_DIR) &&
        precreate_dir("/var/log", 0755, o, g) == -1)
      goto finish;
  }

  /* Iff using a user namespace, drop the user first */
  if (set(OPT_SETUIDGID) &&
      opt.new_ns & CLONE_NEWUSER &&
      !drop_user(&opt.users_groups))
      goto finish;

  if (opt.idmap != IDMAP_SELF)
    nshelper.needed++;

  if (nshelper.needed) {
    uint64_t e;
    /* The eventfd is close-on-exec so that it does not leak into the
     * target program; the helper is only forked, so it keeps its copy. */
    if (open_proc_self() == -1 ||
        (nshelper.sync_fd = eventfd(0, EFD_CLOEXEC)) == -1 ||
        (nshelper.pid = fork()) == -1) {
      perror("ns_helper: open(\"/proc/self\"); eventfd(); fork()");
      goto finish;
    }
    if (nshelper.pid == 0) {
      /* Helper: sleep until the parent has unshared, then write its maps */
      while ((rc = read(nshelper.sync_fd, &e, sizeof e)) == -1 &&
             (errno == EAGAIN || errno == EINTR));
      if (rc != sizeof e || e != 1) {
        perror("ns_helper: read() and sync");
        goto finish;
      }
      exit(ns_helper(gid, uid));
    }
    /* Any state (variables) relied upon by the namespace helper must not
     * change between here and the sync. */
  }

  if (opt.new_ns) {
    rc = unshare(opt.new_ns);
    if (rc == -1) {
      perror(NAME_STR ": unshare()");
      goto finish;
    }
    if (is_debug()) fprintf(stderr, "created 0x%x namespaces\n", opt.new_ns);

    if (opt.new_ns & CLONE_NEWNS) {
      rc = mount(NULL, "/", NULL,
                 MS_REC | MS_SLAVE, NULL);
      if (rc == -1)
        fprintf(stderr, "recursive remounting / as MS_SLAVE: %s\n", strerror(errno));
    }
  }

  if (opt.new_ns & CLONE_NEWUSER) {
    rc = prctl(PR_SET_DUMPABLE, 1);
    setgroups(0, NULL);
    write_proc_self_once("setgroups", "%s", "deny\n");
    ns_helper(gid, uid);
    if (opt.idmap == IDMAP_SELF &&
        (setresgid(0, 0, 0) != 0 ||
         setresuid(0, 0, 0) != 0))
      fprintf(stderr, "warning: error becoming root in user namespace, %s\n", strerror(errno));
  }

  if (opt.net_adopt) {
    /* failed_op names the step in progress, so it labels any error */
    const char *failed_op;
    int old_dir;
    if ((failed_op = "open .", old_dir = open(".", O_DIRECTORY | O_RDONLY)) == -1)
      goto adopt_fail0;
    /* we shouldn't worry about chdir failing if path is absolute */
    if ((failed_op = "chdir", *opt.net_adopt != '/' && chdir("/var/run/netns") < 0) ||
        (failed_op = "open adopted ns", fd = open(opt.net_adopt, O_RDONLY)) < 0)
      goto adopt_fail1;
    if ((failed_op = "setns", setns(fd, CLONE_NEWNET)) < 0 ||
        (failed_op = "umount2", umount2(opt.net_adopt, MNT_DETACH)) < 0 ||
        (failed_op = "unlink", unlink(opt.net_adopt) < 0))
      goto adopt_fail2;
    failed_op = NULL;
    if (opt.verbosity > 0) fprintf(stderr, "adopted net ns\n");
adopt_fail2:
    close(fd);
    if (fchdir(old_dir) == -1)
      failed_op = "fchdir to original cwd";
adopt_fail1:
    close(old_dir);
adopt_fail0:
    if (failed_op) {
      perror(failed_op);
      goto finish;
    }
  }

  if (opt.net_adopt || opt.new_ns & CLONE_NEWNET)
    special_mount("/sys", "sysfs", "sysfs", NULL);

  if (set(OPT_FORK_JOIN)) {
    bool subreaper = set(OPT_FGHACK) && !(opt.new_ns & CLONE_NEWPID);

    if (!sig_proxy_mask(&newmask, &oldmask))
      goto finish;

    if (subreaper && prctl(PR_SET_CHILD_SUBREAPER, 1) == -1) {
      perror("becoming subreaper for fghack");
      goto finish;
    }

    child = fork();
    if (child == -1) {
      perror("fork");
      goto finish;
    } else if (child != 0) {
      goto join;
    } else {
      if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1)
        perror("warning: could not restore signal mask in child");
    }
  }

  /*************************************
   *  Inside child if fork-join used   *
   *************************************/

  pwd = get_current_dir_name();
  if (pwd == NULL)
    perror("warning: could not save location of starting directory");

  if (set(OPT_NEW_ROOT)) {
    /* mounting special filesystems after pivoting root will make the kernel
     * think this is an attempt to bypass a sandboxing program that wants to
     * hide proc and sys from us. We must mount them while the original mounts
     * are still visible to prevent a 'Mount too revealing' error in dmesg */
    if (!create_new_root(basename(executable), &pivot) || !pivot_to_new_root(&pivot))
      goto finish;
  } else {
    /* we can un+remount without much issue */
    if (opt.new_ns & CLONE_NEWPID)
      special_mount("/proc", "proc", "procfs", NULL);
  }

  if (set(OPT_PRIVATE_RUN) &&
      private_mount("/run") == -1)
    goto finish;

  if (set(OPT_PRIVATE_TMP) &&
      (private_mount("/tmp") == -1 ||
       private_mount("/var/tmp") == -1))
    goto finish;

  if (set(OPT_PROTECT_HOME) &&
      (private_mount("/home") == -1 ||
       private_mount("/root") == -1 ||
       private_mount("/run/user") == -1))
    goto finish;

  if (!set(OPT_PROTECT_HOME) && set(OPT_RO_HOME) &&
      (remount_ro("/home") == -1 ||
       remount_ro("/root") == -1 ||
       remount_ro("/run/user") == -1))
    goto finish;

  if (set(OPT_RO_SYS) &&
      remount_sys_ro() == -1)
    goto finish;

  if (set(OPT_RO_ETC) &&
      (remount_ro("/etc") == -1))
    goto finish;

  /* Re-enter the starting directory by name once the mounts have changed */
  if ((set(OPT_NEW_ROOT) ||
       set(OPT_PRIVATE_RUN) ||
       set(OPT_PRIVATE_TMP) ||
       set(OPT_PROTECT_HOME) ||
       set(OPT_RO_SYS) ||
       set(OPT_RO_ETC)) &&
      (pwd != NULL && chdir(pwd) == -1))
    perror("warning: could not change to original directory relative to new fs");

  if (opt.chroot) {
    rc = chdir(opt.chroot);
    if (rc == -1) {
      perror("chdir for chroot");
      goto finish;
    }
    rc = chroot(".");
    if (rc == -1) {
      perror("chroot");
      goto finish;
    }
    if (is_verbose())
      fprintf(stderr, "entered chroot: %s\n", opt.chroot);
  }

  if (opt.chdir) {
    rc = chdir(opt.chdir);
    if (rc == -1) {
      perror("chdir");
      goto finish;
    }
    if (is_verbose())
      fprintf(stderr, "change directory: %s\n", opt.chdir);
  }

  /* Without a user namespace the user is dropped as late as possible */
  if (set(OPT_SETUIDGID) &&
      (opt.new_ns & CLONE_NEWUSER) == 0 &&
      !drop_user(&opt.users_groups))
      goto finish;

#ifndef NO_CAP
  if (opt.caps_op != CAP_OP_NONE)
    if (!drop_capabilities())
      goto finish;
#endif

  if (set(OPT_FGHACK) &&
      opt.new_ns & CLONE_NEWPID &&
      !pico_init(open_proc_self()))
    goto finish;

  if (opt.dev_null_fds) {
    /* Opened read-write because the descriptors being replaced may be used
     * for output as well as for input. */
    int devnull = open("/dev/null", O_RDWR);
    if (devnull == -1) {
      perror("open /dev/null");
      goto finish;
    }
    /* Walk the set bits from the lowest upwards; unsigned shift so that
     * bit 31 is well defined. */
    for (unsigned int fds = opt.dev_null_fds; fds; fds &= ~(1u << fd))
      if (dup2(devnull, fd = ffs(fds) - 1) == -1) {
        perror("dup2");
        close(devnull);
        goto finish;
      }
    close(devnull);
  }

  for (unsigned int close_fds = opt.close_fds; close_fds; close_fds &= ~(1u << fd))
    close(fd = ffs(close_fds) - 1);

  if (set(OPT_NO_NEW_PRIVS) && prctl(PR_SET_NO_NEW_PRIVS, 1L, 0L, 0L, 0L) == -1)
    perror("could not honour --no-new-privs");

  pivot_tidy(&pivot);

  set_resource_limits();

  /* Launch the target */
  rc = execvp(executable, sub_argv);

  /* Handle errors launching */
  assert(rc == -1);
  perror(NAME_STR ": execvp");

join:
  if (set(OPT_FORK_JOIN) && child != 0)
    join(child, &newmask, &oldmask, &ret,
         set(OPT_FGHACK) && !(opt.new_ns & CLONE_NEWPID),
         open_proc_self(), "fork-join");

finish:
  /* Actions here should be
     1) suitable for if exec() fails.
     2) clean up if --fork-join is used.
     3) not be necessary when --fork-join is not used.
   */

  close_proc_self();
  free_rootfs_data();
  pivot_tidy(&pivot);
  close_run_dir();

  if (lock_fd != -1)
    close(lock_fd);

  free(pwd);
  free(sub_argv);
  free(extra_env.gidlist);

finish0:
  options_free();
  return ret;
}
