void nsenter()
{
int argc, c;
char **argv;
get_args(&argc, &argv);
// check argv 0 to ensure that we are supposed to setns
// we use strncmp to test for a value of "nsenter" but also allows alternate implmentations
// after the setns code path to continue to use the argv 0 to determine actions to be run
// resulting in the ability to specify "nsenter-mknod", "nsenter-exec", etc...
if (strncmp(argv[0], kNsEnter, strlen(kNsEnter)) != 0) {
return;
}
#ifdef PR_SET_CHILD_SUBREAPER
if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) == -1) {
pr_perror("Failed to set child subreaper");
exit(1);
}
#endif
static const struct option longopts[] = {
{"nspid", required_argument, NULL, 'n'},
{"console", required_argument, NULL, 't'},
{NULL, 0, NULL, 0}
};
pid_t init_pid = -1;
char *init_pid_str = NULL;
char *console = NULL;
while ((c = getopt_long_only(argc, argv, "n:c:", longopts, NULL)) != -1) {
switch (c) {
case 'n':
init_pid_str = optarg;
break;
case 't':
console = optarg;
break;
}
}
if (init_pid_str == NULL) {
print_usage();
exit(1);
}
init_pid = strtol(init_pid_str, NULL, 10);
if ((init_pid == 0 && errno == EINVAL) || errno == ERANGE) {
pr_perror("Failed to parse PID from \"%s\" with output \"%d\"",
init_pid_str, init_pid);
print_usage();
exit(1);
}
argc -= 3;
argv += 3;
if (setsid() == -1) {
pr_perror("setsid failed");
exit(1);
}
// before we setns we need to dup the console
int consolefd = -1;
if (console != NULL) {
consolefd = open(console, O_RDWR);
if (consolefd < 0) {
pr_perror("Failed to open console %s", console);
exit(1);
}
}
// blocking until the parent placed the process inside correct cgroups.
unsigned char s;
if (read(3, &s, 1) != 1 || s != '1') {
pr_perror("failed to receive synchronization data from parent");
exit(1);
}
// Setns on all supported namespaces.
char ns_dir[PATH_MAX];
memset(ns_dir, 0, PATH_MAX);
snprintf(ns_dir, PATH_MAX - 1, "/proc/%d/ns/", init_pid);
int ns_dir_fd;
ns_dir_fd = open(ns_dir, O_RDONLY | O_DIRECTORY);
if (ns_dir_fd < 0) {
pr_perror("Unable to open %s", ns_dir);
exit(1);
}
char *namespaces[] = { "ipc", "uts", "net", "pid", "mnt" };
const int num = sizeof(namespaces) / sizeof(char *);
int i;
for (i = 0; i < num; i++) {
// A zombie process has links on namespaces, but they can't be opened
struct stat st;
if (fstatat(ns_dir_fd, namespaces[i], &st, AT_SYMLINK_NOFOLLOW)
== -1) {
if (errno == ENOENT)
continue;
pr_perror("Failed to stat ns file %s for ns %s",
ns_dir, namespaces[i]);
exit(1);
//.........part of the code is omitted here.........
//.........part of the code is omitted here.........
vma_area->e->status |= VMA_AREA_VSYSCALL;
} else if (!strcmp(file_path, "[vdso]")) {
if (handle_vdso_vma(vma_area))
goto err;
} else if (!strcmp(file_path, "[vvar]")) {
if (handle_vvar_vma(vma_area))
goto err;
} else if (!strcmp(file_path, "[heap]")) {
vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP;
} else {
vma_area->e->status = VMA_AREA_REGULAR;
}
/*
* Some mapping hints for restore, we save this on
* disk and restore might need to analyze it.
*/
if (vma_area->file_borrowed) {
struct vma_area *prev = prev_vfi->vma;
/*
* Pick-up flags that might be set in the branch below.
* Status is copied as-is as it should be zero here,
* and have full match with the previous.
*/
vma_area->e->flags |= (prev->e->flags & MAP_ANONYMOUS);
vma_area->e->status = prev->e->status;
vma_area->e->shmid = prev->e->shmid;
vma_area->vmst = prev->vmst;
vma_area->mnt_id = prev->mnt_id;
} else if (vma_area->vm_file_fd >= 0) {
struct stat *st_buf = vma_area->vmst;
if (S_ISREG(st_buf->st_mode))
/* regular file mapping -- supported */;
else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO))
/* devzero mapping -- also makes sense */;
else {
pr_err("Can't handle non-regular mapping on %d's map %"PRIx64"\n", pid, vma_area->e->start);
goto err;
}
/*
* /dev/zero stands for anon-shared mapping
* otherwise it's some file mapping.
*/
if (is_anon_shmem_map(st_buf->st_dev)) {
if (!(vma_area->e->flags & MAP_SHARED))
goto err_bogus_mapping;
vma_area->e->flags |= MAP_ANONYMOUS;
vma_area->e->status |= VMA_ANON_SHARED;
vma_area->e->shmid = st_buf->st_ino;
if (!strncmp(file_path, "/SYSV", 5)) {
pr_info("path: %s\n", file_path);
vma_area->e->status |= VMA_AREA_SYSVIPC;
}
} else {
if (vma_area->e->flags & MAP_PRIVATE)
vma_area->e->status |= VMA_FILE_PRIVATE;
else
vma_area->e->status |= VMA_FILE_SHARED;
}
/*
* We cannot use the mnt_id value provided by the kernel
* for vm_file_fd if it is an AUFS file (the value is
* wrong). In such a case, fixup_aufs_vma_fd() has set
* mnt_id to -1 to mimic pre-3.15 kernels that didn't
* have mnt_id.
*/
if (vma_area->mnt_id != -1 &&
get_fd_mntid(vma_area->vm_file_fd, &vma_area->mnt_id))
return -1;
} else {
/*
* No file but mapping -- anonymous one.
*/
if (vma_area->e->flags & MAP_SHARED) {
vma_area->e->status |= VMA_ANON_SHARED;
vma_area->e->shmid = vfi->ino;
} else {
vma_area->e->status |= VMA_ANON_PRIVATE;
}
vma_area->e->flags |= MAP_ANONYMOUS;
}
return 0;
err:
return -1;
err_bogus_mapping:
pr_err("Bogus mapping 0x%"PRIx64"-0x%"PRIx64" (flags: %#x vm_file_fd: %d)\n",
vma_area->e->start, vma_area->e->end,
vma_area->e->flags, vma_area->vm_file_fd);
goto err;
err_bogus_mapfile:
pr_perror("Can't open %d's mapfile link %"PRIx64, pid, vma_area->e->start);
goto err;
}
static int vma_get_mapfile(char *fname, struct vma_area *vma, DIR *mfd,
struct vma_file_info *vfi, struct vma_file_info *prev_vfi)
{
char path[32];
if (prev_vfi->vma && vfi_equal(vfi, prev_vfi)) {
struct vma_area *prev = prev_vfi->vma;
/*
* If vfi is equal (!) and negative @vm_file_fd --
* we have nothing to borrow for sure.
*/
if (prev->vm_file_fd < 0)
return 0;
pr_debug("vma %"PRIx64" borrows vfi from previous %"PRIx64"\n",
vma->e->start, prev->e->start);
vma->vm_file_fd = prev->vm_file_fd;
if (prev->e->status & VMA_AREA_SOCKET)
vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR;
/*
* FIXME -- in theory there can be vmas that have
* dev:ino match, but live in different mount
* namespaces. However, we only borrow files for
* subsequent vmas. These are _very_ likely to
* have files from the same namespaces.
*/
vma->file_borrowed = true;
return 0;
}
/* Figure out if it's file mapping */
snprintf(path, sizeof(path), "%"PRIx64"-%"PRIx64, vma->e->start, vma->e->end);
/*
* Note that we "open" it in dumper process space
* so later we might refer to it via /proc/self/fd/vm_file_fd
* if needed.
*/
vma->vm_file_fd = openat(dirfd(mfd), path, O_RDONLY);
if (vma->vm_file_fd < 0) {
if (errno == ENOENT)
/* Just mapping w/o map_files link */
return 0;
if (errno == ENXIO) {
struct stat buf;
if (fstatat(dirfd(mfd), path, &buf, 0))
return -1;
if (S_ISSOCK(buf.st_mode)) {
pr_info("Found socket mapping @%"PRIx64"\n", vma->e->start);
vma->vm_socket_id = buf.st_ino;
vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR;
return 0;
}
if ((buf.st_mode & S_IFMT) == 0 && !strcmp(fname, AIO_FNAME)) {
/* AIO ring, let's try */
close(vma->vm_file_fd);
vma->aio_nr_req = -1;
vma->e->status = VMA_AREA_AIORING;
return 0;
}
pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname);
}
return -1;
}
vma->vmst = xmalloc(sizeof(struct stat));
if (!vma->vmst)
return -1;
/*
* For AUFS support, we need to check if the symbolic link
* points to a branch. If it does, we cannot fstat() its file
* descriptor because it would return a different dev/ino than
* the real file. If fixup_aufs_vma_fd() returns positive,
* it means that it has stat()'ed using the full pathname.
* Zero return means that the symbolic link does not point to
* a branch and we can do fstat() below.
*/
if (opts.aufs) {
int ret;
ret = fixup_aufs_vma_fd(vma);
if (ret < 0)
return -1;
if (ret > 0)
return 0;
}
if (fstat(vma->vm_file_fd, vma->vmst) < 0) {
pr_perror("Failed fstat on map %"PRIx64"", vma->e->start);
return -1;
//.........part of the code is omitted here.........
/*
 * Check that a task's cgroup path is reported relative to the cgroup
 * namespace of whoever is looking.
 *
 * Unless ZDTM_NEWNS is set in the environment, the test first calls
 * mount_and_add(cgname, "test") and unshares into a new cgroup namespace.
 * Once test_daemon()/test_waitsig() have returned it verifies that:
 *   - this process is seen at "/" in the "name=<cgname>" hierarchy, and
 *   - a forked child that has setns()'ed into the cgroup namespace of
 *     pid 1 sees this process at "/test".
 *
 * Returns 0 when both checks pass, -1 otherwise.  The directories built
 * from dirname/cgname are unmounted and removed on the way out.
 */
int main(int argc, char **argv)
{
	int ret = -1, fd, status, n;
	char path[PATH_MAX];
	pid_t pid;

	if (!getenv("ZDTM_NEWNS")) {
		if (mount_and_add(cgname, "test") < 0)
			return -1;

		if (unshare(CLONE_NEWCGROUP) < 0) {
			pr_perror("unshare");
			goto out;
		}
	}

	test_init(argc, argv);
	test_daemon();
	test_waitsig();

	/* cgname comes from outside this function, so bound the write. */
	n = snprintf(path, sizeof(path), "name=%s", cgname);
	if (n < 0 || (size_t)n >= sizeof(path)) {
		pr_err("cgroup name %s is too long\n", cgname);
		goto out;
	}

	/* first check that the task is in zdtmtst:/ */
	if (!pid_in_cgroup(getpid(), path, "/")) {
		fail("pid not in cgroup /");
		goto out;
	}

	/* now check that the task is in the right place in a ns by setnsing to
	 * someone else's ns and looking there.
	 */
	pid = fork();
	if (pid < 0) {
		pr_perror("fork");
		goto out;
	}

	if (pid == 0) {
		snprintf(path, sizeof(path), "/proc/%d/ns/cgroup", 1);
		fd = open(path, O_RDONLY);
		if (fd < 0) {
			pr_perror("open");
			exit(1);
		}

		ret = setns(fd, CLONE_NEWCGROUP);
		close(fd);
		if (ret < 0) {
			pr_perror("setns");
			exit(1);
		}

		/*
		 * Same format and argument as the length-checked call in the
		 * parent, so this cannot truncate.
		 */
		snprintf(path, sizeof(path), "name=%s", cgname);
		if (!pid_in_cgroup(getppid(), path, "/test")) {
			fail("pid not in cgroup %s", path);
			exit(1);
		}

		exit(0);
	}

	if (pid != waitpid(pid, &status, 0)) {
		/* pr_err() does not append a newline on its own. */
		pr_err("wrong pid\n");
		goto out;
	}

	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
		pr_err("got bad exit status %d\n", status);
		goto out;
	}

	ret = 0;
	pass();
out:
	/*
	 * Best-effort cleanup; return values are deliberately ignored.  A
	 * truncated path is skipped so that an unrelated (shorter) path is
	 * never unmounted or removed by accident.
	 */
	n = snprintf(path, sizeof(path), "%s/%s/test", dirname, cgname);
	if (n >= 0 && (size_t)n < sizeof(path))
		rmdir(path);

	n = snprintf(path, sizeof(path), "%s/%s", dirname, cgname);
	if (n >= 0 && (size_t)n < sizeof(path)) {
		umount(path);
		rmdir(path);
	}

	rmdir(dirname);
	return ret;
}
int main(int argc, char **argv)
{
int ret = 0;
pid_t pid;
int i;
uint8_t buf[0x100000];
int pipes[PROCS_MAX * 2];
int in, out;
test_init(argc, argv);
if (num_procs > PROCS_MAX) {
pr_err("%d processes is too many: max = %d\n", num_procs, PROCS_MAX);
exit(1);
}
for (i = 0; i < num_procs; i++)
if (pipe(pipes + i * 2)) {
pr_perror("Can't create pipes");
exit(1);
}
if (signal(SIGCHLD, inc_num_exited) == SIG_ERR) {
pr_perror("can't set SIGCHLD handler");
exit(1);
}
for (i = 1; i < num_procs; i++) { /* i = 0 - parent */
pid = test_fork();
if (pid < 0) {
pr_perror("Can't fork");
kill(0, SIGKILL);
exit(1);
}
if (pid == 0) {
int j;
in = i * 2;
out = in - 1;
for (j = 0; j < num_procs * 2; j++)
if (j != in && j != out)
close(pipes[j]);
signal(SIGPIPE, SIG_IGN);
if (pipe_in2out(pipes[in], pipes[out], buf, sizeof(buf)) < 0)
/* pass errno as exit code to the parent */
if (test_go() /* signal NOT delivered */ ||
(errno != EINTR && errno != EPIPE))
ret = errno;
test_waitsig(); /* even if failed, wait for migration to complete */
close(pipes[in]);
close(pipes[out]);
exit(ret);
}
}
for (i = 1; i < num_procs * 2 - 1; i++)
close(pipes[i]);
in = pipes[0];
out = pipes[num_procs * 2 - 1];
/* don't block on writing, _do_ block on reading */
if (set_nonblock(out,1) < 0) {
pr_perror("setting O_NONBLOCK failed");
exit(1);
}
if (num_exited) {
pr_err("Some children died unexpectedly\n");
kill(0, SIGKILL);
exit(1);
}
test_daemon();
while (test_go()) {
int len, rlen = 0, wlen;
uint8_t rbuf[sizeof(buf)], *p;
datagen(buf, sizeof(buf), NULL);
wlen = write(out, buf, sizeof(buf));
if (wlen < 0) {
if (errno == EINTR)
continue;
else {
fail("write failed: %m\n", i);
ret = 1;
break;
}
}
for (p = rbuf, len = wlen; len > 0; p += rlen, len -= rlen) {
rlen = read(in, p, len);
if (rlen <= 0)
break;
}
if (rlen < 0 && errno == EINTR)
//.........part of the code is omitted here.........
int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
{
siginfo_t si;
int status;
int ret, ret2, ptrace_errno, wait_errno = 0;
struct proc_status_creds cr;
/*
* For the comparison below, let's zero out any padding.
*/
memzero(&cr, sizeof(struct proc_status_creds));
ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
ptrace_errno = errno;
if (ret == 0) {
/*
* If we SEIZE-d the task stop it before going
* and reading its stat from proc. Otherwise task
* may die _while_ we're doing it and we'll have
* inconsistent seize/state pair.
*
* If task dies after we seize it but before we
* do this interrupt, we'll notice it via proc.
*/
ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
if (ret < 0) {
pr_perror("SEIZE %d: can't interrupt task", pid);
ptrace(PTRACE_DETACH, pid, NULL, NULL);
goto err;
}
}
/*
* It's ugly, but the ptrace API doesn't allow to distinguish
* attaching to zombie from other errors. Thus we have to parse
* the target's /proc/pid/stat. Sad, but parse whatever else
* we might need at that early point.
*/
try_again:
if (!ret) {
ret = wait4(pid, &status, __WALL, NULL);
wait_errno = errno;
}
ret2 = parse_pid_status(pid, &cr);
if (ret2)
goto err;
if (!may_dump(&cr)) {
pr_err("Check uid (pid: %d) failed\n", pid);
goto err;
}
if (ret < 0) {
if (cr.state != 'Z') {
if (pid == getpid())
pr_err("The criu itself is within dumped tree.\n");
else
pr_err("Unseizable non-zombie %d found, state %c, err %d/%d/%d\n",
pid, cr.state, ret, ptrace_errno, wait_errno);
return -1;
}
return TASK_DEAD;
}
if ((ppid != -1) && (cr.ppid != ppid)) {
pr_err("Task pid reused while suspending (%d: %d -> %d)\n",
pid, ppid, cr.ppid);
goto err;
}
if (!WIFSTOPPED(status)) {
pr_err("SEIZE %d: task not stopped after seize\n", pid);
goto err;
}
ret = ptrace(PTRACE_GETSIGINFO, pid, NULL, &si);
if (ret < 0) {
pr_perror("SEIZE %d: can't read signfo", pid);
goto err;
}
if (SI_EVENT(si.si_code) != PTRACE_EVENT_STOP) {
/*
* Kernel notifies us about the task being seized received some
* event other than the STOP, i.e. -- a signal. Let the task
* handle one and repeat.
*/
if (ptrace(PTRACE_CONT, pid, NULL,
(void *)(unsigned long)si.si_signo)) {
pr_perror("Can't continue signal handling, aborting");
goto err;
}
ret = 0;
goto try_again;
}
//.........part of the code is omitted here.........
Developer ID: gablg1, project: criu, lines of code: 101, source file: ptrace.c
Example 18: collect_filter_for_pstree
/*
 * Collect the seccomp filters attached to @item and record them in the
 * global filters[] table.
 *
 * Filters are fetched one index at a time with PTRACE_SECCOMP_GET_FILTER
 * until the kernel answers ENOENT.  Each fetched program is first looked up
 * via find_inherited() on the parent; on a hit the existing seccomp_info is
 * reused as the new chain head, otherwise a fresh seccomp_info holding a
 * copy of the program is allocated and pushed onto the chain (linked
 * through ->prev).
 *
 * On success the chain entries get consecutive ids starting at
 * next_filter_id, are stored in filters[], and the id of the chain head is
 * saved in the task's pi_creds->last_filter.
 *
 * Returns 0 on success, including the case where the task is dead or not
 * in SECCOMP_MODE_FILTER and there is nothing to do; -1 on error.
 */
static int collect_filter_for_pstree(struct pstree_item *item)
{
	struct seccomp_info *infos = NULL, *cursor;
	int info_count, i, ret = -1;
	struct sock_filter buf[BPF_MAXINSNS];
	void *m;

	if (item->state == TASK_DEAD ||
	    dmpi(item)->pi_creds->seccomp_mode != SECCOMP_MODE_FILTER)
		return 0;

	for (i = 0; true; i++) {
		int len;
		struct seccomp_info *info, *inherited = NULL;

		/* On success len is the number of sock_filter entries in buf. */
		len = ptrace(PTRACE_SECCOMP_GET_FILTER, item->pid.real, i, buf);
		if (len < 0) {
			if (errno == ENOENT) {
				/* end of the search */
				BUG_ON(i == 0);
				goto save_infos;
			} else if (errno == EINVAL) {
				pr_err("dumping seccomp infos not supported\n");
				goto out;
			} else {
				pr_perror("couldn't dump seccomp filter");
				goto out;
			}
		}

		inherited = find_inherited(item->parent, buf, len);
		if (inherited) {
			bool found = false;

			/* Small sanity check: if infos is already populated,
			 * we should have inherited that filter too. */
			for (cursor = infos; cursor; cursor = cursor->prev) {
				if (inherited->prev == cursor) {
					found = true;
					break;
				}
			}

			BUG_ON(!found);
			infos = inherited;
			continue;
		}

		info = xmalloc(sizeof(*info));
		if (!info)
			goto out;
		seccomp_filter__init(&info->filter);

		info->filter.filter.len = len * sizeof(struct sock_filter);
		info->filter.filter.data = xmalloc(info->filter.filter.len);
		if (!info->filter.filter.data) {
			/*
			 * info is not linked into the infos chain yet, so the
			 * cleanup loop at out: would never reach it.
			 */
			xfree(info);
			goto out;
		}

		memcpy(info->filter.filter.data, buf, info->filter.filter.len);

		info->prev = infos;
		infos = info;
	}

save_infos:
	/*
	 * NOTE(review): i also counts iterations that took the inherited
	 * branch above, so info_count includes reused entries -- confirm
	 * that re-numbering them below is intended.
	 */
	info_count = i;

	/* Grow via a temporary so filters stays valid if xrealloc fails. */
	m = xrealloc(filters, sizeof(*filters) * (next_filter_id + info_count));
	if (!m)
		goto out;
	filters = m;

	/* Walk from the chain head backwards, handing out ids high to low. */
	for (cursor = infos, i = info_count + next_filter_id - 1;
	     i >= next_filter_id; i--) {
		BUG_ON(!cursor);
		cursor->id = i;
		filters[i] = cursor;
		cursor = cursor->prev;
	}

	next_filter_id += info_count;

	dmpi(item)->pi_creds->last_filter = infos->id;

	/* Don't free the part of the tree we just successfully acquired */
	infos = NULL;
	ret = 0;
out:
	/*
	 * NOTE(review): if infos was replaced by an inherited entry before a
	 * later failure, this loop also frees entries reachable from it --
	 * verify who owns those.
	 */
	while (infos) {
		struct seccomp_info *freeme = infos;

		infos = infos->prev;
		xfree(freeme->filter.filter.data);
		xfree(freeme);
	}

	return ret;
}
Please leave a comment