Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <20250122214534.2826650-1-safinaskar@zohomail.com>
Date: Thu, 23 Jan 2025 00:45:34 +0300
From: Askar Safin <safinaskar@...omail.com>
To: dalias@...c.org
Cc: musl@...ts.openwall.com,
	fw@...eb.enyo.de
Subject: Re: [bug] Ctrl-Z when process is doing posix_spawn makes the process hard to kill

 ---- On Sat, 18 Jan 2025 15:17:02 +0400  Rich Felker  wrote --- 
 > I don't understand what you think the kernel bug is.

On io-uring@...r.kernel.org I recently got a suggestion to use CLONE_VM
( https://lore.kernel.org/io-uring/9ee30fc7-0329-4a69-b686-3131ce323c97@gmail.com/ )

So I tried CLONE_VM and it worked! I.e., the Ctrl-Z bug no longer reproduced.

I also compared various methods of spawning. My testing shows that all
methods based on vfork, CLONE_VFORK or posix_spawn (which, as far as I understand,
is based on vfork, too) are buggy, and all others are not.

For each method I noted in a comment whether the bug is reproducible on glibc
and musl.

At the end of this message you will find the full source.

So it may be a good idea to replace vfork with CLONE_VM in musl and glibc.

Also, my CLONE_VM-based implementation is essentially a reimplementation of vfork, but
in userspace. And it works. I.e., the actual kernel implementation of vfork doesn't work,
while its userspace emulation does. This is a strong argument for the view that
vfork is buggy in the kernel.

--
Askar Safin
https://types.pl/@safinaskar

Source:

#define _GNU_SOURCE

#include <spawn.h>
#include <err.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sched.h>
#include <string.h>
#include <sys/wait.h>

/* Argument vector handed to every spawned program: just argv[0], NULL-terminated. */
char *args[] = {"/bin/true", NULL};
/* Environment handed to every spawned program: a single HOME=/ entry, NULL-terminated. */
char *env[] = {"HOME=/", NULL};

// repro glibc
// repro musl
/*
 * Spawn /bin/true with posix_spawn(), using the file-level args/env vectors.
 *
 * Returns the pid of the new child.  On failure the process exits with
 * status 1.  posix_spawn() reports failure through its return value (an
 * errno-style error number) rather than through errno, so the number is
 * translated with strerror() instead of being dropped.
 */
pid_t
spawn_via_posix_spawn (void)
{
    pid_t pid;
    int rc = posix_spawn (&pid, "/bin/true", NULL, NULL, args, env);

    if (rc != 0)
        {
            errx (1, "posix_spawn: %s", strerror (rc));
        }
    return pid;
}

// not repro glibc
// not repro musl
/*
 * Spawn /bin/true with a classic fork() + execve().
 *
 * Returns the pid of the child in the parent.  If fork() fails the process
 * exits with status 1.  If execve() fails, the child reports the error and
 * terminates with status 1.
 */
pid_t
spawn_via_fork (void)
{
    pid_t pid = fork ();
    if (pid == -1)
        {
            err (1, "fork");
        }
    if (pid == 0)
        {
            execve ("/bin/true", args, env);
            /* Only reached when execve failed.  Leave with _exit() rather
             * than err()/exit(): the child is a copy of the parent and must
             * not run the parent's atexit handlers or flush its duplicated
             * stdio buffers. */
            warn ("execve");
            _exit (1);
        }
    return pid;
}

// repro glibc
// repro musl
/*
 * Spawn /bin/true with vfork() + execve().
 *
 * Returns the pid of the child in the parent.  If vfork() fails the process
 * exits with status 1.  If execve() fails, the child writes a short message
 * to stderr and terminates with status 1.
 */
pid_t
spawn_via_vfork (void)
{
    pid_t pid = vfork ();
    if (pid == -1)
        {
            err (1, "vfork");
        }
    if (pid == 0)
        {
            execve ("/bin/true", args, env);
            /* Only reached when execve failed.  The vfork child still runs
             * in the parent's memory and on the parent's stack, so it must
             * not touch stdio or call exit() (which err() does): that would
             * flush the parent's buffers and run its atexit handlers.
             * Restrict ourselves to write() and _exit(). */
            static const char msg[] = "execve failed\n";
            ssize_t ignored = write (STDERR_FILENO, msg, sizeof msg - 1);
            (void) ignored;
            _exit (1);
        }
    return pid;
}

/* Below we emulate vfork using CLONE_VM, synchronizing through an O_CLOEXEC pipe.
 * We rely heavily on one important property: during execve, Linux first destroys the old
 * memory and only then closes all O_CLOEXEC fds. This is indeed the case, as the Linux
 * source shows:
 * https://elixir.bootlin.com/linux/v6.13-rc3/source/fs/exec.c#L1274
 * https://elixir.bootlin.com/linux/v6.13-rc3/source/fs/exec.c#L1312
 * As you can see, do_close_on_exec is called after exec_mmap.
 */

/* Pipe filled in by pipe2() in spawn_via_clone_vm: [0] is the read end, [1] the
 * write end.  File-level so that helper() can reach the descriptors. */
int pipe_fd[2];

/* Report a three-byte tag on stderr and terminate the whole process with
 * status 1, using raw syscalls only. */
static void
helper_fail (const char *tag)
{
    syscall (SYS_write, 2, tag, 3);
    syscall (SYS_exit_group, 1);
}

/*
 * Child entry point handed to clone() by spawn_via_clone_vm.
 *
 * Closes the child's copy of the pipe's read end and then replaces the
 * process image with /bin/true.  Everything goes through syscall() so that
 * libc state is left alone.  The argument is unused.
 */
int
helper (void *a)
{
    long close_rc = syscall (SYS_close, pipe_fd[0]);

    if (close_rc != 0)
        {
            helper_fail ("clo");
        }

    syscall (SYS_execve, "/bin/true", args, env);

    /* execve only comes back on failure. */
    helper_fail ("exe");
}

// not repro glibc
// not repro musl
/*
 * Spawn /bin/true by emulating vfork in userspace with clone(CLONE_VM).
 *
 * The child (helper) shares our address space and runs on a freshly mapped
 * stack.  An O_CLOEXEC pipe is used to learn when the child is done with the
 * shared memory: the parent blocks in read() until it sees EOF, which happens
 * once the child's write end is closed by the kernel during execve (or when
 * the child exits).
 *
 * Returns the pid of the child.  Any failure terminates the process with
 * status 1.
 */
pid_t
spawn_via_clone_vm (void)
{
    if (pipe2 (pipe_fd, O_CLOEXEC) == -1)
        {
            err (1, "pipe2");
        }

    // Begin of code, copied from "man 2 clone"

#define STACK_SIZE (1024 * 1024)    /* Stack size for cloned child */
    char            *stack;         /* Start of stack buffer */
    char            *stackTop;      /* End of stack buffer */

    /* Allocate memory to be used for the stack of the child. */
    stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
    if (stack == MAP_FAILED)
        {
            err (1, "mmap");
        }

    stackTop = stack + STACK_SIZE;  /* Assume stack grows downward */

    pid_t pid = clone (helper, stackTop, CLONE_VM | SIGCHLD, NULL);

    if (pid == -1)
        {
            err (1, "clone");
        }

    // End of code, copied from "man 2 clone"

    // Okay, so now we should wait for "execve". We will do this using that pipe
    // We will use "syscall" to avoid messing with libc's state
    // We cannot even rely on errno, because it is probably shared now

    /* Drop our write end first, otherwise read() below would never see EOF. */
    if (syscall (SYS_close, pipe_fd[1]) != 0)
        {
            syscall (SYS_write, 2, "clo", 3);
            syscall (SYS_exit_group, 1);
        }

    /* Nobody ever writes to the pipe, so the only successful result is 0 (EOF). */
    char buf[1];
    if (syscall (SYS_read, pipe_fd[0], buf, 1) != 0)
        {
            syscall (SYS_write, 2, "rea", 3);
            syscall (SYS_exit_group, 1);
        }

    // Okay, so the child did "execve", now we can continue running normally

    /* The child has exec'd or is exiting, so it no longer runs on the stack
     * we mapped for it.  Release it; without this every call leaks
     * STACK_SIZE bytes of mapping, and callers spawn in a loop. */
    if (munmap (stack, STACK_SIZE) != 0)
        {
            err (1, "munmap");
        }

    if (close (pipe_fd[0]) != 0)
        {
            err (1, "close");
        }

    return pid;
}

/*
 * clone() callback that simply executes /bin/true with the file-level
 * args/env vectors; judging by its name it is meant for
 * spawn_via_clone_vfork.  Unlike helper() it uses the ordinary libc wrappers.
 * The argument is unused.  If execve fails, the error is reported and the
 * process exits with status 1.
 */
int
helper_clone_vfork (void *a)
{
    static const char program[] = "/bin/true";

    (void) a;
    execve (program, args, env);

    /* Reaching this point means execve failed. */
    err (1, "execve");
}

// repro glibc
// repro musl
pid_t
spawn_via_clone_vfork (void)
{
    // Begin of code, copied from "man 2 clone"

#define STACK_SIZE (1024 * 1024)    /* Stack size for cloned child */
    char            *stack;         /* Start of stack buffer */
    char            *stackTop;      /* End of stack buffer */

    /* Allocate memory to be used for the stack of the child. */
    stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
    if (stack == MAP_FAILED)
        {
            err (1, "mmap");
        }

    stackTop = stack + STACK_SIZE;  /* Assume stack grows downward */

    pid_t pid = clone (helper, stackTop, CLONE_VFORK | SIGCHLD, NULL);

    if (pid == -1)
        {
            err (1, "clone");
        }

    // End of code, copied from "man 2 clone"

    return pid;
}

/*
 * Test driver: spawn a child and reap it, forever.  The loop only ends when
 * waitpid does not return the pid that was just spawned, in which case the
 * program exits with status 1.
 */
int
main (void)
{
    while (1)
        {
            // You can replace this line with some other "spawn_via_..." function
            pid_t child = spawn_via_clone_vfork ();
            pid_t reaped = waitpid (child, NULL, 0);

            if (reaped == child)
                {
                    continue;
                }
            err(1, "waitpid");
        }
}

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.