|
|
Message-ID: <20110706163140.GA24949@peqn>
Date: Wed, 6 Jul 2011 11:31:40 -0500
From: Serge Hallyn <serge.hallyn@...onical.com>
To: Vasiliy Kulikov <segoon@...nwall.com>
Cc: akpm@...ux-foundation.org, daniel.lezcano@...e.fr,
ebiederm@...ssion.com, mingo@...e.hu, oleg@...hat.com,
rdunlap@...otime.net, tj@...nel.org,
kernel-hardening@...ts.openwall.com
Subject: Re: [PATCH] shm: handle separate PID namespaces case
Quoting Vasiliy Kulikov (segoon@...nwall.com):
> On Tue, Jul 05, 2011 at 10:57 -0500, Serge Hallyn wrote:
> > Do you have all of these patches applied in one git tree? Could
> > you do a quick 'git diff master..' (assuming master is Linus' tree)
> > and send the result (or send a git url), and I'll take a last closer
> > look at the patch tomorrow?
>
> These patches are against 3.0-rc2, it should be simple to rebase to -rc5
> since ipc/ is not heavily changed nowadays. The patches are stored in
> -mm tree. The blob against -rc2 is as follows:
>
Thanks, Vasiliy. I don't see anything wrong with it, though I do
wory (see below) about contention on task exit? The rest of my
comments are just safe-to-ignore-of-you-like nits.
> ---
>
> Documentation/sysctl/kernel.txt | 22 ++++++++
> include/linux/ipc_namespace.h | 7 +++
> include/linux/shm.h | 7 +++
> ipc/ipc_sysctl.c | 36 +++++++++++++
> ipc/shm.c | 105 +++++++++++++++++++++++++++++++++++++--
> kernel/exit.c | 1 +
> 6 files changed, 174 insertions(+), 4 deletions(-)
> ---
> diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
> index 5e7cb39..4d7b866 100644
> --- a/Documentation/sysctl/kernel.txt
> +++ b/Documentation/sysctl/kernel.txt
> @@ -62,6 +62,7 @@ show up in /proc/sys/kernel:
> - shmall
> - shmmax [ sysv ipc ]
> - shmmni
> +- shm_rmid_forced
> - stop-a [ SPARC only ]
> - sysrq ==> Documentation/sysrq.txt
> - tainted
> @@ -475,6 +476,27 @@ kernel. This value defaults to SHMMAX.
>
> ==============================================================
>
> +shm_rmid_forced:
> +
> +Linux lets you set resource limits, including how much memory one
> +process can consume, via setrlimit(2). Unfortunately, shared memory
> +segments are allowed to exist without association with any process, and
> +thus might not be counted against any resource limits. If enabled,
> +shared memory segments are automatically destroyed when their attach
> +count becomes zero after a detach or a process termination. It will
> +also destroy segments that were created, but never attached to, on exit
> +from the process. The only use left for IPC_RMID is to immediately
> +destroy an unattached segment. Of course, this breaks the way things are
> +defined, so some applications might stop working. Note that this
> +feature will do you no good unless you also configure your resource
> +limits (in particular, RLIMIT_AS and RLIMIT_NPROC). Most systems don't
> +need this.
> +
> +Note that if you change this from 0 to 1, already created segments
> +without users and with a dead originative process will be destroyed.
> +
> +==============================================================
> +
> softlockup_thresh:
>
> This value can be used to lower the softlockup tolerance threshold. The
> diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
> index a6d1655..8a297a5 100644
> --- a/include/linux/ipc_namespace.h
> +++ b/include/linux/ipc_namespace.h
> @@ -44,6 +44,11 @@ struct ipc_namespace {
> size_t shm_ctlall;
> int shm_ctlmni;
> int shm_tot;
> + /*
> + * Defines whether IPC_RMID is forced for _all_ shm segments regardless
> + * of shmctl()
> + */
> + int shm_rmid_forced;
>
> struct notifier_block ipcns_nb;
>
> @@ -72,6 +77,7 @@ extern int register_ipcns_notifier(struct ipc_namespace *);
> extern int cond_register_ipcns_notifier(struct ipc_namespace *);
> extern void unregister_ipcns_notifier(struct ipc_namespace *);
> extern int ipcns_notify(unsigned long);
> +extern void shm_destroy_orphaned(struct ipc_namespace *ns);
> #else /* CONFIG_SYSVIPC */
> static inline int register_ipcns_notifier(struct ipc_namespace *ns)
> { return 0; }
> @@ -79,6 +85,7 @@ static inline int cond_register_ipcns_notifier(struct ipc_namespace *ns)
> { return 0; }
> static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { }
> static inline int ipcns_notify(unsigned long l) { return 0; }
> +static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {}
> #endif /* CONFIG_SYSVIPC */
>
> #ifdef CONFIG_POSIX_MQUEUE
> diff --git a/include/linux/shm.h b/include/linux/shm.h
> index eca6235..92808b8 100644
> --- a/include/linux/shm.h
> +++ b/include/linux/shm.h
> @@ -95,6 +95,9 @@ struct shmid_kernel /* private to the kernel */
> pid_t shm_cprid;
> pid_t shm_lprid;
> struct user_struct *mlock_user;
> +
> + /* The task created the shm object. NULL if the task is dead. */
> + struct task_struct *shm_creator;
> };
>
> /* shm_mode upper byte flags */
> @@ -106,6 +109,7 @@ struct shmid_kernel /* private to the kernel */
> #ifdef CONFIG_SYSVIPC
> long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr);
> extern int is_file_shm_hugepages(struct file *file);
> +extern void exit_shm(struct task_struct *task);
> #else
> static inline long do_shmat(int shmid, char __user *shmaddr,
> int shmflg, unsigned long *addr)
> @@ -116,6 +120,9 @@ static inline int is_file_shm_hugepages(struct file *file)
> {
> return 0;
> }
> +static inline void exit_shm(struct task_struct *task)
> +{
> +}
> #endif
>
> #endif /* __KERNEL__ */
> diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
> index 56410fa..00fba2b 100644
> --- a/ipc/ipc_sysctl.c
> +++ b/ipc/ipc_sysctl.c
> @@ -31,12 +31,37 @@ static int proc_ipc_dointvec(ctl_table *table, int write,
> void __user *buffer, size_t *lenp, loff_t *ppos)
> {
> struct ctl_table ipc_table;
> +
> memcpy(&ipc_table, table, sizeof(ipc_table));
> ipc_table.data = get_ipc(table);
>
> return proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
> }
>
> +static int proc_ipc_dointvec_minmax(ctl_table *table, int write,
> + void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> + struct ctl_table ipc_table;
> +
> + memcpy(&ipc_table, table, sizeof(ipc_table));
> + ipc_table.data = get_ipc(table);
> +
> + return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
> +}
> +
> +static int proc_ipc_dointvec_minmax_orphans(ctl_table *table, int write,
> + void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> + struct ipc_namespace *ns = current->nsproxy->ipc_ns;
> + int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
> +
> + if (err < 0)
> + return err;
> + if (ns->shm_rmid_forced)
> + shm_destroy_orphaned(ns);
> + return err;
> +}
> +
> static int proc_ipc_callback_dointvec(ctl_table *table, int write,
> void __user *buffer, size_t *lenp, loff_t *ppos)
> {
> @@ -125,6 +150,8 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write,
> #else
> #define proc_ipc_doulongvec_minmax NULL
> #define proc_ipc_dointvec NULL
> +#define proc_ipc_dointvec_minmax NULL
> +#define proc_ipc_dointvec_minmax_orphans NULL
> #define proc_ipc_callback_dointvec NULL
> #define proc_ipcauto_dointvec_minmax NULL
> #endif
> @@ -155,6 +182,15 @@ static struct ctl_table ipc_kern_table[] = {
> .proc_handler = proc_ipc_dointvec,
> },
> {
> + .procname = "shm_rmid_forced",
> + .data = &init_ipc_ns.shm_rmid_forced,
> + .maxlen = sizeof(init_ipc_ns.shm_rmid_forced),
> + .mode = 0644,
> + .proc_handler = proc_ipc_dointvec_minmax_orphans,
> + .extra1 = &zero,
> + .extra2 = &one,
> + },
> + {
> .procname = "msgmax",
> .data = &init_ipc_ns.msg_ctlmax,
> .maxlen = sizeof (init_ipc_ns.msg_ctlmax),
> diff --git a/ipc/shm.c b/ipc/shm.c
> index ab3385a..bf46636 100644
> --- a/ipc/shm.c
> +++ b/ipc/shm.c
> @@ -74,6 +74,7 @@ void shm_init_ns(struct ipc_namespace *ns)
> ns->shm_ctlmax = SHMMAX;
> ns->shm_ctlall = SHMALL;
> ns->shm_ctlmni = SHMMNI;
> + ns->shm_rmid_forced = 1;
Given the description in Documentation/sysctl/kernel.txt, shouldn't
this default to 0?
> ns->shm_tot = 0;
> ipc_init_ids(&shm_ids(ns));
> }
> @@ -130,6 +131,12 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
> return container_of(ipcp, struct shmid_kernel, shm_perm);
> }
>
> +static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
> +{
> + rcu_read_lock();
> + spin_lock(&ipcp->shm_perm.lock);
> +}
> +
> static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns,
> int id)
> {
> @@ -187,6 +194,23 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
> }
>
> /*
> + * shm_may_destroy - identifies whether shm segment should be destroyed now
> + *
> + * Returns true if and only if there are no active users of the segment and
> + * one of the following is true:
> + *
> + * 1) shmctl(id, IPC_RMID, NULL) was called for this shp
> + *
> + * 2) sysctl kernel.shm_rmid_forced is set to 1.
> + */
> +static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
'may' usually implies a permission check. Would this be better named
'shm_should_destroy()'?
> +{
> + return (shp->shm_nattch == 0) &&
> + (ns->shm_rmid_forced ||
> + (shp->shm_perm.mode & SHM_DEST));
> +}
> +
> +/*
> * remove the attach descriptor vma.
> * free memory for segment if it is marked destroyed.
> * The descriptor has already been removed from the current->mm->mmap list
> @@ -206,14 +230,87 @@ static void shm_close(struct vm_area_struct *vma)
> shp->shm_lprid = task_tgid_vnr(current);
> shp->shm_dtim = get_seconds();
> shp->shm_nattch--;
> - if(shp->shm_nattch == 0 &&
> - shp->shm_perm.mode & SHM_DEST)
> + if (shm_may_destroy(ns, shp))
> shm_destroy(ns, shp);
> else
> shm_unlock(shp);
> up_write(&shm_ids(ns).rw_mutex);
> }
>
> +/* Called with ns->shm_ids(ns).rw_mutex locked */
> +static int shm_try_destroy_current(int id, void *p, void *data)
> +{
> + struct ipc_namespace *ns = data;
> + struct kern_ipc_perm *ipcp = p;
> + struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
> +
> + if (shp->shm_creator != current)
> + return 0;
> +
> + /*
> + * Mark it as orphaned to destroy the segment when
> + * kernel.shm_rmid_forced is changed.
> + * It is noop if the following shm_may_destroy() returns true.
> + */
> + shp->shm_creator = NULL;
> +
> + /*
> + * Don't even try to destroy it. If shm_rmid_forced=0 and IPC_RMID
> + * is not set, it shouldn't be deleted here.
> + */
> + if (!ns->shm_rmid_forced)
> + return 0;
> +
> + if (shm_may_destroy(ns, shp)) {
This seems redundant. Would it be better to just make this
if (shp->shm_nattch == 0) {
here as we already know ns->shm_rmid_forced == 1?
> + shm_lock_by_ptr(shp);
> + shm_destroy(ns, shp);
Wish there were a clean way to document that the locks will be
released by shm_destroy().
> + }
> + return 0;
> +}
> +
> +/* Called with ns->shm_ids(ns).rw_mutex locked */
> +static int shm_try_destroy_orphaned(int id, void *p, void *data)
> +{
> + struct ipc_namespace *ns = data;
> + struct kern_ipc_perm *ipcp = p;
> + struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
> +
> + /*
> + * We want to destroy segments without users and with already
> + * exit'ed originating process.
> + *
> + * As shp->* are changed under rw_mutex, it's safe to skip shp locking.
> + */
> + if (shp->shm_creator != NULL)
> + return 0;
> +
> + if (shm_may_destroy(ns, shp)) {
> + shm_lock_by_ptr(shp);
> + shm_destroy(ns, shp);
> + }
> + return 0;
> +}
> +
> +void shm_destroy_orphaned(struct ipc_namespace *ns)
> +{
> + down_write(&shm_ids(ns).rw_mutex);
> + if (&shm_ids(ns).in_use)
> + idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
> + up_write(&shm_ids(ns).rw_mutex);
Hm, is this going to cause contention when killing a lot of tasks?
> +}
> +
> +
> +void exit_shm(struct task_struct *task)
> +{
> + struct ipc_namespace *ns = task->nsproxy->ipc_ns;
> +
> + /* Destroy all already created segments, but not mapped yet */
> + down_write(&shm_ids(ns).rw_mutex);
> + if (&shm_ids(ns).in_use)
> + idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns);
> + up_write(&shm_ids(ns).rw_mutex);
Having exit_shm() call shm_destroy_orphaned(task->nsproxy->ipc_ns) seems
more future-proof?
> +}
> +
> static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> {
> struct file *file = vma->vm_file;
> @@ -404,6 +501,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
> shp->shm_segsz = size;
> shp->shm_nattch = 0;
> shp->shm_file = file;
> + shp->shm_creator = current;
> /*
> * shmid gets reported as "inode#" in /proc/pid/maps.
> * proc-ps tools use this. Changing this will break them.
> @@ -950,8 +1048,7 @@ out_nattch:
> shp = shm_lock(ns, shmid);
> BUG_ON(IS_ERR(shp));
> shp->shm_nattch--;
> - if(shp->shm_nattch == 0 &&
> - shp->shm_perm.mode & SHM_DEST)
> + if (shm_may_destroy(ns, shp))
> shm_destroy(ns, shp);
> else
> shm_unlock(shp);
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 20a4064..816c610 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -991,6 +991,7 @@ NORET_TYPE void do_exit(long code)
> trace_sched_process_exit(tsk);
>
> exit_sem(tsk);
> + exit_shm(tsk);
> exit_files(tsk);
> exit_fs(tsk);
> check_stack_usage();
> --
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.