|
Message-ID: <20110705174200.GC6102@albatros> Date: Tue, 5 Jul 2011 21:42:00 +0400 From: Vasiliy Kulikov <segoon@...nwall.com> To: Serge Hallyn <serge.hallyn@...onical.com> Cc: akpm@...ux-foundation.org, daniel.lezcano@...e.fr, ebiederm@...ssion.com, mingo@...e.hu, oleg@...hat.com, rdunlap@...otime.net, tj@...nel.org, kernel-hardening@...ts.openwall.com Subject: Re: [PATCH] shm: handle separate PID namespaces case On Tue, Jul 05, 2011 at 10:57 -0500, Serge Hallyn wrote: > Do you have all of these patches applied in one git tree? Could > you do a quick 'git diff master..' (assuming master is Linus' tree) > and send the result (or send a git url), and I'll take a last closer > look at the patch tomorrow? These patches are against 3.0-rc2, it should be simple to rebase to -rc5 since ipc/ is not heavily changed nowadays. The patches are stored in -mm tree. The blob against -rc2 is as follows: --- Documentation/sysctl/kernel.txt | 22 ++++++++ include/linux/ipc_namespace.h | 7 +++ include/linux/shm.h | 7 +++ ipc/ipc_sysctl.c | 36 +++++++++++++ ipc/shm.c | 105 +++++++++++++++++++++++++++++++++++++-- kernel/exit.c | 1 + 6 files changed, 174 insertions(+), 4 deletions(-) --- diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 5e7cb39..4d7b866 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -62,6 +62,7 @@ show up in /proc/sys/kernel: - shmall - shmmax [ sysv ipc ] - shmmni +- shm_rmid_forced - stop-a [ SPARC only ] - sysrq ==> Documentation/sysrq.txt - tainted @@ -475,6 +476,27 @@ kernel. This value defaults to SHMMAX. ============================================================== +shm_rmid_forced: + +Linux lets you set resource limits, including how much memory one +process can consume, via setrlimit(2). Unfortunately, shared memory +segments are allowed to exist without association with any process, and +thus might not be counted against any resource limits. If enabled, +shared memory segments are automatically destroyed when their attach +count becomes zero after a detach or a process termination. It will +also destroy segments that were created, but never attached to, on exit +from the process. The only use left for IPC_RMID is to immediately +destroy an unattached segment. Of course, this breaks the way things are +defined, so some applications might stop working. Note that this +feature will do you no good unless you also configure your resource +limits (in particular, RLIMIT_AS and RLIMIT_NPROC). Most systems don't +need this. + +Note that if you change this from 0 to 1, already created segments +without users and with a dead originative process will be destroyed. + +============================================================== + softlockup_thresh: This value can be used to lower the softlockup tolerance threshold. The diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index a6d1655..8a297a5 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -44,6 +44,11 @@ struct ipc_namespace { size_t shm_ctlall; int shm_ctlmni; int shm_tot; + /* + * Defines whether IPC_RMID is forced for _all_ shm segments regardless + * of shmctl() + */ + int shm_rmid_forced; struct notifier_block ipcns_nb; @@ -72,6 +77,7 @@ extern int register_ipcns_notifier(struct ipc_namespace *); extern int cond_register_ipcns_notifier(struct ipc_namespace *); extern void unregister_ipcns_notifier(struct ipc_namespace *); extern int ipcns_notify(unsigned long); +extern void shm_destroy_orphaned(struct ipc_namespace *ns); #else /* CONFIG_SYSVIPC */ static inline int register_ipcns_notifier(struct ipc_namespace *ns) { return 0; } @@ -79,6 +85,7 @@ static inline int cond_register_ipcns_notifier(struct ipc_namespace *ns) { return 0; } static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { } static inline int ipcns_notify(unsigned long l) { return 0; } +static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {} #endif /* CONFIG_SYSVIPC */ #ifdef CONFIG_POSIX_MQUEUE diff --git a/include/linux/shm.h b/include/linux/shm.h index eca6235..92808b8 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -95,6 +95,9 @@ struct shmid_kernel /* private to the kernel */ pid_t shm_cprid; pid_t shm_lprid; struct user_struct *mlock_user; + + /* The task created the shm object. NULL if the task is dead. */ + struct task_struct *shm_creator; }; /* shm_mode upper byte flags */ @@ -106,6 +109,7 @@ struct shmid_kernel /* private to the kernel */ #ifdef CONFIG_SYSVIPC long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr); extern int is_file_shm_hugepages(struct file *file); +extern void exit_shm(struct task_struct *task); #else static inline long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr) @@ -116,6 +120,9 @@ static inline int is_file_shm_hugepages(struct file *file) { return 0; } +static inline void exit_shm(struct task_struct *task) +{ +} #endif #endif /* __KERNEL__ */ diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 56410fa..00fba2b 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -31,12 +31,37 @@ static int proc_ipc_dointvec(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; + memcpy(&ipc_table, table, sizeof(ipc_table)); ipc_table.data = get_ipc(table); return proc_dointvec(&ipc_table, write, buffer, lenp, ppos); } +static int proc_ipc_dointvec_minmax(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table ipc_table; + + memcpy(&ipc_table, table, sizeof(ipc_table)); + ipc_table.data = get_ipc(table); + + return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); +} + +static int proc_ipc_dointvec_minmax_orphans(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ipc_namespace *ns = current->nsproxy->ipc_ns; + int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (err < 0) + return err; + if (ns->shm_rmid_forced) + shm_destroy_orphaned(ns); + return err; +} + static int proc_ipc_callback_dointvec(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -125,6 +150,8 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write, #else #define proc_ipc_doulongvec_minmax NULL #define proc_ipc_dointvec NULL +#define proc_ipc_dointvec_minmax NULL +#define proc_ipc_dointvec_minmax_orphans NULL #define proc_ipc_callback_dointvec NULL #define proc_ipcauto_dointvec_minmax NULL #endif @@ -155,6 +182,15 @@ static struct ctl_table ipc_kern_table[] = { .proc_handler = proc_ipc_dointvec, }, { + .procname = "shm_rmid_forced", + .data = &init_ipc_ns.shm_rmid_forced, + .maxlen = sizeof(init_ipc_ns.shm_rmid_forced), + .mode = 0644, + .proc_handler = proc_ipc_dointvec_minmax_orphans, + .extra1 = &zero, + .extra2 = &one, + }, + { .procname = "msgmax", .data = &init_ipc_ns.msg_ctlmax, .maxlen = sizeof (init_ipc_ns.msg_ctlmax), diff --git a/ipc/shm.c b/ipc/shm.c index ab3385a..bf46636 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -74,6 +74,7 @@ void shm_init_ns(struct ipc_namespace *ns) ns->shm_ctlmax = SHMMAX; ns->shm_ctlall = SHMALL; ns->shm_ctlmni = SHMMNI; + ns->shm_rmid_forced = 1; ns->shm_tot = 0; ipc_init_ids(&shm_ids(ns)); } @@ -130,6 +131,12 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) return container_of(ipcp, struct shmid_kernel, shm_perm); } +static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) +{ + rcu_read_lock(); + spin_lock(&ipcp->shm_perm.lock); +} + static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, int id) { @@ -187,6 +194,23 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) } /* + * shm_may_destroy - identifies whether shm segment should be destroyed now + * + * Returns true if and only if there are no active users of the segment and + * one of the following is true: + * + * 1) shmctl(id, IPC_RMID, NULL) was called for this shp + * + * 2) sysctl kernel.shm_rmid_forced is set to 1. + */ +static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) +{ + return (shp->shm_nattch == 0) && + (ns->shm_rmid_forced || + (shp->shm_perm.mode & SHM_DEST)); +} + +/* * remove the attach descriptor vma. * free memory for segment if it is marked destroyed. * The descriptor has already been removed from the current->mm->mmap list @@ -206,14 +230,87 @@ static void shm_close(struct vm_area_struct *vma) shp->shm_lprid = task_tgid_vnr(current); shp->shm_dtim = get_seconds(); shp->shm_nattch--; - if(shp->shm_nattch == 0 && - shp->shm_perm.mode & SHM_DEST) + if (shm_may_destroy(ns, shp)) shm_destroy(ns, shp); else shm_unlock(shp); up_write(&shm_ids(ns).rw_mutex); } +/* Called with ns->shm_ids(ns).rw_mutex locked */ +static int shm_try_destroy_current(int id, void *p, void *data) +{ + struct ipc_namespace *ns = data; + struct kern_ipc_perm *ipcp = p; + struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); + + if (shp->shm_creator != current) + return 0; + + /* + * Mark it as orphaned to destroy the segment when + * kernel.shm_rmid_forced is changed. + * It is noop if the following shm_may_destroy() returns true. + */ + shp->shm_creator = NULL; + + /* + * Don't even try to destroy it. If shm_rmid_forced=0 and IPC_RMID + * is not set, it shouldn't be deleted here. + */ + if (!ns->shm_rmid_forced) + return 0; + + if (shm_may_destroy(ns, shp)) { + shm_lock_by_ptr(shp); + shm_destroy(ns, shp); + } + return 0; +} + +/* Called with ns->shm_ids(ns).rw_mutex locked */ +static int shm_try_destroy_orphaned(int id, void *p, void *data) +{ + struct ipc_namespace *ns = data; + struct kern_ipc_perm *ipcp = p; + struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); + + /* + * We want to destroy segments without users and with already + * exit'ed originating process. + * + * As shp->* are changed under rw_mutex, it's safe to skip shp locking. + */ + if (shp->shm_creator != NULL) + return 0; + + if (shm_may_destroy(ns, shp)) { + shm_lock_by_ptr(shp); + shm_destroy(ns, shp); + } + return 0; +} + +void shm_destroy_orphaned(struct ipc_namespace *ns) +{ + down_write(&shm_ids(ns).rw_mutex); + if (&shm_ids(ns).in_use) + idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); + up_write(&shm_ids(ns).rw_mutex); +} + + +void exit_shm(struct task_struct *task) +{ + struct ipc_namespace *ns = task->nsproxy->ipc_ns; + + /* Destroy all already created segments, but not mapped yet */ + down_write(&shm_ids(ns).rw_mutex); + if (&shm_ids(ns).in_use) + idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); + up_write(&shm_ids(ns).rw_mutex); +} + static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; @@ -404,6 +501,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_segsz = size; shp->shm_nattch = 0; shp->shm_file = file; + shp->shm_creator = current; /* * shmid gets reported as "inode#" in /proc/pid/maps. * proc-ps tools use this. Changing this will break them. @@ -950,8 +1048,7 @@ out_nattch: shp = shm_lock(ns, shmid); BUG_ON(IS_ERR(shp)); shp->shm_nattch--; - if(shp->shm_nattch == 0 && - shp->shm_perm.mode & SHM_DEST) + if (shm_may_destroy(ns, shp)) shm_destroy(ns, shp); else shm_unlock(shp); diff --git a/kernel/exit.c b/kernel/exit.c index 20a4064..816c610 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -991,6 +991,7 @@ NORET_TYPE void do_exit(long code) trace_sched_process_exit(tsk); exit_sem(tsk); + exit_shm(tsk); exit_files(tsk); exit_fs(tsk); check_stack_usage(); --
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.