Post by Zach BrownI don't think this patch is the way to go at all. It imposes an
allocation and vma walking overhead for the vast majority of IOs that
aren't interested. It doesn't look like it will get a consistent
ordering when multiple file systems are concerned. It doesn't record
the ranges of the mappings involved so Lustre can't properly use its
range locks. And finally, it doesn't prohibit mapping operations for
the duration of the IO -- the whole reason we ended up in this thread in
the first place :)
Hmm. So how do you propose we get rid of the mandatory vma walk? I was
thinking of making iolock a config option so when you don't have any
filesystems that need it, it can go away. I have also optimized the
extra allocation away when there are no mmap'd files that require
locking.
As for the rest of your comments, I heartily agree with them and
hopefully some interested party will take care of them :-).
Pekka
Index: 2.6-mm/fs/iolock.c
===================================================================
--- /dev/null
+++ 2.6-mm/fs/iolock.c
@@ -0,0 +1,183 @@
+/*
+ * I/O locking for memory regions. Used by filesystems that need special
+ * locking for mmap'd files.
+ */
+
+#include <linux/iolock.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+/*
+ * TODO:
+ *
+ * - Deadlock when two nodes acquire iolocks in reverse order for two
+ * different filesystems. Solution: use rbtree in iolock_chain so we
+ * can walk iolocks in order. XXX: what order is stable for two nodes
+ * that don't know about each other?
+ */
+
+/*
+ * I/O lock contains all files that participate in locking a memory region
+ * in an address_space.
+ */
+struct iolock {
+	struct address_space *mapping;	/* the mapping all ->files share */
+	unsigned long nr_files;		/* entries used in ->files */
+	struct file **files;		/* files whose VMAs overlap the region */
+	struct list_head chain;		/* link in iolock_chain->list */
+};
+
+/* One iolock per distinct address_space touched by the I/O region. */
+struct iolock_chain {
+	struct list_head list;
+};
+
+/*
+ * Allocate an iolock able to hold up to @max_files file pointers.
+ * Returns NULL on allocation failure.  Caller owns the result and
+ * frees it (both ->files and the struct) via iolock_release().
+ */
+static struct iolock *iolock_new(unsigned long max_files)
+{
+	struct iolock *iolock;
+
+	iolock = kzalloc(sizeof(*iolock), GFP_KERNEL);
+	if (!iolock)
+		return NULL;
+
+	iolock->files = kcalloc(max_files, sizeof(struct file *), GFP_KERNEL);
+	if (!iolock->files) {
+		kfree(iolock);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&iolock->chain);
+	return iolock;
+}
+
+/* Allocate an empty iolock chain; NULL on allocation failure. */
+static struct iolock_chain *iolock_chain_new(void)
+{
+	struct iolock_chain *chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+
+	if (!chain)
+		return NULL;
+
+	INIT_LIST_HEAD(&chain->list);
+	return chain;
+}
+
+/*
+ * Call ->iolock_acquire for every mapping on the chain.
+ * Returns 0 when all acquires succeed, or the first acquire error;
+ * on error the caller is expected to run iolock_release() on the
+ * chain to drop whatever was acquired before the failure.
+ */
+static int iolock_chain_acquire(struct iolock_chain *chain)
+{
+	struct iolock *iolock;
+	int err = 0;
+
+	list_for_each_entry(iolock, &chain->list, chain) {
+		if (iolock->mapping->a_ops->iolock_acquire) {
+			err = iolock->mapping->a_ops->iolock_acquire(
+					iolock->files, iolock->nr_files);
+			/* Bug fix: the original tested !err and so bailed
+			   out on success while continuing past failures. */
+			if (err)
+				break;
+		}
+	}
+	return err;
+}
+
+/*
+ * Find the iolock on @chain that covers @mapping, or NULL if the
+ * mapping has not been seen yet.
+ */
+static struct iolock *iolock_lookup(struct iolock_chain *chain,
+				    struct address_space *mapping)
+{
+	struct iolock *iolock;
+
+	list_for_each_entry(iolock, &chain->list, chain) {
+		if (iolock->mapping == mapping)
+			return iolock;
+	}
+	return NULL;
+}
+
+/**
+ * iolock_region - Lock memory region for file I/O.
+ * @buf: the buffer we want to lock.
+ * @size: size of the buffer.
+ *
+ * Walks the VMAs overlapping [buf, buf+size) and builds one iolock per
+ * address_space whose a_ops provide iolock_acquire/iolock_release, then
+ * acquires them all.
+ *
+ * Returns a pointer to the iolock_chain, or NULL to denote an empty chain
+ * (nothing in the region needed locking); otherwise returns ERR_PTR().
+ */
+struct iolock_chain *iolock_region(const char __user *buf, size_t size)
+{
+	struct iolock_chain *ret = NULL;
+	int err = -ENOMEM;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long start = (unsigned long)buf;
+	unsigned long end = start + size;
+	int max_files;
+
+	down_read(&mm->mmap_sem);
+	max_files = mm->map_count;
+
+	for (vma = find_vma(mm, start); vma; vma = vma->vm_next) {
+		struct file *file;
+		struct address_space *mapping;
+		struct iolock *iolock;
+
+		if (end <= vma->vm_start)
+			break;
+
+		file = vma->vm_file;
+		if (!file)
+			continue;
+
+		mapping = file->f_mapping;
+		if (!mapping->a_ops->iolock_acquire ||
+		    !mapping->a_ops->iolock_release)
+			continue;
+
+		/* Allocate chain lazily to avoid initialization overhead
+		   when we don't have any files that require iolock. */
+		if (!ret) {
+			ret = iolock_chain_new();
+			if (!ret)
+				goto error;
+		}
+
+		iolock = iolock_lookup(ret, mapping);
+		if (!iolock) {
+			iolock = iolock_new(max_files);
+			if (!iolock)
+				goto error;
+			iolock->mapping = mapping;
+			/* Bug fix: link the iolock only when it is newly
+			   created; the original list_add()ed on every
+			   iteration, re-inserting nodes already on the
+			   list and corrupting it. */
+			list_add(&iolock->chain, &ret->list);
+		}
+
+		iolock->files[iolock->nr_files++] = file;
+	}
+
+	/* Bug fix: nothing needed locking -- return the empty (NULL)
+	   chain instead of dereferencing it in iolock_chain_acquire(). */
+	if (!ret)
+		goto out;
+
+	/* Bug fix: the original tested !err, turning success into the
+	   error path and letting failures fall through to the caller. */
+	err = iolock_chain_acquire(ret);
+	if (err)
+		goto error;
+
+out:
+	up_read(&mm->mmap_sem);
+	return ret;
+
+error:
+	iolock_release(ret);
+	ret = ERR_PTR(err);
+	goto out;
+}
+
+/**
+ * iolock_release - Release file I/O locks for a memory region.
+ * @chain: The I/O lock chain to release. Passing NULL means no-op.
+ *
+ * Drops every lock on the chain and frees all iolocks and the chain
+ * itself.  Safe to call on a partially-built chain (entries whose
+ * acquire never ran are still released; the a_ops check guards that
+ * only mappings that advertised iolock ops get the release call).
+ */
+void iolock_release(struct iolock_chain *chain)
+{
+	struct iolock *iolock, *next;
+
+	if (!chain)
+		return;
+
+	/* Bug fix: each node is kfree()d inside the walk, so the plain
+	   list_for_each_entry would read the next pointer from freed
+	   memory -- use the _safe variant. */
+	list_for_each_entry_safe(iolock, next, &chain->list, chain) {
+		struct address_space *mapping = iolock->mapping;
+		if (mapping && mapping->a_ops->iolock_release)
+			mapping->a_ops->iolock_release(iolock->files,
+						       iolock->nr_files);
+		kfree(iolock->files);
+		kfree(iolock);
+	}
+	kfree(chain);
+}
Index: 2.6-mm/fs/read_write.c
===================================================================
--- 2.6-mm.orig/fs/read_write.c
+++ 2.6-mm/fs/read_write.c
@@ -14,6 +14,7 @@
#include <linux/security.h>
#include <linux/module.h>
#include <linux/syscalls.h>
+#include <linux/iolock.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -247,14 +248,21 @@ ssize_t vfs_read(struct file *file, char
if (!ret) {
ret = security_file_permission (file, MAY_READ);
if (!ret) {
+ struct iolock_chain * lock = iolock_region(buf, count);
+ if (IS_ERR(lock)) {
+ ret = PTR_ERR(lock);
+ goto out;
+ }
if (file->f_op->read)
ret = file->f_op->read(file, buf, count, pos);
else
ret = do_sync_read(file, buf, count, pos);
+ iolock_release(lock);
if (ret > 0) {
fsnotify_access(file->f_dentry);
current->rchar += ret;
}
+ out:
current->syscr++;
}
}
@@ -298,14 +306,21 @@ ssize_t vfs_write(struct file *file, con
if (!ret) {
ret = security_file_permission (file, MAY_WRITE);
if (!ret) {
+ struct iolock_chain * lock = iolock_region(buf, count);
+ if (IS_ERR(lock)) {
+ ret = PTR_ERR(lock);
+ goto out;
+ }
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
else
ret = do_sync_write(file, buf, count, pos);
+ iolock_release(lock);
if (ret > 0) {
fsnotify_modify(file->f_dentry);
current->wchar += ret;
}
+ out:
current->syscw++;
}
}
Index: 2.6-mm/include/linux/iolock.h
===================================================================
--- /dev/null
+++ 2.6-mm/include/linux/iolock.h
@@ -0,0 +1,11 @@
+#ifndef __LINUX_IOLOCK_H
+#define __LINUX_IOLOCK_H
+
+#include <linux/kernel.h>
+
+/* Opaque chain of per-address_space I/O locks; see fs/iolock.c. */
+struct iolock_chain;
+
+/* Lock the memory region [buf, buf+size) for file I/O.  Returns the
+   chain to pass to iolock_release(), NULL for an empty chain, or
+   ERR_PTR() on failure. */
+extern struct iolock_chain *iolock_region(const char __user *, size_t);
+/* Release a chain returned by iolock_region(); NULL is a no-op. */
+extern void iolock_release(struct iolock_chain *);
+
+#endif
Index: 2.6-mm/fs/Makefile
===================================================================
--- 2.6-mm.orig/fs/Makefile
+++ 2.6-mm/fs/Makefile
@@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.
ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
- ioprio.o
+ ioprio.o iolock.o
obj-$(CONFIG_INOTIFY) += inotify.o
obj-$(CONFIG_EPOLL) += eventpoll.o
Index: 2.6-mm/include/linux/fs.h
===================================================================
--- 2.6-mm.orig/include/linux/fs.h
+++ 2.6-mm/include/linux/fs.h
@@ -334,6 +334,8 @@ struct address_space_operations {
loff_t offset, unsigned long nr_segs);
struct page* (*get_xip_page)(struct address_space *, sector_t,
int);
+ int (*iolock_acquire)(struct file **, unsigned long);
+ void (*iolock_release)(struct file **, unsigned long);
};
struct backing_dev_info;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/