summaryrefslogtreecommitdiffstats
path: root/fs/read_write.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-03-03 09:36:58 -0800
committerAl Viro <viro@zeniv.linux.org.uk>2014-03-10 11:44:41 -0400
commit9c225f2655e36a470c4f58dbbc99244c5fc7f2d4 (patch)
tree7cb89dbc82ee1b533ff2d097fed6a4248374bd4b /fs/read_write.c
parent1b56e98990bcdbb20b9fab163654b9315bf158e8 (diff)
downloadlinux-9c225f2655e36a470c4f58dbbc99244c5fc7f2d4.tar.gz
linux-9c225f2655e36a470c4f58dbbc99244c5fc7f2d4.tar.bz2
linux-9c225f2655e36a470c4f58dbbc99244c5fc7f2d4.zip
vfs: atomic f_pos accesses as per POSIX
Our write() system call has always been atomic in the sense that you get the expected thread-safe contiguous write, but we haven't actually guaranteed that concurrent writes are serialized wrt f_pos accesses, so threads (or processes) that share a file descriptor and use "write()" concurrently would quite likely overwrite each others data. This violates POSIX.1-2008/SUSv4 Section XSI 2.9.7 that says: "2.9.7 Thread Interactions with Regular File Operations All of the following functions shall be atomic with respect to each other in the effects specified in POSIX.1-2008 when they operate on regular files or symbolic links: [...]" and one of the effects is the file position update. This unprotected file position behavior is not new behavior, and nobody has ever cared. Until now. Yongzhi Pan reported unexpected behavior to Michael Kerrisk that was due to this. This resolves the issue with a f_pos-specific lock that is taken by read/write/lseek on file descriptors that may be shared across threads or processes. Reported-by: Yongzhi Pan <panyongzhi@gmail.com> Reported-by: Michael Kerrisk <mtk.manpages@gmail.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'fs/read_write.c')
-rw-r--r--fs/read_write.c54
1 files changed, 40 insertions, 14 deletions
diff --git a/fs/read_write.c b/fs/read_write.c
index edc5746a902a..932bb3414a96 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -264,10 +264,36 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
}
EXPORT_SYMBOL(vfs_llseek);
+/*
+ * We only lock f_pos if we have threads or if the file might be
+ * shared with another process. In both cases we'll have an elevated
+ * file count (done either by fdget() or by fork()).
+ */
+static inline struct fd fdget_pos(int fd)
+{
+ struct fd f = fdget(fd);
+ struct file *file = f.file;
+
+ if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
+ if (file_count(file) > 1) {
+ f.flags |= FDPUT_POS_UNLOCK;
+ mutex_lock(&file->f_pos_lock);
+ }
+ }
+ return f;
+}
+
+static inline void fdput_pos(struct fd f)
+{
+ if (f.flags & FDPUT_POS_UNLOCK)
+ mutex_unlock(&f.file->f_pos_lock);
+ fdput(f);
+}
+
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
off_t retval;
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
if (!f.file)
return -EBADF;
@@ -278,7 +304,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
if (res != (loff_t)retval)
retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
}
- fdput(f);
+ fdput_pos(f);
return retval;
}
@@ -498,7 +524,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
@@ -506,7 +532,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
ret = vfs_read(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}
return ret;
}
@@ -514,7 +540,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
@@ -522,7 +548,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
ret = vfs_write(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}
return ret;
@@ -797,7 +823,7 @@ EXPORT_SYMBOL(vfs_writev);
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
unsigned long, vlen)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
@@ -805,7 +831,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
ret = vfs_readv(f.file, vec, vlen, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}
if (ret > 0)
@@ -817,7 +843,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
unsigned long, vlen)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
@@ -825,7 +851,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
ret = vfs_writev(f.file, vec, vlen, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}
if (ret > 0)
@@ -968,7 +994,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
const struct compat_iovec __user *,vec,
compat_ulong_t, vlen)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret;
loff_t pos;
@@ -978,7 +1004,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
ret = compat_readv(f.file, vec, vlen, &pos);
if (ret >= 0)
f.file->f_pos = pos;
- fdput(f);
+ fdput_pos(f);
return ret;
}
@@ -1035,7 +1061,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
const struct compat_iovec __user *, vec,
compat_ulong_t, vlen)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret;
loff_t pos;
@@ -1045,7 +1071,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
ret = compat_writev(f.file, vec, vlen, &pos);
if (ret >= 0)
f.file->f_pos = pos;
- fdput(f);
+ fdput_pos(f);
return ret;
}