summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/buffer.c73
-rw-r--r--fs/ceph/inode.c2
-rw-r--r--fs/ext4/ialloc.c3
-rw-r--r--fs/ext4/mballoc.c3
-rw-r--r--fs/file.c2
-rw-r--r--fs/jbd2/revoke.c15
-rw-r--r--fs/namespace.c69
-rw-r--r--fs/ocfs2/journal.c2
-rw-r--r--fs/splice.c2
-rw-r--r--fs/xattr.c4
10 files changed, 109 insertions, 66 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index c7abb4a029dc..7be23ff20b27 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -176,18 +176,8 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
}
EXPORT_SYMBOL(end_buffer_write_sync);
-/*
- * Various filesystems appear to want __find_get_block to be non-blocking.
- * But it's the page lock which protects the buffers. To get around this,
- * we get exclusion from try_to_free_buffers with the blockdev mapping's
- * i_private_lock.
- *
- * Hack idea: for the blockdev mapping, i_private_lock contention
- * may be quite high. This code could TryLock the page, and if that
- * succeeds, there is no need to take i_private_lock.
- */
static struct buffer_head *
-__find_get_block_slow(struct block_device *bdev, sector_t block)
+__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)
{
struct address_space *bd_mapping = bdev->bd_mapping;
const int blkbits = bd_mapping->host->i_blkbits;
@@ -204,10 +194,28 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
if (IS_ERR(folio))
goto out;
- spin_lock(&bd_mapping->i_private_lock);
+ /*
+ * Folio lock protects the buffers. Callers that cannot block
+ * will fallback to serializing vs try_to_free_buffers() via
+ * the i_private_lock.
+ */
+ if (atomic)
+ spin_lock(&bd_mapping->i_private_lock);
+ else
+ folio_lock(folio);
+
head = folio_buffers(folio);
if (!head)
goto out_unlock;
+ /*
+ * Upon a noref migration, the folio lock serializes here;
+ * otherwise bail.
+ */
+ if (test_bit_acquire(BH_Migrate, &head->b_state)) {
+ WARN_ON(!atomic);
+ goto out_unlock;
+ }
+
bh = head;
do {
if (!buffer_mapped(bh))
@@ -236,7 +244,10 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
1 << blkbits);
}
out_unlock:
- spin_unlock(&bd_mapping->i_private_lock);
+ if (atomic)
+ spin_unlock(&bd_mapping->i_private_lock);
+ else
+ folio_unlock(folio);
folio_put(folio);
out:
return ret;
@@ -656,7 +667,9 @@ EXPORT_SYMBOL(generic_buffers_fsync);
void write_boundary_block(struct block_device *bdev,
sector_t bblock, unsigned blocksize)
{
- struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
+ struct buffer_head *bh;
+
+ bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
if (bh) {
if (buffer_dirty(bh))
write_dirty_buffer(bh, 0);
@@ -1386,16 +1399,18 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
/*
* Perform a pagecache lookup for the matching buffer. If it's there, refresh
* it in the LRU and mark it as accessed. If it is not present then return
- * NULL
+ * NULL. Atomic context callers may also return NULL if the buffer is being
+ * migrated; similarly the page is not marked accessed either.
*/
-struct buffer_head *
-__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+static struct buffer_head *
+find_get_block_common(struct block_device *bdev, sector_t block,
+ unsigned size, bool atomic)
{
struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
if (bh == NULL) {
/* __find_get_block_slow will mark the page accessed */
- bh = __find_get_block_slow(bdev, block);
+ bh = __find_get_block_slow(bdev, block, atomic);
if (bh)
bh_lru_install(bh);
} else
@@ -1403,8 +1418,23 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
return bh;
}
+
+struct buffer_head *
+__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+{
+ return find_get_block_common(bdev, block, size, true);
+}
EXPORT_SYMBOL(__find_get_block);
+/* same as __find_get_block() but allows sleeping contexts */
+struct buffer_head *
+__find_get_block_nonatomic(struct block_device *bdev, sector_t block,
+ unsigned size)
+{
+ return find_get_block_common(bdev, block, size, false);
+}
+EXPORT_SYMBOL(__find_get_block_nonatomic);
+
/**
* bdev_getblk - Get a buffer_head in a block device's buffer cache.
* @bdev: The block device.
@@ -1422,7 +1452,12 @@ EXPORT_SYMBOL(__find_get_block);
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
- struct buffer_head *bh = __find_get_block(bdev, block, size);
+ struct buffer_head *bh;
+
+ if (gfpflags_allow_blocking(gfp))
+ bh = __find_get_block_nonatomic(bdev, block, size);
+ else
+ bh = __find_get_block(bdev, block, size);
might_alloc(gfp);
if (bh)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 6ac2bd555e86..06cd2963e41e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2367,7 +2367,7 @@ static int fill_fscrypt_truncate(struct inode *inode,
/* Try to writeback the dirty pagecaches */
if (issued & (CEPH_CAP_FILE_BUFFER)) {
- loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1;
+ loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1;
ret = filemap_write_and_wait_range(inode->i_mapping,
orig_pos, lend);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 38bc8d74f4cc..e7ecc7c8a729 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -691,7 +691,8 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
if (!bh || !buffer_uptodate(bh))
/*
* If the block is not in the buffer cache, then it
- * must have been written out.
+ * must have been written out, or, most unlikely, is
+ * being migrated - false failure should be OK here.
*/
goto out;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f88424c28194..1e98c5be4e0a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -6642,7 +6642,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
for (i = 0; i < count; i++) {
cond_resched();
if (is_metadata)
- bh = sb_find_get_block(inode->i_sb, block + i);
+ bh = sb_find_get_block_nonatomic(inode->i_sb,
+ block + i);
ext4_forget(handle, is_metadata, inode, bh, block + i);
}
}
diff --git a/fs/file.c b/fs/file.c
index dc3f7e120e3e..3a3146664cf3 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -26,7 +26,7 @@
#include "internal.h"
-bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
+static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
{
/*
* If the reference count was already in the dead zone, then this
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 0cf0fddbee81..1467f6790747 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -345,7 +345,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
bh = bh_in;
if (!bh) {
- bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
+ bh = __find_get_block_nonatomic(bdev, blocknr,
+ journal->j_blocksize);
if (bh)
BUFFER_TRACE(bh, "found on hash");
}
@@ -355,7 +356,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
/* If there is a different buffer_head lying around in
* memory anywhere... */
- bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
+ bh2 = __find_get_block_nonatomic(bdev, blocknr,
+ journal->j_blocksize);
if (bh2) {
/* ... and it has RevokeValid status... */
if (bh2 != bh && buffer_revokevalid(bh2))
@@ -464,7 +466,8 @@ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
* state machine will get very upset later on. */
if (need_cancel) {
struct buffer_head *bh2;
- bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
+ bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr,
+ bh->b_size);
if (bh2) {
if (bh2 != bh)
clear_buffer_revoked(bh2);
@@ -492,9 +495,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal)
struct jbd2_revoke_record_s *record;
struct buffer_head *bh;
record = (struct jbd2_revoke_record_s *)list_entry;
- bh = __find_get_block(journal->j_fs_dev,
- record->blocknr,
- journal->j_blocksize);
+ bh = __find_get_block_nonatomic(journal->j_fs_dev,
+ record->blocknr,
+ journal->j_blocksize);
if (bh) {
clear_buffer_revoked(bh);
__brelse(bh);
diff --git a/fs/namespace.c b/fs/namespace.c
index d9ca80dcc544..98a5cd756e9a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2826,56 +2826,62 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
struct vfsmount *mnt = path->mnt;
struct dentry *dentry;
struct mountpoint *mp = ERR_PTR(-ENOENT);
+ struct path under = {};
for (;;) {
- struct mount *m;
+ struct mount *m = real_mount(mnt);
if (beneath) {
- m = real_mount(mnt);
+ path_put(&under);
read_seqlock_excl(&mount_lock);
- dentry = dget(m->mnt_mountpoint);
+ under.mnt = mntget(&m->mnt_parent->mnt);
+ under.dentry = dget(m->mnt_mountpoint);
read_sequnlock_excl(&mount_lock);
+ dentry = under.dentry;
} else {
dentry = path->dentry;
}
inode_lock(dentry->d_inode);
- if (unlikely(cant_mount(dentry))) {
- inode_unlock(dentry->d_inode);
- goto out;
- }
-
namespace_lock();
- if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
+ if (unlikely(cant_mount(dentry) || !is_mounted(mnt)))
+ break; // not to be mounted on
+
+ if (beneath && unlikely(m->mnt_mountpoint != dentry ||
+ &m->mnt_parent->mnt != under.mnt)) {
namespace_unlock();
inode_unlock(dentry->d_inode);
- goto out;
+ continue; // got moved
}
mnt = lookup_mnt(path);
- if (likely(!mnt))
+ if (unlikely(mnt)) {
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
+ path_put(path);
+ path->mnt = mnt;
+ path->dentry = dget(mnt->mnt_root);
+ continue; // got overmounted
+ }
+ mp = get_mountpoint(dentry);
+ if (IS_ERR(mp))
break;
-
- namespace_unlock();
- inode_unlock(dentry->d_inode);
- if (beneath)
- dput(dentry);
- path_put(path);
- path->mnt = mnt;
- path->dentry = dget(mnt->mnt_root);
- }
-
- mp = get_mountpoint(dentry);
- if (IS_ERR(mp)) {
- namespace_unlock();
- inode_unlock(dentry->d_inode);
+ if (beneath) {
+ /*
+ * @under duplicates the references that will stay
+ * at least until namespace_unlock(), so the path_put()
+ * below is safe (and OK to do under namespace_lock -
+ * we are not dropping the final references here).
+ */
+ path_put(&under);
+ }
+ return mp;
}
-
-out:
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
if (beneath)
- dput(dentry);
-
+ path_put(&under);
return mp;
}
@@ -2886,14 +2892,11 @@ static inline struct mountpoint *lock_mount(struct path *path)
static void unlock_mount(struct mountpoint *where)
{
- struct dentry *dentry = where->m_dentry;
-
+ inode_unlock(where->m_dentry->d_inode);
read_seqlock_excl(&mount_lock);
put_mountpoint(where);
read_sequnlock_excl(&mount_lock);
-
namespace_unlock();
- inode_unlock(dentry->d_inode);
}
static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f1b4b3e611cb..c7a9729dc9d0 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1249,7 +1249,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
}
for (i = 0; i < p_blocks; i++, p_blkno++) {
- bh = __find_get_block(osb->sb->s_bdev, p_blkno,
+ bh = __find_get_block_nonatomic(osb->sb->s_bdev, p_blkno,
osb->sb->s_blocksize);
/* block not cached. */
if (!bh)
diff --git a/fs/splice.c b/fs/splice.c
index 90d464241f15..4d6df083e0c0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -45,7 +45,7 @@
* here if set to avoid blocking other users of this pipe if splice is
* being done on it.
*/
-static noinline void noinline pipe_clear_nowait(struct file *file)
+static noinline void pipe_clear_nowait(struct file *file)
{
fmode_t fmode = READ_ONCE(file->f_mode);
diff --git a/fs/xattr.c b/fs/xattr.c
index 02bee149ad96..fabb2a04501e 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -703,7 +703,7 @@ static int path_setxattrat(int dfd, const char __user *pathname,
return error;
filename = getname_maybe_null(pathname, at_flags);
- if (!filename) {
+ if (!filename && dfd >= 0) {
CLASS(fd, f)(dfd);
if (fd_empty(f))
error = -EBADF;
@@ -847,7 +847,7 @@ static ssize_t path_getxattrat(int dfd, const char __user *pathname,
return error;
filename = getname_maybe_null(pathname, at_flags);
- if (!filename) {
+ if (!filename && dfd >= 0) {
CLASS(fd, f)(dfd);
if (fd_empty(f))
return -EBADF;