summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/afs/dynroot.c4
-rw-r--r--fs/afs/fs_probe.c2
-rw-r--r--fs/afs/server.c2
-rw-r--r--fs/bcachefs/Kconfig7
-rw-r--r--fs/bcachefs/alloc_foreground.c2
-rw-r--r--fs/bcachefs/alloc_foreground.h4
-rw-r--r--fs/bcachefs/bcachefs.h6
-rw-r--r--fs/bcachefs/bcachefs_format.h81
-rw-r--r--fs/bcachefs/bkey_methods.c24
-rw-r--r--fs/bcachefs/btree_iter.c7
-rw-r--r--fs/bcachefs/btree_journal_iter.c5
-rw-r--r--fs/bcachefs/btree_node_scan.c6
-rw-r--r--fs/bcachefs/btree_update_interior.c2
-rw-r--r--fs/bcachefs/btree_write_buffer.c8
-rw-r--r--fs/bcachefs/buckets.c3
-rw-r--r--fs/bcachefs/buckets.h5
-rw-r--r--fs/bcachefs/checksum.c247
-rw-r--r--fs/bcachefs/checksum.h3
-rw-r--r--fs/bcachefs/clock.c2
-rw-r--r--fs/bcachefs/data_update.c2
-rw-r--r--fs/bcachefs/dirent.c20
-rw-r--r--fs/bcachefs/dirent.h15
-rw-r--r--fs/bcachefs/errcode.h2
-rw-r--r--fs/bcachefs/error.c17
-rw-r--r--fs/bcachefs/error.h1
-rw-r--r--fs/bcachefs/extents.c2
-rw-r--r--fs/bcachefs/fs-io-buffered.c17
-rw-r--r--fs/bcachefs/fs-ioctl.c217
-rw-r--r--fs/bcachefs/fs-ioctl.h75
-rw-r--r--fs/bcachefs/fs.c469
-rw-r--r--fs/bcachefs/fsck.c44
-rw-r--r--fs/bcachefs/inode.h8
-rw-r--r--fs/bcachefs/inode_format.h9
-rw-r--r--fs/bcachefs/io_read.c27
-rw-r--r--fs/bcachefs/journal.c36
-rw-r--r--fs/bcachefs/journal.h7
-rw-r--r--fs/bcachefs/journal_io.c2
-rw-r--r--fs/bcachefs/journal_reclaim.c5
-rw-r--r--fs/bcachefs/movinggc.c7
-rw-r--r--fs/bcachefs/movinggc.h9
-rw-r--r--fs/bcachefs/namei.c4
-rw-r--r--fs/bcachefs/opts.h5
-rw-r--r--fs/bcachefs/rebalance.c11
-rw-r--r--fs/bcachefs/rebalance.h2
-rw-r--r--fs/bcachefs/recovery.c19
-rw-r--r--fs/bcachefs/recovery_passes.c70
-rw-r--r--fs/bcachefs/sb-errors_format.h4
-rw-r--r--fs/bcachefs/snapshot.c2
-rw-r--r--fs/bcachefs/str_hash.h5
-rw-r--r--fs/bcachefs/super-io.c23
-rw-r--r--fs/bcachefs/super.c169
-rw-r--r--fs/bcachefs/sysfs.c7
-rw-r--r--fs/bcachefs/tests.c4
-rw-r--r--fs/bcachefs/util.h38
-rw-r--r--fs/btrfs/Kconfig2
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/file.c9
-rw-r--r--fs/btrfs/ioctl.c2
-rw-r--r--fs/btrfs/relocation.c2
-rw-r--r--fs/btrfs/subpage.c4
-rw-r--r--fs/btrfs/super.c3
-rw-r--r--fs/btrfs/tree-checker.c2
-rw-r--r--fs/btrfs/zoned.c19
-rw-r--r--fs/btrfs/zstd.c2
-rw-r--r--fs/buffer.c73
-rw-r--r--fs/cachefiles/key.c2
-rw-r--r--fs/ceph/Kconfig2
-rw-r--r--fs/ceph/inode.c2
-rw-r--r--fs/dax.c1
-rw-r--r--fs/devpts/inode.c4
-rw-r--r--fs/erofs/Kconfig2
-rw-r--r--fs/erofs/erofs_fs.h8
-rw-r--r--fs/erofs/fileio.c2
-rw-r--r--fs/erofs/zdata.c1
-rw-r--r--fs/erofs/zmap.c5
-rw-r--r--fs/eventpoll.c10
-rw-r--r--fs/exec.c3
-rw-r--r--fs/ext4/block_validity.c5
-rw-r--r--fs/ext4/ialloc.c3
-rw-r--r--fs/ext4/inode.c75
-rw-r--r--fs/ext4/mballoc.c21
-rw-r--r--fs/ext4/namei.c2
-rw-r--r--fs/ext4/super.c2
-rw-r--r--fs/file.c2
-rw-r--r--fs/fuse/virtio_fs.c3
-rw-r--r--fs/gfs2/Kconfig1
-rw-r--r--fs/hfs/bnode.c6
-rw-r--r--fs/hfsplus/bnode.c6
-rw-r--r--fs/iomap/buffered-io.c2
-rw-r--r--fs/isofs/export.c2
-rw-r--r--fs/jbd2/journal.c4
-rw-r--r--fs/jbd2/revoke.c15
-rw-r--r--fs/jffs2/wbuf.c2
-rw-r--r--fs/namei.c89
-rw-r--r--fs/namespace.c103
-rw-r--r--fs/netfs/fscache_cache.c2
-rw-r--r--fs/netfs/fscache_cookie.c2
-rw-r--r--fs/netfs/main.c4
-rw-r--r--fs/nfs/Kconfig2
-rw-r--r--fs/nfs/internal.h7
-rw-r--r--fs/nfs/nfs4session.h4
-rw-r--r--fs/nfsd/Kconfig1
-rw-r--r--fs/nfsd/nfs4state.c2
-rw-r--r--fs/nfsd/nfsfh.h7
-rw-r--r--fs/nilfs2/segment.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c2
-rw-r--r--fs/ocfs2/journal.c2
-rw-r--r--fs/overlayfs/overlayfs.h2
-rw-r--r--fs/overlayfs/super.c5
-rw-r--r--fs/pstore/platform.c2
-rw-r--r--fs/smb/client/cifs_fs_sb.h1
-rw-r--r--fs/smb/client/cifsencrypt.c16
-rw-r--r--fs/smb/client/cifsfs.h5
-rw-r--r--fs/smb/client/cifsglob.h13
-rw-r--r--fs/smb/client/cifspdu.h2
-rw-r--r--fs/smb/client/cifsproto.h2
-rw-r--r--fs/smb/client/cifssmb.c32
-rw-r--r--fs/smb/client/connect.c216
-rw-r--r--fs/smb/client/file.c28
-rw-r--r--fs/smb/client/fs_context.c21
-rw-r--r--fs/smb/client/fs_context.h5
-rw-r--r--fs/smb/client/inode.c44
-rw-r--r--fs/smb/client/link.c8
-rw-r--r--fs/smb/client/misc.c2
-rw-r--r--fs/smb/client/reparse.c63
-rw-r--r--fs/smb/client/reparse.h5
-rw-r--r--fs/smb/client/sess.c60
-rw-r--r--fs/smb/client/smb1ops.c110
-rw-r--r--fs/smb/client/smb2file.c21
-rw-r--r--fs/smb/client/smb2glob.h1
-rw-r--r--fs/smb/client/smb2inode.c67
-rw-r--r--fs/smb/client/smb2ops.c46
-rw-r--r--fs/smb/client/smb2pdu.c35
-rw-r--r--fs/smb/client/transport.c3
-rw-r--r--fs/smb/client/xattr.c36
-rw-r--r--fs/smb/common/smb2pdu.h9
-rw-r--r--fs/smb/server/connection.c4
-rw-r--r--fs/smb/server/oplock.c29
-rw-r--r--fs/smb/server/oplock.h1
-rw-r--r--fs/smb/server/smb2pdu.c4
-rw-r--r--fs/smb/server/smb_common.h2
-rw-r--r--fs/smb/server/transport_ipc.c7
-rw-r--r--fs/smb/server/transport_tcp.c14
-rw-r--r--fs/smb/server/transport_tcp.h1
-rw-r--r--fs/smb/server/vfs.c3
-rw-r--r--fs/smb/server/vfs_cache.c8
-rw-r--r--fs/splice.c2
-rw-r--r--fs/stat.c32
-rw-r--r--fs/xattr.c4
-rw-r--r--fs/xfs/Kconfig2
-rw-r--r--fs/xfs/xfs_buf.c1
-rw-r--r--fs/xfs/xfs_buf_mem.c2
-rw-r--r--fs/xfs/xfs_dquot.c3
-rw-r--r--fs/xfs/xfs_fsmap.c51
-rw-r--r--fs/xfs/xfs_inode_item.c6
-rw-r--r--fs/xfs/xfs_log.c2
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_sysfs.c32
-rw-r--r--fs/xfs/xfs_trans_ail.c5
-rw-r--r--fs/xfs/xfs_trans_priv.h28
-rw-r--r--fs/xfs/xfs_zone_alloc.c7
-rw-r--r--fs/xfs/xfs_zone_gc.c22
163 files changed, 2068 insertions, 1382 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index c718b2e2de0e..5b4847bd2fbb 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -368,6 +368,7 @@ config GRACE_PERIOD
config LOCKD
tristate
depends on FILE_LOCKING
+ select CRC32
select GRACE_PERIOD
config LOCKD_V4
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 691e0ae607a1..8c6130789fde 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -348,9 +348,9 @@ static int afs_dynroot_readdir(struct file *file, struct dir_context *ctx)
}
if ((unsigned long long)ctx->pos <= AFS_MAX_DYNROOT_CELL_INO) {
- rcu_read_lock();
+ down_read(&net->cells_lock);
ret = afs_dynroot_readdir_cells(net, ctx);
- rcu_read_unlock();
+ up_read(&net->cells_lock);
}
return ret;
}
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index 07a8bfbdd9b9..e0030ac74ea0 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -534,6 +534,6 @@ dont_wait:
*/
void afs_fs_probe_cleanup(struct afs_net *net)
{
- if (del_timer_sync(&net->fs_probe_timer))
+ if (timer_delete_sync(&net->fs_probe_timer))
afs_dec_servers_outstanding(net);
}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index c530d1ca15df..8755f2703815 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -318,7 +318,7 @@ struct afs_server *afs_use_server(struct afs_server *server, bool activate,
a = atomic_inc_return(&server->active);
if (a == 1 && activate &&
!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
- del_timer(&server->timer);
+ timer_delete(&server->timer);
trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index bf1c94e51dd0..07709b0d7688 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -4,7 +4,7 @@ config BCACHEFS_FS
depends on BLOCK
select EXPORTFS
select CLOSURES
- select LIBCRC32C
+ select CRC32
select CRC64
select FS_POSIX_ACL
select LZ4_COMPRESS
@@ -15,10 +15,9 @@ config BCACHEFS_FS
select ZLIB_INFLATE
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
- select CRYPTO
select CRYPTO_LIB_SHA256
- select CRYPTO_CHACHA20
- select CRYPTO_POLY1305
+ select CRYPTO_LIB_CHACHA
+ select CRYPTO_LIB_POLY1305
select KEYS
select RAID6_PQ
select XOR_BLOCKS
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 7c930ef77380..effafc3e0ced 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -1425,6 +1425,8 @@ alloc_done:
open_bucket_for_each(c, &wp->ptrs, ob, i)
wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
+ wp->sectors_free = rounddown(wp->sectors_free, block_sectors(c));
+
BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
return 0;
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 69ec6a012898..4c1e33cf57c0 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -110,7 +110,9 @@ static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct writ
unsigned i;
open_bucket_for_each(c, &wp->ptrs, ob, i)
- ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob);
+ ob_push(c, ob->sectors_free < block_sectors(c)
+ ? &ptrs
+ : &keep, ob);
wp->ptrs = keep;
mutex_unlock(&wp->lock);
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 5d9f208a1bb7..75f7408da173 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -788,6 +788,8 @@ struct bch_fs {
unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
u64 btrees_lost_data;
} sb;
+ DARRAY(enum bcachefs_metadata_version)
+ incompat_versions_requested;
#ifdef CONFIG_UNICODE
struct unicode_map *cf_encoding;
@@ -981,8 +983,8 @@ struct bch_fs {
mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
size_t zstd_workspace_size;
- struct crypto_sync_skcipher *chacha20;
- struct crypto_shash *poly1305;
+ struct bch_key chacha20_key;
+ bool chacha20_key_set;
atomic64_t key_version;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index a3db328dee31..d6e4a496f02b 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -366,6 +366,10 @@ static inline void bkey_init(struct bkey *k)
#define __BKEY_PADDED(key, pad) \
struct bkey_i key; __u64 key ## _pad[pad]
+enum bch_bkey_type_flags {
+ BKEY_TYPE_strict_btree_checks = BIT(0),
+};
+
/*
* - DELETED keys are used internally to mark keys that should be ignored but
* override keys in composition order. Their version number is ignored.
@@ -383,46 +387,46 @@ static inline void bkey_init(struct bkey *k)
*
* - WHITEOUT: for hash table btrees
*/
-#define BCH_BKEY_TYPES() \
- x(deleted, 0) \
- x(whiteout, 1) \
- x(error, 2) \
- x(cookie, 3) \
- x(hash_whiteout, 4) \
- x(btree_ptr, 5) \
- x(extent, 6) \
- x(reservation, 7) \
- x(inode, 8) \
- x(inode_generation, 9) \
- x(dirent, 10) \
- x(xattr, 11) \
- x(alloc, 12) \
- x(quota, 13) \
- x(stripe, 14) \
- x(reflink_p, 15) \
- x(reflink_v, 16) \
- x(inline_data, 17) \
- x(btree_ptr_v2, 18) \
- x(indirect_inline_data, 19) \
- x(alloc_v2, 20) \
- x(subvolume, 21) \
- x(snapshot, 22) \
- x(inode_v2, 23) \
- x(alloc_v3, 24) \
- x(set, 25) \
- x(lru, 26) \
- x(alloc_v4, 27) \
- x(backpointer, 28) \
- x(inode_v3, 29) \
- x(bucket_gens, 30) \
- x(snapshot_tree, 31) \
- x(logged_op_truncate, 32) \
- x(logged_op_finsert, 33) \
- x(accounting, 34) \
- x(inode_alloc_cursor, 35)
+#define BCH_BKEY_TYPES() \
+ x(deleted, 0, 0) \
+ x(whiteout, 1, 0) \
+ x(error, 2, 0) \
+ x(cookie, 3, 0) \
+ x(hash_whiteout, 4, BKEY_TYPE_strict_btree_checks) \
+ x(btree_ptr, 5, BKEY_TYPE_strict_btree_checks) \
+ x(extent, 6, BKEY_TYPE_strict_btree_checks) \
+ x(reservation, 7, BKEY_TYPE_strict_btree_checks) \
+ x(inode, 8, BKEY_TYPE_strict_btree_checks) \
+ x(inode_generation, 9, BKEY_TYPE_strict_btree_checks) \
+ x(dirent, 10, BKEY_TYPE_strict_btree_checks) \
+ x(xattr, 11, BKEY_TYPE_strict_btree_checks) \
+ x(alloc, 12, BKEY_TYPE_strict_btree_checks) \
+ x(quota, 13, BKEY_TYPE_strict_btree_checks) \
+ x(stripe, 14, BKEY_TYPE_strict_btree_checks) \
+ x(reflink_p, 15, BKEY_TYPE_strict_btree_checks) \
+ x(reflink_v, 16, BKEY_TYPE_strict_btree_checks) \
+ x(inline_data, 17, BKEY_TYPE_strict_btree_checks) \
+ x(btree_ptr_v2, 18, BKEY_TYPE_strict_btree_checks) \
+ x(indirect_inline_data, 19, BKEY_TYPE_strict_btree_checks) \
+ x(alloc_v2, 20, BKEY_TYPE_strict_btree_checks) \
+ x(subvolume, 21, BKEY_TYPE_strict_btree_checks) \
+ x(snapshot, 22, BKEY_TYPE_strict_btree_checks) \
+ x(inode_v2, 23, BKEY_TYPE_strict_btree_checks) \
+ x(alloc_v3, 24, BKEY_TYPE_strict_btree_checks) \
+ x(set, 25, 0) \
+ x(lru, 26, BKEY_TYPE_strict_btree_checks) \
+ x(alloc_v4, 27, BKEY_TYPE_strict_btree_checks) \
+ x(backpointer, 28, BKEY_TYPE_strict_btree_checks) \
+ x(inode_v3, 29, BKEY_TYPE_strict_btree_checks) \
+ x(bucket_gens, 30, BKEY_TYPE_strict_btree_checks) \
+ x(snapshot_tree, 31, BKEY_TYPE_strict_btree_checks) \
+ x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \
+ x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \
+ x(accounting, 34, BKEY_TYPE_strict_btree_checks) \
+ x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks)
enum bch_bkey_type {
-#define x(name, nr) KEY_TYPE_##name = nr,
+#define x(name, nr, ...) KEY_TYPE_##name = nr,
BCH_BKEY_TYPES()
#undef x
KEY_TYPE_MAX,
@@ -863,6 +867,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
+LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 15c93576b5c2..00d05ccfaf73 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -21,7 +21,7 @@
#include "xattr.h"
const char * const bch2_bkey_types[] = {
-#define x(name, nr) #name,
+#define x(name, nr, ...) #name,
BCH_BKEY_TYPES()
#undef x
NULL
@@ -115,7 +115,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_
})
const struct bkey_ops bch2_bkey_ops[] = {
-#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
+#define x(name, nr, ...) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
BCH_BKEY_TYPES()
#undef x
};
@@ -155,6 +155,12 @@ static u64 bch2_key_types_allowed[] = {
#undef x
};
+static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = {
+#define x(name, nr, flags) [KEY_TYPE_##name] = flags,
+ BCH_BKEY_TYPES()
+#undef x
+};
+
const char *bch2_btree_node_type_str(enum btree_node_type type)
{
return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
@@ -177,8 +183,18 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
if (type >= BKEY_TYPE_NR)
return 0;
- bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
- (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) &&
+ enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX
+ ? bch2_bkey_type_flags[k.k->type]
+ : 0;
+
+ bool strict_key_type_allowed =
+ (from.flags & BCH_VALIDATE_commit) ||
+ type == BKEY_TYPE_btree ||
+ (from.btree < BTREE_ID_NR &&
+ (bkey_flags & BKEY_TYPE_strict_btree_checks));
+
+ bkey_fsck_err_on(strict_key_type_allowed &&
+ k.k->type < KEY_TYPE_MAX &&
!(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)),
c, bkey_invalid_type_for_btree,
"invalid key type for btree %s (%s)",
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index e34e9598ef25..59fa527ac685 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -2577,7 +2577,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct
struct bpos end)
{
if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
- !bkey_eq(iter->pos, POS_MAX)) {
+ !bkey_eq(iter->pos, POS_MAX) &&
+ !((iter->flags & BTREE_ITER_is_extents) &&
+ iter->pos.offset == U64_MAX)) {
+
/*
* bkey_start_pos(), for extents, is not monotonically
* increasing until after filtering for snapshots:
@@ -2602,7 +2605,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct
bch2_trans_verify_not_unlocked_or_in_restart(trans);
bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));
+ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode);
int ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (unlikely(ret)) {
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index d1ad1a7613c9..7d6c971db23c 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -644,8 +644,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
*/
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
- cond_resched();
-
const struct journal_key *l = _l;
const struct journal_key *r = _r;
@@ -689,7 +687,8 @@ void bch2_journal_keys_put(struct bch_fs *c)
static void __journal_keys_sort(struct journal_keys *keys)
{
- sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
+ sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]),
+ journal_sort_key_cmp, NULL);
cond_resched();
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 8c9fdb7263fe..86acf037590c 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -183,7 +183,7 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
return;
if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
- if (!c->chacha20)
+ if (!c->chacha20_key_set)
return;
struct nonce nonce = btree_nonce(&bn->keys, 0);
@@ -398,7 +398,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
bch2_print_string_as_lines(KERN_INFO, buf.buf);
}
- sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
+ sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
dst = 0;
darray_for_each(f->nodes, i) {
@@ -418,7 +418,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
}
f->nodes.nr = dst;
- sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+ sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
if (0 && c->opts.verbose) {
printbuf_reset(&buf);
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 55fbeeb8eaaa..44b5fe430370 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -1221,7 +1221,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
ret = bch2_disk_reservation_get(c, &as->disk_res,
(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
- c->opts.metadata_replicas,
+ READ_ONCE(c->opts.metadata_replicas),
disk_res_flags);
if (ret)
goto err;
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index adbe576ec77e..0941fb2c026d 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -428,10 +428,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
*/
trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
- sort(wb->flushing.keys.data,
- wb->flushing.keys.nr,
- sizeof(wb->flushing.keys.data[0]),
- wb_key_seq_cmp, NULL);
+ sort_nonatomic(wb->flushing.keys.data,
+ wb->flushing.keys.nr,
+ sizeof(wb->flushing.keys.data[0]),
+ wb_key_seq_cmp, NULL);
darray_for_each(wb->flushing.keys, i) {
if (!i->journal_seq)
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index fea61e60a9ee..4ef261e8db4f 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -37,7 +37,8 @@ void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage)
{
memset(usage, 0, sizeof(*usage));
- acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s());
+ acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage,
+ sizeof(struct bch_dev_usage_full) / sizeof(u64));
}
static u64 reserve_factor(u64 r)
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 1c38b165f48b..8d75b27a1418 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -242,11 +242,6 @@ static inline u64 dev_buckets_available(struct bch_dev *ca,
/* Filesystem usage: */
-static inline unsigned dev_usage_u64s(void)
-{
- return sizeof(struct bch_dev_usage) / sizeof(u64);
-}
-
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *);
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 3726689093e3..d0a34a097b80 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -7,17 +7,12 @@
#include "super-io.h"
#include <linux/crc32c.h>
-#include <linux/crypto.h>
#include <linux/xxhash.h>
#include <linux/key.h>
#include <linux/random.h>
#include <linux/ratelimit.h>
-#include <linux/scatterlist.h>
-#include <crypto/algapi.h>
#include <crypto/chacha.h>
-#include <crypto/hash.h>
#include <crypto/poly1305.h>
-#include <crypto/skcipher.h>
#include <keys/user-type.h>
/*
@@ -96,116 +91,40 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void *
}
}
-static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
- struct nonce nonce,
- struct scatterlist *sg, size_t len)
+static void bch2_chacha20_init(u32 state[CHACHA_STATE_WORDS],
+ const struct bch_key *key, struct nonce nonce)
{
- SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)];
- skcipher_request_set_sync_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
+ BUILD_BUG_ON(sizeof(key_words) != sizeof(*key));
+ memcpy(key_words, key, sizeof(key_words));
+ le32_to_cpu_array(key_words, ARRAY_SIZE(key_words));
- int ret = crypto_skcipher_encrypt(req);
- if (ret)
- pr_err("got error %i from crypto_skcipher_encrypt()", ret);
-
- return ret;
-}
-
-static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
- struct nonce nonce,
- void *buf, size_t len)
-{
- if (!is_vmalloc_addr(buf)) {
- struct scatterlist sg = {};
-
- sg_mark_end(&sg);
- sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf));
- return do_encrypt_sg(tfm, nonce, &sg, len);
- } else {
- DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
- size_t sgl_len = 0;
- int ret;
-
- darray_init(&sgl);
-
- while (len) {
- unsigned offset = offset_in_page(buf);
- struct scatterlist sg = {
- .page_link = (unsigned long) vmalloc_to_page(buf),
- .offset = offset,
- .length = min(len, PAGE_SIZE - offset),
- };
+ BUILD_BUG_ON(sizeof(nonce) != CHACHA_IV_SIZE);
+ chacha_init(state, key_words, (const u8 *)nonce.d);
- if (darray_push(&sgl, sg)) {
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
- if (ret)
- goto err;
-
- nonce = nonce_add(nonce, sgl_len);
- sgl_len = 0;
- sgl.nr = 0;
- BUG_ON(darray_push(&sgl, sg));
- }
-
- buf += sg.length;
- len -= sg.length;
- sgl_len += sg.length;
- }
-
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
-err:
- darray_exit(&sgl);
- return ret;
- }
+ memzero_explicit(key_words, sizeof(key_words));
}
-int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
- void *buf, size_t len)
+static void bch2_chacha20(const struct bch_key *key, struct nonce nonce,
+ void *data, size_t len)
{
- struct crypto_sync_skcipher *chacha20 =
- crypto_alloc_sync_skcipher("chacha20", 0, 0);
- int ret;
-
- ret = PTR_ERR_OR_ZERO(chacha20);
- if (ret) {
- pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret));
- return ret;
- }
-
- ret = crypto_skcipher_setkey(&chacha20->base,
- (void *) key, sizeof(*key));
- if (ret) {
- pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret));
- goto err;
- }
+ u32 state[CHACHA_STATE_WORDS];
- ret = do_encrypt(chacha20, nonce, buf, len);
-err:
- crypto_free_sync_skcipher(chacha20);
- return ret;
+ bch2_chacha20_init(state, key, nonce);
+ chacha20_crypt(state, data, data, len);
+ memzero_explicit(state, sizeof(state));
}
-static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
- struct nonce nonce)
+static void bch2_poly1305_init(struct poly1305_desc_ctx *desc,
+ struct bch_fs *c, struct nonce nonce)
{
- u8 key[POLY1305_KEY_SIZE];
- int ret;
+ u8 key[POLY1305_KEY_SIZE] = { 0 };
nonce.d[3] ^= BCH_NONCE_POLY;
- memset(key, 0, sizeof(key));
- ret = do_encrypt(c->chacha20, nonce, key, sizeof(key));
- if (ret)
- return ret;
-
- desc->tfm = c->poly1305;
- crypto_shash_init(desc);
- crypto_shash_update(desc, key, sizeof(key));
- return 0;
+ bch2_chacha20(&c->chacha20_key, nonce, key, sizeof(key));
+ poly1305_init(desc, key);
}
struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
@@ -230,14 +149,13 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
case BCH_CSUM_chacha20_poly1305_80:
case BCH_CSUM_chacha20_poly1305_128: {
- SHASH_DESC_ON_STACK(desc, c->poly1305);
+ struct poly1305_desc_ctx dctx;
u8 digest[POLY1305_DIGEST_SIZE];
struct bch_csum ret = { 0 };
- gen_poly_key(c, desc, nonce);
-
- crypto_shash_update(desc, data, len);
- crypto_shash_final(desc, digest);
+ bch2_poly1305_init(&dctx, c, nonce);
+ poly1305_update(&dctx, data, len);
+ poly1305_final(&dctx, digest);
memcpy(&ret, digest, bch_crc_bytes[type]);
return ret;
@@ -253,11 +171,12 @@ int bch2_encrypt(struct bch_fs *c, unsigned type,
if (!bch2_csum_type_is_encryption(type))
return 0;
- if (bch2_fs_inconsistent_on(!c->chacha20,
+ if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
c, "attempting to encrypt without encryption key"))
return -BCH_ERR_no_encryption_key;
- return do_encrypt(c->chacha20, nonce, data, len);
+ bch2_chacha20(&c->chacha20_key, nonce, data, len);
+ return 0;
}
static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
@@ -296,26 +215,26 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
case BCH_CSUM_chacha20_poly1305_80:
case BCH_CSUM_chacha20_poly1305_128: {
- SHASH_DESC_ON_STACK(desc, c->poly1305);
+ struct poly1305_desc_ctx dctx;
u8 digest[POLY1305_DIGEST_SIZE];
struct bch_csum ret = { 0 };
- gen_poly_key(c, desc, nonce);
+ bch2_poly1305_init(&dctx, c, nonce);
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
- crypto_shash_update(desc, p, bv.bv_len);
+ poly1305_update(&dctx, p, bv.bv_len);
kunmap_local(p);
}
#else
__bio_for_each_bvec(bv, bio, *iter, *iter)
- crypto_shash_update(desc,
+ poly1305_update(&dctx,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
#endif
- crypto_shash_final(desc, digest);
+ poly1305_final(&dctx, digest);
memcpy(&ret, digest, bch_crc_bytes[type]);
return ret;
@@ -338,43 +257,33 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
{
struct bio_vec bv;
struct bvec_iter iter;
- DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
- size_t sgl_len = 0;
+ u32 chacha_state[CHACHA_STATE_WORDS];
int ret = 0;
- if (bch2_fs_inconsistent_on(!c->chacha20,
+ if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
c, "attempting to encrypt without encryption key"))
return -BCH_ERR_no_encryption_key;
- darray_init(&sgl);
+ bch2_chacha20_init(chacha_state, &c->chacha20_key, nonce);
bio_for_each_segment(bv, bio, iter) {
- struct scatterlist sg = {
- .page_link = (unsigned long) bv.bv_page,
- .offset = bv.bv_offset,
- .length = bv.bv_len,
- };
-
- if (darray_push(&sgl, sg)) {
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
- if (ret)
- goto err;
-
- nonce = nonce_add(nonce, sgl_len);
- sgl_len = 0;
- sgl.nr = 0;
-
- BUG_ON(darray_push(&sgl, sg));
+ void *p;
+
+ /*
+ * chacha_crypt() assumes that the length is a multiple of
+ * CHACHA_BLOCK_SIZE on any non-final call.
+ */
+ if (!IS_ALIGNED(bv.bv_len, CHACHA_BLOCK_SIZE)) {
+ bch_err_ratelimited(c, "bio not aligned for encryption");
+ ret = -EIO;
+ break;
}
- sgl_len += sg.length;
+ p = bvec_kmap_local(&bv);
+ chacha20_crypt(chacha_state, p, p, bv.bv_len);
+ kunmap_local(p);
}
-
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
-err:
- darray_exit(&sgl);
+ memzero_explicit(chacha_state, sizeof(chacha_state));
return ret;
}
@@ -650,10 +559,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
}
/* decrypt real key: */
- ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
- &sb_key, sizeof(sb_key));
- if (ret)
- goto err;
+ bch2_chacha20(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key));
if (bch2_key_is_encrypted(&sb_key)) {
bch_err(c, "incorrect encryption key");
@@ -668,31 +574,6 @@ err:
return ret;
}
-static int bch2_alloc_ciphers(struct bch_fs *c)
-{
- if (c->chacha20)
- return 0;
-
- struct crypto_sync_skcipher *chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
- int ret = PTR_ERR_OR_ZERO(chacha20);
- if (ret) {
- bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret));
- return ret;
- }
-
- struct crypto_shash *poly1305 = crypto_alloc_shash("poly1305", 0, 0);
- ret = PTR_ERR_OR_ZERO(poly1305);
- if (ret) {
- bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret));
- crypto_free_sync_skcipher(chacha20);
- return ret;
- }
-
- c->chacha20 = chacha20;
- c->poly1305 = poly1305;
- return 0;
-}
-
#if 0
/*
@@ -797,35 +678,21 @@ err:
void bch2_fs_encryption_exit(struct bch_fs *c)
{
- if (c->poly1305)
- crypto_free_shash(c->poly1305);
- if (c->chacha20)
- crypto_free_sync_skcipher(c->chacha20);
+ memzero_explicit(&c->chacha20_key, sizeof(c->chacha20_key));
}
int bch2_fs_encryption_init(struct bch_fs *c)
{
struct bch_sb_field_crypt *crypt;
- struct bch_key key;
- int ret = 0;
+ int ret;
crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
if (!crypt)
- goto out;
+ return 0;
- ret = bch2_alloc_ciphers(c);
+ ret = bch2_decrypt_sb_key(c, crypt, &c->chacha20_key);
if (ret)
- goto out;
-
- ret = bch2_decrypt_sb_key(c, crypt, &key);
- if (ret)
- goto out;
-
- ret = crypto_skcipher_setkey(&c->chacha20->base,
- (void *) &key.key, sizeof(key.key));
- if (ret)
- goto out;
-out:
- memzero_explicit(&key, sizeof(key));
- return ret;
+ return ret;
+ c->chacha20_key_set = true;
+ return 0;
}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 4ac251c8fcd8..1310782d3ae9 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -69,7 +69,6 @@ static inline void bch2_csum_err_msg(struct printbuf *out,
bch2_csum_to_text(out, type, expected);
}
-int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
int bch2_request_key(struct bch_sb *, struct bch_key *);
#ifndef __KERNEL__
int bch2_revoke_key(struct bch_sb *);
@@ -156,7 +155,7 @@ static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
if (type >= BCH_CSUM_NR)
return false;
- if (bch2_csum_type_is_encryption(type) && !c->chacha20)
+ if (bch2_csum_type_is_encryption(type) && !c->chacha20_key_set)
return false;
return true;
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index 1f8e035d7119..d6dd12d74d4f 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -121,7 +121,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
} while (0);
__set_current_state(TASK_RUNNING);
- del_timer_sync(&wait.cpu_timer);
+ timer_delete_sync(&wait.cpu_timer);
destroy_timer_on_stack(&wait.cpu_timer);
bch2_io_timer_del(clock, &wait.io_timer);
}
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index de02ebf847ec..b211c97238ab 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -607,7 +607,7 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update
prt_newline(out);
printbuf_indent_add(out, 2);
bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
- prt_printf(out, "read_done:\t\%u\n", m->read_done);
+ prt_printf(out, "read_done:\t%u\n", m->read_done);
bch2_write_op_to_text(out, &m->op);
printbuf_indent_sub(out, 2);
}
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index bf53a029f356..92ee59d9e00e 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -13,8 +13,8 @@
#include <linux/dcache.h>
-static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
- const struct qstr *str, struct qstr *out_cf)
+int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
+ const struct qstr *str, struct qstr *out_cf)
{
*out_cf = (struct qstr) QSTR_INIT(NULL, 0);
@@ -35,18 +35,6 @@ static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *
#endif
}
-static inline int bch2_maybe_casefold(struct btree_trans *trans,
- const struct bch_hash_info *info,
- const struct qstr *str, struct qstr *out_cf)
-{
- if (likely(!info->cf_encoding)) {
- *out_cf = *str;
- return 0;
- } else {
- return bch2_casefold(trans, info, str, out_cf);
- }
-}
-
static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))
@@ -287,8 +275,8 @@ static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent,
EBUG_ON(!dirent->v.d_casefold);
EBUG_ON(!cf_name->len);
- dirent->v.d_cf_name_block.d_name_len = name->len;
- dirent->v.d_cf_name_block.d_cf_name_len = cf_name->len;
+ dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
+ dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len);
memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len);
memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0,
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 0880772b80a9..9838a7ba7ed1 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -23,6 +23,21 @@ struct bch_fs;
struct bch_hash_info;
struct bch_inode_info;
+int bch2_casefold(struct btree_trans *, const struct bch_hash_info *,
+ const struct qstr *, struct qstr *);
+
+static inline int bch2_maybe_casefold(struct btree_trans *trans,
+ const struct bch_hash_info *info,
+ const struct qstr *str, struct qstr *out_cf)
+{
+ if (likely(!info->cf_encoding)) {
+ *out_cf = *str;
+ return 0;
+ } else {
+ return bch2_casefold(trans, info, str, out_cf);
+ }
+}
+
struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len)
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index c8696f01eb14..a615e4852ded 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -287,7 +287,7 @@
x(EIO, mark_stripe) \
x(EIO, stripe_reconstruct) \
x(EIO, key_type_error) \
- x(EIO, extent_poisened) \
+ x(EIO, extent_poisoned) \
x(EIO, missing_indirect_extent) \
x(EIO, invalidate_stripe_to_dev) \
x(EIO, no_encryption_key) \
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index baf5dfb32298..925b0b54ea2f 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -272,9 +272,6 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c,
{
struct fsck_err_state *s;
- if (!test_bit(BCH_FS_fsck_running, &c->flags))
- return NULL;
-
list_for_each_entry(s, &c->fsck_error_msgs, list)
if (s->id == id) {
/*
@@ -639,14 +636,14 @@ int __bch2_bkey_fsck_err(struct bch_fs *c,
return ret;
}
-void bch2_flush_fsck_errs(struct bch_fs *c)
+static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print)
{
struct fsck_err_state *s, *n;
mutex_lock(&c->fsck_error_msgs_lock);
list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
- if (s->ratelimited && s->last_msg)
+ if (print && s->ratelimited && s->last_msg)
bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
list_del(&s->list);
@@ -657,6 +654,16 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
mutex_unlock(&c->fsck_error_msgs_lock);
}
+void bch2_flush_fsck_errs(struct bch_fs *c)
+{
+ __bch2_flush_fsck_errs(c, true);
+}
+
+void bch2_free_fsck_errs(struct bch_fs *c)
+{
+ __bch2_flush_fsck_errs(c, false);
+}
+
int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
subvol_inum inum, u64 offset)
{
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index d0d024dc714b..4a364fd44abe 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -93,6 +93,7 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
_flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__)
void bch2_flush_fsck_errs(struct bch_fs *);
+void bch2_free_fsck_errs(struct bch_fs *);
#define fsck_err_wrap(_do) \
({ \
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index ae7c7a177e10..dca2b8425cc0 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -139,7 +139,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return -BCH_ERR_extent_poisened;
+ return -BCH_ERR_extent_poisoned;
rcu_read_lock();
const union bch_extent_entry *entry;
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 19d4599918dc..e3a75dcca60c 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -225,11 +225,26 @@ static void bchfs_read(struct btree_trans *trans,
bch2_read_extent(trans, rbio, iter.pos,
data_btree, k, offset_into_extent, flags);
- swap(rbio->bio.bi_iter.bi_size, bytes);
+ /*
+ * Careful there's a landmine here if bch2_read_extent() ever
+ * starts returning transaction restarts here.
+ *
+ * We've changed rbio->bi_iter.bi_size to be "bytes we can read
+ * from this extent" with the swap call, and we restore it
+ * below. That restore needs to come before checking for
+ * errors.
+ *
+ * But unlike __bch2_read(), we use the rbio bvec iter, not one
+ * on the stack, so we can't do the restore right after the
+ * bch2_read_extent() call: we don't own that iterator anymore
+ * if BCH_READ_last_fragment is set, since we may have submitted
+ * that rbio instead of cloning it.
+ */
if (flags & BCH_READ_last_fragment)
break;
+ swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
err:
if (ret &&
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index c1553e44e049..a82dfce9e4ad 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -21,206 +21,6 @@
#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
-struct flags_set {
- unsigned mask;
- unsigned flags;
-
- unsigned projid;
-
- bool set_projinherit;
- bool projinherit;
-};
-
-static int bch2_inode_flags_set(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- /*
- * We're relying on btree locking here for exclusion with other ioctl
- * calls - use the flags in the btree (@bi), not inode->i_flags:
- */
- struct flags_set *s = p;
- unsigned newflags = s->flags;
- unsigned oldflags = bi->bi_flags & s->mask;
-
- if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) &&
- !capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
-
- if (!S_ISREG(bi->bi_mode) &&
- !S_ISDIR(bi->bi_mode) &&
- (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
- return -EINVAL;
-
- if ((newflags ^ oldflags) & BCH_INODE_casefolded) {
-#ifdef CONFIG_UNICODE
- int ret = 0;
- /* Not supported on individual files. */
- if (!S_ISDIR(bi->bi_mode))
- return -EOPNOTSUPP;
-
- /*
- * Make sure the dir is empty, as otherwise we'd need to
- * rehash everything and update the dirent keys.
- */
- ret = bch2_empty_dir_trans(trans, inode_inum(inode));
- if (ret < 0)
- return ret;
-
- ret = bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding);
- if (ret)
- return ret;
-
- bch2_check_set_feature(c, BCH_FEATURE_casefolding);
-#else
- printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n");
- return -EOPNOTSUPP;
-#endif
- }
-
- if (s->set_projinherit) {
- bi->bi_fields_set &= ~(1 << Inode_opt_project);
- bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);
- }
-
- bi->bi_flags &= ~s->mask;
- bi->bi_flags |= newflags;
-
- bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
- return 0;
-}
-
-static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
-{
- unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
-
- return put_user(flags, arg);
-}
-
-static int bch2_ioc_setflags(struct bch_fs *c,
- struct file *file,
- struct bch_inode_info *inode,
- void __user *arg)
-{
- struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) };
- unsigned uflags;
- int ret;
-
- if (get_user(uflags, (int __user *) arg))
- return -EFAULT;
-
- s.flags = map_flags_rev(bch_flags_to_uflags, uflags);
- if (uflags)
- return -EOPNOTSUPP;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- inode_lock(&inode->v);
- if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
- ret = -EACCES;
- goto setflags_out;
- }
-
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
- bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
- ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
-
-setflags_out:
- inode_unlock(&inode->v);
- mnt_drop_write_file(file);
- return ret;
-}
-
-static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
- struct fsxattr __user *arg)
-{
- struct fsxattr fa = { 0 };
-
- fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
-
- if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
- fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;
-
- fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
-
- if (copy_to_user(arg, &fa, sizeof(fa)))
- return -EFAULT;
-
- return 0;
-}
-
-static int fssetxattr_inode_update_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct flags_set *s = p;
-
- if (s->projid != bi->bi_project) {
- bi->bi_fields_set |= 1U << Inode_opt_project;
- bi->bi_project = s->projid;
- }
-
- return bch2_inode_flags_set(trans, inode, bi, p);
-}
-
-static int bch2_ioc_fssetxattr(struct bch_fs *c,
- struct file *file,
- struct bch_inode_info *inode,
- struct fsxattr __user *arg)
-{
- struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) };
- struct fsxattr fa;
- int ret;
-
- if (copy_from_user(&fa, arg, sizeof(fa)))
- return -EFAULT;
-
- s.set_projinherit = true;
- s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0;
- fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
-
- s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
- if (fa.fsx_xflags)
- return -EOPNOTSUPP;
-
- if (fa.fsx_projid >= U32_MAX)
- return -EINVAL;
-
- /*
- * inode fields accessible via the xattr interface are stored with a +1
- * bias, so that 0 means unset:
- */
- s.projid = fa.fsx_projid + 1;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- inode_lock(&inode->v);
- if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
- ret = -EACCES;
- goto err;
- }
-
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
- bch2_set_projid(c, inode, fa.fsx_projid) ?:
- bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
- ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
-err:
- inode_unlock(&inode->v);
- mnt_drop_write_file(file);
- return ret;
-}
-
static int bch2_reinherit_attrs_fn(struct btree_trans *trans,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
@@ -558,23 +358,6 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
long ret;
switch (cmd) {
- case FS_IOC_GETFLAGS:
- ret = bch2_ioc_getflags(inode, (int __user *) arg);
- break;
-
- case FS_IOC_SETFLAGS:
- ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg);
- break;
-
- case FS_IOC_FSGETXATTR:
- ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg);
- break;
-
- case FS_IOC_FSSETXATTR:
- ret = bch2_ioc_fssetxattr(c, file, inode,
- (void __user *) arg);
- break;
-
case BCHFS_IOC_REINHERIT_ATTRS:
ret = bch2_ioc_reinherit_attrs(c, file, inode,
(void __user *) arg);
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
index ecd3bfdcde21..a657e4994b71 100644
--- a/fs/bcachefs/fs-ioctl.h
+++ b/fs/bcachefs/fs-ioctl.h
@@ -2,81 +2,6 @@
#ifndef _BCACHEFS_FS_IOCTL_H
#define _BCACHEFS_FS_IOCTL_H
-/* Inode flags: */
-
-/* bcachefs inode flags -> vfs inode flags: */
-static const __maybe_unused unsigned bch_flags_to_vfs[] = {
- [__BCH_INODE_sync] = S_SYNC,
- [__BCH_INODE_immutable] = S_IMMUTABLE,
- [__BCH_INODE_append] = S_APPEND,
- [__BCH_INODE_noatime] = S_NOATIME,
- [__BCH_INODE_casefolded] = S_CASEFOLD,
-};
-
-/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
-static const __maybe_unused unsigned bch_flags_to_uflags[] = {
- [__BCH_INODE_sync] = FS_SYNC_FL,
- [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
- [__BCH_INODE_append] = FS_APPEND_FL,
- [__BCH_INODE_nodump] = FS_NODUMP_FL,
- [__BCH_INODE_noatime] = FS_NOATIME_FL,
- [__BCH_INODE_casefolded] = FS_CASEFOLD_FL,
-};
-
-/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
-static const __maybe_unused unsigned bch_flags_to_xflags[] = {
- [__BCH_INODE_sync] = FS_XFLAG_SYNC,
- [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
- [__BCH_INODE_append] = FS_XFLAG_APPEND,
- [__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
- [__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
- //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
-};
-
-#define set_flags(_map, _in, _out) \
-do { \
- unsigned _i; \
- \
- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
- if ((_in) & (1 << _i)) \
- (_out) |= _map[_i]; \
- else \
- (_out) &= ~_map[_i]; \
-} while (0)
-
-#define map_flags(_map, _in) \
-({ \
- unsigned _out = 0; \
- \
- set_flags(_map, _in, _out); \
- _out; \
-})
-
-#define map_flags_rev(_map, _in) \
-({ \
- unsigned _i, _out = 0; \
- \
- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
- if ((_in) & _map[_i]) { \
- (_out) |= 1 << _i; \
- (_in) &= ~_map[_i]; \
- } \
- (_out); \
-})
-
-#define map_defined(_map) \
-({ \
- unsigned _in = ~0; \
- \
- map_flags_rev(_map, _in); \
-})
-
-/* Set VFS inode flags from bcachefs inode: */
-static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
-{
- set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
-}
-
long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 5a41b1a8e54f..0f1d61aab90b 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -33,6 +33,7 @@
#include <linux/backing-dev.h>
#include <linux/exportfs.h>
#include <linux/fiemap.h>
+#include <linux/fileattr.h>
#include <linux/fs_context.h>
#include <linux/module.h>
#include <linux/pagemap.h>
@@ -51,6 +52,22 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
struct bch_subvolume *);
+/* Set VFS inode flags from bcachefs inode: */
+static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode)
+{
+ static const __maybe_unused unsigned bch_flags_to_vfs[] = {
+ [__BCH_INODE_sync] = S_SYNC,
+ [__BCH_INODE_immutable] = S_IMMUTABLE,
+ [__BCH_INODE_append] = S_APPEND,
+ [__BCH_INODE_noatime] = S_NOATIME,
+ };
+
+ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
+
+ if (bch2_inode_casefold(c, &inode->ei_inode))
+ inode->v.i_flags |= S_CASEFOLD;
+}
+
void bch2_inode_update_after_write(struct btree_trans *trans,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
@@ -79,7 +96,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans,
inode->ei_inode = *bi;
- bch2_inode_flags_to_vfs(inode);
+ bch2_inode_flags_to_vfs(c, inode);
}
int __must_check bch2_write_inode(struct bch_fs *c,
@@ -631,13 +648,18 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
const struct qstr *name)
{
struct bch_fs *c = trans->c;
- struct btree_iter dirent_iter = {};
subvol_inum inum = {};
struct printbuf buf = PRINTBUF;
+ struct qstr lookup_name;
+ int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name);
+ if (ret)
+ return ERR_PTR(ret);
+
+ struct btree_iter dirent_iter = {};
struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
- dir_hash_info, dir, name, 0);
- int ret = bkey_err(k);
+ dir_hash_info, dir, &lookup_name, 0);
+ ret = bkey_err(k);
if (ret)
return ERR_PTR(ret);
@@ -825,6 +847,11 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
*/
set_nlink(&inode->v, 0);
}
+
+ if (IS_CASEFOLDED(vdir)) {
+ d_invalidate(dentry);
+ d_prune_aliases(&inode->v);
+ }
err:
bch2_trans_put(trans);
bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
@@ -1235,10 +1262,20 @@ static int bch2_tmpfile(struct mnt_idmap *idmap,
return finish_open_simple(file, 0);
}
+struct bch_fiemap_extent {
+ struct bkey_buf kbuf;
+ unsigned flags;
+};
+
static int bch2_fill_extent(struct bch_fs *c,
struct fiemap_extent_info *info,
- struct bkey_s_c k, unsigned flags)
+ struct bch_fiemap_extent *fe)
{
+ struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k);
+ unsigned flags = fe->flags;
+
+ BUG_ON(!k.k->size);
+
if (bkey_extent_is_direct_data(k.k)) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@@ -1291,110 +1328,223 @@ static int bch2_fill_extent(struct bch_fs *c,
}
}
-static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
- u64 start, u64 len)
+/*
+ * Scan a range of an inode for data in pagecache.
+ *
+ * Intended to be retryable, so don't modify the output params until success is
+ * imminent.
+ */
+static int
+bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
+ bool nonblock)
{
- struct bch_fs *c = vinode->i_sb->s_fs_info;
- struct bch_inode_info *ei = to_bch_ei(vinode);
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_buf cur, prev;
- bool have_extent = false;
- int ret = 0;
+ loff_t dstart, dend;
- ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
- if (ret)
+ dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock);
+ if (dstart < 0)
+ return dstart;
+
+ if (dstart == *end) {
+ *start = dstart;
+ return 0;
+ }
+
+ dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock);
+ if (dend < 0)
+ return dend;
+
+ /* race */
+ BUG_ON(dstart == dend);
+
+ *start = dstart;
+ *end = dend;
+ return 0;
+}
+
+/*
+ * Scan a range of pagecache that corresponds to a file mapping hole in the
+ * extent btree. If data is found, fake up an extent key so it looks like a
+ * delalloc extent to the rest of the fiemap processing code.
+ */
+static int
+bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode,
+ u64 start, u64 end, struct bch_fiemap_extent *cur)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_extent *delextent;
+ struct bch_extent_ptr ptr = {};
+ loff_t dstart = start << 9, dend = end << 9;
+ int ret;
+
+ /*
+ * We hold btree locks here so we cannot block on folio locks without
+ * dropping trans locks first. Run a nonblocking scan for the common
+ * case of no folios over holes and fall back on failure.
+ *
+ * Note that dropping locks like this is technically racy against
+ * writeback inserting to the extent tree, but a non-sync fiemap scan is
+ * fundamentally racy with writeback anyways. Therefore, just report the
+ * range as delalloc regardless of whether we have to cycle trans locks.
+ */
+ ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true);
+ if (ret == -EAGAIN)
+ ret = drop_locks_do(trans,
+ bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false));
+ if (ret < 0)
return ret;
- struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
- if (start + len < start)
- return -EINVAL;
+ /*
+ * Create a fake extent key in the buffer. We have to add a dummy extent
+ * pointer for the fill code to add an extent entry. It's explicitly
+ * zeroed to reflect delayed allocation (i.e. phys offset 0).
+ */
+ bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
+ delextent = bkey_extent_init(cur->kbuf.k);
+ delextent->k.p = POS(inode->ei_inum.inum, dend >> 9);
+ delextent->k.size = (dend - dstart) >> 9;
+ bch2_bkey_append_ptr(&delextent->k_i, ptr);
- start >>= 9;
+ cur->flags = FIEMAP_EXTENT_DELALLOC;
- bch2_bkey_buf_init(&cur);
- bch2_bkey_buf_init(&prev);
- trans = bch2_trans_get(c);
+ return 0;
+}
+
+static int bch2_next_fiemap_extent(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ u64 start, u64 end,
+ struct bch_fiemap_extent *cur)
+{
+ u32 snapshot;
+ int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+ struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(ei->v.i_ino, start), 0);
+ SPOS(inode->ei_inum.inum, start, snapshot), 0);
- while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- enum btree_id data_btree = BTREE_ID_extents;
+ struct bkey_s_c k =
+ bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end));
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
- bch2_trans_begin(trans);
+ ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, end, cur);
+ if (ret)
+ goto err;
- u32 snapshot;
- ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
- if (ret)
- continue;
+ struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k);
- bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
+ /*
+ * Does the pagecache or the btree take precedence?
+ *
+ * It _should_ be the pagecache, so that we correctly report delalloc
+ * extents when dirty in the pagecache (we're COW, after all).
+ *
+ * But we'd have to add per-sector writeback tracking to
+ * bch_folio_state, otherwise we report delalloc extents for clean
+ * cached data in the pagecache.
+ *
+ * We should do this, but even then fiemap won't report stable mappings:
+ * on bcachefs data moves around in the background (copygc, rebalance)
+ * and we don't provide a way for userspace to lock that out.
+ */
+ if (k.k &&
+ bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)),
+ pagecache_start)) {
+ bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k);
+ bch2_cut_front(iter.pos, cur->kbuf.k);
+ bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k);
+ cur->flags = 0;
+ } else if (k.k) {
+ bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k);
+ }
- k = bch2_btree_iter_peek_max(trans, &iter, end);
- ret = bkey_err(k);
+ if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) {
+ unsigned sectors = cur->kbuf.k->k.size;
+ s64 offset_into_extent = 0;
+ enum btree_id data_btree = BTREE_ID_extents;
+ int ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent,
+ &cur->kbuf);
if (ret)
- continue;
+ goto err;
- if (!k.k)
- break;
+ struct bkey_i *k = cur->kbuf.k;
+ sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent);
- if (!bkey_extent_is_data(k.k) &&
- k.k->type != KEY_TYPE_reservation) {
- bch2_btree_iter_advance(trans, &iter);
- continue;
- }
+ bch2_cut_front(POS(k->k.p.inode,
+ bkey_start_offset(&k->k) + offset_into_extent),
+ k);
+ bch2_key_resize(&k->k, sectors);
+ k->k.p = iter.pos;
+ k->k.p.offset += k->k.size;
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
- s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
- unsigned sectors = k.k->size - offset_into_extent;
+static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
+ u64 start, u64 len)
+{
+ struct bch_fs *c = vinode->i_sb->s_fs_info;
+ struct bch_inode_info *ei = to_bch_ei(vinode);
+ struct btree_trans *trans;
+ struct bch_fiemap_extent cur, prev;
+ int ret = 0;
+
+ ret = fiemap_prep(&ei->v, info, start, &len, 0);
+ if (ret)
+ return ret;
+
+ if (start + len < start)
+ return -EINVAL;
+
+ start >>= 9;
+ u64 end = (start + len) >> 9;
- bch2_bkey_buf_reassemble(&cur, c, k);
+ bch2_bkey_buf_init(&cur.kbuf);
+ bch2_bkey_buf_init(&prev.kbuf);
+ bkey_init(&prev.kbuf.k->k);
- ret = bch2_read_indirect_extent(trans, &data_btree,
- &offset_into_extent, &cur);
+ trans = bch2_trans_get(c);
+
+ while (start < end) {
+ ret = lockrestart_do(trans,
+ bch2_next_fiemap_extent(trans, ei, start, end, &cur));
if (ret)
- continue;
+ goto err;
- k = bkey_i_to_s_c(cur.k);
- bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
+ BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start);
+ BUG_ON(cur.kbuf.k->k.p.offset > end);
- sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
+ if (bkey_start_offset(&cur.kbuf.k->k) == end)
+ break;
- bch2_cut_front(POS(k.k->p.inode,
- bkey_start_offset(k.k) +
- offset_into_extent),
- cur.k);
- bch2_key_resize(&cur.k->k, sectors);
- cur.k->k.p = iter.pos;
- cur.k->k.p.offset += cur.k->k.size;
+ start = cur.kbuf.k->k.p.offset;
- if (have_extent) {
+ if (!bkey_deleted(&prev.kbuf.k->k)) {
bch2_trans_unlock(trans);
- ret = bch2_fill_extent(c, info,
- bkey_i_to_s_c(prev.k), 0);
+ ret = bch2_fill_extent(c, info, &prev);
if (ret)
- break;
+ goto err;
}
- bkey_copy(prev.k, cur.k);
- have_extent = true;
-
- bch2_btree_iter_set_pos(trans, &iter,
- POS(iter.pos.inode, iter.pos.offset + sectors));
+ bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k);
+ prev.flags = cur.flags;
}
- bch2_trans_iter_exit(trans, &iter);
- if (!ret && have_extent) {
+ if (!bkey_deleted(&prev.kbuf.k->k)) {
bch2_trans_unlock(trans);
- ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
- FIEMAP_EXTENT_LAST);
+ prev.flags |= FIEMAP_EXTENT_LAST;
+ ret = bch2_fill_extent(c, info, &prev);
}
-
+err:
bch2_trans_put(trans);
- bch2_bkey_buf_exit(&cur, c);
- bch2_bkey_buf_exit(&prev, c);
- return ret < 0 ? ret : 0;
+ bch2_bkey_buf_exit(&cur.kbuf, c);
+ bch2_bkey_buf_exit(&prev.kbuf, c);
+
+ return bch2_err_class(ret < 0 ? ret : 0);
}
static const struct vm_operations_struct bch_vm_ops = {
@@ -1449,6 +1599,165 @@ static int bch2_open(struct inode *vinode, struct file *file)
return generic_file_open(vinode, file);
}
+/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
+static const __maybe_unused unsigned bch_flags_to_uflags[] = {
+ [__BCH_INODE_sync] = FS_SYNC_FL,
+ [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
+ [__BCH_INODE_append] = FS_APPEND_FL,
+ [__BCH_INODE_nodump] = FS_NODUMP_FL,
+ [__BCH_INODE_noatime] = FS_NOATIME_FL,
+};
+
+/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
+static const __maybe_unused unsigned bch_flags_to_xflags[] = {
+ [__BCH_INODE_sync] = FS_XFLAG_SYNC,
+ [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
+ [__BCH_INODE_append] = FS_XFLAG_APPEND,
+ [__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
+ [__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
+};
+
+static int bch2_fileattr_get(struct dentry *dentry,
+ struct fileattr *fa)
+{
+ struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+ fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags));
+
+ if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
+ fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
+
+ if (bch2_inode_casefold(c, &inode->ei_inode))
+ fa->flags |= FS_CASEFOLD_FL;
+
+ fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ];
+ return 0;
+}
+
+struct flags_set {
+ unsigned mask;
+ unsigned flags;
+ unsigned projid;
+ bool set_project;
+ bool set_casefold;
+ bool casefold;
+};
+
+static int fssetxattr_inode_update_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_fs *c = trans->c;
+ struct flags_set *s = p;
+
+ /*
+ * We're relying on btree locking here for exclusion with other ioctl
+ * calls - use the flags in the btree (@bi), not inode->i_flags:
+ */
+ if (!S_ISREG(bi->bi_mode) &&
+ !S_ISDIR(bi->bi_mode) &&
+ (s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags)
+ return -EINVAL;
+
+ if (s->casefold != bch2_inode_casefold(c, bi)) {
+#ifdef CONFIG_UNICODE
+ int ret = 0;
+ /* Not supported on individual files. */
+ if (!S_ISDIR(bi->bi_mode))
+ return -EOPNOTSUPP;
+
+ /*
+ * Make sure the dir is empty, as otherwise we'd need to
+ * rehash everything and update the dirent keys.
+ */
+ ret = bch2_empty_dir_trans(trans, inode_inum(inode));
+ if (ret < 0)
+ return ret;
+
+ ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding);
+ if (ret)
+ return ret;
+
+ bch2_check_set_feature(c, BCH_FEATURE_casefolding);
+
+ bi->bi_casefold = s->casefold + 1;
+ bi->bi_fields_set |= BIT(Inode_opt_casefold);
+
+#else
+ printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n");
+ return -EOPNOTSUPP;
+#endif
+ }
+
+ if (s->set_project) {
+ bi->bi_project = s->projid;
+ bi->bi_fields_set |= BIT(Inode_opt_project);
+ }
+
+ bi->bi_flags &= ~s->mask;
+ bi->bi_flags |= s->flags;
+
+ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
+ return 0;
+}
+
+static int bch2_fileattr_set(struct mnt_idmap *idmap,
+ struct dentry *dentry,
+ struct fileattr *fa)
+{
+ struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct flags_set s = {};
+ int ret;
+
+ if (fa->fsx_valid) {
+ fa->fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
+
+ s.mask = map_defined(bch_flags_to_xflags);
+ s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags);
+ if (fa->fsx_xflags)
+ return -EOPNOTSUPP;
+
+ if (fa->fsx_projid >= U32_MAX)
+ return -EINVAL;
+
+ /*
+ * inode fields accessible via the xattr interface are stored with a +1
+ * bias, so that 0 means unset:
+ */
+ if ((inode->ei_inode.bi_project ||
+ fa->fsx_projid) &&
+ inode->ei_inode.bi_project != fa->fsx_projid + 1) {
+ s.projid = fa->fsx_projid + 1;
+ s.set_project = true;
+ }
+ }
+
+ if (fa->flags_valid) {
+ s.mask = map_defined(bch_flags_to_uflags);
+
+ s.set_casefold = true;
+ s.casefold = (fa->flags & FS_CASEFOLD_FL) != 0;
+ fa->flags &= ~FS_CASEFOLD_FL;
+
+ s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags);
+ if (fa->flags)
+ return -EOPNOTSUPP;
+ }
+
+ mutex_lock(&inode->ei_update_lock);
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
+ (s.set_project
+ ? bch2_set_projid(c, inode, fa->fsx_projid)
+ : 0) ?:
+ bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
+ ATTR_CTIME);
+ mutex_unlock(&inode->ei_update_lock);
+ return ret;
+}
+
static const struct file_operations bch_file_operations = {
.open = bch2_open,
.llseek = bch2_llseek,
@@ -1476,6 +1785,8 @@ static const struct inode_operations bch_file_inode_operations = {
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
+ .fileattr_get = bch2_fileattr_get,
+ .fileattr_set = bch2_fileattr_set,
};
static const struct inode_operations bch_dir_inode_operations = {
@@ -1496,6 +1807,8 @@ static const struct inode_operations bch_dir_inode_operations = {
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
+ .fileattr_get = bch2_fileattr_get,
+ .fileattr_set = bch2_fileattr_set,
};
static const struct file_operations bch_dir_file_operations = {
@@ -1518,6 +1831,8 @@ static const struct inode_operations bch_symlink_inode_operations = {
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
+ .fileattr_get = bch2_fileattr_get,
+ .fileattr_set = bch2_fileattr_set,
};
static const struct inode_operations bch_special_inode_operations = {
@@ -1528,6 +1843,8 @@ static const struct inode_operations bch_special_inode_operations = {
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
+ .fileattr_get = bch2_fileattr_get,
+ .fileattr_set = bch2_fileattr_set,
};
static const struct address_space_operations bch_address_space_operations = {
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 18308f3d64a1..7b25cedd3e40 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -321,6 +321,31 @@ static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
return false;
+ /*
+ * Subvolume roots are special: older versions of subvolume roots may be
+ * disconnected, it's only the newest version that matters.
+ *
+ * We only keep a single dirent pointing to a subvolume root, i.e.
+ * older versions of snapshots will not have a different dirent pointing
+ * to the same subvolume root.
+ *
+ * This is because dirents that point to subvolumes are only visible in
+ * the parent subvolume - versioning is not needed - and keeping them
+ * around would break fsck, because when we're crossing subvolumes we
+ * don't have a consistent snapshot ID to do check the inode <-> dirent
+ * relationships.
+ *
+ * Thus, a subvolume root that's been renamed after a snapshot will have
+ * a disconnected older version - that's expected.
+ *
+ * Note that taking a snapshot always updates the root inode (to update
+ * the dirent backpointer), so a subvolume root inode with
+ * BCH_INODE_has_child_snapshot is never visible.
+ */
+ if (inode->bi_subvol &&
+ (inode->bi_flags & BCH_INODE_has_child_snapshot))
+ return false;
+
return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
}
@@ -1007,6 +1032,23 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
+ if ((ret || dirent_points_to_inode_nowarn(d, inode)) &&
+ inode->bi_subvol &&
+ (inode->bi_flags & BCH_INODE_has_child_snapshot)) {
+ /* Older version of a renamed subvolume root: we won't have a
+ * correct dirent for it. That's expected, see
+ * inode_should_reattach().
+ *
+ * We don't clear the backpointer field when doing the rename
+ * because there might be arbitrarily many versions in older
+ * snapshots.
+ */
+ inode->bi_dir = 0;
+ inode->bi_dir_offset = 0;
+ *write_inode = true;
+ goto out;
+ }
+
if (fsck_err_on(ret,
trans, inode_points_to_missing_dirent,
"inode points to missing dirent\n%s",
@@ -1027,7 +1069,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
inode->bi_dir_offset = 0;
*write_inode = true;
}
-
+out:
ret = 0;
fsck_err:
bch2_trans_iter_exit(trans, &dirent_iter);
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index f82cfbf460d0..c74af15b14f2 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -243,6 +243,14 @@ static inline unsigned bkey_inode_mode(struct bkey_s_c k)
}
}
+static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi)
+{
+ /* inode apts are stored with a +1 bias: 0 means "unset, use fs opt" */
+ return bi->bi_casefold
+ ? bi->bi_casefold - 1
+ : c->opts.casefold;
+}
+
/* i_nlink: */
static inline unsigned nlink_bias(umode_t mode)
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
index 117110af1e3f..87e193e8ed25 100644
--- a/fs/bcachefs/inode_format.h
+++ b/fs/bcachefs/inode_format.h
@@ -103,7 +103,8 @@ struct bch_inode_generation {
x(bi_parent_subvol, 32) \
x(bi_nocow, 8) \
x(bi_depth, 32) \
- x(bi_inodes_32bit, 8)
+ x(bi_inodes_32bit, 8) \
+ x(bi_casefold, 8)
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
@@ -117,7 +118,8 @@ struct bch_inode_generation {
x(background_target, 16) \
x(erasure_code, 16) \
x(nocow, 8) \
- x(inodes_32bit, 8)
+ x(inodes_32bit, 8) \
+ x(casefold, 8)
enum inode_opt_id {
#define x(name, ...) \
@@ -137,8 +139,7 @@ enum inode_opt_id {
x(i_sectors_dirty, 6) \
x(unlinked, 7) \
x(backptr_untrusted, 8) \
- x(has_child_snapshot, 9) \
- x(casefolded, 10)
+ x(has_child_snapshot, 9)
/* bits 20+ reserved for packed fields below: */
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 417bb0c7bbfa..def4a26a3b45 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -487,6 +487,8 @@ static void bch2_rbio_retry(struct work_struct *work)
.inum = rbio->read_pos.inode,
};
struct bch_io_failures failed = { .nr = 0 };
+ int orig_error = rbio->ret;
+
struct btree_trans *trans = bch2_trans_get(c);
trace_io_read_retry(&rbio->bio);
@@ -519,7 +521,9 @@ static void bch2_rbio_retry(struct work_struct *work)
if (ret) {
rbio->ret = ret;
rbio->bio.bi_status = BLK_STS_IOERR;
- } else {
+ } else if (orig_error != -BCH_ERR_data_read_retry_csum_err_maybe_userspace &&
+ orig_error != -BCH_ERR_data_read_ptr_stale_race &&
+ !failed.nr) {
struct printbuf buf = PRINTBUF;
lockrestart_do(trans,
@@ -977,7 +981,8 @@ retry_pick:
goto err;
}
- if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) {
+ if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
+ !c->chacha20_key_set) {
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
@@ -1344,14 +1349,16 @@ err:
bch2_trans_iter_exit(trans, &iter);
- if (ret) {
- struct printbuf buf = PRINTBUF;
- lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, &buf, inum,
- bvec_iter.bi_sector << 9));
- prt_printf(&buf, "read error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
+ if (unlikely(ret)) {
+ if (ret != -BCH_ERR_extent_poisoned) {
+ struct printbuf buf = PRINTBUF;
+ lockrestart_do(trans,
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum,
+ bvec_iter.bi_sector << 9));
+ prt_printf(&buf, "data read error: %s", bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
rbio->bio.bi_status = BLK_STS_IOERR;
rbio->ret = ret;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index d8f74b6d0a75..bb45d3634194 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -281,7 +281,24 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
buf->u64s_reserved) << c->block_bits;
- BUG_ON(sectors > buf->sectors);
+ if (unlikely(sectors > buf->sectors)) {
+ struct printbuf err = PRINTBUF;
+ err.atomic++;
+
+ prt_printf(&err, "journal entry overran reserved space: %u > %u\n",
+ sectors, buf->sectors);
+ prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n",
+ le32_to_cpu(buf->data->u64s), buf->u64s_reserved,
+ j->cur_entry_u64s,
+ c->block_bits);
+ prt_printf(&err, "fatal error - emergency read only");
+ bch2_journal_halt_locked(j);
+
+ bch_err(c, "%s", err.buf);
+ printbuf_exit(&err);
+ return;
+ }
+
buf->sectors = sectors;
/*
@@ -1462,8 +1479,6 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
j->last_empty_seq = cur_seq - 1; /* to match j->seq */
spin_lock(&j->lock);
-
- set_bit(JOURNAL_running, &j->flags);
j->last_flush_write = jiffies;
j->reservations.idx = journal_cur_seq(j);
@@ -1474,6 +1489,21 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
return 0;
}
+void bch2_journal_set_replay_done(struct journal *j)
+{
+ /*
+ * journal_space_available must happen before setting JOURNAL_running
+ * JOURNAL_running must happen before JOURNAL_replay_done
+ */
+ spin_lock(&j->lock);
+ bch2_journal_space_available(j);
+
+ set_bit(JOURNAL_need_flush_write, &j->flags);
+ set_bit(JOURNAL_running, &j->flags);
+ set_bit(JOURNAL_replay_done, &j->flags);
+ spin_unlock(&j->lock);
+}
+
/* init/exit: */
void bch2_dev_journal_exit(struct bch_dev *ca)
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 47828771f9c2..641e20c05a14 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -437,12 +437,6 @@ static inline int bch2_journal_error(struct journal *j)
struct bch_dev;
-static inline void bch2_journal_set_replay_done(struct journal *j)
-{
- BUG_ON(!test_bit(JOURNAL_running, &j->flags));
- set_bit(JOURNAL_replay_done, &j->flags);
-}
-
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *);
@@ -459,6 +453,7 @@ void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
int bch2_fs_journal_start(struct journal *, u64);
+void bch2_journal_set_replay_done(struct journal *);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 1b7961f4f609..2a54ac79189b 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1460,7 +1460,7 @@ fsck_err:
static void journal_advance_devs_to_next_bucket(struct journal *j,
struct dev_alloc_list *devs,
- unsigned sectors, u64 seq)
+ unsigned sectors, __le64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 5d1547aa118a..ea670c3c43d8 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -252,7 +252,10 @@ void bch2_journal_space_available(struct journal *j)
bch2_journal_set_watermark(j);
out:
- j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
+ j->cur_entry_sectors = !ret
+ ? round_down(j->space[journal_space_discarded].next_entry,
+ block_sectors(c))
+ : 0;
j->cur_entry_error = ret;
if (!ret)
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 159410c50861..96873372b516 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -356,6 +356,13 @@ static int bch2_copygc_thread(void *arg)
set_freezable();
+ /*
+ * Data move operations can't run until after check_snapshots has
+ * completed, and bch2_snapshot_is_ancestor() is available.
+ */
+ kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots ||
+ kthread_should_stop());
+
bch2_move_stats_init(&move_stats, "copygc");
bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
writepoint_ptr(&c->copygc_write_point),
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
index ea181fef5bc9..d1885cf67a45 100644
--- a/fs/bcachefs/movinggc.h
+++ b/fs/bcachefs/movinggc.h
@@ -5,6 +5,15 @@
unsigned long bch2_copygc_wait_amount(struct bch_fs *);
void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
+static inline void bch2_copygc_wakeup(struct bch_fs *c)
+{
+ rcu_read_lock();
+ struct task_struct *p = rcu_dereference(c->copygc_thread);
+ if (p)
+ wake_up_process(p);
+ rcu_read_unlock();
+}
+
void bch2_copygc_stop(struct bch_fs *);
int bch2_copygc_start(struct bch_fs *);
void bch2_fs_copygc_init(struct bch_fs *);
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
index 0d65ea96f7a2..46f3c8b100a9 100644
--- a/fs/bcachefs/namei.c
+++ b/fs/bcachefs/namei.c
@@ -47,10 +47,6 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
- /* Inherit casefold state from parent. */
- if (S_ISDIR(mode))
- new_inode->bi_flags |= dir_u->bi_flags & BCH_INODE_casefolded;
-
if (!(flags & BCH_CREATE_SNAPSHOT)) {
/* Normal create path - allocate a new inode: */
bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 4d06313076ff..dfb14810124c 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -228,6 +228,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH_SB_ERASURE_CODE, false, \
NULL, "Enable erasure coding (DO NOT USE YET)") \
+ x(casefold, u8, \
+ OPT_FS|OPT_INODE|OPT_FORMAT, \
+ OPT_BOOL(), \
+ BCH_SB_CASEFOLD, false, \
+ NULL, "Dirent lookups are casefolded") \
x(inodes_32bit, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index c63fa53f30d2..4ccdfc1f34aa 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -262,7 +262,7 @@ int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
int ret = bch2_trans_commit_do(c, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
bch2_set_rebalance_needs_scan_trans(trans, inum));
- rebalance_wakeup(c);
+ bch2_rebalance_wakeup(c);
return ret;
}
@@ -581,6 +581,13 @@ static int bch2_rebalance_thread(void *arg)
set_freezable();
+ /*
+ * Data move operations can't run until after check_snapshots has
+ * completed, and bch2_snapshot_is_ancestor() is available.
+ */
+ kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots ||
+ kthread_should_stop());
+
bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
writepoint_ptr(&c->rebalance_write_point),
true);
@@ -664,7 +671,7 @@ void bch2_rebalance_stop(struct bch_fs *c)
c->rebalance.thread = NULL;
if (p) {
- /* for sychronizing with rebalance_wakeup() */
+ /* for sychronizing with bch2_rebalance_wakeup() */
synchronize_rcu();
kthread_stop(p);
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index 62a3859d3823..e5e8eb4a2dd1 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -37,7 +37,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
int bch2_set_fs_needs_rebalance(struct bch_fs *);
-static inline void rebalance_wakeup(struct bch_fs *c)
+static inline void bch2_rebalance_wakeup(struct bch_fs *c)
{
struct task_struct *p;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 79fd18a5a07c..d6c4ef819d40 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -18,6 +18,7 @@
#include "journal_seq_blacklist.h"
#include "logged_ops.h"
#include "move.h"
+#include "movinggc.h"
#include "namei.h"
#include "quota.h"
#include "rebalance.h"
@@ -389,9 +390,9 @@ int bch2_journal_replay(struct bch_fs *c)
* Now, replay any remaining keys in the order in which they appear in
* the journal, unpinning those journal entries as we go:
*/
- sort(keys_sorted.data, keys_sorted.nr,
- sizeof(keys_sorted.data[0]),
- journal_sort_seq_cmp, NULL);
+ sort_nonatomic(keys_sorted.data, keys_sorted.nr,
+ sizeof(keys_sorted.data[0]),
+ journal_sort_seq_cmp, NULL);
darray_for_each(keys_sorted, kp) {
cond_resched();
@@ -1125,14 +1126,17 @@ int bch2_fs_initialize(struct bch_fs *c)
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
- bch2_fs_journal_start(&c->journal, 1);
- set_bit(BCH_FS_accounting_replay_done, &c->flags);
- bch2_journal_set_replay_done(&c->journal);
+ ret = bch2_fs_journal_start(&c->journal, 1);
+ if (ret)
+ goto err;
ret = bch2_fs_read_write_early(c);
if (ret)
goto err;
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
+ bch2_journal_set_replay_done(&c->journal);
+
for_each_member_device(c, ca) {
ret = bch2_dev_usage_init(ca, false);
if (ret) {
@@ -1191,6 +1195,9 @@ int bch2_fs_initialize(struct bch_fs *c)
c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;
+ bch2_copygc_wakeup(c);
+ bch2_rebalance_wakeup(c);
+
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
if (ret)
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index 593ff142530d..22f72bb5b853 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -12,6 +12,7 @@
#include "journal.h"
#include "lru.h"
#include "logged_ops.h"
+#include "movinggc.h"
#include "rebalance.h"
#include "recovery.h"
#include "recovery_passes.h"
@@ -262,49 +263,52 @@ int bch2_run_recovery_passes(struct bch_fs *c)
*/
c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw;
- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) {
- c->next_recovery_pass = c->curr_recovery_pass + 1;
+ spin_lock_irq(&c->recovery_pass_lock);
- spin_lock_irq(&c->recovery_pass_lock);
+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) {
+ unsigned prev_done = c->recovery_pass_done;
unsigned pass = c->curr_recovery_pass;
+ c->next_recovery_pass = pass + 1;
+
if (c->opts.recovery_pass_last &&
- c->curr_recovery_pass > c->opts.recovery_pass_last) {
- spin_unlock_irq(&c->recovery_pass_lock);
+ c->curr_recovery_pass > c->opts.recovery_pass_last)
break;
- }
- if (!should_run_recovery_pass(c, pass)) {
- c->curr_recovery_pass++;
- c->recovery_pass_done = max(c->recovery_pass_done, pass);
+ if (should_run_recovery_pass(c, pass)) {
spin_unlock_irq(&c->recovery_pass_lock);
- continue;
- }
- spin_unlock_irq(&c->recovery_pass_lock);
-
- ret = bch2_run_recovery_pass(c, pass) ?:
- bch2_journal_flush(&c->journal);
-
- if (!ret && !test_bit(BCH_FS_error, &c->flags))
- bch2_clear_recovery_pass_required(c, pass);
-
- spin_lock_irq(&c->recovery_pass_lock);
- if (c->next_recovery_pass < c->curr_recovery_pass) {
- /*
- * bch2_run_explicit_recovery_pass() was called: we
- * can't always catch -BCH_ERR_restart_recovery because
- * it may have been called from another thread (btree
- * node read completion)
- */
- ret = 0;
- c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass);
- } else {
- c->recovery_passes_complete |= BIT_ULL(pass);
- c->recovery_pass_done = max(c->recovery_pass_done, pass);
+ ret = bch2_run_recovery_pass(c, pass) ?:
+ bch2_journal_flush(&c->journal);
+
+ if (!ret && !test_bit(BCH_FS_error, &c->flags))
+ bch2_clear_recovery_pass_required(c, pass);
+ spin_lock_irq(&c->recovery_pass_lock);
+
+ if (c->next_recovery_pass < c->curr_recovery_pass) {
+ /*
+ * bch2_run_explicit_recovery_pass() was called: we
+ * can't always catch -BCH_ERR_restart_recovery because
+ * it may have been called from another thread (btree
+ * node read completion)
+ */
+ ret = 0;
+ c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass);
+ } else {
+ c->recovery_passes_complete |= BIT_ULL(pass);
+ c->recovery_pass_done = max(c->recovery_pass_done, pass);
+ }
}
+
c->curr_recovery_pass = c->next_recovery_pass;
- spin_unlock_irq(&c->recovery_pass_lock);
+
+ if (prev_done <= BCH_RECOVERY_PASS_check_snapshots &&
+ c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots) {
+ bch2_copygc_wakeup(c);
+ bch2_rebalance_wakeup(c);
+ }
}
+ spin_unlock_irq(&c->recovery_pass_lock);
+
return ret;
}
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 5d43e3504386..dc53d25c7cbb 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -290,8 +290,8 @@ enum bch_fsck_flags {
x(btree_node_bkey_bad_u64s, 260, 0) \
x(btree_node_topology_empty_interior_node, 261, 0) \
x(btree_ptr_v2_min_key_bad, 262, 0) \
- x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \
- x(snapshot_node_missing, 264, 0) \
+ x(btree_root_unreadable_and_scan_found_nothing, 263, FSCK_AUTOFIX) \
+ x(snapshot_node_missing, 264, FSCK_AUTOFIX) \
x(dup_backpointer_to_bad_csum_extent, 265, 0) \
x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \
x(sb_clean_entry_overrun, 267, 0) \
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index b7de29aed839..fec569c7deb1 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -396,7 +396,7 @@ u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
u32 subvol = 0, s;
rcu_read_lock();
- while (id) {
+ while (id && bch2_snapshot_exists(c, id)) {
s = snapshot_t(c, id)->subvol;
if (s && (!subvol || s < subvol))
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 09a354a26c3b..0c1a00539bd1 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -33,7 +33,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
struct bch_hash_info {
u8 type;
- struct unicode_map *cf_encoding;
+ struct unicode_map *cf_encoding;
/*
* For crc32 or crc64 string hashes the first key value of
* the siphash_key (k0) is used as the key.
@@ -44,11 +44,10 @@ struct bch_hash_info {
static inline struct bch_hash_info
bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
{
- /* XXX ick */
struct bch_hash_info info = {
.type = INODE_STR_HASH(bi),
#ifdef CONFIG_UNICODE
- .cf_encoding = !!(bi->bi_flags & BCH_INODE_casefolded) ? c->cf_encoding : NULL,
+ .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL,
#endif
.siphash_key = { .k0 = bi->bi_hash_seed }
};
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index e27422b6d9c6..cb5d960aed92 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -73,14 +73,30 @@ int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version v
? 0
: -BCH_ERR_may_not_use_incompat_feature;
+ mutex_lock(&c->sb_lock);
if (!ret) {
- mutex_lock(&c->sb_lock);
SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
+ } else {
+ darray_for_each(c->incompat_versions_requested, i)
+ if (version == *i)
+ goto out;
+
+ darray_push(&c->incompat_versions_requested, version);
+ struct printbuf buf = PRINTBUF;
+ prt_str(&buf, "requested incompat feature ");
+ bch2_version_to_text(&buf, version);
+ prt_str(&buf, " currently not enabled");
+ prt_printf(&buf, "\n set version_upgrade=incompat to enable");
+
+ bch_notice(c, "%s", buf.buf);
+ printbuf_exit(&buf);
}
+out:
+ mutex_unlock(&c->sb_lock);
+
return ret;
}
@@ -1086,7 +1102,8 @@ int bch2_write_super(struct bch_fs *c)
prt_str(&buf, ")");
bch2_fs_fatal_error(c, ": %s", buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_sb_not_downgraded;
+ ret = -BCH_ERR_sb_not_downgraded;
+ goto out;
}
darray_for_each(online_devices, ca) {
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index a58edde43bee..e4ab0595c0ae 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -70,14 +70,10 @@
#include <linux/percpu.h>
#include <linux/random.h>
#include <linux/sysfs.h>
-#include <crypto/hash.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_DESCRIPTION("bcachefs filesystem");
-MODULE_SOFTDEP("pre: chacha20");
-MODULE_SOFTDEP("pre: poly1305");
-MODULE_SOFTDEP("pre: xxhash");
const char * const bch2_fs_flag_strs[] = {
#define x(n) #n,
@@ -422,32 +418,6 @@ bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
return ret;
}
-static int bch2_fs_read_write_late(struct bch_fs *c)
-{
- int ret;
-
- /*
- * Data move operations can't run until after check_snapshots has
- * completed, and bch2_snapshot_is_ancestor() is available.
- *
- * Ideally we'd start copygc/rebalance earlier instead of waiting for
- * all of recovery/fsck to complete:
- */
- ret = bch2_copygc_start(c);
- if (ret) {
- bch_err(c, "error starting copygc thread");
- return ret;
- }
-
- ret = bch2_rebalance_start(c);
- if (ret) {
- bch_err(c, "error starting rebalance thread");
- return ret;
- }
-
- return 0;
-}
-
static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
int ret;
@@ -470,29 +440,28 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
clear_bit(BCH_FS_clean_shutdown, &c->flags);
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) {
+ bch2_dev_allocator_add(c, ca);
+ percpu_ref_reinit(&ca->io_ref[WRITE]);
+ }
+ bch2_recalc_capacity(c);
+
/*
* First journal write must be a flush write: after a clean shutdown we
* don't read the journal, so the first journal write may end up
* overwriting whatever was there previously, and there must always be
* at least one non-flush write in the journal or recovery will fail:
*/
+ spin_lock(&c->journal.lock);
set_bit(JOURNAL_need_flush_write, &c->journal.flags);
set_bit(JOURNAL_running, &c->journal.flags);
-
- __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) {
- bch2_dev_allocator_add(c, ca);
- percpu_ref_reinit(&ca->io_ref[WRITE]);
- }
- bch2_recalc_capacity(c);
+ bch2_journal_space_available(&c->journal);
+ spin_unlock(&c->journal.lock);
ret = bch2_fs_mark_dirty(c);
if (ret)
goto err;
- spin_lock(&c->journal.lock);
- bch2_journal_space_available(&c->journal);
- spin_unlock(&c->journal.lock);
-
ret = bch2_journal_reclaim_start(&c->journal);
if (ret)
goto err;
@@ -508,10 +477,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
atomic_long_inc(&c->writes[i]);
}
#endif
- if (!early) {
- ret = bch2_fs_read_write_late(c);
- if (ret)
- goto err;
+
+ ret = bch2_copygc_start(c);
+ if (ret) {
+ bch_err_msg(c, ret, "error starting copygc thread");
+ goto err;
+ }
+
+ ret = bch2_rebalance_start(c);
+ if (ret) {
+ bch_err_msg(c, ret, "error starting rebalance thread");
+ goto err;
}
bch2_do_discards(c);
@@ -557,6 +533,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
+ bch2_free_fsck_errs(c);
bch2_fs_accounting_exit(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
@@ -593,6 +570,7 @@ static void __bch2_fs_free(struct bch_fs *c)
free_percpu(c->online_reserved);
}
+ darray_exit(&c->incompat_versions_requested);
darray_exit(&c->btree_roots_extra);
free_percpu(c->pcpu);
free_percpu(c->usage);
@@ -1002,12 +980,6 @@ static void print_mount_opts(struct bch_fs *c)
prt_str(&p, "starting version ");
bch2_version_to_text(&p, c->sb.version);
- if (c->opts.read_only) {
- prt_str(&p, " opts=");
- first = false;
- prt_printf(&p, "ro");
- }
-
for (i = 0; i < bch2_opts_nr; i++) {
const struct bch_option *opt = &bch2_opt_table[i];
u64 v = bch2_opt_get_by_id(&c->opts, i);
@@ -1023,10 +995,49 @@ static void print_mount_opts(struct bch_fs *c)
bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
}
+ if (c->sb.version_incompat_allowed != c->sb.version) {
+ prt_printf(&p, "\n allowing incompatible features above ");
+ bch2_version_to_text(&p, c->sb.version_incompat_allowed);
+ }
+
bch_info(c, "%s", p.buf);
printbuf_exit(&p);
}
+static bool bch2_fs_may_start(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i, flags = 0;
+
+ if (c->opts.very_degraded)
+ flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
+
+ if (c->opts.degraded)
+ flags |= BCH_FORCE_IF_DEGRADED;
+
+ if (!c->opts.degraded &&
+ !c->opts.very_degraded) {
+ mutex_lock(&c->sb_lock);
+
+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+ if (!bch2_member_exists(c->disk_sb.sb, i))
+ continue;
+
+ ca = bch2_dev_locked(c, i);
+
+ if (!bch2_dev_is_online(ca) &&
+ (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ ca->mi.state == BCH_MEMBER_STATE_ro)) {
+ mutex_unlock(&c->sb_lock);
+ return false;
+ }
+ }
+ mutex_unlock(&c->sb_lock);
+ }
+
+ return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
+}
+
int bch2_fs_start(struct bch_fs *c)
{
time64_t now = ktime_get_real_seconds();
@@ -1034,6 +1045,9 @@ int bch2_fs_start(struct bch_fs *c)
print_mount_opts(c);
+ if (!bch2_fs_may_start(c))
+ return -BCH_ERR_insufficient_devices_to_start;
+
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
@@ -1086,13 +1100,10 @@ int bch2_fs_start(struct bch_fs *c)
wake_up(&c->ro_ref_wait);
down_write(&c->state_lock);
- if (c->opts.read_only) {
+ if (c->opts.read_only)
bch2_fs_read_only(c);
- } else {
- ret = !test_bit(BCH_FS_rw, &c->flags)
- ? bch2_fs_read_write(c)
- : bch2_fs_read_write_late(c);
- }
+ else if (!test_bit(BCH_FS_rw, &c->flags))
+ ret = bch2_fs_read_write(c);
up_write(&c->state_lock);
err:
@@ -1504,7 +1515,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
printbuf_exit(&name);
- rebalance_wakeup(c);
+ bch2_rebalance_wakeup(c);
return 0;
}
@@ -1563,40 +1574,6 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
}
}
-static bool bch2_fs_may_start(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned i, flags = 0;
-
- if (c->opts.very_degraded)
- flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
-
- if (c->opts.degraded)
- flags |= BCH_FORCE_IF_DEGRADED;
-
- if (!c->opts.degraded &&
- !c->opts.very_degraded) {
- mutex_lock(&c->sb_lock);
-
- for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
- if (!bch2_member_exists(c->disk_sb.sb, i))
- continue;
-
- ca = bch2_dev_locked(c, i);
-
- if (!bch2_dev_is_online(ca) &&
- (ca->mi.state == BCH_MEMBER_STATE_rw ||
- ca->mi.state == BCH_MEMBER_STATE_ro)) {
- mutex_unlock(&c->sb_lock);
- return false;
- }
- }
- mutex_unlock(&c->sb_lock);
- }
-
- return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
-}
-
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
bch2_dev_io_ref_stop(ca, WRITE);
@@ -1650,7 +1627,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
if (new_state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
- rebalance_wakeup(c);
+ bch2_rebalance_wakeup(c);
return ret;
}
@@ -1767,7 +1744,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
up_write(&c->state_lock);
return 0;
err:
- if (ca->mi.state == BCH_MEMBER_STATE_rw &&
+ if (test_bit(BCH_FS_rw, &c->flags) &&
+ ca->mi.state == BCH_MEMBER_STATE_rw &&
!percpu_ref_is_zero(&ca->io_ref[READ]))
__bch2_dev_read_write(c, ca);
up_write(&c->state_lock);
@@ -2231,11 +2209,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
}
up_write(&c->state_lock);
- if (!bch2_fs_may_start(c)) {
- ret = -BCH_ERR_insufficient_devices_to_start;
- goto err_print;
- }
-
if (!c->opts.nostart) {
ret = bch2_fs_start(c);
if (ret)
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index e5f003c29369..82ee333ddd21 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -654,11 +654,10 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
bch2_set_rebalance_needs_scan(c, 0);
if (v && id == Opt_rebalance_enabled)
- rebalance_wakeup(c);
+ bch2_rebalance_wakeup(c);
- if (v && id == Opt_copygc_enabled &&
- c->copygc_thread)
- wake_up_process(c->copygc_thread);
+ if (v && id == Opt_copygc_enabled)
+ bch2_copygc_wakeup(c);
if (id == Opt_discard && !ca) {
mutex_lock(&c->sb_lock);
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index c265b102267a..782a05fe7656 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -342,6 +342,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
*/
static int test_peek_end(struct bch_fs *c, u64 nr)
{
+ delete_test_keys(c);
+
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
@@ -362,6 +364,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
static int test_peek_end_extents(struct bch_fs *c, u64 nr)
{
+ delete_test_keys(c);
+
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 6ba5071ab6dd..3e52c7f8ddd2 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -739,4 +739,42 @@ static inline void memcpy_swab(void *_dst, void *_src, size_t len)
*--dst = *src++;
}
+#define set_flags(_map, _in, _out) \
+do { \
+ unsigned _i; \
+ \
+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
+ if ((_in) & (1 << _i)) \
+ (_out) |= _map[_i]; \
+ else \
+ (_out) &= ~_map[_i]; \
+} while (0)
+
+#define map_flags(_map, _in) \
+({ \
+ unsigned _out = 0; \
+ \
+ set_flags(_map, _in, _out); \
+ _out; \
+})
+
+#define map_flags_rev(_map, _in) \
+({ \
+ unsigned _i, _out = 0; \
+ \
+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
+ if ((_in) & _map[_i]) { \
+ (_out) |= 1 << _i; \
+ (_in) &= ~_map[_i]; \
+ } \
+ (_out); \
+})
+
+#define map_defined(_map) \
+({ \
+ unsigned _in = ~0; \
+ \
+ map_flags_rev(_map, _in); \
+})
+
#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index fa8515598341..73a2dfb854c5 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -3,9 +3,9 @@
config BTRFS_FS
tristate "Btrfs filesystem support"
select BLK_CGROUP_PUNT_BIO
+ select CRC32
select CRYPTO
select CRYPTO_CRC32C
- select LIBCRC32C
select CRYPTO_XXHASH
select CRYPTO_SHA256
select CRYPTO_BLAKE2B
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3dd555db3d32..aa58e0663a5d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3853,7 +3853,6 @@ static int write_dev_supers(struct btrfs_device *device,
atomic_inc(&device->sb_write_errors);
continue;
}
- ASSERT(folio_order(folio) == 0);
offset = offset_in_folio(folio, bytenr);
disk_super = folio_address(folio) + offset;
@@ -3926,7 +3925,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
/* If the folio has been removed, then we know it completed. */
if (IS_ERR(folio))
continue;
- ASSERT(folio_order(folio) == 0);
/* Folio will be unlocked once the write completes. */
folio_wait_locked(folio);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 262a707d8990..71b8a825c447 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2104,15 +2104,20 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
* will always return true.
* So here we need to do extra page alignment for
* filemap_range_has_page().
+ *
+ * And do not decrease page_lockend right now, as it can be 0.
*/
const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
- const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
+ const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE);
while (1) {
truncate_pagecache_range(inode, lockstart, lockend);
lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
+ /* The same page or adjacent pages. */
+ if (page_lockend <= page_lockstart)
+ break;
/*
* We can't have ordered extents in the range, nor dirty/writeback
* pages, because we have locked the inode's VFS lock in exclusive
@@ -2124,7 +2129,7 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
* we do, unlock the range and retry.
*/
if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
- page_lockend))
+ page_lockend - 1))
break;
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a13d81bb56a0..63aeacc54945 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4902,6 +4902,8 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state,
&disk_bytenr, &disk_io_size);
+ if (ret == -EAGAIN)
+ goto out_acct;
if (ret < 0 && ret != -EIOCBQUEUED)
goto out_free;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f948f4f6431c..e17bcb034595 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3803,7 +3803,7 @@ out:
if (ret) {
if (inode)
iput(&inode->vfs_inode);
- inode = ERR_PTR(ret);
+ return ERR_PTR(ret);
}
return &inode->vfs_inode;
}
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 11dbd7be6a3b..c0a0b8b063d0 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -204,7 +204,7 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
btrfs_blocks_per_folio(fs_info, folio); \
\
btrfs_subpage_assert(fs_info, folio, start, len); \
- __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
+ __start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \
__start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \
__start_bit; \
})
@@ -666,7 +666,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
btrfs_blocks_per_folio(fs_info, folio); \
const struct btrfs_subpage *subpage = folio_get_private(folio); \
\
- ASSERT(blocks_per_folio < BITS_PER_LONG); \
+ ASSERT(blocks_per_folio <= BITS_PER_LONG); \
*dst = bitmap_read(subpage->bitmaps, \
blocks_per_folio * btrfs_bitmap_nr_##name, \
blocks_per_folio); \
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 40709e2a44fc..7121d8c7a318 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1139,8 +1139,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
subvol_name = btrfs_get_subvol_name_from_objectid(info,
btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
if (!IS_ERR(subvol_name)) {
- seq_puts(seq, ",subvol=");
- seq_escape(seq, subvol_name, " \t\n\\");
+ seq_show_option(seq, "subvol", subvol_name);
kfree(subvol_name);
}
return 0;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 43979891f7c8..2b66a6130269 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -2235,7 +2235,7 @@ int btrfs_verify_level_key(struct extent_buffer *eb,
btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
eb->start, check->level, found_level);
- return -EIO;
+ return -EUCLEAN;
}
if (!check->has_first_key)
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index fb8b8b29c169..4a3e02b49f29 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1277,7 +1277,7 @@ struct zone_info {
static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
struct zone_info *info, unsigned long *active,
- struct btrfs_chunk_map *map)
+ struct btrfs_chunk_map *map, bool new)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *device;
@@ -1307,6 +1307,8 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
return 0;
}
+ ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical));
+
/* This zone will be used for allocation, so mark this zone non-empty. */
btrfs_dev_clear_zone_empty(device, info->physical);
@@ -1319,6 +1321,18 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
* to determine the allocation offset within the zone.
*/
WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
+
+ if (new) {
+ sector_t capacity;
+
+ capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT);
+ up_read(&dev_replace->rwsem);
+ info->alloc_offset = 0;
+ info->capacity = capacity << SECTOR_SHIFT;
+
+ return 0;
+ }
+
nofs_flag = memalloc_nofs_save();
ret = btrfs_get_dev_zone(device, info->physical, &zone);
memalloc_nofs_restore(nofs_flag);
@@ -1588,7 +1602,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
for (i = 0; i < map->num_stripes; i++) {
- ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
+ ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new);
if (ret)
goto out;
@@ -1659,7 +1673,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
* stripe.
*/
cache->alloc_offset = cache->zone_capacity;
- ret = 0;
}
out:
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index cd5f38d6fbaa..3541efa765c7 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -225,7 +225,7 @@ void zstd_cleanup_workspace_manager(void)
}
spin_unlock_bh(&wsm.lock);
- del_timer_sync(&wsm.timer);
+ timer_delete_sync(&wsm.timer);
}
/*
diff --git a/fs/buffer.c b/fs/buffer.c
index c7abb4a029dc..7be23ff20b27 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -176,18 +176,8 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
}
EXPORT_SYMBOL(end_buffer_write_sync);
-/*
- * Various filesystems appear to want __find_get_block to be non-blocking.
- * But it's the page lock which protects the buffers. To get around this,
- * we get exclusion from try_to_free_buffers with the blockdev mapping's
- * i_private_lock.
- *
- * Hack idea: for the blockdev mapping, i_private_lock contention
- * may be quite high. This code could TryLock the page, and if that
- * succeeds, there is no need to take i_private_lock.
- */
static struct buffer_head *
-__find_get_block_slow(struct block_device *bdev, sector_t block)
+__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)
{
struct address_space *bd_mapping = bdev->bd_mapping;
const int blkbits = bd_mapping->host->i_blkbits;
@@ -204,10 +194,28 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
if (IS_ERR(folio))
goto out;
- spin_lock(&bd_mapping->i_private_lock);
+ /*
+ * Folio lock protects the buffers. Callers that cannot block
+ * will fallback to serializing vs try_to_free_buffers() via
+ * the i_private_lock.
+ */
+ if (atomic)
+ spin_lock(&bd_mapping->i_private_lock);
+ else
+ folio_lock(folio);
+
head = folio_buffers(folio);
if (!head)
goto out_unlock;
+ /*
+ * Upon a noref migration, the folio lock serializes here;
+ * otherwise bail.
+ */
+ if (test_bit_acquire(BH_Migrate, &head->b_state)) {
+ WARN_ON(!atomic);
+ goto out_unlock;
+ }
+
bh = head;
do {
if (!buffer_mapped(bh))
@@ -236,7 +244,10 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
1 << blkbits);
}
out_unlock:
- spin_unlock(&bd_mapping->i_private_lock);
+ if (atomic)
+ spin_unlock(&bd_mapping->i_private_lock);
+ else
+ folio_unlock(folio);
folio_put(folio);
out:
return ret;
@@ -656,7 +667,9 @@ EXPORT_SYMBOL(generic_buffers_fsync);
void write_boundary_block(struct block_device *bdev,
sector_t bblock, unsigned blocksize)
{
- struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
+ struct buffer_head *bh;
+
+ bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
if (bh) {
if (buffer_dirty(bh))
write_dirty_buffer(bh, 0);
@@ -1386,16 +1399,18 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
/*
* Perform a pagecache lookup for the matching buffer. If it's there, refresh
* it in the LRU and mark it as accessed. If it is not present then return
- * NULL
+ * NULL. Atomic context callers may also return NULL if the buffer is being
+ * migrated; similarly the page is not marked accessed either.
*/
-struct buffer_head *
-__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+static struct buffer_head *
+find_get_block_common(struct block_device *bdev, sector_t block,
+ unsigned size, bool atomic)
{
struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
if (bh == NULL) {
/* __find_get_block_slow will mark the page accessed */
- bh = __find_get_block_slow(bdev, block);
+ bh = __find_get_block_slow(bdev, block, atomic);
if (bh)
bh_lru_install(bh);
} else
@@ -1403,8 +1418,23 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
return bh;
}
+
+struct buffer_head *
+__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+{
+ return find_get_block_common(bdev, block, size, true);
+}
EXPORT_SYMBOL(__find_get_block);
+/* same as __find_get_block() but allows sleeping contexts */
+struct buffer_head *
+__find_get_block_nonatomic(struct block_device *bdev, sector_t block,
+ unsigned size)
+{
+ return find_get_block_common(bdev, block, size, false);
+}
+EXPORT_SYMBOL(__find_get_block_nonatomic);
+
/**
* bdev_getblk - Get a buffer_head in a block device's buffer cache.
* @bdev: The block device.
@@ -1422,7 +1452,12 @@ EXPORT_SYMBOL(__find_get_block);
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
- struct buffer_head *bh = __find_get_block(bdev, block, size);
+ struct buffer_head *bh;
+
+ if (gfpflags_allow_blocking(gfp))
+ bh = __find_get_block_nonatomic(bdev, block, size);
+ else
+ bh = __find_get_block(bdev, block, size);
might_alloc(gfp);
if (bh)
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
index bf935e25bdbe..b48525680e73 100644
--- a/fs/cachefiles/key.c
+++ b/fs/cachefiles/key.c
@@ -8,7 +8,7 @@
#include <linux/slab.h>
#include "internal.h"
-static const char cachefiles_charmap[64] =
+static const char cachefiles_charmap[64] __nonstring =
"0123456789" /* 0 - 9 */
"abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 7249d70e1a43..3e7def3d31c1 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -3,7 +3,7 @@ config CEPH_FS
tristate "Ceph distributed file system"
depends on INET
select CEPH_LIB
- select LIBCRC32C
+ select CRC32
select CRYPTO_AES
select CRYPTO
select NETFS_SUPPORT
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 6ac2bd555e86..06cd2963e41e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2367,7 +2367,7 @@ static int fill_fscrypt_truncate(struct inode *inode,
/* Try to writeback the dirty pagecaches */
if (issued & (CEPH_CAP_FILE_BUFFER)) {
- loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1;
+ loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1;
ret = filemap_write_and_wait_range(inode->i_mapping,
orig_pos, lend);
diff --git a/fs/dax.c b/fs/dax.c
index af5045b0f476..676303419e9e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -396,6 +396,7 @@ static inline unsigned long dax_folio_put(struct folio *folio)
order = folio_order(folio);
if (!order)
return 0;
+ folio_reset_order(folio);
for (i = 0; i < (1UL << order); i++) {
struct dev_pagemap *pgmap = page_pgmap(&folio->page);
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 42e4d6eeb29f..9c20d78e41f6 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -89,12 +89,12 @@ enum {
};
static const struct fs_parameter_spec devpts_param_specs[] = {
- fsparam_u32 ("gid", Opt_gid),
+ fsparam_gid ("gid", Opt_gid),
fsparam_s32 ("max", Opt_max),
fsparam_u32oct ("mode", Opt_mode),
fsparam_flag ("newinstance", Opt_newinstance),
fsparam_u32oct ("ptmxmode", Opt_ptmxmode),
- fsparam_u32 ("uid", Opt_uid),
+ fsparam_uid ("uid", Opt_uid),
{}
};
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 331e49cd1b8d..8f68ec49ad89 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -3,8 +3,8 @@
config EROFS_FS
tristate "EROFS filesystem support"
depends on BLOCK
+ select CRC32
select FS_IOMAP
- select LIBCRC32C
help
EROFS (Enhanced Read-Only File System) is a lightweight read-only
file system with modern designs (e.g. no buffer heads, inline
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 9581e9bf8192..767fb4acdc93 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -56,7 +56,7 @@ struct erofs_super_block {
union {
__le16 rootnid_2b; /* nid of root directory */
__le16 blocks_hi; /* (48BIT on) blocks count MSB */
- } rb;
+ } __packed rb;
__le64 inos; /* total valid ino # (== f_files - f_favail) */
__le64 epoch; /* base seconds used for compact inodes */
__le32 fixed_nsec; /* fixed nanoseconds for compact inodes */
@@ -148,7 +148,7 @@ union erofs_inode_i_nb {
__le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */
__le16 blocks_hi; /* total blocks count MSB */
__le16 startblk_hi; /* starting block number MSB */
-};
+} __packed;
/* 32-byte reduced form of an ondisk inode */
struct erofs_inode_compact {
@@ -369,9 +369,9 @@ struct z_erofs_map_header {
* bit 7 : pack the whole file into packed inode
*/
__u8 h_clusterbits;
- };
+ } __packed;
__le16 h_extents_hi; /* extent count MSB */
- };
+ } __packed;
};
enum {
diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c
index bec4b56b3826..4fa0a0121288 100644
--- a/fs/erofs/fileio.c
+++ b/fs/erofs/fileio.c
@@ -32,6 +32,8 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
ret = 0;
}
if (rq->bio.bi_end_io) {
+ if (ret < 0 && !rq->bio.bi_status)
+ rq->bio.bi_status = errno_to_blk_status(ret);
rq->bio.bi_end_io(&rq->bio);
} else {
bio_for_each_folio_all(fi, &rq->bio) {
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 0671184d9cf1..5c061aaeeb45 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -725,7 +725,6 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
lockref_init(&pcl->lockref); /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat;
pcl->pclustersize = map->m_plen;
- pcl->pageofs_in = pageofs_in;
pcl->length = 0;
pcl->partial = true;
pcl->next = fe->head;
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 8de50df05dfe..14ea47f954f5 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -559,7 +559,8 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
pos += sizeof(__le64);
lstart = 0;
} else {
- lstart = map->m_la >> vi->z_lclusterbits;
+ lstart = round_down(map->m_la, 1 << vi->z_lclusterbits);
+ pos += (lstart >> vi->z_lclusterbits) * recsz;
pa = EROFS_NULL_ADDR;
}
@@ -614,7 +615,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
map->m_flags |= EROFS_MAP_MAPPED | EROFS_MAP_FRAGMENT;
vi->z_fragmentoff = map->m_plen;
- if (recsz >= offsetof(struct z_erofs_extent, pstart_lo))
+ if (recsz > offsetof(struct z_erofs_extent, pstart_lo))
vi->z_fragmentoff |= map->m_pa << 32;
} else if (map->m_plen) {
map->m_flags |= EROFS_MAP_MAPPED |
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 100376863a44..4bc264b854c4 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1996,6 +1996,14 @@ static int ep_try_send_events(struct eventpoll *ep,
return res;
}
+static int ep_schedule_timeout(ktime_t *to)
+{
+ if (to)
+ return ktime_after(*to, ktime_get());
+ else
+ return 1;
+}
+
/**
* ep_poll - Retrieves ready events, and delivers them to the caller-supplied
* event buffer.
@@ -2103,7 +2111,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
write_unlock_irq(&ep->lock);
- if (!eavail)
+ if (!eavail && ep_schedule_timeout(to))
timed_out = !schedule_hrtimeout_range(to, slack,
HRTIMER_MODE_ABS);
__set_current_state(TASK_RUNNING);
diff --git a/fs/exec.c b/fs/exec.c
index 5d1c0d2dc403..8e4ea5f1e64c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1864,9 +1864,9 @@ static int bprm_execve(struct linux_binprm *bprm)
goto out;
sched_mm_cid_after_execve(current);
+ rseq_execve(current);
/* execve succeeded */
current->in_execve = 0;
- rseq_execve(current);
user_events_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
@@ -1883,6 +1883,7 @@ out:
force_fatal_sig(SIGSEGV);
sched_mm_cid_after_execve(current);
+ rseq_set_notify_resume(current);
current->in_execve = 0;
return retval;
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 87ee3a17bd29..e8c5525afc67 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -351,10 +351,9 @@ int ext4_check_blockref(const char *function, unsigned int line,
{
__le32 *bref = p;
unsigned int blk;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ if (journal && inode == journal->j_inode)
return 0;
while (bref < p+max) {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 38bc8d74f4cc..e7ecc7c8a729 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -691,7 +691,8 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
if (!bh || !buffer_uptodate(bh))
/*
* If the block is not in the buffer cache, then it
- * must have been written out.
+ * must have been written out, or, most unlikely, is
+ * being migrated - false failure should be OK here.
*/
goto out;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1dc09ed5d403..94c7d2d828a6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -386,10 +386,11 @@ static int __check_block_validity(struct inode *inode, const char *func,
unsigned int line,
struct ext4_map_blocks *map)
{
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+
+ if (journal && inode == journal->j_inode)
return 0;
+
if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
"lblock %lu mapped to illegal pblock %llu "
@@ -4724,22 +4725,43 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
inode_set_iversion_queried(inode, val);
}
-static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
-
+static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
+ const char *function, unsigned int line)
{
+ const char *err_str;
+
if (flags & EXT4_IGET_EA_INODE) {
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return "missing EA_INODE flag";
+ if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ err_str = "missing EA_INODE flag";
+ goto error;
+ }
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
- EXT4_I(inode)->i_file_acl)
- return "ea_inode with extended attributes";
+ EXT4_I(inode)->i_file_acl) {
+ err_str = "ea_inode with extended attributes";
+ goto error;
+ }
} else {
- if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return "unexpected EA_INODE flag";
+ if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ /*
+ * open_by_handle_at() could provide an old inode number
+ * that has since been reused for an ea_inode; this does
+ * not indicate filesystem corruption
+ */
+ if (flags & EXT4_IGET_HANDLE)
+ return -ESTALE;
+ err_str = "unexpected EA_INODE flag";
+ goto error;
+ }
+ }
+ if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) {
+ err_str = "unexpected bad inode w/o EXT4_IGET_BAD";
+ goto error;
}
- if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
- return "unexpected bad inode w/o EXT4_IGET_BAD";
- return NULL;
+ return 0;
+
+error:
+ ext4_error_inode(inode, function, line, 0, err_str);
+ return -EFSCORRUPTED;
}
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -4751,7 +4773,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
struct ext4_inode_info *ei;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct inode *inode;
- const char *err_str;
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
loff_t size;
@@ -4780,10 +4801,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (!inode)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW)) {
- if ((err_str = check_igot_inode(inode, flags)) != NULL) {
- ext4_error_inode(inode, function, line, 0, err_str);
+ ret = check_igot_inode(inode, flags, function, line);
+ if (ret) {
iput(inode);
- return ERR_PTR(-EFSCORRUPTED);
+ return ERR_PTR(ret);
}
return inode;
}
@@ -5065,13 +5086,21 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
- if ((err_str = check_igot_inode(inode, flags)) != NULL) {
- ext4_error_inode(inode, function, line, 0, err_str);
- ret = -EFSCORRUPTED;
- goto bad_inode;
+ ret = check_igot_inode(inode, flags, function, line);
+ /*
+ * -ESTALE here means there is nothing inherently wrong with the inode,
+ * it's just not an inode we can return for an fhandle lookup.
+ */
+ if (ret == -ESTALE) {
+ brelse(iloc.bh);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ERR_PTR(-ESTALE);
}
-
+ if (ret)
+ goto bad_inode;
brelse(iloc.bh);
+
unlock_new_inode(inode);
return inode;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0d523e9fb3d5..1e98c5be4e0a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3037,10 +3037,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
unsigned char blocksize_bits = min_t(unsigned char,
sb->s_blocksize_bits,
EXT4_MAX_BLOCK_LOG_SIZE);
- struct sg {
- struct ext4_group_info info;
- ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
- } sg;
+ DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters,
+ EXT4_MAX_BLOCK_LOG_SIZE + 2);
group--;
if (group == 0)
@@ -3048,7 +3046,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
" 2^0 2^1 2^2 2^3 2^4 2^5 2^6 "
" 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n");
- i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+ i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) +
sizeof(struct ext4_group_info);
grinfo = ext4_get_group_info(sb, group);
@@ -3068,14 +3066,14 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
* We care only about free space counters in the group info and
* these are safe to access even after the buddy has been unloaded
*/
- memcpy(&sg, grinfo, i);
- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
- sg.info.bb_fragments, sg.info.bb_first_free);
+ memcpy(sg, grinfo, i);
+ seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free,
+ sg->bb_fragments, sg->bb_first_free);
for (i = 0; i <= 13; i++)
seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
- sg.info.bb_counters[i] : 0);
+ sg->bb_counters[i] : 0);
seq_puts(seq, " ]");
- if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info))
+ if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg))
seq_puts(seq, " Block bitmap corrupted!");
seq_putc(seq, '\n');
return 0;
@@ -6644,7 +6642,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
for (i = 0; i < count; i++) {
cond_resched();
if (is_metadata)
- bh = sb_find_get_block(inode->i_sb, block + i);
+ bh = sb_find_get_block_nonatomic(inode->i_sb,
+ block + i);
ext4_forget(handle, is_metadata, inode, bh, block + i);
}
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index cb5cb33b1d91..e9712e64ec8f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1971,7 +1971,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
* split it in half by count; each resulting block will have at least
* half the space free.
*/
- if (i > 0)
+ if (i >= 0)
split = count - move;
else
split = count/2;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8122d4ffb3b5..181934499624 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5680,7 +5680,7 @@ failed_mount3:
/* flush s_sb_upd_work before sbi destroy */
flush_work(&sbi->s_sb_upd_work);
ext4_stop_mmpd(sbi);
- del_timer_sync(&sbi->s_err_report);
+ timer_delete_sync(&sbi->s_err_report);
ext4_group_desc_free(sbi);
failed_mount:
#if IS_ENABLED(CONFIG_UNICODE)
diff --git a/fs/file.c b/fs/file.c
index dc3f7e120e3e..3a3146664cf3 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -26,7 +26,7 @@
#include "internal.h"
-bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
+static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
{
/*
* If the reference count was already in the dead zone, then this
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 2c7b24cb67ad..53c2626e90e7 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1669,6 +1669,9 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
unsigned int virtqueue_size;
int err = -EIO;
+ if (!fsc->source)
+ return invalf(fsc, "No source specified");
+
/* This gets a reference on virtio_fs object. This ptr gets installed
* in fc->iq->priv. Once fuse_conn is going away, it calls ->put()
* to drop the reference to this object.
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index be7f87a8e11a..7bd231d16d4a 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -4,7 +4,6 @@ config GFS2_FS
select BUFFER_HEAD
select FS_POSIX_ACL
select CRC32
- select LIBCRC32C
select QUOTACTL
select FS_IOMAP
help
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 6add6ebfef89..cb823a8a6ba9 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)
else
key_len = tree->max_key_len + 1;
+ if (key_len > sizeof(hfs_btree_key) || key_len < 1) {
+ memset(key, 0, sizeof(hfs_btree_key));
+ pr_err("hfs: Invalid key length: %d\n", key_len);
+ return;
+ }
+
hfs_bnode_read(node, key, off, key_len);
}
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 87974d5e6791..079ea80534f7 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)
else
key_len = tree->max_key_len + 2;
+ if (key_len > sizeof(hfsplus_btree_key) || key_len < 1) {
+ memset(key, 0, sizeof(hfsplus_btree_key));
+ pr_err("hfsplus: Invalid key length: %d\n", key_len);
+ return;
+ }
+
hfs_bnode_read(node, key, off, key_len);
}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 31553372b33a..5b08bd417b28 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -259,7 +259,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
}
/* truncate len if we find any trailing uptodate block(s) */
- for ( ; i <= last; i++) {
+ while (++i <= last) {
if (ifs_block_is_uptodate(ifs, i)) {
plen -= (last - i + 1) * block_size;
last = i - 1;
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 35768a63fb1d..421d247fae52 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -180,7 +180,7 @@ static struct dentry *isofs_fh_to_parent(struct super_block *sb,
return NULL;
return isofs_export_iget(sb,
- fh_len > 2 ? ifid->parent_block : 0,
+ fh_len > 3 ? ifid->parent_block : 0,
ifid->parent_offset,
fh_len > 4 ? ifid->parent_generation : 0);
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index a5ccba25ff47..743a1d7633cd 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -197,7 +197,7 @@ loop:
if (journal->j_commit_sequence != journal->j_commit_request) {
jbd2_debug(1, "OK, requests differ\n");
write_unlock(&journal->j_state_lock);
- del_timer_sync(&journal->j_commit_timer);
+ timer_delete_sync(&journal->j_commit_timer);
jbd2_journal_commit_transaction(journal);
write_lock(&journal->j_state_lock);
goto loop;
@@ -246,7 +246,7 @@ loop:
goto loop;
end_loop:
- del_timer_sync(&journal->j_commit_timer);
+ timer_delete_sync(&journal->j_commit_timer);
journal->j_task = NULL;
wake_up(&journal->j_wait_done_commit);
jbd2_debug(1, "Journal thread exiting.\n");
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 0cf0fddbee81..1467f6790747 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -345,7 +345,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
bh = bh_in;
if (!bh) {
- bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
+ bh = __find_get_block_nonatomic(bdev, blocknr,
+ journal->j_blocksize);
if (bh)
BUFFER_TRACE(bh, "found on hash");
}
@@ -355,7 +356,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
/* If there is a different buffer_head lying around in
* memory anywhere... */
- bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
+ bh2 = __find_get_block_nonatomic(bdev, blocknr,
+ journal->j_blocksize);
if (bh2) {
/* ... and it has RevokeValid status... */
if (bh2 != bh && buffer_revokevalid(bh2))
@@ -464,7 +466,8 @@ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
* state machine will get very upset later on. */
if (need_cancel) {
struct buffer_head *bh2;
- bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
+ bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr,
+ bh->b_size);
if (bh2) {
if (bh2 != bh)
clear_buffer_revoked(bh2);
@@ -492,9 +495,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal)
struct jbd2_revoke_record_s *record;
struct buffer_head *bh;
record = (struct jbd2_revoke_record_s *)list_entry;
- bh = __find_get_block(journal->j_fs_dev,
- record->blocknr,
- journal->j_blocksize);
+ bh = __find_get_block_nonatomic(journal->j_fs_dev,
+ record->blocknr,
+ journal->j_blocksize);
if (bh) {
clear_buffer_revoked(bh);
__brelse(bh);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4061e0ba7010..bb815a002984 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -584,7 +584,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
size_t retlen;
/* Nothing to do if not write-buffering the flash. In particular, we shouldn't
- del_timer() the timer we never initialised. */
+ call timer_delete() on the timer we never initialised. */
if (!jffs2_is_writebuffered(c))
return 0;
diff --git a/fs/namei.c b/fs/namei.c
index 360a86ca1f02..84a0e0b0111c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -125,9 +125,9 @@
#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
-static inline void initname(struct filename *name)
+static inline void initname(struct filename *name, const char __user *uptr)
{
- name->uptr = NULL;
+ name->uptr = uptr;
name->aname = NULL;
atomic_set(&name->refcnt, 1);
}
@@ -210,7 +210,7 @@ getname_flags(const char __user *filename, int flags)
return ERR_PTR(-ENAMETOOLONG);
}
}
- initname(result);
+ initname(result, filename);
audit_getname(result);
return result;
}
@@ -268,7 +268,7 @@ struct filename *getname_kernel(const char * filename)
return ERR_PTR(-ENAMETOOLONG);
}
memcpy((char *)result->name, filename, len);
- initname(result);
+ initname(result, NULL);
audit_getname(result);
return result;
}
@@ -1665,27 +1665,20 @@ static struct dentry *lookup_dcache(const struct qstr *name,
return dentry;
}
-/*
- * Parent directory has inode locked exclusive. This is one
- * and only case when ->lookup() gets called on non in-lookup
- * dentries - as the matter of fact, this only gets called
- * when directory is guaranteed to have no in-lookup children
- * at all.
- * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed.
- * Will return -EEXIST if name is found and LOOKUP_EXCL was passed.
- */
-struct dentry *lookup_one_qstr_excl(const struct qstr *name,
- struct dentry *base,
- unsigned int flags)
+static struct dentry *lookup_one_qstr_excl_raw(const struct qstr *name,
+ struct dentry *base,
+ unsigned int flags)
{
- struct dentry *dentry = lookup_dcache(name, base, flags);
+ struct dentry *dentry;
struct dentry *old;
- struct inode *dir = base->d_inode;
+ struct inode *dir;
+ dentry = lookup_dcache(name, base, flags);
if (dentry)
- goto found;
+ return dentry;
/* Don't create child dentry for a dead directory. */
+ dir = base->d_inode;
if (unlikely(IS_DEADDIR(dir)))
return ERR_PTR(-ENOENT);
@@ -1698,7 +1691,24 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name,
dput(dentry);
dentry = old;
}
-found:
+ return dentry;
+}
+
+/*
+ * Parent directory has inode locked exclusive. This is one
+ * and only case when ->lookup() gets called on non in-lookup
+ * dentries - as the matter of fact, this only gets called
+ * when directory is guaranteed to have no in-lookup children
+ * at all.
+ * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed.
+ * Will return -EEXIST if name is found and LOOKUP_EXCL was passed.
+ */
+struct dentry *lookup_one_qstr_excl(const struct qstr *name,
+ struct dentry *base, unsigned int flags)
+{
+ struct dentry *dentry;
+
+ dentry = lookup_one_qstr_excl_raw(name, base, flags);
if (IS_ERR(dentry))
return dentry;
if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) {
@@ -2742,23 +2752,48 @@ static int filename_parentat(int dfd, struct filename *name,
/* does lookup, returns the object with parent locked */
static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path)
{
+ struct path parent_path __free(path_put) = {};
struct dentry *d;
struct qstr last;
int type, error;
- error = filename_parentat(dfd, name, 0, path, &last, &type);
+ error = filename_parentat(dfd, name, 0, &parent_path, &last, &type);
if (error)
return ERR_PTR(error);
- if (unlikely(type != LAST_NORM)) {
- path_put(path);
+ if (unlikely(type != LAST_NORM))
return ERR_PTR(-EINVAL);
+ inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
+ d = lookup_one_qstr_excl(&last, parent_path.dentry, 0);
+ if (IS_ERR(d)) {
+ inode_unlock(parent_path.dentry->d_inode);
+ return d;
}
- inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
- d = lookup_one_qstr_excl(&last, path->dentry, 0);
+ path->dentry = no_free_ptr(parent_path.dentry);
+ path->mnt = no_free_ptr(parent_path.mnt);
+ return d;
+}
+
+struct dentry *kern_path_locked_negative(const char *name, struct path *path)
+{
+ struct path parent_path __free(path_put) = {};
+ struct filename *filename __free(putname) = getname_kernel(name);
+ struct dentry *d;
+ struct qstr last;
+ int type, error;
+
+ error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, &type);
+ if (error)
+ return ERR_PTR(error);
+ if (unlikely(type != LAST_NORM))
+ return ERR_PTR(-EINVAL);
+ inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
+ d = lookup_one_qstr_excl_raw(&last, parent_path.dentry, 0);
if (IS_ERR(d)) {
- inode_unlock(path->dentry->d_inode);
- path_put(path);
+ inode_unlock(parent_path.dentry->d_inode);
+ return d;
}
+ path->dentry = no_free_ptr(parent_path.dentry);
+ path->mnt = no_free_ptr(parent_path.mnt);
return d;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 14935a0500a2..98a5cd756e9a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1830,6 +1830,8 @@ static inline void namespace_lock(void)
down_write(&namespace_sem);
}
+DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock())
+
enum umount_tree_flags {
UMOUNT_SYNC = 1,
UMOUNT_PROPAGATE = 2,
@@ -2383,7 +2385,7 @@ void dissolve_on_fput(struct vfsmount *mnt)
return;
}
- scoped_guard(rwsem_write, &namespace_sem) {
+ scoped_guard(namespace_lock, &namespace_sem) {
ns = m->mnt_ns;
if (!must_dissolve(ns))
return;
@@ -2824,56 +2826,62 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
struct vfsmount *mnt = path->mnt;
struct dentry *dentry;
struct mountpoint *mp = ERR_PTR(-ENOENT);
+ struct path under = {};
for (;;) {
- struct mount *m;
+ struct mount *m = real_mount(mnt);
if (beneath) {
- m = real_mount(mnt);
+ path_put(&under);
read_seqlock_excl(&mount_lock);
- dentry = dget(m->mnt_mountpoint);
+ under.mnt = mntget(&m->mnt_parent->mnt);
+ under.dentry = dget(m->mnt_mountpoint);
read_sequnlock_excl(&mount_lock);
+ dentry = under.dentry;
} else {
dentry = path->dentry;
}
inode_lock(dentry->d_inode);
- if (unlikely(cant_mount(dentry))) {
- inode_unlock(dentry->d_inode);
- goto out;
- }
-
namespace_lock();
- if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
+ if (unlikely(cant_mount(dentry) || !is_mounted(mnt)))
+ break; // not to be mounted on
+
+ if (beneath && unlikely(m->mnt_mountpoint != dentry ||
+ &m->mnt_parent->mnt != under.mnt)) {
namespace_unlock();
inode_unlock(dentry->d_inode);
- goto out;
+ continue; // got moved
}
mnt = lookup_mnt(path);
- if (likely(!mnt))
+ if (unlikely(mnt)) {
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
+ path_put(path);
+ path->mnt = mnt;
+ path->dentry = dget(mnt->mnt_root);
+ continue; // got overmounted
+ }
+ mp = get_mountpoint(dentry);
+ if (IS_ERR(mp))
break;
-
- namespace_unlock();
- inode_unlock(dentry->d_inode);
- if (beneath)
- dput(dentry);
- path_put(path);
- path->mnt = mnt;
- path->dentry = dget(mnt->mnt_root);
- }
-
- mp = get_mountpoint(dentry);
- if (IS_ERR(mp)) {
- namespace_unlock();
- inode_unlock(dentry->d_inode);
+ if (beneath) {
+ /*
+ * @under duplicates the references that will stay
+ * at least until namespace_unlock(), so the path_put()
+ * below is safe (and OK to do under namespace_lock -
+ * we are not dropping the final references here).
+ */
+ path_put(&under);
+ }
+ return mp;
}
-
-out:
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
if (beneath)
- dput(dentry);
-
+ path_put(&under);
return mp;
}
@@ -2884,14 +2892,11 @@ static inline struct mountpoint *lock_mount(struct path *path)
static void unlock_mount(struct mountpoint *where)
{
- struct dentry *dentry = where->m_dentry;
-
+ inode_unlock(where->m_dentry->d_inode);
read_seqlock_excl(&mount_lock);
put_mountpoint(where);
read_sequnlock_excl(&mount_lock);
-
namespace_unlock();
- inode_unlock(dentry->d_inode);
}
static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
@@ -5189,8 +5194,8 @@ static void finish_mount_kattr(struct mount_kattr *kattr)
mnt_idmap_put(kattr->mnt_idmap);
}
-static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize,
- struct mount_kattr *kattr)
+static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize,
+ struct mount_kattr *kattr)
{
int ret;
struct mount_attr attr;
@@ -5213,9 +5218,13 @@ static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize,
if (attr.attr_set == 0 &&
attr.attr_clr == 0 &&
attr.propagation == 0)
- return 0;
+ return 0; /* Tell caller to not bother. */
- return build_mount_kattr(&attr, usize, kattr);
+ ret = build_mount_kattr(&attr, usize, kattr);
+ if (ret < 0)
+ return ret;
+
+ return 1;
}
SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
@@ -5247,8 +5256,8 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
if (flags & AT_RECURSIVE)
kattr.kflags |= MOUNT_KATTR_RECURSE;
- err = copy_mount_setattr(uattr, usize, &kattr);
- if (err)
+ err = wants_mount_setattr(uattr, usize, &kattr);
+ if (err <= 0)
return err;
err = user_path_at(dfd, path, kattr.lookup_flags, &target);
@@ -5282,15 +5291,17 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
if (flags & AT_RECURSIVE)
kattr.kflags |= MOUNT_KATTR_RECURSE;
- ret = copy_mount_setattr(uattr, usize, &kattr);
- if (ret)
+ ret = wants_mount_setattr(uattr, usize, &kattr);
+ if (ret < 0)
return ret;
- ret = do_mount_setattr(&file->f_path, &kattr);
- if (ret)
- return ret;
+ if (ret) {
+ ret = do_mount_setattr(&file->f_path, &kattr);
+ if (ret)
+ return ret;
- finish_mount_kattr(&kattr);
+ finish_mount_kattr(&kattr);
+ }
}
fd = get_unused_fd_flags(flags & O_CLOEXEC);
diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c
index 9397ed39b0b4..8f70f8da064b 100644
--- a/fs/netfs/fscache_cache.c
+++ b/fs/netfs/fscache_cache.c
@@ -372,7 +372,7 @@ void fscache_withdraw_cache(struct fscache_cache *cache)
EXPORT_SYMBOL(fscache_withdraw_cache);
#ifdef CONFIG_PROC_FS
-static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] = "-PAEW";
+static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] __nonstring = "-PAEW";
/*
* Generate a list of caches in /proc/fs/fscache/caches
diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c
index d4d4b3a8b106..3d56fc73435f 100644
--- a/fs/netfs/fscache_cookie.c
+++ b/fs/netfs/fscache_cookie.c
@@ -29,7 +29,7 @@ static LIST_HEAD(fscache_cookie_lru);
static DEFINE_SPINLOCK(fscache_cookie_lru_lock);
DEFINE_TIMER(fscache_cookie_lru_timer, fscache_cookie_lru_timed_out);
static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker);
-static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD";
+static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] __nonstring = "-LCAIFUWRD";
static unsigned int fscache_lru_cookie_timeout = 10 * HZ;
void fscache_print_cookie(struct fscache_cookie *cookie, char prefix)
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 4e3e62040831..70ecc8f5f210 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -127,11 +127,13 @@ static int __init netfs_init(void)
if (mempool_init_slab_pool(&netfs_subrequest_pool, 100, netfs_subrequest_slab) < 0)
goto error_subreqpool;
+#ifdef CONFIG_PROC_FS
if (!proc_mkdir("fs/netfs", NULL))
goto error_proc;
if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL,
&netfs_requests_seq_ops))
goto error_procfile;
+#endif
#ifdef CONFIG_FSCACHE_STATS
if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,
netfs_stats_show))
@@ -144,9 +146,11 @@ static int __init netfs_init(void)
return 0;
error_fscache:
+#ifdef CONFIG_PROC_FS
error_procfile:
remove_proc_subtree("fs/netfs", NULL);
error_proc:
+#endif
mempool_exit(&netfs_subrequest_pool);
error_subreqpool:
kmem_cache_destroy(netfs_subrequest_slab);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index d3f76101ad4b..07932ce9246c 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -2,6 +2,7 @@
config NFS_FS
tristate "NFS client support"
depends on INET && FILE_LOCKING && MULTIUSER
+ select CRC32
select LOCKD
select SUNRPC
select NFS_COMMON
@@ -196,7 +197,6 @@ config NFS_USE_KERNEL_DNS
config NFS_DEBUG
bool
depends on NFS_FS && SUNRPC_DEBUG
- select CRC32
default y
config NFS_DISABLE_UDP_SUPPORT
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ec8d32d0e2e9..6655e5f32ec6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -899,18 +899,11 @@ u64 nfs_timespec_to_change_attr(const struct timespec64 *ts)
return ((u64)ts->tv_sec << 30) + ts->tv_nsec;
}
-#ifdef CONFIG_CRC32
static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
{
return ~crc32_le(0xFFFFFFFF, &stateid->other[0],
NFS4_STATEID_OTHER_SIZE);
}
-#else
-static inline u32 nfs_stateid_hash(nfs4_stateid *stateid)
-{
- return 0;
-}
-#endif
static inline bool nfs_current_task_exiting(void)
{
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 351616c61df5..f9c291e2165c 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -148,16 +148,12 @@ static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst,
memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN);
}
-#ifdef CONFIG_CRC32
/*
* nfs_session_id_hash - calculate the crc32 hash for the session id
* @session - pointer to session
*/
#define nfs_session_id_hash(sess_id) \
(~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data)))
-#else
-#define nfs_session_id_hash(session) (0)
-#endif
#else /* defined(CONFIG_NFS_V4_1) */
static inline int nfs4_init_session(struct nfs_client *clp)
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 792d3fed1b45..731a88f6313e 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -4,6 +4,7 @@ config NFSD
depends on INET
depends on FILE_LOCKING
depends on FSNOTIFY
+ select CRC32
select LOCKD
select SUNRPC
select EXPORTFS
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2041268b398a..59a693f22452 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -5430,7 +5430,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
queued = nfsd4_run_cb(&dp->dl_recall);
WARN_ON_ONCE(!queued);
if (!queued)
- nfs4_put_stid(&dp->dl_stid);
+ refcount_dec(&dp->dl_stid.sc_count);
}
/* Called from break_lease() with flc_lock held. */
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 876152a91f12..5103c2f4d225 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -267,7 +267,6 @@ static inline bool fh_fsid_match(const struct knfsd_fh *fh1,
return true;
}
-#ifdef CONFIG_CRC32
/**
* knfsd_fh_hash - calculate the crc32 hash for the filehandle
* @fh - pointer to filehandle
@@ -279,12 +278,6 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
{
return ~crc32_le(0xFFFFFFFF, fh->fh_raw, fh->fh_size);
}
-#else
-static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
-{
- return 0;
-}
-#endif
/**
* fh_clear_pre_post_attrs - Reset pre/post attributes
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 3a202e51b360..83970d97840b 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2424,7 +2424,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
* the area protected by sc_state_lock.
*/
if (thread_is_alive)
- del_timer_sync(&sci->sc_timer);
+ timer_delete_sync(&sci->sc_timer);
}
/**
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0f46b22561d6..fce9beb214f0 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -724,7 +724,7 @@ static void o2net_shutdown_sc(struct work_struct *work)
if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) {
/* we shouldn't flush as we're in the thread, the
* races with pending sc work structs are harmless */
- del_timer_sync(&sc->sc_idle_timeout);
+ timer_delete_sync(&sc->sc_idle_timeout);
o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
sc_put(sc);
kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f1b4b3e611cb..c7a9729dc9d0 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1249,7 +1249,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
}
for (i = 0; i < p_blocks; i++, p_blkno++) {
- bh = __find_get_block(osb->sb->s_bdev, p_blkno,
+ bh = __find_get_block_nonatomic(osb->sb->s_bdev, p_blkno,
osb->sb->s_blocksize);
/* block not cached. */
if (!bh)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 6f2f8f4cfbbc..aef942a758ce 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -541,8 +541,6 @@ int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d,
bool ovl_is_metacopy_dentry(struct dentry *dentry);
char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding);
int ovl_ensure_verity_loaded(struct path *path);
-int ovl_get_verity_xattr(struct ovl_fs *ofs, const struct path *path,
- u8 *digest_buf, int *buf_length);
int ovl_validate_verity(struct ovl_fs *ofs,
struct path *metapath,
struct path *datapath);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index b63474d1b064..e19940d649ca 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1138,6 +1138,11 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
return ERR_PTR(-EINVAL);
}
+ if (ctx->nr == ctx->nr_data) {
+ pr_err("at least one non-data lowerdir is required\n");
+ return ERR_PTR(-EINVAL);
+ }
+
err = -EINVAL;
for (i = 0; i < ctx->nr; i++) {
l = &ctx->lower[i];
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 557cf9d40177..f8b9c9c73997 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -563,7 +563,7 @@ void pstore_unregister(struct pstore_info *psi)
pstore_unregister_kmsg();
/* Stop timer and make sure all work has finished. */
- del_timer_sync(&pstore_timer);
+ timer_delete_sync(&pstore_timer);
flush_work(&pstore_work);
/* Remove all backend records from filesystem tree. */
diff --git a/fs/smb/client/cifs_fs_sb.h b/fs/smb/client/cifs_fs_sb.h
index 651759192280..5e8d163cb5f8 100644
--- a/fs/smb/client/cifs_fs_sb.h
+++ b/fs/smb/client/cifs_fs_sb.h
@@ -49,6 +49,7 @@
struct cifs_sb_info {
struct rb_root tlink_tree;
+ struct list_head tcon_sb_link;
spinlock_t tlink_tree_lock;
struct tcon_link *master_tlink;
struct nls_table *local_nls;
diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
index e69968e88fe7..35892df7335c 100644
--- a/fs/smb/client/cifsencrypt.c
+++ b/fs/smb/client/cifsencrypt.c
@@ -704,18 +704,12 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server)
cifs_free_hash(&server->secmech.md5);
cifs_free_hash(&server->secmech.sha512);
- if (!SERVER_IS_CHAN(server)) {
- if (server->secmech.enc) {
- crypto_free_aead(server->secmech.enc);
- server->secmech.enc = NULL;
- }
-
- if (server->secmech.dec) {
- crypto_free_aead(server->secmech.dec);
- server->secmech.dec = NULL;
- }
- } else {
+ if (server->secmech.enc) {
+ crypto_free_aead(server->secmech.enc);
server->secmech.enc = NULL;
+ }
+ if (server->secmech.dec) {
+ crypto_free_aead(server->secmech.dec);
server->secmech.dec = NULL;
}
}
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 8dea0cf3a8de..ca435a3841b8 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -135,7 +135,6 @@ extern ssize_t cifs_file_copychunk_range(unsigned int xid,
extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern void cifs_setsize(struct inode *inode, loff_t offset);
-extern int cifs_truncate_page(struct address_space *mapping, loff_t from);
struct smb3_fs_context;
extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type,
@@ -146,6 +145,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 53
-#define CIFS_VERSION "2.53"
+#define SMB3_PRODUCT_BUILD 54
+#define CIFS_VERSION "2.54"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 6ae170a2a042..3b32116b0b49 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -625,10 +625,8 @@ struct smb_version_operations {
bool (*is_status_io_timeout)(char *buf);
/* Check for STATUS_NETWORK_NAME_DELETED */
bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv);
- int (*parse_reparse_point)(struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct kvec *rsp_iov,
- struct cifs_open_info_data *data);
+ struct reparse_data_buffer * (*get_reparse_point_buffer)(const struct kvec *rsp_iov,
+ u32 *plen);
int (*create_reparse_symlink)(const unsigned int xid,
struct inode *inode,
struct dentry *dentry,
@@ -714,6 +712,8 @@ struct TCP_Server_Info {
spinlock_t srv_lock; /* protect anything here that is not protected */
__u64 conn_id; /* connection identifier (useful for debugging) */
int srv_count; /* reference counter */
+ int rfc1001_sessinit; /* whether to estasblish netbios session */
+ bool with_rfc1001; /* if netbios session is used */
/* 15 character server name + 0x20 16th byte indicating type = srv */
char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
struct smb_version_operations *ops;
@@ -1321,7 +1321,8 @@ struct cifs_tcon {
#endif
struct list_head pending_opens; /* list of incomplete opens */
struct cached_fids *cfids;
- /* BB add field for back pointer to sb struct(s)? */
+ struct list_head cifs_sb_list;
+ spinlock_t sb_list_lock;
#ifdef CONFIG_CIFS_DFS_UPCALL
struct delayed_work dfs_cache_work;
struct list_head dfs_ses_list;
@@ -1718,6 +1719,7 @@ struct mid_q_entry {
void *resp_buf; /* pointer to received SMB header */
unsigned int resp_buf_size;
int mid_state; /* wish this were enum but can not pass to wait_event */
+ int mid_rc; /* rc for MID_RC */
unsigned int mid_flags;
__le16 command; /* smb command code */
unsigned int optype; /* operation type */
@@ -1880,6 +1882,7 @@ static inline bool is_replayable_error(int error)
#define MID_RESPONSE_MALFORMED 0x10
#define MID_SHUTDOWN 0x20
#define MID_RESPONSE_READY 0x40 /* ready for other process handle the rsp */
+#define MID_RC 0x80 /* mid_rc contains custom rc */
/* Flags */
#define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */
diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h
index 48d0d6f439cf..18d67ab113f0 100644
--- a/fs/smb/client/cifspdu.h
+++ b/fs/smb/client/cifspdu.h
@@ -2256,6 +2256,8 @@ typedef struct {
#define FILE_SUPPORTS_ENCRYPTION 0x00020000
#define FILE_SUPPORTS_OBJECT_IDS 0x00010000
#define FILE_VOLUME_IS_COMPRESSED 0x00008000
+#define FILE_SUPPORTS_POSIX_UNLINK_RENAME 0x00000400
+#define FILE_RETURNS_CLEANUP_RESULT_INFO 0x00000200
#define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100
#define FILE_SUPPORTS_REPARSE_POINTS 0x00000080
#define FILE_SUPPORTS_SPARSE_FILES 0x00000040
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index cfcc07905bdf..59f6fdfe560e 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -163,6 +163,8 @@ extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
struct cifsFileInfo **ret_file);
+extern int cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode,
+ struct file *file);
extern unsigned int smbCalcSize(void *buf);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
struct TCP_Server_Info *server);
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 29dcb88392e5..60cb264a01e5 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -1041,15 +1041,31 @@ static __u16 convert_disposition(int disposition)
static int
access_flags_to_smbopen_mode(const int access_flags)
{
- int masked_flags = access_flags & (GENERIC_READ | GENERIC_WRITE);
-
- if (masked_flags == GENERIC_READ)
- return SMBOPEN_READ;
- else if (masked_flags == GENERIC_WRITE)
+ /*
+ * SYSTEM_SECURITY grants both read and write access to SACL, treat is as read/write.
+ * MAXIMUM_ALLOWED grants as many access as possible, so treat it as read/write too.
+ * SYNCHRONIZE as is does not grant any specific access, so do not check its mask.
+ * If only SYNCHRONIZE bit is specified then fallback to read access.
+ */
+ bool with_write_flags = access_flags & (FILE_WRITE_DATA | FILE_APPEND_DATA | FILE_WRITE_EA |
+ FILE_DELETE_CHILD | FILE_WRITE_ATTRIBUTES | DELETE |
+ WRITE_DAC | WRITE_OWNER | SYSTEM_SECURITY |
+ MAXIMUM_ALLOWED | GENERIC_WRITE | GENERIC_ALL);
+ bool with_read_flags = access_flags & (FILE_READ_DATA | FILE_READ_EA | FILE_EXECUTE |
+ FILE_READ_ATTRIBUTES | READ_CONTROL |
+ SYSTEM_SECURITY | MAXIMUM_ALLOWED | GENERIC_ALL |
+ GENERIC_EXECUTE | GENERIC_READ);
+ bool with_execute_flags = access_flags & (FILE_EXECUTE | MAXIMUM_ALLOWED | GENERIC_ALL |
+ GENERIC_EXECUTE);
+
+ if (with_write_flags && with_read_flags)
+ return SMBOPEN_READWRITE;
+ else if (with_write_flags)
return SMBOPEN_WRITE;
-
- /* just go for read/write */
- return SMBOPEN_READWRITE;
+ else if (with_execute_flags)
+ return SMBOPEN_EXECUTE;
+ else
+ return SMBOPEN_READ;
}
int
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index d7bad2c3af37..df976ce6aed9 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -300,7 +300,6 @@ cifs_abort_connection(struct TCP_Server_Info *server)
server->ssocket->flags);
sock_release(server->ssocket);
server->ssocket = NULL;
- put_net(cifs_net_ns(server));
}
server->sequence_number = 0;
server->session_estab = false;
@@ -371,7 +370,7 @@ static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num
*
*/
static int __cifs_reconnect(struct TCP_Server_Info *server,
- bool mark_smb_session)
+ bool mark_smb_session, bool once)
{
int rc = 0;
@@ -399,6 +398,9 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
if (rc) {
cifs_server_unlock(server);
cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc);
+ /* If was asked to reconnect only once, do not try it more times */
+ if (once)
+ break;
msleep(3000);
} else {
atomic_inc(&tcpSesReconnectCount);
@@ -564,19 +566,33 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server)
return rc;
}
-int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
+static int
+_cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session, bool once)
{
if (!server->leaf_fullpath)
- return __cifs_reconnect(server, mark_smb_session);
+ return __cifs_reconnect(server, mark_smb_session, once);
return reconnect_dfs_server(server);
}
#else
-int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
+static int
+_cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session, bool once)
{
- return __cifs_reconnect(server, mark_smb_session);
+ return __cifs_reconnect(server, mark_smb_session, once);
}
#endif
+int
+cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
+{
+ return _cifs_reconnect(server, mark_smb_session, false);
+}
+
+static int
+cifs_reconnect_once(struct TCP_Server_Info *server)
+{
+ return _cifs_reconnect(server, true, true);
+}
+
static void
cifs_echo_request(struct work_struct *work)
{
@@ -803,26 +819,110 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
/* Regular SMB response */
return true;
case RFC1002_SESSION_KEEP_ALIVE:
+ /*
+ * RFC 1002 session keep alive can sent by the server only when
+ * we established a RFC 1002 session. But Samba servers send
+ * RFC 1002 session keep alive also over port 445 on which
+ * RFC 1002 session is not established.
+ */
cifs_dbg(FYI, "RFC 1002 session keep alive\n");
break;
case RFC1002_POSITIVE_SESSION_RESPONSE:
- cifs_dbg(FYI, "RFC 1002 positive session response\n");
+ /*
+ * RFC 1002 positive session response cannot be returned
+ * for SMB request. RFC 1002 session response is handled
+ * exclusively in ip_rfc1001_connect() function.
+ */
+ cifs_server_dbg(VFS, "RFC 1002 positive session response (unexpected)\n");
+ cifs_reconnect(server, true);
break;
case RFC1002_NEGATIVE_SESSION_RESPONSE:
/*
* We get this from Windows 98 instead of an error on
- * SMB negprot response.
- */
- cifs_dbg(FYI, "RFC 1002 negative session response\n");
- /* give server a second to clean up */
- msleep(1000);
- /*
- * Always try 445 first on reconnect since we get NACK
- * on some if we ever connected to port 139 (the NACK
- * is since we do not begin with RFC1001 session
- * initialize frame).
+ * SMB negprot response, when we have not established
+ * RFC 1002 session (which means ip_rfc1001_connect()
+ * was skipped). Note that same still happens with
+ * Windows Server 2022 when connecting via port 139.
+ * So for this case when mount option -o nonbsessinit
+ * was not specified, try to reconnect with establishing
+ * RFC 1002 session. If new socket establishment with
+ * RFC 1002 session was successful then return to the
+ * mid's caller -EAGAIN, so it can retry the request.
*/
- cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT);
+ if (!cifs_rdma_enabled(server) &&
+ server->tcpStatus == CifsInNegotiate &&
+ !server->with_rfc1001 &&
+ server->rfc1001_sessinit != 0) {
+ int rc, mid_rc;
+ struct mid_q_entry *mid, *nmid;
+ LIST_HEAD(dispose_list);
+
+ cifs_dbg(FYI, "RFC 1002 negative session response during SMB Negotiate, retrying with NetBIOS session\n");
+
+ /*
+ * Before reconnect, delete all pending mids for this
+ * server, so reconnect would not signal connection
+ * aborted error to mid's callbacks. Note that for this
+ * server there should be exactly one pending mid
+ * corresponding to SMB1/SMB2 Negotiate packet.
+ */
+ spin_lock(&server->mid_lock);
+ list_for_each_entry_safe(mid, nmid, &server->pending_mid_q, qhead) {
+ kref_get(&mid->refcount);
+ list_move(&mid->qhead, &dispose_list);
+ mid->mid_flags |= MID_DELETED;
+ }
+ spin_unlock(&server->mid_lock);
+
+ /* Now try to reconnect once with NetBIOS session. */
+ server->with_rfc1001 = true;
+ rc = cifs_reconnect_once(server);
+
+ /*
+ * If reconnect was successful then indicate -EAGAIN
+ * to mid's caller. If reconnect failed with -EAGAIN
+ * then mask it as -EHOSTDOWN, so mid's caller would
+ * know that it failed.
+ */
+ if (rc == 0)
+ mid_rc = -EAGAIN;
+ else if (rc == -EAGAIN)
+ mid_rc = -EHOSTDOWN;
+ else
+ mid_rc = rc;
+
+ /*
+ * After reconnect (either successful or unsuccessful)
+ * deliver reconnect status to mid's caller via mid's
+ * callback. Use MID_RC state which indicates that the
+ * return code should be read from mid_rc member.
+ */
+ list_for_each_entry_safe(mid, nmid, &dispose_list, qhead) {
+ list_del_init(&mid->qhead);
+ mid->mid_rc = mid_rc;
+ mid->mid_state = MID_RC;
+ mid->callback(mid);
+ release_mid(mid);
+ }
+
+ /*
+ * If reconnect failed then wait two seconds. In most
+ * cases we were been called from the mount context and
+ * delivered failure to mid's callback will stop this
+ * receiver task thread and fails the mount process.
+ * So wait two seconds to prevent another reconnect
+ * in this task thread, which would be useless as the
+ * mount context will fail at all.
+ */
+ if (rc != 0)
+ msleep(2000);
+ } else {
+ cifs_server_dbg(VFS, "RFC 1002 negative session response (unexpected)\n");
+ cifs_reconnect(server, true);
+ }
+ break;
+ case RFC1002_RETARGET_SESSION_RESPONSE:
+ cifs_server_dbg(VFS, "RFC 1002 retarget session response (unexpected)\n");
cifs_reconnect(server, true);
break;
default:
@@ -973,13 +1073,9 @@ clean_demultiplex_info(struct TCP_Server_Info *server)
msleep(125);
if (cifs_rdma_enabled(server))
smbd_destroy(server);
-
if (server->ssocket) {
sock_release(server->ssocket);
server->ssocket = NULL;
-
- /* Release netns reference for the socket. */
- put_net(cifs_net_ns(server));
}
if (!list_empty(&server->pending_mid_q)) {
@@ -1027,7 +1123,6 @@ clean_demultiplex_info(struct TCP_Server_Info *server)
*/
}
- /* Release netns reference for this server. */
put_net(cifs_net_ns(server));
kfree(server->leaf_fullpath);
kfree(server->hostname);
@@ -1673,8 +1768,6 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
tcp_ses->ops = ctx->ops;
tcp_ses->vals = ctx->vals;
-
- /* Grab netns reference for this server. */
cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
tcp_ses->sign = ctx->sign;
@@ -1701,6 +1794,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
ctx->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
memcpy(tcp_ses->server_RFC1001_name,
ctx->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
+ tcp_ses->rfc1001_sessinit = ctx->rfc1001_sessinit;
+ tcp_ses->with_rfc1001 = false;
tcp_ses->session_estab = false;
tcp_ses->sequence_number = 0;
tcp_ses->channel_sequence_num = 0; /* only tracked for primary channel */
@@ -1731,12 +1826,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
*/
tcp_ses->tcpStatus = CifsNew;
++tcp_ses->srv_count;
+ tcp_ses->echo_interval = ctx->echo_interval * HZ;
- if (ctx->echo_interval >= SMB_ECHO_INTERVAL_MIN &&
- ctx->echo_interval <= SMB_ECHO_INTERVAL_MAX)
- tcp_ses->echo_interval = ctx->echo_interval * HZ;
- else
- tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ;
if (tcp_ses->rdma) {
#ifndef CONFIG_CIFS_SMB_DIRECT
cifs_dbg(VFS, "CONFIG_CIFS_SMB_DIRECT is not enabled\n");
@@ -1804,7 +1895,6 @@ smbd_connected:
out_err_crypto_release:
cifs_crypto_secmech_release(tcp_ses);
- /* Release netns reference for this server. */
put_net(cifs_net_ns(tcp_ses));
out_err:
@@ -1813,10 +1903,8 @@ out_err:
cifs_put_tcp_session(tcp_ses->primary_server, false);
kfree(tcp_ses->hostname);
kfree(tcp_ses->leaf_fullpath);
- if (tcp_ses->ssocket) {
+ if (tcp_ses->ssocket)
sock_release(tcp_ses->ssocket);
- put_net(cifs_net_ns(tcp_ses));
- }
kfree(tcp_ses);
}
return ERR_PTR(rc);
@@ -2457,6 +2545,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
return 0;
if (tcon->nodelete != ctx->nodelete)
return 0;
+ if (tcon->posix_extensions != ctx->linux_ext)
+ return 0;
return 1;
}
@@ -3221,6 +3311,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
return -EIO;
}
+ server->with_rfc1001 = true;
return 0;
}
@@ -3257,24 +3348,20 @@ generic_ip_connect(struct TCP_Server_Info *server)
socket = server->ssocket;
} else {
struct net *net = cifs_net_ns(server);
+ struct sock *sk;
- rc = sock_create_kern(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket);
+ rc = __sock_create(net, sfamily, SOCK_STREAM,
+ IPPROTO_TCP, &server->ssocket, 1);
if (rc < 0) {
cifs_server_dbg(VFS, "Error %d creating socket\n", rc);
return rc;
}
- /*
- * Grab netns reference for the socket.
- *
- * This reference will be released in several situations:
- * - In the failure path before the cifsd thread is started.
- * - In the all place where server->socket is released, it is
- * also set to NULL.
- * - Ultimately in clean_demultiplex_info(), during the final
- * teardown.
- */
- get_net(net);
+ sk = server->ssocket->sk;
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ sk->sk_net_refcnt = 1;
+ get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
+ sock_inuse_add(net, 1);
/* BB other socket options to set KEEPALIVE, NODELAY? */
cifs_dbg(FYI, "Socket created\n");
@@ -3326,13 +3413,21 @@ generic_ip_connect(struct TCP_Server_Info *server)
if (rc < 0) {
cifs_dbg(FYI, "Error %d connecting to server\n", rc);
trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc);
- put_net(cifs_net_ns(server));
sock_release(socket);
server->ssocket = NULL;
return rc;
}
trace_smb3_connect_done(server->hostname, server->conn_id, &server->dstaddr);
- if (sport == htons(RFC1001_PORT))
+
+ /*
+ * Establish RFC1001 NetBIOS session when it was explicitly requested
+ * by mount option -o nbsessinit, or when connecting to default RFC1001
+ * server port (139) and it was not explicitly disabled by mount option
+ * -o nonbsessinit.
+ */
+ if (server->with_rfc1001 ||
+ server->rfc1001_sessinit == 1 ||
+ (server->rfc1001_sessinit == -1 && sport == htons(RFC1001_PORT)))
rc = ip_rfc1001_connect(server);
return rc;
@@ -3481,6 +3576,7 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb)
struct smb3_fs_context *ctx = cifs_sb->ctx;
INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
+ INIT_LIST_HEAD(&cifs_sb->tcon_sb_link);
spin_lock_init(&cifs_sb->tlink_tree_lock);
cifs_sb->tlink_tree = RB_ROOT;
@@ -3713,6 +3809,10 @@ static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
spin_unlock(&cifs_sb->tlink_tree_lock);
+ spin_lock(&tcon->sb_list_lock);
+ list_add(&cifs_sb->tcon_sb_link, &tcon->cifs_sb_list);
+ spin_unlock(&tcon->sb_list_lock);
+
queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
TLINK_IDLE_EXPIRE);
return 0;
@@ -4054,9 +4154,19 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
struct rb_root *root = &cifs_sb->tlink_tree;
struct rb_node *node;
struct tcon_link *tlink;
+ struct cifs_tcon *tcon = NULL;
cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
+ if (cifs_sb->master_tlink) {
+ tcon = cifs_sb->master_tlink->tl_tcon;
+ if (tcon) {
+ spin_lock(&tcon->sb_list_lock);
+ list_del_init(&cifs_sb->tcon_sb_link);
+ spin_unlock(&tcon->sb_list_lock);
+ }
+ }
+
spin_lock(&cifs_sb->tlink_tree_lock);
while ((node = rb_first(root))) {
tlink = rb_entry(node, struct tcon_link, tl_rbnode);
@@ -4078,11 +4188,13 @@ int
cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses,
struct TCP_Server_Info *server)
{
+ bool in_retry = false;
int rc = 0;
if (!server->ops->need_neg || !server->ops->negotiate)
return -ENOSYS;
+retry:
/* only send once per connect */
spin_lock(&server->srv_lock);
if (server->tcpStatus != CifsGood &&
@@ -4102,6 +4214,14 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses,
spin_unlock(&server->srv_lock);
rc = server->ops->negotiate(xid, ses, server);
+ if (rc == -EAGAIN) {
+ /* Allow one retry attempt */
+ if (!in_retry) {
+ in_retry = true;
+ goto retry;
+ }
+ rc = -EHOSTDOWN;
+ }
if (rc == 0) {
spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsInNegotiate)
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 8407fb108664..9e8f404b9e56 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -1007,6 +1007,11 @@ int cifs_open(struct inode *inode, struct file *file)
} else {
_cifsFileInfo_put(cfile, true, false);
}
+ } else {
+ /* hard link on the defeered close file */
+ rc = cifs_get_hardlink_path(tcon, inode, file);
+ if (rc)
+ cifs_close_deferred_file(CIFS_I(inode));
}
if (server->oplocks)
@@ -2071,6 +2076,29 @@ cifs_move_llist(struct list_head *source, struct list_head *dest)
list_move(li, dest);
}
+int
+cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode,
+ struct file *file)
+{
+ struct cifsFileInfo *open_file = NULL;
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+ int rc = 0;
+
+ spin_lock(&tcon->open_file_lock);
+ spin_lock(&cinode->open_file_lock);
+
+ list_for_each_entry(open_file, &cinode->openFileList, flist) {
+ if (file->f_flags == open_file->f_flags) {
+ rc = -EINVAL;
+ break;
+ }
+ }
+
+ spin_unlock(&cinode->open_file_lock);
+ spin_unlock(&tcon->open_file_lock);
+ return rc;
+}
+
void
cifs_free_llist(struct list_head *llist)
{
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index bdb762d398af..2980941b9667 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -135,6 +135,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_flag("witness", Opt_witness),
fsparam_flag_no("nativesocket", Opt_nativesocket),
fsparam_flag_no("unicode", Opt_unicode),
+ fsparam_flag_no("nbsessinit", Opt_nbsessinit),
/* Mount options which take uid or gid */
fsparam_uid("backupuid", Opt_backupuid),
@@ -968,6 +969,10 @@ static int smb3_verify_reconfigure_ctx(struct fs_context *fc,
cifs_errorf(fc, "can not change unicode during remount\n");
return -EINVAL;
}
+ if (new_ctx->rfc1001_sessinit != old_ctx->rfc1001_sessinit) {
+ cifs_errorf(fc, "can not change nbsessinit during remount\n");
+ return -EINVAL;
+ }
return 0;
}
@@ -1333,6 +1338,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_rsize:
ctx->rsize = result.uint_32;
ctx->got_rsize = true;
+ ctx->vol_rsize = ctx->rsize;
break;
case Opt_wsize:
ctx->wsize = result.uint_32;
@@ -1348,6 +1354,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->wsize, PAGE_SIZE);
}
}
+ ctx->vol_wsize = ctx->wsize;
break;
case Opt_acregmax:
if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) {
@@ -1383,6 +1390,11 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->closetimeo = HZ * result.uint_32;
break;
case Opt_echo_interval:
+ if (result.uint_32 < SMB_ECHO_INTERVAL_MIN ||
+ result.uint_32 > SMB_ECHO_INTERVAL_MAX) {
+ cifs_errorf(fc, "echo interval is out of bounds\n");
+ goto cifs_parse_mount_err;
+ }
ctx->echo_interval = result.uint_32;
break;
case Opt_snapshot:
@@ -1602,6 +1614,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
if (i == RFC1001_NAME_LEN && param->string[i] != 0)
pr_warn("server netbiosname longer than 15 truncated\n");
break;
+ case Opt_nbsessinit:
+ ctx->rfc1001_sessinit = !result.negated;
+ cifs_dbg(FYI, "rfc1001_sessinit set to %d\n", ctx->rfc1001_sessinit);
+ break;
case Opt_ver:
/* version of mount userspace tools, not dialect */
/* If interface changes in mount.cifs bump to new ver */
@@ -1889,13 +1905,16 @@ int smb3_init_fs_context(struct fs_context *fc)
memset(ctx->source_rfc1001_name, 0x20, RFC1001_NAME_LEN);
for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++)
ctx->source_rfc1001_name[i] = toupper(nodename[i]);
-
ctx->source_rfc1001_name[RFC1001_NAME_LEN] = 0;
+
/*
* null target name indicates to use *SMBSERVR default called name
* if we end up sending RFC1001 session initialize
*/
ctx->target_rfc1001_name[0] = 0;
+
+ ctx->rfc1001_sessinit = -1; /* autodetect based on port number */
+
ctx->cred_uid = current_uid();
ctx->linux_uid = current_uid();
ctx->linux_gid = current_gid();
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index 42c6b66c2c1a..d1d29249bcdb 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -174,6 +174,7 @@ enum cifs_param {
Opt_iocharset,
Opt_netbiosname,
Opt_servern,
+ Opt_nbsessinit,
Opt_ver,
Opt_vers,
Opt_sec,
@@ -216,6 +217,7 @@ struct smb3_fs_context {
char *iocharset; /* local code page for mapping to and from Unicode */
char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
+ int rfc1001_sessinit;
kuid_t cred_uid;
kuid_t linux_uid;
kgid_t linux_gid;
@@ -280,6 +282,9 @@ struct smb3_fs_context {
bool use_client_guid:1;
/* reuse existing guid for multichannel */
u8 client_guid[SMB2_CLIENT_GUID_SIZE];
+ /* User-specified original r/wsize value */
+ unsigned int vol_rsize;
+ unsigned int vol_wsize;
unsigned int bsize;
unsigned int rasize;
unsigned int rsize;
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 3bb21aa58474..75be4b46bc6f 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -1203,18 +1203,17 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
goto out;
}
break;
- case IO_REPARSE_TAG_MOUNT_POINT:
- cifs_create_junction_fattr(fattr, sb);
- rc = 0;
- goto out;
default:
/* Check for cached reparse point data */
if (data->symlink_target || data->reparse.buf) {
rc = 0;
- } else if (iov && server->ops->parse_reparse_point) {
- rc = server->ops->parse_reparse_point(cifs_sb,
- full_path,
- iov, data);
+ } else if (iov && server->ops->get_reparse_point_buffer) {
+ struct reparse_data_buffer *reparse_buf;
+ u32 reparse_len;
+
+ reparse_buf = server->ops->get_reparse_point_buffer(iov, &reparse_len);
+ rc = parse_reparse_point(reparse_buf, reparse_len,
+ cifs_sb, full_path, data);
/*
* If the reparse point was not handled but it is the
* name surrogate which points to directory, then treat
@@ -1228,6 +1227,16 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
cifs_create_junction_fattr(fattr, sb);
goto out;
}
+ /*
+ * If the reparse point is unsupported by the Linux SMB
+ * client then let it process by the SMB server. So mask
+ * the -EOPNOTSUPP error code. This will allow Linux SMB
+ * client to send SMB OPEN request to server. If server
+ * does not support this reparse point too then server
+ * will return error during open the path.
+ */
+ if (rc == -EOPNOTSUPP)
+ rc = 0;
}
if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) {
@@ -2901,23 +2910,6 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
return -EOPNOTSUPP;
}
-int cifs_truncate_page(struct address_space *mapping, loff_t from)
-{
- pgoff_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (PAGE_SIZE - 1);
- struct page *page;
- int rc = 0;
-
- page = grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
-
- zero_user_segment(page, offset, PAGE_SIZE);
- unlock_page(page);
- put_page(page);
- return rc;
-}
-
void cifs_setsize(struct inode *inode, loff_t offset)
{
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
@@ -3012,8 +3004,6 @@ set_size_out:
*/
attrs->ia_ctime = attrs->ia_mtime = current_time(inode);
attrs->ia_valid |= ATTR_CTIME | ATTR_MTIME;
-
- cifs_truncate_page(inode->i_mapping, inode->i_size);
}
return rc;
diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c
index a88253668286..769752ad2c5c 100644
--- a/fs/smb/client/link.c
+++ b/fs/smb/client/link.c
@@ -258,7 +258,7 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_io_parms io_parms = {0};
int buf_type = CIFS_NO_BUFFER;
- FILE_ALL_INFO file_info;
+ struct cifs_open_info_data query_data;
oparms = (struct cifs_open_parms) {
.tcon = tcon,
@@ -270,11 +270,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
.fid = &fid,
};
- rc = CIFS_open(xid, &oparms, &oplock, &file_info);
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &query_data);
if (rc)
return rc;
- if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
+ if (query_data.fi.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
rc = -ENOENT;
/* it's not a symlink */
goto out;
@@ -313,7 +313,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
.fid = &fid,
};
- rc = CIFS_open(xid, &oparms, &oplock, NULL);
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL);
if (rc)
return rc;
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index b328dc5c7988..7b6ed9b23e71 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -137,8 +137,10 @@ tcon_info_alloc(bool dir_leases_enabled, enum smb3_tcon_ref_trace trace)
spin_lock_init(&ret_buf->tc_lock);
INIT_LIST_HEAD(&ret_buf->openFileList);
INIT_LIST_HEAD(&ret_buf->tcon_list);
+ INIT_LIST_HEAD(&ret_buf->cifs_sb_list);
spin_lock_init(&ret_buf->open_file_lock);
spin_lock_init(&ret_buf->stat_lock);
+ spin_lock_init(&ret_buf->sb_list_lock);
atomic_set(&ret_buf->num_local_opens, 0);
atomic_set(&ret_buf->num_remote_opens, 0);
ret_buf->stats_from_time = ktime_get_real_seconds();
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index 2b9e9885dc42..bb25e77c5540 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -542,12 +542,12 @@ static int wsl_set_reparse_buf(struct reparse_data_buffer **buf,
kfree(symname_utf16);
return -ENOMEM;
}
- /* Flag 0x02000000 is unknown, but all wsl symlinks have this value */
- symlink_buf->Flags = cpu_to_le32(0x02000000);
- /* PathBuffer is in UTF-8 but without trailing null-term byte */
+ /* Version field must be set to 2 (MS-FSCC 2.1.2.7) */
+ symlink_buf->Version = cpu_to_le32(2);
+ /* Target for Version 2 is in UTF-8 but without trailing null-term byte */
symname_utf8_len = utf16s_to_utf8s((wchar_t *)symname_utf16, symname_utf16_len/2,
UTF16_LITTLE_ENDIAN,
- symlink_buf->PathBuffer,
+ symlink_buf->Target,
symname_utf8_maxlen);
*buf = (struct reparse_data_buffer *)symlink_buf;
buf_len = sizeof(struct reparse_wsl_symlink_data_buffer) + symname_utf8_len;
@@ -1016,29 +1016,36 @@ static int parse_reparse_wsl_symlink(struct reparse_wsl_symlink_data_buffer *buf
struct cifs_open_info_data *data)
{
int len = le16_to_cpu(buf->ReparseDataLength);
+ int data_offset = offsetof(typeof(*buf), Target) - offsetof(typeof(*buf), Version);
int symname_utf8_len;
__le16 *symname_utf16;
int symname_utf16_len;
- if (len <= sizeof(buf->Flags)) {
+ if (len <= data_offset) {
cifs_dbg(VFS, "srv returned malformed wsl symlink buffer\n");
return -EIO;
}
- /* PathBuffer is in UTF-8 but without trailing null-term byte */
- symname_utf8_len = len - sizeof(buf->Flags);
+ /* MS-FSCC 2.1.2.7 defines layout of the Target field only for Version 2. */
+ if (le32_to_cpu(buf->Version) != 2) {
+ cifs_dbg(VFS, "srv returned unsupported wsl symlink version %u\n", le32_to_cpu(buf->Version));
+ return -EIO;
+ }
+
+ /* Target for Version 2 is in UTF-8 but without trailing null-term byte */
+ symname_utf8_len = len - data_offset;
/*
* Check that buffer does not contain null byte
* because Linux cannot process symlink with null byte.
*/
- if (strnlen(buf->PathBuffer, symname_utf8_len) != symname_utf8_len) {
+ if (strnlen(buf->Target, symname_utf8_len) != symname_utf8_len) {
cifs_dbg(VFS, "srv returned null byte in wsl symlink target location\n");
return -EIO;
}
symname_utf16 = kzalloc(symname_utf8_len * 2, GFP_KERNEL);
if (!symname_utf16)
return -ENOMEM;
- symname_utf16_len = utf8s_to_utf16s(buf->PathBuffer, symname_utf8_len,
+ symname_utf16_len = utf8s_to_utf16s(buf->Target, symname_utf8_len,
UTF16_LITTLE_ENDIAN,
(wchar_t *) symname_utf16, symname_utf8_len * 2);
if (symname_utf16_len < 0) {
@@ -1062,8 +1069,6 @@ int parse_reparse_point(struct reparse_data_buffer *buf,
const char *full_path,
struct cifs_open_info_data *data)
{
- struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
-
data->reparse.buf = buf;
/* See MS-FSCC 2.1.2 */
@@ -1090,24 +1095,17 @@ int parse_reparse_point(struct reparse_data_buffer *buf,
}
return 0;
default:
- cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n",
- le32_to_cpu(buf->ReparseTag));
return -EOPNOTSUPP;
}
}
-int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct kvec *rsp_iov,
- struct cifs_open_info_data *data)
+struct reparse_data_buffer *smb2_get_reparse_point_buffer(const struct kvec *rsp_iov,
+ u32 *plen)
{
- struct reparse_data_buffer *buf;
struct smb2_ioctl_rsp *io = rsp_iov->iov_base;
- u32 plen = le32_to_cpu(io->OutputCount);
-
- buf = (struct reparse_data_buffer *)((u8 *)io +
- le32_to_cpu(io->OutputOffset));
- return parse_reparse_point(buf, plen, cifs_sb, full_path, data);
+ *plen = le32_to_cpu(io->OutputCount);
+ return (struct reparse_data_buffer *)((u8 *)io +
+ le32_to_cpu(io->OutputOffset));
}
static bool wsl_to_fattr(struct cifs_open_info_data *data,
@@ -1233,16 +1231,6 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
bool ok;
switch (tag) {
- case IO_REPARSE_TAG_INTERNAL:
- if (!(fattr->cf_cifsattrs & ATTR_DIRECTORY))
- return false;
- fallthrough;
- case IO_REPARSE_TAG_DFS:
- case IO_REPARSE_TAG_DFSR:
- case IO_REPARSE_TAG_MOUNT_POINT:
- /* See cifs_create_junction_fattr() */
- fattr->cf_mode = S_IFDIR | 0711;
- break;
case IO_REPARSE_TAG_LX_SYMLINK:
case IO_REPARSE_TAG_LX_FIFO:
case IO_REPARSE_TAG_AF_UNIX:
@@ -1262,7 +1250,14 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
fattr->cf_mode |= S_IFLNK;
break;
default:
- return false;
+ if (!(fattr->cf_cifsattrs & ATTR_DIRECTORY))
+ return false;
+ if (!IS_REPARSE_TAG_NAME_SURROGATE(tag) &&
+ tag != IO_REPARSE_TAG_INTERNAL)
+ return false;
+ /* See cifs_create_junction_fattr() */
+ fattr->cf_mode = S_IFDIR | 0711;
+ break;
}
fattr->cf_dtype = S_DT(fattr->cf_mode);
diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h
index c0be5ab45a78..08de853b36a8 100644
--- a/fs/smb/client/reparse.h
+++ b/fs/smb/client/reparse.h
@@ -135,9 +135,6 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
int smb2_mknod_reparse(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev);
-int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct kvec *rsp_iov,
- struct cifs_open_info_data *data);
+struct reparse_data_buffer *smb2_get_reparse_point_buffer(const struct kvec *rsp_iov, u32 *len);
#endif /* _CIFS_REPARSE_H */
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index f2ca5963cd9d..b3fa9ee26912 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -680,6 +680,22 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
*pbcc_area = bcc_ptr;
}
+static void
+ascii_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
+{
+ char *bcc_ptr = *pbcc_area;
+
+ strcpy(bcc_ptr, "Linux version ");
+ bcc_ptr += strlen("Linux version ");
+ strcpy(bcc_ptr, init_utsname()->release);
+ bcc_ptr += strlen(init_utsname()->release) + 1;
+
+ strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
+ bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
+
+ *pbcc_area = bcc_ptr;
+}
+
static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
@@ -704,6 +720,25 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
*pbcc_area = bcc_ptr;
}
+static void ascii_domain_string(char **pbcc_area, struct cifs_ses *ses,
+ const struct nls_table *nls_cp)
+{
+ char *bcc_ptr = *pbcc_area;
+ int len;
+
+ /* copy domain */
+ if (ses->domainName != NULL) {
+ len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
+ if (WARN_ON_ONCE(len < 0))
+ len = CIFS_MAX_DOMAINNAME_LEN - 1;
+ bcc_ptr += len;
+ } /* else we send a null domain name so server will default to its own domain */
+ *bcc_ptr = 0;
+ bcc_ptr++;
+
+ *pbcc_area = bcc_ptr;
+}
+
static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
@@ -749,25 +784,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
*bcc_ptr = 0;
bcc_ptr++; /* account for null termination */
- /* copy domain */
- if (ses->domainName != NULL) {
- len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
- if (WARN_ON_ONCE(len < 0))
- len = CIFS_MAX_DOMAINNAME_LEN - 1;
- bcc_ptr += len;
- } /* else we send a null domain name so server will default to its own domain */
- *bcc_ptr = 0;
- bcc_ptr++;
-
/* BB check for overflow here */
- strcpy(bcc_ptr, "Linux version ");
- bcc_ptr += strlen("Linux version ");
- strcpy(bcc_ptr, init_utsname()->release);
- bcc_ptr += strlen(init_utsname()->release) + 1;
-
- strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
- bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
+ ascii_domain_string(&bcc_ptr, ses, nls_cp);
+ ascii_oslm_strings(&bcc_ptr, nls_cp);
*pbcc_area = bcc_ptr;
}
@@ -1570,7 +1590,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
sess_data->iov[1].iov_len = msg->secblob_len;
pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len);
- if (ses->capabilities & CAP_UNICODE) {
+ if (pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) {
/* unicode strings must be word aligned */
if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
*bcc_ptr = 0;
@@ -1579,8 +1599,8 @@ sess_auth_kerberos(struct sess_data *sess_data)
unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp);
} else {
- /* BB: is this right? */
- ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
+ ascii_oslm_strings(&bcc_ptr, sess_data->nls_cp);
+ ascii_domain_string(&bcc_ptr, ses, sess_data->nls_cp);
}
sess_data->iov[2].iov_len = (long) bcc_ptr -
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index 8701484805cd..0adeec652dc1 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -14,6 +14,8 @@
#include "cifspdu.h"
#include "cifs_unicode.h"
#include "fs_context.h"
+#include "nterr.h"
+#include "smberr.h"
/*
* An NT cancel request header looks just like the original request except:
@@ -426,13 +428,6 @@ cifs_negotiate(const unsigned int xid,
{
int rc;
rc = CIFSSMBNegotiate(xid, ses, server);
- if (rc == -EAGAIN) {
- /* retry only once on 1st time connection */
- set_credits(server, 1);
- rc = CIFSSMBNegotiate(xid, ses, server);
- if (rc == -EAGAIN)
- rc = -EHOSTDOWN;
- }
return rc;
}
@@ -444,8 +439,8 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int wsize;
/* start with specified wsize, or default */
- if (ctx->wsize)
- wsize = ctx->wsize;
+ if (ctx->got_wsize)
+ wsize = ctx->vol_wsize;
else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
wsize = CIFS_DEFAULT_IOSIZE;
else
@@ -497,7 +492,7 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
else
defsize = server->maxBuf - sizeof(READ_RSP);
- rsize = ctx->rsize ? ctx->rsize : defsize;
+ rsize = ctx->got_rsize ? ctx->vol_rsize : defsize;
/*
* no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to
@@ -573,6 +568,42 @@ static int cifs_query_path_info(const unsigned int xid,
data->reparse_point = le32_to_cpu(fi.Attributes) & ATTR_REPARSE;
}
+#ifdef CONFIG_CIFS_XATTR
+ /*
+ * For WSL CHR and BLK reparse points it is required to fetch
+ * EA $LXDEV which contains major and minor device numbers.
+ */
+ if (!rc && data->reparse_point) {
+ struct smb2_file_full_ea_info *ea;
+
+ ea = (struct smb2_file_full_ea_info *)data->wsl.eas;
+ rc = CIFSSMBQAllEAs(xid, tcon, full_path, SMB2_WSL_XATTR_DEV,
+ &ea->ea_data[SMB2_WSL_XATTR_NAME_LEN + 1],
+ SMB2_WSL_XATTR_DEV_SIZE, cifs_sb);
+ if (rc == SMB2_WSL_XATTR_DEV_SIZE) {
+ ea->next_entry_offset = cpu_to_le32(0);
+ ea->flags = 0;
+ ea->ea_name_length = SMB2_WSL_XATTR_NAME_LEN;
+ ea->ea_value_length = cpu_to_le16(SMB2_WSL_XATTR_DEV_SIZE);
+ memcpy(&ea->ea_data[0], SMB2_WSL_XATTR_DEV, SMB2_WSL_XATTR_NAME_LEN + 1);
+ data->wsl.eas_len = sizeof(*ea) + SMB2_WSL_XATTR_NAME_LEN + 1 +
+ SMB2_WSL_XATTR_DEV_SIZE;
+ rc = 0;
+ } else if (rc >= 0) {
+ /* It is an error if EA $LXDEV has wrong size. */
+ rc = -EINVAL;
+ } else {
+ /*
+ * In all other cases ignore error if fetching
+ * of EA $LXDEV failed. It is needed only for
+ * WSL CHR and BLK reparse points and wsl_to_fattr()
+ * handle the case when EA is missing.
+ */
+ rc = 0;
+ }
+ }
+#endif
+
return rc;
}
@@ -975,18 +1006,13 @@ static int cifs_query_symlink(const unsigned int xid,
return rc;
}
-static int cifs_parse_reparse_point(struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct kvec *rsp_iov,
- struct cifs_open_info_data *data)
+static struct reparse_data_buffer *cifs_get_reparse_point_buffer(const struct kvec *rsp_iov,
+ u32 *plen)
{
- struct reparse_data_buffer *buf;
TRANSACT_IOCTL_RSP *io = rsp_iov->iov_base;
- u32 plen = le16_to_cpu(io->ByteCount);
-
- buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol +
- le32_to_cpu(io->DataOffset));
- return parse_reparse_point(buf, plen, cifs_sb, full_path, data);
+ *plen = le16_to_cpu(io->ByteCount);
+ return (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol +
+ le32_to_cpu(io->DataOffset));
}
static bool
@@ -1069,6 +1095,47 @@ cifs_make_node(unsigned int xid, struct inode *inode,
full_path, mode, dev);
}
+static bool
+cifs_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
+{
+ struct smb_hdr *shdr = (struct smb_hdr *)buf;
+ struct TCP_Server_Info *pserver;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+
+ if (shdr->Flags2 & SMBFLG2_ERR_STATUS) {
+ if (shdr->Status.CifsError != cpu_to_le32(NT_STATUS_NETWORK_NAME_DELETED))
+ return false;
+ } else {
+ if (shdr->Status.DosError.ErrorClass != ERRSRV ||
+ shdr->Status.DosError.Error != cpu_to_le16(ERRinvtid))
+ return false;
+ }
+
+ /* If server is a channel, select the primary channel */
+ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
+
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ if (cifs_ses_exiting(ses))
+ continue;
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+ if (tcon->tid == shdr->Tid) {
+ spin_lock(&tcon->tc_lock);
+ tcon->need_reconnect = true;
+ spin_unlock(&tcon->tc_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ pr_warn_once("Server share %s deleted.\n",
+ tcon->tree_name);
+ return true;
+ }
+ }
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ return false;
+}
+
struct smb_version_operations smb1_operations = {
.send_cancel = send_nt_cancel,
.compare_fids = cifs_compare_fids,
@@ -1121,7 +1188,7 @@ struct smb_version_operations smb1_operations = {
.rename = CIFSSMBRename,
.create_hardlink = CIFSCreateHardLink,
.query_symlink = cifs_query_symlink,
- .parse_reparse_point = cifs_parse_reparse_point,
+ .get_reparse_point_buffer = cifs_get_reparse_point_buffer,
.open = cifs_open_file,
.set_fid = cifs_set_fid,
.close = cifs_close_file,
@@ -1153,6 +1220,7 @@ struct smb_version_operations smb1_operations = {
.get_acl_by_fid = get_cifs_acl_by_fid,
.set_acl = set_cifs_acl,
.make_node = cifs_make_node,
+ .is_network_name_deleted = cifs_is_network_name_deleted,
};
struct smb_version_values smb1_values = {
diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c
index d609a20fb98a..a7f629238830 100644
--- a/fs/smb/client/smb2file.c
+++ b/fs/smb/client/smb2file.c
@@ -152,16 +152,35 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32
int err_buftype = CIFS_NO_BUFFER;
struct cifs_fid *fid = oparms->fid;
struct network_resiliency_req nr_ioctl_req;
+ bool retry_without_read_attributes = false;
smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
if (smb2_path == NULL)
return -ENOMEM;
- oparms->desired_access |= FILE_READ_ATTRIBUTES;
+ /*
+ * GENERIC_READ, GENERIC_EXECUTE, GENERIC_ALL and MAXIMUM_ALLOWED
+ * contains also FILE_READ_ATTRIBUTES access right. So do not append
+ * FILE_READ_ATTRIBUTES when not needed and prevent calling code path
+ * for retry_without_read_attributes.
+ */
+ if (!(oparms->desired_access & FILE_READ_ATTRIBUTES) &&
+ !(oparms->desired_access & GENERIC_READ) &&
+ !(oparms->desired_access & GENERIC_EXECUTE) &&
+ !(oparms->desired_access & GENERIC_ALL) &&
+ !(oparms->desired_access & MAXIMUM_ALLOWED)) {
+ oparms->desired_access |= FILE_READ_ATTRIBUTES;
+ retry_without_read_attributes = true;
+ }
smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;
rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov,
&err_buftype);
+ if (rc == -EACCES && retry_without_read_attributes) {
+ oparms->desired_access &= ~FILE_READ_ATTRIBUTES;
+ rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov,
+ &err_buftype);
+ }
if (rc && data) {
struct smb2_hdr *hdr = err_iov.iov_base;
diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h
index 2466e6155136..224495322a05 100644
--- a/fs/smb/client/smb2glob.h
+++ b/fs/smb/client/smb2glob.h
@@ -38,6 +38,7 @@ enum smb2_compound_ops {
SMB2_OP_SET_REPARSE,
SMB2_OP_GET_REPARSE,
SMB2_OP_QUERY_WSL_EA,
+ SMB2_OP_OPEN_QUERY,
};
/* Used when constructing chained read requests. */
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index e9fd3e204a6f..57d9bfbadd97 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -176,6 +176,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec *out_iov, int *out_buftype, struct dentry *dentry)
{
+ struct smb2_create_rsp *create_rsp = NULL;
struct smb2_query_info_rsp *qi_rsp = NULL;
struct smb2_compound_vars *vars = NULL;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
@@ -265,7 +266,13 @@ replay_again:
num_rqst++;
rc = 0;
- for (i = 0; i < num_cmds; i++) {
+ i = 0;
+
+ /* Skip the leading explicit OPEN operation */
+ if (num_cmds > 0 && cmds[0] == SMB2_OP_OPEN_QUERY)
+ i++;
+
+ for (; i < num_cmds; i++) {
/* Operation */
switch (cmds[i]) {
case SMB2_OP_QUERY_INFO:
@@ -640,6 +647,27 @@ finished:
}
tmp_rc = rc;
+
+ if (rc == 0 && num_cmds > 0 && cmds[0] == SMB2_OP_OPEN_QUERY) {
+ create_rsp = rsp_iov[0].iov_base;
+ idata = in_iov[0].iov_base;
+ idata->fi.CreationTime = create_rsp->CreationTime;
+ idata->fi.LastAccessTime = create_rsp->LastAccessTime;
+ idata->fi.LastWriteTime = create_rsp->LastWriteTime;
+ idata->fi.ChangeTime = create_rsp->ChangeTime;
+ idata->fi.Attributes = create_rsp->FileAttributes;
+ idata->fi.AllocationSize = create_rsp->AllocationSize;
+ idata->fi.EndOfFile = create_rsp->EndofFile;
+ if (le32_to_cpu(idata->fi.NumberOfLinks) == 0)
+ idata->fi.NumberOfLinks = cpu_to_le32(1); /* dummy value */
+ idata->fi.DeletePending = 0;
+ idata->fi.Directory = !!(le32_to_cpu(create_rsp->FileAttributes) & ATTR_DIRECTORY);
+
+ /* smb2_parse_contexts() fills idata->fi.IndexNumber */
+ rc = smb2_parse_contexts(server, &rsp_iov[0], &oparms->fid->epoch,
+ oparms->fid->lease_key, &oplock, &idata->fi, NULL);
+ }
+
for (i = 0; i < num_cmds; i++) {
char *buf = rsp_iov[i + i].iov_base;
@@ -978,6 +1006,43 @@ int smb2_query_path_info(const unsigned int xid,
case 0:
rc = parse_create_response(data, cifs_sb, full_path, &out_iov[0]);
break;
+ case -EACCES:
+ /*
+ * If SMB2_OP_QUERY_INFO (called when POSIX extensions are not used) failed with
+ * STATUS_ACCESS_DENIED then it means that caller does not have permission to
+ * open the path with FILE_READ_ATTRIBUTES access and therefore cannot issue
+ * SMB2_OP_QUERY_INFO command.
+ *
+ * There is an alternative way how to query limited information about path but still
+ * suitable for stat() syscall. SMB2 OPEN/CREATE operation returns in its successful
+ * response subset of query information.
+ *
+ * So try to open the path without FILE_READ_ATTRIBUTES but with MAXIMUM_ALLOWED
+ * access which will grant the maximum possible access to the file and the response
+ * will contain required query information for stat() syscall.
+ */
+
+ if (tcon->posix_extensions)
+ break;
+
+ num_cmds = 1;
+ cmds[0] = SMB2_OP_OPEN_QUERY;
+ in_iov[0].iov_base = data;
+ in_iov[0].iov_len = sizeof(*data);
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, MAXIMUM_ALLOWED,
+ FILE_OPEN, create_options, ACL_NO_MODE);
+ free_rsp_iov(out_iov, out_buftype, ARRAY_SIZE(out_iov));
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ &oparms, in_iov, cmds, num_cmds,
+ cfile, out_iov, out_buftype, NULL);
+
+ hdr = out_iov[0].iov_base;
+ if (!hdr || out_buftype[0] == CIFS_NO_BUFFER)
+ goto out;
+
+ if (!rc)
+ rc = parse_create_response(data, cifs_sb, full_path, &out_iov[0]);
+ break;
case -EOPNOTSUPP:
/*
* BB TODO: When support for special files added to Samba
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index a700e5921961..2fe8eeb98535 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -464,12 +464,20 @@ smb2_negotiate(const unsigned int xid,
server->CurrentMid = 0;
spin_unlock(&server->mid_lock);
rc = SMB2_negotiate(xid, ses, server);
- /* BB we probably don't need to retry with modern servers */
- if (rc == -EAGAIN)
- rc = -EHOSTDOWN;
return rc;
}
+static inline unsigned int
+prevent_zero_iosize(unsigned int size, const char *type)
+{
+ if (size == 0) {
+ cifs_dbg(VFS, "SMB: Zero %ssize calculated, using minimum value %u\n",
+ type, CIFS_MIN_DEFAULT_IOSIZE);
+ return CIFS_MIN_DEFAULT_IOSIZE;
+ }
+ return size;
+}
+
static unsigned int
smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
{
@@ -477,12 +485,12 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int wsize;
/* start with specified wsize, or default */
- wsize = ctx->wsize ? ctx->wsize : CIFS_DEFAULT_IOSIZE;
+ wsize = ctx->got_wsize ? ctx->vol_wsize : CIFS_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
- return wsize;
+ return prevent_zero_iosize(wsize, "w");
}
static unsigned int
@@ -492,7 +500,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int wsize;
/* start with specified wsize, or default */
- wsize = ctx->wsize ? ctx->wsize : SMB3_DEFAULT_IOSIZE;
+ wsize = ctx->got_wsize ? ctx->vol_wsize : SMB3_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
@@ -514,7 +522,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
- return wsize;
+ return prevent_zero_iosize(wsize, "w");
}
static unsigned int
@@ -524,13 +532,13 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int rsize;
/* start with specified rsize, or default */
- rsize = ctx->rsize ? ctx->rsize : CIFS_DEFAULT_IOSIZE;
+ rsize = ctx->got_rsize ? ctx->vol_rsize : CIFS_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
- return rsize;
+ return prevent_zero_iosize(rsize, "r");
}
static unsigned int
@@ -540,7 +548,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int rsize;
/* start with specified rsize, or default */
- rsize = ctx->rsize ? ctx->rsize : SMB3_DEFAULT_IOSIZE;
+ rsize = ctx->got_rsize ? ctx->vol_rsize : SMB3_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
@@ -563,7 +571,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
- return rsize;
+ return prevent_zero_iosize(rsize, "r");
}
/*
@@ -3526,8 +3534,6 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
if (rc == 0) {
netfs_resize_file(&cifsi->netfs, new_eof, true);
cifs_setsize(inode, new_eof);
- cifs_truncate_page(inode->i_mapping, inode->i_size);
- truncate_setsize(inode, new_eof);
}
goto out;
}
@@ -4549,9 +4555,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
return rc;
}
} else {
- if (unlikely(!server->secmech.dec))
- return -EIO;
-
+ rc = smb3_crypto_aead_allocate(server);
+ if (unlikely(rc))
+ return rc;
tfm = server->secmech.dec;
}
@@ -5297,7 +5303,7 @@ struct smb_version_operations smb20_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .parse_reparse_point = smb2_parse_reparse_point,
+ .get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.create_reparse_symlink = smb2_create_reparse_symlink,
@@ -5400,7 +5406,7 @@ struct smb_version_operations smb21_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .parse_reparse_point = smb2_parse_reparse_point,
+ .get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.create_reparse_symlink = smb2_create_reparse_symlink,
@@ -5507,7 +5513,7 @@ struct smb_version_operations smb30_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .parse_reparse_point = smb2_parse_reparse_point,
+ .get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.create_reparse_symlink = smb2_create_reparse_symlink,
@@ -5623,7 +5629,7 @@ struct smb_version_operations smb311_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .parse_reparse_point = smb2_parse_reparse_point,
+ .get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.create_reparse_symlink = smb2_create_reparse_symlink,
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 4f69a1825e42..c4d52bebd37d 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -43,6 +43,7 @@
#endif
#include "cached_dir.h"
#include "compress.h"
+#include "fs_context.h"
/*
* The following table defines the expected "StructureSize" of SMB2 requests
@@ -1251,15 +1252,8 @@ SMB2_negotiate(const unsigned int xid,
cifs_server_dbg(VFS, "Missing expected negotiate contexts\n");
}
- if (server->cipher_type && !rc) {
- if (!SERVER_IS_CHAN(server)) {
- rc = smb3_crypto_aead_allocate(server);
- } else {
- /* For channels, just reuse the primary server crypto secmech. */
- server->secmech.enc = server->primary_server->secmech.enc;
- server->secmech.dec = server->primary_server->secmech.dec;
- }
- }
+ if (server->cipher_type && !rc)
+ rc = smb3_crypto_aead_allocate(server);
neg_exit:
free_rsp_buf(resp_buftype, rsp);
return rc;
@@ -4089,6 +4083,24 @@ smb2_echo_callback(struct mid_q_entry *mid)
add_credits(server, &credits, CIFS_ECHO_OP);
}
+static void cifs_renegotiate_iosize(struct TCP_Server_Info *server,
+ struct cifs_tcon *tcon)
+{
+ struct cifs_sb_info *cifs_sb;
+
+ if (server == NULL || tcon == NULL)
+ return;
+
+ spin_lock(&tcon->sb_list_lock);
+ list_for_each_entry(cifs_sb, &tcon->cifs_sb_list, tcon_sb_link) {
+ cifs_sb->ctx->rsize =
+ server->ops->negotiate_rsize(tcon, cifs_sb->ctx);
+ cifs_sb->ctx->wsize =
+ server->ops->negotiate_wsize(tcon, cifs_sb->ctx);
+ }
+ spin_unlock(&tcon->sb_list_lock);
+}
+
void smb2_reconnect_server(struct work_struct *work)
{
struct TCP_Server_Info *server = container_of(work,
@@ -4174,9 +4186,10 @@ void smb2_reconnect_server(struct work_struct *work)
list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) {
rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server, true);
- if (!rc)
+ if (!rc) {
+ cifs_renegotiate_iosize(server, tcon);
cifs_reopen_persistent_handles(tcon);
- else
+ } else
resched = true;
list_del_init(&tcon->rlist);
if (tcon->ipc)
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 03434dbe9374..266af17aa7d9 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -894,6 +894,9 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
case MID_SHUTDOWN:
rc = -EHOSTDOWN;
break;
+ case MID_RC:
+ rc = mid->mid_rc;
+ break;
default:
if (!(mid->mid_flags & MID_DELETED)) {
list_del_init(&mid->qhead);
diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c
index 7d49f38f01f3..b88fa04f5792 100644
--- a/fs/smb/client/xattr.c
+++ b/fs/smb/client/xattr.c
@@ -31,6 +31,8 @@
* secure, replaced by SMB2 (then even more highly secure SMB3) many years ago
*/
#define SMB3_XATTR_CIFS_ACL "system.smb3_acl" /* DACL only */
+#define SMB3_XATTR_CIFS_NTSD_SACL "system.smb3_ntsd_sacl" /* SACL only */
+#define SMB3_XATTR_CIFS_NTSD_OWNER "system.smb3_ntsd_owner" /* owner only */
#define SMB3_XATTR_CIFS_NTSD "system.smb3_ntsd" /* owner plus DACL */
#define SMB3_XATTR_CIFS_NTSD_FULL "system.smb3_ntsd_full" /* owner/DACL/SACL */
#define SMB3_XATTR_ATTRIB "smb3.dosattrib" /* full name: user.smb3.dosattrib */
@@ -38,6 +40,7 @@
/* BB need to add server (Samba e.g) support for security and trusted prefix */
enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT,
+ XATTR_CIFS_NTSD_SACL, XATTR_CIFS_NTSD_OWNER,
XATTR_CIFS_NTSD, XATTR_CIFS_NTSD_FULL };
static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon,
@@ -160,6 +163,8 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
break;
case XATTR_CIFS_ACL:
+ case XATTR_CIFS_NTSD_SACL:
+ case XATTR_CIFS_NTSD_OWNER:
case XATTR_CIFS_NTSD:
case XATTR_CIFS_NTSD_FULL: {
struct smb_ntsd *pacl;
@@ -187,6 +192,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
CIFS_ACL_GROUP |
CIFS_ACL_DACL);
break;
+ case XATTR_CIFS_NTSD_OWNER:
+ aclflags = (CIFS_ACL_OWNER |
+ CIFS_ACL_GROUP);
+ break;
+ case XATTR_CIFS_NTSD_SACL:
+ aclflags = CIFS_ACL_SACL;
+ break;
case XATTR_CIFS_ACL:
default:
aclflags = CIFS_ACL_DACL;
@@ -308,6 +320,8 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
break;
case XATTR_CIFS_ACL:
+ case XATTR_CIFS_NTSD_SACL:
+ case XATTR_CIFS_NTSD_OWNER:
case XATTR_CIFS_NTSD:
case XATTR_CIFS_NTSD_FULL: {
/*
@@ -327,6 +341,12 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
case XATTR_CIFS_NTSD:
extra_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO;
break;
+ case XATTR_CIFS_NTSD_OWNER:
+ extra_info = OWNER_SECINFO | GROUP_SECINFO;
+ break;
+ case XATTR_CIFS_NTSD_SACL:
+ extra_info = SACL_SECINFO;
+ break;
case XATTR_CIFS_ACL:
default:
extra_info = DACL_SECINFO;
@@ -448,6 +468,20 @@ static const struct xattr_handler smb3_acl_xattr_handler = {
.set = cifs_xattr_set,
};
+static const struct xattr_handler smb3_ntsd_sacl_xattr_handler = {
+ .name = SMB3_XATTR_CIFS_NTSD_SACL,
+ .flags = XATTR_CIFS_NTSD_SACL,
+ .get = cifs_xattr_get,
+ .set = cifs_xattr_set,
+};
+
+static const struct xattr_handler smb3_ntsd_owner_xattr_handler = {
+ .name = SMB3_XATTR_CIFS_NTSD_OWNER,
+ .flags = XATTR_CIFS_NTSD_OWNER,
+ .get = cifs_xattr_get,
+ .set = cifs_xattr_set,
+};
+
static const struct xattr_handler cifs_cifs_ntsd_xattr_handler = {
.name = CIFS_XATTR_CIFS_NTSD,
.flags = XATTR_CIFS_NTSD,
@@ -493,6 +527,8 @@ const struct xattr_handler * const cifs_xattr_handlers[] = {
&cifs_os2_xattr_handler,
&cifs_cifs_acl_xattr_handler,
&smb3_acl_xattr_handler, /* alias for above since avoiding "cifs" */
+ &smb3_ntsd_sacl_xattr_handler,
+ &smb3_ntsd_owner_xattr_handler,
&cifs_cifs_ntsd_xattr_handler,
&smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */
&cifs_cifs_ntsd_full_xattr_handler,
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index c7a0efda4403..f79a5165a7cc 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -95,6 +95,9 @@
*/
#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
+/* According to MS-SMB2 specification The minimum recommended value is 65536.*/
+#define CIFS_MIN_DEFAULT_IOSIZE (65536)
+
/*
* SMB2 Header Definition
*
@@ -1564,13 +1567,13 @@ struct reparse_nfs_data_buffer {
__u8 DataBuffer[];
} __packed;
-/* For IO_REPARSE_TAG_LX_SYMLINK */
+/* For IO_REPARSE_TAG_LX_SYMLINK - see MS-FSCC 2.1.2.7 */
struct reparse_wsl_symlink_data_buffer {
__le32 ReparseTag;
__le16 ReparseDataLength;
__u16 Reserved;
- __le32 Flags;
- __u8 PathBuffer[]; /* Variable Length UTF-8 string without nul-term */
+ __le32 Version; /* Always 2 */
+ __u8 Target[]; /* Variable Length UTF-8 string without nul-term */
} __packed;
struct validate_negotiate_info_req {
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index c1f22c129111..83764c230e9d 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -39,8 +39,10 @@ void ksmbd_conn_free(struct ksmbd_conn *conn)
xa_destroy(&conn->sessions);
kvfree(conn->request_buf);
kfree(conn->preauth_info);
- if (atomic_dec_and_test(&conn->refcnt))
+ if (atomic_dec_and_test(&conn->refcnt)) {
+ ksmbd_free_transport(conn->transport);
kfree(conn);
+ }
}
/**
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index f103b1bd0400..81a29857b1e3 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -129,14 +129,6 @@ static void free_opinfo(struct oplock_info *opinfo)
kfree(opinfo);
}
-static inline void opinfo_free_rcu(struct rcu_head *rcu_head)
-{
- struct oplock_info *opinfo;
-
- opinfo = container_of(rcu_head, struct oplock_info, rcu_head);
- free_opinfo(opinfo);
-}
-
struct oplock_info *opinfo_get(struct ksmbd_file *fp)
{
struct oplock_info *opinfo;
@@ -157,8 +149,8 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci)
if (list_empty(&ci->m_op_list))
return NULL;
- rcu_read_lock();
- opinfo = list_first_or_null_rcu(&ci->m_op_list, struct oplock_info,
+ down_read(&ci->m_lock);
+ opinfo = list_first_entry(&ci->m_op_list, struct oplock_info,
op_entry);
if (opinfo) {
if (opinfo->conn == NULL ||
@@ -171,8 +163,7 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci)
}
}
}
-
- rcu_read_unlock();
+ up_read(&ci->m_lock);
return opinfo;
}
@@ -185,7 +176,7 @@ void opinfo_put(struct oplock_info *opinfo)
if (!atomic_dec_and_test(&opinfo->refcount))
return;
- call_rcu(&opinfo->rcu_head, opinfo_free_rcu);
+ free_opinfo(opinfo);
}
static void opinfo_add(struct oplock_info *opinfo)
@@ -193,7 +184,7 @@ static void opinfo_add(struct oplock_info *opinfo)
struct ksmbd_inode *ci = opinfo->o_fp->f_ci;
down_write(&ci->m_lock);
- list_add_rcu(&opinfo->op_entry, &ci->m_op_list);
+ list_add(&opinfo->op_entry, &ci->m_op_list);
up_write(&ci->m_lock);
}
@@ -207,7 +198,7 @@ static void opinfo_del(struct oplock_info *opinfo)
write_unlock(&lease_list_lock);
}
down_write(&ci->m_lock);
- list_del_rcu(&opinfo->op_entry);
+ list_del(&opinfo->op_entry);
up_write(&ci->m_lock);
}
@@ -1347,8 +1338,8 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,
ci = fp->f_ci;
op = opinfo_get(fp);
- rcu_read_lock();
- list_for_each_entry_rcu(brk_op, &ci->m_op_list, op_entry) {
+ down_read(&ci->m_lock);
+ list_for_each_entry(brk_op, &ci->m_op_list, op_entry) {
if (brk_op->conn == NULL)
continue;
@@ -1358,7 +1349,6 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,
if (ksmbd_conn_releasing(brk_op->conn))
continue;
- rcu_read_unlock();
if (brk_op->is_lease && (brk_op->o_lease->state &
(~(SMB2_LEASE_READ_CACHING_LE |
SMB2_LEASE_HANDLE_CACHING_LE)))) {
@@ -1388,9 +1378,8 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,
oplock_break(brk_op, SMB2_OPLOCK_LEVEL_NONE, NULL);
next:
opinfo_put(brk_op);
- rcu_read_lock();
}
- rcu_read_unlock();
+ up_read(&ci->m_lock);
if (op)
opinfo_put(op);
diff --git a/fs/smb/server/oplock.h b/fs/smb/server/oplock.h
index 3f64f0787263..9a56eaadd0dd 100644
--- a/fs/smb/server/oplock.h
+++ b/fs/smb/server/oplock.h
@@ -71,7 +71,6 @@ struct oplock_info {
struct list_head lease_entry;
wait_queue_head_t oplock_q; /* Other server threads */
wait_queue_head_t oplock_brk; /* oplock breaking wait */
- struct rcu_head rcu_head;
};
struct lease_break_info {
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index d24d95d15d87..57839f9708bb 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -1602,8 +1602,10 @@ static int krb5_authenticate(struct ksmbd_work *work,
if (prev_sess_id && prev_sess_id != sess->id)
destroy_previous_session(conn, sess->user, prev_sess_id);
- if (sess->state == SMB2_SESSION_VALID)
+ if (sess->state == SMB2_SESSION_VALID) {
ksmbd_free_user(sess->user);
+ sess->user = NULL;
+ }
retval = ksmbd_krb5_authenticate(sess, in_blob, in_len,
out_blob, &out_len);
diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h
index a3d8a905b07e..d742ba754348 100644
--- a/fs/smb/server/smb_common.h
+++ b/fs/smb/server/smb_common.h
@@ -72,6 +72,8 @@
#define FILE_SUPPORTS_ENCRYPTION 0x00020000
#define FILE_SUPPORTS_OBJECT_IDS 0x00010000
#define FILE_VOLUME_IS_COMPRESSED 0x00008000
+#define FILE_SUPPORTS_POSIX_UNLINK_RENAME 0x00000400
+#define FILE_RETURNS_CLEANUP_RESULT_INFO 0x00000200
#define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100
#define FILE_SUPPORTS_REPARSE_POINTS 0x00000080
#define FILE_SUPPORTS_SPARSE_FILES 0x00000040
diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c
index 3f185ae60dc5..2a3e2b0ce557 100644
--- a/fs/smb/server/transport_ipc.c
+++ b/fs/smb/server/transport_ipc.c
@@ -310,7 +310,11 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req)
server_conf.signing = req->signing;
server_conf.tcp_port = req->tcp_port;
server_conf.ipc_timeout = req->ipc_timeout * HZ;
- server_conf.deadtime = req->deadtime * SMB_ECHO_INTERVAL;
+ if (check_mul_overflow(req->deadtime, SMB_ECHO_INTERVAL,
+ &server_conf.deadtime)) {
+ ret = -EINVAL;
+ goto out;
+ }
server_conf.share_fake_fscaps = req->share_fake_fscaps;
ksmbd_init_domain(req->sub_auth);
@@ -337,6 +341,7 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req)
server_conf.bind_interfaces_only = req->bind_interfaces_only;
ret |= ksmbd_tcp_set_interfaces(KSMBD_STARTUP_CONFIG_INTERFACES(req),
req->ifc_list_sz);
+out:
if (ret) {
pr_err("Server configuration error: %s %s %s\n",
req->netbios_name, req->server_string,
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index 7f38a3c3f5bd..abedf510899a 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -93,17 +93,21 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk)
return t;
}
-static void free_transport(struct tcp_transport *t)
+void ksmbd_free_transport(struct ksmbd_transport *kt)
{
- kernel_sock_shutdown(t->sock, SHUT_RDWR);
- sock_release(t->sock);
- t->sock = NULL;
+ struct tcp_transport *t = TCP_TRANS(kt);
- ksmbd_conn_free(KSMBD_TRANS(t)->conn);
+ sock_release(t->sock);
kfree(t->iov);
kfree(t);
}
+static void free_transport(struct tcp_transport *t)
+{
+ kernel_sock_shutdown(t->sock, SHUT_RDWR);
+ ksmbd_conn_free(KSMBD_TRANS(t)->conn);
+}
+
/**
* kvec_array_init() - initialize a IO vector segment
* @new: IO vector to be initialized
diff --git a/fs/smb/server/transport_tcp.h b/fs/smb/server/transport_tcp.h
index 8c9aa624cfe3..1e51675ee1b2 100644
--- a/fs/smb/server/transport_tcp.h
+++ b/fs/smb/server/transport_tcp.h
@@ -8,6 +8,7 @@
int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz);
struct interface *ksmbd_find_netdev_name_iface_list(char *netdev_name);
+void ksmbd_free_transport(struct ksmbd_transport *kt);
int ksmbd_tcp_init(void);
void ksmbd_tcp_destroy(void);
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 8554aa5a1059..391d07da586c 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -479,7 +479,8 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
int err = 0;
if (work->conn->connection_type) {
- if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) {
+ if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE)) ||
+ S_ISDIR(file_inode(fp->filp)->i_mode)) {
pr_err("no right to write(%pD)\n", fp->filp);
err = -EACCES;
goto out;
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index 8d1f30dcba7e..1f8fa3468173 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -713,12 +713,8 @@ static bool tree_conn_fd_check(struct ksmbd_tree_connect *tcon,
static bool ksmbd_durable_scavenger_alive(void)
{
- mutex_lock(&durable_scavenger_lock);
- if (!durable_scavenger_running) {
- mutex_unlock(&durable_scavenger_lock);
+ if (!durable_scavenger_running)
return false;
- }
- mutex_unlock(&durable_scavenger_lock);
if (kthread_should_stop())
return false;
@@ -799,9 +795,7 @@ static int ksmbd_durable_scavenger(void *dummy)
break;
}
- mutex_lock(&durable_scavenger_lock);
durable_scavenger_running = false;
- mutex_unlock(&durable_scavenger_lock);
module_put(THIS_MODULE);
diff --git a/fs/splice.c b/fs/splice.c
index 90d464241f15..4d6df083e0c0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -45,7 +45,7 @@
* here if set to avoid blocking other users of this pipe if splice is
* being done on it.
*/
-static noinline void noinline pipe_clear_nowait(struct file *file)
+static noinline void pipe_clear_nowait(struct file *file)
{
fmode_t fmode = READ_ONCE(file->f_mode);
diff --git a/fs/stat.c b/fs/stat.c
index f13308bfdc98..3d9222807214 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -204,12 +204,25 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
STATX_ATTR_DAX);
idmap = mnt_idmap(path->mnt);
- if (inode->i_op->getattr)
- return inode->i_op->getattr(idmap, path, stat,
- request_mask,
- query_flags);
+ if (inode->i_op->getattr) {
+ int ret;
+
+ ret = inode->i_op->getattr(idmap, path, stat, request_mask,
+ query_flags);
+ if (ret)
+ return ret;
+ } else {
+ generic_fillattr(idmap, request_mask, inode, stat);
+ }
+
+ /*
+ * If this is a block device inode, override the filesystem attributes
+ * with the block device specific parameters that need to be obtained
+ * from the bdev backing inode.
+ */
+ if (S_ISBLK(stat->mode))
+ bdev_statx(path, stat, request_mask);
- generic_fillattr(idmap, request_mask, inode, stat);
return 0;
}
EXPORT_SYMBOL(vfs_getattr_nosec);
@@ -295,15 +308,6 @@ static int vfs_statx_path(struct path *path, int flags, struct kstat *stat,
if (path_mounted(path))
stat->attributes |= STATX_ATTR_MOUNT_ROOT;
stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
-
- /*
- * If this is a block device inode, override the filesystem
- * attributes with the block device specific parameters that need to be
- * obtained from the bdev backing inode.
- */
- if (S_ISBLK(stat->mode))
- bdev_statx(path, stat, request_mask);
-
return 0;
}
diff --git a/fs/xattr.c b/fs/xattr.c
index 02bee149ad96..fabb2a04501e 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -703,7 +703,7 @@ static int path_setxattrat(int dfd, const char __user *pathname,
return error;
filename = getname_maybe_null(pathname, at_flags);
- if (!filename) {
+ if (!filename && dfd >= 0) {
CLASS(fd, f)(dfd);
if (fd_empty(f))
error = -EBADF;
@@ -847,7 +847,7 @@ static ssize_t path_getxattrat(int dfd, const char __user *pathname,
return error;
filename = getname_maybe_null(pathname, at_flags);
- if (!filename) {
+ if (!filename && dfd >= 0) {
CLASS(fd, f)(dfd);
if (fd_empty(f))
return -EBADF;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index fffd6fffdce0..ae0ca6858496 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -3,7 +3,7 @@ config XFS_FS
tristate "XFS filesystem support"
depends on BLOCK
select EXPORTFS
- select LIBCRC32C
+ select CRC32
select FS_IOMAP
help
XFS is a high performance journaling filesystem which originated
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8e7f1b324b3b..1a2b3f06fa71 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -105,6 +105,7 @@ xfs_buf_free(
{
unsigned int size = BBTOB(bp->b_length);
+ might_sleep();
trace_xfs_buf_free(bp, _RET_IP_);
ASSERT(list_empty(&bp->b_lru));
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index b4ffd80b7cb6..dcbfa274e06d 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -165,7 +165,7 @@ xmbuf_map_backing_mem(
folio_set_dirty(folio);
folio_unlock(folio);
- bp->b_addr = folio_address(folio);
+ bp->b_addr = folio_address(folio) + offset_in_folio(folio, pos);
return 0;
}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index edbc521870a1..b4e32f0860b7 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1186,9 +1186,8 @@ xfs_qm_dqflush_done(
if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) &&
(lip->li_lsn == qlip->qli_flush_lsn ||
test_bit(XFS_LI_FAILED, &lip->li_flags))) {
-
spin_lock(&ailp->ail_lock);
- xfs_clear_li_failed(lip);
+ clear_bit(XFS_LI_FAILED, &lip->li_flags);
if (lip->li_lsn == qlip->qli_flush_lsn) {
/* xfs_ail_update_finish() drops the AIL lock */
tail_lsn = xfs_ail_delete_one(ailp, lip);
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index a4bc1642fe56..414b27a86458 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -876,6 +876,7 @@ xfs_getfsmap_rtdev_rmapbt(
const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info)
{
+ struct xfs_fsmap key0 = *keys; /* struct copy */
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rtgroup *rtg = NULL;
struct xfs_btree_cur *bt_cur = NULL;
@@ -887,32 +888,46 @@ xfs_getfsmap_rtdev_rmapbt(
int error = 0;
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks);
- if (keys[0].fmr_physical >= eofs)
+ if (key0.fmr_physical >= eofs)
return 0;
+ /*
+ * On zoned filesystems with an internal rt volume, the volume comes
+ * immediately after the end of the data volume. However, the
+ * xfs_rtblock_t address space is relative to the start of the data
+ * device, which means that the first @rtstart fsblocks do not actually
+ * point anywhere. If a fsmap query comes in with the low key starting
+ * below @rtstart, report it as "owned by filesystem".
+ */
rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart);
- if (keys[0].fmr_physical < rtstart_daddr) {
+ if (xfs_has_zoned(mp) && key0.fmr_physical < rtstart_daddr) {
struct xfs_fsmap_irec frec = {
.owner = XFS_RMAP_OWN_FS,
.len_daddr = rtstart_daddr,
};
- /* Adjust the low key if we are continuing from where we left off. */
- if (keys[0].fmr_length > 0) {
- info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length;
- return 0;
+ /*
+ * Adjust the start of the query range if we're picking up from
+ * a previous round, and only emit the record if we haven't
+ * already gone past.
+ */
+ key0.fmr_physical += key0.fmr_length;
+ if (key0.fmr_physical < rtstart_daddr) {
+ error = xfs_getfsmap_helper(tp, info, &frec);
+ if (error)
+ return error;
+
+ key0.fmr_physical = rtstart_daddr;
}
- /* Fabricate an rmap entry for space occupied by the data dev */
- error = xfs_getfsmap_helper(tp, info, &frec);
- if (error)
- return error;
+ /* Zero the other fields to avoid further adjustments. */
+ key0.fmr_owner = 0;
+ key0.fmr_offset = 0;
+ key0.fmr_length = 0;
}
- start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical);
- end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr +
- min(eofs - 1, keys[1].fmr_physical));
-
+ start_rtb = xfs_daddr_to_rtb(mp, key0.fmr_physical);
+ end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));
info->missing_owner = XFS_FMR_OWN_FREE;
/*
@@ -920,12 +935,12 @@ xfs_getfsmap_rtdev_rmapbt(
* low to the fsmap low key and max out the high key to the end
* of the rtgroup.
*/
- info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
- error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, key0.fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->low, &key0);
if (error)
return error;
- info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length);
- xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+ info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, key0.fmr_length);
+ xfs_getfsmap_set_irec_flags(&info->low, &key0);
/* Adjust the low key if we are continuing from where we left off. */
if (info->low.rm_blockcount == 0) {
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 40fc1bf900af..c6cb0b6b9e46 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -1089,13 +1089,7 @@ xfs_iflush_abort(
* state. Whilst the inode is in the AIL, it should have a valid buffer
* pointer for push operations to access - it is only safe to remove the
* inode from the buffer once it has been removed from the AIL.
- *
- * We also clear the failed bit before removing the item from the AIL
- * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer
- * references the inode item owns and needs to hold until we've fully
- * aborted the inode log item and detached it from the buffer.
*/
- clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
xfs_trans_ail_delete(&iip->ili_item, 0);
/*
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 6493bdb57351..980aabc49512 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2888,7 +2888,7 @@ xlog_force_and_check_iclog(
*
* 1. the current iclog is active and has no data; the previous iclog
* is in the active or dirty state.
- * 2. the current iclog is drity, and the previous iclog is in the
+ * 2. the current iclog is dirty, and the previous iclog is in the
* active or dirty state.
*
* We may sleep if:
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 799b84220ebb..e5192c12e7ac 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -229,6 +229,7 @@ typedef struct xfs_mount {
bool m_finobt_nores; /* no per-AG finobt resv. */
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
+ unsigned int m_zonegc_low_space;
/*
* Bitsets of per-fs metadata that have been checked and/or are sick.
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index b7e82d85f043..7a5c5ef2db92 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -718,8 +718,40 @@ max_open_zones_show(
}
XFS_SYSFS_ATTR_RO(max_open_zones);
+static ssize_t
+zonegc_low_space_store(
+ struct kobject *kobj,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ unsigned int val;
+
+ ret = kstrtouint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val > 100)
+ return -EINVAL;
+
+ zoned_to_mp(kobj)->m_zonegc_low_space = val;
+
+ return count;
+}
+
+static ssize_t
+zonegc_low_space_show(
+ struct kobject *kobj,
+ char *buf)
+{
+ return sysfs_emit(buf, "%u\n",
+ zoned_to_mp(kobj)->m_zonegc_low_space);
+}
+XFS_SYSFS_ATTR_RW(zonegc_low_space);
+
static struct attribute *xfs_zoned_attrs[] = {
ATTR_LIST(max_open_zones),
+ ATTR_LIST(zonegc_low_space),
NULL,
};
ATTRIBUTE_GROUPS(xfs_zoned);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 0fcb1828e598..85a649fec6ac 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -909,10 +909,9 @@ xfs_trans_ail_delete(
return;
}
- /* xfs_ail_update_finish() drops the AIL lock */
- xfs_clear_li_failed(lip);
+ clear_bit(XFS_LI_FAILED, &lip->li_flags);
tail_lsn = xfs_ail_delete_one(ailp, lip);
- xfs_ail_update_finish(ailp, tail_lsn);
+ xfs_ail_update_finish(ailp, tail_lsn); /* drops the AIL lock */
}
int
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index bd841df93021..f945f0450b16 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -167,32 +167,4 @@ xfs_trans_ail_copy_lsn(
}
#endif
-static inline void
-xfs_clear_li_failed(
- struct xfs_log_item *lip)
-{
- struct xfs_buf *bp = lip->li_buf;
-
- ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
- lockdep_assert_held(&lip->li_ailp->ail_lock);
-
- if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) {
- lip->li_buf = NULL;
- xfs_buf_rele(bp);
- }
-}
-
-static inline void
-xfs_set_li_failed(
- struct xfs_log_item *lip,
- struct xfs_buf *bp)
-{
- lockdep_assert_held(&lip->li_ailp->ail_lock);
-
- if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) {
- xfs_buf_hold(bp);
- lip->li_buf = bp;
- }
-}
-
#endif /* __XFS_TRANS_PRIV_H__ */
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 52af234936a2..d509e49b2aaa 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -1201,6 +1201,13 @@ xfs_mount_zones(
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
iz.available + iz.reclaimable);
+ /*
+ * The user may configure GC to free up a percentage of unused blocks.
+ * By default this is 0. GC will always trigger at the minimum level
+ * for keeping max_open_zones available for data placement.
+ */
+ mp->m_zonegc_low_space = 0;
+
error = xfs_zone_gc_mount(mp);
if (error)
goto out_free_zone_info;
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index c5136ea9bb1d..81c94dd1d596 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -162,18 +162,36 @@ struct xfs_zone_gc_data {
/*
* We aim to keep enough zones free in stock to fully use the open zone limit
- * for data placement purposes.
+ * for data placement purposes. Additionally, the m_zonegc_low_space tunable
+ * can be set to make sure a fraction of the unused blocks are available for
+ * writing.
*/
bool
xfs_zoned_need_gc(
struct xfs_mount *mp)
{
+ s64 available, free, threshold;
+ s32 remainder;
+
if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
return false;
- if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
+
+ available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
+
+ if (available <
mp->m_groups[XG_TYPE_RTG].blocks *
(mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
return true;
+
+ free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
+
+ threshold = div_s64_rem(free, 100, &remainder);
+ threshold = threshold * mp->m_zonegc_low_space +
+ remainder * div_s64(mp->m_zonegc_low_space, 100);
+
+ if (available < threshold)
+ return true;
+
return false;
}