summaryrefslogtreecommitdiffstats
path: root/include/linux/netfs.h
diff options
context:
space:
mode:
authorDavid Howells <dhowells@redhat.com>2024-03-18 16:52:05 +0000
committerDavid Howells <dhowells@redhat.com>2024-05-01 18:07:36 +0100
commit288ace2f57c9d06dd2e42bd80d03747d879a4068 (patch)
tree09b6e75dfb7b1120ce04771c6629bd5525ae446b /include/linux/netfs.h
parent7ba167c4c73ed96eb002c98a9d7d49317dfb0191 (diff)
downloadlinux-288ace2f57c9d06dd2e42bd80d03747d879a4068.tar.gz
linux-288ace2f57c9d06dd2e42bd80d03747d879a4068.tar.bz2
linux-288ace2f57c9d06dd2e42bd80d03747d879a4068.zip
netfs: New writeback implementation
The current netfslib writeback implementation creates writeback requests of contiguous folio data and then separately tiles subrequests over the space twice, once for the server and once for the cache. This creates a few issues: (1) Every time there's a discontiguity or a change between writing to only one destination or writing to both, it must create a new request. This makes it harder to do vectored writes. (2) The folios don't have the writeback mark removed until the end of the request - and a request could be hundreds of megabytes. (3) In future, I want to support a larger cache granularity, which will require aggregation of some folios that contain unmodified data (which only need to go to the cache) and some which contain modifications (which need to be uploaded and stored to the cache) - but, currently, these are treated as discontiguous. There's also a move to get everyone to use writeback_iter() to extract writable folios from the pagecache. That said, currently writeback_iter() has some issues that make it less than ideal: (1) there's no way to cancel the iteration, even if you find a "temporary" error that means the current folio and all subsequent folios are going to fail; (2) there's no way to filter the folios being written back - something that will impact Ceph with it's ordered snap system; (3) and if you get a folio you can't immediately deal with (say you need to flush the preceding writes), you are left with a folio hanging in the locked state for the duration, when really we should unlock it and relock it later. In this new implementation, I use writeback_iter() to pump folios, progressively creating two parallel, but separate streams and cleaning up the finished folios as the subrequests complete. Either or both streams can contain gaps, and the subrequests in each stream can be of variable size, don't need to align with each other and don't need to align with the folios. Indeed, subrequests can cross folio boundaries, may cover several folios or a folio may be spanned by multiple folios, e.g.: +---+---+-----+-----+---+----------+ Folios: | | | | | | | +---+---+-----+-----+---+----------+ +------+------+ +----+----+ Upload: | | |.....| | | +------+------+ +----+----+ +------+------+------+------+------+ Cache: | | | | | | +------+------+------+------+------+ The progressive subrequest construction permits the algorithm to be preparing both the next upload to the server and the next write to the cache whilst the previous ones are already in progress. Throttling can be applied to control the rate of production of subrequests - and, in any case, we probably want to write them to the server in ascending order, particularly if the file will be extended. Content crypto can also be prepared at the same time as the subrequests and run asynchronously, with the prepped requests being stalled until the crypto catches up with them. This might also be useful for transport crypto, but that happens at a lower layer, so probably would be harder to pull off. The algorithm is split into three parts: (1) The issuer. This walks through the data, packaging it up, encrypting it and creating subrequests. The part of this that generates subrequests only deals with file positions and spans and so is usable for DIO/unbuffered writes as well as buffered writes. (2) The collector. This asynchronously collects completed subrequests, unlocks folios, frees crypto buffers and performs any retries. This runs in a work queue so that the issuer can return to the caller for writeback (so that the VM can have its kswapd thread back) or async writes. (3) The retryer. This pauses the issuer, waits for all outstanding subrequests to complete and then goes through the failed subrequests to reissue them. This may involve reprepping them (with cifs, the credits must be renegotiated, and a subrequest may need splitting), and doing RMW for content crypto if there's a conflicting change on the server. [!] Note that some of the functions are prefixed with "new_" to avoid clashes with existing functions. These will be renamed in a later patch that cuts over to the new algorithm. Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> cc: Eric Van Hensbergen <ericvh@kernel.org> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org
Diffstat (limited to 'include/linux/netfs.h')
-rw-r--r--include/linux/netfs.h68
1 files changed, 67 insertions, 1 deletions
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 88269681d4fc..42dba05a428b 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -64,6 +64,7 @@ struct netfs_inode {
#if IS_ENABLED(CONFIG_FSCACHE)
struct fscache_cookie *cache;
#endif
+ struct mutex wb_lock; /* Writeback serialisation */
loff_t remote_i_size; /* Size of the remote file */
loff_t zero_point; /* Size after which we assume there's no data
* on the server */
@@ -71,7 +72,6 @@ struct netfs_inode {
#define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */
#define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */
#define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */
-#define NETFS_ICTX_NO_WRITE_STREAMING 3 /* Don't engage in write-streaming */
#define NETFS_ICTX_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark
* write to cache on read */
};
@@ -127,6 +127,33 @@ static inline struct netfs_group *netfs_folio_group(struct folio *folio)
}
/*
+ * Stream of I/O subrequests going to a particular destination, such as the
+ * server or the local cache. This is mainly intended for writing where we may
+ * have to write to multiple destinations concurrently.
+ */
+struct netfs_io_stream {
+ /* Submission tracking */
+ struct netfs_io_subrequest *construct; /* Op being constructed */
+ unsigned int submit_off; /* Folio offset we're submitting from */
+ unsigned int submit_len; /* Amount of data left to submit */
+ unsigned int submit_max_len; /* Amount I/O can be rounded up to */
+ void (*prepare_write)(struct netfs_io_subrequest *subreq);
+ void (*issue_write)(struct netfs_io_subrequest *subreq);
+ /* Collection tracking */
+ struct list_head subrequests; /* Contributory I/O operations */
+ struct netfs_io_subrequest *front; /* Op being collected */
+ unsigned long long collected_to; /* Position we've collected results to */
+ size_t transferred; /* The amount transferred from this stream */
+ enum netfs_io_source source; /* Where to read from/write to */
+ unsigned short error; /* Aggregate error for the stream */
+ unsigned char stream_nr; /* Index of stream in parent table */
+ bool avail; /* T if stream is available */
+ bool active; /* T if stream is active */
+ bool need_retry; /* T if this stream needs retrying */
+ bool failed; /* T if this stream failed */
+};
+
+/*
* Resources required to do operations on a cache.
*/
struct netfs_cache_resources {
@@ -150,13 +177,16 @@ struct netfs_io_subrequest {
struct list_head rreq_link; /* Link in rreq->subrequests */
struct iov_iter io_iter; /* Iterator for this subrequest */
unsigned long long start; /* Where to start the I/O */
+ size_t max_len; /* Maximum size of the I/O */
size_t len; /* Size of the I/O */
size_t transferred; /* Amount of data transferred */
refcount_t ref;
short error; /* 0 or error that occurred */
unsigned short debug_index; /* Index in list (for debugging output) */
+ unsigned int nr_segs; /* Number of segs in io_iter */
unsigned int max_nr_segs; /* 0 or max number of segments in an iterator */
enum netfs_io_source source; /* Where to read from/write to */
+ unsigned char stream_nr; /* I/O stream this belongs to */
unsigned long flags;
#define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */
#define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */
@@ -164,6 +194,11 @@ struct netfs_io_subrequest {
#define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */
#define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */
#define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */
+#define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */
+#define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */
+#define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */
+#define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */
+#define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */
};
enum netfs_io_origin {
@@ -194,6 +229,9 @@ struct netfs_io_request {
struct netfs_cache_resources cache_resources;
struct list_head proc_link; /* Link in netfs_iorequests */
struct list_head subrequests; /* Contributory I/O operations */
+ struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */
+#define NR_IO_STREAMS 2 //wreq->nr_io_streams
+ struct netfs_group *group; /* Writeback group being written back */
struct iov_iter iter; /* Unencrypted-side iterator */
struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */
void *netfs_priv; /* Private data for the netfs */
@@ -203,6 +241,8 @@ struct netfs_io_request {
unsigned int rsize; /* Maximum read size (0 for none) */
unsigned int wsize; /* Maximum write size (0 for none) */
atomic_t subreq_counter; /* Next subreq->debug_index */
+ unsigned int nr_group_rel; /* Number of refs to release on ->group */
+ spinlock_t lock; /* Lock for queuing subreqs */
atomic_t nr_outstanding; /* Number of ops in progress */
atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */
size_t upper_len; /* Length can be extended to here */
@@ -214,6 +254,10 @@ struct netfs_io_request {
bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */
unsigned long long i_size; /* Size of the file */
unsigned long long start; /* Start position */
+ atomic64_t issued_to; /* Write issuer folio cursor */
+ unsigned long long contiguity; /* Tracking for gaps in the writeback sequence */
+ unsigned long long collected_to; /* Point we've collected to */
+ unsigned long long cleaned_to; /* Position we've cleaned folios to */
pgoff_t no_unlock_folio; /* Don't unlock this folio after read */
refcount_t ref;
unsigned long flags;
@@ -227,6 +271,9 @@ struct netfs_io_request {
#define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */
#define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */
#define NETFS_RREQ_BLOCKED 10 /* We blocked */
+#define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */
+#define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */
+#define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */
#define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark
* write to cache on read */
const struct netfs_request_ops *netfs_ops;
@@ -258,6 +305,9 @@ struct netfs_request_ops {
/* Write request handling */
void (*create_write_requests)(struct netfs_io_request *wreq,
loff_t start, size_t len);
+ void (*begin_writeback)(struct netfs_io_request *wreq);
+ void (*prepare_write)(struct netfs_io_subrequest *subreq);
+ void (*issue_write)(struct netfs_io_subrequest *subreq);
void (*invalidate_cache)(struct netfs_io_request *wreq);
};
@@ -292,6 +342,9 @@ struct netfs_cache_ops {
netfs_io_terminated_t term_func,
void *term_func_priv);
+ /* Write data to the cache from a netfs subrequest. */
+ void (*issue_write)(struct netfs_io_subrequest *subreq);
+
/* Expand readahead request */
void (*expand_readahead)(struct netfs_cache_resources *cres,
unsigned long long *_start,
@@ -304,6 +357,13 @@ struct netfs_cache_ops {
enum netfs_io_source (*prepare_read)(struct netfs_io_subrequest *subreq,
unsigned long long i_size);
+ /* Prepare a write subrequest, working out if we're allowed to do it
+ * and finding out the maximum amount of data to gather before
+ * attempting to submit. If we're not permitted to do it, the
+ * subrequest should be marked failed.
+ */
+ void (*prepare_write_subreq)(struct netfs_io_subrequest *subreq);
+
/* Prepare a write operation, working out what part of the write we can
* actually do.
*/
@@ -349,6 +409,8 @@ int netfs_write_begin(struct netfs_inode *, struct file *,
struct folio **, void **fsdata);
int netfs_writepages(struct address_space *mapping,
struct writeback_control *wbc);
+int new_netfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc);
bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio);
int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc);
void netfs_clear_inode_writeback(struct inode *inode, const void *aux);
@@ -372,8 +434,11 @@ size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
struct netfs_io_subrequest *netfs_create_write_request(
struct netfs_io_request *wreq, enum netfs_io_source dest,
loff_t start, size_t len, work_func_t worker);
+void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq);
void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
bool was_async);
+void new_netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+ bool was_async);
void netfs_queue_write_request(struct netfs_io_subrequest *subreq);
int netfs_start_io_read(struct inode *inode);
@@ -415,6 +480,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx,
#if IS_ENABLED(CONFIG_FSCACHE)
ctx->cache = NULL;
#endif
+ mutex_init(&ctx->wb_lock);
/* ->releasepage() drives zero_point */
if (use_zero_point) {
ctx->zero_point = ctx->remote_i_size;