diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 5247df896c5d01..0ce4d791763289 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -203,6 +203,77 @@ static const struct file_operations fuse_conn_congestion_threshold_ops = {
 	.write = fuse_conn_congestion_threshold_write,
 };
 
+/*
+ * Report writethrough_threshold through the control file. Yields an empty
+ * read (returns 0) when no connection can be obtained for the file.
+ */
+static ssize_t fuse_conn_writethrough_threshold_read(struct file *file,
+						     char __user *buf,
+						     size_t len, loff_t *ppos)
+{
+	struct fuse_conn *fc;
+	unsigned val;
+
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		return 0;
+
+	val = READ_ONCE(fc->writethrough_threshold);
+	fuse_conn_put(fc);
+
+	return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+/*
+ * Set writethrough_threshold from one write at offset 0. The input must
+ * start with a decimal digit, is parsed by memparse() (so a size suffix is
+ * accepted), may be followed by whitespace only, and must fit in an
+ * unsigned int. Returns -ENOENT if no connection can be obtained.
+ */
+static ssize_t fuse_conn_writethrough_threshold_write(struct file *file,
+						      const char __user *buf,
+						      size_t count, loff_t *ppos)
+{
+	struct fuse_conn *fc;
+	char kbuf[32];
+	unsigned long long val;
+	char *end;
+
+	if (*ppos)
+		return -EINVAL;
+	if (count == 0 || count >= sizeof(kbuf))
+		return -EINVAL;
+	if (copy_from_user(kbuf, buf, count))
+		return -EFAULT;
+	kbuf[count] = '\0';
+
+	/* memparse accepts a bare suffix without a digit; require a digit */
+	if (kbuf[0] < '0' || kbuf[0] > '9')
+		return -EINVAL;
+
+	val = memparse(kbuf, &end);
+	end = skip_spaces(end);
+	if (*end)
+		return -EINVAL;
+	if (val > UINT_MAX)
+		return -EINVAL;
+
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		return -ENOENT;
+
+	WRITE_ONCE(fc->writethrough_threshold, (unsigned int)val);
+	fuse_conn_put(fc);
+
+	return count;
+}
+
+static const struct file_operations fuse_conn_writethrough_threshold_ops = {
+	.open = nonseekable_open,
+	.read = fuse_conn_writethrough_threshold_read,
+	.write = fuse_conn_writethrough_threshold_write,
+};
+
 static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 					  struct fuse_conn *fc,
 					  const char *name, int mode,
@@ -269,7 +340,10 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
 				 NULL, &fuse_conn_max_background_ops) ||
 	    !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
 				 S_IFREG | 0600, NULL,
-				 &fuse_conn_congestion_threshold_ops))
+				 &fuse_conn_congestion_threshold_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "writethrough_threshold",
+				 S_IFREG | 0600, NULL,
+				 &fuse_conn_writethrough_threshold_ops))
 		goto err;
 
 	return 0;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 467093acadbb89..ee7aa25e12ebbe 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1666,6 +1666,76 @@ static ssize_t fuse_writeback_write_iter(struct kiocb *iocb,
 	return written < 0 ? written : total_written;
 }
 
+/*
+ * With writeback caching the request size seen by the server depends on
+ * how many contiguous dirty pages the flusher finds, which is bounded by
+ * dirty throttling: with BDI_CAP_STRICTLIMIT the dirty window can degrade
+ * to a single page under streaming writes, turning large application
+ * writes into page-sized requests.
+ *
+ * Writes that already match the server's preferred alignment gain
+ * nothing from accumulating in the page cache, so send them through
+ * fuse_perform_write() instead, which packs requests up to max_write.
+ * They create no dirty pages, hence no DLM write lock needs to be cached
+ * for them. Unaligned writes keep using the writeback cache, where they
+ * can merge with neighbouring data.
+ *
+ * A non-zero writethrough_threshold additionally forces any write at or
+ * above that size through fuse_perform_write() regardless of alignment.
+ */
+static bool fuse_use_writeback_cache(struct fuse_conn *fc, struct kiocb *iocb,
+				     struct iov_iter *from)
+{
+	size_t count = iov_iter_count(from);
+	unsigned int wt;
+	u64 align;
+	bool ret;
+
+	if (!fc->big_writes)
+		return true;
+
+	/* these rely on the semantics of their current paths */
+	if (iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND | IOCB_NOWAIT))
+		return true;
+
+	wt = READ_ONCE(fc->writethrough_threshold);
+	if (wt && count >= wt)
+		return false;
+
+	align = fc->alignment_pages ?
+		(u64)fc->alignment_pages << PAGE_SHIFT : PAGE_SIZE;
+
+	ret = !IS_ALIGNED(iocb->ki_pos, align) || !IS_ALIGNED((u64)count, align);
+	return ret;
+}
+
+/*
+ * @return true if an exclusive lock is needed for a cached (buffered) write.
+ */
+static bool fuse_cache_wr_exclusive_lock(struct kiocb *iocb, bool cache_mode)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	/*
+	 * Without the DLM the inode rwsem is the only writer exclusion, and
+	 * outside writeback-cache mode the write is covered by neither the DLM
+	 * write lock nor the synchronous-server path described above.
+	 */
+	if (!fc->dlm || !cache_mode)
+		return true;
+
+	/* O_DIRECT writes fall back to generic_file_direct_write(). */
+	if (iocb->ki_flags & IOCB_DIRECT)
+		return true;
+
+	/* Append needs the eventual EOF - always needs an exclusive lock. */
+	if (iocb->ki_flags & IOCB_APPEND)
+		return true;
+
+	return false;
+}
+
 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
@@ -1675,7 +1745,10 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct inode *inode = mapping->host;
 	ssize_t err, count;
 	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	bool writeback = false;
+	bool cache_mode = false;
+	bool exclusive;
 
 	if (fc->writeback_cache) {
 		/* Update size (EOF optimization) and mode (SUID clearing) */
@@ -1684,8 +1757,10 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		if (err)
 			return err;
 
-		if (!fc->handle_killpriv_v2 ||
-		    !setattr_should_drop_suidgid(idmap, file_inode(file))) {
+		cache_mode = !fc->handle_killpriv_v2 ||
+			!setattr_should_drop_suidgid(idmap, file_inode(file));
+
+		if (cache_mode && fuse_use_writeback_cache(fc, iocb, from)) {
 			writeback = true;
 
 			/*
@@ -1713,7 +1788,12 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		}
 	}
 
-	inode_lock(inode);
+	exclusive = fuse_cache_wr_exclusive_lock(iocb, cache_mode);
+	if (exclusive) {
+		inode_lock(inode);
+	} else {
+		inode_lock_shared(inode);
+	}
 
 	err = count = generic_write_checks(iocb, from);
 	if (err <= 0)
@@ -1732,7 +1812,58 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		written = direct_write_fallback(iocb, from, written,
 				fuse_perform_write(iocb, from));
 	} else if (writeback) {
+		loff_t pos = iocb->ki_pos;
+		loff_t end = pos + count;
+		loff_t orig_size = 0;
+		bool extended = false;
+
+		if (fc->dlm && !exclusive && end > i_size_read(inode)) {
+			/*
+			 * Lockless pre-check above keeps in-bounds writes off
+			 * fi->lock; under the shared inode lock i_size only grows
+			 * (extenders take fi->lock, truncate is excluded), so a
+			 * stale read can only over-trigger this slow path, never
+			 * miss an extension. Re-check authoritatively here.
+			 */
+			spin_lock(&fi->lock);
+			orig_size = i_size_read(inode);
+			if (end > orig_size) {
+				i_size_write(inode, end);
+				extended = true;
+			}
+			spin_unlock(&fi->lock);
+
+			/* Zero the tail of the folio straddling the old EOF. */
+			if (extended && orig_size < pos)
+				pagecache_isize_extended(inode, orig_size, pos);
+		}
+
 		written = fuse_writeback_write_iter(iocb, from, file);
+
+		/*
+		 * Reconcile the speculative extension with what was actually
+		 * written. Only retract the tail if no concurrent extender has
+		 * pushed i_size past our claim; otherwise [reached, end) is a
+		 * legitimate hole inside their extension and must remain.
+		 */
+		if (extended) {
+			/*
+			 * A short write that ends inside the old file must not
+			 * pull i_size below what it was before we extended it.
+			 */
+			loff_t reached = orig_size;
+
+			if (written > 0)
+				reached = max_t(loff_t, pos + written, orig_size);
+
+			if (reached < end) {
+				spin_lock(&fi->lock);
+				if (i_size_read(inode) == end)
+					i_size_write(inode, reached);
+				spin_unlock(&fi->lock);
+			}
+		}
+
 		if (written < 0) {
 			err = written;
 			goto out;
@@ -1741,7 +1872,10 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		written = fuse_perform_write(iocb, from);
 	}
 out:
-	inode_unlock(inode);
+	if (exclusive)
+		inode_unlock(inode);
+	else
+		inode_unlock_shared(inode);
 	if (written > 0)
 		written = generic_write_sync(iocb, written);
 
@@ -2114,19 +2248,15 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
 	struct fuse_args_pages *ap = &wpa->ia.ap;
 	struct inode *inode = wpa->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	int i;
 
-	for (i = 0; i < ap->num_folios; i++) {
+	for (i = 0; i < ap->num_folios; i++)
 		/*
 		 * Benchmarks showed that ending writeback within the
 		 * scope of the fi->lock alleviates xarray lock
 		 * contention and noticeably improves performance.
 		 */
 		iomap_finish_folio_write(inode, ap->folios[i], 1);
-		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-		wb_writeout_inc(&bdi->wb);
-	}
 
 	wake_up(&fi->page_waitq);
 }
@@ -2301,14 +2431,11 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
 static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio,
 					  uint32_t folio_index, loff_t offset, unsigned len)
 {
-	struct inode *inode = folio->mapping->host;
 	struct fuse_args_pages *ap = &wpa->ia.ap;
 
 	ap->folios[folio_index] = folio;
 	ap->descs[folio_index].offset = offset;
 	ap->descs[folio_index].length = len;
-
-	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
 }
 
 static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio,
@@ -3506,6 +3633,20 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
 	if (IS_ENABLED(CONFIG_FUSE_DAX))
 		fuse_dax_inode_init(inode, flags);
 
-	if (enable_large_folios)
-		mapping_set_large_folios(inode->i_mapping);
+	if (enable_large_folios) {
+		/*
+		 * Readahead and writeback batch whole folios into a single
+		 * request, capped at min(fc->max_pages, fc->max_read/PAGE_SIZE)
+		 * pages. The page cache must therefore never build a folio
+		 * larger than that, or fuse_readahead() trips WARN_ON(!pages)
+		 * and then dereferences a NULL ap->folios[0] in
+		 * fuse_send_readpages(). Bound the folio order to the request
+		 * limit instead of MAX_PAGECACHE_ORDER.
+		 */
+		unsigned int max_pages = min(fc->max_pages,
+					     fc->max_read >> PAGE_SHIFT);
+
+		mapping_set_folio_order_range(inode->i_mapping, 0,
+					      ilog2(max_pages ?: 1));
+	}
 }
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 365c28cb282146..0c7196270538d2 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -47,7 +47,7 @@
 #define FUSE_NAME_MAX (PATH_MAX - 1)
 
 /** Number of dentries for each connection in the control filesystem */
-#define FUSE_CTL_NUM_DENTRIES 5
+#define FUSE_CTL_NUM_DENTRIES 6
 
 /* Frequency (in seconds) of request timeout checks, if opted into */
 #define FUSE_TIMEOUT_TIMER_FREQ 15
@@ -1045,6 +1045,9 @@ struct fuse_conn {
 	/* The foffset alignment in PAGE */
 	unsigned int alignment_pages;
 
+	/* Buffered writes >= this size bypass the writeback cache (0 = off) */
+	unsigned int writethrough_threshold;
+
 	/**
 	 * XArray tracking tasks that need DLM retry.
 	 * Maps task pointer -> struct fuse_dlm_retry.
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6a94de9528210f..70cf89c9fd6bd9 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1704,6 +1704,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 		else
 			fm->sb->s_bdi->ra_pages =
 					min(fm->sb->s_bdi->ra_pages, ra_pages);
+		fm->sb->s_bdi->io_pages = fc->max_pages;
 		fc->minor = arg->minor;
 		fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
 		fc->max_write = max_t(unsigned, 4096, fc->max_write);
@@ -1834,7 +1835,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 
-	/* fuse does it's own writeback accounting */
-	sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
-	sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT;
+	/* strictlimit can shrink the dirty window to single-page writes */
+	sb->s_bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
 
 	/*
 	 * For a single fuse filesystem use max 1% of dirty +
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e721148c95d07d..9a1e895dd5df1b 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -66,16 +66,6 @@ static inline void wb_stat_mod(struct bdi_writeback *wb,
 	percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
 }
 
-static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
-{
-	wb_stat_mod(wb, item, 1);
-}
-
-static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
-{
-	wb_stat_mod(wb, item, -1);
-}
-
 static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
 	return percpu_counter_read_positive(&wb->stat[item]);
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 23374de18e2fd9..0493b39af4adbb 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -445,8 +445,8 @@ struct fuse_file_lock {
  *			 init_out.request_timeout contains the timeout (in secs)
  * FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation
  * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation
- * FUSE_ALIGN_PG_ORDER: page order (power of 2 exponent for number of pages) for
- *			optimal io-size alignment
+ * FUSE_ALIGN_PG_ORDER: alignment order (power of 2 exponent of the IO size
+ *			in bytes) for optimal io-size alignment
  * FUSE_URING_REDUCED_Q: Client (kernel) supports less queues - Server is free
  *			 to register between 1 and nr-core io-uring queues
  */
@@ -947,7 +947,8 @@ struct fuse_init_in {
 #define FUSE_COMPAT_22_INIT_OUT_SIZE 24
 
 /*
- * align_page_order: Number of pages for optimal IO, or a multiple of that
+ * align_page_order: log2 of the optimal IO size in bytes; IO is optimal
+ *		     when sized and aligned to (1 << align_page_order) or a multiple of it
  */
 struct fuse_init_out {
 	uint32_t	major;