diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 5247df896c5d01..0ce4d791763289 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -203,6 +203,67 @@ static const struct file_operations fuse_conn_congestion_threshold_ops = {
 	.write = fuse_conn_congestion_threshold_write,
 };
 
+static ssize_t fuse_conn_writethrough_threshold_read(struct file *file,
+						     char __user *buf,
+						     size_t len, loff_t *ppos)
+{
+	struct fuse_conn *conn = fuse_ctl_file_conn_get(file);
+	unsigned threshold;
+
+	if (!conn)
+		return 0;
+
+	/* Lockless snapshot; pairs with WRITE_ONCE() in the write handler. */
+	threshold = READ_ONCE(conn->writethrough_threshold);
+	fuse_conn_put(conn);
+
+	return fuse_conn_limit_read(file, buf, len, ppos, threshold);
+}
+
+static ssize_t fuse_conn_writethrough_threshold_write(struct file *file,
+						      const char __user *buf,
+						      size_t count, loff_t *ppos)
+{
+	unsigned long long threshold;
+	struct fuse_conn *conn;
+	char text[32];
+	char *tail;
+
+	/* Whole value in one write at offset 0, leaving room for the NUL. */
+	if (*ppos || !count || count > sizeof(text) - 1)
+		return -EINVAL;
+	if (copy_from_user(text, buf, count))
+		return -EFAULT;
+	text[count] = '\0';
+
+	/* memparse() would take a lone size suffix; insist on a digit first. */
+	if (!(text[0] >= '0' && text[0] <= '9'))
+		return -EINVAL;
+
+	/*
+	 * Only blanks may trail the value, which must fit an unsigned int.
+	 * NOTE(review): confirm memparse() cannot wrap before this range check.
+	 */
+	threshold = memparse(text, &tail);
+	if (*skip_spaces(tail) || threshold > UINT_MAX)
+		return -EINVAL;
+
+	conn = fuse_ctl_file_conn_get(file);
+	if (!conn)
+		return -ENOENT;
+
+	WRITE_ONCE(conn->writethrough_threshold, (unsigned int)threshold);
+	fuse_conn_put(conn);
+
+	return count;
+}
+
+static const struct file_operations fuse_conn_writethrough_threshold_ops = {
+	.open = nonseekable_open, /* backs the "writethrough_threshold" control file */
+	.read = fuse_conn_writethrough_threshold_read, /* current threshold */
+	.write = fuse_conn_writethrough_threshold_write, /* parse and store a new one */
+};
+
 static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 					  struct fuse_conn *fc,
 					  const char *name, int mode,
@@ -269,7 +330,10 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
 				 NULL, &fuse_conn_max_background_ops) ||
 	    !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
 				 S_IFREG | 0600, NULL,
-				 &fuse_conn_congestion_threshold_ops))
+				 &fuse_conn_congestion_threshold_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "writethrough_threshold",
+				 S_IFREG | 0600, NULL,
+				 &fuse_conn_writethrough_threshold_ops))
 		goto err;
 
 	return 0;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 467093acadbb89..ee7aa25e12ebbe 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1666,6 +1666,76 @@ static ssize_t fuse_writeback_write_iter(struct kiocb *iocb,
 	return written < 0 ? written : total_written;
 }
 
+/*
+ * With writeback caching the request size seen by the server depends on
+ * how many contiguous dirty pages the flusher finds, which is bounded by
+ * dirty throttling: with BDI_CAP_STRICTLIMIT the dirty window can degrade
+ * to a single page under streaming writes, turning large application
+ * writes into page-sized requests.
+ *
+ * Writes that already match the server's preferred alignment gain
+ * nothing from accumulating in the page cache, so send them through
+ * fuse_perform_write() instead, which packs requests up to max_write.
+ * They create no dirty pages, hence no DLM write lock needs to be cached
+ * for them.  Unaligned writes keep using the writeback cache, where they
+ * can merge with neighbouring data.
+ *
+ * A non-zero writethrough_threshold additionally forces any write at or
+ * above that size through fuse_perform_write() regardless of alignment.
+ */
+static bool fuse_use_writeback_cache(struct fuse_conn *fc, struct kiocb *iocb,
+				     struct iov_iter *from)
+{
+	const size_t len = iov_iter_count(from);
+	unsigned int threshold;
+	u64 align = PAGE_SIZE;
+
+	if (!fc->big_writes)
+		return true;
+
+	/* Direct, append and nowait writes keep their existing paths. */
+	if (iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND | IOCB_NOWAIT))
+		return true;
+
+	/* Sampled once; the control file may change it concurrently. */
+	threshold = READ_ONCE(fc->writethrough_threshold);
+	if (threshold != 0 && len >= threshold)
+		return false;
+
+	if (fc->alignment_pages)
+		align = (u64)fc->alignment_pages << PAGE_SHIFT;
+
+	/* Use the cache unless both the offset and the length are aligned. */
+	return !(IS_ALIGNED(iocb->ki_pos, align) && IS_ALIGNED((u64)len, align));
+}
+
+/*
+ * Pick the inode lock flavour fuse_cache_write_iter() takes for a write.
+ * @iocb:       the write; its file's connection and ki_flags are examined
+ * @cache_mode: caller's flag that the write runs in writeback-cache mode
+ * @return true if an exclusive lock is needed for a cached (buffered) write,
+ *         false if the shared inode lock is sufficient.
+ */
+static bool fuse_cache_wr_exclusive_lock(struct kiocb *iocb, bool cache_mode)
+{
+	struct fuse_conn *conn = get_fuse_conn(file_inode(iocb->ki_filp));
+
+	/*
+	 * A shared lock is only an option with the DLM and in writeback-cache
+	 * mode.  Without the DLM the inode rwsem is the only writer
+	 * exclusion, and outside that mode the write is covered by neither
+	 * the DLM write lock nor the synchronous-server path.
+	 */
+	if (!(conn->dlm && cache_mode))
+		return true;
+
+	/*
+	 * O_DIRECT writes fall back to generic_file_direct_write() and an
+	 * append needs the eventual EOF, so both always lock exclusively.
+	 */
+	return (iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) != 0;
+}
+
 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
@@ -1675,7 +1745,10 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct inode *inode = mapping->host;
 	ssize_t err, count;
 	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	bool writeback = false;
+	bool cache_mode = false;
+	bool exclusive;
 
 	if (fc->writeback_cache) {
 		/* Update size (EOF optimization) and mode (SUID clearing) */
@@ -1684,8 +1757,10 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		if (err)
 			return err;
 
-		if (!fc->handle_killpriv_v2 ||
-		    !setattr_should_drop_suidgid(idmap, file_inode(file))) {
+		cache_mode = !fc->handle_killpriv_v2 ||
+			     !setattr_should_drop_suidgid(idmap, file_inode(file));
+
+		if (cache_mode && fuse_use_writeback_cache(fc, iocb, from)) {
 			writeback = true;
 
 			/*
@@ -1713,7 +1788,12 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		}
 	}
 
-	inode_lock(inode);
+	exclusive = fuse_cache_wr_exclusive_lock(iocb, cache_mode);
+	if (exclusive) {
+		inode_lock(inode);
+	} else {
+		inode_lock_shared(inode);
+	}
 
 	err = count = generic_write_checks(iocb, from);
 	if (err <= 0)
@@ -1732,7 +1812,51 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		written = direct_write_fallback(iocb, from, written,
 						fuse_perform_write(iocb, from));
 	} else if (writeback) {
+		loff_t pos = iocb->ki_pos;
+		loff_t end = pos + count;
+		loff_t orig_size = 0;
+		bool extended = false;
+
+		if (fc->dlm && !exclusive && end > i_size_read(inode)) {
+			/*
+			 * Lockless pre-check above keeps in-bounds writes off
+			 * fi->lock; under the shared inode lock i_size only grows
+			 * (extenders take fi->lock, truncate is excluded), so a
+			 * stale read can only over-trigger this slow path, never
+			 * miss an extension.  Re-check authoritatively here.
+			 */
+			spin_lock(&fi->lock);
+			orig_size = i_size_read(inode);
+			if (end > orig_size) {
+				i_size_write(inode, end);
+				extended = true;
+			}
+			spin_unlock(&fi->lock);
+
+			/* Zero the tail of the folio straddling the old EOF. */
+			if (extended && orig_size < pos)
+				pagecache_isize_extended(inode, orig_size, pos);
+		}
+
 		written = fuse_writeback_write_iter(iocb, from, file);
+
+		/*
+		 * Reconcile the speculative i_size with what was written, but
+		 * never below orig_size.  Retract only while i_size is still our
+		 * claim: past it, [reached, end) is a hole in another extension.
+		 */
+		if (extended) {
+			loff_t reached = written > 0 ?
+				max_t(loff_t, pos + written, orig_size) : orig_size;
+
+			if (reached < end) {
+				spin_lock(&fi->lock);
+				if (i_size_read(inode) == end)
+					i_size_write(inode, reached);
+				spin_unlock(&fi->lock);
+			}
+		}
+
 		if (written < 0) {
 			err = written;
 			goto out;
@@ -1741,7 +1865,10 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		written = fuse_perform_write(iocb, from);
 	}
 out:
-	inode_unlock(inode);
+	if (exclusive)
+		inode_unlock(inode);
+	else
+		inode_unlock_shared(inode);
 	if (written > 0)
 		written = generic_write_sync(iocb, written);
 
@@ -2114,19 +2241,15 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
 	struct fuse_args_pages *ap = &wpa->ia.ap;
 	struct inode *inode = wpa->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	int i;
 
-	for (i = 0; i < ap->num_folios; i++) {
+	for (i = 0; i < ap->num_folios; i++)
 		/*
 		 * Benchmarks showed that ending writeback within the
 		 * scope of the fi->lock alleviates xarray lock
 		 * contention and noticeably improves performance.
 		 */
 		iomap_finish_folio_write(inode, ap->folios[i], 1);
-		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-		wb_writeout_inc(&bdi->wb);
-	}
 
 	wake_up(&fi->page_waitq);
 }
@@ -2301,14 +2424,11 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
 static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio,
 					  uint32_t folio_index, loff_t offset, unsigned len)
 {
-	struct inode *inode = folio->mapping->host;
 	struct fuse_args_pages *ap = &wpa->ia.ap;
 
 	ap->folios[folio_index] = folio;
 	ap->descs[folio_index].offset = offset;
 	ap->descs[folio_index].length = len;
-
-	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
 }
 
 static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio,
@@ -3506,6 +3626,20 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
 	if (IS_ENABLED(CONFIG_FUSE_DAX))
 		fuse_dax_inode_init(inode, flags);
 
-	if (enable_large_folios)
-		mapping_set_large_folios(inode->i_mapping);
+	if (enable_large_folios) {
+		/*
+		 * Readahead and writeback batch whole folios into a single
+		 * request, capped at min(fc->max_pages, fc->max_read/PAGE_SIZE)
+		 * pages.  The page cache must therefore never build a folio
+		 * larger than that, or fuse_readahead() trips WARN_ON(!pages)
+		 * and then dereferences a NULL ap->folios[0] in
+		 * fuse_send_readpages().  Bound the folio order to the request
+		 * limit instead of MAX_PAGECACHE_ORDER.
+		 */
+		unsigned int max_pages = min(fc->max_pages,
+					     fc->max_read >> PAGE_SHIFT);
+
+		mapping_set_folio_order_range(inode->i_mapping, 0,
+					      ilog2(max_pages ?: 1));
+	}
 }
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 365c28cb282146..0c7196270538d2 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -47,7 +47,7 @@
 #define FUSE_NAME_MAX (PATH_MAX - 1)
 
 /** Number of dentries for each connection in the control filesystem */
-#define FUSE_CTL_NUM_DENTRIES 5
+#define FUSE_CTL_NUM_DENTRIES 6
 
 /* Frequency (in seconds) of request timeout checks, if opted into */
 #define FUSE_TIMEOUT_TIMER_FREQ 15
@@ -1045,6 +1045,9 @@ struct fuse_conn {
 	/* The foffset alignment in PAGE */
 	unsigned int alignment_pages;
 
+	/* Buffered writes >= this size bypass the writeback cache (0 = off) */
+	unsigned int writethrough_threshold;
+
 	/**
 	 * XArray tracking tasks that need DLM retry.
 	 * Maps task pointer -> struct fuse_dlm_retry.
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6a94de9528210f..70cf89c9fd6bd9 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1704,6 +1704,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 		else
 			fm->sb->s_bdi->ra_pages =
 				min(fm->sb->s_bdi->ra_pages, ra_pages);
+		fm->sb->s_bdi->io_pages = fc->max_pages;
 		fc->minor = arg->minor;
 		fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
 		fc->max_write = max_t(unsigned, 4096, fc->max_write);
@@ -1834,7 +1835,9 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 
-	/* fuse does it's own writeback accounting */
-	sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
-	sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT;
+	/*
+	 * Leave BDI_CAP_WRITEBACK_ACCT alone: fuse no longer maintains the wb
+	 * writeback stats itself, so the core has to account writeback.
+	 */
+	sb->s_bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
 
 	/*
 	 * For a single fuse filesystem use max 1% of dirty +
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e721148c95d07d..9a1e895dd5df1b 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -66,16 +66,6 @@ static inline void wb_stat_mod(struct bdi_writeback *wb,
 	percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
 }
 
-static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
-{
-	wb_stat_mod(wb, item, 1);
-}
-
-static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
-{
-	wb_stat_mod(wb, item, -1);
-}
-
 static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
 	return percpu_counter_read_positive(&wb->stat[item]);
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 23374de18e2fd9..0493b39af4adbb 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -445,8 +445,8 @@ struct fuse_file_lock {
  *			 init_out.request_timeout contains the timeout (in secs)
  * FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation
  * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation
- * FUSE_ALIGN_PG_ORDER: page order (power of 2 exponent for number of pages) for
- *			optimal io-size alignment
+ * FUSE_ALIGN_PG_ORDER: alignment order (power of 2 exponent of the IO size
+ *			in bytes) for optimal io-size alignment
  * FUSE_URING_REDUCED_Q: Client (kernel) supports less queues - Server is free
  *			 to register between 1 and nr-core io-uring queues
  */
@@ -947,7 +947,8 @@ struct fuse_init_in {
 #define FUSE_COMPAT_22_INIT_OUT_SIZE 24
 
 /*
- * align_page_order: Number of pages for optimal IO, or a multiple of that
+ * align_page_order: log2 of the optimal IO size in bytes; IO is optimal
+ * when sized and aligned to (1 << align_page_order) or a multiple of it
  */
 struct fuse_init_out {
 	uint32_t	major;