Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 65 additions & 1 deletion fs/fuse/control.c
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,67 @@ static const struct file_operations fuse_conn_congestion_threshold_ops = {
.write = fuse_conn_congestion_threshold_write,
};

/*
 * Read handler for the writethrough_threshold control file.
 *
 * Takes a reference on the connection behind @file, samples its
 * writethrough_threshold, drops the reference again and hands the sampled
 * value to fuse_conn_limit_read() to produce the data returned to userspace.
 *
 * Returns 0 when no connection can be obtained for @file; otherwise
 * whatever fuse_conn_limit_read() returns.
 */
static ssize_t fuse_conn_writethrough_threshold_read(struct file *file,
						      char __user *buf,
						      size_t len, loff_t *ppos)
{
	struct fuse_conn *conn = fuse_ctl_file_conn_get(file);
	unsigned threshold;

	if (conn == NULL)
		return 0;

	/* Snapshot the value so the reference can be dropped before copying. */
	threshold = READ_ONCE(conn->writethrough_threshold);
	fuse_conn_put(conn);

	return fuse_conn_limit_read(file, buf, len, ppos, threshold);
}

/*
 * Write handler for the writethrough_threshold control file.
 *
 * Accepts an unsigned byte count in any radix detected by the kernel integer
 * parser (decimal, 0x-prefixed hex, 0-prefixed octal), optionally followed by
 * a single memparse()-style size suffix (K, M, G, T, P or E, either case)
 * and trailing whitespace. The result must fit in an unsigned int; 0 turns
 * the threshold off.
 *
 * Returns @count on success, -EINVAL for a non-zero *ppos, an empty or
 * oversized write, malformed input or an out-of-range value, -EFAULT when
 * the user buffer cannot be copied, and -ENOENT when no connection can be
 * obtained for @file.
 */
static ssize_t fuse_conn_writethrough_threshold_write(struct file *file,
						       const char __user *buf,
						       size_t count, loff_t *ppos)
{
	struct fuse_conn *fc;
	char kbuf[32];
	unsigned long long val;
	unsigned int shift = 0;
	char suffix;
	char *end;

	if (*ppos)
		return -EINVAL;
	if (count == 0 || count >= sizeof(kbuf))
		return -EINVAL;
	if (copy_from_user(kbuf, buf, count))
		return -EFAULT;
	kbuf[count] = '\0';

	/* A bare suffix, sign or leading blank is not a number; require a digit */
	if (kbuf[0] < '0' || kbuf[0] > '9')
		return -EINVAL;

	/*
	 * memparse() wraps silently on overflow, both while accumulating
	 * digits and while applying the suffix shift, so e.g. "16E" would be
	 * stored as 0 and switch the threshold off.  Use simple_strtoull()
	 * only to find where the digits stop, then convert that span with the
	 * overflow-checked kstrtoull().
	 */
	simple_strtoull(kbuf, &end, 0);
	suffix = *end;
	*end = '\0';
	if (kstrtoull(kbuf, 0, &val))
		return -EINVAL;
	*end = suffix;

	/* Same suffix set and scale factors as memparse(). */
	switch (suffix) {
	case 'E':
	case 'e':
		shift = 60;
		break;
	case 'P':
	case 'p':
		shift = 50;
		break;
	case 'T':
	case 't':
		shift = 40;
		break;
	case 'G':
	case 'g':
		shift = 30;
		break;
	case 'M':
	case 'm':
		shift = 20;
		break;
	case 'K':
	case 'k':
		shift = 10;
		break;
	default:
		break;
	}
	if (shift)
		end++;

	end = skip_spaces(end);
	if (*end)
		return -EINVAL;

	/* Range-check before scaling so the shift itself can never wrap. */
	if (val > ((unsigned long long)UINT_MAX >> shift))
		return -EINVAL;
	val <<= shift;

	fc = fuse_ctl_file_conn_get(file);
	if (!fc)
		return -ENOENT;

	WRITE_ONCE(fc->writethrough_threshold, (unsigned int)val);
	fuse_conn_put(fc);

	return count;
}

/*
 * File operations wiring the writethrough threshold read and write handlers
 * to a control file; the file is opened through nonseekable_open().
 */
static const struct file_operations fuse_conn_writethrough_threshold_ops = {
.open = nonseekable_open,
.read = fuse_conn_writethrough_threshold_read,
.write = fuse_conn_writethrough_threshold_write,
};

static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
struct fuse_conn *fc,
const char *name, int mode,
Expand Down Expand Up @@ -269,7 +330,10 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
NULL, &fuse_conn_max_background_ops) ||
!fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
S_IFREG | 0600, NULL,
&fuse_conn_congestion_threshold_ops))
&fuse_conn_congestion_threshold_ops) ||
!fuse_ctl_add_dentry(parent, fc, "writethrough_threshold",
S_IFREG | 0600, NULL,
&fuse_conn_writethrough_threshold_ops))
goto err;

return 0;
Expand Down
162 changes: 148 additions & 14 deletions fs/fuse/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -1666,6 +1666,76 @@ static ssize_t fuse_writeback_write_iter(struct kiocb *iocb,
return written < 0 ? written : total_written;
}

/*
* With writeback caching the request size seen by the server depends on
* how many contiguous dirty pages the flusher finds, which is bounded by
* dirty throttling: with BDI_CAP_STRICTLIMIT the dirty window can degrade
* to a single page under streaming writes, turning large application
* writes into page-sized requests.
*
* Writes that already match the server's preferred alignment gain
* nothing from accumulating in the page cache, so send them through
* fuse_perform_write() instead, which packs requests up to max_write.
* They create no dirty pages, hence no DLM write lock needs to be cached
* for them. Unaligned writes keep using the writeback cache, where they
* can merge with neighbouring data.
*
* A non-zero writethrough_threshold additionally forces any write at or
* above that size through fuse_perform_write() regardless of alignment.
*/
static bool fuse_use_writeback_cache(struct fuse_conn *fc, struct kiocb *iocb,
struct iov_iter *from)
{
size_t count = iov_iter_count(from);
unsigned int wt;
u64 align;
bool ret;

if (!fc->big_writes)
return true;

/* these rely on the semantics of their current paths */
if (iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND | IOCB_NOWAIT))
return true;

wt = READ_ONCE(fc->writethrough_threshold);
if (wt && count >= wt)
return false;

align = fc->alignment_pages ?
(u64)fc->alignment_pages << PAGE_SHIFT : PAGE_SIZE;

ret = !IS_ALIGNED(iocb->ki_pos, align) || !IS_ALIGNED((u64)count, align);
return ret;
}

/*
 * Tell the caller which flavour of inode lock a cached (buffered) write
 * needs.
 *
 * @return true if the write must take the exclusive lock, false if the
 *         shared lock is sufficient.
 */
static bool fuse_cache_wr_exclusive_lock(struct kiocb *iocb, bool cache_mode)
{
	struct fuse_conn *fc = get_fuse_conn(file_inode(iocb->ki_filp));

	/*
	 * The shared lock is only ever considered when the connection uses
	 * the DLM and the write runs in writeback-cache mode; in every other
	 * case the inode rwsem is the sole exclusion between writers.
	 */
	if (fc->dlm && cache_mode) {
		/*
		 * O_DIRECT writes fall back to generic_file_direct_write(),
		 * and append writes need the eventual EOF; both stay
		 * exclusive.
		 */
		return (iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) != 0;
	}

	return true;
}

static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
Expand All @@ -1675,7 +1745,10 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = mapping->host;
ssize_t err, count;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
bool writeback = false;
bool cache_mode = false;
bool exclusive;

if (fc->writeback_cache) {
/* Update size (EOF optimization) and mode (SUID clearing) */
Expand All @@ -1684,8 +1757,10 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (err)
return err;

if (!fc->handle_killpriv_v2 ||
!setattr_should_drop_suidgid(idmap, file_inode(file))) {
cache_mode = !fc->handle_killpriv_v2 ||
!setattr_should_drop_suidgid(idmap, file_inode(file));

if (cache_mode && fuse_use_writeback_cache(fc, iocb, from)) {
writeback = true;

/*
Expand Down Expand Up @@ -1713,7 +1788,12 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
}

inode_lock(inode);
exclusive = fuse_cache_wr_exclusive_lock(iocb, cache_mode);
if (exclusive) {
inode_lock(inode);
} else {
inode_lock_shared(inode);
}

err = count = generic_write_checks(iocb, from);
if (err <= 0)
Expand All @@ -1732,7 +1812,51 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
written = direct_write_fallback(iocb, from, written,
fuse_perform_write(iocb, from));
} else if (writeback) {
loff_t pos = iocb->ki_pos;
loff_t end = pos + count;
loff_t orig_size = 0;
bool extended = false;

if (fc->dlm && !exclusive && end > i_size_read(inode)) {
/*
* Lockless pre-check above keeps in-bounds writes off
* fi->lock; under the shared inode lock i_size only grows
* (extenders take fi->lock, truncate is excluded), so a
* stale read can only over-trigger this slow path, never
* miss an extension. Re-check authoritatively here.
*/
spin_lock(&fi->lock);
orig_size = i_size_read(inode);
if (end > orig_size) {
i_size_write(inode, end);
extended = true;
}
spin_unlock(&fi->lock);

/* Zero the tail of the folio straddling the old EOF. */
if (extended && orig_size < pos)
pagecache_isize_extended(inode, orig_size, pos);
}

written = fuse_writeback_write_iter(iocb, from, file);

/*
* Reconcile the speculative extension with what was actually
* written. Only retract the tail if no concurrent extender has
* pushed i_size past our claim; otherwise [reached, end) is a
* legitimate hole inside their extension and must remain.
*/
if (extended) {
loff_t reached = written > 0 ? pos + written : orig_size;

if (reached < end) {
spin_lock(&fi->lock);
if (i_size_read(inode) == end)
i_size_write(inode, reached);
spin_unlock(&fi->lock);
}
}

if (written < 0) {
err = written;
goto out;
Expand All @@ -1741,7 +1865,10 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
written = fuse_perform_write(iocb, from);
}
out:
inode_unlock(inode);
if (exclusive)
inode_unlock(inode);
else
inode_unlock_shared(inode);
if (written > 0)
written = generic_write_sync(iocb, written);

Expand Down Expand Up @@ -2114,19 +2241,15 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
struct fuse_args_pages *ap = &wpa->ia.ap;
struct inode *inode = wpa->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;

for (i = 0; i < ap->num_folios; i++) {
for (i = 0; i < ap->num_folios; i++)
/*
* Benchmarks showed that ending writeback within the
* scope of the fi->lock alleviates xarray lock
* contention and noticeably improves performance.
*/
iomap_finish_folio_write(inode, ap->folios[i], 1);
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
wb_writeout_inc(&bdi->wb);
}

wake_up(&fi->page_waitq);
}
Expand Down Expand Up @@ -2301,14 +2424,11 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
/*
 * Store @folio in slot @folio_index of the page arguments embedded in @wpa,
 * record @offset and @len in the matching descriptor, and bump the
 * WB_WRITEBACK statistic of the bdi writeback structure belonging to the
 * folio's inode.
 */
static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio,
uint32_t folio_index, loff_t offset, unsigned len)
{
struct inode *inode = folio->mapping->host;
struct fuse_args_pages *ap = &wpa->ia.ap;

ap->folios[folio_index] = folio;
ap->descs[folio_index].offset = offset;
ap->descs[folio_index].length = len;

/* Account one more folio under writeback for this inode's bdi. */
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
}

static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio,
Expand Down Expand Up @@ -3506,6 +3626,20 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
if (IS_ENABLED(CONFIG_FUSE_DAX))
fuse_dax_inode_init(inode, flags);

if (enable_large_folios)
mapping_set_large_folios(inode->i_mapping);
if (enable_large_folios) {
/*
* Readahead and writeback batch whole folios into a single
* request, capped at min(fc->max_pages, fc->max_read/PAGE_SIZE)
* pages. The page cache must therefore never build a folio
* larger than that, or fuse_readahead() trips WARN_ON(!pages)
* and then dereferences a NULL ap->folios[0] in
* fuse_send_readpages(). Bound the folio order to the request
* limit instead of MAX_PAGECACHE_ORDER.
*/
unsigned int max_pages = min(fc->max_pages,
fc->max_read >> PAGE_SHIFT);

mapping_set_folio_order_range(inode->i_mapping, 0,
ilog2(max_pages ?: 1));
}
}
5 changes: 4 additions & 1 deletion fs/fuse/fuse_i.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
#define FUSE_NAME_MAX (PATH_MAX - 1)

/** Number of dentries for each connection in the control filesystem */
#define FUSE_CTL_NUM_DENTRIES 5
#define FUSE_CTL_NUM_DENTRIES 6

/* Frequency (in seconds) of request timeout checks, if opted into */
#define FUSE_TIMEOUT_TIMER_FREQ 15
Expand Down Expand Up @@ -1045,6 +1045,9 @@ struct fuse_conn {
/* The foffset alignment in PAGE */
unsigned int alignment_pages;

/* Buffered writes >= this size bypass the writeback cache (0 = off) */
unsigned int writethrough_threshold;

/**
* XArray tracking tasks that need DLM retry.
* Maps task pointer -> struct fuse_dlm_retry.
Expand Down
3 changes: 2 additions & 1 deletion fs/fuse/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1704,6 +1704,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
else
fm->sb->s_bdi->ra_pages =
min(fm->sb->s_bdi->ra_pages, ra_pages);
fm->sb->s_bdi->io_pages = fc->max_pages;
fc->minor = arg->minor;
fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
fc->max_write = max_t(unsigned, 4096, fc->max_write);
Expand Down Expand Up @@ -1834,7 +1835,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)

/* fuse does its own writeback accounting */
sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT;
sb->s_bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;

/*
* For a single fuse filesystem use max 1% of dirty +
Expand Down
10 changes: 0 additions & 10 deletions include/linux/backing-dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,16 +66,6 @@ static inline void wb_stat_mod(struct bdi_writeback *wb,
percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
}

/* Add 1 to statistic @item of @wb through wb_stat_mod(). */
static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
wb_stat_mod(wb, item, 1);
}

/* Subtract 1 from statistic @item of @wb through wb_stat_mod(). */
static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
wb_stat_mod(wb, item, -1);
}

static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
return percpu_counter_read_positive(&wb->stat[item]);
Expand Down
7 changes: 4 additions & 3 deletions include/uapi/linux/fuse.h
Original file line number Diff line number Diff line change
Expand Up @@ -445,8 +445,8 @@ struct fuse_file_lock {
* init_out.request_timeout contains the timeout (in secs)
* FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation
* FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation
* FUSE_ALIGN_PG_ORDER: page order (power of 2 exponent for number of pages) for
* optimal io-size alignment
* FUSE_ALIGN_PG_ORDER: alignment order (power of 2 exponent of the IO size
* in bytes) for optimal io-size alignment
* FUSE_URING_REDUCED_Q: Client (kernel) supports fewer queues - Server is free
* to register between 1 and nr-core io-uring queues
*/
Expand Down Expand Up @@ -947,7 +947,8 @@ struct fuse_init_in {
#define FUSE_COMPAT_22_INIT_OUT_SIZE 24

/*
* align_page_order: Number of pages for optimal IO, or a multiple of that
* align_page_order: log2 of the optimal IO size in bytes; IO is optimal
* when sized and aligned to (1 << align_page_order) or a multiple of it
*/
struct fuse_init_out {
uint32_t major;
Expand Down