[v4,1/4] block: Introduce provisioning primitives
Commit Message
Introduce block request REQ_OP_PROVISION. The intent of this request
is to request underlying storage to preallocate disk space for the given
block range. Block devices that support this capability will export
a provision limit within their request queues.
This patch also adds the capability to call fallocate() in mode 0
on block devices, which will send REQ_OP_PROVISION to the block
device for the specified range,
Signed-off-by: Sarthak Kukreti <sarthakkukreti@chromium.org>
---
block/blk-core.c | 5 ++++
block/blk-lib.c | 53 +++++++++++++++++++++++++++++++++++++++
block/blk-merge.c | 18 +++++++++++++
block/blk-settings.c | 19 ++++++++++++++
block/blk-sysfs.c | 8 ++++++
block/bounce.c | 1 +
block/fops.c | 25 +++++++++++++-----
include/linux/bio.h | 6 +++--
include/linux/blk_types.h | 5 +++-
include/linux/blkdev.h | 16 ++++++++++++
10 files changed, 147 insertions(+), 9 deletions(-)
Comments
On 4/18/23 15:12, Sarthak Kukreti wrote:
> /* Fail if we don't recognize the flags. */
> - if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
> + if (mode != 0 && mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
> return -EOPNOTSUPP;
Is this change necessary? Doesn't (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
!= 0 imply that mode != 0?
Thanks,
Bart.
On Tue, Apr 18, 2023 at 03:12:04PM -0700, Sarthak Kukreti wrote:
> Introduce block request REQ_OP_PROVISION. The intent of this request
> is to request underlying storage to preallocate disk space for the given
> block range. Block devices that support this capability will export
> a provision limit within their request queues.
>
> This patch also adds the capability to call fallocate() in mode 0
> on block devices, which will send REQ_OP_PROVISION to the block
> device for the specified range,
>
> Signed-off-by: Sarthak Kukreti <sarthakkukreti@chromium.org>
> ---
> block/blk-core.c | 5 ++++
> block/blk-lib.c | 53 +++++++++++++++++++++++++++++++++++++++
> block/blk-merge.c | 18 +++++++++++++
> block/blk-settings.c | 19 ++++++++++++++
> block/blk-sysfs.c | 8 ++++++
> block/bounce.c | 1 +
> block/fops.c | 25 +++++++++++++-----
> include/linux/bio.h | 6 +++--
> include/linux/blk_types.h | 5 +++-
> include/linux/blkdev.h | 16 ++++++++++++
> 10 files changed, 147 insertions(+), 9 deletions(-)
>
<cut to the fallocate part; the block/ changes look fine to /me/ at
first glance, but what do I know... ;)>
> diff --git a/block/fops.c b/block/fops.c
> index d2e6be4e3d1c..e1775269654a 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -611,9 +611,13 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
> return ret;
> }
>
> +#define BLKDEV_FALLOC_FL_TRUNCATE \
At first I thought from this name that you were defining a new truncate
mode for fallocate, then I realized that this is mask for deciding if we
/want/ to truncate the pagecache.
#define BLKDEV_FALLOC_TRUNCATE_MASK ?
> + (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | \
Ok, so discarding and writing zeroes truncates the page cache, makes
sense since we're "writing" directly to the block device.
> + FALLOC_FL_NO_HIDE_STALE)
Here things get tricky -- some of the FALLOC_FL mode bits are really an
opcode and cannot be specified together, whereas others select optional
behavior for certain opcodes.
IIRC, the mutually exclusive opcodes are:
PUNCH_HOLE
ZERO_RANGE
COLLAPSE_RANGE
INSERT_RANGE
(none of the above, for allocation)
and the "variants on a theme are":
KEEP_SIZE
NO_HIDE_STALE
UNSHARE_RANGE
not all of which are supported by all the opcodes.
Does it make sense to truncate the page cache if userspace passes in
mode == NO_HIDE_STALE? There's currently no defined meaning for this
combination, but I think this means we'll truncate the pagecache before
deciding if we're actually going to issue any commands.
I think that's just a bug in the existing code -- it should be
validating that @mode is any of the supported combinations *before*
truncating the pagecache.
Otherwise you could have a mkfs program that starts writing new fs
metadata, decides to provision the storage (say for a logging region),
doesn't realize it's running on an old kernel, and then oops the
provision attempt fails but have we now shredded the pagecache and lost
all the writes?
--D
> +
> #define BLKDEV_FALLOC_FL_SUPPORTED \
> - (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
> - FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
> + (BLKDEV_FALLOC_FL_TRUNCATE | FALLOC_FL_KEEP_SIZE | \
> + FALLOC_FL_UNSHARE_RANGE)
>
> static long blkdev_fallocate(struct file *file, int mode, loff_t start,
> loff_t len)
> @@ -625,7 +629,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
> int error;
>
> /* Fail if we don't recognize the flags. */
> - if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
> + if (mode != 0 && mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
> return -EOPNOTSUPP;
>
> /* Don't go off the end of the device. */
> @@ -649,11 +653,20 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
> filemap_invalidate_lock(inode->i_mapping);
>
> /* Invalidate the page cache, including dirty pages. */
> - error = truncate_bdev_range(bdev, file->f_mode, start, end);
> - if (error)
> - goto fail;
> + if (mode & BLKDEV_FALLOC_FL_TRUNCATE) {
> + error = truncate_bdev_range(bdev, file->f_mode, start, end);
> + if (error)
> + goto fail;
> + }
>
> switch (mode) {
> + case 0:
> + case FALLOC_FL_UNSHARE_RANGE:
> + case FALLOC_FL_KEEP_SIZE:
> + case FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE:
> + error = blkdev_issue_provision(bdev, start >> SECTOR_SHIFT,
> + len >> SECTOR_SHIFT, GFP_KERNEL);
> + break;
> case FALLOC_FL_ZERO_RANGE:
> case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
> error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index d766be7152e1..9820b3b039f2 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -57,7 +57,8 @@ static inline bool bio_has_data(struct bio *bio)
> bio->bi_iter.bi_size &&
> bio_op(bio) != REQ_OP_DISCARD &&
> bio_op(bio) != REQ_OP_SECURE_ERASE &&
> - bio_op(bio) != REQ_OP_WRITE_ZEROES)
> + bio_op(bio) != REQ_OP_WRITE_ZEROES &&
> + bio_op(bio) != REQ_OP_PROVISION)
> return true;
>
> return false;
> @@ -67,7 +68,8 @@ static inline bool bio_no_advance_iter(const struct bio *bio)
> {
> return bio_op(bio) == REQ_OP_DISCARD ||
> bio_op(bio) == REQ_OP_SECURE_ERASE ||
> - bio_op(bio) == REQ_OP_WRITE_ZEROES;
> + bio_op(bio) == REQ_OP_WRITE_ZEROES ||
> + bio_op(bio) == REQ_OP_PROVISION;
> }
>
> static inline void *bio_data(struct bio *bio)
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 99be590f952f..27bdf88f541c 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -385,7 +385,10 @@ enum req_op {
> REQ_OP_DRV_IN = (__force blk_opf_t)34,
> REQ_OP_DRV_OUT = (__force blk_opf_t)35,
>
> - REQ_OP_LAST = (__force blk_opf_t)36,
> + /* request device to provision block */
> + REQ_OP_PROVISION = (__force blk_opf_t)37,
> +
> + REQ_OP_LAST = (__force blk_opf_t)38,
> };
>
> enum req_flag_bits {
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 941304f17492..239e2f418b6e 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -303,6 +303,7 @@ struct queue_limits {
> unsigned int discard_granularity;
> unsigned int discard_alignment;
> unsigned int zone_write_granularity;
> + unsigned int max_provision_sectors;
>
> unsigned short max_segments;
> unsigned short max_integrity_segments;
> @@ -921,6 +922,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q,
> unsigned int max_discard_sectors);
> extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
> unsigned int max_write_same_sectors);
> +extern void blk_queue_max_provision_sectors(struct request_queue *q,
> + unsigned int max_provision_sectors);
> extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
> extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
> unsigned int max_zone_append_sectors);
> @@ -1060,6 +1063,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
> sector_t nr_sects, gfp_t gfp);
>
> +extern int blkdev_issue_provision(struct block_device *bdev, sector_t sector,
> + sector_t nr_sects, gfp_t gfp_mask);
> +
> #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */
> #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */
>
> @@ -1139,6 +1145,11 @@ static inline unsigned short queue_max_discard_segments(const struct request_que
> return q->limits.max_discard_segments;
> }
>
> +static inline unsigned short queue_max_provision_sectors(const struct request_queue *q)
> +{
> + return q->limits.max_provision_sectors;
> +}
> +
> static inline unsigned int queue_max_segment_size(const struct request_queue *q)
> {
> return q->limits.max_segment_size;
> @@ -1281,6 +1292,11 @@ static inline bool bdev_nowait(struct block_device *bdev)
> return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags);
> }
>
> +static inline unsigned int bdev_max_provision_sectors(struct block_device *bdev)
> +{
> + return bdev_get_queue(bdev)->limits.max_provision_sectors;
> +}
> +
> static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
> {
> return blk_queue_zoned_model(bdev_get_queue(bdev));
> --
> 2.40.0.634.g4ca3ef3211-goog
>
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://listman.redhat.com/mailman/listinfo/dm-devel
>
On Wed, Apr 19 2023 at 11:36P -0400,
Darrick J. Wong <djwong@kernel.org> wrote:
> On Tue, Apr 18, 2023 at 03:12:04PM -0700, Sarthak Kukreti wrote:
> > Introduce block request REQ_OP_PROVISION. The intent of this request
> > is to request underlying storage to preallocate disk space for the given
> > block range. Block devices that support this capability will export
> > a provision limit within their request queues.
> >
> > This patch also adds the capability to call fallocate() in mode 0
> > on block devices, which will send REQ_OP_PROVISION to the block
> > device for the specified range,
> >
> > Signed-off-by: Sarthak Kukreti <sarthakkukreti@chromium.org>
> > ---
> > block/blk-core.c | 5 ++++
> > block/blk-lib.c | 53 +++++++++++++++++++++++++++++++++++++++
> > block/blk-merge.c | 18 +++++++++++++
> > block/blk-settings.c | 19 ++++++++++++++
> > block/blk-sysfs.c | 8 ++++++
> > block/bounce.c | 1 +
> > block/fops.c | 25 +++++++++++++-----
> > include/linux/bio.h | 6 +++--
> > include/linux/blk_types.h | 5 +++-
> > include/linux/blkdev.h | 16 ++++++++++++
> > 10 files changed, 147 insertions(+), 9 deletions(-)
> >
>
> <cut to the fallocate part; the block/ changes look fine to /me/ at
> first glance, but what do I know... ;)>
>
> > diff --git a/block/fops.c b/block/fops.c
> > index d2e6be4e3d1c..e1775269654a 100644
> > --- a/block/fops.c
> > +++ b/block/fops.c
> > @@ -611,9 +611,13 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > return ret;
> > }
> >
> > +#define BLKDEV_FALLOC_FL_TRUNCATE \
>
> At first I thought from this name that you were defining a new truncate
> mode for fallocate, then I realized that this is mask for deciding if we
> /want/ to truncate the pagecache.
>
> #define BLKDEV_FALLOC_TRUNCATE_MASK ?
>
> > + (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | \
>
> Ok, so discarding and writing zeroes truncates the page cache, makes
> sense since we're "writing" directly to the block device.
>
> > + FALLOC_FL_NO_HIDE_STALE)
>
> Here things get tricky -- some of the FALLOC_FL mode bits are really an
> opcode and cannot be specified together, whereas others select optional
> behavior for certain opcodes.
>
> IIRC, the mutually exclusive opcodes are:
>
> PUNCH_HOLE
> ZERO_RANGE
> COLLAPSE_RANGE
> INSERT_RANGE
> (none of the above, for allocation)
>
> and the "variants on a theme are":
>
> KEEP_SIZE
> NO_HIDE_STALE
> UNSHARE_RANGE
>
> not all of which are supported by all the opcodes.
>
> Does it make sense to truncate the page cache if userspace passes in
> mode == NO_HIDE_STALE? There's currently no defined meaning for this
> combination, but I think this means we'll truncate the pagecache before
> deciding if we're actually going to issue any commands.
>
> I think that's just a bug in the existing code -- it should be
> validating that @mode is any of the supported combinations *before*
> truncating the pagecache.
>
> Otherwise you could have a mkfs program that starts writing new fs
> metadata, decides to provision the storage (say for a logging region),
> doesn't realize it's running on an old kernel, and then oops the
> provision attempt fails but have we now shredded the pagecache and lost
> all the writes?
While that just caused me to have an "oh shit, that's crazy" (in a
scary way) belly laugh...
(And obviously needs fixing independent of this patchset)
Shouldn't mkfs first check that the underlying storage supports
REQ_OP_PROVISION by verifying
/sys/block/<dev>/queue/provision_max_bytes exists and is not 0?
(Just saying, we need to add new features more defensively.. you just
made the case based on this scenario's implications alone)
Sarthak, please note I said "provision_max_bytes": all other ops
(e.g. DISCARD, WRITE_ZEROES, etc) have <op>_max_bytes exported through
sysfs, not <op>_max_sectors. Please export provision_max_bytes, e.g.:
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 202aa78f933e..2e5ac7b1ffbd 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -605,12 +605,12 @@ QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size");
QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size");
QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
-QUEUE_RO_ENTRY(queue_max_provision_sectors, "max_provision_sectors");
QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity");
QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes");
QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes");
QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
+QUEUE_RO_ENTRY(queue_provision_max, "provision_max_bytes");
QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
On Wed, Apr 19, 2023 at 12:17:34PM -0400, Mike Snitzer wrote:
> On Wed, Apr 19 2023 at 11:36P -0400,
> Darrick J. Wong <djwong@kernel.org> wrote:
>
> > On Tue, Apr 18, 2023 at 03:12:04PM -0700, Sarthak Kukreti wrote:
> > > Introduce block request REQ_OP_PROVISION. The intent of this request
> > > is to request underlying storage to preallocate disk space for the given
> > > block range. Block devices that support this capability will export
> > > a provision limit within their request queues.
> > >
> > > This patch also adds the capability to call fallocate() in mode 0
> > > on block devices, which will send REQ_OP_PROVISION to the block
> > > device for the specified range,
> > >
> > > Signed-off-by: Sarthak Kukreti <sarthakkukreti@chromium.org>
> > > ---
> > > block/blk-core.c | 5 ++++
> > > block/blk-lib.c | 53 +++++++++++++++++++++++++++++++++++++++
> > > block/blk-merge.c | 18 +++++++++++++
> > > block/blk-settings.c | 19 ++++++++++++++
> > > block/blk-sysfs.c | 8 ++++++
> > > block/bounce.c | 1 +
> > > block/fops.c | 25 +++++++++++++-----
> > > include/linux/bio.h | 6 +++--
> > > include/linux/blk_types.h | 5 +++-
> > > include/linux/blkdev.h | 16 ++++++++++++
> > > 10 files changed, 147 insertions(+), 9 deletions(-)
> > >
> >
> > <cut to the fallocate part; the block/ changes look fine to /me/ at
> > first glance, but what do I know... ;)>
> >
> > > diff --git a/block/fops.c b/block/fops.c
> > > index d2e6be4e3d1c..e1775269654a 100644
> > > --- a/block/fops.c
> > > +++ b/block/fops.c
> > > @@ -611,9 +611,13 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > > return ret;
> > > }
> > >
> > > +#define BLKDEV_FALLOC_FL_TRUNCATE \
> >
> > At first I thought from this name that you were defining a new truncate
> > mode for fallocate, then I realized that this is mask for deciding if we
> > /want/ to truncate the pagecache.
> >
> > #define BLKDEV_FALLOC_TRUNCATE_MASK ?
> >
> > > + (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | \
> >
> > Ok, so discarding and writing zeroes truncates the page cache, makes
> > sense since we're "writing" directly to the block device.
> >
> > > + FALLOC_FL_NO_HIDE_STALE)
> >
> > Here things get tricky -- some of the FALLOC_FL mode bits are really an
> > opcode and cannot be specified together, whereas others select optional
> > behavior for certain opcodes.
> >
> > IIRC, the mutually exclusive opcodes are:
> >
> > PUNCH_HOLE
> > ZERO_RANGE
> > COLLAPSE_RANGE
> > INSERT_RANGE
> > (none of the above, for allocation)
> >
> > and the "variants on a theme are":
> >
> > KEEP_SIZE
> > NO_HIDE_STALE
> > UNSHARE_RANGE
> >
> > not all of which are supported by all the opcodes.
> >
> > Does it make sense to truncate the page cache if userspace passes in
> > mode == NO_HIDE_STALE? There's currently no defined meaning for this
> > combination, but I think this means we'll truncate the pagecache before
> > deciding if we're actually going to issue any commands.
> >
> > I think that's just a bug in the existing code -- it should be
> > validating that @mode is any of the supported combinations *before*
> > truncating the pagecache.
> >
> > Otherwise you could have a mkfs program that starts writing new fs
> > metadata, decides to provision the storage (say for a logging region),
> > doesn't realize it's running on an old kernel, and then oops the
> > provision attempt fails but have we now shredded the pagecache and lost
> > all the writes?
>
> While that just caused me to have an "oh shit, that's crazy" (in a
> scary way) belly laugh...
I just tried this and:
# xfs_io -c 'pwrite -S 0x58 1m 1m' -c fsync -c 'pwrite -S 0x59 1m 4096' -c 'pread -v 1m 64' -c 'falloc 1m 4096' -c 'pread -v 1m 64' /dev/sda
wrote 1048576/1048576 bytes at offset 1048576
1 MiB, 256 ops; 0.0013 sec (723.589 MiB/sec and 185238.7844 ops/sec)
wrote 4096/4096 bytes at offset 1048576
4 KiB, 1 ops; 0.0000 sec (355.114 MiB/sec and 90909.0909 ops/sec)
00100000: 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 YYYYYYYYYYYYYYYY
00100010: 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 YYYYYYYYYYYYYYYY
00100020: 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 YYYYYYYYYYYYYYYY
00100030: 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 YYYYYYYYYYYYYYYY
read 64/64 bytes at offset 1048576
64.000000 bytes, 1 ops; 0.0000 sec (1.565 MiB/sec and 25641.0256 ops/sec)
fallocate: Operation not supported
00100000: 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 XXXXXXXXXXXXXXXX
00100010: 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 XXXXXXXXXXXXXXXX
00100020: 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 XXXXXXXXXXXXXXXX
00100030: 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 XXXXXXXXXXXXXXXX
read 64/64 bytes at offset 1048576
64.000000 bytes, 1 ops; 0.0003 sec (176.554 KiB/sec and 2824.8588 ops/sec)
(Write 1MB of Xs, flush it to disk, write 4k of Ys, confirm the Y's are
in the page cache, fail to fallocate, reread and spot the Xs that we
supposedly overwrote.)
oops.
> (And obviously needs fixing independent of this patchset)
>
> Shouldn't mkfs first check that the underlying storage supports
> REQ_OP_PROVISION by verifying
> /sys/block/<dev>/queue/provision_max_bytes exists and is not 0?
> (Just saying, we need to add new features more defensively.. you just
> made the case based on this scenario's implications alone)
Not for fallocate -- for regular files, there's no way to check if the
filesystem actually supports the operation requested other than to try
it and see if it succeeds. We probably should've defined a DRY_RUN flag
for that purpose back when it was introduced.
For fallocate calls to block devices, yes, the program can check the
queue limits in sysfs if fstat says the supplied path is a block device,
but I don't know that most programs are that thorough. The fallocate(1)
CLI program does not check.
Then I moved on to fs utilities:
ext4: For discard, mke2fs calls BLKDISCARD if it detects a block device
via fstat, and falloc(PUNCH|KEEP_SIZE) for anything else. For zeroing,
it only uses falloc(ZERO) or falloc(PUNCH|KEEP_SIZE) and does not try to
use BLKZEROOUT. It does not check sysfs queue limits at all.
XFS: mkfs.xfs issues BLKDISCARD before writing anything to the device,
so that's fine. It uses falloc(ZERO) to erase the log, but since
xfsprogs provides its own buffer cache and uses O_DIRECT, pagecache
coherency problems aren't an issue.
btrfs: mkfs.btrfs only issues BLKDISCARD, and only before it starts
writing the new fs, so no problems there.
--D
> Sarthak, please note I said "provision_max_bytes": all other ops
> (e.g. DISCARD, WRITE_ZEROES, etc) have <op>_max_bytes exported through
> sysfs, not <op>_max_sectors. Please export provision_max_bytes, e.g.:
>
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 202aa78f933e..2e5ac7b1ffbd 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -605,12 +605,12 @@ QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size");
> QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size");
>
> QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
> -QUEUE_RO_ENTRY(queue_max_provision_sectors, "max_provision_sectors");
> QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity");
> QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes");
> QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes");
> QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
>
> +QUEUE_RO_ENTRY(queue_provision_max, "provision_max_bytes");
> QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
> QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
> QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
On Wed, Apr 19, 2023 at 10:26:02AM -0700, Darrick J. Wong wrote:
> On Wed, Apr 19, 2023 at 12:17:34PM -0400, Mike Snitzer wrote:
> > (And obviously needs fixing independent of this patchset)
> >
> > Shouldn't mkfs first check that the underlying storage supports
> > REQ_OP_PROVISION by verifying
> > /sys/block/<dev>/queue/provision_max_bytes exists and is not 0?
> > (Just saying, we need to add new features more defensively.. you just
> > made the case based on this scenario's implications alone)
>
> Not for fallocate -- for regular files, there's no way to check if the
> filesystem actually supports the operation requested other than to try
> it and see if it succeeds. We probably should've defined a DRY_RUN flag
> for that purpose back when it was introduced.
That ignores the fact that fallocate() was never intended to
guarantee it will work in all contexts - it's an advisory interface
at it's most basic level. If the call succeeds, then great, it does
what is says on the box, but if it fails then it should have no
visible effect on user data at all and the application still needs
to perform whatever modification it needed done in some other way.
IOWs, calling it one a block device without first checking if the
block device supports that fallocate operation is exactly how it is
supposed to be used. If the kernel can't handle this gracefully
without corrupting data, then that's a kernel bug and not an
application problem.
> For fallocate calls to block devices, yes, the program can check the
> queue limits in sysfs if fstat says the supplied path is a block device,
> but I don't know that most programs are that thorough. The fallocate(1)
> CLI program does not check.
Right. fallocate() was intended to just do the right thing without
the application having to jump thrown an unknown number of hoops to
determine if fallocate() can be used or not in the context it is
executing in. The kernel implementation is supposed to abstract all that
context-dependent behaviour away from the application; all the
application has to do is implement the fallocate() fast path and a
single generic "do the right thing the slow way" fallback when the
fallocate() method it called is not supported...
-Dave.
On Wed, Apr 19, 2023 at 4:21 PM Dave Chinner <david@fromorbit.com> wrote:
>
> On Wed, Apr 19, 2023 at 10:26:02AM -0700, Darrick J. Wong wrote:
> > On Wed, Apr 19, 2023 at 12:17:34PM -0400, Mike Snitzer wrote:
> > > (And obviously needs fixing independent of this patchset)
> > >
> > > Shouldn't mkfs first check that the underlying storage supports
> > > REQ_OP_PROVISION by verifying
> > > /sys/block/<dev>/queue/provision_max_bytes exists and is not 0?
> > > (Just saying, we need to add new features more defensively.. you just
> > > made the case based on this scenario's implications alone)
> >
> > Not for fallocate -- for regular files, there's no way to check if the
> > filesystem actually supports the operation requested other than to try
> > it and see if it succeeds. We probably should've defined a DRY_RUN flag
> > for that purpose back when it was introduced.
>
> That ignores the fact that fallocate() was never intended to
> guarantee it will work in all contexts - it's an advisory interface
> at it's most basic level. If the call succeeds, then great, it does
> what is says on the box, but if it fails then it should have no
> visible effect on user data at all and the application still needs
> to perform whatever modification it needed done in some other way.
>
> IOWs, calling it one a block device without first checking if the
> block device supports that fallocate operation is exactly how it is
> supposed to be used. If the kernel can't handle this gracefully
> without corrupting data, then that's a kernel bug and not an
> application problem.
>
> > For fallocate calls to block devices, yes, the program can check the
> > queue limits in sysfs if fstat says the supplied path is a block device,
> > but I don't know that most programs are that thorough. The fallocate(1)
> > CLI program does not check.
>
> Right. fallocate() was intended to just do the right thing without
> the application having to jump thrown an unknown number of hoops to
> determine if fallocate() can be used or not in the context it is
> executing in. The kernel implementation is supposed to abstract all that
> context-dependent behaviour away from the application; all the
> application has to do is implement the fallocate() fast path and a
> single generic "do the right thing the slow way" fallback when the
> fallocate() method it called is not supported...
>
I added a separate commit[1] to fix this so that we only
truncate_bdev_range() iff we are in a supported de-allocate mode call.
Subsequently, the REQ_OP_PROVISION patch is a bit simpler when rebased
on top.
[1] https://www.spinics.net/lists/kernel/msg4765688.html
Best
Sarthak
> -Dave.
> --
> Dave Chinner
> david@fromorbit.com
Dropped in v5.
Thanks!
Sarthak
On Tue, Apr 18, 2023 at 3:43 PM Bart Van Assche <bvanassche@acm.org> wrote:
>
> On 4/18/23 15:12, Sarthak Kukreti wrote:
> > /* Fail if we don't recognize the flags. */
> > - if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
> > + if (mode != 0 && mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
> > return -EOPNOTSUPP;
>
> Is this change necessary? Doesn't (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
> != 0 imply that mode != 0?
>
> Thanks,
>
> Bart.
>
@@ -123,6 +123,7 @@ static const char *const blk_op_name[] = {
REQ_OP_NAME(WRITE_ZEROES),
REQ_OP_NAME(DRV_IN),
REQ_OP_NAME(DRV_OUT),
+ REQ_OP_NAME(PROVISION)
};
#undef REQ_OP_NAME
@@ -798,6 +799,10 @@ void submit_bio_noacct(struct bio *bio)
if (!q->limits.max_write_zeroes_sectors)
goto not_supported;
break;
+ case REQ_OP_PROVISION:
+ if (!q->limits.max_provision_sectors)
+ goto not_supported;
+ break;
default:
break;
}
@@ -343,3 +343,56 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
return ret;
}
EXPORT_SYMBOL(blkdev_issue_secure_erase);
+
+/**
+ * blkdev_issue_provision - provision a block range
+ * @bdev: blockdev to write
+ * @sector: start sector
+ * @nr_sects: number of sectors to provision
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ * Issues a provision request to the block device for the range of sectors.
+ * For thinly provisioned block devices, this acts as a signal for the
+ * underlying storage pool to allocate space for this block range.
+ */
+int blkdev_issue_provision(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp)
+{
+ sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
+ unsigned int max_sectors = bdev_max_provision_sectors(bdev);
+ struct bio *bio = NULL;
+ struct blk_plug plug;
+ int ret = 0;
+
+ if (max_sectors == 0)
+ return -EOPNOTSUPP;
+ if ((sector | nr_sects) & bs_mask)
+ return -EINVAL;
+ if (bdev_read_only(bdev))
+ return -EPERM;
+
+ blk_start_plug(&plug);
+ for (;;) {
+ unsigned int req_sects = min_t(sector_t, nr_sects, max_sectors);
+
+ bio = blk_next_bio(bio, bdev, 0, REQ_OP_PROVISION, gfp);
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_iter.bi_size = req_sects << SECTOR_SHIFT;
+
+ sector += req_sects;
+ nr_sects -= req_sects;
+ if (!nr_sects) {
+ ret = submit_bio_wait(bio);
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
+ bio_put(bio);
+ break;
+ }
+ cond_resched();
+ }
+ blk_finish_plug(&plug);
+
+ return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_provision);
@@ -158,6 +158,21 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
}
+static struct bio *bio_split_provision(struct bio *bio,
+ const struct queue_limits *lim,
+ unsigned int *nsegs, struct bio_set *bs)
+{
+ *nsegs = 0;
+
+ if (!lim->max_provision_sectors)
+ return NULL;
+
+ if (bio_sectors(bio) <= lim->max_provision_sectors)
+ return NULL;
+
+ return bio_split(bio, lim->max_provision_sectors, GFP_NOIO, bs);
+}
+
/*
* Return the maximum number of sectors from the start of a bio that may be
* submitted as a single request to a block device. If enough sectors remain,
@@ -366,6 +381,9 @@ struct bio *__bio_split_to_limits(struct bio *bio,
case REQ_OP_WRITE_ZEROES:
split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
break;
+ case REQ_OP_PROVISION:
+ split = bio_split_provision(bio, lim, nr_segs, bs);
+ break;
default:
split = bio_split_rw(bio, lim, nr_segs, bs,
get_max_io_size(bio, lim) << SECTOR_SHIFT);
@@ -59,6 +59,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->zoned = BLK_ZONED_NONE;
lim->zone_write_granularity = 0;
lim->dma_alignment = 511;
+ lim->max_provision_sectors = 0;
}
/**
@@ -82,6 +83,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_dev_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
lim->max_zone_append_sectors = UINT_MAX;
+ lim->max_provision_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -208,6 +210,20 @@ void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
}
EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
+/**
+ * blk_queue_max_provision_sectors - set max sectors for a single provision
+ *
+ * @q: the request queue for the device
+ * @max_provision_sectors: maximum number of sectors to provision per command
+ **/
+
+void blk_queue_max_provision_sectors(struct request_queue *q,
+ unsigned int max_provision_sectors)
+{
+ q->limits.max_provision_sectors = max_provision_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_provision_sectors);
+
/**
* blk_queue_max_zone_append_sectors - set max sectors for a single zone append
* @q: the request queue for the device
@@ -578,6 +594,9 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_segment_size = min_not_zero(t->max_segment_size,
b->max_segment_size);
+ t->max_provision_sectors = min_not_zero(t->max_provision_sectors,
+ b->max_provision_sectors);
+
t->misaligned |= b->misaligned;
alignment = queue_limit_alignment_offset(b, start);
@@ -132,6 +132,12 @@ static ssize_t queue_max_discard_segments_show(struct request_queue *q,
return queue_var_show(queue_max_discard_segments(q), page);
}
+static ssize_t queue_max_provision_sectors_show(struct request_queue *q,
+ char *page)
+{
+ return queue_var_show(queue_max_provision_sectors(q), (page));
+}
+
static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
{
return queue_var_show(q->limits.max_integrity_segments, page);
@@ -599,6 +605,7 @@ QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size");
QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size");
QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
+QUEUE_RO_ENTRY(queue_max_provision_sectors, "max_provision_sectors");
QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity");
QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes");
QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes");
@@ -648,6 +655,7 @@ static struct attribute *queue_attrs[] = {
&queue_max_sectors_entry.attr,
&queue_max_segments_entry.attr,
&queue_max_discard_segments_entry.attr,
+ &queue_max_provision_sectors_entry.attr,
&queue_max_integrity_segments_entry.attr,
&queue_max_segment_size_entry.attr,
&elv_iosched_entry.attr,
@@ -176,6 +176,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
+ case REQ_OP_PROVISION:
break;
default:
bio_for_each_segment(bv, bio_src, iter)
@@ -611,9 +611,13 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
return ret;
}
+#define BLKDEV_FALLOC_FL_TRUNCATE \
+ (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | \
+ FALLOC_FL_NO_HIDE_STALE)
+
#define BLKDEV_FALLOC_FL_SUPPORTED \
- (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
- FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
+ (BLKDEV_FALLOC_FL_TRUNCATE | FALLOC_FL_KEEP_SIZE | \
+ FALLOC_FL_UNSHARE_RANGE)
static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
@@ -625,7 +629,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
int error;
/* Fail if we don't recognize the flags. */
- if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
+ if (mode != 0 && mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;
/* Don't go off the end of the device. */
@@ -649,11 +653,20 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
filemap_invalidate_lock(inode->i_mapping);
/* Invalidate the page cache, including dirty pages. */
- error = truncate_bdev_range(bdev, file->f_mode, start, end);
- if (error)
- goto fail;
+ if (mode & BLKDEV_FALLOC_FL_TRUNCATE) {
+ error = truncate_bdev_range(bdev, file->f_mode, start, end);
+ if (error)
+ goto fail;
+ }
switch (mode) {
+ case 0:
+ case FALLOC_FL_UNSHARE_RANGE:
+ case FALLOC_FL_KEEP_SIZE:
+ case FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE:
+ error = blkdev_issue_provision(bdev, start >> SECTOR_SHIFT,
+ len >> SECTOR_SHIFT, GFP_KERNEL);
+ break;
case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
@@ -57,7 +57,8 @@ static inline bool bio_has_data(struct bio *bio)
bio->bi_iter.bi_size &&
bio_op(bio) != REQ_OP_DISCARD &&
bio_op(bio) != REQ_OP_SECURE_ERASE &&
- bio_op(bio) != REQ_OP_WRITE_ZEROES)
+ bio_op(bio) != REQ_OP_WRITE_ZEROES &&
+ bio_op(bio) != REQ_OP_PROVISION)
return true;
return false;
@@ -67,7 +68,8 @@ static inline bool bio_no_advance_iter(const struct bio *bio)
{
return bio_op(bio) == REQ_OP_DISCARD ||
bio_op(bio) == REQ_OP_SECURE_ERASE ||
- bio_op(bio) == REQ_OP_WRITE_ZEROES;
+ bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+ bio_op(bio) == REQ_OP_PROVISION;
}
static inline void *bio_data(struct bio *bio)
@@ -385,7 +385,10 @@ enum req_op {
REQ_OP_DRV_IN = (__force blk_opf_t)34,
REQ_OP_DRV_OUT = (__force blk_opf_t)35,
- REQ_OP_LAST = (__force blk_opf_t)36,
+ /* request device to provision block */
+ REQ_OP_PROVISION = (__force blk_opf_t)37,
+
+ REQ_OP_LAST = (__force blk_opf_t)38,
};
enum req_flag_bits {
@@ -303,6 +303,7 @@ struct queue_limits {
unsigned int discard_granularity;
unsigned int discard_alignment;
unsigned int zone_write_granularity;
+ unsigned int max_provision_sectors;
unsigned short max_segments;
unsigned short max_integrity_segments;
@@ -921,6 +922,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q,
unsigned int max_discard_sectors);
extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
unsigned int max_write_same_sectors);
+extern void blk_queue_max_provision_sectors(struct request_queue *q,
+ unsigned int max_provision_sectors);
extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
unsigned int max_zone_append_sectors);
@@ -1060,6 +1063,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp);
+extern int blkdev_issue_provision(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask);
+
#define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */
#define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */
@@ -1139,6 +1145,11 @@ static inline unsigned short queue_max_discard_segments(const struct request_que
return q->limits.max_discard_segments;
}
+static inline unsigned short queue_max_provision_sectors(const struct request_queue *q)
+{
+ return q->limits.max_provision_sectors;
+}
+
static inline unsigned int queue_max_segment_size(const struct request_queue *q)
{
return q->limits.max_segment_size;
@@ -1281,6 +1292,11 @@ static inline bool bdev_nowait(struct block_device *bdev)
return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags);
}
+static inline unsigned int bdev_max_provision_sectors(struct block_device *bdev)
+{
+ return bdev_get_queue(bdev)->limits.max_provision_sectors;
+}
+
static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
{
return blk_queue_zoned_model(bdev_get_queue(bdev));