[v2] block: ublk: enable zoned storage support

Message ID 20230224200502.391570-1-nmi@metaspace.dk
State New
Headers
Series [v2] block: ublk: enable zoned storage support |

Commit Message

Andreas Hindborg Feb. 24, 2023, 8:05 p.m. UTC
  Add zoned storage support to ublk: report_zones and operations:
 - REQ_OP_ZONE_OPEN
 - REQ_OP_ZONE_CLOSE
 - REQ_OP_ZONE_FINISH
 - REQ_OP_ZONE_RESET

This allows implementation of zoned storage devices in user space. An
example user space implementation based on ubdsrv is available [1].

[1] https://github.com/metaspace/ubdsrv/commit/14a2b708f74f70cfecb076d92e680dc718cc1f6d

Signed-off-by: Andreas Hindborg <a.hindborg@samsung.com>
---
Changes since v1:
 - Fixed conditional compilation bug
 - Refactored to collect conditional code additions together
 - Fixed style errors
 - Zero stack allocated value used for zone report

Reported-by: Niklas Cassel <Niklas.Cassel@wdc.com>
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/oe-kbuild-all/202302250222.XOrw9j6z-lkp@intel.com/
v1: https://lore.kernel.org/linux-block/20230224125950.214779-1-nmi@metaspace.dk/

 drivers/block/ublk_drv.c      | 150 ++++++++++++++++++++++++++++++++--
 include/uapi/linux/ublk_cmd.h |  18 ++++
 2 files changed, 162 insertions(+), 6 deletions(-)


base-commit: c9c3395d5e3dcc6daee66c6908354d47bf98cb0c
  

Comments

Niklas Cassel Feb. 27, 2023, 10:20 a.m. UTC | #1
On Fri, Feb 24, 2023 at 09:05:01PM +0100, Andreas Hindborg wrote:
> Add zoned storage support to ublk: report_zones and operations:
>  - REQ_OP_ZONE_OPEN
>  - REQ_OP_ZONE_CLOSE
>  - REQ_OP_ZONE_FINISH
>  - REQ_OP_ZONE_RESET
> 
> This allows implementation of zoned storage devices in user space. An
> example user space implementation based on ubdsrv is available [1].
> 
> [1] https://github.com/metaspace/ubdsrv/commit/14a2b708f74f70cfecb076d92e680dc718cc1f6d
> 
> Signed-off-by: Andreas Hindborg <a.hindborg@samsung.com>
> ---
> Changes since v1:
>  - Fixed conditional compilation bug
>  - Refactored to collect conditional code additions together
>  - Fixed style errors
>  - Zero stack allocated value used for zone report
> 
> Reported-by: Niklas Cassel <Niklas.Cassel@wdc.com>
> Reported-by: kernel test robot <lkp@intel.com>
> Link: https://lore.kernel.org/oe-kbuild-all/202302250222.XOrw9j6z-lkp@intel.com/
> v1: https://lore.kernel.org/linux-block/20230224125950.214779-1-nmi@metaspace.dk/
> 
>  drivers/block/ublk_drv.c      | 150 ++++++++++++++++++++++++++++++++--
>  include/uapi/linux/ublk_cmd.h |  18 ++++
>  2 files changed, 162 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 6368b56eacf1..37e516903867 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -20,6 +20,7 @@
>  #include <linux/major.h>
>  #include <linux/wait.h>
>  #include <linux/blkdev.h>
> +#include <linux/blkzoned.h>
>  #include <linux/init.h>
>  #include <linux/swap.h>
>  #include <linux/slab.h>
> @@ -51,10 +52,12 @@
>  		| UBLK_F_URING_CMD_COMP_IN_TASK \
>  		| UBLK_F_NEED_GET_DATA \
>  		| UBLK_F_USER_RECOVERY \
> -		| UBLK_F_USER_RECOVERY_REISSUE)
> +		| UBLK_F_USER_RECOVERY_REISSUE \
> +		| UBLK_F_ZONED)
>  
>  /* All UBLK_PARAM_TYPE_* should be included here */
> -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
> +#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD \
> +			     | UBLK_PARAM_TYPE_ZONED)
>  
>  struct ublk_rq_data {
>  	struct llist_node node;
> @@ -187,6 +190,98 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
>  
>  static struct miscdevice ublk_misc;
>  
> +#ifdef CONFIG_BLK_DEV_ZONED
> +static void ublk_set_nr_zones(struct ublk_device *ub)
> +{
> +	const struct ublk_param_basic *p = &ub->params.basic;
> +
> +	if (ub->dev_info.flags & UBLK_F_ZONED && p->chunk_sectors)
> +		ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
> +}
> +
> +static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
> +{
> +	const struct ublk_param_zoned *p = &ub->params.zoned;
> +
> +	if (ub->dev_info.flags & UBLK_F_ZONED) {
> +		disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
> +		disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
> +	}
> +}
> +
> +static int ublk_revalidate_disk_zones(struct gendisk *disk)
> +{
> +	return blk_revalidate_disk_zones(disk, NULL);
> +}
> +
> +static int ublk_report_zones(struct gendisk *disk, sector_t sector,
> +			     unsigned int nr_zones, report_zones_cb cb,
> +			     void *data)
> +{
> +	struct ublk_device *ub;
> +	unsigned int zone_size;
> +	unsigned int first_zone;
> +	int ret = 0;
> +
> +	ub = disk->private_data;
> +
> +	if (!(ub->dev_info.flags & UBLK_F_ZONED))
> +		return -EINVAL;
> +
> +	zone_size = disk->queue->limits.chunk_sectors;
> +	first_zone = sector >> ilog2(zone_size);
> +	nr_zones = min(ub->ub_disk->nr_zones - first_zone, nr_zones);
> +
> +	for (unsigned int i = 0; i < nr_zones; i++) {
> +		struct request *req;
> +		blk_status_t status;
> +		struct blk_zone info = {0};
> +
> +		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
> +
> +		if (IS_ERR(req)) {
> +			ret = PTR_ERR(req);
> +			goto out;
> +		}
> +
> +		req->__sector = sector;
> +
> +		ret = blk_rq_map_kern(disk->queue, req, &info, sizeof(info),
> +				      GFP_KERNEL);
> +
> +		if (ret)
> +			goto out;
> +
> +		status = blk_execute_rq(req, 0);
> +		ret = blk_status_to_errno(status);
> +		if (ret)
> +			goto out;
> +
> +		blk_mq_free_request(req);
> +
> +		ret = cb(&info, i, data);
> +		if (ret)
> +			goto out;
> +
> +		/* A zero length zone means don't ask for more zones */
> +		if (!info.len) {
> +			nr_zones = i;
> +			break;
> +		}
> +
> +		sector += zone_size;
> +	}
> +	ret = nr_zones;
> +
> + out:
> +	return ret;
> +}
> +#else
> +void ublk_set_nr_zones(struct ublk_device *ub);
> +void ublk_dev_param_zoned_apply(struct ublk_device *ub);
> +int ublk_revalidate_disk_zones(struct gendisk *disk);

These are declarations, shouldn't they be dummy definitions instead?

e.g.
static int ublk_revalidate_disk_zones(struct gendisk *disk) { return -EOPNOTSUPP; };


It would be nice if they could be avoided altogether.

Looking how e.g. null-blk and btrfs has solved this:
https://github.com/torvalds/linux/blob/v6.2/fs/btrfs/Makefile#L39
https://github.com/torvalds/linux/blob/v6.2/drivers/block/null_blk/Makefile#L11

They have put the zoned stuff in a separate C file that is only compiled
when CONFIG_BLK_DEV_ZONED is set.

I'm not sure if a similar design is desired for ublk or not.

However, if a similar design pattern was used, it could probably avoid
some of these unpleasant dummy definitions altogether.


Kind regards,
Niklas

> +#endif
> +
  
Andreas Hindborg Feb. 27, 2023, 11:59 a.m. UTC | #2
Niklas Cassel <Niklas.Cassel@wdc.com> writes:

> On Fri, Feb 24, 2023 at 09:05:01PM +0100, Andreas Hindborg wrote:
>> Add zoned storage support to ublk: report_zones and operations:
>>  - REQ_OP_ZONE_OPEN
>>  - REQ_OP_ZONE_CLOSE
>>  - REQ_OP_ZONE_FINISH
>>  - REQ_OP_ZONE_RESET
>> 
>> This allows implementation of zoned storage devices in user space. An
>> example user space implementation based on ubdsrv is available [1].
>> 
>> [1] https://github.com/metaspace/ubdsrv/commit/14a2b708f74f70cfecb076d92e680dc718cc1f6d
>> 
>> Signed-off-by: Andreas Hindborg <a.hindborg@samsung.com>
>> ---
>> Changes since v1:
>>  - Fixed conditional compilation bug
>>  - Refactored to collect conditional code additions together
>>  - Fixed style errors
>>  - Zero stack allocated value used for zone report
>> 
>> Reported-by: Niklas Cassel <Niklas.Cassel@wdc.com>
>> Reported-by: kernel test robot <lkp@intel.com>
>> Link: https://lore.kernel.org/oe-kbuild-all/202302250222.XOrw9j6z-lkp@intel.com/
>> v1: https://lore.kernel.org/linux-block/20230224125950.214779-1-nmi@metaspace.dk/
>> 
>>  drivers/block/ublk_drv.c      | 150 ++++++++++++++++++++++++++++++++--
>>  include/uapi/linux/ublk_cmd.h |  18 ++++
>>  2 files changed, 162 insertions(+), 6 deletions(-)
>> 
>> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
>> index 6368b56eacf1..37e516903867 100644
>> --- a/drivers/block/ublk_drv.c
>> +++ b/drivers/block/ublk_drv.c
>> @@ -20,6 +20,7 @@
>>  #include <linux/major.h>
>>  #include <linux/wait.h>
>>  #include <linux/blkdev.h>
>> +#include <linux/blkzoned.h>
>>  #include <linux/init.h>
>>  #include <linux/swap.h>
>>  #include <linux/slab.h>
>> @@ -51,10 +52,12 @@
>>  		| UBLK_F_URING_CMD_COMP_IN_TASK \
>>  		| UBLK_F_NEED_GET_DATA \
>>  		| UBLK_F_USER_RECOVERY \
>> -		| UBLK_F_USER_RECOVERY_REISSUE)
>> +		| UBLK_F_USER_RECOVERY_REISSUE \
>> +		| UBLK_F_ZONED)
>>  
>>  /* All UBLK_PARAM_TYPE_* should be included here */
>> -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
>> +#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD \
>> +			     | UBLK_PARAM_TYPE_ZONED)
>>  
>>  struct ublk_rq_data {
>>  	struct llist_node node;
>> @@ -187,6 +190,98 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
>>  
>>  static struct miscdevice ublk_misc;
>>  
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +static void ublk_set_nr_zones(struct ublk_device *ub)
>> +{
>> +	const struct ublk_param_basic *p = &ub->params.basic;
>> +
>> +	if (ub->dev_info.flags & UBLK_F_ZONED && p->chunk_sectors)
>> +		ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
>> +}
>> +
>> +static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
>> +{
>> +	const struct ublk_param_zoned *p = &ub->params.zoned;
>> +
>> +	if (ub->dev_info.flags & UBLK_F_ZONED) {
>> +		disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
>> +		disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
>> +	}
>> +}
>> +
>> +static int ublk_revalidate_disk_zones(struct gendisk *disk)
>> +{
>> +	return blk_revalidate_disk_zones(disk, NULL);
>> +}
>> +
>> +static int ublk_report_zones(struct gendisk *disk, sector_t sector,
>> +			     unsigned int nr_zones, report_zones_cb cb,
>> +			     void *data)
>> +{
>> +	struct ublk_device *ub;
>> +	unsigned int zone_size;
>> +	unsigned int first_zone;
>> +	int ret = 0;
>> +
>> +	ub = disk->private_data;
>> +
>> +	if (!(ub->dev_info.flags & UBLK_F_ZONED))
>> +		return -EINVAL;
>> +
>> +	zone_size = disk->queue->limits.chunk_sectors;
>> +	first_zone = sector >> ilog2(zone_size);
>> +	nr_zones = min(ub->ub_disk->nr_zones - first_zone, nr_zones);
>> +
>> +	for (unsigned int i = 0; i < nr_zones; i++) {
>> +		struct request *req;
>> +		blk_status_t status;
>> +		struct blk_zone info = {0};
>> +
>> +		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
>> +
>> +		if (IS_ERR(req)) {
>> +			ret = PTR_ERR(req);
>> +			goto out;
>> +		}
>> +
>> +		req->__sector = sector;
>> +
>> +		ret = blk_rq_map_kern(disk->queue, req, &info, sizeof(info),
>> +				      GFP_KERNEL);
>> +
>> +		if (ret)
>> +			goto out;
>> +
>> +		status = blk_execute_rq(req, 0);
>> +		ret = blk_status_to_errno(status);
>> +		if (ret)
>> +			goto out;
>> +
>> +		blk_mq_free_request(req);
>> +
>> +		ret = cb(&info, i, data);
>> +		if (ret)
>> +			goto out;
>> +
>> +		/* A zero length zone means don't ask for more zones */
>> +		if (!info.len) {
>> +			nr_zones = i;
>> +			break;
>> +		}
>> +
>> +		sector += zone_size;
>> +	}
>> +	ret = nr_zones;
>> +
>> + out:
>> +	return ret;
>> +}
>> +#else
>> +void ublk_set_nr_zones(struct ublk_device *ub);
>> +void ublk_dev_param_zoned_apply(struct ublk_device *ub);
>> +int ublk_revalidate_disk_zones(struct gendisk *disk);
>
> These are declarations, shouldn't they be dummy definitions instead?

I looked at how nvme host defines nvme_revalidate_zones() when I did
this. The functions become undefined symbols but because the call sites
are optimized out they go away.

>
> e.g.
> static int ublk_revalidate_disk_zones(struct gendisk *disk) { return -EOPNOTSUPP; };

Not sure how this is better?

>
>
> It would be nice if they could be avoided altogether.
>
> Looking how e.g. null-blk and btrfs has solved this:
> https://github.com/torvalds/linux/blob/v6.2/fs/btrfs/Makefile#L39
> https://github.com/torvalds/linux/blob/v6.2/drivers/block/null_blk/Makefile#L11
>
> They have put the zoned stuff in a separate C file that is only compiled
> when CONFIG_BLK_DEV_ZONED is set.
>
> I'm not sure if a similar design is desired for ublk or not.
>
> However, if a similar design pattern was used, it could probably avoid
> some of these unpleasant dummy definitions altogether.

This is the same as I do here, except I put the declarations in the c
file instead of a header. I did this for two reasons 1) there is no ublk
header besides the uapi header (I would add a header just for this), 2)
the declarations need only exist inside ublk_drv.c. For btrfs, null_blk,
nvme, the declarations go in a header file and the functions in question
do not have static linkage.

I could move the function declarations out of the #else block, but then
they would need to be declared static and that gives a compiler warning
when the implementation is not present.

BR Andreas
  
Niklas Cassel Feb. 27, 2023, 2:29 p.m. UTC | #3
On Mon, Feb 27, 2023 at 12:59:45PM +0100, Andreas Hindborg wrote:

(snip)

> >> +#else
> >> +void ublk_set_nr_zones(struct ublk_device *ub);
> >> +void ublk_dev_param_zoned_apply(struct ublk_device *ub);
> >> +int ublk_revalidate_disk_zones(struct gendisk *disk);
> >
> > These are declarations, shouldn't they be dummy definitions instead?
> 
> I looked at how nvme host defines nvme_revalidate_zones() when I did
> this. The functions become undefined symbols but because the call sites
> are optimized out they go away.

Looking at e.g. nvme_revalidate_zones

$ git grep nvme_revalidate_zones
drivers/nvme/host/core.c:               ret = nvme_revalidate_zones(ns);
drivers/nvme/host/nvme.h:int nvme_revalidate_zones(struct nvme_ns *ns);
drivers/nvme/host/zns.c:int nvme_revalidate_zones(struct nvme_ns *ns)

The function is declared in nvme.h, but like you say, without any definition.

zns.c provides a definition, but that file is only build if
CONFIG_BLK_DEV_ZONED is set.


> > https://github.com/torvalds/linux/blob/v6.2/fs/btrfs/Makefile#L39
> > https://github.com/torvalds/linux/blob/v6.2/drivers/block/null_blk/Makefile#L11
> >
> > They have put the zoned stuff in a separate C file that is only compiled
> > when CONFIG_BLK_DEV_ZONED is set.
> >
> > I'm not sure if a similar design is desired for ublk or not.
> >
> > However, if a similar design pattern was used, it could probably avoid
> > some of these unpleasant dummy definitions altogether.
> 
> This is the same as I do here, except I put the declarations in the c
> file instead of a header. I did this for two reasons 1) there is no ublk
> header besides the uapi header (I would add a header just for this), 2)
> the declarations need only exist inside ublk_drv.c. For btrfs, null_blk,
> nvme, the declarations go in a header file and the functions in question
> do not have static linkage.
> 
> I could move the function declarations out of the #else block, but then
> they would need to be declared static and that gives a compiler warning
> when the implementation is not present.

I would love to hear someone else's opinion about this as well, but I do
think that having #ifdef and #else with both a declaration and a definition
in the C file is quite ugly.

If having an internal only header (in the same directory as the C file),
makes the C code easier to read, I'm all for it.

It seems to work for nvme to only have a declaration in an internal header
file, and only provide a definition if CONFIG_BLK_DEV_ZONED is set,
presumably without giving a warning. Perhaps ublk can do the same?


Kind regards,
Niklas
  
Andreas Hindborg Feb. 27, 2023, 2:41 p.m. UTC | #4
Niklas Cassel <Niklas.Cassel@wdc.com> writes:

> On Mon, Feb 27, 2023 at 12:59:45PM +0100, Andreas Hindborg wrote:
>
> (snip)
>
>> >> +#else
>> >> +void ublk_set_nr_zones(struct ublk_device *ub);
>> >> +void ublk_dev_param_zoned_apply(struct ublk_device *ub);
>> >> +int ublk_revalidate_disk_zones(struct gendisk *disk);
>> >
>> > These are declarations, shouldn't they be dummy definitions instead?
>> 
>> I looked at how nvme host defines nvme_revalidate_zones() when I did
>> this. The functions become undefined symbols but because the call sites
>> are optimized out they go away.
>
> Looking at e.g. nvme_revalidate_zones
>
> $ git grep nvme_revalidate_zones
> drivers/nvme/host/core.c:               ret = nvme_revalidate_zones(ns);
> drivers/nvme/host/nvme.h:int nvme_revalidate_zones(struct nvme_ns *ns);
> drivers/nvme/host/zns.c:int nvme_revalidate_zones(struct nvme_ns *ns)
>
> The function is declared in nvme.h, but like you say, without any definition.
>
> zns.c provides a definition, but that file is only build if
> CONFIG_BLK_DEV_ZONED is set.
>
>
>> > https://github.com/torvalds/linux/blob/v6.2/fs/btrfs/Makefile#L39
>> > https://github.com/torvalds/linux/blob/v6.2/drivers/block/null_blk/Makefile#L11
>> >
>> > They have put the zoned stuff in a separate C file that is only compiled
>> > when CONFIG_BLK_DEV_ZONED is set.
>> >
>> > I'm not sure if a similar design is desired for ublk or not.
>> >
>> > However, if a similar design pattern was used, it could probably avoid
>> > some of these unpleasant dummy definitions altogether.
>> 
>> This is the same as I do here, except I put the declarations in the c
>> file instead of a header. I did this for two reasons 1) there is no ublk
>> header besides the uapi header (I would add a header just for this), 2)
>> the declarations need only exist inside ublk_drv.c. For btrfs, null_blk,
>> nvme, the declarations go in a header file and the functions in question
>> do not have static linkage.
>> 
>> I could move the function declarations out of the #else block, but then
>> they would need to be declared static and that gives a compiler warning
>> when the implementation is not present.
>
> I would love to hear someone else's opinion about this as well, but I do
> think that having #ifdef and #else with both a declaration and a definition
> in the C file is quite ugly.
>
> If having an internal only header (in the same directory as the C file),
> makes the C code easier to read, I'm all for it.
>
> It seems to work for nvme to only have a declaration in an internal header
> file, and only provide a definition if CONFIG_BLK_DEV_ZONED is set,
> presumably without giving a warning. Perhaps ublk can do the same?

Sure, I can do that if that is preferred. As I said the result will be
he same with he exception that the function symbols will not have static
linkage when defined in a separate file with declarations in a header.

I will let this version sit for a while to see if anyone has an opinion,
and then I will ship a new version next week.

BR Andreas
  
Minwoo Im Feb. 27, 2023, 3:20 p.m. UTC | #5
On 23-02-24 21:05:01, Andreas Hindborg wrote:
> +static int ublk_report_zones(struct gendisk *disk, sector_t sector,
> +			     unsigned int nr_zones, report_zones_cb cb,
> +			     void *data)
> +{
> +	struct ublk_device *ub;
> +	unsigned int zone_size;
> +	unsigned int first_zone;
> +	int ret = 0;
> +
> +	ub = disk->private_data;
> +
> +	if (!(ub->dev_info.flags & UBLK_F_ZONED))
> +		return -EINVAL;
> +
> +	zone_size = disk->queue->limits.chunk_sectors;
> +	first_zone = sector >> ilog2(zone_size);
> +	nr_zones = min(ub->ub_disk->nr_zones - first_zone, nr_zones);
> +
> +	for (unsigned int i = 0; i < nr_zones; i++) {
> +		struct request *req;
> +		blk_status_t status;
> +		struct blk_zone info = {0};
> +
> +		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
> +
> +		if (IS_ERR(req)) {
> +			ret = PTR_ERR(req);
> +			goto out;
> +		}

Can we just return directly just like above (-EINVAL)?

if (IS_ERR(req))
	return PTR_ERR(req);

> +
> +		req->__sector = sector;
> +
> +		ret = blk_rq_map_kern(disk->queue, req, &info, sizeof(info),
> +				      GFP_KERNEL);
> +
> +		if (ret)
> +			goto out;

If we really have to use goto here, then I think we have choices:
put blk_mq_free_request(req); here or put it to the out: area.

> +
> +		status = blk_execute_rq(req, 0);
> +		ret = blk_status_to_errno(status);
> +		if (ret)
> +			goto out;
> +
> +		blk_mq_free_request(req);
> +
> +		ret = cb(&info, i, data);
> +		if (ret)
> +			goto out;
> +
> +		/* A zero length zone means don't ask for more zones */
> +		if (!info.len) {
> +			nr_zones = i;
> +			break;
> +		}
> +
> +		sector += zone_size;
> +	}
> +	ret = nr_zones;
> +
> + out:
> +	return ret;
> +}
  
Andreas Hindborg Feb. 27, 2023, 3:36 p.m. UTC | #6
Minwoo Im <minwoo.im.dev@gmail.com> writes:

> On 23-02-24 21:05:01, Andreas Hindborg wrote:
>> +static int ublk_report_zones(struct gendisk *disk, sector_t sector,
>> +			     unsigned int nr_zones, report_zones_cb cb,
>> +			     void *data)
>> +{
>> +	struct ublk_device *ub;
>> +	unsigned int zone_size;
>> +	unsigned int first_zone;
>> +	int ret = 0;
>> +
>> +	ub = disk->private_data;
>> +
>> +	if (!(ub->dev_info.flags & UBLK_F_ZONED))
>> +		return -EINVAL;
>> +
>> +	zone_size = disk->queue->limits.chunk_sectors;
>> +	first_zone = sector >> ilog2(zone_size);
>> +	nr_zones = min(ub->ub_disk->nr_zones - first_zone, nr_zones);
>> +
>> +	for (unsigned int i = 0; i < nr_zones; i++) {
>> +		struct request *req;
>> +		blk_status_t status;
>> +		struct blk_zone info = {0};
>> +
>> +		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
>> +
>> +		if (IS_ERR(req)) {
>> +			ret = PTR_ERR(req);
>> +			goto out;
>> +		}
>
> Can we just return directly just like above (-EINVAL)?
>
> if (IS_ERR(req))
> 	return PTR_ERR(req);

Yes, I agree.

>
>> +
>> +		req->__sector = sector;
>> +
>> +		ret = blk_rq_map_kern(disk->queue, req, &info, sizeof(info),
>> +				      GFP_KERNEL);
>> +
>> +		if (ret)
>> +			goto out;
>
> If we really have to use goto here, then I think we have choices:
> put blk_mq_free_request(req); here or put it to the out: area.

Lets free here and drop the goto.

Thanks,
Andreas

>> +
>> +		status = blk_execute_rq(req, 0);
>> +		ret = blk_status_to_errno(status);
>> +		if (ret)
>> +			goto out;
>> +
>> +		blk_mq_free_request(req);
>> +
>> +		ret = cb(&info, i, data);
>> +		if (ret)
>> +			goto out;
>> +
>> +		/* A zero length zone means don't ask for more zones */
>> +		if (!info.len) {
>> +			nr_zones = i;
>> +			break;
>> +		}
>> +
>> +		sector += zone_size;
>> +	}
>> +	ret = nr_zones;
>> +
>> + out:
>> +	return ret;
>> +}
  
Hans Holmberg Feb. 28, 2023, 9:52 a.m. UTC | #7
On Fri, Feb 24, 2023 at 9:06 PM Andreas Hindborg <nmi@metaspace.dk> wrote:
>
> Add zoned storage support to ublk: report_zones and operations:
>  - REQ_OP_ZONE_OPEN
>  - REQ_OP_ZONE_CLOSE
>  - REQ_OP_ZONE_FINISH
>  - REQ_OP_ZONE_RESET

Reset all is missing, right? Might as well define that before there are
users of the ublk<->ublksrv interface.

>
> This allows implementation of zoned storage devices in user space. An
> example user space implementation based on ubdsrv is available [1].
>
> [1] https://github.com/metaspace/ubdsrv/commit/14a2b708f74f70cfecb076d92e680dc718cc1f6d
>
> Signed-off-by: Andreas Hindborg <a.hindborg@samsung.com>
> ---
> Changes since v1:
>  - Fixed conditional compilation bug
>  - Refactored to collect conditional code additions together
>  - Fixed style errors
>  - Zero stack allocated value used for zone report
>
> Reported-by: Niklas Cassel <Niklas.Cassel@wdc.com>
> Reported-by: kernel test robot <lkp@intel.com>
> Link: https://lore.kernel.org/oe-kbuild-all/202302250222.XOrw9j6z-lkp@intel.com/
> v1: https://lore.kernel.org/linux-block/20230224125950.214779-1-nmi@metaspace.dk/
>
>  drivers/block/ublk_drv.c      | 150 ++++++++++++++++++++++++++++++++--
>  include/uapi/linux/ublk_cmd.h |  18 ++++
>  2 files changed, 162 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 6368b56eacf1..37e516903867 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -20,6 +20,7 @@
>  #include <linux/major.h>
>  #include <linux/wait.h>
>  #include <linux/blkdev.h>
> +#include <linux/blkzoned.h>
>  #include <linux/init.h>
>  #include <linux/swap.h>
>  #include <linux/slab.h>
> @@ -51,10 +52,12 @@
>                 | UBLK_F_URING_CMD_COMP_IN_TASK \
>                 | UBLK_F_NEED_GET_DATA \
>                 | UBLK_F_USER_RECOVERY \
> -               | UBLK_F_USER_RECOVERY_REISSUE)
> +               | UBLK_F_USER_RECOVERY_REISSUE \
> +               | UBLK_F_ZONED)
>
>  /* All UBLK_PARAM_TYPE_* should be included here */
> -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
> +#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD \
> +                            | UBLK_PARAM_TYPE_ZONED)
>
>  struct ublk_rq_data {
>         struct llist_node node;
> @@ -187,6 +190,98 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
>
>  static struct miscdevice ublk_misc;
>
> +#ifdef CONFIG_BLK_DEV_ZONED
> +static void ublk_set_nr_zones(struct ublk_device *ub)
> +{
> +       const struct ublk_param_basic *p = &ub->params.basic;
> +
> +       if (ub->dev_info.flags & UBLK_F_ZONED && p->chunk_sectors)
> +               ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
> +}
> +
> +static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
> +{
> +       const struct ublk_param_zoned *p = &ub->params.zoned;
> +
> +       if (ub->dev_info.flags & UBLK_F_ZONED) {
> +               disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
> +               disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
> +       }
> +}
> +
> +static int ublk_revalidate_disk_zones(struct gendisk *disk)
> +{
> +       return blk_revalidate_disk_zones(disk, NULL);
> +}
> +
> +static int ublk_report_zones(struct gendisk *disk, sector_t sector,
> +                            unsigned int nr_zones, report_zones_cb cb,
> +                            void *data)
> +{
> +       struct ublk_device *ub;
> +       unsigned int zone_size;
> +       unsigned int first_zone;
> +       int ret = 0;
> +
> +       ub = disk->private_data;
> +
> +       if (!(ub->dev_info.flags & UBLK_F_ZONED))
> +               return -EINVAL;
> +
> +       zone_size = disk->queue->limits.chunk_sectors;
> +       first_zone = sector >> ilog2(zone_size);
> +       nr_zones = min(ub->ub_disk->nr_zones - first_zone, nr_zones);
> +
> +       for (unsigned int i = 0; i < nr_zones; i++) {
> +               struct request *req;
> +               blk_status_t status;
> +               struct blk_zone info = {0};
> +
> +               req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
> +
> +               if (IS_ERR(req)) {
> +                       ret = PTR_ERR(req);
> +                       goto out;
> +               }
> +
> +               req->__sector = sector;
> +
> +               ret = blk_rq_map_kern(disk->queue, req, &info, sizeof(info),
> +                                     GFP_KERNEL);
> +
> +               if (ret)
> +                       goto out;
> +
> +               status = blk_execute_rq(req, 0);

Pingponging with userspace to retrieve thousand(s) of zone states is
very inefficient, so I don't think we
want to define the report zone ublk uapi to be one-zone-per-call.

Also, just overloading REQ_OP_DRV_IN with report zones instead of
defining a zone-report specific call does
not seem future proof - there might be a need to do other
driver-specific calls in the future.

Virtio blk implements reporting in a better way - se
virtblk_report_zones (that just got merged in Linus tree)
The same type of interface should be possible for ublk.

Cheers,
Hans

> +               ret = blk_status_to_errno(status);
> +               if (ret)
> +                       goto out;
> +
> +               blk_mq_free_request(req);
> +
> +               ret = cb(&info, i, data);
> +               if (ret)
> +                       goto out;
> +
> +               /* A zero length zone means don't ask for more zones */
> +               if (!info.len) {
> +                       nr_zones = i;
> +                       break;
> +               }
> +
> +               sector += zone_size;
> +       }
> +       ret = nr_zones;
> +
> + out:
> +       return ret;
> +}
> +#else
> +void ublk_set_nr_zones(struct ublk_device *ub);
> +void ublk_dev_param_zoned_apply(struct ublk_device *ub);
> +int ublk_revalidate_disk_zones(struct gendisk *disk);
> +#endif
> +
>  static void ublk_dev_param_basic_apply(struct ublk_device *ub)
>  {
>         struct request_queue *q = ub->ub_disk->queue;
> @@ -212,6 +307,9 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
>                 set_disk_ro(ub->ub_disk, true);
>
>         set_capacity(ub->ub_disk, p->dev_sectors);
> +
> +       if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
> +               ublk_set_nr_zones(ub);
>  }
>
>  static void ublk_dev_param_discard_apply(struct ublk_device *ub)
> @@ -268,6 +366,9 @@ static int ublk_apply_params(struct ublk_device *ub)
>         if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
>                 ublk_dev_param_discard_apply(ub);
>
> +       if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (ub->params.types & UBLK_PARAM_TYPE_ZONED))
> +               ublk_dev_param_zoned_apply(ub);
> +
>         return 0;
>  }
>
> @@ -361,9 +462,13 @@ static void ublk_free_disk(struct gendisk *disk)
>         put_device(&ub->cdev_dev);
>  }
>
> +
>  static const struct block_device_operations ub_fops = {
> -       .owner =        THIS_MODULE,
> -       .free_disk =    ublk_free_disk,
> +       .owner = THIS_MODULE,
> +       .free_disk = ublk_free_disk,
> +#ifdef CONFIG_BLK_DEV_ZONED
> +       .report_zones = ublk_report_zones,
> +#endif
>  };
>
>  #define UBLK_MAX_PIN_PAGES     32
> @@ -499,7 +604,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
>  {
>         const unsigned int rq_bytes = blk_rq_bytes(req);
>
> -       if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
> +       if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && ublk_rq_has_data(req)) {
>                 struct ublk_map_data data = {
>                         .ubq    =       ubq,
>                         .rq     =       req,
> @@ -566,6 +671,26 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
>         case REQ_OP_WRITE_ZEROES:
>                 ublk_op = UBLK_IO_OP_WRITE_ZEROES;
>                 break;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +       case REQ_OP_ZONE_OPEN:
> +               ublk_op = UBLK_IO_OP_ZONE_OPEN;
> +               break;
> +       case REQ_OP_ZONE_CLOSE:
> +               ublk_op = UBLK_IO_OP_ZONE_CLOSE;
> +               break;
> +       case REQ_OP_ZONE_FINISH:
> +               ublk_op = UBLK_IO_OP_ZONE_FINISH;
> +               break;
> +       case REQ_OP_ZONE_RESET:
> +               ublk_op = UBLK_IO_OP_ZONE_RESET;
> +               break;
> +       case REQ_OP_DRV_IN:
> +               ublk_op = UBLK_IO_OP_DRV_IN;
> +               break;
> +       case REQ_OP_ZONE_APPEND:
> +               /* We do not support zone append yet */
> +               fallthrough;
> +#endif
>         default:
>                 return BLK_STS_IOERR;
>         }
> @@ -612,7 +737,8 @@ static void ublk_complete_rq(struct request *req)
>          *
>          * Both the two needn't unmap.
>          */
> -       if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) {
> +       if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
> +           req_op(req) != REQ_OP_DRV_IN) {
>                 blk_mq_end_request(req, BLK_STS_OK);
>                 return;
>         }
> @@ -1535,6 +1661,15 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
>         if (ret)
>                 goto out_put_disk;
>
> +       if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> +           ub->dev_info.flags & UBLK_F_ZONED) {
> +               disk_set_zoned(disk, BLK_ZONED_HM);
> +               blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
> +               ret = ublk_revalidate_disk_zones(disk);
> +               if (ret)
> +                       goto out_put_disk;
> +       }
> +
>         get_device(&ub->cdev_dev);
>         ret = add_disk(disk);
>         if (ret) {
> @@ -1673,6 +1808,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
>         if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
>                 ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
>
> +       if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
> +               ub->dev_info.flags &= ~UBLK_F_ZONED;
> +
>         /* We are not ready to support zero copy */
>         ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
>
> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
> index 8f88e3a29998..074b97821575 100644
> --- a/include/uapi/linux/ublk_cmd.h
> +++ b/include/uapi/linux/ublk_cmd.h
> @@ -78,6 +78,10 @@
>  #define UBLK_F_USER_RECOVERY   (1UL << 3)
>
>  #define UBLK_F_USER_RECOVERY_REISSUE   (1UL << 4)
> +/*
> + * Enable zoned device support
> + */
> +#define UBLK_F_ZONED (1ULL << 5)
>
>  /* device state */
>  #define UBLK_S_DEV_DEAD        0
> @@ -129,6 +133,12 @@ struct ublksrv_ctrl_dev_info {
>  #define                UBLK_IO_OP_DISCARD      3
>  #define                UBLK_IO_OP_WRITE_SAME   4
>  #define                UBLK_IO_OP_WRITE_ZEROES 5
> +#define                UBLK_IO_OP_ZONE_OPEN            10
> +#define                UBLK_IO_OP_ZONE_CLOSE           11
> +#define                UBLK_IO_OP_ZONE_FINISH          12
> +#define                UBLK_IO_OP_ZONE_APPEND          13
> +#define                UBLK_IO_OP_ZONE_RESET           15
> +#define                UBLK_IO_OP_DRV_IN               34
>
>  #define                UBLK_IO_F_FAILFAST_DEV          (1U << 8)
>  #define                UBLK_IO_F_FAILFAST_TRANSPORT    (1U << 9)
> @@ -214,6 +224,12 @@ struct ublk_param_discard {
>         __u16   reserved0;
>  };
>
> +struct ublk_param_zoned {
> +       __u64   max_open_zones;
> +       __u64   max_active_zones;
> +       __u64   max_append_size;
> +};
> +
>  struct ublk_params {
>         /*
>          * Total length of parameters, userspace has to set 'len' for both
> @@ -224,10 +240,12 @@ struct ublk_params {
>         __u32   len;
>  #define UBLK_PARAM_TYPE_BASIC           (1 << 0)
>  #define UBLK_PARAM_TYPE_DISCARD         (1 << 1)
> +#define UBLK_PARAM_TYPE_ZONED           (1 << 2)
>         __u32   types;                  /* types of parameter included */
>
>         struct ublk_param_basic         basic;
>         struct ublk_param_discard       discard;
> +       struct ublk_param_zoned         zoned;
>  };
>
>  #endif
>
> base-commit: c9c3395d5e3dcc6daee66c6908354d47bf98cb0c
> --
> 2.39.2
>
  
Andreas Hindborg March 1, 2023, 7:28 a.m. UTC | #8
Hans Holmberg <hans@owltronix.com> writes:

> On Fri, Feb 24, 2023 at 9:06 PM Andreas Hindborg <nmi@metaspace.dk> wrote:
>>
>> Add zoned storage support to ublk: report_zones and operations:
>>  - REQ_OP_ZONE_OPEN
>>  - REQ_OP_ZONE_CLOSE
>>  - REQ_OP_ZONE_FINISH
>>  - REQ_OP_ZONE_RESET
>
> Reset all is missing, right? Might as well define that before there are
> users of the ublk<->ublksrv interface.

REQ_OP_ZONE_APPEND and REQ_OP_ZONE_RESET_ALL are not handled in this
patch. REQ_OP_* are converted to UBLK_IO_OP_* in ublk_setup_iod() in an
extensible way. It should be no problem adding support for more
operations later without breaking backwards compatibility.

>
>>
>> This allows implementation of zoned storage devices in user space. An
>> example user space implementation based on ubdsrv is available [1].
>>
>> [1] https://github.com/metaspace/ubdsrv/commit/14a2b708f74f70cfecb076d92e680dc718cc1f6d
>>
>> Signed-off-by: Andreas Hindborg <a.hindborg@samsung.com>
>> ---
>> Changes since v1:
>>  - Fixed conditional compilation bug
>>  - Refactored to collect conditional code additions together
>>  - Fixed style errors
>>  - Zero stack allocated value used for zone report
>>
>> Reported-by: Niklas Cassel <Niklas.Cassel@wdc.com>
>> Reported-by: kernel test robot <lkp@intel.com>
>> Link: https://lore.kernel.org/oe-kbuild-all/202302250222.XOrw9j6z-lkp@intel.com/
>> v1: https://lore.kernel.org/linux-block/20230224125950.214779-1-nmi@metaspace.dk/
>>
>>  drivers/block/ublk_drv.c      | 150 ++++++++++++++++++++++++++++++++--
>>  include/uapi/linux/ublk_cmd.h |  18 ++++
>>  2 files changed, 162 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
>> index 6368b56eacf1..37e516903867 100644
>> --- a/drivers/block/ublk_drv.c
>> +++ b/drivers/block/ublk_drv.c
>> @@ -20,6 +20,7 @@
>>  #include <linux/major.h>
>>  #include <linux/wait.h>
>>  #include <linux/blkdev.h>
>> +#include <linux/blkzoned.h>
>>  #include <linux/init.h>
>>  #include <linux/swap.h>
>>  #include <linux/slab.h>
>> @@ -51,10 +52,12 @@
>>                 | UBLK_F_URING_CMD_COMP_IN_TASK \
>>                 | UBLK_F_NEED_GET_DATA \
>>                 | UBLK_F_USER_RECOVERY \
>> -               | UBLK_F_USER_RECOVERY_REISSUE)
>> +               | UBLK_F_USER_RECOVERY_REISSUE \
>> +               | UBLK_F_ZONED)
>>
>>  /* All UBLK_PARAM_TYPE_* should be included here */
>> -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
>> +#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD \
>> +                            | UBLK_PARAM_TYPE_ZONED)
>>
>>  struct ublk_rq_data {
>>         struct llist_node node;
>> @@ -187,6 +190,98 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
>>
>>  static struct miscdevice ublk_misc;
>>
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +static void ublk_set_nr_zones(struct ublk_device *ub)
>> +{
>> +       const struct ublk_param_basic *p = &ub->params.basic;
>> +
>> +       if (ub->dev_info.flags & UBLK_F_ZONED && p->chunk_sectors)
>> +               ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
>> +}
>> +
>> +static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
>> +{
>> +       const struct ublk_param_zoned *p = &ub->params.zoned;
>> +
>> +       if (ub->dev_info.flags & UBLK_F_ZONED) {
>> +               disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
>> +               disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
>> +       }
>> +}
>> +
>> +static int ublk_revalidate_disk_zones(struct gendisk *disk)
>> +{
>> +       return blk_revalidate_disk_zones(disk, NULL);
>> +}
>> +
>> +static int ublk_report_zones(struct gendisk *disk, sector_t sector,
>> +                            unsigned int nr_zones, report_zones_cb cb,
>> +                            void *data)
>> +{
>> +       struct ublk_device *ub;
>> +       unsigned int zone_size;
>> +       unsigned int first_zone;
>> +       int ret = 0;
>> +
>> +       ub = disk->private_data;
>> +
>> +       if (!(ub->dev_info.flags & UBLK_F_ZONED))
>> +               return -EINVAL;
>> +
>> +       zone_size = disk->queue->limits.chunk_sectors;
>> +       first_zone = sector >> ilog2(zone_size);
>> +       nr_zones = min(ub->ub_disk->nr_zones - first_zone, nr_zones);
>> +
>> +       for (unsigned int i = 0; i < nr_zones; i++) {
>> +               struct request *req;
>> +               blk_status_t status;
>> +               struct blk_zone info = {0};
>> +
>> +               req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
>> +
>> +               if (IS_ERR(req)) {
>> +                       ret = PTR_ERR(req);
>> +                       goto out;
>> +               }
>> +
>> +               req->__sector = sector;
>> +
>> +               ret = blk_rq_map_kern(disk->queue, req, &info, sizeof(info),
>> +                                     GFP_KERNEL);
>> +
>> +               if (ret)
>> +                       goto out;
>> +
>> +               status = blk_execute_rq(req, 0);
>
> Pingponging with userspace to retrieve thousand(s) of zone states is
> very inefficient, so I don't think we
> want to define the report zone ublk uapi to be one-zone-per-call.

I agree. I chose this implementation because it was not clear to me how
to communicate the zone report request details to user space. But I
think we can overload the nr_sectors and start_sector of the iod to
communicate the size of the report request.

> Also, just overloading REQ_OP_DRV_IN with report zones instead of
> defining a zone-report specific call does
> not seem future proof - there might be a need to do other
> driver-specific calls in the future.

Yea, ublk does not currently have a command field in the request pdu, so
I did not want to add that. However, it makes sense if we ever want to
use REQ_OP_DRV_IN for anything other than zone report.

>
> Virtio blk implements reporting in a better way - se
> virtblk_report_zones (that just got merged in Linus tree)
> The same type of interface should be possible for ublk.

Something like that could work 👍

>> +               ret = blk_status_to_errno(status);
>> +               if (ret)
>> +                       goto out;
>> +
>> +               blk_mq_free_request(req);
>> +
>> +               ret = cb(&info, i, data);
>> +               if (ret)
>> +                       goto out;
>> +
>> +               /* A zero length zone means don't ask for more zones */
>> +               if (!info.len) {
>> +                       nr_zones = i;
>> +                       break;
>> +               }
>> +
>> +               sector += zone_size;
>> +       }
>> +       ret = nr_zones;
>> +
>> + out:
>> +       return ret;
>> +}
>> +#else
>> +void ublk_set_nr_zones(struct ublk_device *ub);
>> +void ublk_dev_param_zoned_apply(struct ublk_device *ub);
>> +int ublk_revalidate_disk_zones(struct gendisk *disk);
>> +#endif
>> +
>>  static void ublk_dev_param_basic_apply(struct ublk_device *ub)
>>  {
>>         struct request_queue *q = ub->ub_disk->queue;
>> @@ -212,6 +307,9 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
>>                 set_disk_ro(ub->ub_disk, true);
>>
>>         set_capacity(ub->ub_disk, p->dev_sectors);
>> +
>> +       if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>> +               ublk_set_nr_zones(ub);
>>  }
>>
>>  static void ublk_dev_param_discard_apply(struct ublk_device *ub)
>> @@ -268,6 +366,9 @@ static int ublk_apply_params(struct ublk_device *ub)
>>         if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
>>                 ublk_dev_param_discard_apply(ub);
>>
>> +       if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (ub->params.types & UBLK_PARAM_TYPE_ZONED))
>> +               ublk_dev_param_zoned_apply(ub);
>> +
>>         return 0;
>>  }
>>
>> @@ -361,9 +462,13 @@ static void ublk_free_disk(struct gendisk *disk)
>>         put_device(&ub->cdev_dev);
>>  }
>>
>> +
>>  static const struct block_device_operations ub_fops = {
>> -       .owner =        THIS_MODULE,
>> -       .free_disk =    ublk_free_disk,
>> +       .owner = THIS_MODULE,
>> +       .free_disk = ublk_free_disk,
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +       .report_zones = ublk_report_zones,
>> +#endif
>>  };
>>
>>  #define UBLK_MAX_PIN_PAGES     32
>> @@ -499,7 +604,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
>>  {
>>         const unsigned int rq_bytes = blk_rq_bytes(req);
>>
>> -       if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
>> +       if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && ublk_rq_has_data(req)) {
>>                 struct ublk_map_data data = {
>>                         .ubq    =       ubq,
>>                         .rq     =       req,
>> @@ -566,6 +671,26 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
>>         case REQ_OP_WRITE_ZEROES:
>>                 ublk_op = UBLK_IO_OP_WRITE_ZEROES;
>>                 break;
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +       case REQ_OP_ZONE_OPEN:
>> +               ublk_op = UBLK_IO_OP_ZONE_OPEN;
>> +               break;
>> +       case REQ_OP_ZONE_CLOSE:
>> +               ublk_op = UBLK_IO_OP_ZONE_CLOSE;
>> +               break;
>> +       case REQ_OP_ZONE_FINISH:
>> +               ublk_op = UBLK_IO_OP_ZONE_FINISH;
>> +               break;
>> +       case REQ_OP_ZONE_RESET:
>> +               ublk_op = UBLK_IO_OP_ZONE_RESET;
>> +               break;
>> +       case REQ_OP_DRV_IN:
>> +               ublk_op = UBLK_IO_OP_DRV_IN;
>> +               break;
>> +       case REQ_OP_ZONE_APPEND:
>> +               /* We do not support zone append yet */
>> +               fallthrough;
>> +#endif
>>         default:
>>                 return BLK_STS_IOERR;
>>         }
>> @@ -612,7 +737,8 @@ static void ublk_complete_rq(struct request *req)
>>          *
>>          * Both the two needn't unmap.
>>          */
>> -       if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) {
>> +       if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
>> +           req_op(req) != REQ_OP_DRV_IN) {
>>                 blk_mq_end_request(req, BLK_STS_OK);
>>                 return;
>>         }
>> @@ -1535,6 +1661,15 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
>>         if (ret)
>>                 goto out_put_disk;
>>
>> +       if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>> +           ub->dev_info.flags & UBLK_F_ZONED) {
>> +               disk_set_zoned(disk, BLK_ZONED_HM);
>> +               blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
>> +               ret = ublk_revalidate_disk_zones(disk);
>> +               if (ret)
>> +                       goto out_put_disk;
>> +       }
>> +
>>         get_device(&ub->cdev_dev);
>>         ret = add_disk(disk);
>>         if (ret) {
>> @@ -1673,6 +1808,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
>>         if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
>>                 ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
>>
>> +       if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>> +               ub->dev_info.flags &= ~UBLK_F_ZONED;
>> +
>>         /* We are not ready to support zero copy */
>>         ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
>>
>> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
>> index 8f88e3a29998..074b97821575 100644
>> --- a/include/uapi/linux/ublk_cmd.h
>> +++ b/include/uapi/linux/ublk_cmd.h
>> @@ -78,6 +78,10 @@
>>  #define UBLK_F_USER_RECOVERY   (1UL << 3)
>>
>>  #define UBLK_F_USER_RECOVERY_REISSUE   (1UL << 4)
>> +/*
>> + * Enable zoned device support
>> + */
>> +#define UBLK_F_ZONED (1ULL << 5)
>>
>>  /* device state */
>>  #define UBLK_S_DEV_DEAD        0
>> @@ -129,6 +133,12 @@ struct ublksrv_ctrl_dev_info {
>>  #define                UBLK_IO_OP_DISCARD      3
>>  #define                UBLK_IO_OP_WRITE_SAME   4
>>  #define                UBLK_IO_OP_WRITE_ZEROES 5
>> +#define                UBLK_IO_OP_ZONE_OPEN            10
>> +#define                UBLK_IO_OP_ZONE_CLOSE           11
>> +#define                UBLK_IO_OP_ZONE_FINISH          12
>> +#define                UBLK_IO_OP_ZONE_APPEND          13
>> +#define                UBLK_IO_OP_ZONE_RESET           15
>> +#define                UBLK_IO_OP_DRV_IN               34
>>
>>  #define                UBLK_IO_F_FAILFAST_DEV          (1U << 8)
>>  #define                UBLK_IO_F_FAILFAST_TRANSPORT    (1U << 9)
>> @@ -214,6 +224,12 @@ struct ublk_param_discard {
>>         __u16   reserved0;
>>  };
>>
>> +struct ublk_param_zoned {
>> +       __u64   max_open_zones;
>> +       __u64   max_active_zones;
>> +       __u64   max_append_size;
>> +};
>> +
>>  struct ublk_params {
>>         /*
>>          * Total length of parameters, userspace has to set 'len' for both
>> @@ -224,10 +240,12 @@ struct ublk_params {
>>         __u32   len;
>>  #define UBLK_PARAM_TYPE_BASIC           (1 << 0)
>>  #define UBLK_PARAM_TYPE_DISCARD         (1 << 1)
>> +#define UBLK_PARAM_TYPE_ZONED           (1 << 2)
>>         __u32   types;                  /* types of parameter included */
>>
>>         struct ublk_param_basic         basic;
>>         struct ublk_param_discard       discard;
>> +       struct ublk_param_zoned         zoned;
>>  };
>>
>>  #endif
>>
>> base-commit: c9c3395d5e3dcc6daee66c6908354d47bf98cb0c
>> --
>> 2.39.2
>>
  
Ming Lei March 2, 2023, 2:50 a.m. UTC | #9
On Fri, Feb 24, 2023 at 09:05:01PM +0100, Andreas Hindborg wrote:
> Add zoned storage support to ublk: report_zones and operations:
>  - REQ_OP_ZONE_OPEN
>  - REQ_OP_ZONE_CLOSE
>  - REQ_OP_ZONE_FINISH
>  - REQ_OP_ZONE_RESET
> 
> This allows implementation of zoned storage devices in user space. An
> example user space implementation based on ubdsrv is available [1].
> 
> [1] https://github.com/metaspace/ubdsrv/commit/14a2b708f74f70cfecb076d92e680dc718cc1f6d

As I suggested, please write one simple & clean zoned target for verifying
the interface, and better to not add to tgt_null.c.

> 
> Signed-off-by: Andreas Hindborg <a.hindborg@samsung.com>
> ---
> Changes since v1:
>  - Fixed conditional compilation bug
>  - Refactored to collect conditional code additions together
>  - Fixed style errors
>  - Zero stack allocated value used for zone report
> 
> Reported-by: Niklas Cassel <Niklas.Cassel@wdc.com>
> Reported-by: kernel test robot <lkp@intel.com>
> Link: https://lore.kernel.org/oe-kbuild-all/202302250222.XOrw9j6z-lkp@intel.com/
> v1: https://lore.kernel.org/linux-block/20230224125950.214779-1-nmi@metaspace.dk/
> 
>  drivers/block/ublk_drv.c      | 150 ++++++++++++++++++++++++++++++++--
>  include/uapi/linux/ublk_cmd.h |  18 ++++
>  2 files changed, 162 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 6368b56eacf1..37e516903867 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -20,6 +20,7 @@
>  #include <linux/major.h>
>  #include <linux/wait.h>
>  #include <linux/blkdev.h>
> +#include <linux/blkzoned.h>
>  #include <linux/init.h>
>  #include <linux/swap.h>
>  #include <linux/slab.h>
> @@ -51,10 +52,12 @@
>  		| UBLK_F_URING_CMD_COMP_IN_TASK \
>  		| UBLK_F_NEED_GET_DATA \
>  		| UBLK_F_USER_RECOVERY \
> -		| UBLK_F_USER_RECOVERY_REISSUE)
> +		| UBLK_F_USER_RECOVERY_REISSUE \
> +		| UBLK_F_ZONED)
>  
>  /* All UBLK_PARAM_TYPE_* should be included here */
> -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
> +#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD \
> +			     | UBLK_PARAM_TYPE_ZONED)
>  
>  struct ublk_rq_data {
>  	struct llist_node node;
> @@ -187,6 +190,98 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
>  
>  static struct miscdevice ublk_misc;
>  
> +#ifdef CONFIG_BLK_DEV_ZONED
> +static void ublk_set_nr_zones(struct ublk_device *ub)
> +{
> +	const struct ublk_param_basic *p = &ub->params.basic;
> +
> +	if (ub->dev_info.flags & UBLK_F_ZONED && p->chunk_sectors)
> +		ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
> +}
> +
> +static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
> +{
> +	const struct ublk_param_zoned *p = &ub->params.zoned;
> +
> +	if (ub->dev_info.flags & UBLK_F_ZONED) {
> +		disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
> +		disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
> +	}
> +}
> +
> +static int ublk_revalidate_disk_zones(struct gendisk *disk)
> +{
> +	return blk_revalidate_disk_zones(disk, NULL);
> +}
> +
> +static int ublk_report_zones(struct gendisk *disk, sector_t sector,
> +			     unsigned int nr_zones, report_zones_cb cb,
> +			     void *data)
> +{
> +	struct ublk_device *ub;
> +	unsigned int zone_size;
> +	unsigned int first_zone;
> +	int ret = 0;
> +
> +	ub = disk->private_data;
> +
> +	if (!(ub->dev_info.flags & UBLK_F_ZONED))
> +		return -EINVAL;
> +
> +	zone_size = disk->queue->limits.chunk_sectors;
> +	first_zone = sector >> ilog2(zone_size);
> +	nr_zones = min(ub->ub_disk->nr_zones - first_zone, nr_zones);
> +
> +	for (unsigned int i = 0; i < nr_zones; i++) {

The local variable 'i' needs to be declared in the front part
of this function body.

> +		struct request *req;
> +		blk_status_t status;
> +		struct blk_zone info = {0};
> +
> +		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
> +
> +		if (IS_ERR(req)) {
> +			ret = PTR_ERR(req);
> +			goto out;
> +		}
> +
> +		req->__sector = sector;

Why is req->__sector set?

> +
> +		ret = blk_rq_map_kern(disk->queue, req, &info, sizeof(info),
> +				      GFP_KERNEL);
> +
> +		if (ret)
> +			goto out;
> +
> +		status = blk_execute_rq(req, 0);
> +		ret = blk_status_to_errno(status);
> +		if (ret)
> +			goto out;
> +
> +		blk_mq_free_request(req);
> +
> +		ret = cb(&info, i, data);
> +		if (ret)
> +			goto out;
> +
> +		/* A zero length zone means don't ask for more zones */
> +		if (!info.len) {
> +			nr_zones = i;
> +			break;
> +		}
> +
> +		sector += zone_size;
> +	}

I'd suggest to report as many as possible zones in one command, and
the dev_info.max_io_buf_bytes is the max allowed buffer size for one
command, please refer to nvme_ns_report_zones().

Also we are going to extend ublk in the multiple LUN/NS style, and I
guess that won't be one issue since ->report_zones() is always done on
disk level, right?

> +	ret = nr_zones;
> +
> + out:
> +	return ret;
> +}
> +#else
> +void ublk_set_nr_zones(struct ublk_device *ub);
> +void ublk_dev_param_zoned_apply(struct ublk_device *ub);
> +int ublk_revalidate_disk_zones(struct gendisk *disk);
> +#endif
> +
>  static void ublk_dev_param_basic_apply(struct ublk_device *ub)
>  {
>  	struct request_queue *q = ub->ub_disk->queue;
> @@ -212,6 +307,9 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
>  		set_disk_ro(ub->ub_disk, true);
>  
>  	set_capacity(ub->ub_disk, p->dev_sectors);
> +
> +	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
> +		ublk_set_nr_zones(ub);
>  }
>  
>  static void ublk_dev_param_discard_apply(struct ublk_device *ub)
> @@ -268,6 +366,9 @@ static int ublk_apply_params(struct ublk_device *ub)
>  	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
>  		ublk_dev_param_discard_apply(ub);
>  
> +	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (ub->params.types & UBLK_PARAM_TYPE_ZONED))
> +		ublk_dev_param_zoned_apply(ub);
> +
>  	return 0;
>  }
>  
> @@ -361,9 +462,13 @@ static void ublk_free_disk(struct gendisk *disk)
>  	put_device(&ub->cdev_dev);
>  }
>  
> +
>  static const struct block_device_operations ub_fops = {
> -	.owner =	THIS_MODULE,
> -	.free_disk =	ublk_free_disk,
> +	.owner = THIS_MODULE,
> +	.free_disk = ublk_free_disk,
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	.report_zones = ublk_report_zones,
> +#endif

Define one null ublk_report_zones in #else branch of the above #ifdef, then we
can save one #ifdef.

>  };
>  
>  #define UBLK_MAX_PIN_PAGES	32
> @@ -499,7 +604,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
>  {
>  	const unsigned int rq_bytes = blk_rq_bytes(req);
>  
> -	if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
> +	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && ublk_rq_has_data(req)) {
>  		struct ublk_map_data data = {
>  			.ubq	=	ubq,
>  			.rq	=	req,
> @@ -566,6 +671,26 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
>  	case REQ_OP_WRITE_ZEROES:
>  		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
>  		break;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	case REQ_OP_ZONE_OPEN:
> +		ublk_op = UBLK_IO_OP_ZONE_OPEN;
> +		break;
> +	case REQ_OP_ZONE_CLOSE:
> +		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
> +		break;
> +	case REQ_OP_ZONE_FINISH:
> +		ublk_op = UBLK_IO_OP_ZONE_FINISH;
> +		break;
> +	case REQ_OP_ZONE_RESET:
> +		ublk_op = UBLK_IO_OP_ZONE_RESET;
> +		break;
> +	case REQ_OP_DRV_IN:
> +		ublk_op = UBLK_IO_OP_DRV_IN;
> +		break;
> +	case REQ_OP_ZONE_APPEND:
> +		/* We do not support zone append yet */
> +		fallthrough;
> +#endif

The above '#ifdef' is needn't, since  OP_ZONE should be defined no
matter if CONFIG_BLK_DEV_ZONED is enabled or not.

>  	default:
>  		return BLK_STS_IOERR;
>  	}
> @@ -612,7 +737,8 @@ static void ublk_complete_rq(struct request *req)
>  	 *
>  	 * Both the two needn't unmap.
>  	 */
> -	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) {
> +	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
> +	    req_op(req) != REQ_OP_DRV_IN) {
>  		blk_mq_end_request(req, BLK_STS_OK);
>  		return;
>  	}
> @@ -1535,6 +1661,15 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
>  	if (ret)
>  		goto out_put_disk;
>  
> +	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> +	    ub->dev_info.flags & UBLK_F_ZONED) {
> +		disk_set_zoned(disk, BLK_ZONED_HM);
> +		blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
> +		ret = ublk_revalidate_disk_zones(disk);
> +		if (ret)
> +			goto out_put_disk;
> +	}
> +
>  	get_device(&ub->cdev_dev);
>  	ret = add_disk(disk);
>  	if (ret) {
> @@ -1673,6 +1808,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
>  	if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
>  		ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
>  
> +	if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
> +		ub->dev_info.flags &= ~UBLK_F_ZONED;
> +
>  	/* We are not ready to support zero copy */
>  	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
>  
> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
> index 8f88e3a29998..074b97821575 100644
> --- a/include/uapi/linux/ublk_cmd.h
> +++ b/include/uapi/linux/ublk_cmd.h
> @@ -78,6 +78,10 @@
>  #define UBLK_F_USER_RECOVERY	(1UL << 3)
>  
>  #define UBLK_F_USER_RECOVERY_REISSUE	(1UL << 4)
> +/*
> + * Enable zoned device support
> + */
> +#define UBLK_F_ZONED (1ULL << 5)
>  
>  /* device state */
>  #define UBLK_S_DEV_DEAD	0
> @@ -129,6 +133,12 @@ struct ublksrv_ctrl_dev_info {
>  #define		UBLK_IO_OP_DISCARD	3
>  #define		UBLK_IO_OP_WRITE_SAME	4
>  #define		UBLK_IO_OP_WRITE_ZEROES	5
> +#define		UBLK_IO_OP_ZONE_OPEN		10
> +#define		UBLK_IO_OP_ZONE_CLOSE		11
> +#define		UBLK_IO_OP_ZONE_FINISH		12
> +#define		UBLK_IO_OP_ZONE_APPEND		13
> +#define		UBLK_IO_OP_ZONE_RESET		15
> +#define		UBLK_IO_OP_DRV_IN		34
>  
>  #define		UBLK_IO_F_FAILFAST_DEV		(1U << 8)
>  #define		UBLK_IO_F_FAILFAST_TRANSPORT	(1U << 9)
> @@ -214,6 +224,12 @@ struct ublk_param_discard {
>  	__u16	reserved0;
>  };
>  
> +struct ublk_param_zoned {
> +	__u64	max_open_zones;
> +	__u64	max_active_zones;
> +	__u64	max_append_size;
> +};

Is the above zoned parameter enough for future extension?
Does ZNS need extra parameter? Or some zoned new(important) features?

I highly suggest to reserve some fields for extension, given
it is one ABI interface, which is supposed to be defined well
enough from the beginning.


Thanks, 
Ming
  
Andreas Hindborg March 2, 2023, 7:31 a.m. UTC | #10
Hi Ming,

Ming Lei <ming.lei@redhat.com> writes:

> On Fri, Feb 24, 2023 at 09:05:01PM +0100, Andreas Hindborg wrote:
>> Add zoned storage support to ublk: report_zones and operations:
>>  - REQ_OP_ZONE_OPEN
>>  - REQ_OP_ZONE_CLOSE
>>  - REQ_OP_ZONE_FINISH
>>  - REQ_OP_ZONE_RESET
>> 
>> This allows implementation of zoned storage devices in user space. An
>> example user space implementation based on ubdsrv is available [1].
>> 
>> [1] https://github.com/metaspace/ubdsrv/commit/14a2b708f74f70cfecb076d92e680dc718cc1f6d
>
> As I suggested, please write one simple & clean zoned target for verifying
> the interface, and better to not add to tgt_null.c.

For ubdsrv, I understand that you prefer to reimplement null and loop targets for
zoned storage. I don't understand why you think this is a good idea,
since we will have massive code duplication. This would be comparable to
having a separate null_blk driver for zoned storage. Am I understanding
you correctly?

Anyway, I think we can discuss the kernel patch even though the ubdsrv
implementation is not a separate target yet? The code would be almost
identical, it would just live in a separate translation unit.

>
>> 
>> Signed-off-by: Andreas Hindborg <a.hindborg@samsung.com>
>> ---
>> Changes since v1:
>>  - Fixed conditional compilation bug
>>  - Refactored to collect conditional code additions together
>>  - Fixed style errors
>>  - Zero stack allocated value used for zone report
>> 
>> Reported-by: Niklas Cassel <Niklas.Cassel@wdc.com>
>> Reported-by: kernel test robot <lkp@intel.com>
>> Link: https://lore.kernel.org/oe-kbuild-all/202302250222.XOrw9j6z-lkp@intel.com/
>> v1: https://lore.kernel.org/linux-block/20230224125950.214779-1-nmi@metaspace.dk/
>> 
>>  drivers/block/ublk_drv.c      | 150 ++++++++++++++++++++++++++++++++--
>>  include/uapi/linux/ublk_cmd.h |  18 ++++
>>  2 files changed, 162 insertions(+), 6 deletions(-)
>> 
>> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
>> index 6368b56eacf1..37e516903867 100644
>> --- a/drivers/block/ublk_drv.c
>> +++ b/drivers/block/ublk_drv.c
>> @@ -20,6 +20,7 @@
>>  #include <linux/major.h>
>>  #include <linux/wait.h>
>>  #include <linux/blkdev.h>
>> +#include <linux/blkzoned.h>
>>  #include <linux/init.h>
>>  #include <linux/swap.h>
>>  #include <linux/slab.h>
>> @@ -51,10 +52,12 @@
>>  		| UBLK_F_URING_CMD_COMP_IN_TASK \
>>  		| UBLK_F_NEED_GET_DATA \
>>  		| UBLK_F_USER_RECOVERY \
>> -		| UBLK_F_USER_RECOVERY_REISSUE)
>> +		| UBLK_F_USER_RECOVERY_REISSUE \
>> +		| UBLK_F_ZONED)
>>  
>>  /* All UBLK_PARAM_TYPE_* should be included here */
>> -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
>> +#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD \
>> +			     | UBLK_PARAM_TYPE_ZONED)
>>  
>>  struct ublk_rq_data {
>>  	struct llist_node node;
>> @@ -187,6 +190,98 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
>>  
>>  static struct miscdevice ublk_misc;
>>  
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +static void ublk_set_nr_zones(struct ublk_device *ub)
>> +{
>> +	const struct ublk_param_basic *p = &ub->params.basic;
>> +
>> +	if (ub->dev_info.flags & UBLK_F_ZONED && p->chunk_sectors)
>> +		ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
>> +}
>> +
>> +static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
>> +{
>> +	const struct ublk_param_zoned *p = &ub->params.zoned;
>> +
>> +	if (ub->dev_info.flags & UBLK_F_ZONED) {
>> +		disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
>> +		disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
>> +	}
>> +}
>> +
>> +static int ublk_revalidate_disk_zones(struct gendisk *disk)
>> +{
>> +	return blk_revalidate_disk_zones(disk, NULL);
>> +}
>> +
>> +static int ublk_report_zones(struct gendisk *disk, sector_t sector,
>> +			     unsigned int nr_zones, report_zones_cb cb,
>> +			     void *data)
>> +{
>> +	struct ublk_device *ub;
>> +	unsigned int zone_size;
>> +	unsigned int first_zone;
>> +	int ret = 0;
>> +
>> +	ub = disk->private_data;
>> +
>> +	if (!(ub->dev_info.flags & UBLK_F_ZONED))
>> +		return -EINVAL;
>> +
>> +	zone_size = disk->queue->limits.chunk_sectors;
>> +	first_zone = sector >> ilog2(zone_size);
>> +	nr_zones = min(ub->ub_disk->nr_zones - first_zone, nr_zones);
>> +
>> +	for (unsigned int i = 0; i < nr_zones; i++) {
>
> The local variable 'i' needs to be declared in the front part
> of this function body.

Ok.

>
>> +		struct request *req;
>> +		blk_status_t status;
>> +		struct blk_zone info = {0};
>> +
>> +		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
>> +
>> +		if (IS_ERR(req)) {
>> +			ret = PTR_ERR(req);
>> +			goto out;
>> +		}
>> +
>> +		req->__sector = sector;
>
> Why is req->__sector set?

I use it to carry information about the first zone of the report request.

>
>> +
>> +		ret = blk_rq_map_kern(disk->queue, req, &info, sizeof(info),
>> +				      GFP_KERNEL);
>> +
>> +		if (ret)
>> +			goto out;
>> +
>> +		status = blk_execute_rq(req, 0);
>> +		ret = blk_status_to_errno(status);
>> +		if (ret)
>> +			goto out;
>> +
>> +		blk_mq_free_request(req);
>> +
>> +		ret = cb(&info, i, data);
>> +		if (ret)
>> +			goto out;
>> +
>> +		/* A zero length zone means don't ask for more zones */
>> +		if (!info.len) {
>> +			nr_zones = i;
>> +			break;
>> +		}
>> +
>> +		sector += zone_size;
>> +	}
>
> I'd suggest to report as many as possible zones in one command, and
> the dev_info.max_io_buf_bytes is the max allowed buffer size for one
> command, please refer to nvme_ns_report_zones().

I agree about fetching more zones. However, it is no good to fetch up to
a max, since the requested zone report may less than max. I was
considering overloading req->__data_len and iod->nr_sectors to convey
the number of requested zones. What do you think about that?

>
> Also we are going to extend ublk in the multiple LUN/NS style, and I
> guess that won't be one issue since ->report_zones() is always done on
> disk level, right?

Yes, that should be fine.

>
>> +	ret = nr_zones;
>> +
>> + out:
>> +	return ret;
>> +}
>> +#else
>> +void ublk_set_nr_zones(struct ublk_device *ub);
>> +void ublk_dev_param_zoned_apply(struct ublk_device *ub);
>> +int ublk_revalidate_disk_zones(struct gendisk *disk);
>> +#endif
>> +
>>  static void ublk_dev_param_basic_apply(struct ublk_device *ub)
>>  {
>>  	struct request_queue *q = ub->ub_disk->queue;
>> @@ -212,6 +307,9 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
>>  		set_disk_ro(ub->ub_disk, true);
>>  
>>  	set_capacity(ub->ub_disk, p->dev_sectors);
>> +
>> +	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>> +		ublk_set_nr_zones(ub);
>>  }
>>  
>>  static void ublk_dev_param_discard_apply(struct ublk_device *ub)
>> @@ -268,6 +366,9 @@ static int ublk_apply_params(struct ublk_device *ub)
>>  	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
>>  		ublk_dev_param_discard_apply(ub);
>>  
>> +	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (ub->params.types & UBLK_PARAM_TYPE_ZONED))
>> +		ublk_dev_param_zoned_apply(ub);
>> +
>>  	return 0;
>>  }
>>  
>> @@ -361,9 +462,13 @@ static void ublk_free_disk(struct gendisk *disk)
>>  	put_device(&ub->cdev_dev);
>>  }
>>  
>> +
>>  static const struct block_device_operations ub_fops = {
>> -	.owner =	THIS_MODULE,
>> -	.free_disk =	ublk_free_disk,
>> +	.owner = THIS_MODULE,
>> +	.free_disk = ublk_free_disk,
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +	.report_zones = ublk_report_zones,
>> +#endif
>
> Define one null ublk_report_zones in #else branch of the above #ifdef, then we
> can save one #ifdef.

I would have to define it as a null pointer in the #else case then?

>
>>  };
>>  
>>  #define UBLK_MAX_PIN_PAGES	32
>> @@ -499,7 +604,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
>>  {
>>  	const unsigned int rq_bytes = blk_rq_bytes(req);
>>  
>> -	if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
>> +	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && ublk_rq_has_data(req)) {
>>  		struct ublk_map_data data = {
>>  			.ubq	=	ubq,
>>  			.rq	=	req,
>> @@ -566,6 +671,26 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
>>  	case REQ_OP_WRITE_ZEROES:
>>  		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
>>  		break;
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +	case REQ_OP_ZONE_OPEN:
>> +		ublk_op = UBLK_IO_OP_ZONE_OPEN;
>> +		break;
>> +	case REQ_OP_ZONE_CLOSE:
>> +		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
>> +		break;
>> +	case REQ_OP_ZONE_FINISH:
>> +		ublk_op = UBLK_IO_OP_ZONE_FINISH;
>> +		break;
>> +	case REQ_OP_ZONE_RESET:
>> +		ublk_op = UBLK_IO_OP_ZONE_RESET;
>> +		break;
>> +	case REQ_OP_DRV_IN:
>> +		ublk_op = UBLK_IO_OP_DRV_IN;
>> +		break;
>> +	case REQ_OP_ZONE_APPEND:
>> +		/* We do not support zone append yet */
>> +		fallthrough;
>> +#endif
>
> The above '#ifdef' is needn't, since  OP_ZONE should be defined no
> matter if CONFIG_BLK_DEV_ZONED is enabled or not.

I see. But do we want to process the requests if BLK_DEV_ZONED is not
enabled, or do we want to fail the IO?

>
>>  	default:
>>  		return BLK_STS_IOERR;
>>  	}
>> @@ -612,7 +737,8 @@ static void ublk_complete_rq(struct request *req)
>>  	 *
>>  	 * Both the two needn't unmap.
>>  	 */
>> -	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) {
>> +	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
>> +	    req_op(req) != REQ_OP_DRV_IN) {
>>  		blk_mq_end_request(req, BLK_STS_OK);
>>  		return;
>>  	}
>> @@ -1535,6 +1661,15 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
>>  	if (ret)
>>  		goto out_put_disk;
>>  
>> +	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>> +	    ub->dev_info.flags & UBLK_F_ZONED) {
>> +		disk_set_zoned(disk, BLK_ZONED_HM);
>> +		blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
>> +		ret = ublk_revalidate_disk_zones(disk);
>> +		if (ret)
>> +			goto out_put_disk;
>> +	}
>> +
>>  	get_device(&ub->cdev_dev);
>>  	ret = add_disk(disk);
>>  	if (ret) {
>> @@ -1673,6 +1808,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
>>  	if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
>>  		ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
>>  
>> +	if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>> +		ub->dev_info.flags &= ~UBLK_F_ZONED;
>> +
>>  	/* We are not ready to support zero copy */
>>  	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
>>  
>> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
>> index 8f88e3a29998..074b97821575 100644
>> --- a/include/uapi/linux/ublk_cmd.h
>> +++ b/include/uapi/linux/ublk_cmd.h
>> @@ -78,6 +78,10 @@
>>  #define UBLK_F_USER_RECOVERY	(1UL << 3)
>>  
>>  #define UBLK_F_USER_RECOVERY_REISSUE	(1UL << 4)
>> +/*
>> + * Enable zoned device support
>> + */
>> +#define UBLK_F_ZONED (1ULL << 5)
>>  
>>  /* device state */
>>  #define UBLK_S_DEV_DEAD	0
>> @@ -129,6 +133,12 @@ struct ublksrv_ctrl_dev_info {
>>  #define		UBLK_IO_OP_DISCARD	3
>>  #define		UBLK_IO_OP_WRITE_SAME	4
>>  #define		UBLK_IO_OP_WRITE_ZEROES	5
>> +#define		UBLK_IO_OP_ZONE_OPEN		10
>> +#define		UBLK_IO_OP_ZONE_CLOSE		11
>> +#define		UBLK_IO_OP_ZONE_FINISH		12
>> +#define		UBLK_IO_OP_ZONE_APPEND		13
>> +#define		UBLK_IO_OP_ZONE_RESET		15
>> +#define		UBLK_IO_OP_DRV_IN		34
>>  
>>  #define		UBLK_IO_F_FAILFAST_DEV		(1U << 8)
>>  #define		UBLK_IO_F_FAILFAST_TRANSPORT	(1U << 9)
>> @@ -214,6 +224,12 @@ struct ublk_param_discard {
>>  	__u16	reserved0;
>>  };
>>  
>> +struct ublk_param_zoned {
>> +	__u64	max_open_zones;
>> +	__u64	max_active_zones;
>> +	__u64	max_append_size;
>> +};
>
> Is the above zoned parameter enough for future extension?
> Does ZNS need extra parameter? Or some zoned new(important) features?

@Damien, @Hans, @Matias, what do you think?

>
> I highly suggest to reserve some fields for extension, given
> it is one ABI interface, which is supposed to be defined well
> enough from the beginning.

How many bytes would you reserve?

Thanks!

Best regards,
Andreas
  
Damien Le Moal March 2, 2023, 8:19 a.m. UTC | #11
On 3/2/23 16:31, Andreas Hindborg wrote:
>>> +struct ublk_param_zoned {
>>> +	__u64	max_open_zones;
>>> +	__u64	max_active_zones;
>>> +	__u64	max_append_size;
>>> +};
>>
>> Is the above zoned parameter enough for future extension?
>> Does ZNS need extra parameter? Or some zoned new(important) features?
> 
> @Damien, @Hans, @Matias, what do you think?

Yes, add some reserved fields. The struct is 24 B for now, you can make it 32 B.
But it is a little odd: why 64 bits for max open/active zones and max append ?
bio len is 32 bits and number of zones also 32 bits. You do not need 64 bits for
all these fields. Also, no zone model reported with this ? What about write
granularity (for SMR HDDs) too ?

So something like:

struct ublk_param_zoned {
	__u32	model;
	__u32	write_granularity;
	__u32	max_open_zones;
	__u32	max_active_zones;
	__u32	max_append_size;
	__u8	reserved[12];
};

looks better to me. Note sure about the need for model and write_granularity
here though. I did not follow zoned ublk patches.
  
Ming Lei March 2, 2023, 8:32 a.m. UTC | #12
On Thu, Mar 02, 2023 at 08:31:07AM +0100, Andreas Hindborg wrote:
> 
> Hi Ming,
> 
> Ming Lei <ming.lei@redhat.com> writes:
> 
> > On Fri, Feb 24, 2023 at 09:05:01PM +0100, Andreas Hindborg wrote:
> >> Add zoned storage support to ublk: report_zones and operations:
> >>  - REQ_OP_ZONE_OPEN
> >>  - REQ_OP_ZONE_CLOSE
> >>  - REQ_OP_ZONE_FINISH
> >>  - REQ_OP_ZONE_RESET
> >> 
> >> This allows implementation of zoned storage devices in user space. An
> >> example user space implementation based on ubdsrv is available [1].
> >> 
> >> [1] https://github.com/metaspace/ubdsrv/commit/14a2b708f74f70cfecb076d92e680dc718cc1f6d
> >
> > As I suggested, please write one simple & clean zoned target for verifying
> > the interface, and better to not add to tgt_null.c.
> 
> For ubdsrv, I understand that you prefer to reimplement null and loop targets for
> zoned storage. I don't understand why you think this is a good idea,
> since we will have massive code duplication. This would be comparable to
> having a separate null_blk driver for zoned storage. Am I understanding
> you correctly?

It is userspace, code duplication isn't a big deal, but putting lots
of un-related things together is just one mess.

> 
> Anyway, I think we can discuss the kernel patch even though the ubdsrv
> implementation is not a separate target yet? The code would be almost
> identical, it would just live in a separate translation unit.

I just suggest to make the userspace test code clean & easy, which plays
big role for looking kernel interface from user side.

> 
> >
> >> 
> >> Signed-off-by: Andreas Hindborg <a.hindborg@samsung.com>
> >> ---
> >> Changes since v1:
> >>  - Fixed conditional compilation bug
> >>  - Refactored to collect conditional code additions together
> >>  - Fixed style errors
> >>  - Zero stack allocated value used for zone report
> >> 
> >> Reported-by: Niklas Cassel <Niklas.Cassel@wdc.com>
> >> Reported-by: kernel test robot <lkp@intel.com>
> >> Link: https://lore.kernel.org/oe-kbuild-all/202302250222.XOrw9j6z-lkp@intel.com/
> >> v1: https://lore.kernel.org/linux-block/20230224125950.214779-1-nmi@metaspace.dk/
> >> 
> >>  drivers/block/ublk_drv.c      | 150 ++++++++++++++++++++++++++++++++--
> >>  include/uapi/linux/ublk_cmd.h |  18 ++++
> >>  2 files changed, 162 insertions(+), 6 deletions(-)
> >> 
> >> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> >> index 6368b56eacf1..37e516903867 100644
> >> --- a/drivers/block/ublk_drv.c
> >> +++ b/drivers/block/ublk_drv.c
> >> @@ -20,6 +20,7 @@
> >>  #include <linux/major.h>
> >>  #include <linux/wait.h>
> >>  #include <linux/blkdev.h>
> >> +#include <linux/blkzoned.h>
> >>  #include <linux/init.h>
> >>  #include <linux/swap.h>
> >>  #include <linux/slab.h>
> >> @@ -51,10 +52,12 @@
> >>  		| UBLK_F_URING_CMD_COMP_IN_TASK \
> >>  		| UBLK_F_NEED_GET_DATA \
> >>  		| UBLK_F_USER_RECOVERY \
> >> -		| UBLK_F_USER_RECOVERY_REISSUE)
> >> +		| UBLK_F_USER_RECOVERY_REISSUE \
> >> +		| UBLK_F_ZONED)
> >>  
> >>  /* All UBLK_PARAM_TYPE_* should be included here */
> >> -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
> >> +#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD \
> >> +			     | UBLK_PARAM_TYPE_ZONED)
> >>  
> >>  struct ublk_rq_data {
> >>  	struct llist_node node;
> >> @@ -187,6 +190,98 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
> >>  
> >>  static struct miscdevice ublk_misc;
> >>  
> >> +#ifdef CONFIG_BLK_DEV_ZONED
> >> +static void ublk_set_nr_zones(struct ublk_device *ub)
> >> +{
> >> +	const struct ublk_param_basic *p = &ub->params.basic;
> >> +
> >> +	if (ub->dev_info.flags & UBLK_F_ZONED && p->chunk_sectors)
> >> +		ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
> >> +}
> >> +
> >> +static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
> >> +{
> >> +	const struct ublk_param_zoned *p = &ub->params.zoned;
> >> +
> >> +	if (ub->dev_info.flags & UBLK_F_ZONED) {
> >> +		disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
> >> +		disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
> >> +	}
> >> +}
> >> +
> >> +static int ublk_revalidate_disk_zones(struct gendisk *disk)
> >> +{
> >> +	return blk_revalidate_disk_zones(disk, NULL);
> >> +}
> >> +
> >> +static int ublk_report_zones(struct gendisk *disk, sector_t sector,
> >> +			     unsigned int nr_zones, report_zones_cb cb,
> >> +			     void *data)
> >> +{
> >> +	struct ublk_device *ub;
> >> +	unsigned int zone_size;
> >> +	unsigned int first_zone;
> >> +	int ret = 0;
> >> +
> >> +	ub = disk->private_data;
> >> +
> >> +	if (!(ub->dev_info.flags & UBLK_F_ZONED))
> >> +		return -EINVAL;
> >> +
> >> +	zone_size = disk->queue->limits.chunk_sectors;
> >> +	first_zone = sector >> ilog2(zone_size);
> >> +	nr_zones = min(ub->ub_disk->nr_zones - first_zone, nr_zones);
> >> +
> >> +	for (unsigned int i = 0; i < nr_zones; i++) {
> >
> > The local variable 'i' needs to be declared in the front part
> > of this function body.
> 
> Ok.
> 
> >
> >> +		struct request *req;
> >> +		blk_status_t status;
> >> +		struct blk_zone info = {0};
> >> +
> >> +		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
> >> +
> >> +		if (IS_ERR(req)) {
> >> +			ret = PTR_ERR(req);
> >> +			goto out;
> >> +		}
> >> +
> >> +		req->__sector = sector;
> >
> > Why is req->__sector set?
> 
> I use it to carry information about the first zone of the report request.

No, please do not use __sector in this way, which is supposed to be used by
driver for ZNS/ZONE_APPEND only.

> 
> >
> >> +
> >> +		ret = blk_rq_map_kern(disk->queue, req, &info, sizeof(info),
> >> +				      GFP_KERNEL);
> >> +
> >> +		if (ret)
> >> +			goto out;
> >> +
> >> +		status = blk_execute_rq(req, 0);
> >> +		ret = blk_status_to_errno(status);
> >> +		if (ret)
> >> +			goto out;
> >> +
> >> +		blk_mq_free_request(req);
> >> +
> >> +		ret = cb(&info, i, data);
> >> +		if (ret)
> >> +			goto out;
> >> +
> >> +		/* A zero length zone means don't ask for more zones */
> >> +		if (!info.len) {
> >> +			nr_zones = i;
> >> +			break;
> >> +		}
> >> +
> >> +		sector += zone_size;
> >> +	}
> >
> > I'd suggest to report as many as possible zones in one command, and
> > the dev_info.max_io_buf_bytes is the max allowed buffer size for one
> > command, please refer to nvme_ns_report_zones().
> 
> I agree about fetching more zones. However, it is no good to fetch up to
> a max, since the requested zone report may less than max. I was

Short read should always be supported, so the interface may need to 
return how many zones in single command, please refer to nvme_ns_report_zones().

> considering overloading req->__data_len and iod->nr_sectors to convey
> the number of requested zones. What do you think about that?

Any field pre-suffixed with "__" in 'struct request' should be only for
block layer, but __sector is one exception and its scenario is obvious.

So please do not misuse req->__data_len, but using iod is fine.

> 
> >
> > Also we are going to extend ublk in the multiple LUN/NS style, and I
> > guess that won't be one issue since ->report_zones() is always done on
> > disk level, right?
> 
> Yes, that should be fine.
> 
> >
> >> +	ret = nr_zones;
> >> +
> >> + out:
> >> +	return ret;
> >> +}
> >> +#else
> >> +void ublk_set_nr_zones(struct ublk_device *ub);
> >> +void ublk_dev_param_zoned_apply(struct ublk_device *ub);
> >> +int ublk_revalidate_disk_zones(struct gendisk *disk);
> >> +#endif
> >> +
> >>  static void ublk_dev_param_basic_apply(struct ublk_device *ub)
> >>  {
> >>  	struct request_queue *q = ub->ub_disk->queue;
> >> @@ -212,6 +307,9 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
> >>  		set_disk_ro(ub->ub_disk, true);
> >>  
> >>  	set_capacity(ub->ub_disk, p->dev_sectors);
> >> +
> >> +	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
> >> +		ublk_set_nr_zones(ub);
> >>  }
> >>  
> >>  static void ublk_dev_param_discard_apply(struct ublk_device *ub)
> >> @@ -268,6 +366,9 @@ static int ublk_apply_params(struct ublk_device *ub)
> >>  	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
> >>  		ublk_dev_param_discard_apply(ub);
> >>  
> >> +	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (ub->params.types & UBLK_PARAM_TYPE_ZONED))
> >> +		ublk_dev_param_zoned_apply(ub);
> >> +
> >>  	return 0;
> >>  }
> >>  
> >> @@ -361,9 +462,13 @@ static void ublk_free_disk(struct gendisk *disk)
> >>  	put_device(&ub->cdev_dev);
> >>  }
> >>  
> >> +
> >>  static const struct block_device_operations ub_fops = {
> >> -	.owner =	THIS_MODULE,
> >> -	.free_disk =	ublk_free_disk,
> >> +	.owner = THIS_MODULE,
> >> +	.free_disk = ublk_free_disk,
> >> +#ifdef CONFIG_BLK_DEV_ZONED
> >> +	.report_zones = ublk_report_zones,
> >> +#endif
> >
> > Define one null ublk_report_zones in #else branch of the above #ifdef, then we
> > can save one #ifdef.
> 
> I would have to define it as a null pointer in the #else case then?

#define ublk_report_zones       NULL

> 
> >
> >>  };
> >>  
> >>  #define UBLK_MAX_PIN_PAGES	32
> >> @@ -499,7 +604,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
> >>  {
> >>  	const unsigned int rq_bytes = blk_rq_bytes(req);
> >>  
> >> -	if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
> >> +	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && ublk_rq_has_data(req)) {
> >>  		struct ublk_map_data data = {
> >>  			.ubq	=	ubq,
> >>  			.rq	=	req,
> >> @@ -566,6 +671,26 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
> >>  	case REQ_OP_WRITE_ZEROES:
> >>  		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
> >>  		break;
> >> +#ifdef CONFIG_BLK_DEV_ZONED
> >> +	case REQ_OP_ZONE_OPEN:
> >> +		ublk_op = UBLK_IO_OP_ZONE_OPEN;
> >> +		break;
> >> +	case REQ_OP_ZONE_CLOSE:
> >> +		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
> >> +		break;
> >> +	case REQ_OP_ZONE_FINISH:
> >> +		ublk_op = UBLK_IO_OP_ZONE_FINISH;
> >> +		break;
> >> +	case REQ_OP_ZONE_RESET:
> >> +		ublk_op = UBLK_IO_OP_ZONE_RESET;
> >> +		break;
> >> +	case REQ_OP_DRV_IN:
> >> +		ublk_op = UBLK_IO_OP_DRV_IN;
> >> +		break;
> >> +	case REQ_OP_ZONE_APPEND:
> >> +		/* We do not support zone append yet */
> >> +		fallthrough;
> >> +#endif
> >
> > The above '#ifdef' is needn't, since  OP_ZONE should be defined no
> > matter if CONFIG_BLK_DEV_ZONED is enabled or not.
> 
> I see. But do we want to process the requests if BLK_DEV_ZONED is not
> enabled, or do we want to fail the IO?

Here we should trust block core code.

> 
> >
> >>  	default:
> >>  		return BLK_STS_IOERR;
> >>  	}
> >> @@ -612,7 +737,8 @@ static void ublk_complete_rq(struct request *req)
> >>  	 *
> >>  	 * Both the two needn't unmap.
> >>  	 */
> >> -	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) {
> >> +	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
> >> +	    req_op(req) != REQ_OP_DRV_IN) {
> >>  		blk_mq_end_request(req, BLK_STS_OK);
> >>  		return;
> >>  	}
> >> @@ -1535,6 +1661,15 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
> >>  	if (ret)
> >>  		goto out_put_disk;
> >>  
> >> +	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> >> +	    ub->dev_info.flags & UBLK_F_ZONED) {
> >> +		disk_set_zoned(disk, BLK_ZONED_HM);
> >> +		blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
> >> +		ret = ublk_revalidate_disk_zones(disk);
> >> +		if (ret)
> >> +			goto out_put_disk;
> >> +	}
> >> +
> >>  	get_device(&ub->cdev_dev);
> >>  	ret = add_disk(disk);
> >>  	if (ret) {
> >> @@ -1673,6 +1808,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
> >>  	if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
> >>  		ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
> >>  
> >> +	if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
> >> +		ub->dev_info.flags &= ~UBLK_F_ZONED;
> >> +
> >>  	/* We are not ready to support zero copy */
> >>  	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
> >>  
> >> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
> >> index 8f88e3a29998..074b97821575 100644
> >> --- a/include/uapi/linux/ublk_cmd.h
> >> +++ b/include/uapi/linux/ublk_cmd.h
> >> @@ -78,6 +78,10 @@
> >>  #define UBLK_F_USER_RECOVERY	(1UL << 3)
> >>  
> >>  #define UBLK_F_USER_RECOVERY_REISSUE	(1UL << 4)
> >> +/*
> >> + * Enable zoned device support
> >> + */
> >> +#define UBLK_F_ZONED (1ULL << 5)
> >>  
> >>  /* device state */
> >>  #define UBLK_S_DEV_DEAD	0
> >> @@ -129,6 +133,12 @@ struct ublksrv_ctrl_dev_info {
> >>  #define		UBLK_IO_OP_DISCARD	3
> >>  #define		UBLK_IO_OP_WRITE_SAME	4
> >>  #define		UBLK_IO_OP_WRITE_ZEROES	5
> >> +#define		UBLK_IO_OP_ZONE_OPEN		10
> >> +#define		UBLK_IO_OP_ZONE_CLOSE		11
> >> +#define		UBLK_IO_OP_ZONE_FINISH		12
> >> +#define		UBLK_IO_OP_ZONE_APPEND		13
> >> +#define		UBLK_IO_OP_ZONE_RESET		15
> >> +#define		UBLK_IO_OP_DRV_IN		34
> >>  
> >>  #define		UBLK_IO_F_FAILFAST_DEV		(1U << 8)
> >>  #define		UBLK_IO_F_FAILFAST_TRANSPORT	(1U << 9)
> >> @@ -214,6 +224,12 @@ struct ublk_param_discard {
> >>  	__u16	reserved0;
> >>  };
> >>  
> >> +struct ublk_param_zoned {
> >> +	__u64	max_open_zones;
> >> +	__u64	max_active_zones;
> >> +	__u64	max_append_size;
> >> +};
> >
> > Is the above zoned parameter enough for future extension?
> > Does ZNS need extra parameter? Or some zoned new(important) features?
> 
> @Damien, @Hans, @Matias, what do you think?
> 
> >
> > I highly suggest to reserve some fields for extension, given
> > it is one ABI interface, which is supposed to be defined well
> > enough from the beginning.
> 
> How many bytes would you reserve?

It really depends on storage/zoned field knowledge.


Thanks,
Ming
  
Ming Lei March 2, 2023, 9:01 a.m. UTC | #13
On Thu, Mar 02, 2023 at 04:32:21PM +0800, Ming Lei wrote:
> On Thu, Mar 02, 2023 at 08:31:07AM +0100, Andreas Hindborg wrote:
> > 

...

> > 
> > I agree about fetching more zones. However, it is no good to fetch up to
> > a max, since the requested zone report may less than max. I was
> 
> Short read should always be supported, so the interface may need to 
> return how many zones in single command, please refer to nvme_ns_report_zones().

blk_zone is part of uapi, maybe the short read can be figured out by
one all-zeroed 'blk_zone'?  then no extra uapi data is needed for
reporting zones.


thanks,
Ming
  
Ming Lei March 2, 2023, 9:14 a.m. UTC | #14
On Thu, Mar 2, 2023 at 5:02 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Thu, Mar 02, 2023 at 04:32:21PM +0800, Ming Lei wrote:
> > On Thu, Mar 02, 2023 at 08:31:07AM +0100, Andreas Hindborg wrote:
> > >
>
> ...
>
> > >
> > > I agree about fetching more zones. However, it is no good to fetch up to
> > > a max, since the requested zone report may less than max. I was
> >
> > Short read should always be supported, so the interface may need to
> > return how many zones in single command, please refer to nvme_ns_report_zones().
>
> blk_zone is part of uapi, maybe the short read can be figured out by
> one all-zeroed 'blk_zone'?  then no extra uapi data is needed for
> reporting zones.

oops, we have blk_zone_report data for reporting zones to userspace already,
see blkdev_report_zones_ioctl(), then this way can be re-used for getting zone
report from ublk server too, right?

Thanks,
Ming
  
Andreas Hindborg March 2, 2023, 10:07 a.m. UTC | #15
Ming Lei <ming.lei@redhat.com> writes:

> On Thu, Mar 2, 2023 at 5:02 PM Ming Lei <ming.lei@redhat.com> wrote:
>>
>> On Thu, Mar 02, 2023 at 04:32:21PM +0800, Ming Lei wrote:
>> > On Thu, Mar 02, 2023 at 08:31:07AM +0100, Andreas Hindborg wrote:
>> > >
>>
>> ...
>>
>> > >
>> > > I agree about fetching more zones. However, it is no good to fetch up to
>> > > a max, since the requested zone report may less than max. I was
>> >
>> > Short read should always be supported, so the interface may need to
>> > return how many zones in single command, please refer to nvme_ns_report_zones().
>>
>> blk_zone is part of uapi, maybe the short read can be figured out by
>> one all-zeroed 'blk_zone'?  then no extra uapi data is needed for
>> reporting zones.
>
> oops, we have blk_zone_report data for reporting zones to userspace already,
> see blkdev_report_zones_ioctl(), then this way can be re-used for getting zone
> report from ublk server too, right?

Yes that would be nice. But I did the report_zone command like a read
operation, so we are not currently copying any buffers to user space
when issuing the command, we just rely on the iod. I think it would be
better to use the start_sectors and nr_sectors of the iod instead. Then
we don't have to copy the blk_zone_report. What do you think?

BR Andreas
  
Ming Lei March 2, 2023, 1:16 p.m. UTC | #16
On Thu, Mar 02, 2023 at 11:07:15AM +0100, Andreas Hindborg wrote:
> 
> Ming Lei <ming.lei@redhat.com> writes:
> 
> > On Thu, Mar 2, 2023 at 5:02 PM Ming Lei <ming.lei@redhat.com> wrote:
> >>
> >> On Thu, Mar 02, 2023 at 04:32:21PM +0800, Ming Lei wrote:
> >> > On Thu, Mar 02, 2023 at 08:31:07AM +0100, Andreas Hindborg wrote:
> >> > >
> >>
> >> ...
> >>
> >> > >
> >> > > I agree about fetching more zones. However, it is no good to fetch up to
> >> > > a max, since the requested zone report may less than max. I was
> >> >
> >> > Short read should always be supported, so the interface may need to
> >> > return how many zones in single command, please refer to nvme_ns_report_zones().
> >>
> >> blk_zone is part of uapi, maybe the short read can be figured out by
> >> one all-zeroed 'blk_zone'?  then no extra uapi data is needed for
> >> reporting zones.
> >
> > oops, we have blk_zone_report data for reporting zones to userspace already,
> > see blkdev_report_zones_ioctl(), then this way can be re-used for getting zone
> > report from ublk server too, right?
> 
> Yes that would be nice. But I did the report_zone command like a read
> operation, so we are not currently copying any buffers to user space
> when issuing the command, we just rely on the iod.

What I meant is to reuse the format of blk_zone_report for returning
multiple 'blk_zone' info in single command.

The only change is that you need to allocate one bigger kernel buffer
to hold more 'blk_zone' in single report zone request.

> I think it would be
> better to use the start_sectors and nr_sectors of the iod instead. Then
> we don't have to copy the blk_zone_report. What do you think?

For IN parameter of report zone command, you still can reuse
blk_zone_report:

struct blk_zone_report {
        __u64           sector;
        __u32           nr_zones;
        __u32           flags;
};

Just by using the 1st two 64b words of iod for holding 'blk_zone_report', and
keep the iod->addr field not touched.


[1] https://lore.kernel.org/linux-block/20230301140611.163055-1-ming.lei@redhat.com/T/#md36358552d45a7d563940632d4c779a99f72916d

Thanks,
Ming
  
Andreas Hindborg March 2, 2023, 1:28 p.m. UTC | #17
Ming Lei <ming.lei@redhat.com> writes:

> On Thu, Mar 02, 2023 at 11:07:15AM +0100, Andreas Hindborg wrote:
>> 
>> Ming Lei <ming.lei@redhat.com> writes:
>> 
>> > On Thu, Mar 2, 2023 at 5:02 PM Ming Lei <ming.lei@redhat.com> wrote:
>> >>
>> >> On Thu, Mar 02, 2023 at 04:32:21PM +0800, Ming Lei wrote:
>> >> > On Thu, Mar 02, 2023 at 08:31:07AM +0100, Andreas Hindborg wrote:
>> >> > >
>> >>
>> >> ...
>> >>
>> >> > >
>> >> > > I agree about fetching more zones. However, it is no good to fetch up to
>> >> > > a max, since the requested zone report may less than max. I was
>> >> >
>> >> > Short read should always be supported, so the interface may need to
>> >> > return how many zones in single command, please refer to nvme_ns_report_zones().
>> >>
>> >> blk_zone is part of uapi, maybe the short read can be figured out by
>> >> one all-zeroed 'blk_zone'?  then no extra uapi data is needed for
>> >> reporting zones.
>> >
>> > oops, we have blk_zone_report data for reporting zones to userspace already,
>> > see blkdev_report_zones_ioctl(), then this way can be re-used for getting zone
>> > report from ublk server too, right?
>> 
>> Yes that would be nice. But I did the report_zone command like a read
>> operation, so we are not currently copying any buffers to user space
>> when issuing the command, we just rely on the iod.
>
> What I meant is to reuse the format of blk_zone_report for returning
> multiple 'blk_zone' info in single command.
>
> The only change is that you need to allocate one bigger kernel buffer
> to hold more 'blk_zone' in single report zone request.
>
>> I think it would be
>> better to use the start_sectors and nr_sectors of the iod instead. Then
>> we don't have to copy the blk_zone_report. What do you think?
>
> For IN parameter of report zone command, you still can reuse
> blk_zone_report:
>
> struct blk_zone_report {
>         __u64           sector;
>         __u32           nr_zones;
>         __u32           flags;
> };
>
> Just by using the 1st two 64b words of iod for holding 'blk_zone_report', and
> keep the iod->addr field not touched.

I see. Would you make the first part of `struct ublksrv_io_desc` a union
for this, or would you just cast it at the use site?

BR Andreas
  
Ming Lei March 3, 2023, 2:59 a.m. UTC | #18
On Thu, Mar 02, 2023 at 02:28:33PM +0100, Andreas Hindborg wrote:
> 
> Ming Lei <ming.lei@redhat.com> writes:
> 
> > On Thu, Mar 02, 2023 at 11:07:15AM +0100, Andreas Hindborg wrote:
> >> 
> >> Ming Lei <ming.lei@redhat.com> writes:
> >> 
> >> > On Thu, Mar 2, 2023 at 5:02 PM Ming Lei <ming.lei@redhat.com> wrote:
> >> >>
> >> >> On Thu, Mar 02, 2023 at 04:32:21PM +0800, Ming Lei wrote:
> >> >> > On Thu, Mar 02, 2023 at 08:31:07AM +0100, Andreas Hindborg wrote:
> >> >> > >
> >> >>
> >> >> ...
> >> >>
> >> >> > >
> >> >> > > I agree about fetching more zones. However, it is no good to fetch up to
> >> >> > > a max, since the requested zone report may less than max. I was
> >> >> >
> >> >> > Short read should always be supported, so the interface may need to
> >> >> > return how many zones in single command, please refer to nvme_ns_report_zones().
> >> >>
> >> >> blk_zone is part of uapi, maybe the short read can be figured out by
> >> >> one all-zeroed 'blk_zone'?  then no extra uapi data is needed for
> >> >> reporting zones.
> >> >
> >> > oops, we have blk_zone_report data for reporting zones to userspace already,
> >> > see blkdev_report_zones_ioctl(), then this way can be re-used for getting zone
> >> > report from ublk server too, right?
> >> 
> >> Yes that would be nice. But I did the report_zone command like a read
> >> operation, so we are not currently copying any buffers to user space
> >> when issuing the command, we just rely on the iod.
> >
> > What I meant is to reuse the format of blk_zone_report for returning
> > multiple 'blk_zone' info in single command.
> >
> > The only change is that you need to allocate one bigger kernel buffer
> > to hold more 'blk_zone' in single report zone request.
> >
> >> I think it would be
> >> better to use the start_sectors and nr_sectors of the iod instead. Then
> >> we don't have to copy the blk_zone_report. What do you think?
> >
> > For IN parameter of report zone command, you still can reuse
> > blk_zone_report:
> >
> > struct blk_zone_report {
> >         __u64           sector;
> >         __u32           nr_zones;
> >         __u32           flags;
> > };
> >
> > Just by using the 1st two 64b words of iod for holding 'blk_zone_report', and
> > keep the iod->addr field not touched.
> 
> I see. Would you make the first part of `struct ublksrv_io_desc` a union
> for this, or would you just cast it at the use site?

oops, you still need iod->op_flags for recognizing the io op, so just
start_sector and nr_sectors can be used.

However, this way isn't good too, cause UBLK_IO_OP_DRV_IN is just mapped
to 'report zone' command in your implementation, what if new pt request
is required in future?

We need to think about how to support ublk pt request in generic way.

Thanks,
Ming
  
Andreas Hindborg March 3, 2023, 8:27 a.m. UTC | #19
Ming Lei <ming.lei@redhat.com> writes:

> On Thu, Mar 02, 2023 at 02:28:33PM +0100, Andreas Hindborg wrote:
>> 
>> Ming Lei <ming.lei@redhat.com> writes:
>> 
>> > On Thu, Mar 02, 2023 at 11:07:15AM +0100, Andreas Hindborg wrote:
>> >> 
>> >> Ming Lei <ming.lei@redhat.com> writes:
>> >> 
>> >> > On Thu, Mar 2, 2023 at 5:02 PM Ming Lei <ming.lei@redhat.com> wrote:
>> >> >>
>> >> >> On Thu, Mar 02, 2023 at 04:32:21PM +0800, Ming Lei wrote:
>> >> >> > On Thu, Mar 02, 2023 at 08:31:07AM +0100, Andreas Hindborg wrote:
>> >> >> > >
>> >> >>
>> >> >> ...
>> >> >>
>> >> >> > >
>> >> >> > > I agree about fetching more zones. However, it is no good to fetch up to
>> >> >> > > a max, since the requested zone report may less than max. I was
>> >> >> >
>> >> >> > Short read should always be supported, so the interface may need to
>> >> >> > return how many zones in single command, please refer to nvme_ns_report_zones().
>> >> >>
>> >> >> blk_zone is part of uapi, maybe the short read can be figured out by
>> >> >> one all-zeroed 'blk_zone'?  then no extra uapi data is needed for
>> >> >> reporting zones.
>> >> >
>> >> > oops, we have blk_zone_report data for reporting zones to userspace already,
>> >> > see blkdev_report_zones_ioctl(), then this way can be re-used for getting zone
>> >> > report from ublk server too, right?
>> >> 
>> >> Yes that would be nice. But I did the report_zone command like a read
>> >> operation, so we are not currently copying any buffers to user space
>> >> when issuing the command, we just rely on the iod.
>> >
>> > What I meant is to reuse the format of blk_zone_report for returning
>> > multiple 'blk_zone' info in single command.
>> >
>> > The only change is that you need to allocate one bigger kernel buffer
>> > to hold more 'blk_zone' in single report zone request.
>> >
>> >> I think it would be
>> >> better to use the start_sectors and nr_sectors of the iod instead. Then
>> >> we don't have to copy the blk_zone_report. What do you think?
>> >
>> > For IN parameter of report zone command, you still can reuse
>> > blk_zone_report:
>> >
>> > struct blk_zone_report {
>> >         __u64           sector;
>> >         __u32           nr_zones;
>> >         __u32           flags;
>> > };
>> >
>> > Just by using the 1st two 64b words of iod for holding 'blk_zone_report', and
>> > keep the iod->addr field not touched.
>> 
>> I see. Would you make the first part of `struct ublksrv_io_desc` a union
>> for this, or would you just cast it at the use site?
>
> oops, you still need iod->op_flags for recognizing the io op, so just
> start_sector and nr_sectors can be used.

We do not actually need to pass the flags to user space, or back from
user space to kernel for ublk zone report. They are currently used to
tell user space if the zone report contains capacity field. We could
exclude them from the ublk kabi since the zone report will always
contain capacity? But it might be good to have a flags field or future
things.

> However, this way isn't good too, cause UBLK_IO_OP_DRV_IN is just mapped
> to 'report zone' command in your implementation, what if new pt request
> is required in future?

We are currently mapping REQ_OP_* 1:1 to  UBLK_OP_OP_*. If we relax
this, we can have a UBLK_IO_OP_REPORT_ZONES.

>
> We need to think about how to support ublk pt request in generic way.

Another option is to allow REQ_OP_DRV_IN to pass a buffer to user space.
Instead of being similar to a read operation, it could be a combination of
a read and a write operation.

BR Andreas
  
Ming Lei March 3, 2023, 11:47 a.m. UTC | #20
On Fri, Mar 03, 2023 at 09:27:58AM +0100, Andreas Hindborg wrote:
> 
> Ming Lei <ming.lei@redhat.com> writes:
> 
> > On Thu, Mar 02, 2023 at 02:28:33PM +0100, Andreas Hindborg wrote:
> >> 
> >> Ming Lei <ming.lei@redhat.com> writes:
> >> 
> >> > On Thu, Mar 02, 2023 at 11:07:15AM +0100, Andreas Hindborg wrote:
> >> >> 
> >> >> Ming Lei <ming.lei@redhat.com> writes:
> >> >> 
> >> >> > On Thu, Mar 2, 2023 at 5:02 PM Ming Lei <ming.lei@redhat.com> wrote:
> >> >> >>
> >> >> >> On Thu, Mar 02, 2023 at 04:32:21PM +0800, Ming Lei wrote:
> >> >> >> > On Thu, Mar 02, 2023 at 08:31:07AM +0100, Andreas Hindborg wrote:
> >> >> >> > >
> >> >> >>
> >> >> >> ...
> >> >> >>
> >> >> >> > >
> >> >> >> > > I agree about fetching more zones. However, it is no good to fetch up to
> >> >> >> > > a max, since the requested zone report may less than max. I was
> >> >> >> >
> >> >> >> > Short read should always be supported, so the interface may need to
> >> >> >> > return how many zones in single command, please refer to nvme_ns_report_zones().
> >> >> >>
> >> >> >> blk_zone is part of uapi, maybe the short read can be figured out by
> >> >> >> one all-zeroed 'blk_zone'?  then no extra uapi data is needed for
> >> >> >> reporting zones.
> >> >> >
> >> >> > oops, we have blk_zone_report data for reporting zones to userspace already,
> >> >> > see blkdev_report_zones_ioctl(), then this way can be re-used for getting zone
> >> >> > report from ublk server too, right?
> >> >> 
> >> >> Yes that would be nice. But I did the report_zone command like a read
> >> >> operation, so we are not currently copying any buffers to user space
> >> >> when issuing the command, we just rely on the iod.
> >> >
> >> > What I meant is to reuse the format of blk_zone_report for returning
> >> > multiple 'blk_zone' info in single command.
> >> >
> >> > The only change is that you need to allocate one bigger kernel buffer
> >> > to hold more 'blk_zone' in single report zone request.
> >> >
> >> >> I think it would be
> >> >> better to use the start_sectors and nr_sectors of the iod instead. Then
> >> >> we don't have to copy the blk_zone_report. What do you think?
> >> >
> >> > For IN parameter of report zone command, you still can reuse
> >> > blk_zone_report:
> >> >
> >> > struct blk_zone_report {
> >> >         __u64           sector;
> >> >         __u32           nr_zones;
> >> >         __u32           flags;
> >> > };
> >> >
> >> > Just by using the 1st two 64b words of iod for holding 'blk_zone_report', and
> >> > keep the iod->addr field not touched.
> >> 
> >> I see. Would you make the first part of `struct ublksrv_io_desc` a union
> >> for this, or would you just cast it at the use site?
> >
> > oops, you still need iod->op_flags for recognizing the io op, so just
> > start_sector and nr_sectors can be used.
> 
> We do not actually need to pass the flags to user space, or back from
> user space to kernel for ublk zone report. They are currently used to
> tell user space if the zone report contains capacity field. We could
> exclude them from the ublk kabi since the zone report will always
> contain capacity? But it might be good to have a flags field or future
> things.
> 
> > However, this way isn't good too, cause UBLK_IO_OP_DRV_IN is just mapped
> > to 'report zone' command in your implementation, what if new pt request
> > is required in future?
> 
> We are currently mapping REQ_OP_* 1:1 to  UBLK_OP_OP_*. If we relax
> this, we can have a UBLK_IO_OP_REPORT_ZONES.

The op takes 8 bits, and enough to cover all normal block layer OPs and
driver specific OPs, so I'd suggest this way, and ublk device
specific OP can be started from 32, prefixed with

	UBLK_IO_OP_DRV_IN				//[32, 96)
or
	UBLK_IO_OP_DRV_OUT				//[96, 160)

such as, report zones can be defined as

	enum {
		__UBLK_IO_OP_DRV_IN_START = 32,
		UBLK_IO_OP_DRV_IN_REPORT_ZONES = __UBLK_IO_OP_DRV_IN_START,
		__UBLK_IO_OP_DRV_IN_END = 96,

		__UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
		__UBLK_IO_OP_DRV_OUT_END = 160,
	};

For any DRV OPs, iod header(not include ->addr) and buffer format can be re-defined
as uapi structure.

What do you think of this way?

> 
> >
> > We need to think about how to support ublk pt request in generic way.
> 
> Another option is to allow REQ_OP_DRV_IN to pass a buffer to user space.
> Instead of being similar to a read operation, it could be a combination of
> a read and a write operation.

That might be more flexible, but could add driver & userspace's
complexity, so I'd suggest to try to avoid bidirectional buffer asap,
but we still reserve support for it via UBLK_IO_OP_DRV_IN_OUT*.

Thanks,
Ming
  

Patch

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 6368b56eacf1..37e516903867 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -20,6 +20,7 @@ 
 #include <linux/major.h>
 #include <linux/wait.h>
 #include <linux/blkdev.h>
+#include <linux/blkzoned.h>
 #include <linux/init.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
@@ -51,10 +52,12 @@ 
 		| UBLK_F_URING_CMD_COMP_IN_TASK \
 		| UBLK_F_NEED_GET_DATA \
 		| UBLK_F_USER_RECOVERY \
-		| UBLK_F_USER_RECOVERY_REISSUE)
+		| UBLK_F_USER_RECOVERY_REISSUE \
+		| UBLK_F_ZONED)
 
 /* All UBLK_PARAM_TYPE_* should be included here */
-#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
+#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD \
+			     | UBLK_PARAM_TYPE_ZONED)
 
 struct ublk_rq_data {
 	struct llist_node node;
@@ -187,6 +190,98 @@  static DEFINE_MUTEX(ublk_ctl_mutex);
 
 static struct miscdevice ublk_misc;
 
+#ifdef CONFIG_BLK_DEV_ZONED
+static void ublk_set_nr_zones(struct ublk_device *ub)
+{
+	const struct ublk_param_basic *p = &ub->params.basic;
+
+	if (ub->dev_info.flags & UBLK_F_ZONED && p->chunk_sectors)
+		ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
+}
+
+static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
+{
+	const struct ublk_param_zoned *p = &ub->params.zoned;
+
+	if (ub->dev_info.flags & UBLK_F_ZONED) {
+		disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
+		disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
+	}
+}
+
+static int ublk_revalidate_disk_zones(struct gendisk *disk)
+{
+	return blk_revalidate_disk_zones(disk, NULL);
+}
+
+static int ublk_report_zones(struct gendisk *disk, sector_t sector,
+			     unsigned int nr_zones, report_zones_cb cb,
+			     void *data)
+{
+	struct ublk_device *ub;
+	unsigned int zone_size;
+	unsigned int first_zone;
+	int ret = 0;
+
+	ub = disk->private_data;
+
+	if (!(ub->dev_info.flags & UBLK_F_ZONED))
+		return -EINVAL;
+
+	zone_size = disk->queue->limits.chunk_sectors;
+	first_zone = sector >> ilog2(zone_size);
+	nr_zones = min(ub->ub_disk->nr_zones - first_zone, nr_zones);
+
+	for (unsigned int i = 0; i < nr_zones; i++) {
+		struct request *req;
+		blk_status_t status;
+		struct blk_zone info = {0};
+
+		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
+
+		if (IS_ERR(req)) {
+			ret = PTR_ERR(req);
+			goto out;
+		}
+
+		req->__sector = sector;
+
+		ret = blk_rq_map_kern(disk->queue, req, &info, sizeof(info),
+				      GFP_KERNEL);
+
+		if (ret)
+			goto out;
+
+		status = blk_execute_rq(req, 0);
+		ret = blk_status_to_errno(status);
+		if (ret)
+			goto out;
+
+		blk_mq_free_request(req);
+
+		ret = cb(&info, i, data);
+		if (ret)
+			goto out;
+
+		/* A zero length zone means don't ask for more zones */
+		if (!info.len) {
+			nr_zones = i;
+			break;
+		}
+
+		sector += zone_size;
+	}
+	ret = nr_zones;
+
+ out:
+	return ret;
+}
+#else
+void ublk_set_nr_zones(struct ublk_device *ub);
+void ublk_dev_param_zoned_apply(struct ublk_device *ub);
+int ublk_revalidate_disk_zones(struct gendisk *disk);
+#endif
+
 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
 {
 	struct request_queue *q = ub->ub_disk->queue;
@@ -212,6 +307,9 @@  static void ublk_dev_param_basic_apply(struct ublk_device *ub)
 		set_disk_ro(ub->ub_disk, true);
 
 	set_capacity(ub->ub_disk, p->dev_sectors);
+
+	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
+		ublk_set_nr_zones(ub);
 }
 
 static void ublk_dev_param_discard_apply(struct ublk_device *ub)
@@ -268,6 +366,9 @@  static int ublk_apply_params(struct ublk_device *ub)
 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
 		ublk_dev_param_discard_apply(ub);
 
+	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (ub->params.types & UBLK_PARAM_TYPE_ZONED))
+		ublk_dev_param_zoned_apply(ub);
+
 	return 0;
 }
 
@@ -361,9 +462,13 @@  static void ublk_free_disk(struct gendisk *disk)
 	put_device(&ub->cdev_dev);
 }
 
+
 static const struct block_device_operations ub_fops = {
-	.owner =	THIS_MODULE,
-	.free_disk =	ublk_free_disk,
+	.owner = THIS_MODULE,
+	.free_disk = ublk_free_disk,
+#ifdef CONFIG_BLK_DEV_ZONED
+	.report_zones = ublk_report_zones,
+#endif
 };
 
 #define UBLK_MAX_PIN_PAGES	32
@@ -499,7 +604,7 @@  static int ublk_unmap_io(const struct ublk_queue *ubq,
 {
 	const unsigned int rq_bytes = blk_rq_bytes(req);
 
-	if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
+	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && ublk_rq_has_data(req)) {
 		struct ublk_map_data data = {
 			.ubq	=	ubq,
 			.rq	=	req,
@@ -566,6 +671,26 @@  static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
 	case REQ_OP_WRITE_ZEROES:
 		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
 		break;
+#ifdef CONFIG_BLK_DEV_ZONED
+	case REQ_OP_ZONE_OPEN:
+		ublk_op = UBLK_IO_OP_ZONE_OPEN;
+		break;
+	case REQ_OP_ZONE_CLOSE:
+		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
+		break;
+	case REQ_OP_ZONE_FINISH:
+		ublk_op = UBLK_IO_OP_ZONE_FINISH;
+		break;
+	case REQ_OP_ZONE_RESET:
+		ublk_op = UBLK_IO_OP_ZONE_RESET;
+		break;
+	case REQ_OP_DRV_IN:
+		ublk_op = UBLK_IO_OP_DRV_IN;
+		break;
+	case REQ_OP_ZONE_APPEND:
+		/* We do not support zone append yet */
+		fallthrough;
+#endif
 	default:
 		return BLK_STS_IOERR;
 	}
@@ -612,7 +737,8 @@  static void ublk_complete_rq(struct request *req)
 	 *
 	 * Both the two needn't unmap.
 	 */
-	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) {
+	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
+	    req_op(req) != REQ_OP_DRV_IN) {
 		blk_mq_end_request(req, BLK_STS_OK);
 		return;
 	}
@@ -1535,6 +1661,15 @@  static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
 	if (ret)
 		goto out_put_disk;
 
+	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+	    ub->dev_info.flags & UBLK_F_ZONED) {
+		disk_set_zoned(disk, BLK_ZONED_HM);
+		blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
+		ret = ublk_revalidate_disk_zones(disk);
+		if (ret)
+			goto out_put_disk;
+	}
+
 	get_device(&ub->cdev_dev);
 	ret = add_disk(disk);
 	if (ret) {
@@ -1673,6 +1808,9 @@  static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 	if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
 		ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
 
+	if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
+		ub->dev_info.flags &= ~UBLK_F_ZONED;
+
 	/* We are not ready to support zero copy */
 	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
 
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 8f88e3a29998..074b97821575 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -78,6 +78,10 @@ 
 #define UBLK_F_USER_RECOVERY	(1UL << 3)
 
 #define UBLK_F_USER_RECOVERY_REISSUE	(1UL << 4)
+/*
+ * Enable zoned device support
+ */
+#define UBLK_F_ZONED (1ULL << 5)
 
 /* device state */
 #define UBLK_S_DEV_DEAD	0
@@ -129,6 +133,12 @@  struct ublksrv_ctrl_dev_info {
 #define		UBLK_IO_OP_DISCARD	3
 #define		UBLK_IO_OP_WRITE_SAME	4
 #define		UBLK_IO_OP_WRITE_ZEROES	5
+#define		UBLK_IO_OP_ZONE_OPEN		10
+#define		UBLK_IO_OP_ZONE_CLOSE		11
+#define		UBLK_IO_OP_ZONE_FINISH		12
+#define		UBLK_IO_OP_ZONE_APPEND		13
+#define		UBLK_IO_OP_ZONE_RESET		15
+#define		UBLK_IO_OP_DRV_IN		34
 
 #define		UBLK_IO_F_FAILFAST_DEV		(1U << 8)
 #define		UBLK_IO_F_FAILFAST_TRANSPORT	(1U << 9)
@@ -214,6 +224,12 @@  struct ublk_param_discard {
 	__u16	reserved0;
 };
 
+struct ublk_param_zoned {
+	__u64	max_open_zones;
+	__u64	max_active_zones;
+	__u64	max_append_size;
+};
+
 struct ublk_params {
 	/*
 	 * Total length of parameters, userspace has to set 'len' for both
@@ -224,10 +240,12 @@  struct ublk_params {
 	__u32	len;
 #define UBLK_PARAM_TYPE_BASIC           (1 << 0)
 #define UBLK_PARAM_TYPE_DISCARD         (1 << 1)
+#define UBLK_PARAM_TYPE_ZONED           (1 << 2)
 	__u32	types;			/* types of parameter included */
 
 	struct ublk_param_basic		basic;
 	struct ublk_param_discard	discard;
+	struct ublk_param_zoned		zoned;
 };
 
 #endif