[v9,2/2] ublk: enable zoned storage support

Message ID 20230714072510.47770-3-nmi@metaspace.dk
State New
Headers
Series ublk: enable zoned storage support |

Commit Message

Andreas Hindborg July 14, 2023, 7:25 a.m. UTC
  From: Andreas Hindborg <a.hindborg@samsung.com>

Add zoned storage support to ublk: report_zones and operations:
 - REQ_OP_ZONE_OPEN
 - REQ_OP_ZONE_CLOSE
 - REQ_OP_ZONE_FINISH
 - REQ_OP_ZONE_RESET
 - REQ_OP_ZONE_APPEND

The zone append feature uses the `addr` field of `struct ublksrv_io_cmd` to
communicate ALBA back to the kernel. Therefore ublk must be used with the
user copy feature (UBLK_F_USER_COPY) for zoned storage support to be
available. Without this feature, ublk will not allow zoned storage support.

Signed-off-by: Andreas Hindborg <a.hindborg@samsung.com>
---
 drivers/block/ublk_drv.c      | 335 ++++++++++++++++++++++++++++++++--
 include/uapi/linux/ublk_cmd.h |  36 +++-
 2 files changed, 354 insertions(+), 17 deletions(-)
  

Comments

Ming Lei July 19, 2023, 9:26 a.m. UTC | #1
On Fri, Jul 14, 2023 at 09:25:10AM +0200, Andreas Hindborg wrote:
> From: Andreas Hindborg <a.hindborg@samsung.com>
> 
> Add zoned storage support to ublk: report_zones and operations:
>  - REQ_OP_ZONE_OPEN
>  - REQ_OP_ZONE_CLOSE
>  - REQ_OP_ZONE_FINISH
>  - REQ_OP_ZONE_RESET
>  - REQ_OP_ZONE_APPEND
> 
> The zone append feature uses the `addr` field of `struct ublksrv_io_cmd` to
> communicate ALBA back to the kernel. Therefore ublk must be used with the
> user copy feature (UBLK_F_USER_COPY) for zoned storage support to be
> available. Without this feature, ublk will not allow zoned storage support.
> 
> Signed-off-by: Andreas Hindborg <a.hindborg@samsung.com>
> ---

...

> +/*
> + * Construct a zone report. The report request is carried in `struct
> + * ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone
> + * and shall indicate the first zone of the report. The `nr_sectors` shall
> + * indicate how many zones should be reported (divide by zone size to get number
> + * of zones in the report) and must be an integer multiple of the zone size. The
> + * report shall be delivered as a `struct blk_zone` array. To report fewer zones
> + * than requested, zero the last entry of the returned array.
> + */
> +#define		UBLK_IO_OP_REPORT_ZONES		18

Actually, I meant the following delta change in V8 comment, then the UAPI
looks more clean & readable wrt. reporting how many zones in UBLK_IO_OP_REPORT_ZONES
and reusing ublksrv_io_cmd->addr.

Otherwise, this patchset looks fine.


diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 5698f4575e05..454c852ed328 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -70,7 +70,7 @@ struct ublk_rq_data {
 	struct kref ref;
 	__u64 sector;
 	__u32 operation;
-	__u32 nr_sectors;
+	__u32 nr_zones;
 };
 
 struct ublk_uring_cmd_pdu {
@@ -335,7 +335,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
 		pdu = blk_mq_rq_to_pdu(req);
 		pdu->operation = UBLK_IO_OP_REPORT_ZONES;
 		pdu->sector = sector;
-		pdu->nr_sectors = zones_in_request * zone_size_sectors;
+		pdu->nr_zones = zones_in_request;
 
 		ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
 					GFP_KERNEL);
@@ -404,7 +404,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
 		switch (ublk_op) {
 		case UBLK_IO_OP_REPORT_ZONES:
 			iod->op_flags = ublk_op | ublk_req_build_flags(req);
-			iod->nr_sectors = pdu->nr_sectors;
+			iod->nr_zones = pdu->nr_zones;
 			iod->start_sector = pdu->sector;
 			return BLK_STS_OK;
 		default:
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 4d97eb0f7d13..602a788a650e 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -249,11 +249,13 @@ struct ublksrv_ctrl_dev_info {
 /*
  * Construct a zone report. The report request is carried in `struct
  * ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone
- * and shall indicate the first zone of the report. The `nr_sectors` shall
- * indicate how many zones should be reported (divide by zone size to get number
- * of zones in the report) and must be an integer multiple of the zone size. The
- * report shall be delivered as a `struct blk_zone` array. To report fewer zones
- * than requested, zero the last entry of the returned array.
+ * and shall indicate the first zone of the report. The `nr_zones` shall
+ * indicate how many zones should be reported at most. The report shall be
+ * delivered as a `struct blk_zone` array. To report fewer zones than
+ * requested, zero the last entry of the returned array.
+ *
+ * So related definitions(blk_zone, blk_zone_cond, blk_zone_type, ...) in
+ * include/uapi/linux/blkzoned.h are part of ublk UAPI.
  */
 #define		UBLK_IO_OP_REPORT_ZONES		18
 
@@ -276,7 +278,10 @@ struct ublksrv_io_desc {
 	/* op: bit 0-7, flags: bit 8-31 */
 	__u32		op_flags;
 
-	__u32		nr_sectors;
+	union {
+		__u32		nr_sectors;
+		__u32		nr_zones; /* for UBLK_IO_OP_REPORT_ZONES only */
+	};
 
 	/* start sector for this io */
 	__u64		start_sector;
@@ -308,6 +313,12 @@ struct ublksrv_io_cmd {
 	/*
 	 * userspace buffer address in ublksrv daemon process, valid for
 	 * FETCH* command only
+	 *
+	 * This field shouldn't be used if UBLK_F_USER_COPY is enabled,
+	 * because userspace deals with data copy by pread()/pwrite() over
+	 * /dev/ublkcN. But in case of UBLK_F_ZONED, 'addr' is re-used to
+	 * pass back the allocated LBA for UBLK_IO_OP_ZONE_APPEND which
+	 * actually depends on UBLK_F_USER_COPY
 	 */
 	__u64	addr;
 };

Thanks,
Ming
  
Niklas Cassel July 19, 2023, 12:01 p.m. UTC | #2
On Fri, Jul 14, 2023 at 09:25:10AM +0200, Andreas Hindborg wrote:
> From: Andreas Hindborg <a.hindborg@samsung.com>

Hello Andreas!

> 
> Add zoned storage support to ublk: report_zones and operations:
>  - REQ_OP_ZONE_OPEN
>  - REQ_OP_ZONE_CLOSE
>  - REQ_OP_ZONE_FINISH
>  - REQ_OP_ZONE_RESET
>  - REQ_OP_ZONE_APPEND
> 
> The zone append feature uses the `addr` field of `struct ublksrv_io_cmd` to
> communicate ALBA back to the kernel. Therefore ublk must be used with the
> user copy feature (UBLK_F_USER_COPY) for zoned storage support to be
> available. Without this feature, ublk will not allow zoned storage support.
> 
> Signed-off-by: Andreas Hindborg <a.hindborg@samsung.com>
> ---
>  drivers/block/ublk_drv.c      | 335 ++++++++++++++++++++++++++++++++--
>  include/uapi/linux/ublk_cmd.h |  36 +++-
>  2 files changed, 354 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 8d271901efac..5698f4575e05 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -56,16 +56,21 @@
>  		| UBLK_F_USER_RECOVERY_REISSUE \
>  		| UBLK_F_UNPRIVILEGED_DEV \
>  		| UBLK_F_CMD_IOCTL_ENCODE \
> -		| UBLK_F_USER_COPY)
> +		| UBLK_F_USER_COPY \
> +		| UBLK_F_ZONED)
>  
>  /* All UBLK_PARAM_TYPE_* should be included here */
> -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
> -		UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
> +#define UBLK_PARAM_TYPE_ALL                                \
> +	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
> +	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
>  
>  struct ublk_rq_data {
>  	struct llist_node node;
>  
>  	struct kref ref;
> +	__u64 sector;
> +	__u32 operation;
> +	__u32 nr_sectors;
>  };
>  
>  struct ublk_uring_cmd_pdu {
> @@ -110,6 +115,11 @@ struct ublk_uring_cmd_pdu {
>   */
>  #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
>  
> +/*
> + * Set when IO is Zone Append
> + */
> +#define UBLK_IO_FLAG_ZONE_APPEND 0x10
> +
>  struct ublk_io {
>  	/* userspace buffer address from io cmd */
>  	__u64	addr;
> @@ -185,11 +195,264 @@ struct ublk_params_header {
>  	__u32	types;
>  };
>  
> +static inline unsigned int ublk_req_build_flags(struct request *req);
> +static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
> +						   int tag);
> +
>  static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
>  {
>  	return ub->dev_info.flags & UBLK_F_USER_COPY;
>  }
>  
> +static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
> +{
> +	return ub->dev_info.flags & UBLK_F_ZONED;
> +}
> +
> +static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
> +{
> +	return ubq->flags & UBLK_F_ZONED;
> +}
> +
> +#ifdef CONFIG_BLK_DEV_ZONED
> +
> +static int ublk_get_nr_zones(const struct ublk_device *ub)
> +{
> +	const struct ublk_param_basic *p = &ub->params.basic;
> +
> +	/* Zone size is a power of 2 */
> +	return p->dev_sectors >> ilog2(p->chunk_sectors);
> +}
> +
> +static int ublk_revalidate_disk_zones(struct ublk_device *ub)
> +{
> +	return blk_revalidate_disk_zones(ub->ub_disk, NULL);
> +}
> +
> +static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
> +{
> +	const struct ublk_param_zoned *p = &ub->params.zoned;
> +	int nr_zones;
> +
> +	if (!ublk_dev_is_zoned(ub))
> +		return -EINVAL;
> +
> +	if (!p->max_zone_append_sectors)
> +		return -EINVAL;
> +
> +	nr_zones = ublk_get_nr_zones(ub);
> +
> +	if (p->max_active_zones > nr_zones)
> +		return -EINVAL;
> +
> +	if (p->max_open_zones > nr_zones)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
> +{
> +	const struct ublk_param_zoned *p = &ub->params.zoned;
> +
> +	disk_set_zoned(ub->ub_disk, BLK_ZONED_HM);
> +	blk_queue_required_elevator_features(ub->ub_disk->queue,
> +					     ELEVATOR_F_ZBD_SEQ_WRITE);
> +	disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
> +	disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
> +	blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);
> +
> +	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
> +
> +	return 0;
> +}
> +
> +/* Based on virtblk_alloc_report_buffer */
> +static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
> +				      unsigned int nr_zones, size_t *buflen)
> +{
> +	struct request_queue *q = ublk->ub_disk->queue;
> +	size_t bufsize;
> +	void *buf;
> +
> +	nr_zones = min_t(unsigned int, nr_zones,
> +			 ublk->ub_disk->nr_zones);
> +
> +	bufsize = nr_zones * sizeof(struct blk_zone);
> +	bufsize =
> +		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
> +
> +	while (bufsize >= sizeof(struct blk_zone)) {
> +		buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
> +		if (buf) {
> +			*buflen = bufsize;
> +			return buf;
> +		}
> +		bufsize >>= 1;
> +	}
> +
> +	*buflen = 0;
> +	return NULL;
> +}
> +
> +static int ublk_report_zones(struct gendisk *disk, sector_t sector,
> +		      unsigned int nr_zones, report_zones_cb cb, void *data)
> +{
> +	struct ublk_device *ub = disk->private_data;
> +	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
> +	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
> +	unsigned int done_zones = 0;
> +	unsigned int max_zones_per_request;
> +	int ret;
> +	struct blk_zone *buffer;
> +	size_t buffer_length;
> +
> +	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
> +			 nr_zones);
> +
> +	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
> +	if (!buffer)
> +		return -ENOMEM;
> +
> +	max_zones_per_request = buffer_length / sizeof(struct blk_zone);
> +
> +	while (done_zones < nr_zones) {
> +		unsigned int remaining_zones = nr_zones - done_zones;
> +		unsigned int zones_in_request =
> +			min_t(unsigned int, remaining_zones, max_zones_per_request);
> +		struct request *req;
> +		struct ublk_rq_data *pdu;
> +		blk_status_t status;
> +
> +		memset(buffer, 0, buffer_length);
> +
> +		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
> +		if (IS_ERR(req)) {
> +			ret = PTR_ERR(req);
> +			goto out;
> +		}
> +
> +		pdu = blk_mq_rq_to_pdu(req);
> +		pdu->operation = UBLK_IO_OP_REPORT_ZONES;
> +		pdu->sector = sector;
> +		pdu->nr_sectors = zones_in_request * zone_size_sectors;
> +
> +		ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
> +					GFP_KERNEL);
> +		if (ret) {
> +			blk_mq_free_request(req);
> +			goto out;
> +		}
> +
> +		status = blk_execute_rq(req, 0);
> +		ret = blk_status_to_errno(status);
> +		blk_mq_free_request(req);
> +		if (ret)
> +			goto out;
> +
> +		for (unsigned int i = 0; i < zones_in_request; i++) {
> +			struct blk_zone *zone = buffer + i;
> +
> +			/* A zero length zone means no more zones in this response */
> +			if (!zone->len)
> +				break;
> +
> +			ret = cb(zone, i, data);
> +			if (ret)
> +				goto out;
> +
> +			done_zones++;
> +			sector += zone_size_sectors;
> +
> +		}
> +	}
> +
> +	ret = done_zones;
> +
> +out:
> +	kvfree(buffer);
> +	return ret;
> +}
> +
> +static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
> +					 struct request *req)
> +{
> +	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
> +	struct ublk_io *io = &ubq->ios[req->tag];
> +	struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
> +	u32 ublk_op;
> +
> +	switch (req_op(req)) {
> +	case REQ_OP_ZONE_OPEN:
> +		ublk_op = UBLK_IO_OP_ZONE_OPEN;
> +		break;
> +	case REQ_OP_ZONE_CLOSE:
> +		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
> +		break;
> +	case REQ_OP_ZONE_FINISH:
> +		ublk_op = UBLK_IO_OP_ZONE_FINISH;
> +		break;
> +	case REQ_OP_ZONE_RESET:
> +		ublk_op = UBLK_IO_OP_ZONE_RESET;
> +		break;
> +	case REQ_OP_ZONE_APPEND:
> +		ublk_op = UBLK_IO_OP_ZONE_APPEND;
> +		io->flags |= UBLK_IO_FLAG_ZONE_APPEND;
> +		break;
> +	case REQ_OP_DRV_IN:
> +		ublk_op = pdu->operation;
> +		switch (ublk_op) {
> +		case UBLK_IO_OP_REPORT_ZONES:
> +			iod->op_flags = ublk_op | ublk_req_build_flags(req);
> +			iod->nr_sectors = pdu->nr_sectors;
> +			iod->start_sector = pdu->sector;
> +			return BLK_STS_OK;
> +		default:
> +			return BLK_STS_IOERR;
> +		}
> +	case REQ_OP_ZONE_RESET_ALL:
> +	case REQ_OP_DRV_OUT:
> +		/* We do not support reset_all and drv_out */
> +		return BLK_STS_NOTSUPP;
> +	default:
> +		return BLK_STS_IOERR;
> +	}
> +
> +	iod->op_flags = ublk_op | ublk_req_build_flags(req);
> +	iod->nr_sectors = blk_rq_sectors(req);
> +	iod->start_sector = blk_rq_pos(req);
> +	iod->addr = io->addr;
> +
> +	return BLK_STS_OK;
> +}
> +
> +#else
> +
> +#define ublk_report_zones (NULL)
> +
> +static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static int ublk_revalidate_disk_zones(struct ublk_device *ub)
> +{
> +	return 0;
> +}
> +
> +static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
> +					 struct request *req)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +#endif
> +
>  static inline void __ublk_complete_rq(struct request *req);
>  static void ublk_complete_rq(struct kref *ref);
>  
> @@ -286,6 +549,9 @@ static int ublk_validate_params(const struct ublk_device *ub)
>  
>  		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
>  			return -EINVAL;
> +
> +		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
> +			return -EINVAL;
>  	} else
>  		return -EINVAL;
>  
> @@ -304,6 +570,11 @@ static int ublk_validate_params(const struct ublk_device *ub)
>  	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
>  		return -EINVAL;
>  
> +	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
> +		return ublk_dev_param_zoned_validate(ub);
> +	else if (ublk_dev_is_zoned(ub))
> +		return -EINVAL;
> +
>  	return 0;
>  }
>  
> @@ -317,6 +588,9 @@ static int ublk_apply_params(struct ublk_device *ub)
>  	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
>  		ublk_dev_param_discard_apply(ub);
>  
> +	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
> +		return ublk_dev_param_zoned_apply(ub);
> +
>  	return 0;
>  }
>  
> @@ -487,6 +761,7 @@ static const struct block_device_operations ub_fops = {
>  	.owner =	THIS_MODULE,
>  	.open =		ublk_open,
>  	.free_disk =	ublk_free_disk,
> +	.report_zones =	ublk_report_zones,
>  };
>  
>  #define UBLK_MAX_PIN_PAGES	32
> @@ -601,7 +876,8 @@ static inline bool ublk_need_map_req(const struct request *req)
>  
>  static inline bool ublk_need_unmap_req(const struct request *req)
>  {
> -	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ;
> +	return ublk_rq_has_data(req) &&
> +	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
>  }
>  
>  static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
> @@ -685,8 +961,13 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
>  {
>  	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
>  	struct ublk_io *io = &ubq->ios[req->tag];
> +	enum req_op op = req_op(req);
>  	u32 ublk_op;
>  
> +	if (!ublk_queue_is_zoned(ubq) &&
> +	    (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
> +		return -EIO;
> +
>  	switch (req_op(req)) {
>  	case REQ_OP_READ:
>  		ublk_op = UBLK_IO_OP_READ;
> @@ -704,6 +985,8 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
>  		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
>  		break;
>  	default:
> +		if (ublk_queue_is_zoned(ubq))
> +			return ublk_setup_iod_zoned(ubq, req);
>  		return BLK_STS_IOERR;
>  	}
>  
> @@ -756,7 +1039,8 @@ static inline void __ublk_complete_rq(struct request *req)
>  	 *
>  	 * Both the two needn't unmap.
>  	 */
> -	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE)
> +	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
> +	    req_op(req) != REQ_OP_DRV_IN)
>  		goto exit;
>  
>  	/* for READ request, writing data in iod->addr to rq buffers */
> @@ -1120,6 +1404,11 @@ static void ublk_commit_completion(struct ublk_device *ub,
>  	/* find the io request and complete */
>  	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
>  
> +	if (io->flags & UBLK_IO_FLAG_ZONE_APPEND) {

Do we really need to introduce a completely new flag just for this?

if (req_op(req) == REQ_OP_ZONE_APPEND)

should work just as well, no?


Kind regards,
Niklas
  
Ming Lei July 19, 2023, 12:41 p.m. UTC | #3
On Wed, Jul 19, 2023 at 05:26:11PM +0800, Ming Lei wrote:
> On Fri, Jul 14, 2023 at 09:25:10AM +0200, Andreas Hindborg wrote:
> > From: Andreas Hindborg <a.hindborg@samsung.com>
> > 
> > Add zoned storage support to ublk: report_zones and operations:
> >  - REQ_OP_ZONE_OPEN
> >  - REQ_OP_ZONE_CLOSE
> >  - REQ_OP_ZONE_FINISH
> >  - REQ_OP_ZONE_RESET
> >  - REQ_OP_ZONE_APPEND
> > 
> > The zone append feature uses the `addr` field of `struct ublksrv_io_cmd` to
> > communicate ALBA back to the kernel. Therefore ublk must be used with the
> > user copy feature (UBLK_F_USER_COPY) for zoned storage support to be
> > available. Without this feature, ublk will not allow zoned storage support.
> > 
> > Signed-off-by: Andreas Hindborg <a.hindborg@samsung.com>
> > ---
> 
> ...
> 
> > +/*
> > + * Construct a zone report. The report request is carried in `struct
> > + * ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone
> > + * and shall indicate the first zone of the report. The `nr_sectors` shall
> > + * indicate how many zones should be reported (divide by zone size to get number
> > + * of zones in the report) and must be an integer multiple of the zone size. The
> > + * report shall be delivered as a `struct blk_zone` array. To report fewer zones
> > + * than requested, zero the last entry of the returned array.
> > + */
> > +#define		UBLK_IO_OP_REPORT_ZONES		18
> 
> Actually, I meant the following delta change in V8 comment, then the UAPI
> looks more clean & readable wrt. reporting how many zones in UBLK_IO_OP_REPORT_ZONES
> and reusing ublksrv_io_cmd->addr.
> 
> Otherwise, this patchset looks fine.
> 
> 
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 5698f4575e05..454c852ed328 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -70,7 +70,7 @@ struct ublk_rq_data {
>  	struct kref ref;
>  	__u64 sector;
>  	__u32 operation;
> -	__u32 nr_sectors;
> +	__u32 nr_zones;
>  };
>  
>  struct ublk_uring_cmd_pdu {
> @@ -335,7 +335,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
>  		pdu = blk_mq_rq_to_pdu(req);
>  		pdu->operation = UBLK_IO_OP_REPORT_ZONES;
>  		pdu->sector = sector;
> -		pdu->nr_sectors = zones_in_request * zone_size_sectors;
> +		pdu->nr_zones = zones_in_request;
>  
>  		ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
>  					GFP_KERNEL);
> @@ -404,7 +404,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
>  		switch (ublk_op) {
>  		case UBLK_IO_OP_REPORT_ZONES:
>  			iod->op_flags = ublk_op | ublk_req_build_flags(req);
> -			iod->nr_sectors = pdu->nr_sectors;
> +			iod->nr_zones = pdu->nr_zones;
>  			iod->start_sector = pdu->sector;
>  			return BLK_STS_OK;
>  		default:
> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
> index 4d97eb0f7d13..602a788a650e 100644
> --- a/include/uapi/linux/ublk_cmd.h
> +++ b/include/uapi/linux/ublk_cmd.h
> @@ -249,11 +249,13 @@ struct ublksrv_ctrl_dev_info {
>  /*
>   * Construct a zone report. The report request is carried in `struct
>   * ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone
> - * and shall indicate the first zone of the report. The `nr_sectors` shall
> - * indicate how many zones should be reported (divide by zone size to get number
> - * of zones in the report) and must be an integer multiple of the zone size. The
> - * report shall be delivered as a `struct blk_zone` array. To report fewer zones
> - * than requested, zero the last entry of the returned array.
> + * and shall indicate the first zone of the report. The `nr_zones` shall
> + * indicate how many zones should be reported at most. The report shall be
> + * delivered as a `struct blk_zone` array. To report fewer zones than
> + * requested, zero the last entry of the returned array.
> + *
> + * So related definitions(blk_zone, blk_zone_cond, blk_zone_type, ...) in
> + * include/uapi/linux/blkzoned.h are part of ublk UAPI.
>   */
>  #define		UBLK_IO_OP_REPORT_ZONES		18
>  
> @@ -276,7 +278,10 @@ struct ublksrv_io_desc {
>  	/* op: bit 0-7, flags: bit 8-31 */
>  	__u32		op_flags;
>  
> -	__u32		nr_sectors;
> +	union {
> +		__u32		nr_sectors;
> +		__u32		nr_zones; /* for UBLK_IO_OP_REPORT_ZONES only */
> +	};
>  
>  	/* start sector for this io */
>  	__u64		start_sector;
> @@ -308,6 +313,12 @@ struct ublksrv_io_cmd {
>  	/*
>  	 * userspace buffer address in ublksrv daemon process, valid for
>  	 * FETCH* command only
> +	 *
> +	 * This field shouldn't be used if UBLK_F_USER_COPY is enabled,
> +	 * because userspace deals with data copy by pread()/pwrite() over
> +	 * /dev/ublkcN. But in case of UBLK_F_ZONED, 'addr' is re-used to
> +	 * pass back the allocated LBA for UBLK_IO_OP_ZONE_APPEND which
> +	 * actually depends on UBLK_F_USER_COPY
>  	 */
>  	__u64	addr;

Or use union to cover zoned_append_lba, and we still need above
document about UBLK_F_USER_COPY & UBLK_F_ZONED uses.

	union {
		__u64	addr;
		__u64	zoned_append_lba;
	}


Thanks,
Ming
  
Andreas Hindborg Aug. 1, 2023, 12:11 p.m. UTC | #4
Niklas Cassel <Niklas.Cassel@wdc.com> writes:

> On Fri, Jul 14, 2023 at 09:25:10AM +0200, Andreas Hindborg wrote:
>> From: Andreas Hindborg <a.hindborg@samsung.com>
>
> Hello Andreas!
>

<snip>

>>  	/* for READ request, writing data in iod->addr to rq buffers */
>> @@ -1120,6 +1404,11 @@ static void ublk_commit_completion(struct ublk_device *ub,
>>  	/* find the io request and complete */
>>  	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
>>  
>> +	if (io->flags & UBLK_IO_FLAG_ZONE_APPEND) {
>
> Do we really need to introduce a completely new flag just for this?
>
> if (req_op(req) == REQ_OP_ZONE_APPEND)
>
> should work just as well, no?

Makes sense, thanks.

BR Andreas
  
Ming Lei Aug. 1, 2023, 12:58 p.m. UTC | #5
On Tue, Aug 01, 2023 at 02:11:56PM +0200, Andreas Hindborg (Samsung) wrote:
> 
> Niklas Cassel <Niklas.Cassel@wdc.com> writes:
> 
> > On Fri, Jul 14, 2023 at 09:25:10AM +0200, Andreas Hindborg wrote:
> >> From: Andreas Hindborg <a.hindborg@samsung.com>
> >
> > Hello Andreas!
> >
> 
> <snip>
> 
> >>  	/* for READ request, writing data in iod->addr to rq buffers */
> >> @@ -1120,6 +1404,11 @@ static void ublk_commit_completion(struct ublk_device *ub,
> >>  	/* find the io request and complete */
> >>  	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
> >>  
> >> +	if (io->flags & UBLK_IO_FLAG_ZONE_APPEND) {
> >
> > Do we really need to introduce a completely new flag just for this?
> >
> > if (req_op(req) == REQ_OP_ZONE_APPEND)
> >
> > should work just as well, no?
> 
> Makes sense, thanks.

The above one can be replaced with req_op().

But extra cost is added when retrieving request for the check in
__ublk_ch_uring_cmd().


Thanks,
Ming
  
Andreas Hindborg Aug. 2, 2023, 9:09 a.m. UTC | #6
Ming Lei <ming.lei@redhat.com> writes:

> On Tue, Aug 01, 2023 at 02:11:56PM +0200, Andreas Hindborg (Samsung) wrote:
>> 
>> Niklas Cassel <Niklas.Cassel@wdc.com> writes:
>> 
>> > On Fri, Jul 14, 2023 at 09:25:10AM +0200, Andreas Hindborg wrote:
>> >> From: Andreas Hindborg <a.hindborg@samsung.com>
>> >
>> > Hello Andreas!
>> >
>> 
>> <snip>
>> 
>> >>  	/* for READ request, writing data in iod->addr to rq buffers */
>> >> @@ -1120,6 +1404,11 @@ static void ublk_commit_completion(struct ublk_device *ub,
>> >>  	/* find the io request and complete */
>> >>  	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
>> >>  
>> >> +	if (io->flags & UBLK_IO_FLAG_ZONE_APPEND) {
>> >
>> > Do we really need to introduce a completely new flag just for this?
>> >
>> > if (req_op(req) == REQ_OP_ZONE_APPEND)
>> >
>> > should work just as well, no?
>> 
>> Makes sense, thanks.
>
> The above one can be replaced with req_op().
>
> But extra cost is added when retrieving request for the check in
> __ublk_ch_uring_cmd().
>

How about this (diff to v9):

@@ -1709,7 +1702,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
 		goto out;
 
 	if (ublk_support_user_copy(ubq) &&
-	    !(io->flags & UBLK_IO_FLAG_ZONE_APPEND) && ub_cmd->addr) {
+	    _IOC_NR(cmd_op) != UBLK_IO_COMMIT_AND_FETCH_REQ && ub_cmd->addr) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1751,6 +1744,12 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
 		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
 			goto out;
 
+		if (ublk_support_user_copy(ubq) &&
+		    req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
+			ret = -EINVAL;
+			goto out;
+		}
+
 		if (!ublk_support_user_copy(ubq)) {
 			/*
 			 * COMMIT_AND_FETCH_REQ has to provide IO buffer if


BR Andreas
  
Andreas Hindborg Aug. 2, 2023, 10:19 a.m. UTC | #7
Ming Lei <ming.lei@redhat.com> writes:

> On Wed, Jul 19, 2023 at 05:26:11PM +0800, Ming Lei wrote:
>> On Fri, Jul 14, 2023 at 09:25:10AM +0200, Andreas Hindborg wrote:
>> > From: Andreas Hindborg <a.hindborg@samsung.com>
>> > 
>> > Add zoned storage support to ublk: report_zones and operations:
>> >  - REQ_OP_ZONE_OPEN
>> >  - REQ_OP_ZONE_CLOSE
>> >  - REQ_OP_ZONE_FINISH
>> >  - REQ_OP_ZONE_RESET
>> >  - REQ_OP_ZONE_APPEND
>> > 
>> > The zone append feature uses the `addr` field of `struct ublksrv_io_cmd` to
>> > communicate ALBA back to the kernel. Therefore ublk must be used with the
>> > user copy feature (UBLK_F_USER_COPY) for zoned storage support to be
>> > available. Without this feature, ublk will not allow zoned storage support.
>> > 
>> > Signed-off-by: Andreas Hindborg <a.hindborg@samsung.com>
>> > ---
>> 
>> ...
>> 
>> > +/*
>> > + * Construct a zone report. The report request is carried in `struct
>> > + * ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone
>> > + * and shall indicate the first zone of the report. The `nr_sectors` shall
>> > + * indicate how many zones should be reported (divide by zone size to get number
>> > + * of zones in the report) and must be an integer multiple of the zone size. The
>> > + * report shall be delivered as a `struct blk_zone` array. To report fewer zones
>> > + * than requested, zero the last entry of the returned array.
>> > + */
>> > +#define		UBLK_IO_OP_REPORT_ZONES		18
>> 
>> Actually, I meant the following delta change in V8 comment, then the UAPI
>> looks more clean & readable wrt. reporting how many zones in UBLK_IO_OP_REPORT_ZONES
>> and reusing ublksrv_io_cmd->addr.
>> 
>> Otherwise, this patchset looks fine.
>> 
>> 
>> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
>> index 5698f4575e05..454c852ed328 100644
>> --- a/drivers/block/ublk_drv.c
>> +++ b/drivers/block/ublk_drv.c
>> @@ -70,7 +70,7 @@ struct ublk_rq_data {
>>  	struct kref ref;
>>  	__u64 sector;
>>  	__u32 operation;
>> -	__u32 nr_sectors;
>> +	__u32 nr_zones;
>>  };
>>  
>>  struct ublk_uring_cmd_pdu {
>> @@ -335,7 +335,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
>>  		pdu = blk_mq_rq_to_pdu(req);
>>  		pdu->operation = UBLK_IO_OP_REPORT_ZONES;
>>  		pdu->sector = sector;
>> -		pdu->nr_sectors = zones_in_request * zone_size_sectors;
>> +		pdu->nr_zones = zones_in_request;
>>  
>>  		ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
>>  					GFP_KERNEL);
>> @@ -404,7 +404,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
>>  		switch (ublk_op) {
>>  		case UBLK_IO_OP_REPORT_ZONES:
>>  			iod->op_flags = ublk_op | ublk_req_build_flags(req);
>> -			iod->nr_sectors = pdu->nr_sectors;
>> +			iod->nr_zones = pdu->nr_zones;
>>  			iod->start_sector = pdu->sector;
>>  			return BLK_STS_OK;
>>  		default:
>> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
>> index 4d97eb0f7d13..602a788a650e 100644
>> --- a/include/uapi/linux/ublk_cmd.h
>> +++ b/include/uapi/linux/ublk_cmd.h
>> @@ -249,11 +249,13 @@ struct ublksrv_ctrl_dev_info {
>>  /*
>>   * Construct a zone report. The report request is carried in `struct
>>   * ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone
>> - * and shall indicate the first zone of the report. The `nr_sectors` shall
>> - * indicate how many zones should be reported (divide by zone size to get number
>> - * of zones in the report) and must be an integer multiple of the zone size. The
>> - * report shall be delivered as a `struct blk_zone` array. To report fewer zones
>> - * than requested, zero the last entry of the returned array.
>> + * and shall indicate the first zone of the report. The `nr_zones` shall
>> + * indicate how many zones should be reported at most. The report shall be
>> + * delivered as a `struct blk_zone` array. To report fewer zones than
>> + * requested, zero the last entry of the returned array.
>> + *
>> + * So related definitions(blk_zone, blk_zone_cond, blk_zone_type, ...) in
>> + * include/uapi/linux/blkzoned.h are part of ublk UAPI.
>>   */
>>  #define		UBLK_IO_OP_REPORT_ZONES		18
>>  
>> @@ -276,7 +278,10 @@ struct ublksrv_io_desc {
>>  	/* op: bit 0-7, flags: bit 8-31 */
>>  	__u32		op_flags;
>>  
>> -	__u32		nr_sectors;
>> +	union {
>> +		__u32		nr_sectors;
>> +		__u32		nr_zones; /* for UBLK_IO_OP_REPORT_ZONES only */
>> +	};
>>  
>>  	/* start sector for this io */
>>  	__u64		start_sector;
>> @@ -308,6 +313,12 @@ struct ublksrv_io_cmd {
>>  	/*
>>  	 * userspace buffer address in ublksrv daemon process, valid for
>>  	 * FETCH* command only
>> +	 *
>> +	 * This field shouldn't be used if UBLK_F_USER_COPY is enabled,
>> +	 * because userspace deals with data copy by pread()/pwrite() over
>> +	 * /dev/ublkcN. But in case of UBLK_F_ZONED, 'addr' is re-used to
>> +	 * pass back the allocated LBA for UBLK_IO_OP_ZONE_APPEND which
>> +	 * actually depends on UBLK_F_USER_COPY
>>  	 */
>>  	__u64	addr;
>
> Or use union to cover zoned_append_lba, and we still need above
> document about UBLK_F_USER_COPY & UBLK_F_ZONED uses.
>
> 	union {
> 		__u64	addr;
> 		__u64	zoned_append_lba;
> 	}
>

Thanks, I'll add this to the next version.

BR Andreas
  
Ming Lei Aug. 3, 2023, 12:06 a.m. UTC | #8
On Wed, Aug 02, 2023 at 11:09:56AM +0200, Andreas Hindborg (Samsung) wrote:
> 
> Ming Lei <ming.lei@redhat.com> writes:
> 
> > On Tue, Aug 01, 2023 at 02:11:56PM +0200, Andreas Hindborg (Samsung) wrote:
> >> 
> >> Niklas Cassel <Niklas.Cassel@wdc.com> writes:
> >> 
> >> > On Fri, Jul 14, 2023 at 09:25:10AM +0200, Andreas Hindborg wrote:
> >> >> From: Andreas Hindborg <a.hindborg@samsung.com>
> >> >
> >> > Hello Andreas!
> >> >
> >> 
> >> <snip>
> >> 
> >> >>  	/* for READ request, writing data in iod->addr to rq buffers */
> >> >> @@ -1120,6 +1404,11 @@ static void ublk_commit_completion(struct ublk_device *ub,
> >> >>  	/* find the io request and complete */
> >> >>  	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
> >> >>  
> >> >> +	if (io->flags & UBLK_IO_FLAG_ZONE_APPEND) {
> >> >
> >> > Do we really need to introduce a completely new flag just for this?
> >> >
> >> > if (req_op(req) == REQ_OP_ZONE_APPEND)
> >> >
> >> > should work just as well, no?
> >> 
> >> Makes sense, thanks.
> >
> > The above one can be replaced with req_op().
> >
> > But extra cost is added when retrieving request for the check in
> > __ublk_ch_uring_cmd().
> >
> 
> How about this (diff to v9):
> 
> @@ -1709,7 +1702,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
>  		goto out;
>  
>  	if (ublk_support_user_copy(ubq) &&
> -	    !(io->flags & UBLK_IO_FLAG_ZONE_APPEND) && ub_cmd->addr) {
> +	    _IOC_NR(cmd_op) != UBLK_IO_COMMIT_AND_FETCH_REQ && ub_cmd->addr) {
>  		ret = -EINVAL;
>  		goto out;
>  	}

Let's merge the above original user_copy check into 'case UBLK_IO_FETCH_REQ' &
'case UBLK_IO_COMMIT_AND_FETCH_REQ' first, then this patch can be cleaner, which
can be done as one prep change for zoned support.

> @@ -1751,6 +1744,12 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
>  		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
>  			goto out;
>  
> +		if (ublk_support_user_copy(ubq) &&
> +		    req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +

Given request is available for UBLK_IO_COMMIT_AND_FETCH_REQ, this approach is
good, and UBLK_IO_FETCH_REQ cmd doesn't have OP.


Thanks,
Ming
  

Patch

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 8d271901efac..5698f4575e05 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -56,16 +56,21 @@ 
 		| UBLK_F_USER_RECOVERY_REISSUE \
 		| UBLK_F_UNPRIVILEGED_DEV \
 		| UBLK_F_CMD_IOCTL_ENCODE \
-		| UBLK_F_USER_COPY)
+		| UBLK_F_USER_COPY \
+		| UBLK_F_ZONED)
 
 /* All UBLK_PARAM_TYPE_* should be included here */
-#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
-		UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
+#define UBLK_PARAM_TYPE_ALL                                \
+	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
+	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
 
 struct ublk_rq_data {
 	struct llist_node node;
 
 	struct kref ref;
+	__u64 sector;
+	__u32 operation;
+	__u32 nr_sectors;
 };
 
 struct ublk_uring_cmd_pdu {
@@ -110,6 +115,11 @@  struct ublk_uring_cmd_pdu {
  */
 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
 
+/*
+ * Set when IO is Zone Append
+ */
+#define UBLK_IO_FLAG_ZONE_APPEND 0x10
+
 struct ublk_io {
 	/* userspace buffer address from io cmd */
 	__u64	addr;
@@ -185,11 +195,264 @@  struct ublk_params_header {
 	__u32	types;
 };
 
+static inline unsigned int ublk_req_build_flags(struct request *req);
+static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
+						   int tag);
+
 static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
 {
 	return ub->dev_info.flags & UBLK_F_USER_COPY;
 }
 
+static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
+{
+	return ub->dev_info.flags & UBLK_F_ZONED;
+}
+
+static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
+{
+	return ubq->flags & UBLK_F_ZONED;
+}
+
+#ifdef CONFIG_BLK_DEV_ZONED
+
+static int ublk_get_nr_zones(const struct ublk_device *ub)
+{
+	const struct ublk_param_basic *p = &ub->params.basic;
+
+	/* Zone size is a power of 2 */
+	return p->dev_sectors >> ilog2(p->chunk_sectors);
+}
+
+static int ublk_revalidate_disk_zones(struct ublk_device *ub)
+{
+	return blk_revalidate_disk_zones(ub->ub_disk, NULL);
+}
+
+static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
+{
+	const struct ublk_param_zoned *p = &ub->params.zoned;
+	int nr_zones;
+
+	if (!ublk_dev_is_zoned(ub))
+		return -EINVAL;
+
+	if (!p->max_zone_append_sectors)
+		return -EINVAL;
+
+	nr_zones = ublk_get_nr_zones(ub);
+
+	if (p->max_active_zones > nr_zones)
+		return -EINVAL;
+
+	if (p->max_open_zones > nr_zones)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
+{
+	const struct ublk_param_zoned *p = &ub->params.zoned;
+
+	disk_set_zoned(ub->ub_disk, BLK_ZONED_HM);
+	blk_queue_required_elevator_features(ub->ub_disk->queue,
+					     ELEVATOR_F_ZBD_SEQ_WRITE);
+	disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
+	disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
+	blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);
+
+	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
+
+	return 0;
+}
+
+/* Based on virtblk_alloc_report_buffer */
+static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
+				      unsigned int nr_zones, size_t *buflen)
+{
+	struct request_queue *q = ublk->ub_disk->queue;
+	size_t bufsize;
+	void *buf;
+
+	nr_zones = min_t(unsigned int, nr_zones,
+			 ublk->ub_disk->nr_zones);
+
+	bufsize = nr_zones * sizeof(struct blk_zone);
+	bufsize =
+		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
+
+	while (bufsize >= sizeof(struct blk_zone)) {
+		buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
+		if (buf) {
+			*buflen = bufsize;
+			return buf;
+		}
+		bufsize >>= 1;
+	}
+
+	*buflen = 0;
+	return NULL;
+}
+
+static int ublk_report_zones(struct gendisk *disk, sector_t sector,
+		      unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+	struct ublk_device *ub = disk->private_data;
+	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
+	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
+	unsigned int done_zones = 0;
+	unsigned int max_zones_per_request;
+	int ret;
+	struct blk_zone *buffer;
+	size_t buffer_length;
+
+	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
+			 nr_zones);
+
+	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
+	if (!buffer)
+		return -ENOMEM;
+
+	max_zones_per_request = buffer_length / sizeof(struct blk_zone);
+
+	while (done_zones < nr_zones) {
+		unsigned int remaining_zones = nr_zones - done_zones;
+		unsigned int zones_in_request =
+			min_t(unsigned int, remaining_zones, max_zones_per_request);
+		struct request *req;
+		struct ublk_rq_data *pdu;
+		blk_status_t status;
+
+		memset(buffer, 0, buffer_length);
+
+		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
+		if (IS_ERR(req)) {
+			ret = PTR_ERR(req);
+			goto out;
+		}
+
+		pdu = blk_mq_rq_to_pdu(req);
+		pdu->operation = UBLK_IO_OP_REPORT_ZONES;
+		pdu->sector = sector;
+		pdu->nr_sectors = zones_in_request * zone_size_sectors;
+
+		ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
+					GFP_KERNEL);
+		if (ret) {
+			blk_mq_free_request(req);
+			goto out;
+		}
+
+		status = blk_execute_rq(req, 0);
+		ret = blk_status_to_errno(status);
+		blk_mq_free_request(req);
+		if (ret)
+			goto out;
+
+		for (unsigned int i = 0; i < zones_in_request; i++) {
+			struct blk_zone *zone = buffer + i;
+
+			/* A zero length zone means no more zones in this response */
+			if (!zone->len)
+				break;
+
+			ret = cb(zone, i, data);
+			if (ret)
+				goto out;
+
+			done_zones++;
+			sector += zone_size_sectors;
+
+		}
+	}
+
+	ret = done_zones;
+
+out:
+	kvfree(buffer);
+	return ret;
+}
+
+static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
+					 struct request *req)
+{
+	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
+	struct ublk_io *io = &ubq->ios[req->tag];
+	struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
+	u32 ublk_op;
+
+	switch (req_op(req)) {
+	case REQ_OP_ZONE_OPEN:
+		ublk_op = UBLK_IO_OP_ZONE_OPEN;
+		break;
+	case REQ_OP_ZONE_CLOSE:
+		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
+		break;
+	case REQ_OP_ZONE_FINISH:
+		ublk_op = UBLK_IO_OP_ZONE_FINISH;
+		break;
+	case REQ_OP_ZONE_RESET:
+		ublk_op = UBLK_IO_OP_ZONE_RESET;
+		break;
+	case REQ_OP_ZONE_APPEND:
+		ublk_op = UBLK_IO_OP_ZONE_APPEND;
+		io->flags |= UBLK_IO_FLAG_ZONE_APPEND;
+		break;
+	case REQ_OP_DRV_IN:
+		ublk_op = pdu->operation;
+		switch (ublk_op) {
+		case UBLK_IO_OP_REPORT_ZONES:
+			iod->op_flags = ublk_op | ublk_req_build_flags(req);
+			iod->nr_sectors = pdu->nr_sectors;
+			iod->start_sector = pdu->sector;
+			return BLK_STS_OK;
+		default:
+			return BLK_STS_IOERR;
+		}
+	case REQ_OP_ZONE_RESET_ALL:
+	case REQ_OP_DRV_OUT:
+		/* We do not support reset_all and drv_out */
+		return BLK_STS_NOTSUPP;
+	default:
+		return BLK_STS_IOERR;
+	}
+
+	iod->op_flags = ublk_op | ublk_req_build_flags(req);
+	iod->nr_sectors = blk_rq_sectors(req);
+	iod->start_sector = blk_rq_pos(req);
+	iod->addr = io->addr;
+
+	return BLK_STS_OK;
+}
+
+#else
+
+#define ublk_report_zones (NULL)
+
+static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ublk_revalidate_disk_zones(struct ublk_device *ub)
+{
+	return 0;
+}
+
+static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
+					 struct request *req)
+{
+	return -EOPNOTSUPP;
+}
+
+#endif
+
 static inline void __ublk_complete_rq(struct request *req);
 static void ublk_complete_rq(struct kref *ref);
 
@@ -286,6 +549,9 @@  static int ublk_validate_params(const struct ublk_device *ub)
 
 		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
 			return -EINVAL;
+
+		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
+			return -EINVAL;
 	} else
 		return -EINVAL;
 
@@ -304,6 +570,11 @@  static int ublk_validate_params(const struct ublk_device *ub)
 	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
 		return -EINVAL;
 
+	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
+		return ublk_dev_param_zoned_validate(ub);
+	else if (ublk_dev_is_zoned(ub))
+		return -EINVAL;
+
 	return 0;
 }
 
@@ -317,6 +588,9 @@  static int ublk_apply_params(struct ublk_device *ub)
 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
 		ublk_dev_param_discard_apply(ub);
 
+	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
+		return ublk_dev_param_zoned_apply(ub);
+
 	return 0;
 }
 
@@ -487,6 +761,7 @@  static const struct block_device_operations ub_fops = {
 	.owner =	THIS_MODULE,
 	.open =		ublk_open,
 	.free_disk =	ublk_free_disk,
+	.report_zones =	ublk_report_zones,
 };
 
 #define UBLK_MAX_PIN_PAGES	32
@@ -601,7 +876,8 @@  static inline bool ublk_need_map_req(const struct request *req)
 
 static inline bool ublk_need_unmap_req(const struct request *req)
 {
-	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ;
+	return ublk_rq_has_data(req) &&
+	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
 }
 
 static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
@@ -685,8 +961,13 @@  static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
 {
 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
 	struct ublk_io *io = &ubq->ios[req->tag];
+	enum req_op op = req_op(req);
 	u32 ublk_op;
 
+	if (!ublk_queue_is_zoned(ubq) &&
+	    (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
+		return -EIO;
+
 	switch (req_op(req)) {
 	case REQ_OP_READ:
 		ublk_op = UBLK_IO_OP_READ;
@@ -704,6 +985,8 @@  static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
 		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
 		break;
 	default:
+		if (ublk_queue_is_zoned(ubq))
+			return ublk_setup_iod_zoned(ubq, req);
 		return BLK_STS_IOERR;
 	}
 
@@ -756,7 +1039,8 @@  static inline void __ublk_complete_rq(struct request *req)
 	 *
 	 * Both the two needn't unmap.
 	 */
-	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE)
+	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
+	    req_op(req) != REQ_OP_DRV_IN)
 		goto exit;
 
 	/* for READ request, writing data in iod->addr to rq buffers */
@@ -1120,6 +1404,11 @@  static void ublk_commit_completion(struct ublk_device *ub,
 	/* find the io request and complete */
 	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
 
+	if (io->flags & UBLK_IO_FLAG_ZONE_APPEND) {
+		req->__sector = ub_cmd->addr;
+		io->flags &= ~UBLK_IO_FLAG_ZONE_APPEND;
+	}
+
 	if (req && likely(!blk_should_fake_timeout(req->q)))
 		ublk_put_req_ref(ubq, req);
 }
@@ -1419,7 +1708,8 @@  static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
 			^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
 		goto out;
 
-	if (ublk_support_user_copy(ubq) && ub_cmd->addr) {
+	if (ublk_support_user_copy(ubq) &&
+	    !(io->flags & UBLK_IO_FLAG_ZONE_APPEND) && ub_cmd->addr) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1542,11 +1832,14 @@  static inline bool ublk_check_ubuf_dir(const struct request *req,
 		int ubuf_dir)
 {
 	/* copy ubuf to request pages */
-	if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE)
+	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
+	    ubuf_dir == ITER_SOURCE)
 		return true;
 
 	/* copy request pages to ubuf */
-	if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST)
+	if ((req_op(req) == REQ_OP_WRITE ||
+	     req_op(req) == REQ_OP_ZONE_APPEND) &&
+	    ubuf_dir == ITER_DEST)
 		return true;
 
 	return false;
@@ -1885,17 +2178,24 @@  static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
 
 	get_device(&ub->cdev_dev);
 	ub->dev_info.state = UBLK_S_DEV_LIVE;
+
+	if (ublk_dev_is_zoned(ub)) {
+		ret = ublk_revalidate_disk_zones(ub);
+		if (ret)
+			goto out_put_cdev;
+	}
+
 	ret = add_disk(disk);
+	if (ret)
+		goto out_put_cdev;
+
+	set_bit(UB_STATE_USED, &ub->state);
+
+out_put_cdev:
 	if (ret) {
-		/*
-		 * Has to drop the reference since ->free_disk won't be
-		 * called in case of add_disk failure.
-		 */
 		ub->dev_info.state = UBLK_S_DEV_DEAD;
 		ublk_put_device(ub);
-		goto out_put_disk;
 	}
-	set_bit(UB_STATE_USED, &ub->state);
 out_put_disk:
 	if (ret)
 		put_disk(disk);
@@ -2045,6 +2345,13 @@  static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 	if (ublk_dev_is_user_copy(ub))
 		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
 
+	/* Zoned storage support requires user copy feature */
+	if (ublk_dev_is_zoned(ub) &&
+	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) {
+		ret = -EINVAL;
+		goto out_free_dev_number;
+	}
+
 	/* We are not ready to support zero copy */
 	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
 
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 4b8558db90e1..4d97eb0f7d13 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -176,6 +176,12 @@ 
 /* Copy between request and user buffer by pread()/pwrite() */
 #define UBLK_F_USER_COPY	(1UL << 7)
 
+/*
+ * User space sets this flag when setting up the device to request zoned storage support. Kernel may
+ * deny the request by returning an error.
+ */
+#define UBLK_F_ZONED (1ULL << 8)
+
 /* device state */
 #define UBLK_S_DEV_DEAD	0
 #define UBLK_S_DEV_LIVE	1
@@ -232,9 +238,24 @@  struct ublksrv_ctrl_dev_info {
 #define		UBLK_IO_OP_READ		0
 #define		UBLK_IO_OP_WRITE		1
 #define		UBLK_IO_OP_FLUSH		2
-#define		UBLK_IO_OP_DISCARD	3
-#define		UBLK_IO_OP_WRITE_SAME	4
-#define		UBLK_IO_OP_WRITE_ZEROES	5
+#define		UBLK_IO_OP_DISCARD		3
+#define		UBLK_IO_OP_WRITE_SAME		4
+#define		UBLK_IO_OP_WRITE_ZEROES		5
+#define		UBLK_IO_OP_ZONE_OPEN		10
+#define		UBLK_IO_OP_ZONE_CLOSE		11
+#define		UBLK_IO_OP_ZONE_FINISH		12
+#define		UBLK_IO_OP_ZONE_APPEND		13
+#define		UBLK_IO_OP_ZONE_RESET		15
+/*
+ * Construct a zone report. The report request is carried in `struct
+ * ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone
+ * and shall indicate the first zone of the report. The `nr_sectors` shall
+ * indicate how many zones should be reported (divide by zone size to get number
+ * of zones in the report) and must be an integer multiple of the zone size. The
+ * report shall be delivered as a `struct blk_zone` array. To report fewer zones
+ * than requested, zero the last entry of the returned array.
+ */
+#define		UBLK_IO_OP_REPORT_ZONES		18
 
 #define		UBLK_IO_F_FAILFAST_DEV		(1U << 8)
 #define		UBLK_IO_F_FAILFAST_TRANSPORT	(1U << 9)
@@ -331,6 +352,13 @@  struct ublk_param_devt {
 	__u32   disk_minor;
 };
 
+struct ublk_param_zoned {
+	__u32	max_open_zones;
+	__u32	max_active_zones;
+	__u32	max_zone_append_sectors;
+	__u8	reserved[20];
+};
+
 struct ublk_params {
 	/*
 	 * Total length of parameters, userspace has to set 'len' for both
@@ -342,11 +370,13 @@  struct ublk_params {
 #define UBLK_PARAM_TYPE_BASIC           (1 << 0)
 #define UBLK_PARAM_TYPE_DISCARD         (1 << 1)
 #define UBLK_PARAM_TYPE_DEVT            (1 << 2)
+#define UBLK_PARAM_TYPE_ZONED           (1 << 3)
 	__u32	types;			/* types of parameter included */
 
 	struct ublk_param_basic		basic;
 	struct ublk_param_discard	discard;
 	struct ublk_param_devt		devt;
+	struct ublk_param_zoned	zoned;
 };
 
 #endif