On 2/16/24 09:45, Daniel Wagner wrote:
> The life time of the controller is managed by the upper layers.
>
> Thus just ref counting the controller when creating it and giving the
> ref back on the cleanup path. This is how the other transport are
> managed as well. Until now, the ref count has been taken per LS request
> which is not really necessary as the core guarantees that there is no in
> flight request when shuting down (if we use the nvme APIs are used
> correctly).
>
> In fact we don't really need the ref count for nvme_fc_ctrl at this
> point. Though, the FC transport is offloading the connect attempt to a
> workqueue and in the next patch we introduce a sync option for which the
> ref counter is necessary. So let's keep it around.
>
> Also take a ref for lport and rport when creating the controller and
> give it back when we destroy the controller. This means these refs are
> tied to the life time of the controller and not the other way around.
>
> We have also to reorder the cleanup code in nvme_fc_delete_ctrl and
> nvme_fc_free_ctrl so that we do not expose resources too long and run
> into use after free situations which are currently possible.
>
> Signed-off-by: Daniel Wagner <dwagner@suse.de>
> ---
> drivers/nvme/host/fc.c | 136 +++++++++++++----------------------------
> 1 file changed, 41 insertions(+), 95 deletions(-)
>
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index ddbc5b21af5b..7f9edab57550 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -229,6 +229,9 @@ static struct device *fc_udev_device;
>
> static void nvme_fc_complete_rq(struct request *rq);
>
> +static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
> +static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
> +
> /* *********************** FC-NVME Port Management ************************ */
>
> static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *,
> @@ -800,7 +803,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
> dev_warn(ctrl->ctrl.device,
> "NVME-FC{%d}: Couldn't schedule reset.\n",
> ctrl->cnum);
> - nvme_delete_ctrl(&ctrl->ctrl);
> + nvme_fc_ctrl_put(ctrl);
> }
> break;
>
> @@ -868,7 +871,7 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr)
> dev_warn(ctrl->ctrl.device,
> "NVME-FC{%d}: controller connectivity lost.\n",
> ctrl->cnum);
> - nvme_delete_ctrl(&ctrl->ctrl);
> + nvme_fc_ctrl_put(ctrl);
> } else
> nvme_fc_ctrl_connectivity_loss(ctrl);
> }
> @@ -1022,9 +1025,6 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
>
> /* *********************** FC-NVME LS Handling **************************** */
>
> -static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
> -static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
> -
> static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg);
>
> static void
> @@ -1050,8 +1050,6 @@ __nvme_fc_finish_ls_req(struct nvmefc_ls_req_op *lsop)
> fc_dma_unmap_single(rport->dev, lsreq->rqstdma,
> (lsreq->rqstlen + lsreq->rsplen),
> DMA_BIDIRECTIONAL);
> -
> - nvme_fc_rport_put(rport);
> }
>
Hmm. I'm a bit unsure about this; essentially you change the rport
refcounting (and not just the controller refcounting).
And the problem here is that rport refcounting is actually tied to
the driver-internal rports, which have a different lifetime
(dev_loss_tmo and all that).
Would it be possible to break this in two, with one patch changing the
controller/options refcounting and the other one changing the rport
refcounting?
Cheers,
Hannes
@@ -229,6 +229,9 @@ static struct device *fc_udev_device;
static void nvme_fc_complete_rq(struct request *rq);
+static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
+static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
+
/* *********************** FC-NVME Port Management ************************ */
static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *,
@@ -800,7 +803,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: Couldn't schedule reset.\n",
ctrl->cnum);
- nvme_delete_ctrl(&ctrl->ctrl);
+ nvme_fc_ctrl_put(ctrl);
}
break;
@@ -868,7 +871,7 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr)
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: controller connectivity lost.\n",
ctrl->cnum);
- nvme_delete_ctrl(&ctrl->ctrl);
+ nvme_fc_ctrl_put(ctrl);
} else
nvme_fc_ctrl_connectivity_loss(ctrl);
}
@@ -1022,9 +1025,6 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
/* *********************** FC-NVME LS Handling **************************** */
-static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
-static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
-
static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg);
static void
@@ -1050,8 +1050,6 @@ __nvme_fc_finish_ls_req(struct nvmefc_ls_req_op *lsop)
fc_dma_unmap_single(rport->dev, lsreq->rqstdma,
(lsreq->rqstlen + lsreq->rsplen),
DMA_BIDIRECTIONAL);
-
- nvme_fc_rport_put(rport);
}
static int
@@ -1066,9 +1064,6 @@ __nvme_fc_send_ls_req(struct nvme_fc_rport *rport,
if (rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
return -ECONNREFUSED;
- if (!nvme_fc_rport_get(rport))
- return -ESHUTDOWN;
-
lsreq->done = done;
lsop->rport = rport;
lsop->req_queued = false;
@@ -1078,10 +1073,8 @@ __nvme_fc_send_ls_req(struct nvme_fc_rport *rport,
lsreq->rqstdma = fc_dma_map_single(rport->dev, lsreq->rqstaddr,
lsreq->rqstlen + lsreq->rsplen,
DMA_BIDIRECTIONAL);
- if (fc_dma_mapping_error(rport->dev, lsreq->rqstdma)) {
- ret = -EFAULT;
- goto out_putrport;
- }
+ if (fc_dma_mapping_error(rport->dev, lsreq->rqstdma))
+ return -EFAULT;
lsreq->rspdma = lsreq->rqstdma + lsreq->rqstlen;
spin_lock_irqsave(&rport->lock, flags);
@@ -1108,9 +1101,6 @@ __nvme_fc_send_ls_req(struct nvme_fc_rport *rport,
fc_dma_unmap_single(rport->dev, lsreq->rqstdma,
(lsreq->rqstlen + lsreq->rsplen),
DMA_BIDIRECTIONAL);
-out_putrport:
- nvme_fc_rport_put(rport);
-
return ret;
}
@@ -1471,8 +1461,6 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
kfree(lsop->rspbuf);
kfree(lsop->rqstbuf);
kfree(lsop);
-
- nvme_fc_rport_put(rport);
}
static void
@@ -1511,8 +1499,6 @@ nvme_fc_match_disconn_ls(struct nvme_fc_rport *rport,
spin_lock_irqsave(&rport->lock, flags);
list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) {
- if (!nvme_fc_ctrl_get(ctrl))
- continue;
spin_lock(&ctrl->lock);
if (association_id == ctrl->association_id) {
oldls = ctrl->rcv_disconn;
@@ -1520,10 +1506,6 @@ nvme_fc_match_disconn_ls(struct nvme_fc_rport *rport,
ret = ctrl;
}
spin_unlock(&ctrl->lock);
- if (ret)
- /* leave the ctrl get reference */
- break;
- nvme_fc_ctrl_put(ctrl);
}
spin_unlock_irqrestore(&rport->lock, flags);
@@ -1602,9 +1584,6 @@ nvme_fc_ls_disconnect_assoc(struct nvmefc_ls_rcv_op *lsop)
/* fail the association */
nvme_fc_error_recovery(ctrl, "Disconnect Association LS received");
- /* release the reference taken by nvme_fc_match_disconn_ls() */
- nvme_fc_ctrl_put(ctrl);
-
return false;
}
@@ -1734,16 +1713,13 @@ nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr,
unsigned long flags;
int ret;
- nvme_fc_rport_get(rport);
-
/* validate there's a routine to transmit a response */
if (!lport->ops->xmt_ls_rsp) {
dev_info(lport->dev,
"RCV %s LS failed: no LLDD xmt_ls_rsp\n",
(w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ?
nvmefc_ls_names[w0->ls_cmd] : "");
- ret = -EINVAL;
- goto out_put;
+ return -EINVAL;
}
if (lsreqbuf_len > sizeof(union nvmefc_ls_requests)) {
@@ -1751,15 +1727,13 @@ nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr,
"RCV %s LS failed: payload too large\n",
(w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ?
nvmefc_ls_names[w0->ls_cmd] : "");
- ret = -E2BIG;
- goto out_put;
+ return -E2BIG;
}
lsop = kzalloc(sizeof(*lsop), GFP_KERNEL);
if (!lsop) {
nvme_fc_rcv_ls_req_err_msg(lport, w0);
- ret = -ENOMEM;
- goto out_put;
+ return -ENOMEM;
}
lsop->rqstbuf = kzalloc(sizeof(*lsop->rqstbuf), GFP_KERNEL);
@@ -1808,8 +1782,6 @@ nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr,
kfree(lsop->rspbuf);
kfree(lsop->rqstbuf);
kfree(lsop);
-out_put:
- nvme_fc_rport_put(rport);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_fc_rcv_ls_req);
@@ -2071,7 +2043,6 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
atomic_set(&op->state, FCPOP_STATE_IDLE);
op->flags = FCOP_FLAGS_AEN; /* clear other flags */
- nvme_fc_ctrl_put(ctrl);
goto check_error;
}
@@ -2383,37 +2354,18 @@ nvme_fc_init_io_queues(struct nvme_fc_ctrl *ctrl)
}
static void
-nvme_fc_ctrl_free(struct kref *ref)
+nvme_fc_ctrl_delete(struct kref *ref)
{
struct nvme_fc_ctrl *ctrl =
container_of(ref, struct nvme_fc_ctrl, ref);
- unsigned long flags;
-
- if (ctrl->ctrl.tagset)
- nvme_remove_io_tag_set(&ctrl->ctrl);
-
- /* remove from rport list */
- spin_lock_irqsave(&ctrl->rport->lock, flags);
- list_del(&ctrl->ctrl_list);
- spin_unlock_irqrestore(&ctrl->rport->lock, flags);
-
- nvme_unquiesce_admin_queue(&ctrl->ctrl);
- nvme_remove_admin_tag_set(&ctrl->ctrl);
-
- kfree(ctrl->queues);
-
- put_device(ctrl->dev);
- nvme_fc_rport_put(ctrl->rport);
- ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum);
- nvmf_ctrl_options_put(ctrl->ctrl.opts);
- kfree(ctrl);
+ nvme_delete_ctrl(&ctrl->ctrl);
}
static void
nvme_fc_ctrl_put(struct nvme_fc_ctrl *ctrl)
{
- kref_put(&ctrl->ref, nvme_fc_ctrl_free);
+ kref_put(&ctrl->ref, nvme_fc_ctrl_delete);
}
static int
@@ -2431,9 +2383,18 @@ nvme_fc_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
- WARN_ON(nctrl != &ctrl->ctrl);
- nvme_fc_ctrl_put(ctrl);
+ if (ctrl->ctrl.tagset)
+ nvme_remove_io_tag_set(&ctrl->ctrl);
+
+ nvme_unquiesce_admin_queue(&ctrl->ctrl);
+ nvme_remove_admin_tag_set(&ctrl->ctrl);
+
+ kfree(ctrl->queues);
+
+ ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum);
+ nvmf_ctrl_options_put(ctrl->ctrl.opts);
+ kfree(ctrl);
}
/*
@@ -2682,9 +2643,6 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
return BLK_STS_RESOURCE;
- if (!nvme_fc_ctrl_get(ctrl))
- return BLK_STS_IOERR;
-
/* format the FC-NVME CMD IU and fcp_req */
cmdiu->connection_id = cpu_to_be64(queue->connection_id);
cmdiu->data_len = cpu_to_be32(data_len);
@@ -2729,7 +2687,6 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
ret = nvme_fc_map_data(ctrl, op->rq, op);
if (ret < 0) {
nvme_cleanup_cmd(op->rq);
- nvme_fc_ctrl_put(ctrl);
if (ret == -ENOMEM || ret == -EAGAIN)
return BLK_STS_RESOURCE;
return BLK_STS_IOERR;
@@ -2770,8 +2727,6 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
nvme_cleanup_cmd(op->rq);
}
- nvme_fc_ctrl_put(ctrl);
-
if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE &&
ret != -EBUSY)
return BLK_STS_IOERR;
@@ -2855,7 +2810,6 @@ nvme_fc_complete_rq(struct request *rq)
nvme_fc_unmap_data(ctrl, rq, op);
nvme_complete_rq(rq);
- nvme_fc_ctrl_put(ctrl);
}
static void nvme_fc_map_queues(struct blk_mq_tag_set *set)
@@ -3284,14 +3238,24 @@ static void
nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
+ unsigned long flags;
cancel_work_sync(&ctrl->ioerr_work);
cancel_delayed_work_sync(&ctrl->connect_work);
+
+ /* remove from rport list */
+ spin_lock_irqsave(&ctrl->rport->lock, flags);
+ list_del(&ctrl->ctrl_list);
+ spin_unlock_irqrestore(&ctrl->rport->lock, flags);
+
/*
* kill the association on the link side. this will block
* waiting for io to terminate
*/
nvme_fc_delete_association(ctrl);
+
+ nvme_fc_rport_put(ctrl->rport);
+ nvme_fc_lport_put(ctrl->lport);
}
static void
@@ -3344,7 +3308,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
ctrl->cnum, min_t(int, portptr->dev_loss_tmo,
(ctrl->ctrl.opts->max_reconnects *
ctrl->ctrl.opts->reconnect_delay)));
- WARN_ON(nvme_delete_ctrl(&ctrl->ctrl));
+ nvme_fc_ctrl_put(ctrl);
}
}
@@ -3502,12 +3466,13 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
INIT_LIST_HEAD(&ctrl->ctrl_list);
ctrl->lport = lport;
ctrl->rport = rport;
+ nvme_fc_lport_get(lport);
+ nvme_fc_rport_get(rport);
ctrl->dev = lport->dev;
ctrl->cnum = idx;
ctrl->ioq_live = false;
init_waitqueue_head(&ctrl->ioabort_wait);
- get_device(ctrl->dev);
kref_init(&ctrl->ref);
INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work);
@@ -3582,32 +3547,13 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
return &ctrl->ctrl;
fail_ctrl:
- nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING);
- cancel_work_sync(&ctrl->ioerr_work);
- cancel_work_sync(&ctrl->ctrl.reset_work);
- cancel_delayed_work_sync(&ctrl->connect_work);
-
- /* initiate nvme ctrl ref counting teardown */
- nvme_uninit_ctrl(&ctrl->ctrl);
-
- /* Remove core ctrl ref. */
- nvme_put_ctrl(&ctrl->ctrl);
-
- /* as we're past the point where we transition to the ref
- * counting teardown path, if we return a bad pointer here,
- * the calling routine, thinking it's prior to the
- * transition, will do an rport put. Since the teardown
- * path also does a rport put, we do an extra get here to
- * so proper order/teardown happens.
- */
- nvme_fc_rport_get(rport);
+ nvme_fc_ctrl_put(ctrl);
return ERR_PTR(-EIO);
out_free_queues:
kfree(ctrl->queues);
out_free_ida:
- put_device(ctrl->dev);
ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum);
out_free_opts:
nvmf_ctrl_options_put(opts);
@@ -3724,8 +3670,8 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
spin_unlock_irqrestore(&nvme_fc_lock, flags);
ctrl = nvme_fc_init_ctrl(dev, opts, lport, rport);
- if (IS_ERR(ctrl))
- nvme_fc_rport_put(rport);
+ nvme_fc_rport_put(rport);
+
return ctrl;
}
}
@@ -3950,7 +3896,7 @@ nvme_fc_delete_controllers(struct nvme_fc_rport *rport)
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: transport unloading: deleting ctrl\n",
ctrl->cnum);
- nvme_delete_ctrl(&ctrl->ctrl);
+ nvme_fc_ctrl_put(ctrl);
}
spin_unlock(&rport->lock);
}