[v9,03/12] net: mana: Handle vport sharing between devices
Commit Message
From: Long Li <longli@microsoft.com>
For outgoing packets, the PF requires the VF to configure the vport with
corresponding protection domain and doorbell ID for the kernel or user
context. The vport can't be shared between different contexts.
Implement the logic to exclusively take over the vport by either the
Ethernet device or RDMA device.
Reviewed-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Long Li <longli@microsoft.com>
Acked-by: Haiyang Zhang <haiyangz@microsoft.com>
---
Change log:
v2: use refcount instead of directly using atomic variables
v4: change to mutex to avoid possible race with refcount
v5: add detailed comments explaining vport sharing, use EXPORT_SYMBOL_NS
v6: rebased to rdma-next
drivers/net/ethernet/microsoft/mana/mana.h | 7 +++
drivers/net/ethernet/microsoft/mana/mana_en.c | 53 ++++++++++++++++++-
2 files changed, 58 insertions(+), 2 deletions(-)
Comments
On 2022/10/22 8:01, longli@linuxonhyperv.com wrote:
> From: Long Li <longli@microsoft.com>
>
> For outgoing packets, the PF requires the VF to configure the vport with
> corresponding protection domain and doorbell ID for the kernel or user
> context. The vport can't be shared between different contexts.
>
> Implement the logic to exclusively take over the vport by either the
> Ethernet device or RDMA device.
>
> Reviewed-by: Dexuan Cui <decui@microsoft.com>
> Signed-off-by: Long Li <longli@microsoft.com>
> Acked-by: Haiyang Zhang <haiyangz@microsoft.com>
> ---
> Change log:
> v2: use refcount instead of directly using atomic variables
> v4: change to mutex to avoid possible race with refcount
> v5: add detailed comments explaining vport sharing, use EXPORT_SYMBOL_NS
> v6: rebased to rdma-next
>
> drivers/net/ethernet/microsoft/mana/mana.h | 7 +++
> drivers/net/ethernet/microsoft/mana/mana_en.c | 53 ++++++++++++++++++-
> 2 files changed, 58 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/ethernet/microsoft/mana/mana.h b/drivers/net/ethernet/microsoft/mana/mana.h
> index d58be64374c8..2883a08dbfb5 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana.h
> +++ b/drivers/net/ethernet/microsoft/mana/mana.h
> @@ -380,6 +380,10 @@ struct mana_port_context {
> mana_handle_t port_handle;
> mana_handle_t pf_filter_handle;
>
> + /* Mutex for sharing access to vport_use_count */
> + struct mutex vport_mutex;
> + int vport_use_count;
> +
> u16 port_idx;
>
> bool port_is_up;
> @@ -631,4 +635,7 @@ struct mana_tx_package {
> struct gdma_posted_wqe_info wqe_info;
> };
>
> +int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
> + u32 doorbell_pg_id);
> +void mana_uncfg_vport(struct mana_port_context *apc);
> #endif /* _MANA_H */
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index 8751e475d1ba..efe14a343fd1 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -646,13 +646,48 @@ static int mana_query_vport_cfg(struct mana_port_context *apc, u32 vport_index,
> return 0;
> }
>
> -static int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
> - u32 doorbell_pg_id)
> +void mana_uncfg_vport(struct mana_port_context *apc)
> +{
> + mutex_lock(&apc->vport_mutex);
> + apc->vport_use_count--;
> + WARN_ON(apc->vport_use_count < 0);
> + mutex_unlock(&apc->vport_mutex);
> +}
> +EXPORT_SYMBOL_NS(mana_uncfg_vport, NET_MANA);
> +
> +int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
> + u32 doorbell_pg_id)
> {
> struct mana_config_vport_resp resp = {};
> struct mana_config_vport_req req = {};
> int err;
>
> + /* This function is used to program the Ethernet port in the hardware
> + * table. It can be called from the Ethernet driver or the RDMA driver.
> + *
> + * For Ethernet usage, the hardware supports only one active user on a
> + * physical port. The driver checks on the port usage before programming
> + * the hardware when creating the RAW QP (RDMA driver) or exposing the
> + * device to kernel NET layer (Ethernet driver).
> + *
> + * Because the RDMA driver doesn't know in advance which QP type the
> + * user will create, it exposes the device with all its ports. The user
> + * may not be able to create RAW QP on a port if this port is already
> + * in used by the Ethernet driver from the kernel.
> + *
> + * This physical port limitation only applies to the RAW QP. For RC QP,
> + * the hardware doesn't have this limitation. The user can create RC
> + * QPs on a physical port up to the hardware limits independent of the
> + * Ethernet usage on the same port.
> + */
> + mutex_lock(&apc->vport_mutex);
> + if (apc->vport_use_count > 0) {
> + mutex_unlock(&apc->vport_mutex);
> + return -EBUSY;
> + }
> + apc->vport_use_count++;
> + mutex_unlock(&apc->vport_mutex);
> +
> mana_gd_init_req_hdr(&req.hdr, MANA_CONFIG_VPORT_TX,
> sizeof(req), sizeof(resp));
> req.vport = apc->port_handle;
> @@ -679,9 +714,16 @@ static int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
>
> apc->tx_shortform_allowed = resp.short_form_allowed;
> apc->tx_vp_offset = resp.tx_vport_offset;
> +
> + netdev_info(apc->ndev, "Configured vPort %llu PD %u DB %u\n",
> + apc->port_handle, protection_dom_id, doorbell_pg_id);
> out:
> + if (err)
> + mana_uncfg_vport(apc);
There seems to be a similar race between error handling here and the
"apc->vport_use_count > 0" checking above as pointed out in v7.
> +
> return err;
> }
> +EXPORT_SYMBOL_NS(mana_cfg_vport, NET_MANA);
>
> static int mana_cfg_vport_steering(struct mana_port_context *apc,
> enum TRI_STATE rx,
> @@ -742,6 +784,9 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc,
> resp.hdr.status);
> err = -EPROTO;
> }
> +
> + netdev_info(ndev, "Configured steering vPort %llu entries %u\n",
> + apc->port_handle, num_entries);
> out:
> kfree(req);
> return err;
> @@ -1804,6 +1849,7 @@ static void mana_destroy_vport(struct mana_port_context *apc)
> }
>
> mana_destroy_txq(apc);
> + mana_uncfg_vport(apc);
>
> if (gd->gdma_context->is_pf)
> mana_pf_deregister_hw_vport(apc);
> @@ -2076,6 +2122,9 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
> apc->pf_filter_handle = INVALID_MANA_HANDLE;
> apc->port_idx = port_idx;
>
> + mutex_init(&apc->vport_mutex);
> + apc->vport_use_count = 0;
> +
> ndev->netdev_ops = &mana_devops;
> ndev->ethtool_ops = &mana_ethtool_ops;
> ndev->mtu = ETH_DATA_LEN;
>
> > +int mana_cfg_vport(struct mana_port_context *apc, u32
> protection_dom_id,
> > + u32 doorbell_pg_id)
> > {
> > struct mana_config_vport_resp resp = {};
> > struct mana_config_vport_req req = {};
> > int err;
> >
> > + /* This function is used to program the Ethernet port in the hardware
> > + * table. It can be called from the Ethernet driver or the RDMA driver.
> > + *
> > + * For Ethernet usage, the hardware supports only one active user on
> a
> > + * physical port. The driver checks on the port usage before
> programming
> > + * the hardware when creating the RAW QP (RDMA driver) or
> exposing the
> > + * device to kernel NET layer (Ethernet driver).
> > + *
> > + * Because the RDMA driver doesn't know in advance which QP type
> the
> > + * user will create, it exposes the device with all its ports. The user
> > + * may not be able to create RAW QP on a port if this port is already
> > + * in used by the Ethernet driver from the kernel.
> > + *
> > + * This physical port limitation only applies to the RAW QP. For RC QP,
> > + * the hardware doesn't have this limitation. The user can create RC
> > + * QPs on a physical port up to the hardware limits independent of
> the
> > + * Ethernet usage on the same port.
> > + */
> > + mutex_lock(&apc->vport_mutex);
> > + if (apc->vport_use_count > 0) {
> > + mutex_unlock(&apc->vport_mutex);
> > + return -EBUSY;
> > + }
> > + apc->vport_use_count++;
> > + mutex_unlock(&apc->vport_mutex);
> > +
> > mana_gd_init_req_hdr(&req.hdr, MANA_CONFIG_VPORT_TX,
> > sizeof(req), sizeof(resp));
> > req.vport = apc->port_handle;
> > @@ -679,9 +714,16 @@ static int mana_cfg_vport(struct
> > mana_port_context *apc, u32 protection_dom_id,
> >
> > apc->tx_shortform_allowed = resp.short_form_allowed;
> > apc->tx_vp_offset = resp.tx_vport_offset;
> > +
> > + netdev_info(apc->ndev, "Configured vPort %llu PD %u DB %u\n",
> > + apc->port_handle, protection_dom_id, doorbell_pg_id);
> > out:
> > + if (err)
> > + mana_uncfg_vport(apc);
>
> There seems to be a similar race between error handling here and the "apc-
> >vport_use_count > 0" checking above as pointed out in v7.
Thanks for looking into this.
This is different to the locking bug in mana_ib_cfg_vport(). The vport sharing
between Ethernet and RDMA is exclusive, not shared. If another driver tries
to take the vport while it is being configured, it will fail immediately. It is by
design to prevent possible deadlock.
On 2022/10/25 2:45, Long Li wrote:
>>> +int mana_cfg_vport(struct mana_port_context *apc, u32
>> protection_dom_id,
>>> + u32 doorbell_pg_id)
>>> {
>>> struct mana_config_vport_resp resp = {};
>>> struct mana_config_vport_req req = {};
>>> int err;
>>>
>>> + /* This function is used to program the Ethernet port in the hardware
>>> + * table. It can be called from the Ethernet driver or the RDMA driver.
>>> + *
>>> + * For Ethernet usage, the hardware supports only one active user on
>> a
>>> + * physical port. The driver checks on the port usage before
>> programming
>>> + * the hardware when creating the RAW QP (RDMA driver) or
>> exposing the
>>> + * device to kernel NET layer (Ethernet driver).
>>> + *
>>> + * Because the RDMA driver doesn't know in advance which QP type
>> the
>>> + * user will create, it exposes the device with all its ports. The user
>>> + * may not be able to create RAW QP on a port if this port is already
>>> + * in used by the Ethernet driver from the kernel.
>>> + *
>>> + * This physical port limitation only applies to the RAW QP. For RC QP,
>>> + * the hardware doesn't have this limitation. The user can create RC
>>> + * QPs on a physical port up to the hardware limits independent of
>> the
>>> + * Ethernet usage on the same port.
>>> + */
>>> + mutex_lock(&apc->vport_mutex);
>>> + if (apc->vport_use_count > 0) {
>>> + mutex_unlock(&apc->vport_mutex);
>>> + return -EBUSY;
>>> + }
>>> + apc->vport_use_count++;
>>> + mutex_unlock(&apc->vport_mutex);
>>> +
>>> mana_gd_init_req_hdr(&req.hdr, MANA_CONFIG_VPORT_TX,
>>> sizeof(req), sizeof(resp));
>>> req.vport = apc->port_handle;
>>> @@ -679,9 +714,16 @@ static int mana_cfg_vport(struct
>>> mana_port_context *apc, u32 protection_dom_id,
>>>
>>> apc->tx_shortform_allowed = resp.short_form_allowed;
>>> apc->tx_vp_offset = resp.tx_vport_offset;
>>> +
>>> + netdev_info(apc->ndev, "Configured vPort %llu PD %u DB %u\n",
>>> + apc->port_handle, protection_dom_id, doorbell_pg_id);
>>> out:
>>> + if (err)
>>> + mana_uncfg_vport(apc);
>>
>> There seems to be a similar race between error handling here and the "apc-
>>> vport_use_count > 0" checking above as pointed out in v7.
>
> Thanks for looking into this.
>
> This is different to the locking bug in mana_ib_cfg_vport(). The vport sharing
> between Ethernet and RDMA is exclusive, not shared. If another driver tries
> to take the vport while it is being configured, it will fail immediately. It is by
Suppose the following steps:
1. Ethernet driver take the lock first and do a "apc->vport_use_count++", and
release the lock;
2. RDMA driver take the lock, do "apc->vport_use_count > 0" checking and return
-EBUSY;
3. mana_send_request() or mana_verify_resp_hdr() return error to Ethernet driver.
It seems that vport is left unused when above happens, if that is what you wanted?
> design to prevent possible deadlock.
I am not sure I understand the deadlock here.
>
> >>> @@ -679,9 +714,16 @@ static int mana_cfg_vport(struct
> >>> mana_port_context *apc, u32 protection_dom_id,
> >>>
> >>> apc->tx_shortform_allowed = resp.short_form_allowed;
> >>> apc->tx_vp_offset = resp.tx_vport_offset;
> >>> +
> >>> + netdev_info(apc->ndev, "Configured vPort %llu PD %u DB %u\n",
> >>> + apc->port_handle, protection_dom_id, doorbell_pg_id);
> >>> out:
> >>> + if (err)
> >>> + mana_uncfg_vport(apc);
> >>
> >> There seems to be a similar race between error handling here and the
> >> "apc-
> >>> vport_use_count > 0" checking above as pointed out in v7.
> >
> > Thanks for looking into this.
> >
> > This is different to the locking bug in mana_ib_cfg_vport(). The vport
> > sharing between Ethernet and RDMA is exclusive, not shared. If another
> > driver tries to take the vport while it is being configured, it will
> > fail immediately. It is by
>
> Suppose the following steps:
> 1. Ethernet driver take the lock first and do a "apc->vport_use_count++",
> and
> release the lock;
> 2. RDMA driver take the lock, do "apc->vport_use_count > 0" checking and
> return
> -EBUSY;
> 3. mana_send_request() or mana_verify_resp_hdr() return error to
> Ethernet driver.
>
> It seems that vport is left unused when above happens, if that is what you
> wanted?
Yes, in this case the vport is left unused. There is no resource leak.
This is expected.
>
>
> > design to prevent possible deadlock.
>
> I am not sure I understand the deadlock here.
Because we are dealing with two drivers. I don't want to block as
mana_send_request() is a blocking call.
@@ -380,6 +380,10 @@ struct mana_port_context {
mana_handle_t port_handle;
mana_handle_t pf_filter_handle;
+ /* Mutex for sharing access to vport_use_count */
+ struct mutex vport_mutex;
+ int vport_use_count;
+
u16 port_idx;
bool port_is_up;
@@ -631,4 +635,7 @@ struct mana_tx_package {
struct gdma_posted_wqe_info wqe_info;
};
+int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
+ u32 doorbell_pg_id);
+void mana_uncfg_vport(struct mana_port_context *apc);
#endif /* _MANA_H */
@@ -646,13 +646,48 @@ static int mana_query_vport_cfg(struct mana_port_context *apc, u32 vport_index,
return 0;
}
-static int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
- u32 doorbell_pg_id)
+void mana_uncfg_vport(struct mana_port_context *apc)
+{
+ mutex_lock(&apc->vport_mutex);
+ apc->vport_use_count--;
+ WARN_ON(apc->vport_use_count < 0);
+ mutex_unlock(&apc->vport_mutex);
+}
+EXPORT_SYMBOL_NS(mana_uncfg_vport, NET_MANA);
+
+int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
+ u32 doorbell_pg_id)
{
struct mana_config_vport_resp resp = {};
struct mana_config_vport_req req = {};
int err;
+ /* This function is used to program the Ethernet port in the hardware
+ * table. It can be called from the Ethernet driver or the RDMA driver.
+ *
+ * For Ethernet usage, the hardware supports only one active user on a
+ * physical port. The driver checks on the port usage before programming
+ * the hardware when creating the RAW QP (RDMA driver) or exposing the
+ * device to kernel NET layer (Ethernet driver).
+ *
+ * Because the RDMA driver doesn't know in advance which QP type the
+ * user will create, it exposes the device with all its ports. The user
+ * may not be able to create RAW QP on a port if this port is already
+ * in used by the Ethernet driver from the kernel.
+ *
+ * This physical port limitation only applies to the RAW QP. For RC QP,
+ * the hardware doesn't have this limitation. The user can create RC
+ * QPs on a physical port up to the hardware limits independent of the
+ * Ethernet usage on the same port.
+ */
+ mutex_lock(&apc->vport_mutex);
+ if (apc->vport_use_count > 0) {
+ mutex_unlock(&apc->vport_mutex);
+ return -EBUSY;
+ }
+ apc->vport_use_count++;
+ mutex_unlock(&apc->vport_mutex);
+
mana_gd_init_req_hdr(&req.hdr, MANA_CONFIG_VPORT_TX,
sizeof(req), sizeof(resp));
req.vport = apc->port_handle;
@@ -679,9 +714,16 @@ static int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
apc->tx_shortform_allowed = resp.short_form_allowed;
apc->tx_vp_offset = resp.tx_vport_offset;
+
+ netdev_info(apc->ndev, "Configured vPort %llu PD %u DB %u\n",
+ apc->port_handle, protection_dom_id, doorbell_pg_id);
out:
+ if (err)
+ mana_uncfg_vport(apc);
+
return err;
}
+EXPORT_SYMBOL_NS(mana_cfg_vport, NET_MANA);
static int mana_cfg_vport_steering(struct mana_port_context *apc,
enum TRI_STATE rx,
@@ -742,6 +784,9 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc,
resp.hdr.status);
err = -EPROTO;
}
+
+ netdev_info(ndev, "Configured steering vPort %llu entries %u\n",
+ apc->port_handle, num_entries);
out:
kfree(req);
return err;
@@ -1804,6 +1849,7 @@ static void mana_destroy_vport(struct mana_port_context *apc)
}
mana_destroy_txq(apc);
+ mana_uncfg_vport(apc);
if (gd->gdma_context->is_pf)
mana_pf_deregister_hw_vport(apc);
@@ -2076,6 +2122,9 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
apc->pf_filter_handle = INVALID_MANA_HANDLE;
apc->port_idx = port_idx;
+ mutex_init(&apc->vport_mutex);
+ apc->vport_use_count = 0;
+
ndev->netdev_ops = &mana_devops;
ndev->ethtool_ops = &mana_ethtool_ops;
ndev->mtu = ETH_DATA_LEN;