[2/2,V3,net] net: mana: Fix MANA VF unload when host is unresponsive

Message ID 1687771224-27162-1-git-send-email-schakrabarti@linux.microsoft.com
State New
Headers
Series [1/2,V3,net] net: mana: Fix MANA VF unload when host is unresponsive |

Commit Message

Souradeep Chakrabarti June 26, 2023, 9:20 a.m. UTC
  From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>

This is the second part of the fix.

Also this patch adds a new attribute in mana_context, which gets set when
mana_hwc_send_request() hits a timeout because of host unresponsiveness.
This flag then helps to avoid the timeouts in successive calls.

Fixes: ca9c54d2d6a5ab2430c4eda364c77125d62e5e0f (net: mana: Add a driver for
Microsoft Azure Network Adapter)
Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
---
V2 -> V3:
* Removed the initialization of vf_unload_timeout
* Splitted the patch in two.
* Fixed extra space from the commit message.
---
 drivers/net/ethernet/microsoft/mana/gdma_main.c  |  4 +++-
 drivers/net/ethernet/microsoft/mana/hw_channel.c | 12 +++++++++++-
 include/net/mana/mana.h                          |  2 ++
 3 files changed, 16 insertions(+), 2 deletions(-)
  

Comments

Praveen Kumar June 26, 2023, 2:13 p.m. UTC | #1
On 6/26/2023 2:50 PM, souradeep chakrabarti wrote:
> From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> 
> This is the second part of the fix.
> 
> Also this patch adds a new attribute in mana_context, which gets set when
> mana_hwc_send_request() hits a timeout because of host unresponsiveness.
> This flag then helps to avoid the timeouts in successive calls.
> 
> Fixes: ca9c54d2d6a5ab2430c4eda364c77125d62e5e0f (net: mana: Add a driver for
> Microsoft Azure Network Adapter)
> Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> ---
> V2 -> V3:
> * Removed the initialization of vf_unload_timeout
> * Splitted the patch in two.
> * Fixed extra space from the commit message.
> ---
>  drivers/net/ethernet/microsoft/mana/gdma_main.c  |  4 +++-
>  drivers/net/ethernet/microsoft/mana/hw_channel.c | 12 +++++++++++-
>  include/net/mana/mana.h                          |  2 ++
>  3 files changed, 16 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 8f3f78b68592..6411f01be0d9 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -946,10 +946,12 @@ int mana_gd_deregister_device(struct gdma_dev *gd)
>  	struct gdma_context *gc = gd->gdma_context;
>  	struct gdma_general_resp resp = {};
>  	struct gdma_general_req req = {};
> +	struct mana_context *ac;
>  	int err;
>  
>  	if (gd->pdid == INVALID_PDID)
>  		return -EINVAL;
> +	ac = gd->driver_data;
>  
>  	mana_gd_init_req_hdr(&req.hdr, GDMA_DEREGISTER_DEVICE, sizeof(req),
>  			     sizeof(resp));
> @@ -957,7 +959,7 @@ int mana_gd_deregister_device(struct gdma_dev *gd)
>  	req.hdr.dev_id = gd->dev_id;
>  
>  	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
> -	if (err || resp.hdr.status) {
> +	if ((err || resp.hdr.status) && !ac->vf_unload_timeout) {
>  		dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n",
>  			err, resp.hdr.status);

With !ac->vf_unload_timeout option, this message may not be correctly showing err, status. Probably you want to add explicit information during timeouts so that it give right information ? Or have the err, status field properly updated.

>  		if (!err)
> diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c
> index 9d1507eba5b9..492cb2c6e2cb 100644
> --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
> +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
> @@ -1,8 +1,10 @@
>  // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
>  /* Copyright (c) 2021, Microsoft Corporation. */
>  
> +#include "asm-generic/errno.h"
>  #include <net/mana/gdma.h>
>  #include <net/mana/hw_channel.h>
> +#include <net/mana/mana.h>
>  
>  static int mana_hwc_get_msg_index(struct hw_channel_context *hwc, u16 *msg_id)
>  {
> @@ -786,12 +788,19 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
>  	struct hwc_wq *txq = hwc->txq;
>  	struct gdma_req_hdr *req_msg;
>  	struct hwc_caller_ctx *ctx;
> +	struct mana_context *ac;
>  	u32 dest_vrcq = 0;
>  	u32 dest_vrq = 0;
>  	u16 msg_id;
>  	int err;
>  
>  	mana_hwc_get_msg_index(hwc, &msg_id);
> +	ac = hwc->gdma_dev->driver_data;

Is there a case where gdma_dev be invalid here ? If so, lets check the state and then proceed further ?

> +	if (ac->vf_unload_timeout) {
> +		dev_err(hwc->dev, "HWC: vport is already unloaded.\n");
> +		err = -ETIMEDOUT;
> +		goto out;
> +	}
>  
>  	tx_wr = &txq->msg_buf->reqs[msg_id];
>  
> @@ -825,9 +834,10 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
>  		goto out;
>  	}
>  
> -	if (!wait_for_completion_timeout(&ctx->comp_event, 30 * HZ)) {
> +	if (!wait_for_completion_timeout(&ctx->comp_event, 5 * HZ)) {

IMHO we should have macros instead of magic numbers (5 , 30 or so). But would like others to comment here.

>  		dev_err(hwc->dev, "HWC: Request timed out!\n");
>  		err = -ETIMEDOUT;
> +		ac->vf_unload_timeout = true;
>  		goto out;
>  	}
>  
> diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
> index 9eef19972845..5f5affdca1eb 100644
> --- a/include/net/mana/mana.h
> +++ b/include/net/mana/mana.h
> @@ -358,6 +358,8 @@ struct mana_context {
>  
>  	u16 num_ports;
>  
> +	bool vf_unload_timeout;
> +
>  	struct mana_eq *eqs;
>  
>  	struct net_device *ports[MAX_PORTS_IN_MANA_DEV];
  
Haiyang Zhang June 26, 2023, 3:53 p.m. UTC | #2
> -----Original Message-----
> From: souradeep chakrabarti <schakrabarti@linux.microsoft.com>
> Sent: Monday, June 26, 2023 5:20 AM
> To: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang
> <haiyangz@microsoft.com>; wei.liu@kernel.org; Dexuan Cui
> <decui@microsoft.com>; davem@davemloft.net; edumazet@google.com;
> kuba@kernel.org; pabeni@redhat.com; Long Li <longli@microsoft.com>; Ajay
> Sharma <sharmaajay@microsoft.com>; leon@kernel.org;
> cai.huoqing@linux.dev; ssengar@linux.microsoft.com; vkuznets@redhat.com;
> tglx@linutronix.de; linux-hyperv@vger.kernel.org; netdev@vger.kernel.org;
> linux-kernel@vger.kernel.org; linux-rdma@vger.kernel.org
> Cc: stable@vger.kernel.org; Souradeep Chakrabarti
> <schakrabarti@microsoft.com>; Souradeep Chakrabarti
> <schakrabarti@linux.microsoft.com>
> Subject: [PATCH 2/2 V3 net] net: mana: Fix MANA VF unload when host is
> unresponsive

In general, two patches shouldn't have the same Subject.

For this patch set, the two patches are two steps to fix the unloading issue, and
they are not very long. IMHO, they should be in one patch.

Thanks,
- Haiyang
  
Haiyang Zhang June 26, 2023, 4:02 p.m. UTC | #3
> -----Original Message-----
> From: Haiyang Zhang <haiyangz@microsoft.com>
> Sent: Monday, June 26, 2023 11:54 AM
> To: souradeep chakrabarti <schakrabarti@linux.microsoft.com>; KY Srinivasan
> <kys@microsoft.com>; wei.liu@kernel.org; Dexuan Cui
> <decui@microsoft.com>; davem@davemloft.net; edumazet@google.com;
> kuba@kernel.org; pabeni@redhat.com; Long Li <longli@microsoft.com>; Ajay
> Sharma <sharmaajay@microsoft.com>; leon@kernel.org;
> cai.huoqing@linux.dev; ssengar@linux.microsoft.com; vkuznets@redhat.com;
> tglx@linutronix.de; linux-hyperv@vger.kernel.org; netdev@vger.kernel.org;
> linux-kernel@vger.kernel.org; linux-rdma@vger.kernel.org
> Cc: stable@vger.kernel.org; Souradeep Chakrabarti
> <schakrabarti@microsoft.com>
> Subject: RE: [PATCH 2/2 V3 net] net: mana: Fix MANA VF unload when host is
> unresponsive
> 
> 
> 
> > -----Original Message-----
> > From: souradeep chakrabarti <schakrabarti@linux.microsoft.com>
> > Sent: Monday, June 26, 2023 5:20 AM
> > To: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang
> > <haiyangz@microsoft.com>; wei.liu@kernel.org; Dexuan Cui
> > <decui@microsoft.com>; davem@davemloft.net; edumazet@google.com;
> > kuba@kernel.org; pabeni@redhat.com; Long Li <longli@microsoft.com>; Ajay
> > Sharma <sharmaajay@microsoft.com>; leon@kernel.org;
> > cai.huoqing@linux.dev; ssengar@linux.microsoft.com; vkuznets@redhat.com;
> > tglx@linutronix.de; linux-hyperv@vger.kernel.org; netdev@vger.kernel.org;
> > linux-kernel@vger.kernel.org; linux-rdma@vger.kernel.org
> > Cc: stable@vger.kernel.org; Souradeep Chakrabarti
> > <schakrabarti@microsoft.com>; Souradeep Chakrabarti
> > <schakrabarti@linux.microsoft.com>
> > Subject: [PATCH 2/2 V3 net] net: mana: Fix MANA VF unload when host is
> > unresponsive
> 
> In general, two patches shouldn't have the same Subject.
> 

If two patches are preferred, please use more descriptive subjects like these:

1/2: Fix the infinite waiting on pending_sends during host unresponsiveness
2/2: Avoid extra waits if host not responding in earlier steps

Thanks,
- Haiyang
  
Haiyang Zhang June 26, 2023, 8:32 p.m. UTC | #4
> -----Original Message-----
> From: Praveen Kumar <kumarpraveen@linux.microsoft.com>
> Sent: Monday, June 26, 2023 10:13 AM
> To: souradeep chakrabarti <schakrabarti@linux.microsoft.com>; KY Srinivasan
> <kys@microsoft.com>; Haiyang Zhang <haiyangz@microsoft.com>;
> wei.liu@kernel.org; Dexuan Cui <decui@microsoft.com>;
> davem@davemloft.net; edumazet@google.com; kuba@kernel.org;
> pabeni@redhat.com; Long Li <longli@microsoft.com>; Ajay Sharma
> <sharmaajay@microsoft.com>; leon@kernel.org; cai.huoqing@linux.dev;
> ssengar@linux.microsoft.com; vkuznets@redhat.com; tglx@linutronix.de; linux-
> hyperv@vger.kernel.org; netdev@vger.kernel.org; linux-kernel@vger.kernel.org;
> linux-rdma@vger.kernel.org
> Cc: stable@vger.kernel.org; Souradeep Chakrabarti
> <schakrabarti@microsoft.com>
> Subject: Re: [PATCH 2/2 V3 net] net: mana: Fix MANA VF unload when host is
> unresponsive
> 
> On 6/26/2023 2:50 PM, souradeep chakrabarti wrote:
> > From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> >
> > This is the second part of the fix.
> >
> > Also this patch adds a new attribute in mana_context, which gets set when
> > mana_hwc_send_request() hits a timeout because of host unresponsiveness.
> > This flag then helps to avoid the timeouts in successive calls.
> >
> > Fixes: ca9c54d2d6a5ab2430c4eda364c77125d62e5e0f (net: mana: Add a
> driver for
> > Microsoft Azure Network Adapter)
> > Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> > ---
> > V2 -> V3:
> > * Removed the initialization of vf_unload_timeout
> > * Splitted the patch in two.
> > * Fixed extra space from the commit message.
> > ---
> >  drivers/net/ethernet/microsoft/mana/gdma_main.c  |  4 +++-
> >  drivers/net/ethernet/microsoft/mana/hw_channel.c | 12 +++++++++++-
> >  include/net/mana/mana.h                          |  2 ++
> >  3 files changed, 16 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > index 8f3f78b68592..6411f01be0d9 100644
> > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > @@ -946,10 +946,12 @@ int mana_gd_deregister_device(struct gdma_dev
> *gd)
> >  	struct gdma_context *gc = gd->gdma_context;
> >  	struct gdma_general_resp resp = {};
> >  	struct gdma_general_req req = {};
> > +	struct mana_context *ac;
> >  	int err;
> >
> >  	if (gd->pdid == INVALID_PDID)
> >  		return -EINVAL;
> > +	ac = gd->driver_data;
> >
> >  	mana_gd_init_req_hdr(&req.hdr, GDMA_DEREGISTER_DEVICE,
> sizeof(req),
> >  			     sizeof(resp));
> > @@ -957,7 +959,7 @@ int mana_gd_deregister_device(struct gdma_dev *gd)
> >  	req.hdr.dev_id = gd->dev_id;
> >
> >  	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
> > -	if (err || resp.hdr.status) {
> > +	if ((err || resp.hdr.status) && !ac->vf_unload_timeout) {
> >  		dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n",
> >  			err, resp.hdr.status);
> 
> With !ac->vf_unload_timeout option, this message may not be correctly
> showing err, status. Probably you want to add explicit information during
> timeouts so that it give right information ? Or have the err, status field properly
> updated.
> 
> >  		if (!err)
> > diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c
> b/drivers/net/ethernet/microsoft/mana/hw_channel.c
> > index 9d1507eba5b9..492cb2c6e2cb 100644
> > --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
> > +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
> > @@ -1,8 +1,10 @@
> >  // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
> >  /* Copyright (c) 2021, Microsoft Corporation. */
> >
> > +#include "asm-generic/errno.h"
> >  #include <net/mana/gdma.h>
> >  #include <net/mana/hw_channel.h>
> > +#include <net/mana/mana.h>
> >
> >  static int mana_hwc_get_msg_index(struct hw_channel_context *hwc, u16
> *msg_id)
> >  {
> > @@ -786,12 +788,19 @@ int mana_hwc_send_request(struct
> hw_channel_context *hwc, u32 req_len,
> >  	struct hwc_wq *txq = hwc->txq;
> >  	struct gdma_req_hdr *req_msg;
> >  	struct hwc_caller_ctx *ctx;
> > +	struct mana_context *ac;
> >  	u32 dest_vrcq = 0;
> >  	u32 dest_vrq = 0;
> >  	u16 msg_id;
> >  	int err;
> >
> >  	mana_hwc_get_msg_index(hwc, &msg_id);
> > +	ac = hwc->gdma_dev->driver_data;
> 
> Is there a case where gdma_dev be invalid here ? If so, lets check the state and
> then proceed further ?

Yes, hwc->gdma_dev is assigned shortly after it's allocated - see the code below. So
it's valid.
But hwc->gdma_dev->driver_data is hwc, not "mana_context *ac". There are two 
gdma_dev in gdma_context: hwc & mana.

You can get ac from: hwc->gdma_dev->gdma_context->mana.driver_data
Or, to avoid too many pointer deference, I suggest to put the vf_unload_timeout 
into gdma_context.

int mana_hwc_create_channel(struct gdma_context *gc)
{
        hwc = kzalloc(sizeof(*hwc), GFP_KERNEL);
...
        gd->gdma_context = gc;
        gd->driver_data = hwc;
        hwc->gdma_dev = gd;
        hwc->dev = gc->dev;


Also, mana_gd_send_request/mana_hwc_send_request() is used in many places,
not just unloading.
Should you use timeout value 5 sec, and the vf_unload_timeout flag in unloading
path only, and avoid touching other code paths? Please check with hostnet team for
suggestions.

If we decide to let the vf_unload_timeout flag affect all code paths, not just 
unloading, then it should be renamed to hwc_timeout, and submit the second patch
separately.
If just use it for unloading, since mana_gd_deregister_device() is used by PF too, 
name it like: unload_hwc_timeout.

Thanks,
-Haiyang
  
Souradeep Chakrabarti June 27, 2023, 8:42 a.m. UTC | #5
On Mon, Jun 26, 2023 at 07:43:07PM +0530, Praveen Kumar wrote:
> On 6/26/2023 2:50 PM, souradeep chakrabarti wrote:
> > From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> > 
> > This is the second part of the fix.
> > 
> > Also this patch adds a new attribute in mana_context, which gets set when
> > mana_hwc_send_request() hits a timeout because of host unresponsiveness.
> > This flag then helps to avoid the timeouts in successive calls.
> > 
> > Fixes: ca9c54d2d6a5ab2430c4eda364c77125d62e5e0f (net: mana: Add a driver for
> > Microsoft Azure Network Adapter)
> > Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> > ---
> > V2 -> V3:
> > * Removed the initialization of vf_unload_timeout
> > * Splitted the patch in two.
> > * Fixed extra space from the commit message.
> > ---
> >  drivers/net/ethernet/microsoft/mana/gdma_main.c  |  4 +++-
> >  drivers/net/ethernet/microsoft/mana/hw_channel.c | 12 +++++++++++-
> >  include/net/mana/mana.h                          |  2 ++
> >  3 files changed, 16 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > index 8f3f78b68592..6411f01be0d9 100644
> > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > @@ -946,10 +946,12 @@ int mana_gd_deregister_device(struct gdma_dev *gd)
> >  	struct gdma_context *gc = gd->gdma_context;
> >  	struct gdma_general_resp resp = {};
> >  	struct gdma_general_req req = {};
> > +	struct mana_context *ac;
> >  	int err;
> >  
> >  	if (gd->pdid == INVALID_PDID)
> >  		return -EINVAL;
> > +	ac = gd->driver_data;
> >  
> >  	mana_gd_init_req_hdr(&req.hdr, GDMA_DEREGISTER_DEVICE, sizeof(req),
> >  			     sizeof(resp));
> > @@ -957,7 +959,7 @@ int mana_gd_deregister_device(struct gdma_dev *gd)
> >  	req.hdr.dev_id = gd->dev_id;
> >  
> >  	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
> > -	if (err || resp.hdr.status) {
> > +	if ((err || resp.hdr.status) && !ac->vf_unload_timeout) {
> >  		dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n",
> >  			err, resp.hdr.status);
> 
> With !ac->vf_unload_timeout option, this message may not be correctly showing err, status. Probably you want to add explicit information during timeouts so that it give right information ? Or have the err, status field properly updated.
This check !ac->vf_unload_timeout here means if ac->vf_unload_timeout is not yet set,
then only consider the err path, else continue the remaining operation.
> 
> >  		if (!err)
> > diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c
> > index 9d1507eba5b9..492cb2c6e2cb 100644
> > --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
> > +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
> > @@ -1,8 +1,10 @@
> >  // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
> >  /* Copyright (c) 2021, Microsoft Corporation. */
> >  
> > +#include "asm-generic/errno.h"
> >  #include <net/mana/gdma.h>
> >  #include <net/mana/hw_channel.h>
> > +#include <net/mana/mana.h>
> >  
> >  static int mana_hwc_get_msg_index(struct hw_channel_context *hwc, u16 *msg_id)
> >  {
> > @@ -786,12 +788,19 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
> >  	struct hwc_wq *txq = hwc->txq;
> >  	struct gdma_req_hdr *req_msg;
> >  	struct hwc_caller_ctx *ctx;
> > +	struct mana_context *ac;
> >  	u32 dest_vrcq = 0;
> >  	u32 dest_vrq = 0;
> >  	u16 msg_id;
> >  	int err;
> >  
> >  	mana_hwc_get_msg_index(hwc, &msg_id);
> > +	ac = hwc->gdma_dev->driver_data;
> 
> Is there a case where gdma_dev be invalid here ? If so, lets check the state and then proceed further ?
I can see Haiyang has already in his comment, responded on the same.
hwc->gdma_dev will be valid here, but as Haiyang pointed we need to use
hwc->gdma_dev->gdma_context->mana.driver_data, or better to relocate the
attribute in gdma_context.
> 
> > +	if (ac->vf_unload_timeout) {
> > +		dev_err(hwc->dev, "HWC: vport is already unloaded.\n");
> > +		err = -ETIMEDOUT;
> > +		goto out;
> > +	}
> >  
> >  	tx_wr = &txq->msg_buf->reqs[msg_id];
> >  
> > @@ -825,9 +834,10 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
> >  		goto out;
> >  	}
> >  
> > -	if (!wait_for_completion_timeout(&ctx->comp_event, 30 * HZ)) {
> > +	if (!wait_for_completion_timeout(&ctx->comp_event, 5 * HZ)) {
> 
> IMHO we should have macros instead of magic numbers (5 , 30 or so). But would like others to comment here.
> 
> >  		dev_err(hwc->dev, "HWC: Request timed out!\n");
> >  		err = -ETIMEDOUT;
> > +		ac->vf_unload_timeout = true;
> >  		goto out;
> >  	}
> >  
> > diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
> > index 9eef19972845..5f5affdca1eb 100644
> > --- a/include/net/mana/mana.h
> > +++ b/include/net/mana/mana.h
> > @@ -358,6 +358,8 @@ struct mana_context {
> >  
> >  	u16 num_ports;
> >  
> > +	bool vf_unload_timeout;
> > +
> >  	struct mana_eq *eqs;
> >  
> >  	struct net_device *ports[MAX_PORTS_IN_MANA_DEV];
  
Souradeep Chakrabarti June 27, 2023, 8:45 a.m. UTC | #6
On Mon, Jun 26, 2023 at 03:53:50PM +0000, Haiyang Zhang wrote:
> 
> 
> > -----Original Message-----
> > From: souradeep chakrabarti <schakrabarti@linux.microsoft.com>
> > Sent: Monday, June 26, 2023 5:20 AM
> > To: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang
> > <haiyangz@microsoft.com>; wei.liu@kernel.org; Dexuan Cui
> > <decui@microsoft.com>; davem@davemloft.net; edumazet@google.com;
> > kuba@kernel.org; pabeni@redhat.com; Long Li <longli@microsoft.com>; Ajay
> > Sharma <sharmaajay@microsoft.com>; leon@kernel.org;
> > cai.huoqing@linux.dev; ssengar@linux.microsoft.com; vkuznets@redhat.com;
> > tglx@linutronix.de; linux-hyperv@vger.kernel.org; netdev@vger.kernel.org;
> > linux-kernel@vger.kernel.org; linux-rdma@vger.kernel.org
> > Cc: stable@vger.kernel.org; Souradeep Chakrabarti
> > <schakrabarti@microsoft.com>; Souradeep Chakrabarti
> > <schakrabarti@linux.microsoft.com>
> > Subject: [PATCH 2/2 V3 net] net: mana: Fix MANA VF unload when host is
> > unresponsive
> 
> In general, two patches shouldn't have the same Subject.
> 
> For this patch set, the two patches are two steps to fix the unloading issue, and
> they are not very long. IMHO, they should be in one patch.
> 
> Thanks,
> - Haiyang
I will create a single patch in next version. As this fixes the unloading issue.
  

Patch

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 8f3f78b68592..6411f01be0d9 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -946,10 +946,12 @@  int mana_gd_deregister_device(struct gdma_dev *gd)
 	struct gdma_context *gc = gd->gdma_context;
 	struct gdma_general_resp resp = {};
 	struct gdma_general_req req = {};
+	struct mana_context *ac;
 	int err;
 
 	if (gd->pdid == INVALID_PDID)
 		return -EINVAL;
+	ac = gd->driver_data;
 
 	mana_gd_init_req_hdr(&req.hdr, GDMA_DEREGISTER_DEVICE, sizeof(req),
 			     sizeof(resp));
@@ -957,7 +959,7 @@  int mana_gd_deregister_device(struct gdma_dev *gd)
 	req.hdr.dev_id = gd->dev_id;
 
 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
-	if (err || resp.hdr.status) {
+	if ((err || resp.hdr.status) && !ac->vf_unload_timeout) {
 		dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n",
 			err, resp.hdr.status);
 		if (!err)
diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c
index 9d1507eba5b9..492cb2c6e2cb 100644
--- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
+++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
@@ -1,8 +1,10 @@ 
 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /* Copyright (c) 2021, Microsoft Corporation. */
 
+#include "asm-generic/errno.h"
 #include <net/mana/gdma.h>
 #include <net/mana/hw_channel.h>
+#include <net/mana/mana.h>
 
 static int mana_hwc_get_msg_index(struct hw_channel_context *hwc, u16 *msg_id)
 {
@@ -786,12 +788,19 @@  int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
 	struct hwc_wq *txq = hwc->txq;
 	struct gdma_req_hdr *req_msg;
 	struct hwc_caller_ctx *ctx;
+	struct mana_context *ac;
 	u32 dest_vrcq = 0;
 	u32 dest_vrq = 0;
 	u16 msg_id;
 	int err;
 
 	mana_hwc_get_msg_index(hwc, &msg_id);
+	ac = hwc->gdma_dev->driver_data;
+	if (ac->vf_unload_timeout) {
+		dev_err(hwc->dev, "HWC: vport is already unloaded.\n");
+		err = -ETIMEDOUT;
+		goto out;
+	}
 
 	tx_wr = &txq->msg_buf->reqs[msg_id];
 
@@ -825,9 +834,10 @@  int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
 		goto out;
 	}
 
-	if (!wait_for_completion_timeout(&ctx->comp_event, 30 * HZ)) {
+	if (!wait_for_completion_timeout(&ctx->comp_event, 5 * HZ)) {
 		dev_err(hwc->dev, "HWC: Request timed out!\n");
 		err = -ETIMEDOUT;
+		ac->vf_unload_timeout = true;
 		goto out;
 	}
 
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 9eef19972845..5f5affdca1eb 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -358,6 +358,8 @@  struct mana_context {
 
 	u16 num_ports;
 
+	bool vf_unload_timeout;
+
 	struct mana_eq *eqs;
 
 	struct net_device *ports[MAX_PORTS_IN_MANA_DEV];