[net-next,v1,1/4] net: phy: mxl-gpy: add MDINT workaround

Message ID 20221202151204.3318592-2-michael@walle.cc
State New
Headers
Series net: phy: mxl-gpy: broken interrupt fixes |

Commit Message

Michael Walle Dec. 2, 2022, 3:12 p.m. UTC
  At least the GPY215B and GPY215C has a bug where it is still driving the
interrupt line (MDINT) even after the interrupt status register is read
and its bits are cleared. This will cause an interrupt storm.

Although the MDINT is multiplexed with a GPIO pin and theoretically we
could switch the pinmux to GPIO input mode, this isn't possible because
the access to this register will stall exactly as long as the interrupt
line is asserted. We exploit this very fact and just read a random
internal register in our interrupt handler. This way, it will be delayed
until the external interrupt line is released and an interrupt storm is
avoided.

The internal register access via the mailbox was deduced by looking at
the downstream PHY API because the datasheet doesn't mention any of
this.

Signed-off-by: Michael Walle <michael@walle.cc>
---
 drivers/net/phy/mxl-gpy.c | 83 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)
  

Comments

Andrew Lunn Dec. 2, 2022, 6:23 p.m. UTC | #1
On Fri, Dec 02, 2022 at 04:12:01PM +0100, Michael Walle wrote:
> At least the GPY215B and GPY215C has a bug where it is still driving the
> interrupt line (MDINT) even after the interrupt status register is read
> and its bits are cleared. This will cause an interrupt storm.
> 
> Although the MDINT is multiplexed with a GPIO pin and theoretically we
> could switch the pinmux to GPIO input mode, this isn't possible because
> the access to this register will stall exactly as long as the interrupt
> line is asserted. We exploit this very fact and just read a random
> internal register in our interrupt handler. This way, it will be delayed
> until the external interrupt line is released and an interrupt storm is
> avoided.
> 
> The internal register access via the mailbox was deduced by looking at
> the downstream PHY API because the datasheet doesn't mention any of
> this.
> 
> Signed-off-by: Michael Walle <michael@walle.cc>
> ---
>  drivers/net/phy/mxl-gpy.c | 83 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 83 insertions(+)
> 
> diff --git a/drivers/net/phy/mxl-gpy.c b/drivers/net/phy/mxl-gpy.c
> index 0ff7ef076072..20e610dda891 100644
> --- a/drivers/net/phy/mxl-gpy.c
> +++ b/drivers/net/phy/mxl-gpy.c
> @@ -9,6 +9,7 @@
>  #include <linux/module.h>
>  #include <linux/bitfield.h>
>  #include <linux/hwmon.h>
> +#include <linux/mutex.h>
>  #include <linux/phy.h>
>  #include <linux/polynomial.h>
>  #include <linux/netdevice.h>
> @@ -81,6 +82,14 @@
>  #define VSPEC1_TEMP_STA	0x0E
>  #define VSPEC1_TEMP_STA_DATA	GENMASK(9, 0)
>  
> +/* Mailbox */
> +#define VSPEC1_MBOX_DATA	0x5
> +#define VSPEC1_MBOX_ADDRLO	0x6
> +#define VSPEC1_MBOX_CMD		0x7
> +#define VSPEC1_MBOX_CMD_ADDRHI	GENMASK(7, 0)
> +#define VSPEC1_MBOX_CMD_RD	(0 << 8)
> +#define VSPEC1_MBOX_CMD_READY	BIT(15)
> +
>  /* WoL */
>  #define VPSPEC2_WOL_CTL		0x0E06
>  #define VPSPEC2_WOL_AD01	0x0E08
> @@ -88,7 +97,15 @@
>  #define VPSPEC2_WOL_AD45	0x0E0A
>  #define WOL_EN			BIT(0)
>  
> +/* Internal registers, access via mbox */
> +#define REG_GPIO0_OUT		0xd3ce00
> +
>  struct gpy_priv {
> +	struct phy_device *phydev;
> +
> +	/* serialize mailbox acesses */
> +	struct mutex mbox_lock;
> +

>  static int gpy_probe(struct phy_device *phydev)
>  {
>  	struct device *dev = &phydev->mdio.dev;
> @@ -228,7 +286,9 @@ static int gpy_probe(struct phy_device *phydev)
>  	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
>  	if (!priv)
>  		return -ENOMEM;
> +	priv->phydev = phydev;

I don't think you use this anywhere. Maybe in one of the following
patches?

	Andrew
  
Michael Walle Dec. 2, 2022, 10:53 p.m. UTC | #2
Am 2022-12-02 19:23, schrieb Andrew Lunn:
> On Fri, Dec 02, 2022 at 04:12:01PM +0100, Michael Walle wrote:
>> At least the GPY215B and GPY215C has a bug where it is still driving 
>> the
>> interrupt line (MDINT) even after the interrupt status register is 
>> read
>> and its bits are cleared. This will cause an interrupt storm.
>> 
>> Although the MDINT is multiplexed with a GPIO pin and theoretically we
>> could switch the pinmux to GPIO input mode, this isn't possible 
>> because
>> the access to this register will stall exactly as long as the 
>> interrupt
>> line is asserted. We exploit this very fact and just read a random
>> internal register in our interrupt handler. This way, it will be 
>> delayed
>> until the external interrupt line is released and an interrupt storm 
>> is
>> avoided.
>> 
>> The internal register access via the mailbox was deduced by looking at
>> the downstream PHY API because the datasheet doesn't mention any of
>> this.
>> 
>> Signed-off-by: Michael Walle <michael@walle.cc>
>> ---
>>  drivers/net/phy/mxl-gpy.c | 83 
>> +++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 83 insertions(+)
>> 
>> diff --git a/drivers/net/phy/mxl-gpy.c b/drivers/net/phy/mxl-gpy.c
>> index 0ff7ef076072..20e610dda891 100644
>> --- a/drivers/net/phy/mxl-gpy.c
>> +++ b/drivers/net/phy/mxl-gpy.c
>> @@ -9,6 +9,7 @@
>>  #include <linux/module.h>
>>  #include <linux/bitfield.h>
>>  #include <linux/hwmon.h>
>> +#include <linux/mutex.h>
>>  #include <linux/phy.h>
>>  #include <linux/polynomial.h>
>>  #include <linux/netdevice.h>
>> @@ -81,6 +82,14 @@
>>  #define VSPEC1_TEMP_STA	0x0E
>>  #define VSPEC1_TEMP_STA_DATA	GENMASK(9, 0)
>> 
>> +/* Mailbox */
>> +#define VSPEC1_MBOX_DATA	0x5
>> +#define VSPEC1_MBOX_ADDRLO	0x6
>> +#define VSPEC1_MBOX_CMD		0x7
>> +#define VSPEC1_MBOX_CMD_ADDRHI	GENMASK(7, 0)
>> +#define VSPEC1_MBOX_CMD_RD	(0 << 8)
>> +#define VSPEC1_MBOX_CMD_READY	BIT(15)
>> +
>>  /* WoL */
>>  #define VPSPEC2_WOL_CTL		0x0E06
>>  #define VPSPEC2_WOL_AD01	0x0E08
>> @@ -88,7 +97,15 @@
>>  #define VPSPEC2_WOL_AD45	0x0E0A
>>  #define WOL_EN			BIT(0)
>> 
>> +/* Internal registers, access via mbox */
>> +#define REG_GPIO0_OUT		0xd3ce00
>> +
>>  struct gpy_priv {
>> +	struct phy_device *phydev;
>> +
>> +	/* serialize mailbox acesses */
>> +	struct mutex mbox_lock;
>> +
> 
>>  static int gpy_probe(struct phy_device *phydev)
>>  {
>>  	struct device *dev = &phydev->mdio.dev;
>> @@ -228,7 +286,9 @@ static int gpy_probe(struct phy_device *phydev)
>>  	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
>>  	if (!priv)
>>  		return -ENOMEM;
>> +	priv->phydev = phydev;
> 
> I don't think you use this anywhere. Maybe in one of the following
> patches?

Arg. Yes, it's an leftover from when I was using a workqueue to
reenable the interrupts again.

Any opinion whether this patch should be net or net-next?

-michael
  

Patch

diff --git a/drivers/net/phy/mxl-gpy.c b/drivers/net/phy/mxl-gpy.c
index 0ff7ef076072..20e610dda891 100644
--- a/drivers/net/phy/mxl-gpy.c
+++ b/drivers/net/phy/mxl-gpy.c
@@ -9,6 +9,7 @@ 
 #include <linux/module.h>
 #include <linux/bitfield.h>
 #include <linux/hwmon.h>
+#include <linux/mutex.h>
 #include <linux/phy.h>
 #include <linux/polynomial.h>
 #include <linux/netdevice.h>
@@ -81,6 +82,14 @@ 
 #define VSPEC1_TEMP_STA	0x0E
 #define VSPEC1_TEMP_STA_DATA	GENMASK(9, 0)
 
+/* Mailbox */
+#define VSPEC1_MBOX_DATA	0x5
+#define VSPEC1_MBOX_ADDRLO	0x6
+#define VSPEC1_MBOX_CMD		0x7
+#define VSPEC1_MBOX_CMD_ADDRHI	GENMASK(7, 0)
+#define VSPEC1_MBOX_CMD_RD	(0 << 8)
+#define VSPEC1_MBOX_CMD_READY	BIT(15)
+
 /* WoL */
 #define VPSPEC2_WOL_CTL		0x0E06
 #define VPSPEC2_WOL_AD01	0x0E08
@@ -88,7 +97,15 @@ 
 #define VPSPEC2_WOL_AD45	0x0E0A
 #define WOL_EN			BIT(0)
 
+/* Internal registers, access via mbox */
+#define REG_GPIO0_OUT		0xd3ce00
+
 struct gpy_priv {
+	struct phy_device *phydev;
+
+	/* serialize mailbox acesses */
+	struct mutex mbox_lock;
+
 	u8 fw_major;
 	u8 fw_minor;
 };
@@ -198,6 +215,40 @@  static int gpy_hwmon_register(struct phy_device *phydev)
 }
 #endif
 
+static int gpy_mbox_read(struct phy_device *phydev, u32 addr)
+{
+	struct gpy_priv *priv = phydev->priv;
+	int val, ret;
+	u16 cmd;
+
+	mutex_lock(&priv->mbox_lock);
+
+	ret = phy_write_mmd(phydev, MDIO_MMD_VEND1, VSPEC1_MBOX_ADDRLO,
+			    addr);
+	if (ret)
+		goto out;
+
+	cmd = VSPEC1_MBOX_CMD_RD;
+	cmd |= FIELD_PREP(VSPEC1_MBOX_CMD_ADDRHI, addr >> 16);
+
+	ret = phy_write_mmd(phydev, MDIO_MMD_VEND1, VSPEC1_MBOX_CMD, cmd);
+	if (ret)
+		goto out;
+
+	ret = phy_read_mmd_poll_timeout(phydev, MDIO_MMD_VEND1,
+					VSPEC1_MBOX_CMD, val,
+					(val & VSPEC1_MBOX_CMD_READY),
+					500, 10000, false);
+	if (ret)
+		goto out;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND1, VSPEC1_MBOX_DATA);
+
+out:
+	mutex_unlock(&priv->mbox_lock);
+	return ret;
+}
+
 static int gpy_config_init(struct phy_device *phydev)
 {
 	int ret;
@@ -212,6 +263,13 @@  static int gpy_config_init(struct phy_device *phydev)
 	return ret < 0 ? ret : 0;
 }
 
+static bool gpy_has_broken_mdint(struct phy_device *phydev)
+{
+	/* At least these PHYs are known to have broken interrupt handling */
+	return phydev->drv->phy_id == PHY_ID_GPY215B ||
+	       phydev->drv->phy_id == PHY_ID_GPY215C;
+}
+
 static int gpy_probe(struct phy_device *phydev)
 {
 	struct device *dev = &phydev->mdio.dev;
@@ -228,7 +286,9 @@  static int gpy_probe(struct phy_device *phydev)
 	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
 	if (!priv)
 		return -ENOMEM;
+	priv->phydev = phydev;
 	phydev->priv = priv;
+	mutex_init(&priv->mbox_lock);
 
 	fw_version = phy_read(phydev, PHY_FWV);
 	if (fw_version < 0)
@@ -574,6 +634,29 @@  static irqreturn_t gpy_handle_interrupt(struct phy_device *phydev)
 	if (!(reg & PHY_IMASK_MASK))
 		return IRQ_NONE;
 
+	/* The PHY might leave the interrupt line asserted even after PHY_ISTAT
+	 * is read. To avoid interrupt storms, delay the interrupt handling as
+	 * long as the PHY drives the interrupt line. An internal bus read will
+	 * stall as long as the interrupt line is asserted, thus just read a
+	 * random register here.
+	 * Because we cannot access the internal bus at all while the interrupt
+	 * is driven by the PHY, there is no way to make the interrupt line
+	 * unstuck (e.g. by changing the pinmux to GPIO input) during that time
+	 * frame. Therefore, polling is the best we can do and won't do any more
+	 * harm.
+	 * It was observed that this bug happens on link state and link speed
+	 * changes on a GPY215B and GYP215C independent of the firmware version
+	 * (which doesn't mean that this list is exhaustive).
+	 */
+	if (gpy_has_broken_mdint(phydev) &&
+	    (reg & (PHY_IMASK_LSTC | PHY_IMASK_LSPC))) {
+		reg = gpy_mbox_read(phydev, REG_GPIO0_OUT);
+		if (reg < 0) {
+			phy_error(phydev);
+			return IRQ_NONE;
+		}
+	}
+
 	phy_trigger_machine(phydev);
 
 	return IRQ_HANDLED;