[v8,net-next,10/12] net: dsa: mv88e6xxx: mac-auth/MAB implementation

Message ID 20221018165619.134535-11-netdev@kapio-technology.com
State New
Headers
Series Extend locked port feature with FDB locked flag (MAC-Auth/MAB) |

Commit Message

Hans Schultz Oct. 18, 2022, 4:56 p.m. UTC
  This implementation for the Marvell mv88e6xxx chip series,
is based on handling ATU miss violations occurring when packets
ingress on a port that is locked. The mac address triggering
the ATU miss violation will be added to the ATU with a zero-DPV,
and is then communicated through switchdev to the bridge module,
which adds a fdb entry with the fdb locked flag set. The entry
is kept according to the bridges ageing time, thus simulating a
dynamic entry.

Additionally the driver will set the sticky and masked flags, as
the driver does not support roaming and forwarding from any port
to a locked entry.

As this is essentially a form of CPU based learning, the amount
of locked entries will be limited by a hardcoded value for now,
so as to prevent DOS attacks.

Signed-off-by: Hans J. Schultz <netdev@kapio-technology.com>
---
 drivers/net/dsa/mv88e6xxx/Makefile      |   1 +
 drivers/net/dsa/mv88e6xxx/chip.c        |  76 +++++--
 drivers/net/dsa/mv88e6xxx/chip.h        |  19 ++
 drivers/net/dsa/mv88e6xxx/global1.h     |   1 +
 drivers/net/dsa/mv88e6xxx/global1_atu.c |  12 +-
 drivers/net/dsa/mv88e6xxx/port.c        |  15 +-
 drivers/net/dsa/mv88e6xxx/port.h        |   6 +
 drivers/net/dsa/mv88e6xxx/switchdev.c   | 284 ++++++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/switchdev.h   |  37 +++
 9 files changed, 429 insertions(+), 22 deletions(-)
 create mode 100644 drivers/net/dsa/mv88e6xxx/switchdev.c
 create mode 100644 drivers/net/dsa/mv88e6xxx/switchdev.h
  

Comments

Vladimir Oltean Oct. 20, 2022, 1:25 p.m. UTC | #1
On Tue, Oct 18, 2022 at 06:56:17PM +0200, Hans J. Schultz wrote:
> This implementation for the Marvell mv88e6xxx chip series,
> is based on handling ATU miss violations occurring when packets
> ingress on a port that is locked. The mac address triggering
> the ATU miss violation will be added to the ATU with a zero-DPV,
> and is then communicated through switchdev to the bridge module,
> which adds a fdb entry with the fdb locked flag set. The entry
> is kept according to the bridges ageing time, thus simulating a
> dynamic entry.
> 
> Additionally the driver will set the sticky and masked flags, as
> the driver does not support roaming and forwarding from any port
> to a locked entry.
> 
> As this is essentially a form of CPU based learning, the amount
> of locked entries will be limited by a hardcoded value for now,
> so as to prevent DOS attacks.
> 
> Signed-off-by: Hans J. Schultz <netdev@kapio-technology.com>
> ---
>  drivers/net/dsa/mv88e6xxx/Makefile      |   1 +
>  drivers/net/dsa/mv88e6xxx/chip.c        |  76 +++++--
>  drivers/net/dsa/mv88e6xxx/chip.h        |  19 ++
>  drivers/net/dsa/mv88e6xxx/global1.h     |   1 +
>  drivers/net/dsa/mv88e6xxx/global1_atu.c |  12 +-
>  drivers/net/dsa/mv88e6xxx/port.c        |  15 +-
>  drivers/net/dsa/mv88e6xxx/port.h        |   6 +
>  drivers/net/dsa/mv88e6xxx/switchdev.c   | 284 ++++++++++++++++++++++++
>  drivers/net/dsa/mv88e6xxx/switchdev.h   |  37 +++
>  9 files changed, 429 insertions(+), 22 deletions(-)
>  create mode 100644 drivers/net/dsa/mv88e6xxx/switchdev.c
>  create mode 100644 drivers/net/dsa/mv88e6xxx/switchdev.h
> 
> diff --git a/drivers/net/dsa/mv88e6xxx/Makefile b/drivers/net/dsa/mv88e6xxx/Makefile
> index c8eca2b6f959..be903a983780 100644
> --- a/drivers/net/dsa/mv88e6xxx/Makefile
> +++ b/drivers/net/dsa/mv88e6xxx/Makefile
> @@ -15,3 +15,4 @@ mv88e6xxx-objs += port_hidden.o
>  mv88e6xxx-$(CONFIG_NET_DSA_MV88E6XXX_PTP) += ptp.o
>  mv88e6xxx-objs += serdes.o
>  mv88e6xxx-objs += smi.o
> +mv88e6xxx-objs += switchdev.o
> diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
> index 352121cce77e..71843fe87f77 100644
> --- a/drivers/net/dsa/mv88e6xxx/chip.c
> +++ b/drivers/net/dsa/mv88e6xxx/chip.c
> @@ -42,6 +42,7 @@
>  #include "ptp.h"
>  #include "serdes.h"
>  #include "smi.h"
> +#include "switchdev.h"
>  
>  static void assert_reg_lock(struct mv88e6xxx_chip *chip)
>  {
> @@ -924,6 +925,13 @@ static void mv88e6xxx_mac_link_down(struct dsa_switch *ds, int port,
>  	if (err)
>  		dev_err(chip->dev,
>  			"p%d: failed to force MAC link down\n", port);
> +	else
> +		if (mv88e6xxx_port_is_locked(chip, port)) {
> +			err = mv88e6xxx_atu_locked_entry_flush(ds, port);
> +			if (err)
> +				dev_err(chip->dev,
> +					"p%d: failed to clear locked entries\n", port);
> +		}

This would not have been needed if dsa_port_set_state() would have
called dsa_port_fast_age().

Currently it only does that if dp->learning is true. From previous
conversations I get the idea that with MAB, port learning will be false.
But I don't understand why; isn't MAB CPU-assisted learning? I'm looking
at the ocelot hardware support for this and I think it could be
implemented using a similar mechanism, but I certainly don't want to add
more workarounds such as this in other drivers.

Are there any other ways to implement MAB other than through CPU
assisted learning?

We could add one more dp->mab flag which tracks the "mab" brport flag,
and extend dsa_port_set_state() to also call dsa_port_fast_age() in that
case, but I want to make sure there isn't something extremely obvious
I'm missing about the "learning" flag.

>  }
>  
>  static void mv88e6xxx_mac_link_up(struct dsa_switch *ds, int port,
> @@ -1690,6 +1698,13 @@ static void mv88e6xxx_port_fast_age(struct dsa_switch *ds, int port)
>  	struct mv88e6xxx_chip *chip = ds->priv;
>  	int err;
>  
> +	if (mv88e6xxx_port_is_locked(chip, port)) {
> +		err = mv88e6xxx_atu_locked_entry_flush(ds, port);
> +		if (err)
> +			dev_err(chip->ds->dev, "p%d: failed to clear locked entries: %d\n",
> +				port, err);
> +	}
> +
>  	mv88e6xxx_reg_lock(chip);
>  	err = mv88e6xxx_port_fast_age_fid(chip, port, 0);
>  	mv88e6xxx_reg_unlock(chip);
> @@ -1726,11 +1741,11 @@ static int mv88e6xxx_vtu_get(struct mv88e6xxx_chip *chip, u16 vid,
>  	return err;
>  }
>  
> -static int mv88e6xxx_vtu_walk(struct mv88e6xxx_chip *chip,
> -			      int (*cb)(struct mv88e6xxx_chip *chip,
> -					const struct mv88e6xxx_vtu_entry *entry,
> -					void *priv),
> -			      void *priv)
> +int mv88e6xxx_vtu_walk(struct mv88e6xxx_chip *chip,
> +		       int (*cb)(struct mv88e6xxx_chip *chip,
> +				 const struct mv88e6xxx_vtu_entry *entry,
> +				 void *priv),
> +		       void *priv)
>  {
>  	struct mv88e6xxx_vtu_entry entry = {
>  		.vid = mv88e6xxx_max_vid(chip),
> @@ -2731,6 +2746,9 @@ static int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port,
>  	if (fdb_flags)
>  		return 0;
>  
> +	if (mv88e6xxx_port_is_locked(chip, port))
> +		mv88e6xxx_atu_locked_entry_find_purge(ds, port, addr, vid);
> +
>  	mv88e6xxx_reg_lock(chip);
>  	err = mv88e6xxx_port_db_load_purge(chip, port, addr, vid,
>  					   MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC);
> @@ -2744,16 +2762,21 @@ static int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port,
>  				  u16 fdb_flags, struct dsa_db db)
>  {
>  	struct mv88e6xxx_chip *chip = ds->priv;
> -	int err;
> +	bool locked_found = false;
> +	int err = 0;
>  
>  	/* Ignore entries with flags set */
>  	if (fdb_flags)
>  		return 0;
>  
> -	mv88e6xxx_reg_lock(chip);
> -	err = mv88e6xxx_port_db_load_purge(chip, port, addr, vid, 0);
> -	mv88e6xxx_reg_unlock(chip);
> +	if (mv88e6xxx_port_is_locked(chip, port))
> +		locked_found = mv88e6xxx_atu_locked_entry_find_purge(ds, port, addr, vid);
>  
> +	if (!locked_found) {
> +		mv88e6xxx_reg_lock(chip);
> +		err = mv88e6xxx_port_db_load_purge(chip, port, addr, vid, 0);
> +		mv88e6xxx_reg_unlock(chip);
> +	}
>  	return err;
>  }
>  
> @@ -3849,11 +3872,18 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
>  
>  static int mv88e6xxx_port_setup(struct dsa_switch *ds, int port)
>  {
> -	return mv88e6xxx_setup_devlink_regions_port(ds, port);
> +	int err;
> +
> +	err = mv88e6xxx_setup_devlink_regions_port(ds, port);
> +	if (!err)
> +		return mv88e6xxx_init_violation_handler(ds, port);
> +
> +	return err;
>  }
>  
>  static void mv88e6xxx_port_teardown(struct dsa_switch *ds, int port)
>  {
> +	mv88e6xxx_teardown_violation_handler(ds, port);
>  	mv88e6xxx_teardown_devlink_regions_port(ds, port);
>  }
>  
> @@ -6528,7 +6558,7 @@ static int mv88e6xxx_port_pre_bridge_flags(struct dsa_switch *ds, int port,
>  	const struct mv88e6xxx_ops *ops;
>  
>  	if (flags.mask & ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD |
> -			   BR_BCAST_FLOOD | BR_PORT_LOCKED))
> +			   BR_BCAST_FLOOD | BR_PORT_LOCKED | BR_PORT_MAB))
>  		return -EINVAL;
>  
>  	ops = chip->info->ops;
> @@ -6549,13 +6579,13 @@ static int mv88e6xxx_port_bridge_flags(struct dsa_switch *ds, int port,
>  	struct mv88e6xxx_chip *chip = ds->priv;
>  	int err = -EOPNOTSUPP;
>  
> -	mv88e6xxx_reg_lock(chip);
> -

Separate commit which changes the locking?

>  	if (flags.mask & BR_LEARNING) {
>  		bool learning = !!(flags.val & BR_LEARNING);
>  		u16 pav = learning ? (1 << port) : 0;
>  
> +		mv88e6xxx_reg_lock(chip);
>  		err = mv88e6xxx_port_set_assoc_vector(chip, port, pav);
> +		mv88e6xxx_reg_unlock(chip);
>  		if (err)
>  			goto out;
>  	}
> @@ -6563,8 +6593,10 @@ static int mv88e6xxx_port_bridge_flags(struct dsa_switch *ds, int port,
>  	if (flags.mask & BR_FLOOD) {
>  		bool unicast = !!(flags.val & BR_FLOOD);
>  
> +		mv88e6xxx_reg_lock(chip);
>  		err = chip->info->ops->port_set_ucast_flood(chip, port,
>  							    unicast);
> +		mv88e6xxx_reg_unlock(chip);
>  		if (err)
>  			goto out;
>  	}
> @@ -6572,8 +6604,10 @@ static int mv88e6xxx_port_bridge_flags(struct dsa_switch *ds, int port,
>  	if (flags.mask & BR_MCAST_FLOOD) {
>  		bool multicast = !!(flags.val & BR_MCAST_FLOOD);
>  
> +		mv88e6xxx_reg_lock(chip);
>  		err = chip->info->ops->port_set_mcast_flood(chip, port,
>  							    multicast);
> +		mv88e6xxx_reg_unlock(chip);
>  		if (err)
>  			goto out;
>  	}
> @@ -6581,20 +6615,34 @@ static int mv88e6xxx_port_bridge_flags(struct dsa_switch *ds, int port,
>  	if (flags.mask & BR_BCAST_FLOOD) {
>  		bool broadcast = !!(flags.val & BR_BCAST_FLOOD);
>  
> +		mv88e6xxx_reg_lock(chip);
>  		err = mv88e6xxx_port_broadcast_sync(chip, port, broadcast);
> +		mv88e6xxx_reg_unlock(chip);
>  		if (err)
>  			goto out;
>  	}
>  
> +	if (flags.mask & BR_PORT_MAB) {
> +		chip->ports[port].mab = !!(flags.val & BR_PORT_MAB);
> +
> +		if (!chip->ports[port].mab)
> +			err = mv88e6xxx_atu_locked_entry_flush(ds, port);
> +		else
> +			err = 0;

Again, dsa_port_fast_age() is also called when dp->learning is turned
off in dsa_port_bridge_flags(). I don't want to see the mv88e6xxx driver
doing this manually.

> +	}
> +
>  	if (flags.mask & BR_PORT_LOCKED) {
>  		bool locked = !!(flags.val & BR_PORT_LOCKED);
>  
> +		mv88e6xxx_reg_lock(chip);
>  		err = mv88e6xxx_port_set_lock(chip, port, locked);
> +		mv88e6xxx_reg_unlock(chip);
>  		if (err)
>  			goto out;
> +
> +		chip->ports[port].locked = locked;
>  	}
>  out:
> -	mv88e6xxx_reg_unlock(chip);
>  
>  	return err;
>  }
> diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
> index e693154cf803..180fbcf596fa 100644
> --- a/drivers/net/dsa/mv88e6xxx/chip.h
> +++ b/drivers/net/dsa/mv88e6xxx/chip.h
> @@ -280,6 +280,16 @@ struct mv88e6xxx_port {
>  	unsigned int serdes_irq;
>  	char serdes_irq_name[64];
>  	struct devlink_region *region;
> +
> +	/* Locked port and MacAuth control flags */
> +	bool locked;
> +	bool mab;
> +
> +	/* List and maintenance of ATU locked entries */
> +	struct mutex ale_list_lock;
> +	struct list_head ale_list;
> +	struct delayed_work ale_work;
> +	int ale_cnt;
>  };
>  
>  enum mv88e6xxx_region_id {
> @@ -399,6 +409,9 @@ struct mv88e6xxx_chip {
>  	int egress_dest_port;
>  	int ingress_dest_port;
>  
> +	/* Keep the register written age time for easy access */
> +	u8 age_time;
> +
>  	/* Per-port timestamping resources. */
>  	struct mv88e6xxx_port_hwtstamp port_hwtstamp[DSA_MAX_PORTS];
>  
> @@ -802,6 +815,12 @@ static inline void mv88e6xxx_reg_unlock(struct mv88e6xxx_chip *chip)
>  	mutex_unlock(&chip->reg_lock);
>  }
>  
> +int mv88e6xxx_vtu_walk(struct mv88e6xxx_chip *chip,
> +		       int (*cb)(struct mv88e6xxx_chip *chip,
> +				 const struct mv88e6xxx_vtu_entry *entry,
> +				 void *priv),
> +		       void *priv);
> +
>  int mv88e6xxx_fid_map(struct mv88e6xxx_chip *chip, unsigned long *bitmap);
>  
>  #endif /* _MV88E6XXX_CHIP_H */
> diff --git a/drivers/net/dsa/mv88e6xxx/global1.h b/drivers/net/dsa/mv88e6xxx/global1.h
> index 65958b2a0d3a..503fbf216670 100644
> --- a/drivers/net/dsa/mv88e6xxx/global1.h
> +++ b/drivers/net/dsa/mv88e6xxx/global1.h
> @@ -136,6 +136,7 @@
>  #define MV88E6XXX_G1_ATU_DATA_TRUNK				0x8000
>  #define MV88E6XXX_G1_ATU_DATA_TRUNK_ID_MASK			0x00f0
>  #define MV88E6XXX_G1_ATU_DATA_PORT_VECTOR_MASK			0x3ff0
> +#define MV88E6XXX_G1_ATU_DATA_PORT_VECTOR_NO_EGRESS		0x0000
>  #define MV88E6XXX_G1_ATU_DATA_STATE_MASK			0x000f
>  #define MV88E6XXX_G1_ATU_DATA_STATE_UC_UNUSED			0x0000
>  #define MV88E6XXX_G1_ATU_DATA_STATE_UC_AGE_1_OLDEST		0x0001
> diff --git a/drivers/net/dsa/mv88e6xxx/global1_atu.c b/drivers/net/dsa/mv88e6xxx/global1_atu.c
> index d9dfa1159cde..67907cd00b87 100644
> --- a/drivers/net/dsa/mv88e6xxx/global1_atu.c
> +++ b/drivers/net/dsa/mv88e6xxx/global1_atu.c
> @@ -12,6 +12,8 @@
>  
>  #include "chip.h"
>  #include "global1.h"
> +#include "port.h"
> +#include "switchdev.h"
>  
>  /* Offset 0x01: ATU FID Register */
>  
> @@ -54,6 +56,7 @@ int mv88e6xxx_g1_atu_set_age_time(struct mv88e6xxx_chip *chip,
>  
>  	/* Round to nearest multiple of coeff */
>  	age_time = (msecs + coeff / 2) / coeff;
> +	chip->age_time = age_time;
>  
>  	err = mv88e6xxx_g1_read(chip, MV88E6XXX_G1_ATU_CTL, &val);
>  	if (err)
> @@ -426,6 +429,8 @@ static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id)
>  	if (err)
>  		goto out;
>  
> +	mv88e6xxx_reg_unlock(chip);
> +
>  	spid = entry.state;
>  
>  	if (val & MV88E6XXX_G1_ATU_OP_AGE_OUT_VIOLATION) {
> @@ -446,6 +451,12 @@ static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id)
>  				    "ATU miss violation for %pM portvec %x spid %d\n",
>  				    entry.mac, entry.portvec, spid);
>  		chip->ports[spid].atu_miss_violation++;
> +
> +		if (fid && chip->ports[spid].mab)
> +			err = mv88e6xxx_handle_violation(chip, spid, &entry, fid,
> +							 MV88E6XXX_G1_ATU_OP_MISS_VIOLATION);
> +		if (err)
> +			goto out;
>  	}
>  
>  	if (val & MV88E6XXX_G1_ATU_OP_FULL_VIOLATION) {
> @@ -454,7 +465,6 @@ static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id)
>  				    entry.mac, entry.portvec, spid);
>  		chip->ports[spid].atu_full_violation++;
>  	}
> -	mv88e6xxx_reg_unlock(chip);
>  
>  	return IRQ_HANDLED;
>  
> diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
> index 5c4195c635b0..67e457ce67ae 100644
> --- a/drivers/net/dsa/mv88e6xxx/port.c
> +++ b/drivers/net/dsa/mv88e6xxx/port.c
> @@ -14,9 +14,11 @@
>  #include <linux/phylink.h>
>  
>  #include "chip.h"
> +#include "global1.h"
>  #include "global2.h"
>  #include "port.h"
>  #include "serdes.h"
> +#include "switchdev.h"
>  
>  int mv88e6xxx_port_read(struct mv88e6xxx_chip *chip, int port, int reg,
>  			u16 *val)
> @@ -1240,13 +1242,12 @@ int mv88e6xxx_port_set_lock(struct mv88e6xxx_chip *chip, int port,
>  	if (err)
>  		return err;
>  
> -	err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_ASSOC_VECTOR, &reg);
> -	if (err)
> -		return err;
> -
> -	reg &= ~MV88E6XXX_PORT_ASSOC_VECTOR_LOCKED_PORT;
> -	if (locked)
> -		reg |= MV88E6XXX_PORT_ASSOC_VECTOR_LOCKED_PORT;
> +	reg = 0;
> +	if (locked) {
> +		reg = (1 << port);
> +		reg |= MV88E6XXX_PORT_ASSOC_VECTOR_IGNORE_WRONG |
> +			MV88E6XXX_PORT_ASSOC_VECTOR_LOCKED_PORT;
> +	}
>  
>  	return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_ASSOC_VECTOR, reg);
>  }
> diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h
> index cb04243f37c1..9475bc6e95a2 100644
> --- a/drivers/net/dsa/mv88e6xxx/port.h
> +++ b/drivers/net/dsa/mv88e6xxx/port.h
> @@ -231,6 +231,7 @@
>  #define MV88E6XXX_PORT_ASSOC_VECTOR_LOCKED_PORT		0x2000
>  #define MV88E6XXX_PORT_ASSOC_VECTOR_IGNORE_WRONG	0x1000
>  #define MV88E6XXX_PORT_ASSOC_VECTOR_REFRESH_LOCKED	0x0800
> +#define MV88E6XXX_PORT_ASSOC_VECTOR_PAV_MASK		0x07ff
>  
>  /* Offset 0x0C: Port ATU Control */
>  #define MV88E6XXX_PORT_ATU_CTL		0x0c
> @@ -375,6 +376,11 @@ int mv88e6xxx_port_set_pvid(struct mv88e6xxx_chip *chip, int port, u16 pvid);
>  int mv88e6xxx_port_set_lock(struct mv88e6xxx_chip *chip, int port,
>  			    bool locked);
>  
> +static inline bool mv88e6xxx_port_is_locked(struct mv88e6xxx_chip *chip, int port)
> +{
> +	return chip->ports[port].locked;
> +}
> +
>  int mv88e6xxx_port_set_8021q_mode(struct mv88e6xxx_chip *chip, int port,
>  				  u16 mode);
>  int mv88e6095_port_tag_remap(struct mv88e6xxx_chip *chip, int port);
> diff --git a/drivers/net/dsa/mv88e6xxx/switchdev.c b/drivers/net/dsa/mv88e6xxx/switchdev.c
> new file mode 100644
> index 000000000000..cd332a10fad5
> --- /dev/null
> +++ b/drivers/net/dsa/mv88e6xxx/switchdev.c
> @@ -0,0 +1,284 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * switchdev.c
> + *
> + *	Authors:
> + *	Hans J. Schultz		<hans.schultz@westermo.com>
> + *
> + */
> +
> +#include <net/switchdev.h>
> +#include <linux/list.h>
> +#include "chip.h"
> +#include "global1.h"
> +#include "switchdev.h"
> +
> +static void mv88e6xxx_atu_locked_entry_purge(struct mv88e6xxx_atu_locked_entry *ale,
> +					     bool notify, bool take_nl_lock)
> +{
> +	struct switchdev_notifier_fdb_info info = {
> +		.addr = ale->mac,
> +		.vid = ale->vid,
> +		.locked = true,
> +		.offloaded = true,
> +	};
> +	struct mv88e6xxx_atu_entry entry;
> +	struct net_device *brport;
> +	struct dsa_port *dp;
> +
> +	entry.portvec = MV88E6XXX_G1_ATU_DATA_PORT_VECTOR_NO_EGRESS;
> +	entry.state = MV88E6XXX_G1_ATU_DATA_STATE_UC_UNUSED;
> +	entry.trunk = false;
> +	ether_addr_copy(entry.mac, ale->mac);
> +
> +	mv88e6xxx_reg_lock(ale->chip);
> +	mv88e6xxx_g1_atu_loadpurge(ale->chip, ale->fid, &entry);
> +	mv88e6xxx_reg_unlock(ale->chip);
> +
> +	dp = dsa_to_port(ale->chip->ds, ale->port);
> +
> +	if (notify) {
> +		if (take_nl_lock)
> +			rtnl_lock();

Is this tested with lockdep? I see the function is called with other
locks held (p->ale_list_lock). Isn't there a lock inversion anywhere?
Locks always need to be taken in the same order, and rtnl_lock is a
pretty high level lock, not exactly the kind you could take just like
that.

> +		brport = dsa_port_to_bridge_port(dp);
> +
> +		if (brport) {
> +			call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_BRIDGE,
> +						 brport, &info.info, NULL);
> +		} else {
> +			dev_err(ale->chip->dev, "No bridge port for dsa port belonging to port %d\n",
> +				ale->port);
> +		}
> +		if (take_nl_lock)
> +			rtnl_unlock();
> +	}
> +
> +	list_del(&ale->list);
> +	kfree(ale);
> +}
  
Hans Schultz Oct. 20, 2022, 7:59 p.m. UTC | #2
On 2022-10-20 15:25, Vladimir Oltean wrote:
>> +
>> +#include <net/switchdev.h>
>> +#include <linux/list.h>
>> +#include "chip.h"
>> +#include "global1.h"
>> +#include "switchdev.h"
>> +
>> +static void mv88e6xxx_atu_locked_entry_purge(struct 
>> mv88e6xxx_atu_locked_entry *ale,
>> +					     bool notify, bool take_nl_lock)
>> +{
>> +	struct switchdev_notifier_fdb_info info = {
>> +		.addr = ale->mac,
>> +		.vid = ale->vid,
>> +		.locked = true,
>> +		.offloaded = true,
>> +	};
>> +	struct mv88e6xxx_atu_entry entry;
>> +	struct net_device *brport;
>> +	struct dsa_port *dp;
>> +
>> +	entry.portvec = MV88E6XXX_G1_ATU_DATA_PORT_VECTOR_NO_EGRESS;
>> +	entry.state = MV88E6XXX_G1_ATU_DATA_STATE_UC_UNUSED;
>> +	entry.trunk = false;
>> +	ether_addr_copy(entry.mac, ale->mac);
>> +
>> +	mv88e6xxx_reg_lock(ale->chip);
>> +	mv88e6xxx_g1_atu_loadpurge(ale->chip, ale->fid, &entry);
>> +	mv88e6xxx_reg_unlock(ale->chip);
>> +
>> +	dp = dsa_to_port(ale->chip->ds, ale->port);
>> +
>> +	if (notify) {
>> +		if (take_nl_lock)
>> +			rtnl_lock();
> 
> Is this tested with lockdep? I see the function is called with other
> locks held (p->ale_list_lock). Isn't there a lock inversion anywhere?
> Locks always need to be taken in the same order, and rtnl_lock is a
> pretty high level lock, not exactly the kind you could take just like
> that.
> 

I am very sure that there is no lock inversions or double locks taken.
It is only in the clean-up from time-out of driver locked entries that
the nl lock needs to be taken (as the code reveals). In all other
instances, the nl lock is already taken as far as this implementation 
goes.
  
Hans Schultz Oct. 20, 2022, 8:20 p.m. UTC | #3
On 2022-10-20 15:25, Vladimir Oltean wrote:
>> diff --git a/drivers/net/dsa/mv88e6xxx/chip.c 
>> b/drivers/net/dsa/mv88e6xxx/chip.c
>> index 352121cce77e..71843fe87f77 100644
>> --- a/drivers/net/dsa/mv88e6xxx/chip.c
>> +++ b/drivers/net/dsa/mv88e6xxx/chip.c
>> @@ -42,6 +42,7 @@
>>  #include "ptp.h"
>>  #include "serdes.h"
>>  #include "smi.h"
>> +#include "switchdev.h"
>> 
>>  static void assert_reg_lock(struct mv88e6xxx_chip *chip)
>>  {
>> @@ -924,6 +925,13 @@ static void mv88e6xxx_mac_link_down(struct 
>> dsa_switch *ds, int port,
>>  	if (err)
>>  		dev_err(chip->dev,
>>  			"p%d: failed to force MAC link down\n", port);
>> +	else
>> +		if (mv88e6xxx_port_is_locked(chip, port)) {
>> +			err = mv88e6xxx_atu_locked_entry_flush(ds, port);
>> +			if (err)
>> +				dev_err(chip->dev,
>> +					"p%d: failed to clear locked entries\n", port);
>> +		}
> 
> This would not have been needed if dsa_port_set_state() would have
> called dsa_port_fast_age().
> 
> Currently it only does that if dp->learning is true. From previous
> conversations I get the idea that with MAB, port learning will be 
> false.
> But I don't understand why; isn't MAB CPU-assisted learning? I'm 
> looking
> at the ocelot hardware support for this and I think it could be
> implemented using a similar mechanism, but I certainly don't want to 
> add
> more workarounds such as this in other drivers.
> 
> Are there any other ways to implement MAB other than through CPU
> assisted learning?
> 
> We could add one more dp->mab flag which tracks the "mab" brport flag,
> and extend dsa_port_set_state() to also call dsa_port_fast_age() in 
> that
> case, but I want to make sure there isn't something extremely obvious
> I'm missing about the "learning" flag.
> 

In general locked ports block traffic from a host based on if there is a
FDB entry or not. In the non-offloaded case, there is only CPU assisted
learning, so the normal learning mechanism has to be disabled as any
learned entry will open the port for the learned MAC,vlan.
Thus learning is off for locked ports, which of course includes MAB.

So the 'learning' is based on authorizing MAC,vlan addresses, which
is done by userspace daemons, e.g. hostapd or what could be called
mabd.
  
Hans Schultz Oct. 20, 2022, 9:09 p.m. UTC | #4
On 2022-10-20 15:25, Vladimir Oltean wrote:
> 
> This would not have been needed if dsa_port_set_state() would have
> called dsa_port_fast_age().
> 
> Currently it only does that if dp->learning is true. From previous
> conversations I get the idea that with MAB, port learning will be 
> false.
> But I don't understand why; isn't MAB CPU-assisted learning? I'm 
> looking
> at the ocelot hardware support for this and I think it could be
> implemented using a similar mechanism, but I certainly don't want to 
> add
> more workarounds such as this in other drivers.
> 
> Are there any other ways to implement MAB other than through CPU
> assisted learning?
> 
> We could add one more dp->mab flag which tracks the "mab" brport flag,
> and extend dsa_port_set_state() to also call dsa_port_fast_age() in 
> that
> case, but I want to make sure there isn't something extremely obvious
> I'm missing about the "learning" flag.
> 

As learning is off on locked ports, see other response, your dp->mab 
flag
idea might be a way to go, just need confirmation that this is needed.


>> @@ -6572,8 +6604,10 @@ static int mv88e6xxx_port_bridge_flags(struct 
>> dsa_switch *ds, int port,
>>  	if (flags.mask & BR_MCAST_FLOOD) {
>>  		bool multicast = !!(flags.val & BR_MCAST_FLOOD);
>> 
>> +		mv88e6xxx_reg_lock(chip);
>>  		err = chip->info->ops->port_set_mcast_flood(chip, port,
>>  							    multicast);
>> +		mv88e6xxx_reg_unlock(chip);
>>  		if (err)
>>  			goto out;
>>  	}
>> @@ -6581,20 +6615,34 @@ static int mv88e6xxx_port_bridge_flags(struct 
>> dsa_switch *ds, int port,
>>  	if (flags.mask & BR_BCAST_FLOOD) {
>>  		bool broadcast = !!(flags.val & BR_BCAST_FLOOD);
>> 
>> +		mv88e6xxx_reg_lock(chip);
>>  		err = mv88e6xxx_port_broadcast_sync(chip, port, broadcast);
>> +		mv88e6xxx_reg_unlock(chip);
>>  		if (err)
>>  			goto out;
>>  	}
>> 
>> +	if (flags.mask & BR_PORT_MAB) {
>> +		chip->ports[port].mab = !!(flags.val & BR_PORT_MAB);
>> +
>> +		if (!chip->ports[port].mab)
>> +			err = mv88e6xxx_atu_locked_entry_flush(ds, port);
>> +		else
>> +			err = 0;
> 
> Again, dsa_port_fast_age() is also called when dp->learning is turned
> off in dsa_port_bridge_flags(). I don't want to see the mv88e6xxx 
> driver
> doing this manually.
> 

Maybe I am wrong, but I have only been able to trigger fast ageing by 
setting
the STP state of the port to blocked...
  
Vladimir Oltean Oct. 20, 2022, 10:57 p.m. UTC | #5
On Thu, Oct 20, 2022 at 10:20:50PM +0200, netdev@kapio-technology.com wrote:
> In general locked ports block traffic from a host based on if there is a
> FDB entry or not. In the non-offloaded case, there is only CPU assisted
> learning, so the normal learning mechanism has to be disabled as any
> learned entry will open the port for the learned MAC,vlan.

Does it have to be that way? Why can't BR_LEARNING on a BR_PORT_LOCKED
cause the learned FDB entries to have BR_FDB_LOCKED, and everything
would be ok in that case (the port will not be opened for the learned
MAC/VLAN)?

> Thus learning is off for locked ports, which of course includes MAB.
> 
> So the 'learning' is based on authorizing MAC,vlan addresses, which
> is done by userspace daemons, e.g. hostapd or what could be called
> mabd.
  
Vladimir Oltean Oct. 20, 2022, 11 p.m. UTC | #6
On Thu, Oct 20, 2022 at 11:09:40PM +0200, netdev@kapio-technology.com wrote:
> > Again, dsa_port_fast_age() is also called when dp->learning is turned
> > off in dsa_port_bridge_flags(). I don't want to see the mv88e6xxx driver
> > doing this manually.
> 
> Maybe I am wrong, but I have only been able to trigger fast ageing by setting
> the STP state of the port to blocked...

Maybe you didn't try hard enough? On a DSA bridge port that is up and in
the FORWARDING state and with 'learning' on, running "ip link set dev
swp0 type bridge_slave learning off" triggers dsa_port_fast_age().
  
Hans Schultz Oct. 21, 2022, 6:47 a.m. UTC | #7
On 2022-10-21 00:57, Vladimir Oltean wrote:
> On Thu, Oct 20, 2022 at 10:20:50PM +0200, netdev@kapio-technology.com 
> wrote:
>> In general locked ports block traffic from a host based on if there is 
>> a
>> FDB entry or not. In the non-offloaded case, there is only CPU 
>> assisted
>> learning, so the normal learning mechanism has to be disabled as any
>> learned entry will open the port for the learned MAC,vlan.
> 
> Does it have to be that way? Why can't BR_LEARNING on a BR_PORT_LOCKED
> cause the learned FDB entries to have BR_FDB_LOCKED, and everything
> would be ok in that case (the port will not be opened for the learned
> MAC/VLAN)?
> 

I suppose you are right that basing it solely on BR_FDB_LOCKED is 
possible.

The question is then maybe if the common case where you don't need 
learned
entries for the scheme to work, e.g. with EAPOL link local packets, 
requires
less CPU load to work and is cleaner than if using BR_FDB_LOCKED 
entries?

>> Thus learning is off for locked ports, which of course includes MAB.
>> 
>> So the 'learning' is based on authorizing MAC,vlan addresses, which
>> is done by userspace daemons, e.g. hostapd or what could be called
>> mabd.
  
Vladimir Oltean Oct. 21, 2022, 11:22 a.m. UTC | #8
On Fri, Oct 21, 2022 at 08:47:42AM +0200, netdev@kapio-technology.com wrote:
> On 2022-10-21 00:57, Vladimir Oltean wrote:
> > On Thu, Oct 20, 2022 at 10:20:50PM +0200, netdev@kapio-technology.com
> > wrote:
> > > In general locked ports block traffic from a host based on if there
> > > is a
> > > FDB entry or not. In the non-offloaded case, there is only CPU
> > > assisted
> > > learning, so the normal learning mechanism has to be disabled as any
> > > learned entry will open the port for the learned MAC,vlan.
> > 
> > Does it have to be that way? Why can't BR_LEARNING on a BR_PORT_LOCKED
> > cause the learned FDB entries to have BR_FDB_LOCKED, and everything
> > would be ok in that case (the port will not be opened for the learned
> > MAC/VLAN)?
> 
> I suppose you are right that basing it solely on BR_FDB_LOCKED is possible.
> 
> The question is then maybe if the common case where you don't need learned
> entries for the scheme to work, e.g. with EAPOL link local packets, requires
> less CPU load to work and is cleaner than if using BR_FDB_LOCKED entries?

I suppose the real question is what does the bridge currently do with
BR_LEARNING + BR_PORT_LOCKED, and if that is sane and useful in any case?
It isn't a configuration that's rejected, for sure. The configuration
could be rejected via a bug fix patch, then in net-next it could be made
to learn these addresses with the BR_FDB_LOCKED flag.

To your question regarding the common case (no MAB): that can be supported
just fine when BR_LEARNING is off and BR_PORT_LOCKED is on, no?
No BR_FDB_LOCKED entries will be learned.
  
Hans Schultz Oct. 21, 2022, 1:16 p.m. UTC | #9
On 2022-10-21 13:22, Vladimir Oltean wrote:
> On Fri, Oct 21, 2022 at 08:47:42AM +0200, netdev@kapio-technology.com 
> wrote:
>> On 2022-10-21 00:57, Vladimir Oltean wrote:
>> > On Thu, Oct 20, 2022 at 10:20:50PM +0200, netdev@kapio-technology.com
>> > wrote:
>> > > In general locked ports block traffic from a host based on if there
>> > > is a
>> > > FDB entry or not. In the non-offloaded case, there is only CPU
>> > > assisted
>> > > learning, so the normal learning mechanism has to be disabled as any
>> > > learned entry will open the port for the learned MAC,vlan.
>> >
>> > Does it have to be that way? Why can't BR_LEARNING on a BR_PORT_LOCKED
>> > cause the learned FDB entries to have BR_FDB_LOCKED, and everything
>> > would be ok in that case (the port will not be opened for the learned
>> > MAC/VLAN)?
>> 
>> I suppose you are right that basing it solely on BR_FDB_LOCKED is 
>> possible.
>> 
>> The question is then maybe if the common case where you don't need 
>> learned
>> entries for the scheme to work, e.g. with EAPOL link local packets, 
>> requires
>> less CPU load to work and is cleaner than if using BR_FDB_LOCKED 
>> entries?
> 
> I suppose the real question is what does the bridge currently do with
> BR_LEARNING + BR_PORT_LOCKED, and if that is sane and useful in any 
> case?
> It isn't a configuration that's rejected, for sure. The configuration
> could be rejected via a bug fix patch, then in net-next it could be 
> made
> to learn these addresses with the BR_FDB_LOCKED flag.
> 
> To your question regarding the common case (no MAB): that can be 
> supported
> just fine when BR_LEARNING is off and BR_PORT_LOCKED is on, no?
> No BR_FDB_LOCKED entries will be learned.

As it is now in the bridge, the locked port part is handled before 
learning
in the ingress data path, so with BR_LEARNING and BR_PORT_LOCKED, I 
think it
will work as it does now except link local packages.

If your suggestion of BR_LEARNING causing BR_FDB_LOCKED on a locked 
port, I
guess it would be implemented under br_fdb_update() and BR_LEARNING +
BR_PORT_LOCKED would go together, forcing BR_LEARNING in this case, thus 
also
for all drivers?
  
Vladimir Oltean Oct. 21, 2022, 4:30 p.m. UTC | #10
On Fri, Oct 21, 2022 at 03:16:21PM +0200, netdev@kapio-technology.com wrote:
> As it is now in the bridge, the locked port part is handled before learning
> in the ingress data path, so with BR_LEARNING and BR_PORT_LOCKED, I think it
> will work as it does now except link local packages.

If link-local learning is enabled on a locked port, I think those
addresses should also be learned with the BR_FDB_LOCKED flag. The
creation of those locked FDB entries can be further suppressed by the
BROPT_NO_LL_LEARN flag.

> If your suggestion of BR_LEARNING causing BR_FDB_LOCKED on a locked port, I
> guess it would be implemented under br_fdb_update() and BR_LEARNING +
> BR_PORT_LOCKED would go together, forcing BR_LEARNING in this case, thus also
> for all drivers?

Yes, basically where this is placed right now (in br_handle_frame_finish):

	if (p->flags & BR_PORT_LOCKED) {
		struct net_bridge_fdb_entry *fdb_src =
			br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid);

		if (!fdb_src) {
			unsigned long flags = 0;

			if (p->flags & BR_PORT_MAB) {
			   ~~~~~~~~~~~~~~~~~~~~~~~~
			   except check for BR_LEARNING

				__set_bit(BR_FDB_LOCKED, &flags);
				br_fdb_update(br, p, eth_hdr(skb)->h_source,
					      vid, flags);
			}
			goto drop;
		} else if (READ_ONCE(fdb_src->dst) != p ||
			   test_bit(BR_FDB_LOCAL, &fdb_src->flags) ||
			   test_bit(BR_FDB_LOCKED, &fdb_src->flags)) {
			goto drop;
		}
	}
  
Hans Schultz Oct. 21, 2022, 5:18 p.m. UTC | #11
On 2022-10-21 18:30, Vladimir Oltean wrote:
> On Fri, Oct 21, 2022 at 03:16:21PM +0200, netdev@kapio-technology.com 
> wrote:
>> As it is now in the bridge, the locked port part is handled before 
>> learning
>> in the ingress data path, so with BR_LEARNING and BR_PORT_LOCKED, I 
>> think it
>> will work as it does now except link local packages.
> 
> If link-local learning is enabled on a locked port, I think those
> addresses should also be learned with the BR_FDB_LOCKED flag. The
> creation of those locked FDB entries can be further suppressed by the
> BROPT_NO_LL_LEARN flag.
> 
>> If your suggestion of BR_LEARNING causing BR_FDB_LOCKED on a locked 
>> port, I
>> guess it would be implemented under br_fdb_update() and BR_LEARNING +
>> BR_PORT_LOCKED would go together, forcing BR_LEARNING in this case, 
>> thus also
>> for all drivers?
> 
> Yes, basically where this is placed right now (in 
> br_handle_frame_finish):
> 
> 	if (p->flags & BR_PORT_LOCKED) {
> 		struct net_bridge_fdb_entry *fdb_src =
> 			br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid);
> 
> 		if (!fdb_src) {
> 			unsigned long flags = 0;
> 
> 			if (p->flags & BR_PORT_MAB) {
> 			   ~~~~~~~~~~~~~~~~~~~~~~~~
> 			   except check for BR_LEARNING
> 
> 				__set_bit(BR_FDB_LOCKED, &flags);
> 				br_fdb_update(br, p, eth_hdr(skb)->h_source,
> 					      vid, flags);
> 			}
> 			goto drop;
> 		} else if (READ_ONCE(fdb_src->dst) != p ||
> 			   test_bit(BR_FDB_LOCAL, &fdb_src->flags) ||
> 			   test_bit(BR_FDB_LOCKED, &fdb_src->flags)) {
> 			goto drop;
> 		}
> 	}

As I don't know what implications it would have for other drivers to 
have learning
forced enabled on locked ports, I cannot say if it is a good idea or 
not. Right now
learning is not forced either way as is, but the consensus is that 
learning should
be off with locked ports, which it would be either way in the common 
case I think.
  
Vladimir Oltean Oct. 21, 2022, 5:30 p.m. UTC | #12
On Fri, Oct 21, 2022 at 07:18:59PM +0200, netdev@kapio-technology.com wrote:
> On 2022-10-21 18:30, Vladimir Oltean wrote:
> > On Fri, Oct 21, 2022 at 03:16:21PM +0200, netdev@kapio-technology.com wrote:
> > > As it is now in the bridge, the locked port part is handled before learning
> > > in the ingress data path, so with BR_LEARNING and BR_PORT_LOCKED, I think it
> > > will work as it does now except link local packages.
> > 
> > If link-local learning is enabled on a locked port, I think those
> > addresses should also be learned with the BR_FDB_LOCKED flag. The
> > creation of those locked FDB entries can be further suppressed by the
> > BROPT_NO_LL_LEARN flag.
> > 
> > > If your suggestion of BR_LEARNING causing BR_FDB_LOCKED on a locked port, I
> > > guess it would be implemented under br_fdb_update() and BR_LEARNING +
> > > BR_PORT_LOCKED would go together, forcing BR_LEARNING in this case, thus also
> > > for all drivers?
> > 
> > Yes, basically where this is placed right now (in br_handle_frame_finish):
> 
> As I don't know what implications it would have for other drivers to have learning
> forced enabled on locked ports, I cannot say if it is a good idea or not.
> Right now learning is not forced either way as is, but the consensus is that learning
> should be off with locked ports, which it would be either way in the common case I
> think.

I don't think I fully understand what you mean by forcing BR_LEARNING.
A bridge port gets created with a default set of flags as can be seen in new_nbp().
Those flags include BR_LEARNING but don't include BR_PORT_LOCKED.

The user can decide he wants to make the port use 802.1X without MAB, so
he enables BR_PORT_LOCKED and disables BR_LEARNING, all with the same
netlink command (ip link set swp0 type bridge_slave learning off locked on).

How was the driver forced into anything?
  
Hans Schultz Oct. 21, 2022, 5:39 p.m. UTC | #13
On 2022-10-21 19:30, Vladimir Oltean wrote:
> On Fri, Oct 21, 2022 at 07:18:59PM +0200, netdev@kapio-technology.com 
> wrote:
>> On 2022-10-21 18:30, Vladimir Oltean wrote:
>> > On Fri, Oct 21, 2022 at 03:16:21PM +0200, netdev@kapio-technology.com wrote:
>> > > As it is now in the bridge, the locked port part is handled before learning
>> > > in the ingress data path, so with BR_LEARNING and BR_PORT_LOCKED, I think it
>> > > will work as it does now except link local packages.
>> >
>> > If link-local learning is enabled on a locked port, I think those
>> > addresses should also be learned with the BR_FDB_LOCKED flag. The
>> > creation of those locked FDB entries can be further suppressed by the
>> > BROPT_NO_LL_LEARN flag.
>> >
>> > > If your suggestion of BR_LEARNING causing BR_FDB_LOCKED on a locked port, I
>> > > guess it would be implemented under br_fdb_update() and BR_LEARNING +
>> > > BR_PORT_LOCKED would go together, forcing BR_LEARNING in this case, thus also
>> > > for all drivers?
>> >
>> > Yes, basically where this is placed right now (in br_handle_frame_finish):
>> 
>> As I don't know what implications it would have for other drivers to 
>> have learning
>> forced enabled on locked ports, I cannot say if it is a good idea or 
>> not.
>> Right now learning is not forced either way as is, but the consensus 
>> is that learning
>> should be off with locked ports, which it would be either way in the 
>> common case I
>> think.
> 
> I don't think I fully understand what you mean by forcing BR_LEARNING.
> A bridge port gets created with a default set of flags as can be seen
> in new_nbp().
> Those flags include BR_LEARNING but don't include BR_PORT_LOCKED.
> 
> The user can decide he wants to make the port use 802.1X without MAB, 
> so
> he enables BR_PORT_LOCKED and disables BR_LEARNING, all with the same
> netlink command (ip link set swp0 type bridge_slave learning off locked 
> on).
> 
> How was the driver forced into anything?

Well, with this change, to have MAB working, the bridge would need 
learning on
of course, but how things work with the bridge according to the flags, 
they
should also work in the offloaded case if you ask me. There should be no
difference between the two, thus MAB in drivers would have to be with
learning on.
  
Vladimir Oltean Oct. 21, 2022, 6:14 p.m. UTC | #14
On Fri, Oct 21, 2022 at 07:39:34PM +0200, netdev@kapio-technology.com wrote:
> Well, with this change, to have MAB working, the bridge would need learning on
> of course, but how things work with the bridge according to the flags, they
> should also work in the offloaded case if you ask me. There should be no
> difference between the two, thus MAB in drivers would have to be with
> learning on.

Am I proposing for things to work differently in the offload and
software case, and not realizing it? :-/

The essence of my proposal was to send a bug fix now which denies
BR_LEARNING to be set together with BR_PORT_LOCKED. The fact that
link-local traffic is learned by the software bridge is something
unintended as far as I understand.

You tried to fix it here, and as far as I could search in my inbox, that
didn't go anywhere:
https://lore.kernel.org/netdev/47d8d747-54ef-df52-3b9c-acb9a77fa14a@blackwall.org/T/#u

I thought only mv88e6xxx offloads BR_PORT_LOCKED, but now, after
searching, I also see prestera has support for it, so let me add
Oleksandr Mazur to the discussion as well. I wonder how they deal with
this? Has somebody come to rely on learning being enabled on a locked
port?


MAB in offloading drivers will have to be with learning on (same as in
software). When BR_PORT_LOCKED | BR_LEARNING will be allowed together
back in net-next (to denote the MAB configuration), offloading drivers
(mv88e6xxx and prestera) will be patched to reject them. They will only
accept the two together when they implement MAB support.

Future drivers after this mess has been cleaned up will have to look at
the BR_PORT_LOCKED and BR_LEARNING flag in combination, to see which
kind of learning is desired on a port (secure, CPU based learning or
autonomous learning).

Am I not making sense?
  
Hans Schultz Oct. 22, 2022, 7:24 a.m. UTC | #15
On 2022-10-21 20:14, Vladimir Oltean wrote:
> On Fri, Oct 21, 2022 at 07:39:34PM +0200, netdev@kapio-technology.com 
> wrote:
>> Well, with this change, to have MAB working, the bridge would need 
>> learning on
>> of course, but how things work with the bridge according to the flags, 
>> they
>> should also work in the offloaded case if you ask me. There should be 
>> no
>> difference between the two, thus MAB in drivers would have to be with
>> learning on.
> 
> Am I proposing for things to work differently in the offload and
> software case, and not realizing it? :-/
> 
> The essence of my proposal was to send a bug fix now which denies
> BR_LEARNING to be set together with BR_PORT_LOCKED. The fact that
> link-local traffic is learned by the software bridge is something
> unintended as far as I understand.
> 
> You tried to fix it here, and as far as I could search in my inbox, 
> that
> didn't go anywhere:
> https://lore.kernel.org/netdev/47d8d747-54ef-df52-3b9c-acb9a77fa14a@blackwall.org/T/#u
> 
> I thought only mv88e6xxx offloads BR_PORT_LOCKED, but now, after
> searching, I also see prestera has support for it, so let me add
> Oleksandr Mazur to the discussion as well. I wonder how they deal with
> this? Has somebody come to rely on learning being enabled on a locked
> port?
> 
> 
> MAB in offloading drivers will have to be with learning on (same as in
> software). When BR_PORT_LOCKED | BR_LEARNING will be allowed together
> back in net-next (to denote the MAB configuration), offloading drivers
> (mv88e6xxx and prestera) will be patched to reject them. They will only
> accept the two together when they implement MAB support.
> 
> Future drivers after this mess has been cleaned up will have to look at
> the BR_PORT_LOCKED and BR_LEARNING flag in combination, to see which
> kind of learning is desired on a port (secure, CPU based learning or
> autonomous learning).
> 
> Am I not making sense?

I will not say that you are not making sense as for the mv88e6xxx, as it
needs port association in all cases with BR_PORT_LOCKED, MAB or not, and
port association is turned on in the driver with learning turned on.

That said, there must be some resolution and agreement overall with this
issue to move on. Right now port association is turned on in the 
mv88e6xxx
driver when locking the port, thus setting learning off after locking 
will
break things.
  
Hans Schultz Oct. 22, 2022, 7:31 a.m. UTC | #16
On 2022-10-20 15:25, Vladimir Oltean wrote:
>>  	if (flags.mask & BR_LEARNING) {
>>  		bool learning = !!(flags.val & BR_LEARNING);
>>  		u16 pav = learning ? (1 << port) : 0;
>> 
>> +		mv88e6xxx_reg_lock(chip);
>>  		err = mv88e6xxx_port_set_assoc_vector(chip, port, pav);
>> +		mv88e6xxx_reg_unlock(chip);
>>  		if (err)
>>  			goto out;
>>  	}
>> @@ -6563,8 +6593,10 @@ static int mv88e6xxx_port_bridge_flags(struct 
>> dsa_switch *ds, int port,
>>  	if (flags.mask & BR_FLOOD) {
>>  		bool unicast = !!(flags.val & BR_FLOOD);
>> 
>> +		mv88e6xxx_reg_lock(chip);
>>  		err = chip->info->ops->port_set_ucast_flood(chip, port,
>>  							    unicast);
>> +		mv88e6xxx_reg_unlock(chip);
>>  		if (err)
>>  			goto out;
>>  	}
>> @@ -6572,8 +6604,10 @@ static int mv88e6xxx_port_bridge_flags(struct 
>> dsa_switch *ds, int port,
>>  	if (flags.mask & BR_MCAST_FLOOD) {
>>  		bool multicast = !!(flags.val & BR_MCAST_FLOOD);
>> 
>> +		mv88e6xxx_reg_lock(chip);
>>  		err = chip->info->ops->port_set_mcast_flood(chip, port,
>>  							    multicast);
>> +		mv88e6xxx_reg_unlock(chip);
>>  		if (err)
>>  			goto out;
>>  	}
>> @@ -6581,20 +6615,34 @@ static int mv88e6xxx_port_bridge_flags(struct 
>> dsa_switch *ds, int port,
>>  	if (flags.mask & BR_BCAST_FLOOD) {
>>  		bool broadcast = !!(flags.val & BR_BCAST_FLOOD);
>> 
>> +		mv88e6xxx_reg_lock(chip);
>>  		err = mv88e6xxx_port_broadcast_sync(chip, port, broadcast);
>> +		mv88e6xxx_reg_unlock(chip);
>>  		if (err)
>>  			goto out;
>>  	}
>> 
>> +	if (flags.mask & BR_PORT_MAB) {
>> +		chip->ports[port].mab = !!(flags.val & BR_PORT_MAB);
>> +
>> +		if (!chip->ports[port].mab)
>> +			err = mv88e6xxx_atu_locked_entry_flush(ds, port);
>> +		else
>> +			err = 0;
> 
> Again, dsa_port_fast_age() is also called when dp->learning is turned
> off in dsa_port_bridge_flags(). I don't want to see the mv88e6xxx 
> driver
> doing this manually.
> 

But I think it should be so that turning MAB off will clear the ALE 
entries
regardless, as the port can continue to be locked and needing port 
association,
or you want them to just age out normally in that case, thus lingering 
for
up to bridge ageing time?
  
Oleksandr Mazur Oct. 22, 2022, 8:50 a.m. UTC | #17
On Fri, Oct 21, 2022 at 07:39:34PM +0200, netdev@kapio-technology.com wrote:
>> Well, with this change, to have MAB working, the bridge would need learning on
>> of course, but how things work with the bridge according to the flags, they
>> should also work in the offloaded case if you ask me. There should be no
>> difference between the two, thus MAB in drivers would have to be with
>> learning on.

>Am I proposing for things to work differently in the offload and
>software case, and not realizing it? :-/

>The essence of my proposal was to send a bug fix now which denies
>BR_LEARNING to be set together with BR_PORT_LOCKED. The fact that
>link-local traffic is learned by the software bridge is something
>unintended as far as I understand.

>You tried to fix it here, and as far as I could search in my inbox, that
>didn't go anywhere:
>https://lore.kernel.org/netdev/47d8d747-54ef-df52-3b9c-acb9a77fa14a@blackwall.org/T/#u

>I thought only mv88e6xxx offloads BR_PORT_LOCKED, but now, after
>searching, I also see prestera has support for it, so let me add
>Oleksandr Mazur to the discussion as well. I wonder how they deal with
>this? Has somebody come to rely on learning being enabled on a locked
>port?

Hello,

>The fact that
>link-local traffic is learned by the software bridge is something
>unintended as far as I understand.

In prestera driver, if port is in blocked state only the PAE frames can be trapped, so i'm not sure where other traffic might come from that you are talking. Or maybe i didn't get the issue here right, sorry?

Also, basically, prestera driver does not rely on the learning flag if the port's flag BR_PORT_LOCKED is set. What this means, is that we discard any learning changes on the port if LOCKED is still set (done inside firmware, if i recall correctly). E.g. learning is always off, if port is in BR_PORT_LOCKED state, or in a block state but also has a static fdb entry (aka mac-auth entry).

The concept we follow is basically:
  - some userspace daemon blocks the port;
  - speaks with the <auth-center> (PAE traffic);
  - the daemon itself populates the FDB with authenticated MACs (adding static FDB MACs);
  - forces learning flag disable, disables the PORT_LOCKED flag. At this point switch can basically receive only the traffic from authorized addresses (fdb still has static entries; learning disabled).

Hope that helps.
Cheers.
  
Vladimir Oltean Oct. 22, 2022, 11:32 a.m. UTC | #18
Hi Oleksandr,

On Sat, Oct 22, 2022 at 08:50:20AM +0000, Oleksandr Mazur wrote:
> >The essence of my proposal was to send a bug fix now which denies
> >BR_LEARNING to be set together with BR_PORT_LOCKED. The fact that
> >link-local traffic is learned by the software bridge is something
> >unintended as far as I understand.
> 
> >You tried to fix it here, and as far as I could search in my inbox, that
> >didn't go anywhere:
> >https://lore.kernel.org/netdev/47d8d747-54ef-df52-3b9c-acb9a77fa14a@blackwall.org/T/#u
> 
> >I thought only mv88e6xxx offloads BR_PORT_LOCKED, but now, after
> >searching, I also see prestera has support for it, so let me add
> >Oleksandr Mazur to the discussion as well. I wonder how they deal with
> >this? Has somebody come to rely on learning being enabled on a locked
> >port?
> 
> Hello,
> 
> >The fact that
> >link-local traffic is learned by the software bridge is something
> >unintended as far as I understand.
> 
> In prestera driver, if port is in blocked state only the PAE frames
> can be trapped, so i'm not sure where other traffic might come from
> that you are talking. Or maybe i didn't get the issue here right,
> sorry?

I hope the following script will exemplify what I mean.

#!/bin/bash

ip netns add ns0
ip -n ns0 link add br0 type bridge
ip -n ns0 link add veth0 type veth peer name veth1
ip -n ns0 link set veth1 master br0
ip -n ns0 link set veth1 type bridge_slave locked on learning on
ip -n ns0 link set veth0 up
ip -n ns0 link set veth1 up
ip -n ns0 link set br0 up
addr=$(ip -j -n ns0 link show dev veth0 | jq -r '.[]["address"]')
ip netns exec ns0 mausezahn veth0 -q -c 1 -p 64 -b 01:80:c2:00:00:0e -t ip
sleep 1
ip netns exec ns0 bridge fdb show dev veth1 master | grep ${addr}
ip netns del ns0

It will print:

6e:71:0a:8d:85:9e master br0

or in other words, the brport veth1 has learned the MAC SA of veth0 as a
dynamic FDB entry even with no user space daemon to handle the
authentication protocol.

In turn, having this MAC SA present in the bridge FDB means that
communication with this station is now allowed. As far as I can tell,
this is *not* intended. Only the authentication protocol should create
the FDB entry.

Compare this with the same script, but with "locked on learning off".
No FDB entry will be printed.

> 
> Also, basically, prestera driver does not rely on the learning flag if
> the port's flag BR_PORT_LOCKED is set. What this means, is that we
> discard any learning changes on the port if LOCKED is still set (done
> inside firmware, if i recall correctly). E.g. learning is always off,
> if port is in BR_PORT_LOCKED state, or in a block state but also has a
> static fdb entry (aka mac-auth entry).

So I take this as meaning that we could deny BR_LEARNING on ports with
BR_PORT_LOCKED set, and prestera wouldn't be adversely affected. Ok.

> The concept we follow is basically:
>   - some userspace daemon blocks the port;
>   - speaks with the <auth-center> (PAE traffic);
>   - the daemon itself populates the FDB with authenticated MACs (adding static FDB MACs);
>   - forces learning flag disable, disables the PORT_LOCKED flag. At
>   this point switch can basically receive only the traffic from
>   authorized addresses (fdb still has static entries; learning
>   disabled).

I don't understand the last step. Why is the BR_PORT_LOCKED flag disabled?
If disabled, the port will receive frames with any unknown MAC SA,
not just the authorized ones.
  
Vladimir Oltean Oct. 22, 2022, 11:55 a.m. UTC | #19
On Sat, Oct 22, 2022 at 09:31:06AM +0200, netdev@kapio-technology.com wrote:
> But I think it should be so that turning MAB off will clear the ALE entries
> regardless, as the port can continue to be locked and needing port association,
> or you want them to just age out normally in that case, thus lingering for
> up to bridge ageing time?

Even without BR_PORT_LOCKED, I find it normal that dynamically learned
FDB entries are forcefully aged out when BR_LEARNING is turned off,
instead of lingering on until they expire.

This does not happen in the software bridge, and I did not understand
why (I suspected some backwards compatibility reasons), and for this
reason, it is only from within DSA that we are forcing this behavior to
take place. In dsa_port_bridge_flags(), when BR_LEARNING is turned off,
we call dsa_port_fast_age() which also calls SWITCHDEV_FDB_FLUSH_TO_BRIDGE
(and this clears the bridge software FDB of dynamically learned entries).

I very much expect the same thing with MAB and BR_FDB_LOCKED entries,
that they go away when the BR_PORT_MAB/BR_LEARNING flag (whichever way
we call it) is unset.

Now, if the bridge should initiate the flushing, or still DSA, is
perhaps a topic for further discussion. Given that BR_FDB_LOCKED entries
are new, maybe the bridge could do it in this case (no backwards
compatibility to handle).

Currently the DSA logic mentioned above is bypassed, because we treat
MAB and autonomous learning differently. If we accepted that MAB is
still a form of learning (managed through BR_LEARNING+BR_PORT_LOCKED),
then the DSA logic would kick in, and both the software bridge and the
hardware driver would have a hook to clean up the BR_FDB_LOCKED entries,
plus anything else that is dynamic. The DSA logic would also kick in if
we treated BR_PORT_MAB within DSA like BR_LEARNING, which basically
amounts to the same thing, except for the confusing (IMO) UAPI of having
a flag (BR_PORT_MAB) which is basically a form of learning that isn't
controlled by the BR_LEARNING flag (which is undefined and unclear if it
should be set or not, in BR_PORT_LOCKED mode).
  
Vladimir Oltean Oct. 22, 2022, 12:02 p.m. UTC | #20
On Sat, Oct 22, 2022 at 09:24:56AM +0200, netdev@kapio-technology.com wrote:
> I will not say that you are not making sense as for the mv88e6xxx, as it
> needs port association in all cases with BR_PORT_LOCKED, MAB or not, and
> port association is turned on in the driver with learning turned on.
> 
> That said, there must be some resolution and agreement overall with this
> issue to move on. Right now port association is turned on in the mv88e6xxx
> driver when locking the port, thus setting learning off after locking will
> break things.

This already needs to be treated as a bug and fixed on its own. Forget
about MAB.

You're saying that when BR_LEARNING=on and BR_PORT_LOCKED=on, the
mv88e6xxx driver works properly, but the software bridge is broken
(learns from link-local multicast).

When BR_LEARNING=off and BR_PORT_LOCKED=on, the software bridge is not
broken, but the mv88e6xxx driver is, because it requires the PAV
configured properly.

And you're saying that I'm the one who suggests things should work
differently in software mode vs offloaded mode?!

Why don't you
(a) deny BR_LEARNING + BR_PORT_LOCKED in the bridge layer
(b) fix the mv88e6xxx driver to always keep the assoc_vector set
    properly for the port, if BR_LEARNING *or* BR_PORT_LOCKED is set?
  
Oleksandr Mazur Oct. 22, 2022, 12:55 p.m. UTC | #21
> I hope the following script will exemplify what I mean.
..
Oh, i get it now.

Frankly speaking we haven't stumbled across such scenario / issue before. But i can tell it does indeed seems a bit broken;

I think there are 2 options here:
  1. The setup itself seems insecure, and user should be aware of such behavior / issue;
  2. Bridge indeed should not learn MACs if BR_PORT_LOCKED is set. E.g. learning condition should be something like: not BR_PORT_locked and learning is on; 


> I don't understand the last step. Why is the BR_PORT_LOCKED flag disabled?
> If disabled, the port will receive frames with any unknown MAC SA,
> not just the authorized ones.

Sorry for the confusion. Basically, what i described what i would expect from a daemon (e.g. daemon would disable LOCKED); So just ignore that part.
  
Hans Schultz Oct. 22, 2022, 1:15 p.m. UTC | #22
On 2022-10-22 14:02, Vladimir Oltean wrote:
> On Sat, Oct 22, 2022 at 09:24:56AM +0200, netdev@kapio-technology.com 
> wrote:
>> I will not say that you are not making sense as for the mv88e6xxx, as 
>> it
>> needs port association in all cases with BR_PORT_LOCKED, MAB or not, 
>> and
>> port association is turned on in the driver with learning turned on.
>> 
>> That said, there must be some resolution and agreement overall with 
>> this
>> issue to move on. Right now port association is turned on in the 
>> mv88e6xxx
>> driver when locking the port, thus setting learning off after locking 
>> will
>> break things.
> 
> This already needs to be treated as a bug and fixed on its own. Forget
> about MAB.
> 
> You're saying that when BR_LEARNING=on and BR_PORT_LOCKED=on, the
> mv88e6xxx driver works properly, but the software bridge is broken
> (learns from link-local multicast).
> 
> When BR_LEARNING=off and BR_PORT_LOCKED=on, the software bridge is not
> broken, but the mv88e6xxx driver is, because it requires the PAV
> configured properly.
> 
> And you're saying that I'm the one who suggests things should work
> differently in software mode vs offloaded mode?!

Well :-) To be specific, I am talking about how things work from a user
perspective, where I have kept to BR_LEARNING off before turning
BR_PORT_LOCKED on.

I admit to a weakness in that BR_LEARNING off after BR_PORT_LOCKED on is
a problem that from my perspective at this point would be a user error.

> 
> Why don't you
> (a) deny BR_LEARNING + BR_PORT_LOCKED in the bridge layer
> (b) fix the mv88e6xxx driver to always keep the assoc_vector set
>     properly for the port, if BR_LEARNING *or* BR_PORT_LOCKED is set?

(a) yes, I have thought that documentation could handle this, but maybe
     you are right, maybe it should be enforced...
(b) BR_PORT_LOCKED ensures now that the PAV is correctly set, so I have
     basically distinguished between learning and port association (which
     I know mechanically is the same in mv88e6xxx), but still I have
     adhered to learning off while port association is on for the port.
  
Vladimir Oltean Oct. 22, 2022, 1:39 p.m. UTC | #23
On Sat, Oct 22, 2022 at 12:55:14PM +0000, Oleksandr Mazur wrote:
> 
> > I hope the following script will exemplify what I mean.
> ..
> Oh, i get it now.
> 
> Frankly speaking we haven't stumbled across such scenario / issue
> before. But i can tell it does indeed seems a bit broken;
> 
> I think there are 2 options here:
>   1. The setup itself seems insecure, and user should be aware of such behavior / issue;

Be aware, and do what? Port locking is unfit for use if learning is left
enabled (in the way learning is currently done).

>   2. Bridge indeed should not learn MACs if BR_PORT_LOCKED is set.
>   E.g. learning condition should be something like: not BR_PORT_locked
>   and learning is on; 

Rather than violate the BR_LEARNING flag (have it set but do nothing,
which would require even more checks in the fast path), I was proposing
to not allow the BR_PORT_LOCKED | BR_LEARNING configuration at all.
My question to you was if you're aware of any regression in prestera
with such a change.

> > I don't understand the last step. Why is the BR_PORT_LOCKED flag disabled?
> > If disabled, the port will receive frames with any unknown MAC SA,
> > not just the authorized ones.
> 
> Sorry for the confusion. Basically, what i described what i would
> expect from a daemon (e.g. daemon would disable LOCKED); So just
> ignore that part.

But still, why would the daemon disable BR_PORT_LOCKED once a station is
authorized? You're describing a sample/test application, not a port
security solution...
  
Ido Schimmel Oct. 22, 2022, 1:49 p.m. UTC | #24
On Fri, Oct 21, 2022 at 09:14:11PM +0300, Vladimir Oltean wrote:
> On Fri, Oct 21, 2022 at 07:39:34PM +0200, netdev@kapio-technology.com wrote:
> > Well, with this change, to have MAB working, the bridge would need learning on
> > of course, but how things work with the bridge according to the flags, they
> > should also work in the offloaded case if you ask me. There should be no
> > difference between the two, thus MAB in drivers would have to be with
> > learning on.
> 
> Am I proposing for things to work differently in the offload and
> software case, and not realizing it? :-/
> 
> The essence of my proposal was to send a bug fix now which denies
> BR_LEARNING to be set together with BR_PORT_LOCKED. The fact that
> link-local traffic is learned by the software bridge is something
> unintended as far as I understand.
> 
> You tried to fix it here, and as far as I could search in my inbox, that
> didn't go anywhere:
> https://lore.kernel.org/netdev/47d8d747-54ef-df52-3b9c-acb9a77fa14a@blackwall.org/T/#u
> 
> I thought only mv88e6xxx offloads BR_PORT_LOCKED, but now, after
> searching, I also see prestera has support for it, so let me add
> Oleksandr Mazur to the discussion as well. I wonder how they deal with
> this? Has somebody come to rely on learning being enabled on a locked
> port?
> 
> 
> MAB in offloading drivers will have to be with learning on (same as in
> software). When BR_PORT_LOCKED | BR_LEARNING will be allowed together
> back in net-next (to denote the MAB configuration), offloading drivers
> (mv88e6xxx and prestera) will be patched to reject them. They will only
> accept the two together when they implement MAB support.
> 
> Future drivers after this mess has been cleaned up will have to look at
> the BR_PORT_LOCKED and BR_LEARNING flag in combination, to see which
> kind of learning is desired on a port (secure, CPU based learning or
> autonomous learning).
> 
> Am I not making sense?

I will try to summarize what I learned from past discussions because I
think it is not properly explained in the commit messages.

If you look at the hostapd fork by Westermo [1], you will see that they
are authorizing hosts by adding dynamic FDB entries from user space, not
static ones. Someone from Westermo will need to confirm this, but I
guess the reasons are that a) They want hosts that became silent to lose
their authentication after the aging time b) They want hosts to lose
their authentication when the carrier of the bridge port goes down. This
will cause the bridge driver to flush dynamic FDB entries, but not
static ones. Otherwise, an attacker with physical access to the switch
and knowledge of the MAC address of the authenticated host can connect a
different (malicious) host that will be able to communicate through the
bridge.

In the above scenario, learning does not need to be on for the bridge to
populate its FDB, but rather for the bridge to refresh the dynamic FDB
entries installed by hostapd. This seems like a valid use case and one
needs a good reason to break it in future kernels.

Regarding learning from link-local frames, this can be mitigated by [2]
without adding additional checks in the bridge. I don't know why this
bridge option was originally added, but if it wasn't for this use case,
then now it has another use case.

Regarding MAB, from the above you can see that a pure 802.1X
implementation that does not involve MAB can benefit from locked bridge
ports with learning enabled. It is therefore not accurate to say that
one wants MAB merely by enabling learning on a locked port. Given that
MAB is a proprietary extension and much less secure than 802.1X, we can
assume that there will be deployments out there that do not use MAB and
do not care about notifications regarding locked FDB entries. I
therefore think that MAB needs to be enabled by a separate bridge port
flag that is rejected unless the bridge port is locked and has learning
enabled.

Regarding hardware offload, I have an idea (needs testing) on how to
make mlxsw work in a similar way to mv88e6xxx. That is, does not involve
injecting frames that incurred a miss to the Rx path. If you guys want,
I'm willing to take a subset of the patches here, improve the commit
message, do some small changes and submit them along with an mlxsw
implementation. My intention is not to discredit anyone (I will keep the
original authorship), but to help push this forward and give another
example of hardware offload.

[1] https://github.com/westermo/hostapd/commit/10c584b875a63a9e58b0ad39835282545351c30e#diff-338b6fad34b4bdb015d7d96930974bd96796b754257473b6c91527789656d6ed
[2] https://git.kernel.org/pub/scm/network/iproute2/iproute2-next.git/commit/?id=c74a8bc9cf5d6b6c9d8c64d5a80c5740165f315a
  
Hans Schultz Oct. 22, 2022, 2:11 p.m. UTC | #25
On 2022-10-22 15:49, Ido Schimmel wrote:
> On Fri, Oct 21, 2022 at 09:14:11PM +0300, Vladimir Oltean wrote:
>> On Fri, Oct 21, 2022 at 07:39:34PM +0200, netdev@kapio-technology.com 
>> wrote:
>> > Well, with this change, to have MAB working, the bridge would need learning on
>> > of course, but how things work with the bridge according to the flags, they
>> > should also work in the offloaded case if you ask me. There should be no
>> > difference between the two, thus MAB in drivers would have to be with
>> > learning on.
>> 
>> Am I proposing for things to work differently in the offload and
>> software case, and not realizing it? :-/
>> 
>> The essence of my proposal was to send a bug fix now which denies
>> BR_LEARNING to be set together with BR_PORT_LOCKED. The fact that
>> link-local traffic is learned by the software bridge is something
>> unintended as far as I understand.
>> 
>> You tried to fix it here, and as far as I could search in my inbox, 
>> that
>> didn't go anywhere:
>> https://lore.kernel.org/netdev/47d8d747-54ef-df52-3b9c-acb9a77fa14a@blackwall.org/T/#u
>> 
>> I thought only mv88e6xxx offloads BR_PORT_LOCKED, but now, after
>> searching, I also see prestera has support for it, so let me add
>> Oleksandr Mazur to the discussion as well. I wonder how they deal with
>> this? Has somebody come to rely on learning being enabled on a locked
>> port?
>> 
>> 
>> MAB in offloading drivers will have to be with learning on (same as in
>> software). When BR_PORT_LOCKED | BR_LEARNING will be allowed together
>> back in net-next (to denote the MAB configuration), offloading drivers
>> (mv88e6xxx and prestera) will be patched to reject them. They will 
>> only
>> accept the two together when they implement MAB support.
>> 
>> Future drivers after this mess has been cleaned up will have to look 
>> at
>> the BR_PORT_LOCKED and BR_LEARNING flag in combination, to see which
>> kind of learning is desired on a port (secure, CPU based learning or
>> autonomous learning).
>> 
>> Am I not making sense?
> 
> I will try to summarize what I learned from past discussions because I
> think it is not properly explained in the commit messages.
> 
> If you look at the hostapd fork by Westermo [1], you will see that they
> are authorizing hosts by adding dynamic FDB entries from user space, 
> not

Those dynamic FDB entries are to be dynamic ATU entries by a patch set 
that
I have ready, but which I have not submitted as I was expecting to 
submit
it after this patch set was accepted.

The important aspect of Dynamic ATU entries is that the HW refreshes the
ATU entries with an active host.


> static ones. Someone from Westermo will need to confirm this, but I

I represent WesterMo in the upstreaming of these patches, and can 
confirm
that both for hostapd and the MAB solution, WesterMo authorizes by using
dynamic entries.

> guess the reasons are that a) They want hosts that became silent to 
> lose
> their authentication after the aging time b) They want hosts to lose
> their authentication when the carrier of the bridge port goes down. 
> This
> will cause the bridge driver to flush dynamic FDB entries, but not
> static ones. Otherwise, an attacker with physical access to the switch
> and knowledge of the MAC address of the authenticated host can connect 
> a
> different (malicious) host that will be able to communicate through the
> bridge.

Seems correct, only that it must be specified that it must be the 
switchcore
and not the bridge that ages the entries, thus ATU entries.

> 
> In the above scenario, learning does not need to be on for the bridge 
> to
> populate its FDB, but rather for the bridge to refresh the dynamic FDB
> entries installed by hostapd. This seems like a valid use case and one
> needs a good reason to break it in future kernels.
> 
> Regarding learning from link-local frames, this can be mitigated by [2]
> without adding additional checks in the bridge. I don't know why this
> bridge option was originally added, but if it wasn't for this use case,
> then now it has another use case.
> 
> Regarding MAB, from the above you can see that a pure 802.1X
> implementation that does not involve MAB can benefit from locked bridge
> ports with learning enabled. It is therefore not accurate to say that
> one wants MAB merely by enabling learning on a locked port. Given that
> MAB is a proprietary extension and much less secure than 802.1X, we can
> assume that there will be deployments out there that do not use MAB and
> do not care about notifications regarding locked FDB entries. I
> therefore think that MAB needs to be enabled by a separate bridge port
> flag that is rejected unless the bridge port is locked and has learning
> enabled.
> 
> Regarding hardware offload, I have an idea (needs testing) on how to
> make mlxsw work in a similar way to mv88e6xxx. That is, does not 
> involve
> injecting frames that incurred a miss to the Rx path. If you guys want,
> I'm willing to take a subset of the patches here, improve the commit
> message, do some small changes and submit them along with an mlxsw
> implementation. My intention is not to discredit anyone (I will keep 
> the
> original authorship), but to help push this forward and give another
> example of hardware offload.

You are very welcome to help pushing this forward for my sake, I just 
need
to know how it will affect this patch set. :-)

> 
> [1] 
> https://github.com/westermo/hostapd/commit/10c584b875a63a9e58b0ad39835282545351c30e#diff-338b6fad34b4bdb015d7d96930974bd96796b754257473b6c91527789656d6ed
> [2] 
> https://git.kernel.org/pub/scm/network/iproute2/iproute2-next.git/commit/?id=c74a8bc9cf5d6b6c9d8c64d5a80c5740165f315a
  
Vladimir Oltean Oct. 22, 2022, 2:49 p.m. UTC | #26
On Sat, Oct 22, 2022 at 04:49:50PM +0300, Ido Schimmel wrote:
> I will try to summarize what I learned from past discussions because I
> think it is not properly explained in the commit messages.
> 
> If you look at the hostapd fork by Westermo [1], you will see that they
> are authorizing hosts by adding dynamic FDB entries from user space, not
> static ones. Someone from Westermo will need to confirm this, but I
> guess the reasons are that a) They want hosts that became silent to lose
> their authentication after the aging time b) They want hosts to lose
> their authentication when the carrier of the bridge port goes down. This
> will cause the bridge driver to flush dynamic FDB entries, but not
> static ones. Otherwise, an attacker with physical access to the switch
> and knowledge of the MAC address of the authenticated host can connect a
> different (malicious) host that will be able to communicate through the
> bridge.

Not only is it not well explained, but Hans said back in February that
"in the common case you will want to use static entries":
https://lore.kernel.org/lkml/867da5viak.fsf@gmail.com/

> 
> In the above scenario, learning does not need to be on for the bridge to
> populate its FDB, but rather for the bridge to refresh the dynamic FDB
> entries installed by hostapd. This seems like a valid use case and one
> needs a good reason to break it in future kernels.

Before suggesting any alternatives, I'd like to know more details about
how this will work in practice, because I'm aware of the limitations
that come with DSA not syncing its hardware FDB with the software bridge.

So you add a dynamic FDB entry from user space, it gets propagated to
hardware via SWITCHDEV_FDB_ADD_TO_DEVICE, and from there on, they have
completely independent ageing timers.

You'll still suffer interruptions in authorization, if the software FDB
entry expires because it was never refreshed (which will happen if
traffic is forwarded autonomously and not seen by software). And at this
stage, you could just add static FDB entries which you periodically
delete from user space, since the effect would be equivalent.

If the mitigation to that is going to involve the extern_learn flag, the
whole point becomes moot (for mv88e6xxx), since FDB refreshing does not
happen in the bridge driver in that case (so the learning flag can be
whatever).

> 
> Regarding learning from link-local frames, this can be mitigated by [2]
> without adding additional checks in the bridge. I don't know why this
> bridge option was originally added, but if it wasn't for this use case,
> then now it has another use case.

There is still the problem that link-local learning is on by default
(follows the BR_LEARNING setting of the port). I don't feel exactly
comfortable with the fact that it's easy for a user to miss this and
leave the port completely insecure.

> 
> Regarding MAB, from the above you can see that a pure 802.1X
> implementation that does not involve MAB can benefit from locked bridge
> ports with learning enabled. It is therefore not accurate to say that
> one wants MAB merely by enabling learning on a locked port. Given that
> MAB is a proprietary extension and much less secure than 802.1X, we can
> assume that there will be deployments out there that do not use MAB and
> do not care about notifications regarding locked FDB entries. I
> therefore think that MAB needs to be enabled by a separate bridge port
> flag that is rejected unless the bridge port is locked and has learning
> enabled.

I had missed the detail that dynamic FDB entries will be refreshed only
with "learning" on. It makes the picture more complete. Only this is
said in "man bridge":

       learning on or learning off
              Controls whether a given port will learn MAC addresses
              from received traffic or not. If learning if off, the
              bridge will end up flooding any traffic for which it has
              no FDB entry. By default this flag is on.

Can live with MAB being a separate flag if it comes to that, as long as
'learning' will continue to have its own specific meaning, independent
of it (right now that meaning is subtle and undocumented, but makes sense).

> Regarding hardware offload, I have an idea (needs testing) on how to
> make mlxsw work in a similar way to mv88e6xxx. That is, does not involve
> injecting frames that incurred a miss to the Rx path. If you guys want,
> I'm willing to take a subset of the patches here, improve the commit
> message, do some small changes and submit them along with an mlxsw
> implementation. My intention is not to discredit anyone (I will keep the
> original authorship), but to help push this forward and give another
> example of hardware offload.
> 
> [1] https://github.com/westermo/hostapd/commit/10c584b875a63a9e58b0ad39835282545351c30e#diff-338b6fad34b4bdb015d7d96930974bd96796b754257473b6c91527789656d6ed
> [2] https://git.kernel.org/pub/scm/network/iproute2/iproute2-next.git/commit/?id=c74a8bc9cf5d6b6c9d8c64d5a80c5740165f315a

I think it would be very nice if you could do that. As a middle ground
between mv88e6xxx and mlxsw, I can also try to build a setup on ocelot
(which should trap frames with MAC SA misses in a similar way to mlxsw,
but does also not sync its FDB with the bridge, similar to the mv88e6xxx.
Not sure what to do with dynamic FDB entries).

If only I would figure out how to configure that hostapd fork (something
which I never did before).

Hans, would it be possible to lay out some usage instructions for this fork?
  
Ido Schimmel Oct. 23, 2022, 6:53 a.m. UTC | #27
On Sat, Oct 22, 2022 at 05:49:51PM +0300, Vladimir Oltean wrote:
> On Sat, Oct 22, 2022 at 04:49:50PM +0300, Ido Schimmel wrote:
> > In the above scenario, learning does not need to be on for the bridge to
> > populate its FDB, but rather for the bridge to refresh the dynamic FDB
> > entries installed by hostapd. This seems like a valid use case and one
> > needs a good reason to break it in future kernels.
> 
> Before suggesting any alternatives, I'd like to know more details about
> how this will work in practice, because I'm aware of the limitations
> that come with DSA not syncing its hardware FDB with the software bridge.
> 
> So you add a dynamic FDB entry from user space, it gets propagated to
> hardware via SWITCHDEV_FDB_ADD_TO_DEVICE, and from there on, they have
> completely independent ageing timers.
> 
> You'll still suffer interruptions in authorization, if the software FDB
> entry expires because it was never refreshed (which will happen if
> traffic is forwarded autonomously and not seen by software). And at this
> stage, you could just add static FDB entries which you periodically
> delete from user space, since the effect would be equivalent.
> 
> If the mitigation to that is going to involve the extern_learn flag, the
> whole point becomes moot (for mv88e6xxx), since FDB refreshing does not
> happen in the bridge driver in that case (so the learning flag can be
> whatever).

Once a dynamic FDB entry is installed in hardware the software bridge no
longer sees the majority of the traffic that refreshes this entry, which
means we need to prevent the bridge from mindlessly ageing and removing
the entry. I see two options, depending on the capabilities of the
underlying hardware implementation:

1. If the hardware is capable of generating an event that an entry was
aged out, then once the dynamic entry was installed in hardware the
device driver needs to let the bridge driver know that it is no longer
responsible for ageing the entry. This can be done by either marking the
entry as extern_learn or offloaded. The latter is more accurate, but we
need to patch br_fdb_cleanup(). Upon an ageing event, the device driver
will tell the bridge to remove the entry via
SWITCHDEV_FDB_DEL_TO_BRIDGE.

2. If the hardware is unable to generate ageing events, but allows
querying the activity of the entry, then the device driver will need to
emulate the behavior of the first option. This allows us to use the same
interface between the bridge and device driver regardless of the
underlying hardware implementation. My feeling is that most devices fall
in the first category.

> > Regarding learning from link-local frames, this can be mitigated by [2]
> > without adding additional checks in the bridge. I don't know why this
> > bridge option was originally added, but if it wasn't for this use case,
> > then now it has another use case.
> 
> There is still the problem that link-local learning is on by default
> (follows the BR_LEARNING setting of the port). I don't feel exactly
> comfortable with the fact that it's easy for a user to miss this and
> leave the port completely insecure.

I'm willing to patch the man page and add a note near the 'locked'
bridge port option.

> > Regarding MAB, from the above you can see that a pure 802.1X
> > implementation that does not involve MAB can benefit from locked bridge
> > ports with learning enabled. It is therefore not accurate to say that
> > one wants MAB merely by enabling learning on a locked port. Given that
> > MAB is a proprietary extension and much less secure than 802.1X, we can
> > assume that there will be deployments out there that do not use MAB and
> > do not care about notifications regarding locked FDB entries. I
> > therefore think that MAB needs to be enabled by a separate bridge port
> > flag that is rejected unless the bridge port is locked and has learning
> > enabled.
> 
> I had missed the detail that dynamic FDB entries will be refreshed only
> with "learning" on. It makes the picture more complete. Only this is
> said in "man bridge":
> 
>        learning on or learning off
>               Controls whether a given port will learn MAC addresses
>               from received traffic or not. If learning if off, the
>               bridge will end up flooding any traffic for which it has
>               no FDB entry. By default this flag is on.
> 
> Can live with MAB being a separate flag if it comes to that, as long as
> 'learning' will continue to have its own specific meaning, independent
> of it (right now that meaning is subtle and undocumented, but makes sense).

Yes, I agree it is subtle.

> > Regarding hardware offload, I have an idea (needs testing) on how to
> > make mlxsw work in a similar way to mv88e6xxx. That is, does not involve
> > injecting frames that incurred a miss to the Rx path. If you guys want,
> > I'm willing to take a subset of the patches here, improve the commit
> > message, do some small changes and submit them along with an mlxsw
> > implementation. My intention is not to discredit anyone (I will keep the
> > original authorship), but to help push this forward and give another
> > example of hardware offload.
> > 
> > [1] https://github.com/westermo/hostapd/commit/10c584b875a63a9e58b0ad39835282545351c30e#diff-338b6fad34b4bdb015d7d96930974bd96796b754257473b6c91527789656d6ed
> > [2] https://git.kernel.org/pub/scm/network/iproute2/iproute2-next.git/commit/?id=c74a8bc9cf5d6b6c9d8c64d5a80c5740165f315a
> 
> I think it would be very nice if you could do that. As a middle ground
> between mv88e6xxx and mlxsw, I can also try to build a setup on ocelot
> (which should trap frames with MAC SA misses in a similar way to mlxsw,
> but does also not sync its FDB with the bridge, similar to the mv88e6xxx.
> Not sure what to do with dynamic FDB entries).

Will try to post my patches this week.

> If only I would figure out how to configure that hostapd fork (something
> which I never did before).
> 
> Hans, would it be possible to lay out some usage instructions for this fork?

That would be good.
  

Patch

diff --git a/drivers/net/dsa/mv88e6xxx/Makefile b/drivers/net/dsa/mv88e6xxx/Makefile
index c8eca2b6f959..be903a983780 100644
--- a/drivers/net/dsa/mv88e6xxx/Makefile
+++ b/drivers/net/dsa/mv88e6xxx/Makefile
@@ -15,3 +15,4 @@  mv88e6xxx-objs += port_hidden.o
 mv88e6xxx-$(CONFIG_NET_DSA_MV88E6XXX_PTP) += ptp.o
 mv88e6xxx-objs += serdes.o
 mv88e6xxx-objs += smi.o
+mv88e6xxx-objs += switchdev.o
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 352121cce77e..71843fe87f77 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -42,6 +42,7 @@ 
 #include "ptp.h"
 #include "serdes.h"
 #include "smi.h"
+#include "switchdev.h"
 
 static void assert_reg_lock(struct mv88e6xxx_chip *chip)
 {
@@ -924,6 +925,13 @@  static void mv88e6xxx_mac_link_down(struct dsa_switch *ds, int port,
 	if (err)
 		dev_err(chip->dev,
 			"p%d: failed to force MAC link down\n", port);
+	else
+		if (mv88e6xxx_port_is_locked(chip, port)) {
+			err = mv88e6xxx_atu_locked_entry_flush(ds, port);
+			if (err)
+				dev_err(chip->dev,
+					"p%d: failed to clear locked entries\n", port);
+		}
 }
 
 static void mv88e6xxx_mac_link_up(struct dsa_switch *ds, int port,
@@ -1690,6 +1698,13 @@  static void mv88e6xxx_port_fast_age(struct dsa_switch *ds, int port)
 	struct mv88e6xxx_chip *chip = ds->priv;
 	int err;
 
+	if (mv88e6xxx_port_is_locked(chip, port)) {
+		err = mv88e6xxx_atu_locked_entry_flush(ds, port);
+		if (err)
+			dev_err(chip->ds->dev, "p%d: failed to clear locked entries: %d\n",
+				port, err);
+	}
+
 	mv88e6xxx_reg_lock(chip);
 	err = mv88e6xxx_port_fast_age_fid(chip, port, 0);
 	mv88e6xxx_reg_unlock(chip);
@@ -1726,11 +1741,11 @@  static int mv88e6xxx_vtu_get(struct mv88e6xxx_chip *chip, u16 vid,
 	return err;
 }
 
-static int mv88e6xxx_vtu_walk(struct mv88e6xxx_chip *chip,
-			      int (*cb)(struct mv88e6xxx_chip *chip,
-					const struct mv88e6xxx_vtu_entry *entry,
-					void *priv),
-			      void *priv)
+int mv88e6xxx_vtu_walk(struct mv88e6xxx_chip *chip,
+		       int (*cb)(struct mv88e6xxx_chip *chip,
+				 const struct mv88e6xxx_vtu_entry *entry,
+				 void *priv),
+		       void *priv)
 {
 	struct mv88e6xxx_vtu_entry entry = {
 		.vid = mv88e6xxx_max_vid(chip),
@@ -2731,6 +2746,9 @@  static int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port,
 	if (fdb_flags)
 		return 0;
 
+	if (mv88e6xxx_port_is_locked(chip, port))
+		mv88e6xxx_atu_locked_entry_find_purge(ds, port, addr, vid);
+
 	mv88e6xxx_reg_lock(chip);
 	err = mv88e6xxx_port_db_load_purge(chip, port, addr, vid,
 					   MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC);
@@ -2744,16 +2762,21 @@  static int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port,
 				  u16 fdb_flags, struct dsa_db db)
 {
 	struct mv88e6xxx_chip *chip = ds->priv;
-	int err;
+	bool locked_found = false;
+	int err = 0;
 
 	/* Ignore entries with flags set */
 	if (fdb_flags)
 		return 0;
 
-	mv88e6xxx_reg_lock(chip);
-	err = mv88e6xxx_port_db_load_purge(chip, port, addr, vid, 0);
-	mv88e6xxx_reg_unlock(chip);
+	if (mv88e6xxx_port_is_locked(chip, port))
+		locked_found = mv88e6xxx_atu_locked_entry_find_purge(ds, port, addr, vid);
 
+	if (!locked_found) {
+		mv88e6xxx_reg_lock(chip);
+		err = mv88e6xxx_port_db_load_purge(chip, port, addr, vid, 0);
+		mv88e6xxx_reg_unlock(chip);
+	}
 	return err;
 }
 
@@ -3849,11 +3872,18 @@  static int mv88e6xxx_setup(struct dsa_switch *ds)
 
 static int mv88e6xxx_port_setup(struct dsa_switch *ds, int port)
 {
-	return mv88e6xxx_setup_devlink_regions_port(ds, port);
+	int err;
+
+	err = mv88e6xxx_setup_devlink_regions_port(ds, port);
+	if (!err)
+		return mv88e6xxx_init_violation_handler(ds, port);
+
+	return err;
 }
 
 static void mv88e6xxx_port_teardown(struct dsa_switch *ds, int port)
 {
+	mv88e6xxx_teardown_violation_handler(ds, port);
 	mv88e6xxx_teardown_devlink_regions_port(ds, port);
 }
 
@@ -6528,7 +6558,7 @@  static int mv88e6xxx_port_pre_bridge_flags(struct dsa_switch *ds, int port,
 	const struct mv88e6xxx_ops *ops;
 
 	if (flags.mask & ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD |
-			   BR_BCAST_FLOOD | BR_PORT_LOCKED))
+			   BR_BCAST_FLOOD | BR_PORT_LOCKED | BR_PORT_MAB))
 		return -EINVAL;
 
 	ops = chip->info->ops;
@@ -6549,13 +6579,13 @@  static int mv88e6xxx_port_bridge_flags(struct dsa_switch *ds, int port,
 	struct mv88e6xxx_chip *chip = ds->priv;
 	int err = -EOPNOTSUPP;
 
-	mv88e6xxx_reg_lock(chip);
-
 	if (flags.mask & BR_LEARNING) {
 		bool learning = !!(flags.val & BR_LEARNING);
 		u16 pav = learning ? (1 << port) : 0;
 
+		mv88e6xxx_reg_lock(chip);
 		err = mv88e6xxx_port_set_assoc_vector(chip, port, pav);
+		mv88e6xxx_reg_unlock(chip);
 		if (err)
 			goto out;
 	}
@@ -6563,8 +6593,10 @@  static int mv88e6xxx_port_bridge_flags(struct dsa_switch *ds, int port,
 	if (flags.mask & BR_FLOOD) {
 		bool unicast = !!(flags.val & BR_FLOOD);
 
+		mv88e6xxx_reg_lock(chip);
 		err = chip->info->ops->port_set_ucast_flood(chip, port,
 							    unicast);
+		mv88e6xxx_reg_unlock(chip);
 		if (err)
 			goto out;
 	}
@@ -6572,8 +6604,10 @@  static int mv88e6xxx_port_bridge_flags(struct dsa_switch *ds, int port,
 	if (flags.mask & BR_MCAST_FLOOD) {
 		bool multicast = !!(flags.val & BR_MCAST_FLOOD);
 
+		mv88e6xxx_reg_lock(chip);
 		err = chip->info->ops->port_set_mcast_flood(chip, port,
 							    multicast);
+		mv88e6xxx_reg_unlock(chip);
 		if (err)
 			goto out;
 	}
@@ -6581,20 +6615,34 @@  static int mv88e6xxx_port_bridge_flags(struct dsa_switch *ds, int port,
 	if (flags.mask & BR_BCAST_FLOOD) {
 		bool broadcast = !!(flags.val & BR_BCAST_FLOOD);
 
+		mv88e6xxx_reg_lock(chip);
 		err = mv88e6xxx_port_broadcast_sync(chip, port, broadcast);
+		mv88e6xxx_reg_unlock(chip);
 		if (err)
 			goto out;
 	}
 
+	if (flags.mask & BR_PORT_MAB) {
+		chip->ports[port].mab = !!(flags.val & BR_PORT_MAB);
+
+		if (!chip->ports[port].mab)
+			err = mv88e6xxx_atu_locked_entry_flush(ds, port);
+		else
+			err = 0;
+	}
+
 	if (flags.mask & BR_PORT_LOCKED) {
 		bool locked = !!(flags.val & BR_PORT_LOCKED);
 
+		mv88e6xxx_reg_lock(chip);
 		err = mv88e6xxx_port_set_lock(chip, port, locked);
+		mv88e6xxx_reg_unlock(chip);
 		if (err)
 			goto out;
+
+		chip->ports[port].locked = locked;
 	}
 out:
-	mv88e6xxx_reg_unlock(chip);
 
 	return err;
 }
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index e693154cf803..180fbcf596fa 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -280,6 +280,16 @@  struct mv88e6xxx_port {
 	unsigned int serdes_irq;
 	char serdes_irq_name[64];
 	struct devlink_region *region;
+
+	/* Locked port and MacAuth control flags */
+	bool locked;
+	bool mab;
+
+	/* List and maintenance of ATU locked entries */
+	struct mutex ale_list_lock;
+	struct list_head ale_list;
+	struct delayed_work ale_work;
+	int ale_cnt;
 };
 
 enum mv88e6xxx_region_id {
@@ -399,6 +409,9 @@  struct mv88e6xxx_chip {
 	int egress_dest_port;
 	int ingress_dest_port;
 
+	/* Keep the register written age time for easy access */
+	u8 age_time;
+
 	/* Per-port timestamping resources. */
 	struct mv88e6xxx_port_hwtstamp port_hwtstamp[DSA_MAX_PORTS];
 
@@ -802,6 +815,12 @@  static inline void mv88e6xxx_reg_unlock(struct mv88e6xxx_chip *chip)
 	mutex_unlock(&chip->reg_lock);
 }
 
+int mv88e6xxx_vtu_walk(struct mv88e6xxx_chip *chip,
+		       int (*cb)(struct mv88e6xxx_chip *chip,
+				 const struct mv88e6xxx_vtu_entry *entry,
+				 void *priv),
+		       void *priv);
+
 int mv88e6xxx_fid_map(struct mv88e6xxx_chip *chip, unsigned long *bitmap);
 
 #endif /* _MV88E6XXX_CHIP_H */
diff --git a/drivers/net/dsa/mv88e6xxx/global1.h b/drivers/net/dsa/mv88e6xxx/global1.h
index 65958b2a0d3a..503fbf216670 100644
--- a/drivers/net/dsa/mv88e6xxx/global1.h
+++ b/drivers/net/dsa/mv88e6xxx/global1.h
@@ -136,6 +136,7 @@ 
 #define MV88E6XXX_G1_ATU_DATA_TRUNK				0x8000
 #define MV88E6XXX_G1_ATU_DATA_TRUNK_ID_MASK			0x00f0
 #define MV88E6XXX_G1_ATU_DATA_PORT_VECTOR_MASK			0x3ff0
+#define MV88E6XXX_G1_ATU_DATA_PORT_VECTOR_NO_EGRESS		0x0000
 #define MV88E6XXX_G1_ATU_DATA_STATE_MASK			0x000f
 #define MV88E6XXX_G1_ATU_DATA_STATE_UC_UNUSED			0x0000
 #define MV88E6XXX_G1_ATU_DATA_STATE_UC_AGE_1_OLDEST		0x0001
diff --git a/drivers/net/dsa/mv88e6xxx/global1_atu.c b/drivers/net/dsa/mv88e6xxx/global1_atu.c
index d9dfa1159cde..67907cd00b87 100644
--- a/drivers/net/dsa/mv88e6xxx/global1_atu.c
+++ b/drivers/net/dsa/mv88e6xxx/global1_atu.c
@@ -12,6 +12,8 @@ 
 
 #include "chip.h"
 #include "global1.h"
+#include "port.h"
+#include "switchdev.h"
 
 /* Offset 0x01: ATU FID Register */
 
@@ -54,6 +56,7 @@  int mv88e6xxx_g1_atu_set_age_time(struct mv88e6xxx_chip *chip,
 
 	/* Round to nearest multiple of coeff */
 	age_time = (msecs + coeff / 2) / coeff;
+	chip->age_time = age_time;
 
 	err = mv88e6xxx_g1_read(chip, MV88E6XXX_G1_ATU_CTL, &val);
 	if (err)
@@ -426,6 +429,8 @@  static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id)
 	if (err)
 		goto out;
 
+	mv88e6xxx_reg_unlock(chip);
+
 	spid = entry.state;
 
 	if (val & MV88E6XXX_G1_ATU_OP_AGE_OUT_VIOLATION) {
@@ -446,6 +451,12 @@  static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id)
 				    "ATU miss violation for %pM portvec %x spid %d\n",
 				    entry.mac, entry.portvec, spid);
 		chip->ports[spid].atu_miss_violation++;
+
+		if (fid && chip->ports[spid].mab)
+			err = mv88e6xxx_handle_violation(chip, spid, &entry, fid,
+							 MV88E6XXX_G1_ATU_OP_MISS_VIOLATION);
+		if (err)
+			goto out;
 	}
 
 	if (val & MV88E6XXX_G1_ATU_OP_FULL_VIOLATION) {
@@ -454,7 +465,6 @@  static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id)
 				    entry.mac, entry.portvec, spid);
 		chip->ports[spid].atu_full_violation++;
 	}
-	mv88e6xxx_reg_unlock(chip);
 
 	return IRQ_HANDLED;
 
diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
index 5c4195c635b0..67e457ce67ae 100644
--- a/drivers/net/dsa/mv88e6xxx/port.c
+++ b/drivers/net/dsa/mv88e6xxx/port.c
@@ -14,9 +14,11 @@ 
 #include <linux/phylink.h>
 
 #include "chip.h"
+#include "global1.h"
 #include "global2.h"
 #include "port.h"
 #include "serdes.h"
+#include "switchdev.h"
 
 int mv88e6xxx_port_read(struct mv88e6xxx_chip *chip, int port, int reg,
 			u16 *val)
@@ -1240,13 +1242,12 @@  int mv88e6xxx_port_set_lock(struct mv88e6xxx_chip *chip, int port,
 	if (err)
 		return err;
 
-	err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_ASSOC_VECTOR, &reg);
-	if (err)
-		return err;
-
-	reg &= ~MV88E6XXX_PORT_ASSOC_VECTOR_LOCKED_PORT;
-	if (locked)
-		reg |= MV88E6XXX_PORT_ASSOC_VECTOR_LOCKED_PORT;
+	reg = 0;
+	if (locked) {
+		reg = (1 << port);
+		reg |= MV88E6XXX_PORT_ASSOC_VECTOR_IGNORE_WRONG |
+			MV88E6XXX_PORT_ASSOC_VECTOR_LOCKED_PORT;
+	}
 
 	return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_ASSOC_VECTOR, reg);
 }
diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h
index cb04243f37c1..9475bc6e95a2 100644
--- a/drivers/net/dsa/mv88e6xxx/port.h
+++ b/drivers/net/dsa/mv88e6xxx/port.h
@@ -231,6 +231,7 @@ 
 #define MV88E6XXX_PORT_ASSOC_VECTOR_LOCKED_PORT		0x2000
 #define MV88E6XXX_PORT_ASSOC_VECTOR_IGNORE_WRONG	0x1000
 #define MV88E6XXX_PORT_ASSOC_VECTOR_REFRESH_LOCKED	0x0800
+#define MV88E6XXX_PORT_ASSOC_VECTOR_PAV_MASK		0x07ff
 
 /* Offset 0x0C: Port ATU Control */
 #define MV88E6XXX_PORT_ATU_CTL		0x0c
@@ -375,6 +376,11 @@  int mv88e6xxx_port_set_pvid(struct mv88e6xxx_chip *chip, int port, u16 pvid);
 int mv88e6xxx_port_set_lock(struct mv88e6xxx_chip *chip, int port,
 			    bool locked);
 
+static inline bool mv88e6xxx_port_is_locked(struct mv88e6xxx_chip *chip, int port)
+{
+	return chip->ports[port].locked;
+}
+
 int mv88e6xxx_port_set_8021q_mode(struct mv88e6xxx_chip *chip, int port,
 				  u16 mode);
 int mv88e6095_port_tag_remap(struct mv88e6xxx_chip *chip, int port);
diff --git a/drivers/net/dsa/mv88e6xxx/switchdev.c b/drivers/net/dsa/mv88e6xxx/switchdev.c
new file mode 100644
index 000000000000..cd332a10fad5
--- /dev/null
+++ b/drivers/net/dsa/mv88e6xxx/switchdev.c
@@ -0,0 +1,284 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * switchdev.c
+ *
+ *	Authors:
+ *	Hans J. Schultz		<hans.schultz@westermo.com>
+ *
+ */
+
+#include <net/switchdev.h>
+#include <linux/list.h>
+#include "chip.h"
+#include "global1.h"
+#include "switchdev.h"
+
+static void mv88e6xxx_atu_locked_entry_purge(struct mv88e6xxx_atu_locked_entry *ale,
+					     bool notify, bool take_nl_lock)
+{
+	struct switchdev_notifier_fdb_info info = {
+		.addr = ale->mac,
+		.vid = ale->vid,
+		.locked = true,
+		.offloaded = true,
+	};
+	struct mv88e6xxx_atu_entry entry;
+	struct net_device *brport;
+	struct dsa_port *dp;
+
+	entry.portvec = MV88E6XXX_G1_ATU_DATA_PORT_VECTOR_NO_EGRESS;
+	entry.state = MV88E6XXX_G1_ATU_DATA_STATE_UC_UNUSED;
+	entry.trunk = false;
+	ether_addr_copy(entry.mac, ale->mac);
+
+	mv88e6xxx_reg_lock(ale->chip);
+	mv88e6xxx_g1_atu_loadpurge(ale->chip, ale->fid, &entry);
+	mv88e6xxx_reg_unlock(ale->chip);
+
+	dp = dsa_to_port(ale->chip->ds, ale->port);
+
+	if (notify) {
+		if (take_nl_lock)
+			rtnl_lock();
+		brport = dsa_port_to_bridge_port(dp);
+
+		if (brport) {
+			call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_BRIDGE,
+						 brport, &info.info, NULL);
+		} else {
+			dev_err(ale->chip->dev, "No bridge port for dsa port belonging to port %d\n",
+				ale->port);
+		}
+		if (take_nl_lock)
+			rtnl_unlock();
+	}
+
+	list_del(&ale->list);
+	kfree(ale);
+}
+
+static void mv88e6xxx_atu_locked_entry_cleanup(struct work_struct *work)
+{
+	struct mv88e6xxx_port *p = container_of(work, struct mv88e6xxx_port, ale_work.work);
+	struct mv88e6xxx_atu_locked_entry *ale, *tmp;
+
+	mutex_lock(&p->ale_list_lock);
+	list_for_each_entry_safe(ale, tmp, &p->ale_list, list) {
+		if (time_after(jiffies, ale->expires)) {
+			mv88e6xxx_atu_locked_entry_purge(ale, true, true);
+			p->ale_cnt--;
+		}
+	}
+	mutex_unlock(&p->ale_list_lock);
+
+	mod_delayed_work(system_long_wq, &p->ale_work, msecs_to_jiffies(100));
+}
+
+static int mv88e6xxx_new_atu_locked_entry(struct mv88e6xxx_chip *chip, const unsigned char *addr,
+					  int port, u16 fid, u16 vid,
+					  struct mv88e6xxx_atu_locked_entry **alep)
+{
+	struct mv88e6xxx_atu_locked_entry *ale;
+	unsigned long now, age_time;
+
+	ale = kmalloc(sizeof(*ale), GFP_ATOMIC);
+	if (!ale)
+		return -ENOMEM;
+
+	ether_addr_copy(ale->mac, addr);
+	ale->chip = chip;
+	ale->port = port;
+	ale->fid = fid;
+	ale->vid = vid;
+	now = jiffies;
+	age_time = chip->age_time * chip->info->age_time_coeff;
+	ale->expires = now + age_time;
+
+	*alep = ale;
+	return 0;
+}
+
+struct mv88e6xxx_fid_search_ctx {
+	u16 fid_search;
+	u16 vid_found;
+};
+
+static int mv88e6xxx_find_vid_on_matching_fid(struct mv88e6xxx_chip *chip,
+					      const struct mv88e6xxx_vtu_entry *entry,
+					      void *priv)
+{
+	struct mv88e6xxx_fid_search_ctx *ctx = priv;
+
+	if (ctx->fid_search == entry->fid) {
+		ctx->vid_found = entry->vid;
+		return 1;
+	}
+
+	return 0;
+}
+
+int mv88e6xxx_handle_violation(struct mv88e6xxx_chip *chip, int port,
+			       struct mv88e6xxx_atu_entry *entry,
+			       u16 fid, u16 type)
+{
+	struct switchdev_notifier_fdb_info info = {
+		.addr = entry->mac,
+		.vid = 0,
+		.is_local = true,
+		.locked = true,
+		.blackhole = true,
+		.offloaded = true,
+	};
+	struct mv88e6xxx_atu_locked_entry *ale;
+	struct mv88e6xxx_fid_search_ctx ctx;
+	struct net_device *brport;
+	struct mv88e6xxx_port *p;
+	struct dsa_port *dp;
+	int err;
+
+	if (!mv88e6xxx_is_invalid_port(chip, port))
+		p = &chip->ports[port];
+	else
+		return -ENODEV;
+
+	ctx.fid_search = fid;
+	mv88e6xxx_reg_lock(chip);
+	err = mv88e6xxx_vtu_walk(chip, mv88e6xxx_find_vid_on_matching_fid, &ctx);
+	mv88e6xxx_reg_unlock(chip);
+	if (err < 0)
+		return err;
+	if (err == 1)
+		info.vid = ctx.vid_found;
+	else
+		return -ENODATA;
+
+	switch (type) {
+	case MV88E6XXX_G1_ATU_OP_MISS_VIOLATION:
+		mutex_lock(&p->ale_list_lock);
+		if (p->ale_cnt >= ATU_LOCKED_ENTRIES_MAX)
+			goto exit;
+		mutex_unlock(&p->ale_list_lock);
+		entry->portvec = MV88E6XXX_G1_ATU_DATA_PORT_VECTOR_NO_EGRESS;
+		entry->state = MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC;
+		entry->trunk = false;
+
+		mv88e6xxx_reg_lock(chip);
+		err = mv88e6xxx_g1_atu_loadpurge(chip, fid, entry);
+		if (err)
+			goto fail;
+		mv88e6xxx_reg_unlock(chip);
+
+		dp = dsa_to_port(chip->ds, port);
+		err = mv88e6xxx_new_atu_locked_entry(chip, entry->mac, port, fid,
+						     info.vid, &ale);
+		if (err)
+			return err;
+
+		mutex_lock(&p->ale_list_lock);
+		list_add(&ale->list, &p->ale_list);
+		p->ale_cnt++;
+		mutex_unlock(&p->ale_list_lock);
+
+		rtnl_lock();
+		brport = dsa_port_to_bridge_port(dp);
+		if (!brport) {
+			rtnl_unlock();
+			return -ENODEV;
+		}
+		err = call_switchdev_notifiers(SWITCHDEV_FDB_ADD_TO_BRIDGE,
+					       brport, &info.info, NULL);
+		rtnl_unlock();
+		break;
+	}
+
+	return err;
+
+fail:
+	mv88e6xxx_reg_unlock(chip);
+	return err;
+
+exit:
+	mutex_unlock(&p->ale_list_lock);
+	return 0;
+}
+
+bool mv88e6xxx_atu_locked_entry_find_purge(struct dsa_switch *ds, int port,
+					   const unsigned char *addr, u16 vid)
+{
+	struct mv88e6xxx_atu_locked_entry *ale, *tmp;
+	struct mv88e6xxx_chip *chip = ds->priv;
+	struct mv88e6xxx_port *p;
+	bool found = false;
+
+	p = &chip->ports[port];
+	mutex_lock(&p->ale_list_lock);
+	list_for_each_entry_safe(ale, tmp, &p->ale_list, list) {
+		if (ether_addr_equal(ale->mac, addr)) {
+			if (ale->vid == vid) {
+				mv88e6xxx_atu_locked_entry_purge(ale, false, false);
+				p->ale_cnt--;
+				found = true;
+				break;
+			}
+		}
+	}
+	mutex_unlock(&p->ale_list_lock);
+	return found;
+}
+
+int mv88e6xxx_atu_locked_entry_flush(struct dsa_switch *ds, int port)
+{
+	struct mv88e6xxx_atu_locked_entry *ale, *tmp;
+	struct mv88e6xxx_chip *chip = ds->priv;
+	struct mv88e6xxx_port *p;
+
+	if (!mv88e6xxx_is_invalid_port(chip, port))
+		p = &chip->ports[port];
+	else
+		return -ENODEV;
+
+	mutex_lock(&p->ale_list_lock);
+	list_for_each_entry_safe(ale, tmp, &p->ale_list, list) {
+		mv88e6xxx_atu_locked_entry_purge(ale, true, false);
+		p->ale_cnt--;
+	}
+	mutex_unlock(&p->ale_list_lock);
+
+	return 0;
+}
+
+int mv88e6xxx_init_violation_handler(struct dsa_switch *ds, int port)
+{
+	struct mv88e6xxx_chip *chip = ds->priv;
+	struct mv88e6xxx_port *p;
+
+	if (!mv88e6xxx_is_invalid_port(chip, port))
+		p = &chip->ports[port];
+	else
+		return -ENODEV;
+
+	INIT_LIST_HEAD(&p->ale_list);
+	mutex_init(&p->ale_list_lock);
+	p->ale_cnt = 0;
+	INIT_DELAYED_WORK(&p->ale_work, mv88e6xxx_atu_locked_entry_cleanup);
+	mod_delayed_work(system_long_wq, &p->ale_work, msecs_to_jiffies(500));
+
+	return 0;
+}
+
+int mv88e6xxx_teardown_violation_handler(struct dsa_switch *ds, int port)
+{
+	struct mv88e6xxx_chip *chip = ds->priv;
+	struct mv88e6xxx_port *p;
+
+	if (!mv88e6xxx_is_invalid_port(chip, port))
+		p = &chip->ports[port];
+	else
+		return -ENODEV;
+
+	cancel_delayed_work(&p->ale_work);
+	mv88e6xxx_atu_locked_entry_flush(ds, port);
+	mutex_destroy(&p->ale_list_lock);
+
+	return 0;
+}
diff --git a/drivers/net/dsa/mv88e6xxx/switchdev.h b/drivers/net/dsa/mv88e6xxx/switchdev.h
new file mode 100644
index 000000000000..df2005c36f47
--- /dev/null
+++ b/drivers/net/dsa/mv88e6xxx/switchdev.h
@@ -0,0 +1,37 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * switchdev.h
+ *
+ *	Authors:
+ *	Hans J. Schultz		<hans.schultz@westermo.com>
+ *
+ */
+
+#ifndef DRIVERS_NET_DSA_MV88E6XXX_SWITCHDEV_H_
+#define DRIVERS_NET_DSA_MV88E6XXX_SWITCHDEV_H_
+
+#include <net/switchdev.h>
+#include "chip.h"
+
+#define ATU_LOCKED_ENTRIES_MAX	64
+
+struct mv88e6xxx_atu_locked_entry {
+	struct		list_head list;
+	struct		mv88e6xxx_chip *chip;
+	int		port;
+	u8		mac[ETH_ALEN];
+	u16		fid;
+	u16		vid;
+	unsigned long	expires;
+};
+
+int mv88e6xxx_handle_violation(struct mv88e6xxx_chip *chip, int port,
+			       struct mv88e6xxx_atu_entry *entry,
+			       u16 fid, u16 type);
+bool mv88e6xxx_atu_locked_entry_find_purge(struct dsa_switch *ds, int port,
+					   const unsigned char *addr, u16 vid);
+int mv88e6xxx_atu_locked_entry_flush(struct dsa_switch *ds, int port);
+int mv88e6xxx_init_violation_handler(struct dsa_switch *ds, int port);
+int mv88e6xxx_teardown_violation_handler(struct dsa_switch *ds, int port);
+
+#endif /* DRIVERS_NET_DSA_MV88E6XXX_SWITCHDEV_H_ */