[v11,3/7] genirq: Add mechanism to multiplex a single HW IPI

Message ID: 20221114093904.1669461-4-apatel@ventanamicro.com
State: New
Series: RISC-V IPI Improvements

Commit Message

Anup Patel Nov. 14, 2022, 9:39 a.m. UTC
  All RISC-V platforms have a single HW IPI provided by the INTC local
interrupt controller. The HW method to trigger INTC IPI can be through
external irqchip (e.g. RISC-V AIA), through platform specific device
(e.g. SiFive CLINT timer), or through firmware (e.g. SBI IPI call).

To support multiple IPIs on RISC-V, we add a generic IPI multiplexing
mechanism which helps us create multiple virtual IPIs using a single
HW IPI. This generic IPI multiplexing is inspired by the Apple AIC
irqchip driver and is shared by various RISC-V irqchip drivers.

Signed-off-by: Anup Patel <apatel@ventanamicro.com>
---
 include/linux/irq.h  |  18 +++
 kernel/irq/Kconfig   |   5 +
 kernel/irq/Makefile  |   1 +
 kernel/irq/ipi-mux.c | 268 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 292 insertions(+)
 create mode 100644 kernel/irq/ipi-mux.c
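
For readers who want to see the proposed API in context, here is a minimal
usage sketch. It is illustrative only and not part of the patch: the
hw_trigger_ipi() helper and the demo_* names are placeholders for whatever
mechanism (SBI call, AIA, CLINT) a given driver actually uses.

/* Illustrative sketch of the API proposed in this patch; not part of it. */
extern void hw_trigger_ipi(const struct cpumask *mask);	/* placeholder */

static void demo_ipi_mux_send(unsigned int parent_virq, void *data,
			      const struct cpumask *mask)
{
	/* Raise the single HW IPI on every CPU in @mask. */
	hw_trigger_ipi(mask);
}

static const struct ipi_mux_ops demo_ipi_mux_ops = {
	.ipi_mux_send	= demo_ipi_mux_send,
};

static int __init demo_ipi_init(unsigned int parent_virq)
{
	int first_virq;

	/* Create 8 virtual IPIs multiplexed on top of the single HW IPI. */
	first_virq = ipi_mux_create(parent_virq, 8, &demo_ipi_mux_ops, NULL);
	if (first_virq <= 0)
		return first_virq ? first_virq : -ENODEV;

	/*
	 * With parent_virq > 0 the chained handler calls ipi_mux_process()
	 * automatically; otherwise the driver's own top-level handler must
	 * call ipi_mux_process() when the HW IPI fires.
	 */
	return first_virq;
}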
  

Comments

Marc Zyngier Nov. 26, 2022, 12:42 p.m. UTC | #1
On Mon, 14 Nov 2022 09:39:00 +0000,
Anup Patel <apatel@ventanamicro.com> wrote:
> 
> All RISC-V platforms have a single HW IPI provided by the INTC local
> interrupt controller. The HW method to trigger INTC IPI can be through
> external irqchip (e.g. RISC-V AIA), through platform specific device
> (e.g. SiFive CLINT timer), or through firmware (e.g. SBI IPI call).
> 
> To support multiple IPIs on RISC-V, we add a generic IPI multiplexing
> mechanism which help us create multiple virtual IPIs using a single
> HW IPI. This generic IPI multiplexing is inspired from the Apple AIC
> irqchip driver and it is shared by various RISC-V irqchip drivers.
> 
> Signed-off-by: Anup Patel <apatel@ventanamicro.com>
> ---
>  include/linux/irq.h  |  18 +++
>  kernel/irq/Kconfig   |   5 +
>  kernel/irq/Makefile  |   1 +
>  kernel/irq/ipi-mux.c | 268 +++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 292 insertions(+)
>  create mode 100644 kernel/irq/ipi-mux.c
> 
> diff --git a/include/linux/irq.h b/include/linux/irq.h
> index c3eb89606c2b..5ab702cb0a5b 100644
> --- a/include/linux/irq.h
> +++ b/include/linux/irq.h
> @@ -1266,6 +1266,24 @@ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest);
>  int ipi_send_single(unsigned int virq, unsigned int cpu);
>  int ipi_send_mask(unsigned int virq, const struct cpumask *dest);
>  
> +/**
> + * struct ipi_mux_ops - IPI multiplex operations
> + *
> + * @ipi_mux_pre_handle:	Optional function called before handling parent IPI
> + * @ipi_mux_post_handle:Optional function called after handling parent IPI
> + * @ipi_mux_send:	Trigger parent IPI on target CPUs
> + */
> +struct ipi_mux_ops {
> +	void (*ipi_mux_pre_handle)(unsigned int parent_virq, void *data);
> +	void (*ipi_mux_post_handle)(unsigned int parent_virq, void *data);

I still haven't seen any decent explanation for this other than "we
need it". What is it that cannot be achieved via the irq_ack() and
irq_eoi() callbacks?
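
For context, ipi_mux_handler() below wraps ipi_mux_process() in
chained_irq_enter()/chained_irq_exit(), which already bracket the mux
processing with the parent chip's ack/eoi-style callbacks. A condensed
paraphrase of those helpers (from include/linux/irqchip/chained_irq.h)
shows where such before/after work would naturally live:

/* Condensed paraphrase of the chained-IRQ helpers, for illustration. */
static inline void chained_irq_enter(struct irq_chip *chip,
				     struct irq_desc *desc)
{
	/* FastEOI controllers require no action on entry. */
	if (chip->irq_eoi)
		return;

	if (chip->irq_mask_ack) {
		chip->irq_mask_ack(&desc->irq_data);
	} else {
		chip->irq_mask(&desc->irq_data);
		if (chip->irq_ack)
			chip->irq_ack(&desc->irq_data);
	}
}

static inline void chained_irq_exit(struct irq_chip *chip,
				    struct irq_desc *desc)
{
	if (chip->irq_eoi)
		chip->irq_eoi(&desc->irq_data);
	else
		chip->irq_unmask(&desc->irq_data);
}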

> +	void (*ipi_mux_send)(unsigned int parent_virq, void *data,
> +			     const struct cpumask *mask);

In what context would the 'parent_virq' be useful? You are *sending*
an IPI, not receiving it, and I expect the mechanism by which you send
such an IPI to be independent of the Linux view of the irq.

Also, please swap data and mask in the function signature.
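
Taken together, the two remarks above point to a callback shaped roughly
like this (illustrative only, not a final signature):

	/* no Linux virq, and the target mask comes before the opaque data */
	void (*ipi_mux_send)(const struct cpumask *mask, void *data);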

> +};
> +
> +void ipi_mux_process(void);
> +int ipi_mux_create(unsigned int parent_virq, unsigned int nr_ipi,
> +		   const struct ipi_mux_ops *ops, void *data);
> +
>  #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER
>  /*
>   * Registers a generic IRQ handling function as the top-level IRQ handler in
> diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
> index db3d174c53d4..df17dbc54b02 100644
> --- a/kernel/irq/Kconfig
> +++ b/kernel/irq/Kconfig
> @@ -86,6 +86,11 @@ config GENERIC_IRQ_IPI
>  	depends on SMP
>  	select IRQ_DOMAIN_HIERARCHY
>  
> +# Generic IRQ IPI Mux support
> +config GENERIC_IRQ_IPI_MUX
> +	bool
> +	depends on SMP
> +
>  # Generic MSI interrupt support
>  config GENERIC_MSI_IRQ
>  	bool
> diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
> index b4f53717d143..f19d3080bf11 100644
> --- a/kernel/irq/Makefile
> +++ b/kernel/irq/Makefile
> @@ -15,6 +15,7 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
>  obj-$(CONFIG_PM_SLEEP) += pm.o
>  obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
>  obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
> +obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
>  obj-$(CONFIG_SMP) += affinity.o
>  obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
>  obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
> diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
> new file mode 100644
> index 000000000000..259e00366dd7
> --- /dev/null
> +++ b/kernel/irq/ipi-mux.c
> @@ -0,0 +1,268 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Multiplex several virtual IPIs over a single HW IPI.
> + *
> + * Copyright The Asahi Linux Contributors
> + * Copyright (c) 2022 Ventana Micro Systems Inc.
> + */
> +
> +#define pr_fmt(fmt) "ipi-mux: " fmt
> +#include <linux/cpu.h>
> +#include <linux/init.h>
> +#include <linux/irq.h>
> +#include <linux/irqchip.h>
> +#include <linux/irqchip/chained_irq.h>
> +#include <linux/irqdomain.h>
> +#include <linux/jump_label.h>
> +#include <linux/percpu.h>
> +#include <linux/smp.h>
> +
> +struct ipi_mux_cpu {
> +	atomic_t			enable;
> +	atomic_t			bits;
> +	struct cpumask			send_mask;
> +};
> +
> +struct ipi_mux_control {
> +	void				*data;
> +	unsigned int			nr;

Honestly, I think we can get rid of this. The number of IPIs Linux
uses is pretty small, and assuming a huge value (like 32) would be
enough. It would save looking up this value on each IPI handling.

> +	unsigned int			parent_virq;
> +	struct irq_domain		*domain;
> +	const struct ipi_mux_ops	*ops;
> +	struct ipi_mux_cpu __percpu	*cpu;
> +};
> +
> +static struct ipi_mux_control *imux;
> +static DEFINE_STATIC_KEY_FALSE(imux_pre_handle);
> +static DEFINE_STATIC_KEY_FALSE(imux_post_handle);
> +
> +static void ipi_mux_mask(struct irq_data *d)
> +{
> +	struct ipi_mux_cpu *icpu = this_cpu_ptr(imux->cpu);
> +
> +	atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
> +}
> +
> +static void ipi_mux_unmask(struct irq_data *d)
> +{
> +	u32 ibit = BIT(irqd_to_hwirq(d));
> +	struct ipi_mux_cpu *icpu = this_cpu_ptr(imux->cpu);
> +
> +	atomic_or(ibit, &icpu->enable);
> +
> +	/*
> +	 * The atomic_or() above must complete before the atomic_read()
> +	 * below to avoid racing ipi_mux_send_mask().
> +	 */
> +	smp_mb__after_atomic();
> +
> +	/* If a pending IPI was unmasked, raise a parent IPI immediately. */
> +	if (atomic_read(&icpu->bits) & ibit)
> +		imux->ops->ipi_mux_send(imux->parent_virq, imux->data,
> +					cpumask_of(smp_processor_id()));
> +}
> +
> +static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
> +{
> +	u32 ibit = BIT(irqd_to_hwirq(d));
> +	struct ipi_mux_cpu *icpu = this_cpu_ptr(imux->cpu);
> +	struct cpumask *send_mask = &icpu->send_mask;
> +	unsigned long flags;
> +	int cpu;
> +
> +	/*
> +	 * We use send_mask as a per-CPU variable so disable local
> +	 * interrupts to avoid being preempted.
> +	 */
> +	local_irq_save(flags);
> +
> +	cpumask_clear(send_mask);
> +
> +	for_each_cpu(cpu, mask) {
> +		icpu = per_cpu_ptr(imux->cpu, cpu);
> +		atomic_or(ibit, &icpu->bits);
> +
> +		/*
> +		 * The atomic_or() above must complete before
> +		 * the atomic_read() below to avoid racing with
> +		 * ipi_mux_unmask().
> +		 */
> +		smp_mb__after_atomic();
> +
> +		if (atomic_read(&icpu->enable) & ibit)
> +			cpumask_set_cpu(cpu, send_mask);
> +	}
> +
> +	/* Trigger the parent IPI */
> +	imux->ops->ipi_mux_send(imux->parent_virq, imux->data, send_mask);
> +
> +	local_irq_restore(flags);
> +}
> +
> +static const struct irq_chip ipi_mux_chip = {
> +	.name		= "IPI Mux",
> +	.irq_mask	= ipi_mux_mask,
> +	.irq_unmask	= ipi_mux_unmask,
> +	.ipi_send_mask	= ipi_mux_send_mask,
> +};

I really think this could either be supplied by the irqchip, or
somehow patched to avoid the pointless imux->ops->ipi_mux_send
indirection. Pointer chasing hurts.

> +
> +static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
> +				unsigned int nr_irqs, void *arg)
> +{
> +	int i;
> +
> +	for (i = 0; i < nr_irqs; i++) {
> +		irq_set_percpu_devid(virq + i);
> +		irq_domain_set_info(d, virq + i, i,
> +				    &ipi_mux_chip, d->host_data,
> +				    handle_percpu_devid_irq, NULL, NULL);
> +	}
> +
> +	return 0;
> +}
> +
> +static const struct irq_domain_ops ipi_mux_domain_ops = {
> +	.alloc		= ipi_mux_domain_alloc,
> +	.free		= irq_domain_free_irqs_top,
> +};
> +
> +/**
> + * ipi_mux_process - Process multiplexed virtual IPIs
> + */
> +void ipi_mux_process(void)
> +{
> +	struct ipi_mux_cpu *icpu = this_cpu_ptr(imux->cpu);
> +	irq_hw_number_t hwirq;
> +	unsigned long ipis;
> +	int en;

Please use an unsigned type here (see the rationale in atomic_t.txt).

> +
> +	if (static_branch_unlikely(&imux_pre_handle))
> +		imux->ops->ipi_mux_pre_handle(imux->parent_virq, imux->data);
> +
> +	/*
> +	 * Reading enable mask does not need to be ordered as long as
> +	 * this function called from interrupt handler because only
> +	 * the CPU itself can change it's own enable mask.
> +	 */
> +	en = atomic_read(&icpu->enable);
> +
> +	/*
> +	 * Clear the IPIs we are about to handle. This pairs with the
> +	 * atomic_fetch_or_release() in ipi_mux_send_mask().
> +	 */
> +	ipis = atomic_fetch_andnot(en, &icpu->bits) & en;
> +
> +	for_each_set_bit(hwirq, &ipis, imux->nr)
> +		generic_handle_domain_irq(imux->domain, hwirq);
> +
> +	if (static_branch_unlikely(&imux_post_handle))
> +		imux->ops->ipi_mux_post_handle(imux->parent_virq, imux->data);

Do you see what I meant about the {pre,post}_handle callback, and how
they are *exactly* like irq_{ack,eoi}?

> +}
> +
> +static void ipi_mux_handler(struct irq_desc *desc)
> +{
> +	struct irq_chip *chip = irq_desc_get_chip(desc);
> +
> +	chained_irq_enter(chip, desc);
> +	ipi_mux_process();
> +	chained_irq_exit(chip, desc);
> +}
> +
> +/**
> + * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
> + * parent IPI.
> + * @parent_virq:	virq of the parent per-CPU IRQ
> + * @nr_ipi:		number of virtual IPIs to create. This should
> + *			be <= BITS_PER_TYPE(int)
> + * @ops:		multiplexing operations for the parent IPI
> + * @data:		opaque data used by the multiplexing operations

What is the use for data? If anything, that data should be passed via
the mux interrupt. But the whole point of this is to make the mux
invisible. So this whole 'data' business is a mystery to me.

> + *
> + * If the parent IPI > 0 then ipi_mux_process() will be automatically
> + * called via chained handler.
> + *
> + * If the parent IPI <= 0 then it is responsibility of irqchip drivers
> + * to explicitly call ipi_mux_process() for processing muxed IPIs.

0 is a much more idiomatic value for "no parent IRQ". < 0 normally
represents an error.

> + *
> + * Returns first virq of the newly created virtual IPIs upon success
> + * or <=0 upon failure
> + */
> +int ipi_mux_create(unsigned int parent_virq, unsigned int nr_ipi,

Tell me how I can express a negative parent_virq here?

> +		   const struct ipi_mux_ops *ops, void *data)
> +{
> +	struct fwnode_handle *fwnode;
> +	struct irq_domain *domain;
> +	int rc;
> +
> +	if (imux)
> +		return -EEXIST;
> +
> +	if (BITS_PER_TYPE(int) < nr_ipi || !ops || !ops->ipi_mux_send)
> +		return -EINVAL;
> +
> +	if (parent_virq &&
> +	    !irqd_is_per_cpu(irq_desc_get_irq_data(irq_to_desc(parent_virq))))
> +		return -EINVAL;

See how buggy this is if I follow your definition of "no parent IRQ"?

	M.
  
Anup Patel Nov. 26, 2022, 1:31 p.m. UTC | #2
On Sat, Nov 26, 2022 at 6:12 PM Marc Zyngier <maz@kernel.org> wrote:
>
> On Mon, 14 Nov 2022 09:39:00 +0000,
> Anup Patel <apatel@ventanamicro.com> wrote:
> >
> > All RISC-V platforms have a single HW IPI provided by the INTC local
> > interrupt controller. The HW method to trigger INTC IPI can be through
> > external irqchip (e.g. RISC-V AIA), through platform specific device
> > (e.g. SiFive CLINT timer), or through firmware (e.g. SBI IPI call).
> >
> > To support multiple IPIs on RISC-V, we add a generic IPI multiplexing
> > mechanism which help us create multiple virtual IPIs using a single
> > HW IPI. This generic IPI multiplexing is inspired from the Apple AIC
> > irqchip driver and it is shared by various RISC-V irqchip drivers.
> >
> > Signed-off-by: Anup Patel <apatel@ventanamicro.com>
> > ---
> >  include/linux/irq.h  |  18 +++
> >  kernel/irq/Kconfig   |   5 +
> >  kernel/irq/Makefile  |   1 +
> >  kernel/irq/ipi-mux.c | 268 +++++++++++++++++++++++++++++++++++++++++++
> >  4 files changed, 292 insertions(+)
> >  create mode 100644 kernel/irq/ipi-mux.c
> >
> > diff --git a/include/linux/irq.h b/include/linux/irq.h
> > index c3eb89606c2b..5ab702cb0a5b 100644
> > --- a/include/linux/irq.h
> > +++ b/include/linux/irq.h
> > @@ -1266,6 +1266,24 @@ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest);
> >  int ipi_send_single(unsigned int virq, unsigned int cpu);
> >  int ipi_send_mask(unsigned int virq, const struct cpumask *dest);
> >
> > +/**
> > + * struct ipi_mux_ops - IPI multiplex operations
> > + *
> > + * @ipi_mux_pre_handle:      Optional function called before handling parent IPI
> > + * @ipi_mux_post_handle:Optional function called after handling parent IPI
> > + * @ipi_mux_send:    Trigger parent IPI on target CPUs
> > + */
> > +struct ipi_mux_ops {
> > +     void (*ipi_mux_pre_handle)(unsigned int parent_virq, void *data);
> > +     void (*ipi_mux_post_handle)(unsigned int parent_virq, void *data);
>
> I still haven't seen any decent explanation for this other than "we
> need it". What is it that cannot be achieved via the irq_ack() and
> irq_eoi() callbacks?

Sure, I also think these are strange-looking callbacks. I will drop
them in the next patch revision.

>
> > +     void (*ipi_mux_send)(unsigned int parent_virq, void *data,
> > +                          const struct cpumask *mask);
>
> In what context would the 'parent_virq' be useful? You are *sending*
> an IPI, not receiving it, and I expect the mechanism by which you send
> such an IPI to be independent of the Linux view of the irq.

Okay, I will drop the 'parent_virq' parameter.

>
> Also, please swap data and mask in the function signature.

Okay, I will change the order.

>
> > +};
> > +
> > +void ipi_mux_process(void);
> > +int ipi_mux_create(unsigned int parent_virq, unsigned int nr_ipi,
> > +                const struct ipi_mux_ops *ops, void *data);
> > +
> >  #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER
> >  /*
> >   * Registers a generic IRQ handling function as the top-level IRQ handler in
> > diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
> > index db3d174c53d4..df17dbc54b02 100644
> > --- a/kernel/irq/Kconfig
> > +++ b/kernel/irq/Kconfig
> > @@ -86,6 +86,11 @@ config GENERIC_IRQ_IPI
> >       depends on SMP
> >       select IRQ_DOMAIN_HIERARCHY
> >
> > +# Generic IRQ IPI Mux support
> > +config GENERIC_IRQ_IPI_MUX
> > +     bool
> > +     depends on SMP
> > +
> >  # Generic MSI interrupt support
> >  config GENERIC_MSI_IRQ
> >       bool
> > diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
> > index b4f53717d143..f19d3080bf11 100644
> > --- a/kernel/irq/Makefile
> > +++ b/kernel/irq/Makefile
> > @@ -15,6 +15,7 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
> >  obj-$(CONFIG_PM_SLEEP) += pm.o
> >  obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
> >  obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
> > +obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
> >  obj-$(CONFIG_SMP) += affinity.o
> >  obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
> >  obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
> > diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
> > new file mode 100644
> > index 000000000000..259e00366dd7
> > --- /dev/null
> > +++ b/kernel/irq/ipi-mux.c
> > @@ -0,0 +1,268 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +/*
> > + * Multiplex several virtual IPIs over a single HW IPI.
> > + *
> > + * Copyright The Asahi Linux Contributors
> > + * Copyright (c) 2022 Ventana Micro Systems Inc.
> > + */
> > +
> > +#define pr_fmt(fmt) "ipi-mux: " fmt
> > +#include <linux/cpu.h>
> > +#include <linux/init.h>
> > +#include <linux/irq.h>
> > +#include <linux/irqchip.h>
> > +#include <linux/irqchip/chained_irq.h>
> > +#include <linux/irqdomain.h>
> > +#include <linux/jump_label.h>
> > +#include <linux/percpu.h>
> > +#include <linux/smp.h>
> > +
> > +struct ipi_mux_cpu {
> > +     atomic_t                        enable;
> > +     atomic_t                        bits;
> > +     struct cpumask                  send_mask;
> > +};
> > +
> > +struct ipi_mux_control {
> > +     void                            *data;
> > +     unsigned int                    nr;
>
> Honestly, I think we can get rid of this. The number of IPIs Linux
> uses is pretty small, and assuming a huge value (like 32) would be
> enough. It would save looking up this value on each IPI handling.

I had kept it in case some driver wanted to create fewer (< 32)
muxed IPIs.

>
> > +     unsigned int                    parent_virq;
> > +     struct irq_domain               *domain;
> > +     const struct ipi_mux_ops        *ops;
> > +     struct ipi_mux_cpu __percpu     *cpu;
> > +};
> > +
> > +static struct ipi_mux_control *imux;
> > +static DEFINE_STATIC_KEY_FALSE(imux_pre_handle);
> > +static DEFINE_STATIC_KEY_FALSE(imux_post_handle);
> > +
> > +static void ipi_mux_mask(struct irq_data *d)
> > +{
> > +     struct ipi_mux_cpu *icpu = this_cpu_ptr(imux->cpu);
> > +
> > +     atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
> > +}
> > +
> > +static void ipi_mux_unmask(struct irq_data *d)
> > +{
> > +     u32 ibit = BIT(irqd_to_hwirq(d));
> > +     struct ipi_mux_cpu *icpu = this_cpu_ptr(imux->cpu);
> > +
> > +     atomic_or(ibit, &icpu->enable);
> > +
> > +     /*
> > +      * The atomic_or() above must complete before the atomic_read()
> > +      * below to avoid racing ipi_mux_send_mask().
> > +      */
> > +     smp_mb__after_atomic();
> > +
> > +     /* If a pending IPI was unmasked, raise a parent IPI immediately. */
> > +     if (atomic_read(&icpu->bits) & ibit)
> > +             imux->ops->ipi_mux_send(imux->parent_virq, imux->data,
> > +                                     cpumask_of(smp_processor_id()));
> > +}
> > +
> > +static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
> > +{
> > +     u32 ibit = BIT(irqd_to_hwirq(d));
> > +     struct ipi_mux_cpu *icpu = this_cpu_ptr(imux->cpu);
> > +     struct cpumask *send_mask = &icpu->send_mask;
> > +     unsigned long flags;
> > +     int cpu;
> > +
> > +     /*
> > +      * We use send_mask as a per-CPU variable so disable local
> > +      * interrupts to avoid being preempted.
> > +      */
> > +     local_irq_save(flags);
> > +
> > +     cpumask_clear(send_mask);
> > +
> > +     for_each_cpu(cpu, mask) {
> > +             icpu = per_cpu_ptr(imux->cpu, cpu);
> > +             atomic_or(ibit, &icpu->bits);
> > +
> > +             /*
> > +              * The atomic_or() above must complete before
> > +              * the atomic_read() below to avoid racing with
> > +              * ipi_mux_unmask().
> > +              */
> > +             smp_mb__after_atomic();
> > +
> > +             if (atomic_read(&icpu->enable) & ibit)
> > +                     cpumask_set_cpu(cpu, send_mask);
> > +     }
> > +
> > +     /* Trigger the parent IPI */
> > +     imux->ops->ipi_mux_send(imux->parent_virq, imux->data, send_mask);
> > +
> > +     local_irq_restore(flags);
> > +}
> > +
> > +static const struct irq_chip ipi_mux_chip = {
> > +     .name           = "IPI Mux",
> > +     .irq_mask       = ipi_mux_mask,
> > +     .irq_unmask     = ipi_mux_unmask,
> > +     .ipi_send_mask  = ipi_mux_send_mask,
> > +};
>
> I really think this could either be supplied by the irqchip, or
> somehow patched to avoid the pointless imux->ops->ipi_mux_send
> indirection. Pointer chasing hurts.

Once we remove ipi_mux_pre/post_handle() callbacks, the
"ops" will be pointless and we will be able to remove one level
of indirection here.

We certainly need a mux irqchip to implement the
mask/unmask semantics for muxed IPIs.

>
> > +
> > +static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
> > +                             unsigned int nr_irqs, void *arg)
> > +{
> > +     int i;
> > +
> > +     for (i = 0; i < nr_irqs; i++) {
> > +             irq_set_percpu_devid(virq + i);
> > +             irq_domain_set_info(d, virq + i, i,
> > +                                 &ipi_mux_chip, d->host_data,
> > +                                 handle_percpu_devid_irq, NULL, NULL);
> > +     }
> > +
> > +     return 0;
> > +}
> > +
> > +static const struct irq_domain_ops ipi_mux_domain_ops = {
> > +     .alloc          = ipi_mux_domain_alloc,
> > +     .free           = irq_domain_free_irqs_top,
> > +};
> > +
> > +/**
> > + * ipi_mux_process - Process multiplexed virtual IPIs
> > + */
> > +void ipi_mux_process(void)
> > +{
> > +     struct ipi_mux_cpu *icpu = this_cpu_ptr(imux->cpu);
> > +     irq_hw_number_t hwirq;
> > +     unsigned long ipis;
> > +     int en;
>
> Please use an unsigned type here (see the rationale in atomic_t.txt).

Sure, I will update.

>
> > +
> > +     if (static_branch_unlikely(&imux_pre_handle))
> > +             imux->ops->ipi_mux_pre_handle(imux->parent_virq, imux->data);
> > +
> > +     /*
> > +      * Reading enable mask does not need to be ordered as long as
> > +      * this function called from interrupt handler because only
> > +      * the CPU itself can change it's own enable mask.
> > +      */
> > +     en = atomic_read(&icpu->enable);
> > +
> > +     /*
> > +      * Clear the IPIs we are about to handle. This pairs with the
> > +      * atomic_fetch_or_release() in ipi_mux_send_mask().
> > +      */
> > +     ipis = atomic_fetch_andnot(en, &icpu->bits) & en;
> > +
> > +     for_each_set_bit(hwirq, &ipis, imux->nr)
> > +             generic_handle_domain_irq(imux->domain, hwirq);
> > +
> > +     if (static_branch_unlikely(&imux_post_handle))
> > +             imux->ops->ipi_mux_post_handle(imux->parent_virq, imux->data);
>
> Do you see what I meant about the {pre,post}_handle callback, and how
> they are *exactly* like irq_{ack,eoi}?

I see. I will remove these in the next patch revision.

>
> > +}
> > +
> > +static void ipi_mux_handler(struct irq_desc *desc)
> > +{
> > +     struct irq_chip *chip = irq_desc_get_chip(desc);
> > +
> > +     chained_irq_enter(chip, desc);
> > +     ipi_mux_process();
> > +     chained_irq_exit(chip, desc);
> > +}
> > +
> > +/**
> > + * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
> > + * parent IPI.
> > + * @parent_virq:     virq of the parent per-CPU IRQ
> > + * @nr_ipi:          number of virtual IPIs to create. This should
> > + *                   be <= BITS_PER_TYPE(int)
> > + * @ops:             multiplexing operations for the parent IPI
> > + * @data:            opaque data used by the multiplexing operations
>
> What is the use for data? If anything, that data should be passed via
> the mux interrupt. But the whole point of this is to make the mux
> invisible. So this whole 'data' business is a mystery to me.

This is added only to pass back driver data in ipi_mux_send().

>
> > + *
> > + * If the parent IPI > 0 then ipi_mux_process() will be automatically
> > + * called via chained handler.
> > + *
> > + * If the parent IPI <= 0 then it is responsibility of irqchip drivers
> > + * to explicitly call ipi_mux_process() for processing muxed IPIs.
>
> 0 is a much more idiomatic value for "no parent IRQ". < 0 normally
> represents an error.

I am going to remove the "parent_virq" parameter itself so this
will go away.

>
> > + *
> > + * Returns first virq of the newly created virtual IPIs upon success
> > + * or <=0 upon failure
> > + */
> > +int ipi_mux_create(unsigned int parent_virq, unsigned int nr_ipi,
>
> Tell me how I can express a negative parent_virq here?

Sure, I will fix this.

>
> > +                const struct ipi_mux_ops *ops, void *data)
> > +{
> > +     struct fwnode_handle *fwnode;
> > +     struct irq_domain *domain;
> > +     int rc;
> > +
> > +     if (imux)
> > +             return -EEXIST;
> > +
> > +     if (BITS_PER_TYPE(int) < nr_ipi || !ops || !ops->ipi_mux_send)
> > +             return -EINVAL;
> > +
> > +     if (parent_virq &&
> > +         !irqd_is_per_cpu(irq_desc_get_irq_data(irq_to_desc(parent_virq))))
> > +             return -EINVAL;
>
> See how buggy this is if I follow your definition of "no parent IRQ"?

Sure, I will fix this.

>
>         M.
>
> --
> Without deviation from the norm, progress is not possible.

Thanks,
Anup
  
Marc Zyngier Nov. 26, 2022, 2:28 p.m. UTC | #3
On Sat, 26 Nov 2022 13:31:46 +0000,
Anup Patel <apatel@ventanamicro.com> wrote:
> 
> On Sat, Nov 26, 2022 at 6:12 PM Marc Zyngier <maz@kernel.org> wrote:
> >
> > On Mon, 14 Nov 2022 09:39:00 +0000,
> > Anup Patel <apatel@ventanamicro.com> wrote:
> > >
> > > +struct ipi_mux_control {
> > > +     void                            *data;
> > > +     unsigned int                    nr;
> >
> > Honestly, I think we can get rid of this. The number of IPIs Linux
> > uses is pretty small, and assuming a huge value (like 32) would be
> > enough. It would save looking up this value on each IPI handling.
> 
> I had kept in-case some driver wanted to create fewer (< 32)
> muxed IPIs.

I'm fine with being able to specify the max, but I'm not sure there
is a need to keep track of it any further. Certainly, the overhead of
loading this value on each IPI could be removed. On most architectures,
for_each_set_bit() and co are better optimised with a fixed number of
bits.
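
A sketch of what the hot path could look like with a fixed upper bound
(the function name and its parameters are illustrative):

/* Illustrative rework of the scan in ipi_mux_process() with a fixed bound. */
static void ipi_mux_scan(struct ipi_mux_cpu *icpu, struct irq_domain *domain)
{
	irq_hw_number_t hwirq;
	unsigned long ipis;
	unsigned int en;

	en = atomic_read(&icpu->enable);
	ipis = atomic_fetch_andnot(en, &icpu->bits) & en;

	/*
	 * BITS_PER_TYPE(int) is a compile-time constant, so the walk can
	 * be reduced to a single-word bit scan instead of reloading a
	 * runtime imux->nr on every IPI.
	 */
	for_each_set_bit(hwirq, &ipis, BITS_PER_TYPE(int))
		generic_handle_domain_irq(domain, hwirq);
}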

> > > +static const struct irq_chip ipi_mux_chip = {
> > > +     .name           = "IPI Mux",
> > > +     .irq_mask       = ipi_mux_mask,
> > > +     .irq_unmask     = ipi_mux_unmask,
> > > +     .ipi_send_mask  = ipi_mux_send_mask,
> > > +};
> >
> > I really think this could either be supplied by the irqchip, or
> > somehow patched to avoid the pointless imux->ops->ipi_mux_send
> > indirection. Pointer chasing hurts.
> 
> Once we remove ipi_mux_pre/post_handle() callbacks, the
> "ops" will be pointless and we will be able to remove one level
> of indirection here.
> 
> We certainly need a mux irqchip to implement the
> mask/unmask semantics for muxed IPIs.

I'm not disputing that last point.

> > > +/**
> > > + * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
> > > + * parent IPI.
> > > + * @parent_virq:     virq of the parent per-CPU IRQ
> > > + * @nr_ipi:          number of virtual IPIs to create. This should
> > > + *                   be <= BITS_PER_TYPE(int)
> > > + * @ops:             multiplexing operations for the parent IPI
> > > + * @data:            opaque data used by the multiplexing operations
> >
> > What is the use for data? If anything, that data should be passed via
> > the mux interrupt. But the whole point of this is to make the mux
> > invisible. So this whole 'data' business is a mystery to me.
> 
> This is added only to pass back driver data in ipi_mux_send().

Again, what is the purpose of such data? If you need per-interrupt
data, this should be provided by the requester of the interrupt.
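For example, a requester of one of the muxed per-CPU IPIs can already
attach its own data via request_percpu_irq(), without the mux layer
knowing about it (struct demo_ipi_state and the demo_* names below are
illustrative):

struct demo_ipi_state {
	unsigned long count;
};

static DEFINE_PER_CPU(struct demo_ipi_state, demo_ipi_state);

static irqreturn_t demo_ipi_handler(int irq, void *dev_id)
{
	struct demo_ipi_state *st = dev_id;	/* this CPU's instance */

	st->count++;
	return IRQ_HANDLED;
}

static int demo_ipi_request(unsigned int ipi_virq)
{
	/*
	 * The per-CPU dev_id is the requester's own data; the core hands
	 * each CPU its own instance when the handler runs. Each CPU still
	 * calls enable_percpu_irq() before use.
	 */
	return request_percpu_irq(ipi_virq, demo_ipi_handler, "demo-ipi",
				  &demo_ipi_state);
}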

	M.
  
Anup Patel Nov. 26, 2022, 4 p.m. UTC | #4
On Sat, Nov 26, 2022 at 7:58 PM Marc Zyngier <maz@kernel.org> wrote:
>
> On Sat, 26 Nov 2022 13:31:46 +0000,
> Anup Patel <apatel@ventanamicro.com> wrote:
> >
> > On Sat, Nov 26, 2022 at 6:12 PM Marc Zyngier <maz@kernel.org> wrote:
> > >
> > > On Mon, 14 Nov 2022 09:39:00 +0000,
> > > Anup Patel <apatel@ventanamicro.com> wrote:
> > > >
> > > > +struct ipi_mux_control {
> > > > +     void                            *data;
> > > > +     unsigned int                    nr;
> > >
> > > Honestly, I think we can get rid of this. The number of IPIs Linux
> > > uses is pretty small, and assuming a huge value (like 32) would be
> > > enough. It would save looking up this value on each IPI handling.
> >
> > I had kept in-case some driver wanted to create fewer (< 32)
> > muxed IPIs.
>
> I'm fine with being able to specifying the max, but I'm not sure there
> is a need to keep track of it any further. Certainly, the overhead of
> loading this value on each IPI could be removed. On most architecture,
> for_each_set_bit() and co and better optimised with a fixed number of
> bits.

Okay, I will update like you suggested.

>
> > > > +static const struct irq_chip ipi_mux_chip = {
> > > > +     .name           = "IPI Mux",
> > > > +     .irq_mask       = ipi_mux_mask,
> > > > +     .irq_unmask     = ipi_mux_unmask,
> > > > +     .ipi_send_mask  = ipi_mux_send_mask,
> > > > +};
> > >
> > > I really think this could either be supplied by the irqchip, or
> > > somehow patched to avoid the pointless imux->ops->ipi_mux_send
> > > indirection. Pointer chasing hurts.
> >
> > Once we remove ipi_mux_pre/post_handle() callbacks, the
> > "ops" will be pointless and we will be able to remove one level
> > of indirection here.
> >
> > We certainly need a mux irqchip to implement the
> > mask/unmask semantics for muxed IPIs.
>
> I'm not disputing that last point.
>
> > > > +/**
> > > > + * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
> > > > + * parent IPI.
> > > > + * @parent_virq:     virq of the parent per-CPU IRQ
> > > > + * @nr_ipi:          number of virtual IPIs to create. This should
> > > > + *                   be <= BITS_PER_TYPE(int)
> > > > + * @ops:             multiplexing operations for the parent IPI
> > > > + * @data:            opaque data used by the multiplexing operations
> > >
> > > What is the use for data? If anything, that data should be passed via
> > > the mux interrupt. But the whole point of this is to make the mux
> > > invisible. So this whole 'data' business is a mystery to me.
> >
> > This is added only to pass back driver data in ipi_mux_send().
>
> Again, what is the purpose of such data? If you need per-interrupt
> data, this should be provided by the requester of the interrupt.

Currently, the irqchip drivers that we care about don't need this
data pointer so I will remove it. If required we can add it in future.

>
>         M.
>
> --
> Without deviation from the norm, progress is not possible.

Thanks,
Anup
  

Patch

diff --git a/include/linux/irq.h b/include/linux/irq.h
index c3eb89606c2b..5ab702cb0a5b 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -1266,6 +1266,24 @@  int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest);
 int ipi_send_single(unsigned int virq, unsigned int cpu);
 int ipi_send_mask(unsigned int virq, const struct cpumask *dest);
 
+/**
+ * struct ipi_mux_ops - IPI multiplex operations
+ *
+ * @ipi_mux_pre_handle:	Optional function called before handling parent IPI
+ * @ipi_mux_post_handle:Optional function called after handling parent IPI
+ * @ipi_mux_send:	Trigger parent IPI on target CPUs
+ */
+struct ipi_mux_ops {
+	void (*ipi_mux_pre_handle)(unsigned int parent_virq, void *data);
+	void (*ipi_mux_post_handle)(unsigned int parent_virq, void *data);
+	void (*ipi_mux_send)(unsigned int parent_virq, void *data,
+			     const struct cpumask *mask);
+};
+
+void ipi_mux_process(void);
+int ipi_mux_create(unsigned int parent_virq, unsigned int nr_ipi,
+		   const struct ipi_mux_ops *ops, void *data);
+
 #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER
 /*
  * Registers a generic IRQ handling function as the top-level IRQ handler in
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index db3d174c53d4..df17dbc54b02 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -86,6 +86,11 @@  config GENERIC_IRQ_IPI
 	depends on SMP
 	select IRQ_DOMAIN_HIERARCHY
 
+# Generic IRQ IPI Mux support
+config GENERIC_IRQ_IPI_MUX
+	bool
+	depends on SMP
+
 # Generic MSI interrupt support
 config GENERIC_MSI_IRQ
 	bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index b4f53717d143..f19d3080bf11 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -15,6 +15,7 @@  obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
 obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
 obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
+obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
 obj-$(CONFIG_SMP) += affinity.o
 obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
 obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
new file mode 100644
index 000000000000..259e00366dd7
--- /dev/null
+++ b/kernel/irq/ipi-mux.c
@@ -0,0 +1,268 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Multiplex several virtual IPIs over a single HW IPI.
+ *
+ * Copyright The Asahi Linux Contributors
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "ipi-mux: " fmt
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqdomain.h>
+#include <linux/jump_label.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+
+struct ipi_mux_cpu {
+	atomic_t			enable;
+	atomic_t			bits;
+	struct cpumask			send_mask;
+};
+
+struct ipi_mux_control {
+	void				*data;
+	unsigned int			nr;
+	unsigned int			parent_virq;
+	struct irq_domain		*domain;
+	const struct ipi_mux_ops	*ops;
+	struct ipi_mux_cpu __percpu	*cpu;
+};
+
+static struct ipi_mux_control *imux;
+static DEFINE_STATIC_KEY_FALSE(imux_pre_handle);
+static DEFINE_STATIC_KEY_FALSE(imux_post_handle);
+
+static void ipi_mux_mask(struct irq_data *d)
+{
+	struct ipi_mux_cpu *icpu = this_cpu_ptr(imux->cpu);
+
+	atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
+}
+
+static void ipi_mux_unmask(struct irq_data *d)
+{
+	u32 ibit = BIT(irqd_to_hwirq(d));
+	struct ipi_mux_cpu *icpu = this_cpu_ptr(imux->cpu);
+
+	atomic_or(ibit, &icpu->enable);
+
+	/*
+	 * The atomic_or() above must complete before the atomic_read()
+	 * below to avoid racing ipi_mux_send_mask().
+	 */
+	smp_mb__after_atomic();
+
+	/* If a pending IPI was unmasked, raise a parent IPI immediately. */
+	if (atomic_read(&icpu->bits) & ibit)
+		imux->ops->ipi_mux_send(imux->parent_virq, imux->data,
+					cpumask_of(smp_processor_id()));
+}
+
+static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
+{
+	u32 ibit = BIT(irqd_to_hwirq(d));
+	struct ipi_mux_cpu *icpu = this_cpu_ptr(imux->cpu);
+	struct cpumask *send_mask = &icpu->send_mask;
+	unsigned long flags;
+	int cpu;
+
+	/*
+	 * We use send_mask as a per-CPU variable so disable local
+	 * interrupts to avoid being preempted.
+	 */
+	local_irq_save(flags);
+
+	cpumask_clear(send_mask);
+
+	for_each_cpu(cpu, mask) {
+		icpu = per_cpu_ptr(imux->cpu, cpu);
+		atomic_or(ibit, &icpu->bits);
+
+		/*
+		 * The atomic_or() above must complete before
+		 * the atomic_read() below to avoid racing with
+		 * ipi_mux_unmask().
+		 */
+		smp_mb__after_atomic();
+
+		if (atomic_read(&icpu->enable) & ibit)
+			cpumask_set_cpu(cpu, send_mask);
+	}
+
+	/* Trigger the parent IPI */
+	imux->ops->ipi_mux_send(imux->parent_virq, imux->data, send_mask);
+
+	local_irq_restore(flags);
+}
+
+static const struct irq_chip ipi_mux_chip = {
+	.name		= "IPI Mux",
+	.irq_mask	= ipi_mux_mask,
+	.irq_unmask	= ipi_mux_unmask,
+	.ipi_send_mask	= ipi_mux_send_mask,
+};
+
+static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
+				unsigned int nr_irqs, void *arg)
+{
+	int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_set_percpu_devid(virq + i);
+		irq_domain_set_info(d, virq + i, i,
+				    &ipi_mux_chip, d->host_data,
+				    handle_percpu_devid_irq, NULL, NULL);
+	}
+
+	return 0;
+}
+
+static const struct irq_domain_ops ipi_mux_domain_ops = {
+	.alloc		= ipi_mux_domain_alloc,
+	.free		= irq_domain_free_irqs_top,
+};
+
+/**
+ * ipi_mux_process - Process multiplexed virtual IPIs
+ */
+void ipi_mux_process(void)
+{
+	struct ipi_mux_cpu *icpu = this_cpu_ptr(imux->cpu);
+	irq_hw_number_t hwirq;
+	unsigned long ipis;
+	int en;
+
+	if (static_branch_unlikely(&imux_pre_handle))
+		imux->ops->ipi_mux_pre_handle(imux->parent_virq, imux->data);
+
+	/*
+	 * Reading enable mask does not need to be ordered as long as
+	 * this function called from interrupt handler because only
+	 * the CPU itself can change it's own enable mask.
+	 */
+	en = atomic_read(&icpu->enable);
+
+	/*
+	 * Clear the IPIs we are about to handle. This pairs with the
+	 * atomic_fetch_or_release() in ipi_mux_send_mask().
+	 */
+	ipis = atomic_fetch_andnot(en, &icpu->bits) & en;
+
+	for_each_set_bit(hwirq, &ipis, imux->nr)
+		generic_handle_domain_irq(imux->domain, hwirq);
+
+	if (static_branch_unlikely(&imux_post_handle))
+		imux->ops->ipi_mux_post_handle(imux->parent_virq, imux->data);
+}
+
+static void ipi_mux_handler(struct irq_desc *desc)
+{
+	struct irq_chip *chip = irq_desc_get_chip(desc);
+
+	chained_irq_enter(chip, desc);
+	ipi_mux_process();
+	chained_irq_exit(chip, desc);
+}
+
+/**
+ * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
+ * parent IPI.
+ * @parent_virq:	virq of the parent per-CPU IRQ
+ * @nr_ipi:		number of virtual IPIs to create. This should
+ *			be <= BITS_PER_TYPE(int)
+ * @ops:		multiplexing operations for the parent IPI
+ * @data:		opaque data used by the multiplexing operations
+ *
+ * If the parent IPI > 0 then ipi_mux_process() will be automatically
+ * called via chained handler.
+ *
+ * If the parent IPI <= 0 then it is responsibility of irqchip drivers
+ * to explicitly call ipi_mux_process() for processing muxed IPIs.
+ *
+ * Returns first virq of the newly created virtual IPIs upon success
+ * or <=0 upon failure
+ */
+int ipi_mux_create(unsigned int parent_virq, unsigned int nr_ipi,
+		   const struct ipi_mux_ops *ops, void *data)
+{
+	struct fwnode_handle *fwnode;
+	struct irq_domain *domain;
+	int rc;
+
+	if (imux)
+		return -EEXIST;
+
+	if (BITS_PER_TYPE(int) < nr_ipi || !ops || !ops->ipi_mux_send)
+		return -EINVAL;
+
+	if (parent_virq &&
+	    !irqd_is_per_cpu(irq_desc_get_irq_data(irq_to_desc(parent_virq))))
+		return -EINVAL;
+
+	imux = kzalloc(sizeof(*imux), GFP_KERNEL);
+	if (!imux)
+		return -ENOMEM;
+
+	imux->cpu = alloc_percpu(typeof(*imux->cpu));
+	if (!imux->cpu) {
+		rc = -ENOMEM;
+		goto fail_free_mux;
+	}
+
+	fwnode = irq_domain_alloc_named_fwnode("IPI-Mux");
+	if (!fwnode) {
+		pr_err("unable to create IPI Mux fwnode\n");
+		rc = -ENOMEM;
+		goto fail_free_cpu;
+	}
+
+	domain = irq_domain_create_simple(fwnode, nr_ipi, 0,
+					  &ipi_mux_domain_ops, NULL);
+	if (!domain) {
+		pr_err("unable to add IPI Mux domain\n");
+		rc = -ENOMEM;
+		goto fail_free_fwnode;
+	}
+
+	domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE;
+	irq_domain_update_bus_token(domain, DOMAIN_BUS_IPI);
+
+	rc = __irq_domain_alloc_irqs(domain, -1, nr_ipi,
+				     NUMA_NO_NODE, NULL, false, NULL);
+	if (rc <= 0) {
+		pr_err("unable to alloc IRQs from IPI Mux domain\n");
+		goto fail_free_domain;
+	}
+
+	imux->domain = domain;
+	imux->data = data;
+	imux->nr = nr_ipi;
+	imux->parent_virq = parent_virq;
+	imux->ops = ops;
+
+	if (imux->ops->ipi_mux_pre_handle)
+		static_branch_enable(&imux_pre_handle);
+
+	if (imux->ops->ipi_mux_post_handle)
+		static_branch_enable(&imux_post_handle);
+
+	if (parent_virq > 0)
+		irq_set_chained_handler(parent_virq, ipi_mux_handler);
+
+	return rc;
+
+fail_free_domain:
+	irq_domain_remove(domain);
+fail_free_fwnode:
+	irq_domain_free_fwnode(fwnode);
+fail_free_cpu:
+	free_percpu(imux->cpu);
+fail_free_mux:
+	kfree(imux);
+	imux = NULL;
+	return rc;
+}