On Mon, 2023-02-13 at 11:43 +0100, Johan Hovold wrote:
> The IRQ domain structures are currently protected by the global
> irq_domain_mutex. Switch to using more fine-grained per-domain locking,
> which can speed up parallel probing by reducing lock contention.
>
> On a recent arm64 laptop, for example, the total time spent waiting for
> the locks during boot drops from 160 to 40 ms on average, while the
> maximum aggregate wait time drops from 550 to 90 ms over ten runs.
>
> Note that the domain lock of the root domain (innermost domain) must be
> used for hierarchical domains. For non-hierarchical domains (as for root
> domains), the new root pointer is set to the domain itself so that
> &domain->root->mutex always points to the right lock.
>
> Also note that hierarchical domains should be constructed using
> irq_domain_create_hierarchy() (or irq_domain_add_hierarchy()) to avoid
> having racing allocations access a not fully initialised domain. As a
> safeguard, the lockdep assertion in irq_domain_set_mapping() will catch
> any offenders that also fail to set the root domain pointer.
>
> Tested-by: Hsin-Yi Wang <hsinyi@chromium.org>
> Tested-by: Mark-PK Tsai <mark-pk.tsai@mediatek.com>
> Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
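Broke Xen: irq_domain_create_hierarchy() now dereferences parent->root
unconditionally, but xen_create_pci_msi_domain() ends up calling it with a
NULL parent (R12 is 0 in the oops below and the faulting load is at offset
0xc0 from it), so it needs a NULL check on parent before using its root.
And it's *so* easy to test. As long as you have a qemu master branch no
older than last Thursday, that is...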
$ qemu-system-x86_64 -serial mon:stdio -display none \
-accel kvm,xen-version=0x4000e,kernel-irqchip=split \
-kernel arch/x86/boot/bzImage -append "console=ttyS0"
...
[ 0.466554] BUG: kernel NULL pointer dereference, address: 00000000000000c0
[ 0.467249] #PF: supervisor read access in kernel mode
[ 0.467249] #PF: error_code(0x0000) - not-present page
[ 0.467249] PGD 0 P4D 0
[ 0.467249] Oops: 0000 [#1] PREEMPT SMP PTI
[ 0.467249] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.2.0-rc4+ #1206
[ 0.467249] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.1-0-g3208b098f51a-prebuilt.qemu.org 04/01/2014
[ 0.467249] RIP: 0010:irq_domain_create_hierarchy+0x2c/0x70
[ 0.467249] Code: 1e fa 0f 1f 44 00 00 41 54 49 89 fc 48 89 cf 55 89 f5 53 85 d2 74 40 89 d6 31 c9 89 d2 e8 2c fa ff ff 48 89 c3 48 85 db 74 21 <49> 8b 84 24 c0 00 00 00 09 6b 28 48 89 df 4c 89 a3 f0 00 00 00 48
[ 0.467249] RSP: 0000:ffffc90000013e60 EFLAGS: 00010286
[ 0.467249] RAX: ffff8880053a1a00 RBX: ffff8880053a1a00 RCX: 0000000000000000
[ 0.467249] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffff84828fa0
[ 0.467249] RBP: 0000000000000010 R08: 0000000000000003 R09: 0000000000000000
[ 0.467249] R10: 0000000025a89be7 R11: 00000000442a63fa R12: 0000000000000000
[ 0.467249] R13: ffffffff83ac1b98 R14: 0000000000000000 R15: 0000000000000000
[ 0.467249] FS: 0000000000000000(0000) GS:ffff888007a00000(0000) knlGS:0000000000000000
[ 0.467249] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 0.467249] CR2: 00000000000000c0 CR3: 0000000002824000 CR4: 00000000000006f0
[ 0.467249] Call Trace:
[ 0.467249] <TASK>
[ 0.467249] ? __pfx_pci_arch_init+0x10/0x10
[ 0.467249] __msi_create_irq_domain+0x85/0x170
[ 0.467249] ? __pfx_pci_arch_init+0x10/0x10
[ 0.467249] xen_create_pci_msi_domain+0x34/0x40
[ 0.467249] x86_create_pci_msi_domain+0x12/0x1e
[ 0.467249] pci_arch_init+0x31/0x7a
[ 0.467249] ? __pfx_pci_arch_init+0x10/0x10
[ 0.467249] do_one_initcall+0x5f/0x320
[ 0.467249] ? rcu_read_lock_sched_held+0x43/0x80
[ 0.467249] kernel_init_freeable+0x189/0x1c6
[ 0.467249] ? __pfx_kernel_init+0x10/0x10
[ 0.467249] kernel_init+0x1a/0x130
[ 0.467249] ret_from_fork+0x2c/0x50
[ 0.467249] </TASK>
[ 0.467249] Modules linked in:
[ 0.467249] CR2: 00000000000000c0
[ 0.467249] ---[ end trace 0000000000000000 ]---
[ 0.467249] RIP: 0010:irq_domain_create_hierarchy+0x2c/0x70
[ 0.467249] Code: 1e fa 0f 1f 44 00 00 41 54 49 89 fc 48 89 cf 55 89 f5 53 85 d2 74 40 89 d6 31 c9 89 d2 e8 2c fa ff ff 48 89 c3 48 85 db 74 21 <49> 8b 84 24 c0 00 00 00 09 6b 28 48 89 df 4c 89 a3 f0 00 00 00 48
[ 0.467249] RSP: 0000:ffffc90000013e60 EFLAGS: 00010286
[ 0.467249] RAX: ffff8880053a1a00 RBX: ffff8880053a1a00 RCX: 0000000000000000
[ 0.467249] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffff84828fa0
[ 0.467249] RBP: 0000000000000010 R08: 0000000000000003 R09: 0000000000000000
[ 0.467249] R10: 0000000025a89be7 R11: 00000000442a63fa R12: 0000000000000000
[ 0.467249] R13: ffffffff83ac1b98 R14: 0000000000000000 R15: 0000000000000000
[ 0.467249] FS: 0000000000000000(0000) GS:ffff888007a00000(0000) knlGS:0000000000000000
[ 0.467249] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 0.467249] CR2: 00000000000000c0 CR3: 0000000002824000 CR4: 00000000000006f0
[ 0.467249] Kernel panic - not syncing: Fatal exception
@@ -125,6 +125,8 @@ struct irq_domain_chip_generic;
* core code.
* @flags: Per irq_domain flags
* @mapcount: The number of mapped interrupts
+ * @mutex: Domain lock, hierarchical domains use root domain's lock
+ * @root: Pointer to root domain, or to the domain itself if non-hierarchical
*
* Optional elements:
* @fwnode: Pointer to firmware node associated with the irq_domain. Pretty easy
@@ -152,6 +154,8 @@ struct irq_domain {
void *host_data;
unsigned int flags;
unsigned int mapcount;
+ struct mutex mutex;
+ struct irq_domain *root;
/* Optional data */
struct fwnode_handle *fwnode;
@@ -215,6 +215,17 @@ static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode,
domain->revmap_size = size;
+ /*
+ * Hierarchical domains use the domain lock of the root domain
+ * (innermost domain).
+ *
+ * For non-hierarchical domains (as for root domains), the root
+ * pointer is set to the domain itself so that &domain->root->mutex
+ * always points to the right lock.
+ */
+ mutex_init(&domain->mutex);
+ domain->root = domain;
+
irq_domain_check_hierarchy(domain);
return domain;
@@ -524,7 +535,7 @@ static bool irq_domain_is_nomap(struct irq_domain *domain)
static void irq_domain_clear_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
- lockdep_assert_held(&irq_domain_mutex);
+ lockdep_assert_held(&domain->root->mutex);
if (irq_domain_is_nomap(domain))
return;
@@ -539,7 +550,11 @@ static void irq_domain_set_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq,
struct irq_data *irq_data)
{
- lockdep_assert_held(&irq_domain_mutex);
+ /*
+ * This also makes sure that all domains point to the same root when
+ * called from irq_domain_insert_irq() for each domain in a hierarchy.
+ */
+ lockdep_assert_held(&domain->root->mutex);
if (irq_domain_is_nomap(domain))
return;
@@ -561,7 +576,7 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
hwirq = irq_data->hwirq;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
irq_set_status_flags(irq, IRQ_NOREQUEST);
@@ -583,7 +598,7 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
/* Clear reverse map for this hwirq */
irq_domain_clear_mapping(domain, hwirq);
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
}
static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq,
@@ -633,9 +648,9 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
{
int ret;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
ret = irq_domain_associate_locked(domain, virq, hwirq);
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
return ret;
}
@@ -752,7 +767,7 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
return 0;
}
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
/* Check if mapping already exists */
virq = irq_find_mapping(domain, hwirq);
@@ -763,7 +778,7 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
virq = irq_create_mapping_affinity_locked(domain, hwirq, affinity);
out:
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
return virq;
}
@@ -832,7 +847,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))
type &= IRQ_TYPE_SENSE_MASK;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
/*
* If we've already configured this interrupt,
@@ -892,7 +907,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
/* Store trigger type */
irqd_set_trigger_type(irq_data, type);
out:
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
return virq;
}
@@ -1157,6 +1172,7 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
domain = __irq_domain_create(fwnode, 0, ~0, 0, ops, host_data);
if (domain) {
+ domain->root = parent->root;
domain->parent = parent;
domain->flags |= flags;
@@ -1555,10 +1571,10 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
return -EINVAL;
}
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
ret = irq_domain_alloc_irqs_locked(domain, irq_base, nr_irqs, node, arg,
realloc, affinity);
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
return ret;
}
@@ -1569,7 +1585,7 @@ static void irq_domain_fix_revmap(struct irq_data *d)
{
void __rcu **slot;
- lockdep_assert_held(&irq_domain_mutex);
+ lockdep_assert_held(&d->domain->root->mutex);
if (irq_domain_is_nomap(d->domain))
return;
@@ -1635,7 +1651,7 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
if (!parent_irq_data)
return -ENOMEM;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
/* Copy the original irq_data. */
*parent_irq_data = *irq_data;
@@ -1663,7 +1679,7 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
irq_domain_fix_revmap(parent_irq_data);
irq_domain_set_mapping(domain, irq_data->hwirq, irq_data);
error:
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
return rv;
}
@@ -1718,7 +1734,7 @@ int irq_domain_pop_irq(struct irq_domain *domain, int virq)
if (WARN_ON(!parent_irq_data))
return -EINVAL;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
irq_data->parent_data = NULL;
@@ -1730,7 +1746,7 @@ int irq_domain_pop_irq(struct irq_domain *domain, int virq)
irq_domain_fix_revmap(irq_data);
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
kfree(parent_irq_data);
@@ -1746,17 +1762,20 @@ EXPORT_SYMBOL_GPL(irq_domain_pop_irq);
void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
{
struct irq_data *data = irq_get_irq_data(virq);
+ struct irq_domain *domain;
int i;
if (WARN(!data || !data->domain || !data->domain->ops->free,
"NULL pointer, cannot free irq\n"))
return;
- mutex_lock(&irq_domain_mutex);
+ domain = data->domain;
+
+ mutex_lock(&domain->root->mutex);
for (i = 0; i < nr_irqs; i++)
irq_domain_remove_irq(virq + i);
- irq_domain_free_irqs_hierarchy(data->domain, virq, nr_irqs);
- mutex_unlock(&irq_domain_mutex);
+ irq_domain_free_irqs_hierarchy(domain, virq, nr_irqs);
+ mutex_unlock(&domain->root->mutex);
irq_domain_free_irq_data(virq, nr_irqs);
irq_free_descs(virq, nr_irqs);