iommu: Avoid softlockup and rcu stall in fq_flush_timeout().

Message ID 20230216071148.2060-1-zhangpeng.00@bytedance.com
State New
Series iommu: Avoid softlockup and rcu stall in fq_flush_timeout().

Commit Message

Peng Zhang Feb. 16, 2023, 7:11 a.m. UTC
  There is a softlockup under fio pressure test with SMMU enabled:
watchdog: BUG: soft lockup - CPU#81 stuck for 22s!  [swapper/81:0]
...
Call trace:
 fq_flush_timeout+0xc0/0x110
 call_timer_fn+0x34/0x178
 expire_timers+0xec/0x158
 run_timer_softirq+0xc0/0x1f8
 __do_softirq+0x120/0x324
 irq_exit+0x11c/0x140
 __handle_domain_irq+0x6c/0xc0
 gic_handle_irq+0x6c/0x170
 el1_irq+0xb8/0x140
 arch_cpu_idle+0x38/0x1c0
 default_idle_call+0x24/0x44
 do_idle+0x1f4/0x2d8
 cpu_startup_entry+0x2c/0x30
 secondary_start_kernel+0x17c/0x1c8

Rcu stall may also be triggered:

rcu: INFO: rcu_sched self-detected stall on CPU
NMI backtrace for cpu 21
CPU: 21 PID: 118 Comm: ksoftirqd/21
...
Call trace:
 fq_flush_timeout+0x6d/0x90
 ? fq_ring_free+0xc0/0xc0
 call_timer_fn+0x2b/0x120
 run_timer_softirq+0x1a6/0x420
 ? finish_task_switch+0x80/0x280
 __do_softirq+0xda/0x2da
 ? sort_range+0x20/0x20
 run_ksoftirqd+0x26/0x40
 smpboot_thread_fn+0xb8/0x150
 kthread+0x110/0x130
 ? __kthread_cancel_work+0x40/0x40
 ret_from_fork+0x1f/0x30

This is because the timer callback fq_flush_timeout() may run for more than
10ms, and timers may be processed back-to-back in the softirq, which triggers
the softlockup and RCU stall. Move the per-CPU fq_ring_free() processing,
which may take a long time, out of the timer callback and into a work item
to avoid triggering the softlockup and RCU stall.
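
(For illustration, a minimal sketch of the pattern being applied here;
my_cookie, my_timer_fn, my_free_work_fn and do_heavy_cleanup() are placeholder
names, not the actual dma-iommu symbols, and the real change is in the diff
below.)

#include <linux/timer.h>
#include <linux/workqueue.h>

/*
 * Illustrative sketch only: keep the timer (softirq) callback short and
 * push the long-running cleanup into process context via a work item.
 */
struct my_cookie {
        struct timer_list       timer;
        struct work_struct      free_work;
};

static void my_free_work_fn(struct work_struct *work)
{
        struct my_cookie *c = container_of(work, struct my_cookie, free_work);

        do_heavy_cleanup(c);    /* e.g. the per-CPU fq_ring_free() loop */
}

static void my_timer_fn(struct timer_list *t)
{
        struct my_cookie *c = from_timer(c, t, timer);

        schedule_work(&c->free_work);   /* defer, do not block the softirq */
}

/* setup:    INIT_WORK(&c->free_work, my_free_work_fn);
 *           timer_setup(&c->timer, my_timer_fn, 0);
 * teardown: del_timer_sync(&c->timer); cancel_work_sync(&c->free_work);
 */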

This patch is modified from the openEuler patch [1].

[1] https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/C3KYS4BXTDMVVBQNEQYNAOGOQWFFINHJ/

Signed-off-by: Li Bin <huawei.libin@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>
---
 drivers/iommu/dma-iommu.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)
  

Comments

Jerry Snitselaar April 10, 2023, 10:22 p.m. UTC | #1
On Thu, Feb 16, 2023 at 03:11:48PM +0800, Peng Zhang wrote:
> There is softlockup under fio pressure test with smmu enabled:
> watchdog: BUG: soft lockup - CPU#81 stuck for 22s!  [swapper/81:0]
> ...
> Call trace:
>  fq_flush_timeout+0xc0/0x110
>  call_timer_fn+0x34/0x178
>  expire_timers+0xec/0x158
>  run_timer_softirq+0xc0/0x1f8
>  __do_softirq+0x120/0x324
>  irq_exit+0x11c/0x140
>  __handle_domain_irq+0x6c/0xc0
>  gic_handle_irq+0x6c/0x170
>  el1_irq+0xb8/0x140
>  arch_cpu_idle+0x38/0x1c0
>  default_idle_call+0x24/0x44
>  do_idle+0x1f4/0x2d8
>  cpu_startup_entry+0x2c/0x30
>  secondary_start_kernel+0x17c/0x1c8
> 
> Rcu stall may also be triggered:
> 
> rcu: INFO: rcu_sched self-detected stall on CPU
> NMI backtrace for cpu 21
> CPU: 21 PID: 118 Comm: ksoftirqd/21
> ...
> Call trace:
>  fq_flush_timeout+0x6d/0x90
>  ? fq_ring_free+0xc0/0xc0
>  call_timer_fn+0x2b/0x120
>  run_timer_softirq+0x1a6/0x420
>  ? finish_task_switch+0x80/0x280
>  __do_softirq+0xda/0x2da
>  ? sort_range+0x20/0x20
>  run_ksoftirqd+0x26/0x40
>  smpboot_thread_fn+0xb8/0x150
>  kthread+0x110/0x130
>  ? __kthread_cancel_work+0x40/0x40
>  ret_from_fork+0x1f/0x30
> 
> This is because the timer callback fq_flush_timeout may run more than
> 10ms, and timer may be processed continuously in the softirq so trigger
> softlockup and rcu stall. We can use work to deal with fq_ring_free for
> each cpu which may take long time, that to avoid triggering softlockup
> and rcu stall.
> 
> This patch is modified from the patch[1] of openEuler.
> 

Hi Robin,

I was looking at something similar to this recently, where in this case
they were beating the heck out of the system with the Hazard I/O stress
test, and someone else with some Medusa test tool. In one case they
had them force a dump on the soft lockup. On the 384-core Genoa, 90
cores were spinning on the iovad rb-tree lock for one domain, one had it,
and the poor flush queue timer handler was having to fight everyone
for the lock. I'm not sure what would be considered a realistic workload
compared to these stressors, but could this be an issue over time as
systems continue to get more cores, since the timer handler potentially
grabs and releases the iova domain rb-tree lock for each CPU? The only
cases I know of are using I/O stressors, so I don't know how big a deal
it is.

I think soft lockups could still be produced with this patch, since
there would still be the lock contention.

Regards,
Jerry
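
For reference, the contention described here arises because every entry
drained by fq_ring_free() is handed back through free_iova_fast(), which can
fall back to the rb-tree-protected free_iova() path. A condensed sketch of
that loop, paraphrased from drivers/iommu/dma-iommu.c around v6.2 (not
verbatim; details differ between kernel versions):

static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
{
        u64 counter = atomic64_read(&cookie->fq_flush_finish_cnt);
        unsigned int idx;

        /* called with fq->lock held, once per CPU from fq_flush_timeout() */
        fq_ring_for_each(idx, fq) {
                if (fq->entries[idx].counter >= counter)
                        break;

                put_pages_list(&fq->entries[idx].freelist);
                /* per entry: may end up taking iovad->iova_rbtree_lock */
                free_iova_fast(&cookie->iovad,
                               fq->entries[idx].iova_pfn,
                               fq->entries[idx].pages);

                fq->head = (fq->head + 1) & (IOVA_FQ_SIZE - 1);
        }
}

Deferring this loop to a work item moves it out of softirq context, but it
does not change how often the rb-tree fallback is taken.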

> [1] https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/C3KYS4BXTDMVVBQNEQYNAOGOQWFFINHJ/
> 
> Signed-off-by: Li Bin <huawei.libin@huawei.com>
> Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
> Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>
> ---
>  drivers/iommu/dma-iommu.c | 33 ++++++++++++++++++++++-----------
>  1 file changed, 22 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 99b2646cb5c7..bc4c979d7091 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -59,6 +59,8 @@ struct iommu_dma_cookie {
>  			struct timer_list	fq_timer;
>  			/* 1 when timer is active, 0 when not */
>  			atomic_t		fq_timer_on;
> +			/* The work to free iova */
> +			struct work_struct free_iova_work;
>  		};
>  		/* Trivial linear page allocator for IOMMU_DMA_MSI_COOKIE */
>  		dma_addr_t		msi_iova;
> @@ -155,20 +157,10 @@ static void fq_flush_iotlb(struct iommu_dma_cookie *cookie)
>  static void fq_flush_timeout(struct timer_list *t)
>  {
>  	struct iommu_dma_cookie *cookie = from_timer(cookie, t, fq_timer);
> -	int cpu;
>  
>  	atomic_set(&cookie->fq_timer_on, 0);
>  	fq_flush_iotlb(cookie);
> -
> -	for_each_possible_cpu(cpu) {
> -		unsigned long flags;
> -		struct iova_fq *fq;
> -
> -		fq = per_cpu_ptr(cookie->fq, cpu);
> -		spin_lock_irqsave(&fq->lock, flags);
> -		fq_ring_free(cookie, fq);
> -		spin_unlock_irqrestore(&fq->lock, flags);
> -	}
> +	schedule_work(&cookie->free_iova_work);
>  }
>  
>  static void queue_iova(struct iommu_dma_cookie *cookie,
> @@ -227,6 +219,7 @@ static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie)
>  		return;
>  
>  	del_timer_sync(&cookie->fq_timer);
> +	cancel_work_sync(&cookie->free_iova_work);
>  	/* The IOVAs will be torn down separately, so just free our queued pages */
>  	for_each_possible_cpu(cpu) {
>  		struct iova_fq *fq = per_cpu_ptr(cookie->fq, cpu);
> @@ -238,6 +231,23 @@ static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie)
>  	free_percpu(cookie->fq);
>  }
>  
> +static void free_iova_work_func(struct work_struct *work)
> +{
> +	struct iommu_dma_cookie *cookie;
> +	int cpu;
> +
> +	cookie = container_of(work, struct iommu_dma_cookie, free_iova_work);
> +	for_each_possible_cpu(cpu) {
> +		unsigned long flags;
> +		struct iova_fq *fq;
> +
> +		fq = per_cpu_ptr(cookie->fq, cpu);
> +		spin_lock_irqsave(&fq->lock, flags);
> +		fq_ring_free(cookie, fq);
> +		spin_unlock_irqrestore(&fq->lock, flags);
> +	}
> +}
> +
>  /* sysfs updates are serialised by the mutex of the group owning @domain */
>  int iommu_dma_init_fq(struct iommu_domain *domain)
>  {
> @@ -271,6 +281,7 @@ int iommu_dma_init_fq(struct iommu_domain *domain)
>  
>  	cookie->fq = queue;
>  
> +	INIT_WORK(&cookie->free_iova_work, free_iova_work_func);
>  	timer_setup(&cookie->fq_timer, fq_flush_timeout, 0);
>  	atomic_set(&cookie->fq_timer_on, 0);
>  	/*
> -- 
> 2.20.1
>
  
Vasant Hegde April 18, 2023, 10:41 a.m. UTC | #2
Hi All,


On 4/11/2023 3:52 AM, Jerry Snitselaar wrote:
> On Thu, Feb 16, 2023 at 03:11:48PM +0800, Peng Zhang wrote:
>> There is softlockup under fio pressure test with smmu enabled:
>> watchdog: BUG: soft lockup - CPU#81 stuck for 22s!  [swapper/81:0]
>> ...
>> Call trace:
>>  fq_flush_timeout+0xc0/0x110
>>  call_timer_fn+0x34/0x178
>>  expire_timers+0xec/0x158
>>  run_timer_softirq+0xc0/0x1f8
>>  __do_softirq+0x120/0x324
>>  irq_exit+0x11c/0x140
>>  __handle_domain_irq+0x6c/0xc0
>>  gic_handle_irq+0x6c/0x170
>>  el1_irq+0xb8/0x140
>>  arch_cpu_idle+0x38/0x1c0
>>  default_idle_call+0x24/0x44
>>  do_idle+0x1f4/0x2d8
>>  cpu_startup_entry+0x2c/0x30
>>  secondary_start_kernel+0x17c/0x1c8
>>
>> Rcu stall may also be triggered:
>>
>> rcu: INFO: rcu_sched self-detected stall on CPU
>> NMI backtrace for cpu 21
>> CPU: 21 PID: 118 Comm: ksoftirqd/21
>> ...
>> Call trace:
>>  fq_flush_timeout+0x6d/0x90
>>  ? fq_ring_free+0xc0/0xc0
>>  call_timer_fn+0x2b/0x120
>>  run_timer_softirq+0x1a6/0x420
>>  ? finish_task_switch+0x80/0x280
>>  __do_softirq+0xda/0x2da
>>  ? sort_range+0x20/0x20
>>  run_ksoftirqd+0x26/0x40
>>  smpboot_thread_fn+0xb8/0x150
>>  kthread+0x110/0x130
>>  ? __kthread_cancel_work+0x40/0x40
>>  ret_from_fork+0x1f/0x30
>>
>> This is because the timer callback fq_flush_timeout may run more than
>> 10ms, and timer may be processed continuously in the softirq so trigger
>> softlockup and rcu stall. We can use work to deal with fq_ring_free for
>> each cpu which may take long time, that to avoid triggering softlockup
>> and rcu stall.
>>
>> This patch is modified from the patch[1] of openEuler.
>>
> 
> Hi Robin,
> 
> I was looking at something similar to this recently were in this case
> they were beating the heck out the system with the hazard io stress
> test, and someone else with some medusa test tool. In one case they
> had them force a dump on the soft lockup. On the 384 core genoa, 90
> cores were spinning the iovad rb tree lock for one domain, 1 had it,
> and the poor flush queue timer handler was having to fight everyone
> for the lock. I'm not sure what would be considered a realistic workload
> compared to these stressors, but could this be an issue over time as
> systems continue to get more cores since the timer handler potentially
> grabs and releases the iova domain rb tree lock for each cpu? The only
> cases I know of are using io stressors, so I don't know how big a deal
> it is.
> 
> I think soft lockups could still be produced with this patch, since
> there would still be the lock contention.

Right. We hit this issue when running stress-ng on a 384-CPU system with an
NVMe disk. I tried this patch and it improved the situation, but we still hit
a soft lockup after an hour or so.

Backtrace with this patch:

[ 4551.470797] watchdog: BUG: soft lockup - CPU#328 stuck for 30s!
[kworker/328:1:2710]
[ 4551.479441] Modules linked in: nvme_fabrics(E) intel_rapl_msr(E)
intel_rapl_common(E) amd64_edac(E) edac_mce_amd(E) kvm_amd(E) kvm(E)
crct10dif_pclmul(E) ghash_clmulni_intel(E) sha512_ssse3(E) aesni_intel(E)
crypto_simd(E) cryptd(E) rapl(E) wmi_bmof(E) binfmt_misc(E) ipmi_ssif(E)
nls_iso8859_1(E) ast(E) drm_shmem_helper(E) drm_kms_helper(E) i2c_algo_bit(E)
syscopyarea(E) sysfillrect(E) sysimgblt(E) acpi_ipmi(E) ccp(E) k10temp(E)
ipmi_si(E) ipmi_devintf(E) ipmi_msghandler(E) mac_hid(E) sch_fq_codel(E) msr(E)
parport_pc(E) ppdev(E) lp(E) parport(E) ramoops(E) reed_solomon(E) pstore_blk(E)
pstore_zone(E) efi_pstore(E) drm(E) ip_tables(E) x_tables(E) autofs4(E)
crc32_pclmul(E) nvme(E) ahci(E) xhci_pci(E) tg3(E) nvme_core(E) libahci(E)
i2c_piix4(E) xhci_pci_renesas(E) wmi(E)
[ 4551.479475] CPU: 328 PID: 2710 Comm: kworker/328:1 Kdump: loaded Tainted: G
          E      6.2.9schd_work+ #1
[ 4551.479477] Hardware name: AMD Corporation Genoa, BIOS RTI 03/20/2023
[ 4551.479479] Workqueue: events free_iova_work_func
[ 4551.479484] RIP: 0010:_raw_spin_unlock_irqrestore+0x21/0x60
[ 4551.479488] Code: 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 55 49 89 f0 48 89
e5 c6 07 00 0f 1f 00 41 f7 c0 00 02 00 00 74 06 fb 0f 1f 44 00 00 <65> ff 0d 50
f8 fc 7d 74 13 5d 31 c0 89 c2 89 c1 89 c6 89 c7 41 89
[ 4551.479489] RSP: 0018:ffa0000092e07e30 EFLAGS: 00000206
[ 4551.479491] RAX: 0000000000000000 RBX: 00000000000000f2 RCX: 0000000000000000
[ 4551.479492] RDX: 0000000000000000 RSI: 0000000000000282 RDI: ffd1ff3cf82be060
[ 4551.479492] RBP: ffa0000092e07e30 R08: 0000000000000282 R09: 0000000000000000
[ 4551.479493] R10: 0000000000000000 R11: 0000000000000000 R12: ff1100010d2af6d0
[ 4551.479493] R13: ffd1ff3cf82be060 R14: ffd1ff3cf82bb858 R15: 0000000000000282
[ 4551.479494] FS:  0000000000000000(0000) GS:ff11018056200000(0000)
knlGS:0000000000000000
[ 4551.479495] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 4551.479496] CR2: 00007ffd392f5f28 CR3: 0000000152f06004 CR4: 0000000000771ee0
[ 4551.479497] PKRU: 55555554
[ 4551.479497] Call Trace:
[ 4551.479498]  <TASK>
[ 4551.479502]  free_iova_work_func+0x6a/0xc0
[ 4551.479505]  process_one_work+0x21f/0x440
[ 4551.479507]  worker_thread+0x50/0x3f0
[ 4551.479509]  ? __pfx_worker_thread+0x10/0x10
[ 4551.479510]  kthread+0xee/0x120
[ 4551.479513]  ? __pfx_kthread+0x10/0x10
[ 4551.479514]  ret_from_fork+0x2c/0x50
[ 4551.479518]  </TASK>

I made a slight modification on top of this patch (schedule the work on each
CPU) and it worked for me.

@Robin, Joerg,
  Is something like the below (schedule work on each CPU to free the IOVAs
 via fq_ring_free()) acceptable?

-Vasant


------

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7aed15e53f10..eb52814e67e5 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -157,10 +157,12 @@ static void fq_flush_iotlb(struct iommu_dma_cookie *cookie)
 static void fq_flush_timeout(struct timer_list *t)
 {
 	struct iommu_dma_cookie *cookie = from_timer(cookie, t, fq_timer);
+	int cpu;

 	atomic_set(&cookie->fq_timer_on, 0);
 	fq_flush_iotlb(cookie);
-	schedule_work(&cookie->free_iova_work);
+	for_each_possible_cpu(cpu)
+		schedule_work_on(cpu, &cookie->free_iova_work);
 }

 static void queue_iova(struct iommu_dma_cookie *cookie,
@@ -235,17 +237,15 @@ static void free_iova_work_func(struct work_struct *work)
 {
 	struct iommu_dma_cookie *cookie;
 	int cpu;
+	unsigned long flags;
+	struct iova_fq *fq;

 	cookie = container_of(work, struct iommu_dma_cookie, free_iova_work);
-	for_each_possible_cpu(cpu) {
-		unsigned long flags;
-		struct iova_fq *fq;
+	fq = this_cpu_ptr(cookie->fq);

-		fq = per_cpu_ptr(cookie->fq, cpu);
-		spin_lock_irqsave(&fq->lock, flags);
-		fq_ring_free(cookie, fq);
-		spin_unlock_irqrestore(&fq->lock, flags);
-	}
+	spin_lock_irqsave(&fq->lock, flags);
+	fq_ring_free(cookie, fq);
+	spin_unlock_irqrestore(&fq->lock, flags);
 }

 /* sysfs updates are serialised by the mutex of the group owning @domain */

---------
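
One caveat about the diff above, if I am reading the workqueue code correctly:
a single work_struct can only be pending once, so the schedule_work_on() loop
will not queue an independent instance on every CPU in the same round. If the
intent is one worker per CPU, each CPU would need its own work item; a rough
sketch under that assumption (the per-CPU allocation is hypothetical, not part
of the posted patch):

/*
 * Hypothetical variant: one work item per CPU so that a queue attempt on one
 * CPU is not dropped because the same item is still pending on another.
 */
struct fq_free_work {
        struct work_struct      work;
        struct iommu_dma_cookie *cookie;
};

/* in struct iommu_dma_cookie:  struct fq_free_work __percpu *free_iova_work;
 * set up once with alloc_percpu() plus INIT_WORK() for each possible CPU.
 */

static void free_iova_work_func(struct work_struct *work)
{
        struct fq_free_work *fw = container_of(work, struct fq_free_work, work);
        struct iova_fq *fq = this_cpu_ptr(fw->cookie->fq);
        unsigned long flags;

        spin_lock_irqsave(&fq->lock, flags);
        fq_ring_free(fw->cookie, fq);
        spin_unlock_irqrestore(&fq->lock, flags);
}

static void fq_flush_timeout(struct timer_list *t)
{
        struct iommu_dma_cookie *cookie = from_timer(cookie, t, fq_timer);
        int cpu;

        atomic_set(&cookie->fq_timer_on, 0);
        fq_flush_iotlb(cookie);

        for_each_possible_cpu(cpu)
                schedule_work_on(cpu, &per_cpu_ptr(cookie->free_iova_work, cpu)->work);
}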
  
Vasant Hegde April 28, 2023, 5:44 a.m. UTC | #3
Hello Robin, Joerg,

Ping. Any suggestions on the proposal below (schedule work on each CPU to free IOVAs)?

-Vasant



On 4/18/2023 4:11 PM, Vasant Hegde wrote:
> Hi All,
> 
> 
> On 4/11/2023 3:52 AM, Jerry Snitselaar wrote:
>> On Thu, Feb 16, 2023 at 03:11:48PM +0800, Peng Zhang wrote:
>>> There is softlockup under fio pressure test with smmu enabled:
>>> watchdog: BUG: soft lockup - CPU#81 stuck for 22s!  [swapper/81:0]
>>> ...
>>> Call trace:
>>>  fq_flush_timeout+0xc0/0x110
>>>  call_timer_fn+0x34/0x178
>>>  expire_timers+0xec/0x158
>>>  run_timer_softirq+0xc0/0x1f8
>>>  __do_softirq+0x120/0x324
>>>  irq_exit+0x11c/0x140
>>>  __handle_domain_irq+0x6c/0xc0
>>>  gic_handle_irq+0x6c/0x170
>>>  el1_irq+0xb8/0x140
>>>  arch_cpu_idle+0x38/0x1c0
>>>  default_idle_call+0x24/0x44
>>>  do_idle+0x1f4/0x2d8
>>>  cpu_startup_entry+0x2c/0x30
>>>  secondary_start_kernel+0x17c/0x1c8
>>>
>>> Rcu stall may also be triggered:
>>>
>>> rcu: INFO: rcu_sched self-detected stall on CPU
>>> NMI backtrace for cpu 21
>>> CPU: 21 PID: 118 Comm: ksoftirqd/21
>>> ...
>>> Call trace:
>>>  fq_flush_timeout+0x6d/0x90
>>>  ? fq_ring_free+0xc0/0xc0
>>>  call_timer_fn+0x2b/0x120
>>>  run_timer_softirq+0x1a6/0x420
>>>  ? finish_task_switch+0x80/0x280
>>>  __do_softirq+0xda/0x2da
>>>  ? sort_range+0x20/0x20
>>>  run_ksoftirqd+0x26/0x40
>>>  smpboot_thread_fn+0xb8/0x150
>>>  kthread+0x110/0x130
>>>  ? __kthread_cancel_work+0x40/0x40
>>>  ret_from_fork+0x1f/0x30
>>>
>>> This is because the timer callback fq_flush_timeout may run more than
>>> 10ms, and timer may be processed continuously in the softirq so trigger
>>> softlockup and rcu stall. We can use work to deal with fq_ring_free for
>>> each cpu which may take long time, that to avoid triggering softlockup
>>> and rcu stall.
>>>
>>> This patch is modified from the patch[1] of openEuler.
>>>
>>
>> Hi Robin,
>>
>> I was looking at something similar to this recently were in this case
>> they were beating the heck out the system with the hazard io stress
>> test, and someone else with some medusa test tool. In one case they
>> had them force a dump on the soft lockup. On the 384 core genoa, 90
>> cores were spinning the iovad rb tree lock for one domain, 1 had it,
>> and the poor flush queue timer handler was having to fight everyone
>> for the lock. I'm not sure what would be considered a realistic workload
>> compared to these stressors, but could this be an issue over time as
>> systems continue to get more cores since the timer handler potentially
>> grabs and releases the iova domain rb tree lock for each cpu? The only
>> cases I know of are using io stressors, so I don't know how big a deal
>> it is.
>>
>> I think soft lockups could still be produced with this patch, since
>> there would still be the lock contention.
> 
> Right. We hit this issue when running stress-ng on 384 CPU system having NVMe
> disk. I have tried this patch. This improved the situation. But we still hit
> soft lockup after an hour or so.
> 
> Backtrace with this patch:
> 
>  4551.470797] watchdog: BUG: soft lockup - CPU#328 stuck for 30s!
> [kworker/328:1:2710]
> [ 4551.479441] Modules linked in: nvme_fabrics(E) intel_rapl_msr(E)
> intel_rapl_common(E) amd64_edac(E) edac_mce_amd(E) kvm_amd(E) kvm(E)
> crct10dif_pclmul(E) ghash_clmulni_intel(E) sha512_ssse3(E) aesni_intel(E)
> crypto_simd(E) cryptd(E) rapl(E) wmi_bmof(E) binfmt_misc(E) ipmi_ssif(E)
> nls_iso8859_1(E) ast(E) drm_shmem_helper(E) drm_kms_helper(E) i2c_algo_bit(E)
> syscopyarea(E) sysfillrect(E) sysimgblt(E) acpi_ipmi(E) ccp(E) k10temp(E)
> ipmi_si(E) ipmi_devintf(E) ipmi_msghandler(E) mac_hid(E) sch_fq_codel(E) msr(E)
> parport_pc(E) ppdev(E) lp(E) parport(E) ramoops(E) reed_solomon(E) pstore_blk(E)
> pstore_zone(E) efi_pstore(E) drm(E) ip_tables(E) x_tables(E) autofs4(E)
> crc32_pclmul(E) nvme(E) ahci(E) xhci_pci(E) tg3(E) nvme_core(E) libahci(E)
> i2c_piix4(E) xhci_pci_renesas(E) wmi(E)
> [ 4551.479475] CPU: 328 PID: 2710 Comm: kworker/328:1 Kdump: loaded Tainted: G
>           E      6.2.9schd_work+ #1
> [ 4551.479477] Hardware name: AMD Corporation Genoa, BIOS RTI 03/20/2023
> [ 4551.479479] Workqueue: events free_iova_work_func
> [ 4551.479484] RIP: 0010:_raw_spin_unlock_irqrestore+0x21/0x60
> [ 4551.479488] Code: 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 55 49 89 f0 48 89
> e5 c6 07 00 0f 1f 00 41 f7 c0 00 02 00 00 74 06 fb 0f 1f 44 00 00 <65> ff 0d 50
> f8 fc 7d 74 13 5d 31 c0 89 c2 89 c1 89 c6 89 c7 41 89
> [ 4551.479489] RSP: 0018:ffa0000092e07e30 EFLAGS: 00000206
> [ 4551.479491] RAX: 0000000000000000 RBX: 00000000000000f2 RCX: 0000000000000000
> [ 4551.479492] RDX: 0000000000000000 RSI: 0000000000000282 RDI: ffd1ff3cf82be060
> [ 4551.479492] RBP: ffa0000092e07e30 R08: 0000000000000282 R09: 0000000000000000
> [ 4551.479493] R10: 0000000000000000 R11: 0000000000000000 R12: ff1100010d2af6d0
> [ 4551.479493] R13: ffd1ff3cf82be060 R14: ffd1ff3cf82bb858 R15: 0000000000000282
> [ 4551.479494] FS:  0000000000000000(0000) GS:ff11018056200000(0000)
> knlGS:0000000000000000
> [ 4551.479495] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 4551.479496] CR2: 00007ffd392f5f28 CR3: 0000000152f06004 CR4: 0000000000771ee0
> [ 4551.479497] PKRU: 55555554
> [ 4551.479497] Call Trace:
> [ 4551.479498]  <TASK>
> [ 4551.479502]  free_iova_work_func+0x6a/0xc0
> [ 4551.479505]  process_one_work+0x21f/0x440
> [ 4551.479507]  worker_thread+0x50/0x3f0
> [ 4551.479509]  ? __pfx_worker_thread+0x10/0x10
> [ 4551.479510]  kthread+0xee/0x120
> [ 4551.479513]  ? __pfx_kthread+0x10/0x10
> [ 4551.479514]  ret_from_fork+0x2c/0x50
> [ 4551.479518]  </TASK>
> 
> I did slight modification on top of this patch (schedule work on each CPU) and
> it worked for me.
> 
> @Robin, Joerg,
>   Does something like below (schedule work on each CPU to free iova
>  (fq_ring_free()) is acceptable?
> 
> -Vasant
> 
> 
> ------
> 
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 7aed15e53f10..eb52814e67e5 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -157,10 +157,12 @@ static void fq_flush_iotlb(struct iommu_dma_cookie *cookie)
>  static void fq_flush_timeout(struct timer_list *t)
>  {
>  	struct iommu_dma_cookie *cookie = from_timer(cookie, t, fq_timer);
> +	int cpu;
> 
>  	atomic_set(&cookie->fq_timer_on, 0);
>  	fq_flush_iotlb(cookie);
> -	schedule_work(&cookie->free_iova_work);
> +	for_each_possible_cpu(cpu)
> +		schedule_work_on(cpu, &cookie->free_iova_work);
>  }
> 
>  static void queue_iova(struct iommu_dma_cookie *cookie,
> @@ -235,17 +237,15 @@ static void free_iova_work_func(struct work_struct *work)
>  {
>  	struct iommu_dma_cookie *cookie;
>  	int cpu;
> +	unsigned long flags;
> +	struct iova_fq *fq;
> 
>  	cookie = container_of(work, struct iommu_dma_cookie, free_iova_work);
> -	for_each_possible_cpu(cpu) {
> -		unsigned long flags;
> -		struct iova_fq *fq;
> +	fq = this_cpu_ptr(cookie->fq);
> 
> -		fq = per_cpu_ptr(cookie->fq, cpu);
> -		spin_lock_irqsave(&fq->lock, flags);
> -		fq_ring_free(cookie, fq);
> -		spin_unlock_irqrestore(&fq->lock, flags);
> -	}
> +	spin_lock_irqsave(&fq->lock, flags);
> +	fq_ring_free(cookie, fq);
> +	spin_unlock_irqrestore(&fq->lock, flags);
>  }
> 
>  /* sysfs updates are serialised by the mutex of the group owning @domain */
> 
> ---------
> 
> 
> 
> 
>
  
Joerg Roedel May 22, 2023, 2:58 p.m. UTC | #4
Hi,

On Fri, Apr 28, 2023 at 11:14:54AM +0530, Vasant Hegde wrote:
> Ping. Any suggestion on below proposal (schedule work on each CPU to free iova)?

Optimizing the flush-timeout path seems to be working on the symptoms
rather than the cause. The main question to look into first is why are
so many CPUs competing for the IOVA allocator lock.

That is a situation which the flush-queue code is there to avoid; obviously
it does not scale to the workloads tested here. Any chance to
check why?

My guess is that the allocations are too big and not covered by the
allocation sizes supported by the flush-queue code. But maybe this is
something that can be fixed. Or the flush-queue code could even be
changed to auto-adapt to allocation patterns of the device driver?

Regards,

	Joerg
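
For context, the allocation-size limit referred to here appears to be the
IOVA range cache: only allocations up to 32 pages (128K with 4K pages) are
served from the per-CPU magazines, and anything larger falls back to the
rb-tree under iova_rbtree_lock. A paraphrased sketch from
drivers/iommu/iova.c (details differ between kernel versions):

#define IOVA_RANGE_CACHE_MAX_SIZE 6     /* log of max cached IOVA range size (in pages) */

void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
                    unsigned long size)
{
        if (iova_rcache_insert(iovad, pfn, size))
                return;                         /* cached in a per-CPU magazine */

        free_iova(iovad, pfn);                  /* slow path: rb-tree lock */
}

static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn,
                               unsigned long size)
{
        unsigned int log_size = order_base_2(size);

        /* anything bigger than 32 pages (128K with 4K pages) is never cached */
        if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
                return false;

        return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn);
}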
  
Jerry Snitselaar May 22, 2023, 3:18 p.m. UTC | #5
On Mon, May 22, 2023 at 04:58:33PM +0200, Joerg Roedel wrote:
> Hi,
> 
> On Fri, Apr 28, 2023 at 11:14:54AM +0530, Vasant Hegde wrote:
> > Ping. Any suggestion on below proposal (schedule work on each CPU to free iova)?
> 
> Optimizing the flush-timeout path seems to be working on the symptoms
> rather than the cause. The main question to look into first is why are
> so many CPUs competing for the IOVA allocator lock.
> 
> That is a situation which the flush-queue code is there to avoid,
> obviously it does not scale to the workloads tested here. Any chance to
> check why?
> 
> My guess is that the allocations are too big and not covered by the
> allocation sizes supported by the flush-queue code. But maybe this is
> something that can be fixed. Or the flush-queue code could even be
> changed to auto-adapt to allocation patterns of the device driver?
> 
> Regards,
> 
> 	Joerg

In the case I know of it involved some proprietary test suites
(Hazard I/O, and Medusa?), and the lpfc driver. I was able to force
the condition using fio with a number of jobs running. I'll play
around and see if I can figure out a point where it starts to become
an issue.

I mentioned to the Broadcom folks what the nvme driver did for the max
DMA size, but I haven't had a chance to go looking at it myself yet to
see if there is somewhere in the lpfc code to fix up.

Regards,
Jerry
  
John Garry May 22, 2023, 3:35 p.m. UTC | #6
On 22/05/2023 16:18, Jerry Snitselaar wrote:
>> My guess is that the allocations are too big and not covered by the
>> allocation sizes supported by the flush-queue code. But maybe this is
>> something that can be fixed. Or the flush-queue code could even be
>> changed to auto-adapt to allocation patterns of the device driver?
>>
>> Regards,
>>
>> 	Joerg
> In the case I know of it involved some proprietary test suites
> (Hazard I/O, and Medusa?), and the lpfc driver. I was able to force
> the condition using fio with a number of jobs running. I'll play
> around and see if I can figure out a point where it starts to become
> an issue.
> 
> I mentioned what the nvme driver did to the Broadcom folks for the max
> dma size, but I haven't had a chance to go looking at it myself yet to
> see if there is somewhere in the lpfc code to fix up.

JFYI, SCSI core already supports setting this in shost->opt_sectors, see 
example in sas_host_setup().

This issue may continue to pop up so we may need a better way to turn it 
on/off for all drivers or classes of drivers.

John
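
For reference, the shost->opt_sectors setting mentioned above boils down to
clamping the optimal transfer size to what the DMA layer can map without
hitting the slow path, via dma_opt_mapping_size(). A paraphrased sketch of
the kind of thing sas_host_setup() does (nvme-pci applies a similar clamp to
max_hw_sectors); example_cap_opt_sectors() is an illustrative name, not real
code:

#include <linux/dma-mapping.h>
#include <linux/blkdev.h>
#include <scsi/scsi_host.h>

/* Paraphrased sketch; see drivers/scsi/libsas/sas_init.c for the real code. */
static void example_cap_opt_sectors(struct Scsi_Host *shost,
                                    struct device *dma_dev)
{
        /*
         * dma_opt_mapping_size() reports the largest mapping the DMA layer
         * can handle efficiently (for dma-iommu, what the IOVA rcache can
         * serve); cap the optimal I/O size so requests stay below that limit.
         */
        shost->opt_sectors = min_t(unsigned int, shost->max_sectors,
                                   dma_opt_mapping_size(dma_dev) >> SECTOR_SHIFT);
}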
  

Patch

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 99b2646cb5c7..bc4c979d7091 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -59,6 +59,8 @@  struct iommu_dma_cookie {
 			struct timer_list	fq_timer;
 			/* 1 when timer is active, 0 when not */
 			atomic_t		fq_timer_on;
+			/* The work to free iova */
+			struct work_struct free_iova_work;
 		};
 		/* Trivial linear page allocator for IOMMU_DMA_MSI_COOKIE */
 		dma_addr_t		msi_iova;
@@ -155,20 +157,10 @@  static void fq_flush_iotlb(struct iommu_dma_cookie *cookie)
 static void fq_flush_timeout(struct timer_list *t)
 {
 	struct iommu_dma_cookie *cookie = from_timer(cookie, t, fq_timer);
-	int cpu;
 
 	atomic_set(&cookie->fq_timer_on, 0);
 	fq_flush_iotlb(cookie);
-
-	for_each_possible_cpu(cpu) {
-		unsigned long flags;
-		struct iova_fq *fq;
-
-		fq = per_cpu_ptr(cookie->fq, cpu);
-		spin_lock_irqsave(&fq->lock, flags);
-		fq_ring_free(cookie, fq);
-		spin_unlock_irqrestore(&fq->lock, flags);
-	}
+	schedule_work(&cookie->free_iova_work);
 }
 
 static void queue_iova(struct iommu_dma_cookie *cookie,
@@ -227,6 +219,7 @@  static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie)
 		return;
 
 	del_timer_sync(&cookie->fq_timer);
+	cancel_work_sync(&cookie->free_iova_work);
 	/* The IOVAs will be torn down separately, so just free our queued pages */
 	for_each_possible_cpu(cpu) {
 		struct iova_fq *fq = per_cpu_ptr(cookie->fq, cpu);
@@ -238,6 +231,23 @@  static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie)
 	free_percpu(cookie->fq);
 }
 
+static void free_iova_work_func(struct work_struct *work)
+{
+	struct iommu_dma_cookie *cookie;
+	int cpu;
+
+	cookie = container_of(work, struct iommu_dma_cookie, free_iova_work);
+	for_each_possible_cpu(cpu) {
+		unsigned long flags;
+		struct iova_fq *fq;
+
+		fq = per_cpu_ptr(cookie->fq, cpu);
+		spin_lock_irqsave(&fq->lock, flags);
+		fq_ring_free(cookie, fq);
+		spin_unlock_irqrestore(&fq->lock, flags);
+	}
+}
+
 /* sysfs updates are serialised by the mutex of the group owning @domain */
 int iommu_dma_init_fq(struct iommu_domain *domain)
 {
@@ -271,6 +281,7 @@  int iommu_dma_init_fq(struct iommu_domain *domain)
 
 	cookie->fq = queue;
 
+	INIT_WORK(&cookie->free_iova_work, free_iova_work_func);
 	timer_setup(&cookie->fq_timer, fq_flush_timeout, 0);
 	atomic_set(&cookie->fq_timer_on, 0);
 	/*