[6/7] iommu/vt-d: Add IOMMU perfmon overflow handler support

Message ID 20230111202504.378258-7-kan.liang@linux.intel.com
State New
Headers
Series iommu/vt-d: Support performance monitoring for IOMMU |

Commit Message

Liang, Kan Jan. 11, 2023, 8:25 p.m. UTC
  From: Kan Liang <kan.liang@linux.intel.com>

While enabled to count events and an event occurrence causes the counter
value to increment and roll over to or past zero, this is termed a
counter overflow. The overflow can trigger an interrupt. The IOMMU
perfmon needs to handle the case properly.

New HW IRQs are allocated for each IOMMU device for perfmon. The IRQ IDs
are after the SVM range.

In the overflow handler, the counter is not frozen. It's very unlikely
that the same counter overflows again during the period. But it's
possible that other counters overflow at the same time. Read the
overflow register at the end of the handler and check whether there are
more.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 drivers/iommu/intel/dmar.c    |  2 +
 drivers/iommu/intel/iommu.h   | 11 ++++-
 drivers/iommu/intel/perfmon.c | 82 +++++++++++++++++++++++++++++++++++
 drivers/iommu/intel/svm.c     |  2 +-
 4 files changed, 95 insertions(+), 2 deletions(-)
  

Comments

Baolu Lu Jan. 14, 2023, 11:05 a.m. UTC | #1
On 2023/1/12 4:25, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
> 
> While enabled to count events and an event occurrence causes the counter
> value to increment and roll over to or past zero, this is termed a
> counter overflow. The overflow can trigger an interrupt. The IOMMU
> perfmon needs to handle the case properly.
> 
> New HW IRQs are allocated for each IOMMU device for perfmon. The IRQ IDs
> are after the SVM range.
> 
> In the overflow handler, the counter is not frozen. It's very unlikely
> that the same counter overflows again during the period. But it's
> possible that other counters overflow at the same time. Read the
> overflow register at the end of the handler and check whether there are
> more.
> 
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> ---
>   drivers/iommu/intel/dmar.c    |  2 +
>   drivers/iommu/intel/iommu.h   | 11 ++++-
>   drivers/iommu/intel/perfmon.c | 82 +++++++++++++++++++++++++++++++++++
>   drivers/iommu/intel/svm.c     |  2 +-
>   4 files changed, 95 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
> index dcafa32875c1..94e314320cd9 100644
> --- a/drivers/iommu/intel/dmar.c
> +++ b/drivers/iommu/intel/dmar.c
> @@ -1887,6 +1887,8 @@ static inline int dmar_msi_reg(struct intel_iommu *iommu, int irq)
>   		return DMAR_FECTL_REG;
>   	else if (iommu->pr_irq == irq)
>   		return DMAR_PECTL_REG;
> +	else if (iommu->perf_irq == irq)
> +		return DMAR_PERFINTRCTL_REG;
>   	else
>   		BUG();
>   }
> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
> index bbc5dda903e9..f616e4f3d765 100644
> --- a/drivers/iommu/intel/iommu.h
> +++ b/drivers/iommu/intel/iommu.h
> @@ -130,6 +130,8 @@
>   #define DMAR_PERFCFGOFF_REG	0x310
>   #define DMAR_PERFOVFOFF_REG	0x318
>   #define DMAR_PERFCNTROFF_REG	0x31c
> +#define DMAR_PERFINTRSTS_REG	0x324
> +#define DMAR_PERFINTRCTL_REG	0x328
>   #define DMAR_PERFEVNTCAP_REG	0x380
>   #define DMAR_ECMD_REG		0x400
>   #define DMAR_ECEO_REG		0x408
> @@ -357,6 +359,9 @@
>   
>   #define DMA_VCS_PAS	((u64)1)
>   
> +/* PERFINTRSTS_REG */
> +#define DMA_PERFINTRSTS_PIS	((u32)1)
> +
>   #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts)			\
>   do {									\
>   	cycles_t start_time = get_cycles();				\
> @@ -630,8 +635,12 @@ struct iommu_pmu {
>   	struct pmu		pmu;
>   	DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
>   	struct perf_event	*event_list[IOMMU_PMU_IDX_MAX];
> +	unsigned char		irq_name[16];
>   };
>   
> +#define IOMMU_IRQ_ID_OFFSET_PRQ		(DMAR_UNITS_SUPPORTED)
> +#define IOMMU_IRQ_ID_OFFSET_PERF	(2 * DMAR_UNITS_SUPPORTED)
> +
>   struct intel_iommu {
>   	void __iomem	*reg; /* Pointer to hardware regs, virtual addr */
>   	u64 		reg_phys; /* physical address of hw register set */
> @@ -645,7 +654,7 @@ struct intel_iommu {
>   	int		seq_id;	/* sequence id of the iommu */
>   	int		agaw; /* agaw of this iommu */
>   	int		msagaw; /* max sagaw of this iommu */
> -	unsigned int 	irq, pr_irq;
> +	unsigned int	irq, pr_irq, perf_irq;
>   	u16		segment;     /* PCI segment# */
>   	unsigned char 	name[13];    /* Device Name */
>   
> diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c
> index f332232bb345..d0fbf784c827 100644
> --- a/drivers/iommu/intel/perfmon.c
> +++ b/drivers/iommu/intel/perfmon.c
> @@ -476,6 +476,49 @@ static void iommu_pmu_disable(struct pmu *pmu)
>   	ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, false, 0);
>   }
>   
> +static void iommu_pmu_counter_overflow(struct iommu_pmu *iommu_pmu)
> +{
> +	struct perf_event *event;
> +	int i, handled = 0;
> +	u64 status;
> +
> +	/*
> +	 * Two counters may be overflowed very close.
> +	 * Always check whether there are more to handle.
> +	 */

Keep comment style consistent, like this

         /*
          * Two counters may be overflowed very close. Always check whether
          * there are more to handle.
          */

Same to other places.

> +	while ((status = dmar_readq(iommu_pmu->overflow))) {

Unnecessary double brackets?

> +		for_each_set_bit(i, (unsigned long *)&status, iommu_pmu->num_cntr) {
> +			handled++;

"handled" isn't used anywhere. Please cleanup it.

> +
> +			/*
> +			 * Find the assigned event of the counter.
> +			 * Accumulate the value into the event->count.
> +			 */
> +			event = iommu_pmu->event_list[i];
> +			if (WARN_ON_ONCE(!event))

Again, do we need a kernel trace here? This is only because the hardware
reported an wrong event id, right? How about a pr_warn() to let the user
know this?

> +				continue;
> +			iommu_pmu_event_update(event);
> +		}
> +
> +		dmar_writeq(iommu_pmu->overflow, status);
> +	}
> +}
> +
> +static irqreturn_t iommu_pmu_irq_handler(int irq, void *dev_id)
> +{
> +	struct intel_iommu *iommu = dev_id;
> +
> +	if (!dmar_readl(iommu->reg + DMAR_PERFINTRSTS_REG))
> +		return IRQ_NONE;
> +
> +	iommu_pmu_counter_overflow(iommu->pmu);
> +
> +	/* Clear the status bit */
> +	dmar_writel(iommu->reg + DMAR_PERFINTRSTS_REG, DMA_PERFINTRSTS_PIS);
> +
> +	return IRQ_HANDLED;
> +}
> +
>   static int __iommu_pmu_register(struct intel_iommu *iommu)
>   {
>   	struct iommu_pmu *iommu_pmu = iommu->pmu;
> @@ -658,6 +701,38 @@ void free_iommu_pmu(struct intel_iommu *iommu)
>   	iommu->pmu = NULL;
>   }
>   
> +static int iommu_pmu_set_interrupt(struct intel_iommu *iommu)
> +{
> +	struct iommu_pmu *iommu_pmu = iommu->pmu;
> +	int irq, ret;
> +
> +	irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PERF + iommu->seq_id, iommu->node, iommu);
> +	if (irq <= 0)
> +		return -EINVAL;
> +
> +	snprintf(iommu_pmu->irq_name, sizeof(iommu_pmu->irq_name), "dmar%d-perf", iommu->seq_id);
> +
> +	iommu->perf_irq = irq;
> +	ret = request_threaded_irq(irq, NULL, iommu_pmu_irq_handler,
> +				   IRQF_ONESHOT, iommu_pmu->irq_name, iommu);
> +	if (ret) {
> +		dmar_free_hwirq(irq);
> +		iommu->perf_irq = 0;
> +		return ret;
> +	}
> +	return 0;
> +}
> +
> +static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu)
> +{
> +	if (!iommu->perf_irq)
> +		return;
> +
> +	free_irq(iommu->perf_irq, iommu);
> +	dmar_free_hwirq(iommu->perf_irq);
> +	iommu->perf_irq = 0;
> +}
> +
>   static int iommu_pmu_cpu_online(unsigned int cpu)
>   {
>   	if (cpumask_empty(&iommu_pmu_cpu_mask))
> @@ -734,8 +809,14 @@ void iommu_pmu_register(struct intel_iommu *iommu)
>   	if (iommu_pmu_cpuhp_setup(iommu_pmu))
>   		goto unregister;
>   
> +	/* Set interrupt for overflow */
> +	if (iommu_pmu_set_interrupt(iommu))
> +		goto cpuhp_free;
> +
>   	return;
>   
> +cpuhp_free:
> +	iommu_pmu_cpuhp_free(iommu_pmu);
>   unregister:
>   	perf_pmu_unregister(&iommu_pmu->pmu);
>   err:
> @@ -749,6 +830,7 @@ void iommu_pmu_unregister(struct intel_iommu *iommu)
>   	if (!iommu_pmu)
>   		return;
>   
> +	iommu_pmu_unset_interrupt(iommu);
>   	iommu_pmu_cpuhp_free(iommu_pmu);
>   	perf_pmu_unregister(&iommu_pmu->pmu);
>   }
> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
> index c76b66263467..b6c5edd80d5d 100644
> --- a/drivers/iommu/intel/svm.c
> +++ b/drivers/iommu/intel/svm.c
> @@ -79,7 +79,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
>   	}
>   	iommu->prq = page_address(pages);
>   
> -	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
> +	irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu);
>   	if (irq <= 0) {
>   		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
>   		       iommu->name);

--
Best regards,
baolu
  
Liang, Kan Jan. 16, 2023, 3:20 p.m. UTC | #2
On 2023-01-14 6:05 a.m., Baolu Lu wrote:
> On 2023/1/12 4:25, kan.liang@linux.intel.com wrote:
>> From: Kan Liang <kan.liang@linux.intel.com>
>>
>> While enabled to count events and an event occurrence causes the counter
>> value to increment and roll over to or past zero, this is termed a
>> counter overflow. The overflow can trigger an interrupt. The IOMMU
>> perfmon needs to handle the case properly.
>>
>> New HW IRQs are allocated for each IOMMU device for perfmon. The IRQ IDs
>> are after the SVM range.
>>
>> In the overflow handler, the counter is not frozen. It's very unlikely
>> that the same counter overflows again during the period. But it's
>> possible that other counters overflow at the same time. Read the
>> overflow register at the end of the handler and check whether there are
>> more.
>>
>> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
>> ---
>>   drivers/iommu/intel/dmar.c    |  2 +
>>   drivers/iommu/intel/iommu.h   | 11 ++++-
>>   drivers/iommu/intel/perfmon.c | 82 +++++++++++++++++++++++++++++++++++
>>   drivers/iommu/intel/svm.c     |  2 +-
>>   4 files changed, 95 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
>> index dcafa32875c1..94e314320cd9 100644
>> --- a/drivers/iommu/intel/dmar.c
>> +++ b/drivers/iommu/intel/dmar.c
>> @@ -1887,6 +1887,8 @@ static inline int dmar_msi_reg(struct
>> intel_iommu *iommu, int irq)
>>           return DMAR_FECTL_REG;
>>       else if (iommu->pr_irq == irq)
>>           return DMAR_PECTL_REG;
>> +    else if (iommu->perf_irq == irq)
>> +        return DMAR_PERFINTRCTL_REG;
>>       else
>>           BUG();
>>   }
>> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
>> index bbc5dda903e9..f616e4f3d765 100644
>> --- a/drivers/iommu/intel/iommu.h
>> +++ b/drivers/iommu/intel/iommu.h
>> @@ -130,6 +130,8 @@
>>   #define DMAR_PERFCFGOFF_REG    0x310
>>   #define DMAR_PERFOVFOFF_REG    0x318
>>   #define DMAR_PERFCNTROFF_REG    0x31c
>> +#define DMAR_PERFINTRSTS_REG    0x324
>> +#define DMAR_PERFINTRCTL_REG    0x328
>>   #define DMAR_PERFEVNTCAP_REG    0x380
>>   #define DMAR_ECMD_REG        0x400
>>   #define DMAR_ECEO_REG        0x408
>> @@ -357,6 +359,9 @@
>>     #define DMA_VCS_PAS    ((u64)1)
>>   +/* PERFINTRSTS_REG */
>> +#define DMA_PERFINTRSTS_PIS    ((u32)1)
>> +
>>   #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts)            \
>>   do {                                    \
>>       cycles_t start_time = get_cycles();                \
>> @@ -630,8 +635,12 @@ struct iommu_pmu {
>>       struct pmu        pmu;
>>       DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
>>       struct perf_event    *event_list[IOMMU_PMU_IDX_MAX];
>> +    unsigned char        irq_name[16];
>>   };
>>   +#define IOMMU_IRQ_ID_OFFSET_PRQ        (DMAR_UNITS_SUPPORTED)
>> +#define IOMMU_IRQ_ID_OFFSET_PERF    (2 * DMAR_UNITS_SUPPORTED)
>> +
>>   struct intel_iommu {
>>       void __iomem    *reg; /* Pointer to hardware regs, virtual addr */
>>       u64         reg_phys; /* physical address of hw register set */
>> @@ -645,7 +654,7 @@ struct intel_iommu {
>>       int        seq_id;    /* sequence id of the iommu */
>>       int        agaw; /* agaw of this iommu */
>>       int        msagaw; /* max sagaw of this iommu */
>> -    unsigned int     irq, pr_irq;
>> +    unsigned int    irq, pr_irq, perf_irq;
>>       u16        segment;     /* PCI segment# */
>>       unsigned char     name[13];    /* Device Name */
>>   diff --git a/drivers/iommu/intel/perfmon.c
>> b/drivers/iommu/intel/perfmon.c
>> index f332232bb345..d0fbf784c827 100644
>> --- a/drivers/iommu/intel/perfmon.c
>> +++ b/drivers/iommu/intel/perfmon.c
>> @@ -476,6 +476,49 @@ static void iommu_pmu_disable(struct pmu *pmu)
>>       ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, false, 0);
>>   }
>>   +static void iommu_pmu_counter_overflow(struct iommu_pmu *iommu_pmu)
>> +{
>> +    struct perf_event *event;
>> +    int i, handled = 0;
>> +    u64 status;
>> +
>> +    /*
>> +     * Two counters may be overflowed very close.
>> +     * Always check whether there are more to handle.
>> +     */
> 
> Keep comment style consistent, like this
> 
>         /*
>          * Two counters may be overflowed very close. Always check whether
>          * there are more to handle.
>          */
> 
> Same to other places.

Sure.

> 
>> +    while ((status = dmar_readq(iommu_pmu->overflow))) {
> 
> Unnecessary double brackets?
> 
>> +        for_each_set_bit(i, (unsigned long *)&status,
>> iommu_pmu->num_cntr) {
>> +            handled++;
> 
> "handled" isn't used anywhere. Please cleanup it.
> 

Sure.

>> +
>> +            /*
>> +             * Find the assigned event of the counter.
>> +             * Accumulate the value into the event->count.
>> +             */
>> +            event = iommu_pmu->event_list[i];
>> +            if (WARN_ON_ONCE(!event))
> 
> Again, do we need a kernel trace here? This is only because the hardware
> reported an wrong event id, right? How about a pr_warn() to let the user
> know this?

The hardware reported ID doesn't match the SW records. It's hard to say
which one is correct. But I guess pr_warn_once() may be enough. I will
change it in V2.

Thanks,
Kan

> 
>> +                continue;
>> +            iommu_pmu_event_update(event);
>> +        }
>> +
>> +        dmar_writeq(iommu_pmu->overflow, status);
>> +    }
>> +}
>> +
>> +static irqreturn_t iommu_pmu_irq_handler(int irq, void *dev_id)
>> +{
>> +    struct intel_iommu *iommu = dev_id;
>> +
>> +    if (!dmar_readl(iommu->reg + DMAR_PERFINTRSTS_REG))
>> +        return IRQ_NONE;
>> +
>> +    iommu_pmu_counter_overflow(iommu->pmu);
>> +
>> +    /* Clear the status bit */
>> +    dmar_writel(iommu->reg + DMAR_PERFINTRSTS_REG, DMA_PERFINTRSTS_PIS);
>> +
>> +    return IRQ_HANDLED;
>> +}
>> +
>>   static int __iommu_pmu_register(struct intel_iommu *iommu)
>>   {
>>       struct iommu_pmu *iommu_pmu = iommu->pmu;
>> @@ -658,6 +701,38 @@ void free_iommu_pmu(struct intel_iommu *iommu)
>>       iommu->pmu = NULL;
>>   }
>>   +static int iommu_pmu_set_interrupt(struct intel_iommu *iommu)
>> +{
>> +    struct iommu_pmu *iommu_pmu = iommu->pmu;
>> +    int irq, ret;
>> +
>> +    irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PERF + iommu->seq_id,
>> iommu->node, iommu);
>> +    if (irq <= 0)
>> +        return -EINVAL;
>> +
>> +    snprintf(iommu_pmu->irq_name, sizeof(iommu_pmu->irq_name),
>> "dmar%d-perf", iommu->seq_id);
>> +
>> +    iommu->perf_irq = irq;
>> +    ret = request_threaded_irq(irq, NULL, iommu_pmu_irq_handler,
>> +                   IRQF_ONESHOT, iommu_pmu->irq_name, iommu);
>> +    if (ret) {
>> +        dmar_free_hwirq(irq);
>> +        iommu->perf_irq = 0;
>> +        return ret;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu)
>> +{
>> +    if (!iommu->perf_irq)
>> +        return;
>> +
>> +    free_irq(iommu->perf_irq, iommu);
>> +    dmar_free_hwirq(iommu->perf_irq);
>> +    iommu->perf_irq = 0;
>> +}
>> +
>>   static int iommu_pmu_cpu_online(unsigned int cpu)
>>   {
>>       if (cpumask_empty(&iommu_pmu_cpu_mask))
>> @@ -734,8 +809,14 @@ void iommu_pmu_register(struct intel_iommu *iommu)
>>       if (iommu_pmu_cpuhp_setup(iommu_pmu))
>>           goto unregister;
>>   +    /* Set interrupt for overflow */
>> +    if (iommu_pmu_set_interrupt(iommu))
>> +        goto cpuhp_free;
>> +
>>       return;
>>   +cpuhp_free:
>> +    iommu_pmu_cpuhp_free(iommu_pmu);
>>   unregister:
>>       perf_pmu_unregister(&iommu_pmu->pmu);
>>   err:
>> @@ -749,6 +830,7 @@ void iommu_pmu_unregister(struct intel_iommu *iommu)
>>       if (!iommu_pmu)
>>           return;
>>   +    iommu_pmu_unset_interrupt(iommu);
>>       iommu_pmu_cpuhp_free(iommu_pmu);
>>       perf_pmu_unregister(&iommu_pmu->pmu);
>>   }
>> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
>> index c76b66263467..b6c5edd80d5d 100644
>> --- a/drivers/iommu/intel/svm.c
>> +++ b/drivers/iommu/intel/svm.c
>> @@ -79,7 +79,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
>>       }
>>       iommu->prq = page_address(pages);
>>   -    irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id,
>> iommu->node, iommu);
>> +    irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id,
>> iommu->node, iommu);
>>       if (irq <= 0) {
>>           pr_err("IOMMU: %s: Failed to create IRQ vector for page
>> request queue\n",
>>                  iommu->name);
> 
> -- 
> Best regards,
> baolu
  
Liang, Kan Jan. 17, 2023, 4:52 p.m. UTC | #3
On 2023-01-16 10:20 a.m., Liang, Kan wrote:
> 
> 
> On 2023-01-14 6:05 a.m., Baolu Lu wrote:
>> On 2023/1/12 4:25, kan.liang@linux.intel.com wrote:
>>> From: Kan Liang <kan.liang@linux.intel.com>
>>>
>>> While enabled to count events and an event occurrence causes the counter
>>> value to increment and roll over to or past zero, this is termed a
>>> counter overflow. The overflow can trigger an interrupt. The IOMMU
>>> perfmon needs to handle the case properly.
>>>
>>> New HW IRQs are allocated for each IOMMU device for perfmon. The IRQ IDs
>>> are after the SVM range.
>>>
>>> In the overflow handler, the counter is not frozen. It's very unlikely
>>> that the same counter overflows again during the period. But it's
>>> possible that other counters overflow at the same time. Read the
>>> overflow register at the end of the handler and check whether there are
>>> more.
>>>
>>> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
>>> ---
>>>   drivers/iommu/intel/dmar.c    |  2 +
>>>   drivers/iommu/intel/iommu.h   | 11 ++++-
>>>   drivers/iommu/intel/perfmon.c | 82 +++++++++++++++++++++++++++++++++++
>>>   drivers/iommu/intel/svm.c     |  2 +-
>>>   4 files changed, 95 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
>>> index dcafa32875c1..94e314320cd9 100644
>>> --- a/drivers/iommu/intel/dmar.c
>>> +++ b/drivers/iommu/intel/dmar.c
>>> @@ -1887,6 +1887,8 @@ static inline int dmar_msi_reg(struct
>>> intel_iommu *iommu, int irq)
>>>           return DMAR_FECTL_REG;
>>>       else if (iommu->pr_irq == irq)
>>>           return DMAR_PECTL_REG;
>>> +    else if (iommu->perf_irq == irq)
>>> +        return DMAR_PERFINTRCTL_REG;
>>>       else
>>>           BUG();
>>>   }
>>> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
>>> index bbc5dda903e9..f616e4f3d765 100644
>>> --- a/drivers/iommu/intel/iommu.h
>>> +++ b/drivers/iommu/intel/iommu.h
>>> @@ -130,6 +130,8 @@
>>>   #define DMAR_PERFCFGOFF_REG    0x310
>>>   #define DMAR_PERFOVFOFF_REG    0x318
>>>   #define DMAR_PERFCNTROFF_REG    0x31c
>>> +#define DMAR_PERFINTRSTS_REG    0x324
>>> +#define DMAR_PERFINTRCTL_REG    0x328
>>>   #define DMAR_PERFEVNTCAP_REG    0x380
>>>   #define DMAR_ECMD_REG        0x400
>>>   #define DMAR_ECEO_REG        0x408
>>> @@ -357,6 +359,9 @@
>>>     #define DMA_VCS_PAS    ((u64)1)
>>>   +/* PERFINTRSTS_REG */
>>> +#define DMA_PERFINTRSTS_PIS    ((u32)1)
>>> +
>>>   #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts)            \
>>>   do {                                    \
>>>       cycles_t start_time = get_cycles();                \
>>> @@ -630,8 +635,12 @@ struct iommu_pmu {
>>>       struct pmu        pmu;
>>>       DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
>>>       struct perf_event    *event_list[IOMMU_PMU_IDX_MAX];
>>> +    unsigned char        irq_name[16];
>>>   };
>>>   +#define IOMMU_IRQ_ID_OFFSET_PRQ        (DMAR_UNITS_SUPPORTED)
>>> +#define IOMMU_IRQ_ID_OFFSET_PERF    (2 * DMAR_UNITS_SUPPORTED)
>>> +
>>>   struct intel_iommu {
>>>       void __iomem    *reg; /* Pointer to hardware regs, virtual addr */
>>>       u64         reg_phys; /* physical address of hw register set */
>>> @@ -645,7 +654,7 @@ struct intel_iommu {
>>>       int        seq_id;    /* sequence id of the iommu */
>>>       int        agaw; /* agaw of this iommu */
>>>       int        msagaw; /* max sagaw of this iommu */
>>> -    unsigned int     irq, pr_irq;
>>> +    unsigned int    irq, pr_irq, perf_irq;
>>>       u16        segment;     /* PCI segment# */
>>>       unsigned char     name[13];    /* Device Name */
>>>   diff --git a/drivers/iommu/intel/perfmon.c
>>> b/drivers/iommu/intel/perfmon.c
>>> index f332232bb345..d0fbf784c827 100644
>>> --- a/drivers/iommu/intel/perfmon.c
>>> +++ b/drivers/iommu/intel/perfmon.c
>>> @@ -476,6 +476,49 @@ static void iommu_pmu_disable(struct pmu *pmu)
>>>       ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, false, 0);
>>>   }
>>>   +static void iommu_pmu_counter_overflow(struct iommu_pmu *iommu_pmu)
>>> +{
>>> +    struct perf_event *event;
>>> +    int i, handled = 0;
>>> +    u64 status;
>>> +
>>> +    /*
>>> +     * Two counters may be overflowed very close.
>>> +     * Always check whether there are more to handle.
>>> +     */
>>
>> Keep comment style consistent, like this
>>
>>         /*
>>          * Two counters may be overflowed very close. Always check whether
>>          * there are more to handle.
>>          */
>>
>> Same to other places.
> 
> Sure.
> 
>>
>>> +    while ((status = dmar_readq(iommu_pmu->overflow))) {
>>
>> Unnecessary double brackets?

I think we need the double brackets to aovid compiler warnning -
"suggest parentheses around assignment used as truth value"

Thanks,
Kan
>>
>>> +        for_each_set_bit(i, (unsigned long *)&status,
>>> iommu_pmu->num_cntr) {
>>> +            handled++;
>>
>> "handled" isn't used anywhere. Please cleanup it.
>>
> 
> Sure.
> 
>>> +
>>> +            /*
>>> +             * Find the assigned event of the counter.
>>> +             * Accumulate the value into the event->count.
>>> +             */
>>> +            event = iommu_pmu->event_list[i];
>>> +            if (WARN_ON_ONCE(!event))
>>
>> Again, do we need a kernel trace here? This is only because the hardware
>> reported an wrong event id, right? How about a pr_warn() to let the user
>> know this?
> 
> The hardware reported ID doesn't match the SW records. It's hard to say
> which one is correct. But I guess pr_warn_once() may be enough. I will
> change it in V2.
> 
> Thanks,
> Kan
> 
>>
>>> +                continue;
>>> +            iommu_pmu_event_update(event);
>>> +        }
>>> +
>>> +        dmar_writeq(iommu_pmu->overflow, status);
>>> +    }
>>> +}
>>> +
>>> +static irqreturn_t iommu_pmu_irq_handler(int irq, void *dev_id)
>>> +{
>>> +    struct intel_iommu *iommu = dev_id;
>>> +
>>> +    if (!dmar_readl(iommu->reg + DMAR_PERFINTRSTS_REG))
>>> +        return IRQ_NONE;
>>> +
>>> +    iommu_pmu_counter_overflow(iommu->pmu);
>>> +
>>> +    /* Clear the status bit */
>>> +    dmar_writel(iommu->reg + DMAR_PERFINTRSTS_REG, DMA_PERFINTRSTS_PIS);
>>> +
>>> +    return IRQ_HANDLED;
>>> +}
>>> +
>>>   static int __iommu_pmu_register(struct intel_iommu *iommu)
>>>   {
>>>       struct iommu_pmu *iommu_pmu = iommu->pmu;
>>> @@ -658,6 +701,38 @@ void free_iommu_pmu(struct intel_iommu *iommu)
>>>       iommu->pmu = NULL;
>>>   }
>>>   +static int iommu_pmu_set_interrupt(struct intel_iommu *iommu)
>>> +{
>>> +    struct iommu_pmu *iommu_pmu = iommu->pmu;
>>> +    int irq, ret;
>>> +
>>> +    irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PERF + iommu->seq_id,
>>> iommu->node, iommu);
>>> +    if (irq <= 0)
>>> +        return -EINVAL;
>>> +
>>> +    snprintf(iommu_pmu->irq_name, sizeof(iommu_pmu->irq_name),
>>> "dmar%d-perf", iommu->seq_id);
>>> +
>>> +    iommu->perf_irq = irq;
>>> +    ret = request_threaded_irq(irq, NULL, iommu_pmu_irq_handler,
>>> +                   IRQF_ONESHOT, iommu_pmu->irq_name, iommu);
>>> +    if (ret) {
>>> +        dmar_free_hwirq(irq);
>>> +        iommu->perf_irq = 0;
>>> +        return ret;
>>> +    }
>>> +    return 0;
>>> +}
>>> +
>>> +static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu)
>>> +{
>>> +    if (!iommu->perf_irq)
>>> +        return;
>>> +
>>> +    free_irq(iommu->perf_irq, iommu);
>>> +    dmar_free_hwirq(iommu->perf_irq);
>>> +    iommu->perf_irq = 0;
>>> +}
>>> +
>>>   static int iommu_pmu_cpu_online(unsigned int cpu)
>>>   {
>>>       if (cpumask_empty(&iommu_pmu_cpu_mask))
>>> @@ -734,8 +809,14 @@ void iommu_pmu_register(struct intel_iommu *iommu)
>>>       if (iommu_pmu_cpuhp_setup(iommu_pmu))
>>>           goto unregister;
>>>   +    /* Set interrupt for overflow */
>>> +    if (iommu_pmu_set_interrupt(iommu))
>>> +        goto cpuhp_free;
>>> +
>>>       return;
>>>   +cpuhp_free:
>>> +    iommu_pmu_cpuhp_free(iommu_pmu);
>>>   unregister:
>>>       perf_pmu_unregister(&iommu_pmu->pmu);
>>>   err:
>>> @@ -749,6 +830,7 @@ void iommu_pmu_unregister(struct intel_iommu *iommu)
>>>       if (!iommu_pmu)
>>>           return;
>>>   +    iommu_pmu_unset_interrupt(iommu);
>>>       iommu_pmu_cpuhp_free(iommu_pmu);
>>>       perf_pmu_unregister(&iommu_pmu->pmu);
>>>   }
>>> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
>>> index c76b66263467..b6c5edd80d5d 100644
>>> --- a/drivers/iommu/intel/svm.c
>>> +++ b/drivers/iommu/intel/svm.c
>>> @@ -79,7 +79,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
>>>       }
>>>       iommu->prq = page_address(pages);
>>>   -    irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id,
>>> iommu->node, iommu);
>>> +    irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id,
>>> iommu->node, iommu);
>>>       if (irq <= 0) {
>>>           pr_err("IOMMU: %s: Failed to create IRQ vector for page
>>> request queue\n",
>>>                  iommu->name);
>>
>> -- 
>> Best regards,
>> baolu
  

Patch

diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index dcafa32875c1..94e314320cd9 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -1887,6 +1887,8 @@  static inline int dmar_msi_reg(struct intel_iommu *iommu, int irq)
 		return DMAR_FECTL_REG;
 	else if (iommu->pr_irq == irq)
 		return DMAR_PECTL_REG;
+	else if (iommu->perf_irq == irq)
+		return DMAR_PERFINTRCTL_REG;
 	else
 		BUG();
 }
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index bbc5dda903e9..f616e4f3d765 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -130,6 +130,8 @@ 
 #define DMAR_PERFCFGOFF_REG	0x310
 #define DMAR_PERFOVFOFF_REG	0x318
 #define DMAR_PERFCNTROFF_REG	0x31c
+#define DMAR_PERFINTRSTS_REG	0x324
+#define DMAR_PERFINTRCTL_REG	0x328
 #define DMAR_PERFEVNTCAP_REG	0x380
 #define DMAR_ECMD_REG		0x400
 #define DMAR_ECEO_REG		0x408
@@ -357,6 +359,9 @@ 
 
 #define DMA_VCS_PAS	((u64)1)
 
+/* PERFINTRSTS_REG */
+#define DMA_PERFINTRSTS_PIS	((u32)1)
+
 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts)			\
 do {									\
 	cycles_t start_time = get_cycles();				\
@@ -630,8 +635,12 @@  struct iommu_pmu {
 	struct pmu		pmu;
 	DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
 	struct perf_event	*event_list[IOMMU_PMU_IDX_MAX];
+	unsigned char		irq_name[16];
 };
 
+#define IOMMU_IRQ_ID_OFFSET_PRQ		(DMAR_UNITS_SUPPORTED)
+#define IOMMU_IRQ_ID_OFFSET_PERF	(2 * DMAR_UNITS_SUPPORTED)
+
 struct intel_iommu {
 	void __iomem	*reg; /* Pointer to hardware regs, virtual addr */
 	u64 		reg_phys; /* physical address of hw register set */
@@ -645,7 +654,7 @@  struct intel_iommu {
 	int		seq_id;	/* sequence id of the iommu */
 	int		agaw; /* agaw of this iommu */
 	int		msagaw; /* max sagaw of this iommu */
-	unsigned int 	irq, pr_irq;
+	unsigned int	irq, pr_irq, perf_irq;
 	u16		segment;     /* PCI segment# */
 	unsigned char 	name[13];    /* Device Name */
 
diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c
index f332232bb345..d0fbf784c827 100644
--- a/drivers/iommu/intel/perfmon.c
+++ b/drivers/iommu/intel/perfmon.c
@@ -476,6 +476,49 @@  static void iommu_pmu_disable(struct pmu *pmu)
 	ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, false, 0);
 }
 
+static void iommu_pmu_counter_overflow(struct iommu_pmu *iommu_pmu)
+{
+	struct perf_event *event;
+	int i, handled = 0;
+	u64 status;
+
+	/*
+	 * Two counters may be overflowed very close.
+	 * Always check whether there are more to handle.
+	 */
+	while ((status = dmar_readq(iommu_pmu->overflow))) {
+		for_each_set_bit(i, (unsigned long *)&status, iommu_pmu->num_cntr) {
+			handled++;
+
+			/*
+			 * Find the assigned event of the counter.
+			 * Accumulate the value into the event->count.
+			 */
+			event = iommu_pmu->event_list[i];
+			if (WARN_ON_ONCE(!event))
+				continue;
+			iommu_pmu_event_update(event);
+		}
+
+		dmar_writeq(iommu_pmu->overflow, status);
+	}
+}
+
+static irqreturn_t iommu_pmu_irq_handler(int irq, void *dev_id)
+{
+	struct intel_iommu *iommu = dev_id;
+
+	if (!dmar_readl(iommu->reg + DMAR_PERFINTRSTS_REG))
+		return IRQ_NONE;
+
+	iommu_pmu_counter_overflow(iommu->pmu);
+
+	/* Clear the status bit */
+	dmar_writel(iommu->reg + DMAR_PERFINTRSTS_REG, DMA_PERFINTRSTS_PIS);
+
+	return IRQ_HANDLED;
+}
+
 static int __iommu_pmu_register(struct intel_iommu *iommu)
 {
 	struct iommu_pmu *iommu_pmu = iommu->pmu;
@@ -658,6 +701,38 @@  void free_iommu_pmu(struct intel_iommu *iommu)
 	iommu->pmu = NULL;
 }
 
+static int iommu_pmu_set_interrupt(struct intel_iommu *iommu)
+{
+	struct iommu_pmu *iommu_pmu = iommu->pmu;
+	int irq, ret;
+
+	irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PERF + iommu->seq_id, iommu->node, iommu);
+	if (irq <= 0)
+		return -EINVAL;
+
+	snprintf(iommu_pmu->irq_name, sizeof(iommu_pmu->irq_name), "dmar%d-perf", iommu->seq_id);
+
+	iommu->perf_irq = irq;
+	ret = request_threaded_irq(irq, NULL, iommu_pmu_irq_handler,
+				   IRQF_ONESHOT, iommu_pmu->irq_name, iommu);
+	if (ret) {
+		dmar_free_hwirq(irq);
+		iommu->perf_irq = 0;
+		return ret;
+	}
+	return 0;
+}
+
+static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu)
+{
+	if (!iommu->perf_irq)
+		return;
+
+	free_irq(iommu->perf_irq, iommu);
+	dmar_free_hwirq(iommu->perf_irq);
+	iommu->perf_irq = 0;
+}
+
 static int iommu_pmu_cpu_online(unsigned int cpu)
 {
 	if (cpumask_empty(&iommu_pmu_cpu_mask))
@@ -734,8 +809,14 @@  void iommu_pmu_register(struct intel_iommu *iommu)
 	if (iommu_pmu_cpuhp_setup(iommu_pmu))
 		goto unregister;
 
+	/* Set interrupt for overflow */
+	if (iommu_pmu_set_interrupt(iommu))
+		goto cpuhp_free;
+
 	return;
 
+cpuhp_free:
+	iommu_pmu_cpuhp_free(iommu_pmu);
 unregister:
 	perf_pmu_unregister(&iommu_pmu->pmu);
 err:
@@ -749,6 +830,7 @@  void iommu_pmu_unregister(struct intel_iommu *iommu)
 	if (!iommu_pmu)
 		return;
 
+	iommu_pmu_unset_interrupt(iommu);
 	iommu_pmu_cpuhp_free(iommu_pmu);
 	perf_pmu_unregister(&iommu_pmu->pmu);
 }
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index c76b66263467..b6c5edd80d5d 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -79,7 +79,7 @@  int intel_svm_enable_prq(struct intel_iommu *iommu)
 	}
 	iommu->prq = page_address(pages);
 
-	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
+	irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu);
 	if (irq <= 0) {
 		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
 		       iommu->name);