[3/6] iommu/arm-smmu-qcom: Add Qualcomm TBU driver

Message ID 20231019021923.13939-4-quic_c_gdjako@quicinc.com
State New
Headers
Series Add support for Translation Buffer Units |

Commit Message

Georgi Djakov Oct. 19, 2023, 2:19 a.m. UTC
  Add driver for the Qualcomm implementation of the ARM MMU-500 TBU.
The driver will enable the resources needed by the TBU and will
configure the registers for some debug features like checking if
there are any pending transactions, capturing transactions and
running ATOS (Address Translation Operations). ATOS/eCATS are used
to manually trigger an address translation of IOVA to physical
address by the SMMU hardware.

Signed-off-by: Georgi Djakov <quic_c_gdjako@quicinc.com>
---
 drivers/iommu/Kconfig                      |   8 +
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 384 +++++++++++++++++++++
 drivers/iommu/arm/arm-smmu/arm-smmu.h      |   2 +
 3 files changed, 394 insertions(+)
  

Comments

Bjorn Andersson Oct. 21, 2023, 9:05 p.m. UTC | #1
On Wed, Oct 18, 2023 at 07:19:20PM -0700, Georgi Djakov wrote:
> Add driver for the Qualcomm implementation of the ARM MMU-500 TBU.
> The driver will enable the resources needed by the TBU and will
> configure the registers for some debug features like checking if
> there are any pending transactions, capturing transactions and
> running ATOS (Address Translation Operations). ATOS/eCATS are used
> to manually trigger an address translation of IOVA to physical
> address by the SMMU hardware.

I still don't think this commit message clearly enough describe the
problem you're trying to solve.

Not until I had read the Kconfig help text did I pay attention to the
significance of the words "some debug features" in the middle of the
paragraph.


Please describe your changes in accordance with [1], i.e. clearly
describe the problem you're trying to solve, then discuss the technical
solution in the patch.

[1] https://docs.kernel.org/process/submitting-patches.html#describe-your-changes

[..]
> diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
[..]
> +#ifdef CONFIG_ARM_SMMU_QCOM_TBU
> +
> +struct qsmmuv500_tbu {
> +	struct device *dev;
> +	struct arm_smmu_device *smmu;
> +	u32 sid_range[2];
> +	struct list_head list;
> +	struct clk *clk;
> +	struct icc_path	*path;
> +	void __iomem *base;
> +	spinlock_t halt_lock; /* protects halt count */

But in particular it makes sure that multiple halt or resume can't
execute concurrently.

> +	int halt_count;
> +};
> +
> +static DEFINE_SPINLOCK(ecats_lock);
> +
> +static struct qsmmuv500_tbu *qsmmuv500_find_tbu(struct qcom_smmu *qsmmu, u32 sid)
> +{
> +	struct qsmmuv500_tbu *tbu = NULL;
> +	u32 start, end;
> +
> +	mutex_lock(&qsmmu->tbu_list_lock);
> +
> +	list_for_each_entry(tbu, &qsmmu->tbu_list, list) {
> +		start = tbu->sid_range[0];
> +		end = start + tbu->sid_range[1];
> +
> +		if (start <= sid && sid < end)
> +			break;
> +	}
> +
> +	mutex_unlock(&qsmmu->tbu_list_lock);
> +
> +	return tbu;
> +}
> +
> +static int qsmmuv500_tbu_halt(struct qsmmuv500_tbu *tbu, struct arm_smmu_domain *smmu_domain)
> +{
> +	struct arm_smmu_device *smmu = smmu_domain->smmu;
> +	int ret = 0, idx = smmu_domain->cfg.cbndx;
> +	unsigned long flags;
> +	u32 val, fsr, status;
> +
> +	spin_lock_irqsave(&tbu->halt_lock, flags);

Does this really need to run with interrupts disabled?

> +	if (tbu->halt_count) {
> +		tbu->halt_count++;
> +		goto out;
> +	}
> +
[..]
> +static phys_addr_t qsmmuv500_iova_to_phys(struct arm_smmu_domain *smmu_domain,
> +					  dma_addr_t iova, u32 sid)
> +{
[..]
> +	/* Only one concurrent atos operation */
> +	spin_lock_irqsave(&ecats_lock, flags);

Does this require interrupts to be disabled?

> +
> +	/*
> +	 * After a failed translation, the next successful translation will
> +	 * incorrectly be reported as a failure.

"So if the ECATS translation fails, attempt the lookup more time."

> +	 */
> +	do {
> +		phys = qsmmuv500_tbu_trigger_atos(smmu_domain, tbu, iova, sid);
> +
> +		fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
> +		if (fsr & ARM_SMMU_FSR_FAULT) {
> +			/* Clear pending interrupts */
> +			arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
> +			/*
> +			 * Barrier required to ensure that the FSR is cleared
> +			 * before resuming SMMU operation.
> +			 */

Better be clear on what this actually does, for future readers' sake:

	 /* Ensure that FSR and RESUME operations aren't reordered. */

But is this really necessary, the two writes are for the same device,
can they still be reordered?

> +			wmb();
> +
> +			if (fsr & ARM_SMMU_FSR_SS)
> +				arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME,
> +						  ARM_SMMU_RESUME_TERMINATE);
> +		}
> +	} while (!phys && needs_redo++ < 2);

"needs_redo" sounds like a boolean to me. I think "attempt" would be a
better fit here.

> +
> +	arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr_orig);
> +	spin_unlock_irqrestore(&ecats_lock, flags);
> +	qsmmuv500_tbu_resume(tbu);
> +
> +	/* Read to complete prior write transcations */
> +	readl_relaxed(tbu->base + DEBUG_SR_HALT_ACK_REG);
> +
> +	/* Wait for read to complete */

That's not what rmb() does. You don't need to do anything here,
readl_relaxed() returns when the read is done.

> +	rmb();
> +
> +disable_clk:
> +	clk_disable_unprepare(tbu->clk);
> +disable_icc:
> +	icc_set_bw(tbu->path, 0, 0);
> +
> +	return phys;
> +}
> +#endif
> +
>  static void qcom_smmu_tlb_sync(struct arm_smmu_device *smmu, int page,
>  				int sync, int status)
>  {
> @@ -588,3 +895,80 @@ struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu)
>  
>  	return smmu;
>  }
> +
> +#ifdef CONFIG_ARM_SMMU_QCOM_TBU
> +
> +static const struct of_device_id qsmmuv500_tbu_of_match[] = {
> +	{ .compatible = "qcom,qsmmuv500-tbu" },
> +	{ }
> +};

Place this below the remove function, as most other drivers do.

> +
> +static int qsmmuv500_tbu_probe(struct platform_device *pdev)
> +{
[..]
> +	mutex_lock(&qsmmu->tbu_list_lock);
> +	list_add_tail(&tbu->list, &qsmmu->tbu_list);

"tbu" is devres allocated, but you don't pull it off the list (or
synchronize) during remove.

> +	mutex_unlock(&qsmmu->tbu_list_lock);
> +
> +	dev_set_drvdata(dev, tbu);
> +
> +	return 0;
> +}
> +
> +static void qsmmuv500_tbu_remove(struct platform_device *pdev)
> +{
> +	struct qsmmuv500_tbu *tbu = dev_get_drvdata(&pdev->dev);
> +
> +	clk_disable_unprepare(tbu->clk);

This isn't balanced.

> +	clk_put(tbu->clk);
> +	icc_put(tbu->path);
> +}
> +
> +static struct platform_driver qsmmuv500_tbu_driver = {
> +	.driver = {
> +		.name           = "qsmmuv500-tbu",
> +		.of_match_table = of_match_ptr(qsmmuv500_tbu_of_match),

Won't of_match_ptr() result in a build warning if built without
CONFIG_OF?

> +	},
> +	.probe  = qsmmuv500_tbu_probe,
> +	.remove_new = qsmmuv500_tbu_remove,
> +};
> +module_platform_driver(qsmmuv500_tbu_driver);

This file acts as a library for the arm-smmu driver today, adding a
platform_driver here makes it look like this is a separate driver.

Which makes me wonder, why is this a separate driver? Why not just
loop over the subnodes and build the tbu_list in qcom_smmu_impl_init()?

Regards,
Bjorn
  
kernel test robot Oct. 26, 2023, 3:25 p.m. UTC | #2
Hi Georgi,

kernel test robot noticed the following build errors:

[auto build test ERROR on robh/for-next]
[also build test ERROR on arm/for-next arm/fixes arm64/for-next/core clk/clk-next kvmarm/next rockchip/for-next shawnguo/for-next soc/for-next linus/master v6.6-rc7 next-20231025]
[cannot apply to joro-iommu/next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Georgi-Djakov/dt-bindings-iommu-Add-Translation-Buffer-Unit-bindings/20231019-102257
base:   https://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git for-next
patch link:    https://lore.kernel.org/r/20231019021923.13939-4-quic_c_gdjako%40quicinc.com
patch subject: [PATCH 3/6] iommu/arm-smmu-qcom: Add Qualcomm TBU driver
config: arm64-allmodconfig (https://download.01.org/0day-ci/archive/20231026/202310262330.pfzI76DH-lkp@intel.com/config)
compiler: aarch64-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231026/202310262330.pfzI76DH-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310262330.pfzI76DH-lkp@intel.com/

All errors (new ones prefixed by >>):

   aarch64-linux-ld: drivers/iommu/arm/arm-smmu/arm-smmu-qcom.o: in function `qsmmuv500_tbu_driver_init':
>> arm-smmu-qcom.c:(.init.text+0x8): multiple definition of `init_module'; drivers/iommu/arm/arm-smmu/arm-smmu.o:arm-smmu.c:(.init.text+0x8): first defined here
   aarch64-linux-ld: drivers/iommu/arm/arm-smmu/arm-smmu-qcom.o: in function `qsmmuv500_tbu_driver_exit':
>> arm-smmu-qcom.c:(.exit.text+0x0): multiple definition of `cleanup_module'; drivers/iommu/arm/arm-smmu/arm-smmu.o:arm-smmu.c:(.exit.text+0x0): first defined here
  
Georgi Djakov Nov. 18, 2023, 2:26 a.m. UTC | #3
On 10/22/2023 12:05 AM, Bjorn Andersson wrote:
> On Wed, Oct 18, 2023 at 07:19:20PM -0700, Georgi Djakov wrote:
>> Add driver for the Qualcomm implementation of the ARM MMU-500 TBU.
>> The driver will enable the resources needed by the TBU and will
>> configure the registers for some debug features like checking if
>> there are any pending transactions, capturing transactions and
>> running ATOS (Address Translation Operations). ATOS/eCATS are used
>> to manually trigger an address translation of IOVA to physical
>> address by the SMMU hardware.
> 
> I still don't think this commit message clearly enough describe the
> problem you're trying to solve.
> 
> Not until I had read the Kconfig help text did I pay attention to the
> significance of the words "some debug features" in the middle of the
> paragraph.
> 
> 
> Please describe your changes in accordance with [1], i.e. clearly
> describe the problem you're trying to solve, then discuss the technical
> solution in the patch.

Thanks Bjorn, I'll try to improve it! 

> [1] https://docs.kernel.org/process/submitting-patches.html#describe-your-changes
> 
> [..]
>> diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
> [..]
>> +#ifdef CONFIG_ARM_SMMU_QCOM_TBU
>> +
>> +struct qsmmuv500_tbu {
>> +	struct device *dev;
>> +	struct arm_smmu_device *smmu;
>> +	u32 sid_range[2];
>> +	struct list_head list;
>> +	struct clk *clk;
>> +	struct icc_path	*path;
>> +	void __iomem *base;
>> +	spinlock_t halt_lock; /* protects halt count */
> 
> But in particular it makes sure that multiple halt or resume can't
> execute concurrently.

Exactly. Will mention it. 

>> +	int halt_count;
>> +};
>> +
>> +static DEFINE_SPINLOCK(ecats_lock);
>> +
>> +static struct qsmmuv500_tbu *qsmmuv500_find_tbu(struct qcom_smmu *qsmmu, u32 sid)
>> +{
>> +	struct qsmmuv500_tbu *tbu = NULL;
>> +	u32 start, end;
>> +
>> +	mutex_lock(&qsmmu->tbu_list_lock);
>> +
>> +	list_for_each_entry(tbu, &qsmmu->tbu_list, list) {
>> +		start = tbu->sid_range[0];
>> +		end = start + tbu->sid_range[1];
>> +
>> +		if (start <= sid && sid < end)
>> +			break;
>> +	}
>> +
>> +	mutex_unlock(&qsmmu->tbu_list_lock);
>> +
>> +	return tbu;
>> +}
>> +
>> +static int qsmmuv500_tbu_halt(struct qsmmuv500_tbu *tbu, struct arm_smmu_domain *smmu_domain)
>> +{
>> +	struct arm_smmu_device *smmu = smmu_domain->smmu;
>> +	int ret = 0, idx = smmu_domain->cfg.cbndx;
>> +	unsigned long flags;
>> +	u32 val, fsr, status;
>> +
>> +	spin_lock_irqsave(&tbu->halt_lock, flags);
> 
> Does this really need to run with interrupts disabled?

This is being executed in threaded irq context. 

>> +	if (tbu->halt_count) {
>> +		tbu->halt_count++;
>> +		goto out;
>> +	}
>> +
> [..]
>> +static phys_addr_t qsmmuv500_iova_to_phys(struct arm_smmu_domain *smmu_domain,
>> +					  dma_addr_t iova, u32 sid)
>> +{
> [..]
>> +	/* Only one concurrent atos operation */
>> +	spin_lock_irqsave(&ecats_lock, flags);
> 
> Does this require interrupts to be disabled?

This also runs in the irq handler during context fault.

>> +
>> +	/*
>> +	 * After a failed translation, the next successful translation will
>> +	 * incorrectly be reported as a failure.
> 
> "So if the ECATS translation fails, attempt the lookup more time."
> 
>> +	 */
>> +	do {
>> +		phys = qsmmuv500_tbu_trigger_atos(smmu_domain, tbu, iova, sid);
>> +
>> +		fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
>> +		if (fsr & ARM_SMMU_FSR_FAULT) {
>> +			/* Clear pending interrupts */
>> +			arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
>> +			/*
>> +			 * Barrier required to ensure that the FSR is cleared
>> +			 * before resuming SMMU operation.
>> +			 */
> 
> Better be clear on what this actually does, for future readers' sake:
> 
> 	 /* Ensure that FSR and RESUME operations aren't reordered. */
> 
> But is this really necessary, the two writes are for the same device,
> can they still be reordered?

Right, these are to the same endpoint. It can be dropped. 

>> +			wmb();
>> +
>> +			if (fsr & ARM_SMMU_FSR_SS)
>> +				arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME,
>> +						  ARM_SMMU_RESUME_TERMINATE);
>> +		}
>> +	} while (!phys && needs_redo++ < 2);
> 
> "needs_redo" sounds like a boolean to me. I think "attempt" would be a
> better fit here.
> 

Ok.

>> +
>> +	arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr_orig);
>> +	spin_unlock_irqrestore(&ecats_lock, flags);
>> +	qsmmuv500_tbu_resume(tbu);
>> +
>> +	/* Read to complete prior write transcations */
>> +	readl_relaxed(tbu->base + DEBUG_SR_HALT_ACK_REG);
>> +
>> +	/* Wait for read to complete */
> 
> That's not what rmb() does. You don't need to do anything here,
> readl_relaxed() returns when the read is done.

Ack.

>> +	rmb();
>> +
>> +disable_clk:
>> +	clk_disable_unprepare(tbu->clk);
>> +disable_icc:
>> +	icc_set_bw(tbu->path, 0, 0);
>> +
>> +	return phys;
>> +}
>> +#endif
>> +
>>  static void qcom_smmu_tlb_sync(struct arm_smmu_device *smmu, int page,
>>  				int sync, int status)
>>  {
>> @@ -588,3 +895,80 @@ struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu)
>>  
>>  	return smmu;
>>  }
>> +
>> +#ifdef CONFIG_ARM_SMMU_QCOM_TBU
>> +
>> +static const struct of_device_id qsmmuv500_tbu_of_match[] = {
>> +	{ .compatible = "qcom,qsmmuv500-tbu" },
>> +	{ }
>> +};
> 
> Place this below the remove function, as most other drivers do.

Ack.

>> +
>> +static int qsmmuv500_tbu_probe(struct platform_device *pdev)
>> +{
> [..]
>> +	mutex_lock(&qsmmu->tbu_list_lock);
>> +	list_add_tail(&tbu->list, &qsmmu->tbu_list);
> 
> "tbu" is devres allocated, but you don't pull it off the list (or
> synchronize) during remove.

Right, but I'll just make this a builtin.

>> +	mutex_unlock(&qsmmu->tbu_list_lock);
>> +
>> +	dev_set_drvdata(dev, tbu);
>> +
>> +	return 0;
>> +}
>> +
>> +static void qsmmuv500_tbu_remove(struct platform_device *pdev)
>> +{
>> +	struct qsmmuv500_tbu *tbu = dev_get_drvdata(&pdev->dev);
>> +
>> +	clk_disable_unprepare(tbu->clk);
> 
> This isn't balanced.
> 
>> +	clk_put(tbu->clk);
>> +	icc_put(tbu->path);
>> +}
>> +
>> +static struct platform_driver qsmmuv500_tbu_driver = {
>> +	.driver = {
>> +		.name           = "qsmmuv500-tbu",
>> +		.of_match_table = of_match_ptr(qsmmuv500_tbu_of_match),
> 
> Won't of_match_ptr() result in a build warning if built without
> CONFIG_OF?

Will drop.

>> +	},
>> +	.probe  = qsmmuv500_tbu_probe,
>> +	.remove_new = qsmmuv500_tbu_remove,
>> +};
>> +module_platform_driver(qsmmuv500_tbu_driver);
> 
> This file acts as a library for the arm-smmu driver today, adding a
> platform_driver here makes it look like this is a separate driver.
> 
> Which makes me wonder, why is this a separate driver? Why not just
> loop over the subnodes and build the tbu_list in qcom_smmu_impl_init()?
> 

I am using the platform framework in order to get a more compact code
for this optional driver, but it adds some overhead.. I'll start with
moving all the TBU stuff into a separate file and will try meanwhile
your suggestion..

Thanks,
Georgi
  

Patch

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 2b12b583ef4b..2b024d65b921 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -377,6 +377,14 @@  config ARM_SMMU_QCOM
 	  When running on a Qualcomm platform that has the custom variant
 	  of the ARM SMMU, this needs to be built into the SMMU driver.
 
+config ARM_SMMU_QCOM_TBU
+	bool "Qualcomm TBU driver"
+	depends on ARM_SMMU_QCOM
+	help
+	  The SMMU on Qualcomm platform may include a Translation Buffer
+	  Units (TBUs) for each master. Enabling support for these will
+	  allow operating the TBUs to help debugging context faults.
+
 config ARM_SMMU_QCOM_DEBUG
 	bool "ARM SMMU QCOM implementation defined debug support"
 	depends on ARM_SMMU_QCOM
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index 655c7f50ca84..d20263a1cd2c 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -10,17 +10,324 @@ 
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
 #include <linux/firmware/qcom/qcom_scm.h>
+#include <linux/interconnect.h>
+#include <linux/iopoll.h>
 
 #include "arm-smmu.h"
 #include "arm-smmu-qcom.h"
 
 #define QCOM_DUMMY_VAL	-1
 
+#define TBU_DBG_TIMEOUT_US		100
+#define DEBUG_AXUSER_REG		0x30
+#define DEBUG_AXUSER_CDMID		GENMASK_ULL(43, 36)
+#define DEBUG_AXUSER_CDMID_VAL		0xff
+#define DEBUG_PAR_REG			0x28
+#define DEBUG_PAR_FAULT_VAL		BIT(0)
+#define DEBUG_PAR_PA			GENMASK_ULL(47, 12)
+#define DEBUG_SID_HALT_REG		0x0
+#define DEBUG_SID_HALT_VAL		BIT(16)
+#define DEBUG_SID_HALT_SID		GENMASK(9, 0)
+#define DEBUG_SR_HALT_ACK_REG		0x20
+#define DEBUG_SR_HALT_ACK_VAL		BIT(1)
+#define DEBUG_SR_ECATS_RUNNING_VAL	BIT(0)
+#define DEBUG_TXN_AXCACHE		GENMASK(5, 2)
+#define DEBUG_TXN_AXPROT		GENMASK(8, 6)
+#define DEBUG_TXN_AXPROT_PRIV		0x1
+#define DEBUG_TXN_AXPROT_NSEC		0x2
+#define DEBUG_TXN_TRIGG_REG		0x18
+#define DEBUG_TXN_TRIGGER		BIT(0)
+#define DEBUG_VA_ADDR_REG		0x8
+
 static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu)
 {
 	return container_of(smmu, struct qcom_smmu, smmu);
 }
 
+#ifdef CONFIG_ARM_SMMU_QCOM_TBU
+
+struct qsmmuv500_tbu {
+	struct device *dev;
+	struct arm_smmu_device *smmu;
+	u32 sid_range[2];
+	struct list_head list;
+	struct clk *clk;
+	struct icc_path	*path;
+	void __iomem *base;
+	spinlock_t halt_lock; /* protects halt count */
+	int halt_count;
+};
+
+static DEFINE_SPINLOCK(ecats_lock);
+
+static struct qsmmuv500_tbu *qsmmuv500_find_tbu(struct qcom_smmu *qsmmu, u32 sid)
+{
+	struct qsmmuv500_tbu *tbu = NULL;
+	u32 start, end;
+
+	mutex_lock(&qsmmu->tbu_list_lock);
+
+	list_for_each_entry(tbu, &qsmmu->tbu_list, list) {
+		start = tbu->sid_range[0];
+		end = start + tbu->sid_range[1];
+
+		if (start <= sid && sid < end)
+			break;
+	}
+
+	mutex_unlock(&qsmmu->tbu_list_lock);
+
+	return tbu;
+}
+
+static int qsmmuv500_tbu_halt(struct qsmmuv500_tbu *tbu, struct arm_smmu_domain *smmu_domain)
+{
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	int ret = 0, idx = smmu_domain->cfg.cbndx;
+	unsigned long flags;
+	u32 val, fsr, status;
+
+	spin_lock_irqsave(&tbu->halt_lock, flags);
+	if (tbu->halt_count) {
+		tbu->halt_count++;
+		goto out;
+	}
+
+	val = readl_relaxed(tbu->base + DEBUG_SID_HALT_REG);
+	val |= DEBUG_SID_HALT_VAL;
+	writel_relaxed(val, tbu->base + DEBUG_SID_HALT_REG);
+
+	fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
+	if ((fsr & ARM_SMMU_FSR_FAULT) && (fsr & ARM_SMMU_FSR_SS)) {
+		u32 sctlr_orig, sctlr;
+
+		/*
+		 * We are in a fault. Our request to halt the bus will not
+		 * complete until transactions in front of us (such as the fault
+		 * itself) have completed. Disable iommu faults and terminate
+		 * any existing transactions.
+		 */
+		sctlr_orig = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_SCTLR);
+		sctlr = sctlr_orig & ~(ARM_SMMU_SCTLR_CFCFG | ARM_SMMU_SCTLR_CFIE);
+		arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr);
+		arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
+
+		/* Ensure that the FSR is cleared */
+		wmb();
+
+		arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, ARM_SMMU_RESUME_TERMINATE);
+		arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr_orig);
+	}
+
+	if (readl_poll_timeout_atomic(tbu->base + DEBUG_SR_HALT_ACK_REG, status,
+				      (status & DEBUG_SR_HALT_ACK_VAL),
+				      0, TBU_DBG_TIMEOUT_US)) {
+		dev_err(tbu->dev, "Timeout while trying to halt TBU!\n");
+		ret = -ETIMEDOUT;
+
+		val = readl_relaxed(tbu->base + DEBUG_SID_HALT_REG);
+		val &= ~DEBUG_SID_HALT_VAL;
+		writel_relaxed(val, tbu->base + DEBUG_SID_HALT_REG);
+
+		goto out;
+	}
+
+	tbu->halt_count = 1;
+
+out:
+	spin_unlock_irqrestore(&tbu->halt_lock, flags);
+	return ret;
+}
+
+static void qsmmuv500_tbu_resume(struct qsmmuv500_tbu *tbu)
+{
+	unsigned long flags;
+	u32 val;
+
+	spin_lock_irqsave(&tbu->halt_lock, flags);
+	if (!tbu->halt_count) {
+		WARN(1, "%s: halt_count is 0", dev_name(tbu->dev));
+		goto out;
+	}
+
+	if (tbu->halt_count > 1) {
+		tbu->halt_count--;
+		goto out;
+	}
+
+	val = readl_relaxed(tbu->base + DEBUG_SID_HALT_REG);
+	val &= ~DEBUG_SID_HALT_VAL;
+	writel_relaxed(val, tbu->base + DEBUG_SID_HALT_REG);
+
+	tbu->halt_count = 0;
+out:
+	spin_unlock_irqrestore(&tbu->halt_lock, flags);
+}
+
+static phys_addr_t qsmmuv500_tbu_trigger_atos(struct arm_smmu_domain *smmu_domain,
+					      struct qsmmuv500_tbu *tbu, dma_addr_t iova, u32 sid)
+{
+	bool ecats_timedout = false;
+	phys_addr_t phys = 0;
+	ktime_t timeout;
+	u64 val;
+
+	/* Set address and stream-id */
+	val = readq_relaxed(tbu->base + DEBUG_SID_HALT_REG);
+	val &= ~DEBUG_SID_HALT_SID;
+	val |= FIELD_PREP(DEBUG_SID_HALT_SID, sid);
+	writeq_relaxed(val, tbu->base + DEBUG_SID_HALT_REG);
+	writeq_relaxed(iova, tbu->base + DEBUG_VA_ADDR_REG);
+	val = FIELD_PREP(DEBUG_AXUSER_CDMID, DEBUG_AXUSER_CDMID_VAL);
+	writeq_relaxed(val, tbu->base + DEBUG_AXUSER_REG);
+
+	/* Write-back read and write-allocate */
+	val = FIELD_PREP(DEBUG_TXN_AXCACHE, 0xF);
+
+	/* Non-secure access */
+	val |= FIELD_PREP(DEBUG_TXN_AXPROT, DEBUG_TXN_AXPROT_NSEC);
+
+	/* Priviledged access */
+	val |= FIELD_PREP(DEBUG_TXN_AXPROT, DEBUG_TXN_AXPROT_PRIV);
+
+	val |= DEBUG_TXN_TRIGGER;
+	writeq_relaxed(val, tbu->base + DEBUG_TXN_TRIGG_REG);
+
+	timeout = ktime_add_us(ktime_get(), TBU_DBG_TIMEOUT_US);
+	for (;;) {
+		val = readl_relaxed(tbu->base + DEBUG_SR_HALT_ACK_REG);
+		if (!(val & DEBUG_SR_ECATS_RUNNING_VAL))
+			break;
+		val = readl_relaxed(tbu->base + DEBUG_PAR_REG);
+		if (val & DEBUG_PAR_FAULT_VAL)
+			break;
+		if (ktime_compare(ktime_get(), timeout) > 0) {
+			ecats_timedout = true;
+			break;
+		}
+	}
+
+	val = readq_relaxed(tbu->base + DEBUG_PAR_REG);
+	if (val & DEBUG_PAR_FAULT_VAL)
+		dev_err(tbu->dev, "ECATS generated a fault interrupt! PAR = %llx, SID=0x%x\n",
+			val, sid);
+	else if (ecats_timedout)
+		dev_err_ratelimited(tbu->dev, "ECATS translation timed out!\n");
+	else
+		phys = FIELD_GET(DEBUG_PAR_PA, val);
+
+	/* Reset hardware */
+	writeq_relaxed(0, tbu->base + DEBUG_TXN_TRIGG_REG);
+	writeq_relaxed(0, tbu->base + DEBUG_VA_ADDR_REG);
+	val = readl_relaxed(tbu->base + DEBUG_SID_HALT_REG);
+	val &= ~DEBUG_SID_HALT_SID;
+	writel_relaxed(val, tbu->base + DEBUG_SID_HALT_REG);
+
+	return phys;
+}
+
+static phys_addr_t qsmmuv500_iova_to_phys(struct arm_smmu_domain *smmu_domain,
+					  dma_addr_t iova, u32 sid)
+{
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	struct qcom_smmu *qsmmu = to_qcom_smmu(smmu);
+	int idx = smmu_domain->cfg.cbndx;
+	struct qsmmuv500_tbu *tbu;
+	u32 sctlr_orig, sctlr;
+	phys_addr_t phys = 0;
+	unsigned long flags;
+	int needs_redo = 0;
+	int ret;
+	u64 fsr;
+
+	tbu = qsmmuv500_find_tbu(qsmmu, sid);
+	if (!tbu)
+		return 0;
+
+	ret = icc_set_bw(tbu->path, 0, UINT_MAX);
+	if (ret)
+		return ret;
+
+	ret = clk_prepare_enable(tbu->clk);
+	if (ret)
+		goto disable_icc;
+
+	ret = qsmmuv500_tbu_halt(tbu, smmu_domain);
+	if (ret)
+		goto disable_clk;
+
+	/*
+	 * ECATS can trigger the fault interrupt, so disable it temporarily
+	 * and check for an interrupt manually.
+	 */
+	sctlr_orig = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_SCTLR);
+	sctlr = sctlr_orig & ~(ARM_SMMU_SCTLR_CFCFG | ARM_SMMU_SCTLR_CFIE);
+	arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr);
+
+	fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
+	if (fsr & ARM_SMMU_FSR_FAULT) {
+		/* Clear pending interrupts */
+		arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
+		/*
+		 * Barrier required to ensure that the FSR is cleared
+		 * before resuming SMMU operation.
+		 */
+		wmb();
+
+		/*
+		 * TBU halt takes care of resuming any stalled transcation.
+		 * Kept it here for completeness sake.
+		 */
+		if (fsr & ARM_SMMU_FSR_SS)
+			arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME,
+					  ARM_SMMU_RESUME_TERMINATE);
+	}
+
+	/* Only one concurrent atos operation */
+	spin_lock_irqsave(&ecats_lock, flags);
+
+	/*
+	 * After a failed translation, the next successful translation will
+	 * incorrectly be reported as a failure.
+	 */
+	do {
+		phys = qsmmuv500_tbu_trigger_atos(smmu_domain, tbu, iova, sid);
+
+		fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
+		if (fsr & ARM_SMMU_FSR_FAULT) {
+			/* Clear pending interrupts */
+			arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
+			/*
+			 * Barrier required to ensure that the FSR is cleared
+			 * before resuming SMMU operation.
+			 */
+			wmb();
+
+			if (fsr & ARM_SMMU_FSR_SS)
+				arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME,
+						  ARM_SMMU_RESUME_TERMINATE);
+		}
+	} while (!phys && needs_redo++ < 2);
+
+	arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr_orig);
+	spin_unlock_irqrestore(&ecats_lock, flags);
+	qsmmuv500_tbu_resume(tbu);
+
+	/* Read to complete prior write transcations */
+	readl_relaxed(tbu->base + DEBUG_SR_HALT_ACK_REG);
+
+	/* Wait for read to complete */
+	rmb();
+
+disable_clk:
+	clk_disable_unprepare(tbu->clk);
+disable_icc:
+	icc_set_bw(tbu->path, 0, 0);
+
+	return phys;
+}
+#endif
+
 static void qcom_smmu_tlb_sync(struct arm_smmu_device *smmu, int page,
 				int sync, int status)
 {
@@ -588,3 +895,80 @@  struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu)
 
 	return smmu;
 }
+
+#ifdef CONFIG_ARM_SMMU_QCOM_TBU
+
+static const struct of_device_id qsmmuv500_tbu_of_match[] = {
+	{ .compatible = "qcom,qsmmuv500-tbu" },
+	{ }
+};
+
+static int qsmmuv500_tbu_probe(struct platform_device *pdev)
+{
+	struct device_node *np = pdev->dev.of_node;
+	struct device *dev = &pdev->dev;
+	struct arm_smmu_device *smmu;
+	struct qsmmuv500_tbu *tbu;
+	struct qcom_smmu *qsmmu;
+	int ret;
+
+	smmu = dev_get_drvdata(dev->parent);
+	if (!smmu)
+		return -EPROBE_DEFER;
+
+	qsmmu = to_qcom_smmu(smmu);
+
+	tbu = devm_kzalloc(dev, sizeof(*tbu), GFP_KERNEL);
+	if (!tbu)
+		return -ENOMEM;
+
+	tbu->dev = dev;
+	INIT_LIST_HEAD(&tbu->list);
+	spin_lock_init(&tbu->halt_lock);
+
+	tbu->base = devm_of_iomap(dev, np, 0, NULL);
+	if (IS_ERR(tbu->base))
+		return PTR_ERR(tbu->base);
+
+	ret = of_property_read_u32_array(np, "qcom,stream-id-range", tbu->sid_range, 2);
+	if (ret) {
+		dev_err(dev, "The DT property 'qcom,stream-id-range' is mandatory\n");
+		return ret;
+	}
+
+	tbu->clk = devm_clk_get_optional(dev, NULL);
+	if (IS_ERR(tbu->clk))
+		return PTR_ERR(tbu->clk);
+
+	tbu->path = devm_of_icc_get(dev, NULL);
+	if (IS_ERR(tbu->path))
+		return PTR_ERR(tbu->path);
+
+	mutex_lock(&qsmmu->tbu_list_lock);
+	list_add_tail(&tbu->list, &qsmmu->tbu_list);
+	mutex_unlock(&qsmmu->tbu_list_lock);
+
+	dev_set_drvdata(dev, tbu);
+
+	return 0;
+}
+
+static void qsmmuv500_tbu_remove(struct platform_device *pdev)
+{
+	struct qsmmuv500_tbu *tbu = dev_get_drvdata(&pdev->dev);
+
+	clk_disable_unprepare(tbu->clk);
+	clk_put(tbu->clk);
+	icc_put(tbu->path);
+}
+
+static struct platform_driver qsmmuv500_tbu_driver = {
+	.driver = {
+		.name           = "qsmmuv500-tbu",
+		.of_match_table = of_match_ptr(qsmmuv500_tbu_of_match),
+	},
+	.probe  = qsmmuv500_tbu_probe,
+	.remove_new = qsmmuv500_tbu_remove,
+};
+module_platform_driver(qsmmuv500_tbu_driver);
+#endif
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index 703fd5817ec1..e5df65c0f81a 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -136,6 +136,7 @@  enum arm_smmu_cbar_type {
 #define ARM_SMMU_CBAR_VMID		GENMASK(7, 0)
 
 #define ARM_SMMU_GR1_CBFRSYNRA(n)	(0x400 + ((n) << 2))
+#define ARM_SMMU_CBFRSYNRA_SID		GENMASK(15, 0)
 
 #define ARM_SMMU_GR1_CBA2R(n)		(0x800 + ((n) << 2))
 #define ARM_SMMU_CBA2R_VMID16		GENMASK(31, 16)
@@ -238,6 +239,7 @@  enum arm_smmu_cbar_type {
 #define ARM_SMMU_CB_ATSR		0x8f0
 #define ARM_SMMU_ATSR_ACTIVE		BIT(0)
 
+#define ARM_SMMU_RESUME_TERMINATE	BIT(0)
 
 /* Maximum number of context banks per SMMU */
 #define ARM_SMMU_MAX_CBS		128