[3/7] iommu: dart: Support >64 stream IDs

Message ID 20230104110013.24738-4-marcan@marcan.st
State New
Headers
Series iommu: dart: Apple t8110 DART support |

Commit Message

Hector Martin Jan. 4, 2023, 11 a.m. UTC
  T8110 DARTs have up to 256 SIDs, so we need to switch to a bitmap to
handle them properly.

Signed-off-by: Hector Martin <marcan@marcan.st>
---
 drivers/iommu/apple-dart.c | 114 +++++++++++++++++++++++--------------
 1 file changed, 71 insertions(+), 43 deletions(-)
  

Comments

Sven Peter Jan. 4, 2023, 1:37 p.m. UTC | #1
On Wed, Jan 4, 2023, at 12:00, Hector Martin wrote:
> T8110 DARTs have up to 256 SIDs, so we need to switch to a bitmap to
> handle them properly.
>
> Signed-off-by: Hector Martin <marcan@marcan.st>
> ---
>  drivers/iommu/apple-dart.c | 114 +++++++++++++++++++++++--------------
>  1 file changed, 71 insertions(+), 43 deletions(-)
>
> diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c
> index 2458416122f8..48743bcd5b9d 100644
> --- a/drivers/iommu/apple-dart.c
> +++ b/drivers/iommu/apple-dart.c
> @@ -34,11 +34,10 @@
> 
>  #include "dma-iommu.h"
> 
> -#define DART_MAX_STREAMS 16
> +#define DART_MAX_STREAMS 256

Feels a bit wasteful to allocate 256-wide sid2group and save_{tcr,ttbr}
arrays even for the M1, where 16 are enough. But then again, that's still <100 KiB
for all DARTs combined and these machines have >8 GiB of RAM, so it probably won't
make a difference.


>  #define DART_MAX_TTBR 4
>  #define MAX_DARTS_PER_DEVICE 2
> 
> -#define DART_STREAM_ALL 0xffff
> 
>  #define DART_PARAMS1 0x00
>  #define DART_PARAMS_PAGE_SHIFT GENMASK(27, 24)
> @@ -85,6 +84,8 @@
>  struct apple_dart_hw {
>  	u32 oas;
>  	enum io_pgtable_fmt fmt;
> +
> +	int max_sid_count;
>  };
> 
>  /*
> @@ -116,6 +117,7 @@ struct apple_dart {
>  	spinlock_t lock;
> 
>  	u32 pgsize;
> +	u32 num_streams;
>  	u32 supports_bypass : 1;
>  	u32 force_bypass : 1;
> 
> @@ -143,11 +145,11 @@ struct apple_dart {
>   */
>  struct apple_dart_stream_map {
>  	struct apple_dart *dart;
> -	unsigned long sidmap;
> +	DECLARE_BITMAP(sidmap, DART_MAX_STREAMS);
>  };
>  struct apple_dart_atomic_stream_map {
>  	struct apple_dart *dart;
> -	atomic64_t sidmap;
> +	atomic_long_t sidmap[BITS_TO_LONGS(DART_MAX_STREAMS)];
>  };
> 
>  /*
> @@ -205,50 +207,55 @@ static struct apple_dart_domain 
> *to_dart_domain(struct iommu_domain *dom)
>  static void
>  apple_dart_hw_enable_translation(struct apple_dart_stream_map 
> *stream_map)
>  {
> +	struct apple_dart *dart = stream_map->dart;
>  	int sid;
> 
> -	for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
> +	for_each_set_bit(sid, stream_map->sidmap, dart->num_streams)
>  		writel(DART_TCR_TRANSLATE_ENABLE,
> -		       stream_map->dart->regs + DART_TCR(sid));
> +		       dart->regs + DART_TCR(sid));
>  }
> 
>  static void apple_dart_hw_disable_dma(struct apple_dart_stream_map *stream_map)
>  {
> +	struct apple_dart *dart = stream_map->dart;
>  	int sid;
> 
> -	for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
> -		writel(0, stream_map->dart->regs + DART_TCR(sid));
> +	for_each_set_bit(sid, stream_map->sidmap, dart->num_streams)
> +		writel(0, dart->regs + DART_TCR(sid));
>  }
> 
>  static void
>  apple_dart_hw_enable_bypass(struct apple_dart_stream_map *stream_map)
>  {
> +	struct apple_dart *dart = stream_map->dart;
>  	int sid;
> 
>  	WARN_ON(!stream_map->dart->supports_bypass);
> -	for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
> +	for_each_set_bit(sid, stream_map->sidmap, dart->num_streams)
>  		writel(DART_TCR_BYPASS0_ENABLE | DART_TCR_BYPASS1_ENABLE,
> -		       stream_map->dart->regs + DART_TCR(sid));
> +		       dart->regs + DART_TCR(sid));
>  }
> 
>  static void apple_dart_hw_set_ttbr(struct apple_dart_stream_map *stream_map,
>  				   u8 idx, phys_addr_t paddr)
>  {
> +	struct apple_dart *dart = stream_map->dart;
>  	int sid;
> 
>  	WARN_ON(paddr & ((1 << DART_TTBR_SHIFT) - 1));
> -	for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
> +	for_each_set_bit(sid, stream_map->sidmap, dart->num_streams)
>  		writel(DART_TTBR_VALID | (paddr >> DART_TTBR_SHIFT),
> -		       stream_map->dart->regs + DART_TTBR(sid, idx));
> +		       dart->regs + DART_TTBR(sid, idx));
>  }
> 
>  static void apple_dart_hw_clear_ttbr(struct apple_dart_stream_map *stream_map,
>  				     u8 idx)
>  {
> +	struct apple_dart *dart = stream_map->dart;
>  	int sid;
> 
> -	for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
> -		writel(0, stream_map->dart->regs + DART_TTBR(sid, idx));
> +	for_each_set_bit(sid, stream_map->sidmap, dart->num_streams)
> +		writel(0, dart->regs + DART_TTBR(sid, idx));
>  }
> 
>  static void
> @@ -270,7 +277,7 @@ apple_dart_hw_stream_command(struct 
> apple_dart_stream_map *stream_map,
> 
>  	spin_lock_irqsave(&stream_map->dart->lock, flags);
> 
> -	writel(stream_map->sidmap, stream_map->dart->regs + DART_STREAM_SELECT);
> +	writel(stream_map->sidmap[0], stream_map->dart->regs + DART_STREAM_SELECT);
>  	writel(command, stream_map->dart->regs + DART_STREAM_COMMAND);
> 
>  	ret = readl_poll_timeout_atomic(
> @@ -283,7 +290,7 @@ apple_dart_hw_stream_command(struct 
> apple_dart_stream_map *stream_map,
>  	if (ret) {
>  		dev_err(stream_map->dart->dev,
>  			"busy bit did not clear after command %x for streams %lx\n",
> -			command, stream_map->sidmap);
> +			command, stream_map->sidmap[0]);
>  		return ret;
>  	}
> 
> @@ -301,6 +308,7 @@ static int apple_dart_hw_reset(struct apple_dart *dart)
>  {
>  	u32 config;
>  	struct apple_dart_stream_map stream_map;
> +	int i;
> 
>  	config = readl(dart->regs + DART_CONFIG);
>  	if (config & DART_CONFIG_LOCK) {
> @@ -310,12 +318,14 @@ static int apple_dart_hw_reset(struct apple_dart *dart)
>  	}
> 
>  	stream_map.dart = dart;
> -	stream_map.sidmap = DART_STREAM_ALL;
> +	bitmap_zero(stream_map.sidmap, DART_MAX_STREAMS);
> +	bitmap_set(stream_map.sidmap, 0, dart->num_streams);
>  	apple_dart_hw_disable_dma(&stream_map);
>  	apple_dart_hw_clear_all_ttbrs(&stream_map);
> 
>  	/* enable all streams globally since TCR is used to control isolation */
> -	writel(DART_STREAM_ALL, dart->regs + DART_STREAMS_ENABLE);
> +	for (i = 0; i < BITS_TO_U32(dart->num_streams); i++)
> +		writel(U32_MAX, dart->regs + DART_STREAMS_ENABLE);

This seems weird: the loop index `i` is never used in the register address,
so this code writes U32_MAX to the same DART_STREAMS_ENABLE register again
and again.


> 
>  	/* clear any pending errors before the interrupt is unmasked */
>  	writel(readl(dart->regs + DART_ERROR), dart->regs + DART_ERROR);
> @@ -325,13 +335,16 @@ static int apple_dart_hw_reset(struct apple_dart *dart)
> 
>  static void apple_dart_domain_flush_tlb(struct apple_dart_domain *domain)
>  {
> -	int i;
> +	int i, j;
>  	struct apple_dart_atomic_stream_map *domain_stream_map;
>  	struct apple_dart_stream_map stream_map;
> 
>  	for_each_stream_map(i, domain, domain_stream_map) {
>  		stream_map.dart = domain_stream_map->dart;
> -		stream_map.sidmap = atomic64_read(&domain_stream_map->sidmap);
> +
> +		for (j = 0; j < BITS_TO_LONGS(stream_map.dart->num_streams); j++)
> +			stream_map.sidmap[j] = 
> atomic_long_read(&domain_stream_map->sidmap[j]);
> +
>  		apple_dart_hw_invalidate_tlb(&stream_map);
>  	}
>  }
> @@ -416,7 +429,7 @@ static int apple_dart_finalize_domain(struct 
> iommu_domain *domain,
>  	struct apple_dart *dart = cfg->stream_maps[0].dart;
>  	struct io_pgtable_cfg pgtbl_cfg;
>  	int ret = 0;
> -	int i;
> +	int i, j;
> 
>  	mutex_lock(&dart_domain->init_lock);
> 
> @@ -425,8 +438,9 @@ static int apple_dart_finalize_domain(struct 
> iommu_domain *domain,
> 
>  	for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
>  		dart_domain->stream_maps[i].dart = cfg->stream_maps[i].dart;
> -		atomic64_set(&dart_domain->stream_maps[i].sidmap,
> -			     cfg->stream_maps[i].sidmap);
> +		for (j = 0; j < BITS_TO_LONGS(dart->num_streams); j++)
> +			atomic_long_set(&dart_domain->stream_maps[i].sidmap[j],
> +					cfg->stream_maps[i].sidmap[j]);
>  	}
> 
>  	pgtbl_cfg = (struct io_pgtable_cfg){
> @@ -461,7 +475,7 @@ apple_dart_mod_streams(struct 
> apple_dart_atomic_stream_map *domain_maps,
>  		       struct apple_dart_stream_map *master_maps,
>  		       bool add_streams)
>  {
> -	int i;
> +	int i, j;
> 
>  	for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
>  		if (domain_maps[i].dart != master_maps[i].dart)
> @@ -471,12 +485,14 @@ apple_dart_mod_streams(struct 
> apple_dart_atomic_stream_map *domain_maps,
>  	for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
>  		if (!domain_maps[i].dart)
>  			break;
> -		if (add_streams)
> -			atomic64_or(master_maps[i].sidmap,
> -				    &domain_maps[i].sidmap);
> -		else
> -			atomic64_and(~master_maps[i].sidmap,
> -				     &domain_maps[i].sidmap);
> +		for (j = 0; j < BITS_TO_LONGS(domain_maps[i].dart->num_streams); 
> j++) {
> +			if (add_streams)
> +				atomic_long_or(master_maps[i].sidmap[j],
> +					       &domain_maps[i].sidmap[j]);
> +			else
> +				atomic_long_and(~master_maps[i].sidmap[j],
> +						&domain_maps[i].sidmap[j]);
> +		}
>  	}
> 
>  	return 0;
> @@ -640,14 +656,14 @@ static int apple_dart_of_xlate(struct device 
> *dev, struct of_phandle_args *args)
> 
>  	for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
>  		if (cfg->stream_maps[i].dart == dart) {
> -			cfg->stream_maps[i].sidmap |= 1 << sid;
> +			set_bit(sid, cfg->stream_maps[i].sidmap);
>  			return 0;
>  		}
>  	}
>  	for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
>  		if (!cfg->stream_maps[i].dart) {
>  			cfg->stream_maps[i].dart = dart;
> -			cfg->stream_maps[i].sidmap = 1 << sid;
> +			set_bit(sid, cfg->stream_maps[i].sidmap);
>  			return 0;
>  		}
>  	}
> @@ -666,7 +682,7 @@ static void apple_dart_release_group(void *iommu_data)
>  	mutex_lock(&apple_dart_groups_lock);
> 
>  	for_each_stream_map(i, group_master_cfg, stream_map)
> -		for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
> +		for_each_set_bit(sid, stream_map->sidmap, stream_map->dart->num_streams)
>  			stream_map->dart->sid2group[sid] = NULL;
> 
>  	kfree(iommu_data);
> @@ -685,7 +701,7 @@ static struct iommu_group 
> *apple_dart_device_group(struct device *dev)
>  	mutex_lock(&apple_dart_groups_lock);
> 
>  	for_each_stream_map(i, cfg, stream_map) {
> -		for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS) {
> +		for_each_set_bit(sid, stream_map->sidmap, stream_map->dart->num_streams) {
>  			struct iommu_group *stream_group =
>  				stream_map->dart->sid2group[sid];
> 
> @@ -724,7 +740,7 @@ static struct iommu_group 
> *apple_dart_device_group(struct device *dev)
>  		apple_dart_release_group);
> 
>  	for_each_stream_map(i, cfg, stream_map)
> -		for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
> +		for_each_set_bit(sid, stream_map->sidmap, stream_map->dart->num_streams)
>  			stream_map->dart->sid2group[sid] = group;
> 
>  	res = group;
> @@ -869,16 +885,26 @@ static int apple_dart_probe(struct platform_device *pdev)
>  	if (ret)
>  		return ret;
> 
> -	ret = apple_dart_hw_reset(dart);
> -	if (ret)
> -		goto err_clk_disable;
> -
>  	dart_params[0] = readl(dart->regs + DART_PARAMS1);
>  	dart_params[1] = readl(dart->regs + DART_PARAMS2);
>  	dart->pgsize = 1 << FIELD_GET(DART_PARAMS_PAGE_SHIFT, dart_params[0]);
>  	dart->supports_bypass = dart_params[1] & DART_PARAMS_BYPASS_SUPPORT;
> +
> +	dart->num_streams = dart->hw->max_sid_count;
> +
> +	if (dart->num_streams > DART_MAX_STREAMS) {
> +		dev_err(&pdev->dev, "Too many streams (%d > %d)\n",
> +			dart->num_streams, DART_MAX_STREAMS);
> +		ret = -EINVAL;
> +		goto err_clk_disable;
> +	}
> +
>  	dart->force_bypass = dart->pgsize > PAGE_SIZE;
> 
> +	ret = apple_dart_hw_reset(dart);
> +	if (ret)
> +		goto err_clk_disable;
> +
>  	ret = request_irq(dart->irq, apple_dart_irq, IRQF_SHARED,
>  			  "apple-dart fault handler", dart);
>  	if (ret)
> @@ -897,8 +923,8 @@ static int apple_dart_probe(struct platform_device *pdev)
> 
>  	dev_info(
>  		&pdev->dev,
> -		"DART [pagesize %x, bypass support: %d, bypass forced: %d] 
> initialized\n",
> -		dart->pgsize, dart->supports_bypass, dart->force_bypass);
> +		"DART [pagesize %x, %d streams, bypass support: %d, bypass forced: 
> %d] initialized\n",
> +		dart->pgsize, dart->num_streams, dart->supports_bypass, 
> dart->force_bypass);
>  	return 0;
> 
>  err_sysfs_remove:
> @@ -929,10 +955,12 @@ static int apple_dart_remove(struct platform_device *pdev)
>  static const struct apple_dart_hw apple_dart_hw_t8103 = {
>  	.oas = 36,
>  	.fmt = APPLE_DART,
> +	.max_sid_count = 16,
>  };
>  static const struct apple_dart_hw apple_dart_hw_t6000 = {
>  	.oas = 42,
>  	.fmt = APPLE_DART2,
> +	.max_sid_count = 16,
>  };
> 
>  static __maybe_unused int apple_dart_suspend(struct device *dev)
> @@ -940,7 +968,7 @@ static __maybe_unused int apple_dart_suspend(struct 
> device *dev)
>  	struct apple_dart *dart = dev_get_drvdata(dev);
>  	unsigned int sid, idx;
> 
> -	for (sid = 0; sid < DART_MAX_STREAMS; sid++) {
> +	for (sid = 0; sid < dart->num_streams; sid++) {
>  		dart->save_tcr[sid] = readl_relaxed(dart->regs + DART_TCR(sid));
>  		for (idx = 0; idx < DART_MAX_TTBR; idx++)
>  			dart->save_ttbr[sid][idx] =
> @@ -962,7 +990,7 @@ static __maybe_unused int apple_dart_resume(struct 
> device *dev)
>  		return ret;
>  	}
> 
> -	for (sid = 0; sid < DART_MAX_STREAMS; sid++) {
> +	for (sid = 0; sid < dart->num_streams; sid++) {
>  		for (idx = 0; idx < DART_MAX_TTBR; idx++)
>  			writel(dart->save_ttbr[sid][idx],
>  			       dart->regs + DART_TTBR(sid, idx));
> -- 
> 2.35.1
  
Hector Martin Jan. 5, 2023, 4:43 a.m. UTC | #2
On 2023/01/04 22:37, Sven Peter wrote:
>>  #include "dma-iommu.h"
>>
>> -#define DART_MAX_STREAMS 16
>> +#define DART_MAX_STREAMS 256
> 
> Feels a bit wasteful to allocate 256-wide sid2group and save_{tcr,ttbr}
> arrays even for the M1 where 16 are enough. But then again, that's still <100 KiB
> for all DARTs combined and these machines have >8 GiB of RAM so it probably won't
> make a difference.

Yeah, I don't think this is worth the extra fumbling around with dynamic
allocation.

>>  	/* enable all streams globally since TCR is used to control isolation */
>> -	writel(DART_STREAM_ALL, dart->regs + DART_STREAMS_ENABLE);
>> +	for (i = 0; i < BITS_TO_U32(dart->num_streams); i++)
>> +		writel(U32_MAX, dart->regs + DART_STREAMS_ENABLE);
> 
> This seems weird: this code writes U32_MAX to the same register
> again and again. 

Whoops, that was supposed to have a `+ 4 * i` in there. Fixed for v2.

- Hector
  
Sven Peter Jan. 5, 2023, 9:47 a.m. UTC | #3
Hi,

On Thu, Jan 5, 2023, at 05:43, Hector Martin wrote:
> On 2023/01/04 22:37, Sven Peter wrote:
>>>  #include "dma-iommu.h"
>>>
>>> -#define DART_MAX_STREAMS 16
>>> +#define DART_MAX_STREAMS 256
>> 
>> Feels a bit wasteful to allocate 256-wide sid2group and save_{tcr,ttbr}
>> arrays even for the M1 where 16 are enough. But then again, that's still <100 KiB
>> for all DARTs combined and these machines have >8 GiB of RAM so it probably won't
>> make a difference.
>
> Yeah, I don't think this is worth the extra fumbling around with dynamic
> allocation.
>
>>>  	/* enable all streams globally since TCR is used to control isolation */
>>> -	writel(DART_STREAM_ALL, dart->regs + DART_STREAMS_ENABLE);
>>> +	for (i = 0; i < BITS_TO_U32(dart->num_streams); i++)
>>> +		writel(U32_MAX, dart->regs + DART_STREAMS_ENABLE);
>> 
>> This seems weird: this code writes U32_MAX to the same register
>> again and again. 
>
> Whoops, that was supposed to have a `+ 4 * i` in there. Fixed for v2.

Great! Feel free to also add

Reviewed-by: Sven Peter <sven@svenpeter.dev>

then.


Best,

Sven
  

Patch

diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c
index 2458416122f8..48743bcd5b9d 100644
--- a/drivers/iommu/apple-dart.c
+++ b/drivers/iommu/apple-dart.c
@@ -34,11 +34,10 @@ 
 
 #include "dma-iommu.h"
 
-#define DART_MAX_STREAMS 16
+#define DART_MAX_STREAMS 256
 #define DART_MAX_TTBR 4
 #define MAX_DARTS_PER_DEVICE 2
 
-#define DART_STREAM_ALL 0xffff
 
 #define DART_PARAMS1 0x00
 #define DART_PARAMS_PAGE_SHIFT GENMASK(27, 24)
@@ -85,6 +84,8 @@ 
 struct apple_dart_hw {
 	u32 oas;
 	enum io_pgtable_fmt fmt;
+
+	int max_sid_count;
 };
 
 /*
@@ -116,6 +117,7 @@  struct apple_dart {
 	spinlock_t lock;
 
 	u32 pgsize;
+	u32 num_streams;
 	u32 supports_bypass : 1;
 	u32 force_bypass : 1;
 
@@ -143,11 +145,11 @@  struct apple_dart {
  */
 struct apple_dart_stream_map {
 	struct apple_dart *dart;
-	unsigned long sidmap;
+	DECLARE_BITMAP(sidmap, DART_MAX_STREAMS);
 };
 struct apple_dart_atomic_stream_map {
 	struct apple_dart *dart;
-	atomic64_t sidmap;
+	atomic_long_t sidmap[BITS_TO_LONGS(DART_MAX_STREAMS)];
 };
 
 /*
@@ -205,50 +207,55 @@  static struct apple_dart_domain *to_dart_domain(struct iommu_domain *dom)
 static void
 apple_dart_hw_enable_translation(struct apple_dart_stream_map *stream_map)
 {
+	struct apple_dart *dart = stream_map->dart;
 	int sid;
 
-	for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
+	for_each_set_bit(sid, stream_map->sidmap, dart->num_streams)
 		writel(DART_TCR_TRANSLATE_ENABLE,
-		       stream_map->dart->regs + DART_TCR(sid));
+		       dart->regs + DART_TCR(sid));
 }
 
 static void apple_dart_hw_disable_dma(struct apple_dart_stream_map *stream_map)
 {
+	struct apple_dart *dart = stream_map->dart;
 	int sid;
 
-	for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
-		writel(0, stream_map->dart->regs + DART_TCR(sid));
+	for_each_set_bit(sid, stream_map->sidmap, dart->num_streams)
+		writel(0, dart->regs + DART_TCR(sid));
 }
 
 static void
 apple_dart_hw_enable_bypass(struct apple_dart_stream_map *stream_map)
 {
+	struct apple_dart *dart = stream_map->dart;
 	int sid;
 
 	WARN_ON(!stream_map->dart->supports_bypass);
-	for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
+	for_each_set_bit(sid, stream_map->sidmap, dart->num_streams)
 		writel(DART_TCR_BYPASS0_ENABLE | DART_TCR_BYPASS1_ENABLE,
-		       stream_map->dart->regs + DART_TCR(sid));
+		       dart->regs + DART_TCR(sid));
 }
 
 static void apple_dart_hw_set_ttbr(struct apple_dart_stream_map *stream_map,
 				   u8 idx, phys_addr_t paddr)
 {
+	struct apple_dart *dart = stream_map->dart;
 	int sid;
 
 	WARN_ON(paddr & ((1 << DART_TTBR_SHIFT) - 1));
-	for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
+	for_each_set_bit(sid, stream_map->sidmap, dart->num_streams)
 		writel(DART_TTBR_VALID | (paddr >> DART_TTBR_SHIFT),
-		       stream_map->dart->regs + DART_TTBR(sid, idx));
+		       dart->regs + DART_TTBR(sid, idx));
 }
 
 static void apple_dart_hw_clear_ttbr(struct apple_dart_stream_map *stream_map,
 				     u8 idx)
 {
+	struct apple_dart *dart = stream_map->dart;
 	int sid;
 
-	for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
-		writel(0, stream_map->dart->regs + DART_TTBR(sid, idx));
+	for_each_set_bit(sid, stream_map->sidmap, dart->num_streams)
+		writel(0, dart->regs + DART_TTBR(sid, idx));
 }
 
 static void
@@ -270,7 +277,7 @@  apple_dart_hw_stream_command(struct apple_dart_stream_map *stream_map,
 
 	spin_lock_irqsave(&stream_map->dart->lock, flags);
 
-	writel(stream_map->sidmap, stream_map->dart->regs + DART_STREAM_SELECT);
+	writel(stream_map->sidmap[0], stream_map->dart->regs + DART_STREAM_SELECT);
 	writel(command, stream_map->dart->regs + DART_STREAM_COMMAND);
 
 	ret = readl_poll_timeout_atomic(
@@ -283,7 +290,7 @@  apple_dart_hw_stream_command(struct apple_dart_stream_map *stream_map,
 	if (ret) {
 		dev_err(stream_map->dart->dev,
 			"busy bit did not clear after command %x for streams %lx\n",
-			command, stream_map->sidmap);
+			command, stream_map->sidmap[0]);
 		return ret;
 	}
 
@@ -301,6 +308,7 @@  static int apple_dart_hw_reset(struct apple_dart *dart)
 {
 	u32 config;
 	struct apple_dart_stream_map stream_map;
+	int i;
 
 	config = readl(dart->regs + DART_CONFIG);
 	if (config & DART_CONFIG_LOCK) {
@@ -310,12 +318,14 @@  static int apple_dart_hw_reset(struct apple_dart *dart)
 	}
 
 	stream_map.dart = dart;
-	stream_map.sidmap = DART_STREAM_ALL;
+	bitmap_zero(stream_map.sidmap, DART_MAX_STREAMS);
+	bitmap_set(stream_map.sidmap, 0, dart->num_streams);
 	apple_dart_hw_disable_dma(&stream_map);
 	apple_dart_hw_clear_all_ttbrs(&stream_map);
 
 	/* enable all streams globally since TCR is used to control isolation */
-	writel(DART_STREAM_ALL, dart->regs + DART_STREAMS_ENABLE);
+	for (i = 0; i < BITS_TO_U32(dart->num_streams); i++)
+		writel(U32_MAX, dart->regs + DART_STREAMS_ENABLE);
 
 	/* clear any pending errors before the interrupt is unmasked */
 	writel(readl(dart->regs + DART_ERROR), dart->regs + DART_ERROR);
@@ -325,13 +335,16 @@  static int apple_dart_hw_reset(struct apple_dart *dart)
 
 static void apple_dart_domain_flush_tlb(struct apple_dart_domain *domain)
 {
-	int i;
+	int i, j;
 	struct apple_dart_atomic_stream_map *domain_stream_map;
 	struct apple_dart_stream_map stream_map;
 
 	for_each_stream_map(i, domain, domain_stream_map) {
 		stream_map.dart = domain_stream_map->dart;
-		stream_map.sidmap = atomic64_read(&domain_stream_map->sidmap);
+
+		for (j = 0; j < BITS_TO_LONGS(stream_map.dart->num_streams); j++)
+			stream_map.sidmap[j] = atomic_long_read(&domain_stream_map->sidmap[j]);
+
 		apple_dart_hw_invalidate_tlb(&stream_map);
 	}
 }
@@ -416,7 +429,7 @@  static int apple_dart_finalize_domain(struct iommu_domain *domain,
 	struct apple_dart *dart = cfg->stream_maps[0].dart;
 	struct io_pgtable_cfg pgtbl_cfg;
 	int ret = 0;
-	int i;
+	int i, j;
 
 	mutex_lock(&dart_domain->init_lock);
 
@@ -425,8 +438,9 @@  static int apple_dart_finalize_domain(struct iommu_domain *domain,
 
 	for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
 		dart_domain->stream_maps[i].dart = cfg->stream_maps[i].dart;
-		atomic64_set(&dart_domain->stream_maps[i].sidmap,
-			     cfg->stream_maps[i].sidmap);
+		for (j = 0; j < BITS_TO_LONGS(dart->num_streams); j++)
+			atomic_long_set(&dart_domain->stream_maps[i].sidmap[j],
+					cfg->stream_maps[i].sidmap[j]);
 	}
 
 	pgtbl_cfg = (struct io_pgtable_cfg){
@@ -461,7 +475,7 @@  apple_dart_mod_streams(struct apple_dart_atomic_stream_map *domain_maps,
 		       struct apple_dart_stream_map *master_maps,
 		       bool add_streams)
 {
-	int i;
+	int i, j;
 
 	for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
 		if (domain_maps[i].dart != master_maps[i].dart)
@@ -471,12 +485,14 @@  apple_dart_mod_streams(struct apple_dart_atomic_stream_map *domain_maps,
 	for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
 		if (!domain_maps[i].dart)
 			break;
-		if (add_streams)
-			atomic64_or(master_maps[i].sidmap,
-				    &domain_maps[i].sidmap);
-		else
-			atomic64_and(~master_maps[i].sidmap,
-				     &domain_maps[i].sidmap);
+		for (j = 0; j < BITS_TO_LONGS(domain_maps[i].dart->num_streams); j++) {
+			if (add_streams)
+				atomic_long_or(master_maps[i].sidmap[j],
+					       &domain_maps[i].sidmap[j]);
+			else
+				atomic_long_and(~master_maps[i].sidmap[j],
+						&domain_maps[i].sidmap[j]);
+		}
 	}
 
 	return 0;
@@ -640,14 +656,14 @@  static int apple_dart_of_xlate(struct device *dev, struct of_phandle_args *args)
 
 	for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
 		if (cfg->stream_maps[i].dart == dart) {
-			cfg->stream_maps[i].sidmap |= 1 << sid;
+			set_bit(sid, cfg->stream_maps[i].sidmap);
 			return 0;
 		}
 	}
 	for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
 		if (!cfg->stream_maps[i].dart) {
 			cfg->stream_maps[i].dart = dart;
-			cfg->stream_maps[i].sidmap = 1 << sid;
+			set_bit(sid, cfg->stream_maps[i].sidmap);
 			return 0;
 		}
 	}
@@ -666,7 +682,7 @@  static void apple_dart_release_group(void *iommu_data)
 	mutex_lock(&apple_dart_groups_lock);
 
 	for_each_stream_map(i, group_master_cfg, stream_map)
-		for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
+		for_each_set_bit(sid, stream_map->sidmap, stream_map->dart->num_streams)
 			stream_map->dart->sid2group[sid] = NULL;
 
 	kfree(iommu_data);
@@ -685,7 +701,7 @@  static struct iommu_group *apple_dart_device_group(struct device *dev)
 	mutex_lock(&apple_dart_groups_lock);
 
 	for_each_stream_map(i, cfg, stream_map) {
-		for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS) {
+		for_each_set_bit(sid, stream_map->sidmap, stream_map->dart->num_streams) {
 			struct iommu_group *stream_group =
 				stream_map->dart->sid2group[sid];
 
@@ -724,7 +740,7 @@  static struct iommu_group *apple_dart_device_group(struct device *dev)
 		apple_dart_release_group);
 
 	for_each_stream_map(i, cfg, stream_map)
-		for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
+		for_each_set_bit(sid, stream_map->sidmap, stream_map->dart->num_streams)
 			stream_map->dart->sid2group[sid] = group;
 
 	res = group;
@@ -869,16 +885,26 @@  static int apple_dart_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	ret = apple_dart_hw_reset(dart);
-	if (ret)
-		goto err_clk_disable;
-
 	dart_params[0] = readl(dart->regs + DART_PARAMS1);
 	dart_params[1] = readl(dart->regs + DART_PARAMS2);
 	dart->pgsize = 1 << FIELD_GET(DART_PARAMS_PAGE_SHIFT, dart_params[0]);
 	dart->supports_bypass = dart_params[1] & DART_PARAMS_BYPASS_SUPPORT;
+
+	dart->num_streams = dart->hw->max_sid_count;
+
+	if (dart->num_streams > DART_MAX_STREAMS) {
+		dev_err(&pdev->dev, "Too many streams (%d > %d)\n",
+			dart->num_streams, DART_MAX_STREAMS);
+		ret = -EINVAL;
+		goto err_clk_disable;
+	}
+
 	dart->force_bypass = dart->pgsize > PAGE_SIZE;
 
+	ret = apple_dart_hw_reset(dart);
+	if (ret)
+		goto err_clk_disable;
+
 	ret = request_irq(dart->irq, apple_dart_irq, IRQF_SHARED,
 			  "apple-dart fault handler", dart);
 	if (ret)
@@ -897,8 +923,8 @@  static int apple_dart_probe(struct platform_device *pdev)
 
 	dev_info(
 		&pdev->dev,
-		"DART [pagesize %x, bypass support: %d, bypass forced: %d] initialized\n",
-		dart->pgsize, dart->supports_bypass, dart->force_bypass);
+		"DART [pagesize %x, %d streams, bypass support: %d, bypass forced: %d] initialized\n",
+		dart->pgsize, dart->num_streams, dart->supports_bypass, dart->force_bypass);
 	return 0;
 
 err_sysfs_remove:
@@ -929,10 +955,12 @@  static int apple_dart_remove(struct platform_device *pdev)
 static const struct apple_dart_hw apple_dart_hw_t8103 = {
 	.oas = 36,
 	.fmt = APPLE_DART,
+	.max_sid_count = 16,
 };
 static const struct apple_dart_hw apple_dart_hw_t6000 = {
 	.oas = 42,
 	.fmt = APPLE_DART2,
+	.max_sid_count = 16,
 };
 
 static __maybe_unused int apple_dart_suspend(struct device *dev)
@@ -940,7 +968,7 @@  static __maybe_unused int apple_dart_suspend(struct device *dev)
 	struct apple_dart *dart = dev_get_drvdata(dev);
 	unsigned int sid, idx;
 
-	for (sid = 0; sid < DART_MAX_STREAMS; sid++) {
+	for (sid = 0; sid < dart->num_streams; sid++) {
 		dart->save_tcr[sid] = readl_relaxed(dart->regs + DART_TCR(sid));
 		for (idx = 0; idx < DART_MAX_TTBR; idx++)
 			dart->save_ttbr[sid][idx] =
@@ -962,7 +990,7 @@  static __maybe_unused int apple_dart_resume(struct device *dev)
 		return ret;
 	}
 
-	for (sid = 0; sid < DART_MAX_STREAMS; sid++) {
+	for (sid = 0; sid < dart->num_streams; sid++) {
 		for (idx = 0; idx < DART_MAX_TTBR; idx++)
 			writel(dart->save_ttbr[sid][idx],
 			       dart->regs + DART_TTBR(sid, idx));