[RFC,1/3] iommu/intel: Use page->refcount to count number of entries in IOMMU

Message ID 20231221031915.619337-2-pasha.tatashin@soleen.com
State New
Headers
Series iommu/intel: Free empty page tables on unmaps |

Commit Message

Pasha Tatashin Dec. 21, 2023, 3:19 a.m. UTC
  In order to be able to efficiently free empty page table levels, count the
number of entries in each page table my incremeanting and decremeanting
refcount every time a PTE is inserted or removed form the page table.

For this to work correctly, add two helper function:
dma_clear_pte and dma_set_pte where counting is performed,

Also, modify the code so every page table entry is always updated using the
two new functions.

Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
---
 drivers/iommu/intel/iommu.c | 40 +++++++++++++++++++++---------------
 drivers/iommu/intel/iommu.h | 41 +++++++++++++++++++++++++++++++------
 2 files changed, 58 insertions(+), 23 deletions(-)
  

Comments

Liu, Jingqi Dec. 25, 2023, 2:59 a.m. UTC | #1
On 12/21/2023 11:19 AM, Pasha Tatashin wrote:
> In order to be able to efficiently free empty page table levels, count the
> number of entries in each page table my incremeanting and decremeanting
s/incremeanting/incrementing
s/decremeanting/decrementing

> refcount every time a PTE is inserted or removed form the page table.
s/form/from

> 
> For this to work correctly, add two helper function:
> dma_clear_pte and dma_set_pte where counting is performed,
> 
> Also, modify the code so every page table entry is always updated using the
> two new functions.
> 
> Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
> ---
>   drivers/iommu/intel/iommu.c | 40 +++++++++++++++++++++---------------
>   drivers/iommu/intel/iommu.h | 41 +++++++++++++++++++++++++++++++------
>   2 files changed, 58 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index 897159dba47d..4688ef797161 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -949,7 +949,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
>   			if (domain->use_first_level)
>   				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
>   
> -			if (cmpxchg64(&pte->val, 0ULL, pteval))
> +			if (dma_set_pte(pte, pteval))
>   				/* Someone else set it while we were thinking; use theirs. */
>   				free_pgtable_page(tmp_page);
>   			else
> @@ -1021,7 +1021,8 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
>   			continue;
>   		}
>   		do {
> -			dma_clear_pte(pte);
> +			if (dma_pte_present(pte))
> +				dma_clear_pte(pte);
>   			start_pfn += lvl_to_nr_pages(large_page);
>   			pte++;
>   		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
> @@ -1062,7 +1063,8 @@ static void dma_pte_free_level(struct dmar_domain *domain, int level,
>   		 */
>   		if (level < retain_level && !(start_pfn > level_pfn ||
>   		      last_pfn < level_pfn + level_size(level) - 1)) {
> -			dma_clear_pte(pte);
> +			if (dma_pte_present(pte))
> +				dma_clear_pte(pte);
>   			domain_flush_cache(domain, pte, sizeof(*pte));
>   			free_pgtable_page(level_pte);
>   		}
> @@ -1093,12 +1095,13 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
>   	}
>   }
>   
> -/* When a page at a given level is being unlinked from its parent, we don't
> -   need to *modify* it at all. All we need to do is make a list of all the
> -   pages which can be freed just as soon as we've flushed the IOTLB and we
> -   know the hardware page-walk will no longer touch them.
> -   The 'pte' argument is the *parent* PTE, pointing to the page that is to
> -   be freed. */
> +/*
> + * A given page at a given level is being unlinked from its parent.
> + * We need to make a list of all the pages which can be freed just as soon as
> + * we've flushed the IOTLB and we know the hardware page-walk will no longer
> + * touch them. The 'pte' argument is the *parent* PTE, pointing to the page
> + * that is to be freed.
> + */
>   static void dma_pte_list_pagetables(struct dmar_domain *domain,
>   				    int level, struct dma_pte *pte,
>   				    struct list_head *freelist)
> @@ -1106,17 +1109,20 @@ static void dma_pte_list_pagetables(struct dmar_domain *domain,
>   	struct page *pg;
>   
>   	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
> -	list_add_tail(&pg->lru, freelist);
> -
> -	if (level == 1)
> -		return;
> -
>   	pte = page_address(pg);
> +
>   	do {
> -		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
> -			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
> +		if (dma_pte_present(pte)) {
> +			if (level > 1 && !dma_pte_superpage(pte)) {
> +				dma_pte_list_pagetables(domain, level - 1, pte,
> +							freelist);
> +			}
> +			dma_clear_pte(pte);
> +		}
>   		pte++;
>   	} while (!first_pte_in_page(pte));
> +
> +	list_add_tail(&pg->lru, freelist);
>   }
>   
How about calculating the page decrement when the pages in the freelist 
are really freed in iommu_free_pgtbl_pages() ?

Thanks,
Jingqi
  
Pasha Tatashin Dec. 28, 2023, 3:01 p.m. UTC | #2
On Sun, Dec 24, 2023 at 9:59 PM Liu, Jingqi <jingqi.liu@intel.com> wrote:
>
> On 12/21/2023 11:19 AM, Pasha Tatashin wrote:
> > In order to be able to efficiently free empty page table levels, count the
> > number of entries in each page table my incremeanting and decremeanting
> s/incremeanting/incrementing
> s/decremeanting/decrementing
>
> > refcount every time a PTE is inserted or removed form the page table.
> s/form/from
>
> >
> > For this to work correctly, add two helper function:
> > dma_clear_pte and dma_set_pte where counting is performed,
> >
> > Also, modify the code so every page table entry is always updated using the
> > two new functions.
> >
> > Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
> > ---
> >   drivers/iommu/intel/iommu.c | 40 +++++++++++++++++++++---------------
> >   drivers/iommu/intel/iommu.h | 41 +++++++++++++++++++++++++++++++------
> >   2 files changed, 58 insertions(+), 23 deletions(-)
> >
> > diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> > index 897159dba47d..4688ef797161 100644
> > --- a/drivers/iommu/intel/iommu.c
> > +++ b/drivers/iommu/intel/iommu.c
> > @@ -949,7 +949,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
> >                       if (domain->use_first_level)
> >                               pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
> >
> > -                     if (cmpxchg64(&pte->val, 0ULL, pteval))
> > +                     if (dma_set_pte(pte, pteval))
> >                               /* Someone else set it while we were thinking; use theirs. */
> >                               free_pgtable_page(tmp_page);
> >                       else
> > @@ -1021,7 +1021,8 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
> >                       continue;
> >               }
> >               do {
> > -                     dma_clear_pte(pte);
> > +                     if (dma_pte_present(pte))
> > +                             dma_clear_pte(pte);
> >                       start_pfn += lvl_to_nr_pages(large_page);
> >                       pte++;
> >               } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
> > @@ -1062,7 +1063,8 @@ static void dma_pte_free_level(struct dmar_domain *domain, int level,
> >                */
> >               if (level < retain_level && !(start_pfn > level_pfn ||
> >                     last_pfn < level_pfn + level_size(level) - 1)) {
> > -                     dma_clear_pte(pte);
> > +                     if (dma_pte_present(pte))
> > +                             dma_clear_pte(pte);
> >                       domain_flush_cache(domain, pte, sizeof(*pte));
> >                       free_pgtable_page(level_pte);
> >               }
> > @@ -1093,12 +1095,13 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
> >       }
> >   }
> >
> > -/* When a page at a given level is being unlinked from its parent, we don't
> > -   need to *modify* it at all. All we need to do is make a list of all the
> > -   pages which can be freed just as soon as we've flushed the IOTLB and we
> > -   know the hardware page-walk will no longer touch them.
> > -   The 'pte' argument is the *parent* PTE, pointing to the page that is to
> > -   be freed. */
> > +/*
> > + * A given page at a given level is being unlinked from its parent.
> > + * We need to make a list of all the pages which can be freed just as soon as
> > + * we've flushed the IOTLB and we know the hardware page-walk will no longer
> > + * touch them. The 'pte' argument is the *parent* PTE, pointing to the page
> > + * that is to be freed.
> > + */
> >   static void dma_pte_list_pagetables(struct dmar_domain *domain,
> >                                   int level, struct dma_pte *pte,
> >                                   struct list_head *freelist)
> > @@ -1106,17 +1109,20 @@ static void dma_pte_list_pagetables(struct dmar_domain *domain,
> >       struct page *pg;
> >
> >       pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
> > -     list_add_tail(&pg->lru, freelist);
> > -
> > -     if (level == 1)
> > -             return;
> > -
> >       pte = page_address(pg);
> > +
> >       do {
> > -             if (dma_pte_present(pte) && !dma_pte_superpage(pte))
> > -                     dma_pte_list_pagetables(domain, level - 1, pte, freelist);
> > +             if (dma_pte_present(pte)) {
> > +                     if (level > 1 && !dma_pte_superpage(pte)) {
> > +                             dma_pte_list_pagetables(domain, level - 1, pte,
> > +                                                     freelist);
> > +                     }
> > +                     dma_clear_pte(pte);
> > +             }
> >               pte++;
> >       } while (!first_pte_in_page(pte));
> > +
> > +     list_add_tail(&pg->lru, freelist);
> >   }
> >
> How about calculating the page decrement when the pages in the freelist
> are really freed in iommu_free_pgtbl_pages() ?


Hi Jingqi,

Thank you for looking at this.

My understanding is what you are suggesting is to count number of
entries and subtract them from refcount only once before adding page
to the freelist, is this correct?

We could do that, but having dma_clear_pte() for each entry is very
beneficial, as we could extend IOMMU page tables with other debug
technologies, such as page_table_check, if every single entry in the
page table is added and removed via dma_clear_pte() and dma_set_pte(),
since we are alreadying clearing pte, there is no reason not to change
the refcount at the same time.

Pasha
  

Patch

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 897159dba47d..4688ef797161 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -949,7 +949,7 @@  static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 			if (domain->use_first_level)
 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
 
-			if (cmpxchg64(&pte->val, 0ULL, pteval))
+			if (dma_set_pte(pte, pteval))
 				/* Someone else set it while we were thinking; use theirs. */
 				free_pgtable_page(tmp_page);
 			else
@@ -1021,7 +1021,8 @@  static void dma_pte_clear_range(struct dmar_domain *domain,
 			continue;
 		}
 		do {
-			dma_clear_pte(pte);
+			if (dma_pte_present(pte))
+				dma_clear_pte(pte);
 			start_pfn += lvl_to_nr_pages(large_page);
 			pte++;
 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
@@ -1062,7 +1063,8 @@  static void dma_pte_free_level(struct dmar_domain *domain, int level,
 		 */
 		if (level < retain_level && !(start_pfn > level_pfn ||
 		      last_pfn < level_pfn + level_size(level) - 1)) {
-			dma_clear_pte(pte);
+			if (dma_pte_present(pte))
+				dma_clear_pte(pte);
 			domain_flush_cache(domain, pte, sizeof(*pte));
 			free_pgtable_page(level_pte);
 		}
@@ -1093,12 +1095,13 @@  static void dma_pte_free_pagetable(struct dmar_domain *domain,
 	}
 }
 
-/* When a page at a given level is being unlinked from its parent, we don't
-   need to *modify* it at all. All we need to do is make a list of all the
-   pages which can be freed just as soon as we've flushed the IOTLB and we
-   know the hardware page-walk will no longer touch them.
-   The 'pte' argument is the *parent* PTE, pointing to the page that is to
-   be freed. */
+/*
+ * A given page at a given level is being unlinked from its parent.
+ * We need to make a list of all the pages which can be freed just as soon as
+ * we've flushed the IOTLB and we know the hardware page-walk will no longer
+ * touch them. The 'pte' argument is the *parent* PTE, pointing to the page
+ * that is to be freed.
+ */
 static void dma_pte_list_pagetables(struct dmar_domain *domain,
 				    int level, struct dma_pte *pte,
 				    struct list_head *freelist)
@@ -1106,17 +1109,20 @@  static void dma_pte_list_pagetables(struct dmar_domain *domain,
 	struct page *pg;
 
 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
-	list_add_tail(&pg->lru, freelist);
-
-	if (level == 1)
-		return;
-
 	pte = page_address(pg);
+
 	do {
-		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
-			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
+		if (dma_pte_present(pte)) {
+			if (level > 1 && !dma_pte_superpage(pte)) {
+				dma_pte_list_pagetables(domain, level - 1, pte,
+							freelist);
+			}
+			dma_clear_pte(pte);
+		}
 		pte++;
 	} while (!first_pte_in_page(pte));
+
+	list_add_tail(&pg->lru, freelist);
 }
 
 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
@@ -2244,7 +2250,7 @@  __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 		/* We don't need lock here, nobody else
 		 * touches the iova range
 		 */
-		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
+		tmp = dma_set_pte(pte, pteval);
 		if (tmp) {
 			static int dumps = 5;
 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index ce030c5b5772..f1ea508f45bd 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -802,11 +802,6 @@  struct dma_pte {
 	u64 val;
 };
 
-static inline void dma_clear_pte(struct dma_pte *pte)
-{
-	pte->val = 0;
-}
-
 static inline u64 dma_pte_addr(struct dma_pte *pte)
 {
 #ifdef CONFIG_64BIT
@@ -818,9 +813,43 @@  static inline u64 dma_pte_addr(struct dma_pte *pte)
 #endif
 }
 
+#define DMA_PTEVAL_PRESENT(pteval) (((pteval) & 3) != 0)
 static inline bool dma_pte_present(struct dma_pte *pte)
 {
-	return (pte->val & 3) != 0;
+	return DMA_PTEVAL_PRESENT(pte->val);
+}
+
+static inline void dma_clear_pte(struct dma_pte *pte)
+{
+	u64 old_pteval;
+
+	old_pteval = xchg(&pte->val, 0ULL);
+	if (DMA_PTEVAL_PRESENT(old_pteval)) {
+		struct page *pg = virt_to_page(pte);
+		int rc = page_ref_dec_return(pg);
+
+		WARN_ON_ONCE(rc > 512 || rc < 1);
+	} else {
+		/* Ensure that we cleared a valid entry from the page table */
+		WARN_ON(1);
+	}
+}
+
+static inline u64 dma_set_pte(struct dma_pte *pte, u64 pteval)
+{
+	u64 old_pteval;
+
+	/* Ensure we about to set a valid entry to the page table */
+	WARN_ON(!DMA_PTEVAL_PRESENT(pteval));
+	old_pteval = cmpxchg64(&pte->val, 0ULL, pteval);
+	if (old_pteval == 0) {
+		struct page *pg = virt_to_page(pte);
+		int rc = page_ref_inc_return(pg);
+
+		WARN_ON_ONCE(rc > 513 || rc < 2);
+	}
+
+	return old_pteval;
 }
 
 static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,