@@ -40,6 +40,8 @@ struct vmemmap_remap_walk {
/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0)
+/* Skip the TLB flush when we remap the PTE */
+#define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1)
unsigned long flags;
};
@@ -214,7 +216,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
return ret;
} while (pgd++, addr = next, addr != end);
- if (walk->remap_pte)
+ if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
flush_tlb_kernel_range(start, end);
return 0;
@@ -355,19 +357,21 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end,
* @reuse: reuse address.
* @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers
* responsibility to free pages.
+ * @flags: modifications to vmemmap_remap_walk flags
*
* Return: %0 on success, negative error code otherwise.
*/
static int vmemmap_remap_free(unsigned long start, unsigned long end,
unsigned long reuse,
- struct list_head *vmemmap_pages)
+ struct list_head *vmemmap_pages,
+ unsigned long flags)
{
int ret;
struct vmemmap_remap_walk walk = {
.remap_pte = vmemmap_remap_pte,
.reuse_addr = reuse,
.vmemmap_pages = vmemmap_pages,
- .flags = 0,
+ .flags = flags,
};
int nid = page_to_nid((struct page *)reuse);
gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
@@ -629,7 +633,8 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h
static int __hugetlb_vmemmap_optimize(const struct hstate *h,
struct page *head,
- struct list_head *vmemmap_pages)
+ struct list_head *vmemmap_pages,
+ unsigned long flags)
{
int ret = 0;
unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
@@ -640,6 +645,18 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h,
return ret;
static_branch_inc(&hugetlb_optimize_vmemmap_key);
+ /*
+ * Very Subtle
+ * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
+ * immediately after remapping. As a result, subsequent accesses
+ * and modifications to struct pages associated with the hugetlb
+ * page could be to the OLD struct pages. Set the vmemmap optimized
+ * flag here so that it is copied to the new head page. This keeps
+ * the old and new struct pages in sync.
+ * If there is an error during optimization, we will immediately FLUSH
+ * the TLB and clear the flag below.
+ */
+ SetHPageVmemmapOptimized(head);
vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
vmemmap_reuse = vmemmap_start;
@@ -651,11 +668,12 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h,
* mapping the range to vmemmap_pages list so that they can be freed by
* the caller.
*/
- ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages);
- if (ret)
+ ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
+ vmemmap_pages, flags);
+ if (ret) {
static_branch_dec(&hugetlb_optimize_vmemmap_key);
- else
- SetHPageVmemmapOptimized(head);
+ ClearHPageVmemmapOptimized(head);
+ }
return ret;
}
@@ -674,7 +692,7 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
{
LIST_HEAD(vmemmap_pages);
- __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages);
+ __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages, 0);
free_vmemmap_page_list(&vmemmap_pages);
}
@@ -719,19 +737,28 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
list_for_each_entry(folio, folio_list, lru) {
int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
- &vmemmap_pages);
+ &vmemmap_pages,
+ VMEMMAP_REMAP_NO_TLB_FLUSH);
/*
* Pages to be freed may have been accumulated. If we
* encounter an ENOMEM, free what we have and try again.
+ * This can occur in the case that both spliting fails
+ * halfway and head page allocation also failed. In this
+ * case __hugetlb_vmemmap_optimize() would free memory
+ * allowing more vmemmap remaps to occur.
*/
if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
+ flush_tlb_all();
free_vmemmap_page_list(&vmemmap_pages);
INIT_LIST_HEAD(&vmemmap_pages);
- __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages);
+ __hugetlb_vmemmap_optimize(h, &folio->page,
+ &vmemmap_pages,
+ VMEMMAP_REMAP_NO_TLB_FLUSH);
}
}
+ flush_tlb_all();
free_vmemmap_page_list(&vmemmap_pages);
}