[09/46] mm: add MADV_SPLIT to enable HugeTLB HGM
Commit Message
Issuing ioctl(MADV_SPLIT) on a HugeTLB address range will enable
HugeTLB HGM. MADV_SPLIT was chosen for the name so that this API can be
applied to non-HugeTLB memory in the future, if such an application is
to arise.
MADV_SPLIT provides several API changes for some syscalls on HugeTLB
address ranges:
1. UFFDIO_CONTINUE is allowed for MAP_SHARED VMAs at PAGE_SIZE
alignment.
2. read()ing a page fault event from a userfaultfd will yield a
PAGE_SIZE-rounded address, instead of a huge-page-size-rounded
address (unless UFFD_FEATURE_EXACT_ADDRESS is used).
There is no way to disable the API changes that come with issuing
MADV_SPLIT. MADV_COLLAPSE can be used to collapse high-granularity page
table mappings that come from the extended functionality that comes with
using MADV_SPLIT.
For post-copy live migration, the expected use-case is:
1. mmap(MAP_SHARED, some_fd) primary mapping
2. mmap(MAP_SHARED, some_fd) alias mapping
3. MADV_SPLIT the primary mapping
4. UFFDIO_REGISTER/etc. the primary mapping
5. Copy memory contents into alias mapping and UFFDIO_CONTINUE the
corresponding PAGE_SIZE sections in the primary mapping.
More API changes may be added in the future.
Signed-off-by: James Houghton <jthoughton@google.com>
---
arch/alpha/include/uapi/asm/mman.h | 2 ++
arch/mips/include/uapi/asm/mman.h | 2 ++
arch/parisc/include/uapi/asm/mman.h | 2 ++
arch/xtensa/include/uapi/asm/mman.h | 2 ++
include/linux/hugetlb.h | 2 ++
include/uapi/asm-generic/mman-common.h | 2 ++
mm/hugetlb.c | 3 +--
mm/madvise.c | 26 ++++++++++++++++++++++++++
8 files changed, 39 insertions(+), 2 deletions(-)
Comments
Hi James,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on next-20230105]
[cannot apply to kvm/queue shuah-kselftest/next shuah-kselftest/fixes arnd-asm-generic/master linus/master kvm/linux-next v6.2-rc2 v6.2-rc1 v6.1 v6.2-rc2]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/James-Houghton/hugetlb-don-t-set-PageUptodate-for-UFFDIO_CONTINUE/20230105-182428
patch link: https://lore.kernel.org/r/20230105101844.1893104-10-jthoughton%40google.com
patch subject: [PATCH 09/46] mm: add MADV_SPLIT to enable HugeTLB HGM
config: m68k-allmodconfig
compiler: m68k-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/33a65f9a66e72ccc2c7151dc3ff9cb1d692074d8
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review James-Houghton/hugetlb-don-t-set-PageUptodate-for-UFFDIO_CONTINUE/20230105-182428
git checkout 33a65f9a66e72ccc2c7151dc3ff9cb1d692074d8
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
mm/madvise.c: In function 'madvise_split':
>> mm/madvise.c:1023:9: error: implicit declaration of function 'hugetlb_vma_lock_alloc'; did you mean 'hugetlb_vma_lock_write'? [-Werror=implicit-function-declaration]
1023 | hugetlb_vma_lock_alloc(vma);
| ^~~~~~~~~~~~~~~~~~~~~~
| hugetlb_vma_lock_write
cc1: some warnings being treated as errors
vim +1023 mm/madvise.c
1013
1014 static int madvise_split(struct vm_area_struct *vma,
1015 unsigned long *new_flags)
1016 {
1017 if (!is_vm_hugetlb_page(vma) || !hugetlb_hgm_eligible(vma))
1018 return -EINVAL;
1019 /*
1020 * Attempt to allocate the VMA lock again. If it isn't allocated,
1021 * MADV_COLLAPSE won't work.
1022 */
> 1023 hugetlb_vma_lock_alloc(vma);
1024
1025 /* PMD sharing doesn't work with HGM. */
1026 hugetlb_unshare_all_pmds(vma);
1027
1028 *new_flags |= VM_HUGETLB_HGM;
1029 return 0;
1030 }
1031
On 05.01.23 11:18, James Houghton wrote:
> Issuing ioctl(MADV_SPLIT) on a HugeTLB address range will enable
> HugeTLB HGM. MADV_SPLIT was chosen for the name so that this API can be
> applied to non-HugeTLB memory in the future, if such an application is
> to arise.
>
> MADV_SPLIT provides several API changes for some syscalls on HugeTLB
> address ranges:
> 1. UFFDIO_CONTINUE is allowed for MAP_SHARED VMAs at PAGE_SIZE
> alignment.
> 2. read()ing a page fault event from a userfaultfd will yield a
> PAGE_SIZE-rounded address, instead of a huge-page-size-rounded
> address (unless UFFD_FEATURE_EXACT_ADDRESS is used).
>
> There is no way to disable the API changes that come with issuing
> MADV_SPLIT. MADV_COLLAPSE can be used to collapse high-granularity page
> table mappings that come from the extended functionality that comes with
> using MADV_SPLIT.
>
> For post-copy live migration, the expected use-case is:
> 1. mmap(MAP_SHARED, some_fd) primary mapping
> 2. mmap(MAP_SHARED, some_fd) alias mapping
> 3. MADV_SPLIT the primary mapping
> 4. UFFDIO_REGISTER/etc. the primary mapping
> 5. Copy memory contents into alias mapping and UFFDIO_CONTINUE the
> corresponding PAGE_SIZE sections in the primary mapping.
>
> More API changes may be added in the future.
>
> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
> arch/alpha/include/uapi/asm/mman.h | 2 ++
> arch/mips/include/uapi/asm/mman.h | 2 ++
> arch/parisc/include/uapi/asm/mman.h | 2 ++
> arch/xtensa/include/uapi/asm/mman.h | 2 ++
> include/linux/hugetlb.h | 2 ++
> include/uapi/asm-generic/mman-common.h | 2 ++
> mm/hugetlb.c | 3 +--
> mm/madvise.c | 26 ++++++++++++++++++++++++++
> 8 files changed, 39 insertions(+), 2 deletions(-)
>
> diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
> index 763929e814e9..7a26f3648b90 100644
> --- a/arch/alpha/include/uapi/asm/mman.h
> +++ b/arch/alpha/include/uapi/asm/mman.h
> @@ -78,6 +78,8 @@
>
> #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
>
> +#define MADV_SPLIT 26 /* Enable hugepage high-granularity APIs */
I think we should make a split more generic, such that it also splits
(pte-maps) a THP. Has that been discussed?
On Thu, Jan 5, 2023 at 7:29 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 05.01.23 11:18, James Houghton wrote:
> > Issuing ioctl(MADV_SPLIT) on a HugeTLB address range will enable
> > HugeTLB HGM. MADV_SPLIT was chosen for the name so that this API can be
> > applied to non-HugeTLB memory in the future, if such an application is
> > to arise.
> >
> > MADV_SPLIT provides several API changes for some syscalls on HugeTLB
> > address ranges:
> > 1. UFFDIO_CONTINUE is allowed for MAP_SHARED VMAs at PAGE_SIZE
> > alignment.
> > 2. read()ing a page fault event from a userfaultfd will yield a
> > PAGE_SIZE-rounded address, instead of a huge-page-size-rounded
> > address (unless UFFD_FEATURE_EXACT_ADDRESS is used).
> >
> > There is no way to disable the API changes that come with issuing
> > MADV_SPLIT. MADV_COLLAPSE can be used to collapse high-granularity page
> > table mappings that come from the extended functionality that comes with
> > using MADV_SPLIT.
> >
> > For post-copy live migration, the expected use-case is:
> > 1. mmap(MAP_SHARED, some_fd) primary mapping
> > 2. mmap(MAP_SHARED, some_fd) alias mapping
> > 3. MADV_SPLIT the primary mapping
> > 4. UFFDIO_REGISTER/etc. the primary mapping
> > 5. Copy memory contents into alias mapping and UFFDIO_CONTINUE the
> > corresponding PAGE_SIZE sections in the primary mapping.
> >
> > More API changes may be added in the future.
> >
> > Signed-off-by: James Houghton <jthoughton@google.com>
> > ---
> > arch/alpha/include/uapi/asm/mman.h | 2 ++
> > arch/mips/include/uapi/asm/mman.h | 2 ++
> > arch/parisc/include/uapi/asm/mman.h | 2 ++
> > arch/xtensa/include/uapi/asm/mman.h | 2 ++
> > include/linux/hugetlb.h | 2 ++
> > include/uapi/asm-generic/mman-common.h | 2 ++
> > mm/hugetlb.c | 3 +--
> > mm/madvise.c | 26 ++++++++++++++++++++++++++
> > 8 files changed, 39 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
> > index 763929e814e9..7a26f3648b90 100644
> > --- a/arch/alpha/include/uapi/asm/mman.h
> > +++ b/arch/alpha/include/uapi/asm/mman.h
> > @@ -78,6 +78,8 @@
> >
> > #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
> >
> > +#define MADV_SPLIT 26 /* Enable hugepage high-granularity APIs */
>
> I think we should make a split more generic, such that it also splits
> (pte-maps) a THP. Has that been discussed?
Thanks James / David.
MADV_SPLIT for THP has come up a few times; firstly, during the
initial RFC about hugepage collapse in process context, as the natural
inverse operation required by a generic userspace-managed hugepage
daemon, the second -- which is more immediately practical -- is to
avoid stranding THPs on the deferred split queue (and thus still
incurring the memcg charge) for too long [1].
However, its exact semantics / API have yet to be discussed / flushed
out (though I'm planning to do exactly this in the near-term).
Just as James has co-opted MADV_COLLAPSE for hugetlb, we can co-opt
MADV_SPLIT for THP, when the time comes -- which I think makes a lot
of sense.
Hopefully I can get my ducks in order to start a discussion about this
eminently.
Best,
Zach
[1] https://lore.kernel.org/linux-mm/YZ9kUD5AG6inbUEg@xz-m1.local/
> --
> Thanks,
>
> David / dhildenb
>
@@ -78,6 +78,8 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_SPLIT 26 /* Enable hugepage high-granularity APIs */
+
/* compatibility flags */
#define MAP_FILE 0
@@ -105,6 +105,8 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_SPLIT 26 /* Enable hugepage high-granularity APIs */
+
/* compatibility flags */
#define MAP_FILE 0
@@ -72,6 +72,8 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_SPLIT 74 /* Enable hugepage high-granularity APIs */
+
#define MADV_HWPOISON 100 /* poison a page for testing */
#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */
@@ -113,6 +113,8 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_SPLIT 26 /* Enable hugepage high-granularity APIs */
+
/* compatibility flags */
#define MAP_FILE 0
@@ -109,6 +109,8 @@ struct hugetlb_vma_lock {
struct vm_area_struct *vma;
};
+void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
+
extern struct resv_map *resv_map_alloc(void);
void resv_map_release(struct kref *ref);
@@ -79,6 +79,8 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_SPLIT 26 /* Enable hugepage high-granularity APIs */
+
/* compatibility flags */
#define MAP_FILE 0
@@ -92,7 +92,6 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static inline bool subpool_is_free(struct hugepage_subpool *spool)
@@ -361,7 +360,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
}
}
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
{
struct hugetlb_vma_lock *vma_lock;
@@ -1011,6 +1011,24 @@ static long madvise_remove(struct vm_area_struct *vma,
return error;
}
+static int madvise_split(struct vm_area_struct *vma,
+ unsigned long *new_flags)
+{
+ if (!is_vm_hugetlb_page(vma) || !hugetlb_hgm_eligible(vma))
+ return -EINVAL;
+ /*
+ * Attempt to allocate the VMA lock again. If it isn't allocated,
+ * MADV_COLLAPSE won't work.
+ */
+ hugetlb_vma_lock_alloc(vma);
+
+ /* PMD sharing doesn't work with HGM. */
+ hugetlb_unshare_all_pmds(vma);
+
+ *new_flags |= VM_HUGETLB_HGM;
+ return 0;
+}
+
/*
* Apply an madvise behavior to a region of a vma. madvise_update_vma
* will handle splitting a vm area into separate areas, each area with its own
@@ -1089,6 +1107,11 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
break;
case MADV_COLLAPSE:
return madvise_collapse(vma, prev, start, end);
+ case MADV_SPLIT:
+ error = madvise_split(vma, &new_flags);
+ if (error)
+ goto out;
+ break;
}
anon_name = anon_vma_name(vma);
@@ -1183,6 +1206,9 @@ madvise_behavior_valid(int behavior)
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
case MADV_COLLAPSE:
+#endif
+#ifdef CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING
+ case MADV_SPLIT:
#endif
case MADV_DONTDUMP:
case MADV_DODUMP: