On Wed, 2023-08-02 at 15:20 +0100, Jonathan Cameron wrote:
> On Tue, 01 Aug 2023 23:55:37 -0600
> Vishal Verma <vishal.l.verma@intel.com> wrote:
>
> > The MHP_MEMMAP_ON_MEMORY flag for hotplugged memory is restricted to
> > 'memblock_size' chunks of memory being added. Adding a larger span of
> > memory precludes memmap_on_memory semantics.
> >
> > For users of hotplug such as kmem, large amounts of memory might get
> > added from the CXL subsystem. In some cases, this amount may exceed the
> > available 'main memory' to store the memmap for the memory being added.
> > In this case, it is useful to have a way to place the memmap on the
> > memory being added, even if it means splitting the addition into
> > memblock-sized chunks.
> >
> > Change add_memory_resource() to loop over memblock-sized chunks of
> > memory if the caller requested memmap_on_memory, and if the other
> > conditions for it are met. Teach try_remove_memory() to also expect
> > that a memory range being removed might have been split up into
> > memblock-sized chunks, and to loop through those as needed.
> >
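(For a sense of scale here: with 128 MiB memory blocks and 64-byte struct
pages on 4K x86, each block needs a 2 MiB memmap, so a hypothetical 1 GiB
hotplug request becomes eight memblock-sized additions, each hosting its
own 2 MiB memmap instead of consuming main memory. Illustrative numbers,
not from the patch.)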
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: David Hildenbrand <david@redhat.com>
> > Cc: Michal Hocko <mhocko@suse.com>
> > Cc: Oscar Salvador <osalvador@suse.de>
> > Cc: Dan Williams <dan.j.williams@intel.com>
> > Cc: Dave Jiang <dave.jiang@intel.com>
> > Cc: Dave Hansen <dave.hansen@linux.intel.com>
> > Cc: Huang Ying <ying.huang@intel.com>
> > Suggested-by: David Hildenbrand <david@redhat.com>
> > Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
>
> A couple of trivial comments inline.
Hi Jonathan,
Thanks for taking a look.
>
> > ---
> > mm/memory_hotplug.c | 150 ++++++++++++++++++++++++++++++++--------------------
> > 1 file changed, 93 insertions(+), 57 deletions(-)
> >
> > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> > index d282664f558e..cae03c8d4bbf 100644
> > --- a/mm/memory_hotplug.c
> > +++ b/mm/memory_hotplug.c
> > @@ -1383,6 +1383,44 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
> >  	return arch_supports_memmap_on_memory(vmemmap_size);
> >  }
> >
> > +static int add_memory_create_devices(int nid, struct memory_group *group,
> > +				     u64 start, u64 size, mhp_t mhp_flags)
> > +{
> > +	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
> > +	struct vmem_altmap mhp_altmap = {
> > +		.base_pfn = PHYS_PFN(start),
> > +		.end_pfn = PHYS_PFN(start + size - 1),
> > +	};
> > +	int ret;
> > +
> > +	if ((mhp_flags & MHP_MEMMAP_ON_MEMORY)) {
> > +		mhp_altmap.free = memory_block_memmap_on_memory_pages();
> > +		params.altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
> > +		if (!params.altmap)
> > +			return -ENOMEM;
> > +
> > +		memcpy(params.altmap, &mhp_altmap, sizeof(mhp_altmap));
> > +	}
> > +
> > +	/* call arch's memory hotadd */
> > +	ret = arch_add_memory(nid, start, size, &params);
> > +	if (ret < 0)
> > +		goto error;
> > +
> > +	/* create memory block devices after memory was added */
> > +	ret = create_memory_block_devices(start, size, params.altmap, group);
> > +	if (ret) {
> > +		arch_remove_memory(start, size, NULL);
>
> Maybe push this down to a second label?
Yep will do.
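i.e. something like this (quick, untested sketch - label names are
placeholders):

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size, &params);
	if (ret < 0)
		goto err_kfree_altmap;

	/* create memory block devices after memory was added */
	ret = create_memory_block_devices(start, size, params.altmap, group);
	if (ret)
		goto err_remove_memory;

	return 0;

err_remove_memory:
	arch_remove_memory(start, size, NULL);
err_kfree_altmap:
	kfree(params.altmap);
	return ret;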
>
<snip>
> > +
> > +static int __ref try_remove_memory(u64 start, u64 size)
> > +{
> > +	int ret, nid = NUMA_NO_NODE;
>
> I'm not overly keen to see the trivial rename of rc -> ret in here.
> Just makes it ever so slightly harder to compare old code and new code.
Yep - this was to work around the patches this series was based on, which
added a 'ret' while leaving the original 'rc' in place [1]. Aneesh will
stick to 'rc', so my next revision should sort this out naturally.
[1]: https://lore.kernel.org/all/715042319ceb86016a4986862a82756e5629d725.camel@intel.com/
>
> > @@ -1383,6 +1383,44 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
> >  	return arch_supports_memmap_on_memory(vmemmap_size);
> >  }
> >
> > +static int add_memory_create_devices(int nid, struct memory_group *group,
> > +				     u64 start, u64 size, mhp_t mhp_flags)
> > +{
> > +	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
> > +	struct vmem_altmap mhp_altmap = {
> > +		.base_pfn = PHYS_PFN(start),
> > +		.end_pfn = PHYS_PFN(start + size - 1),
> > +	};
> > +	int ret;
> > +
> > +	if ((mhp_flags & MHP_MEMMAP_ON_MEMORY)) {
> > +		mhp_altmap.free = memory_block_memmap_on_memory_pages();
> > +		params.altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
> > +		if (!params.altmap)
> > +			return -ENOMEM;
> > +
> > +		memcpy(params.altmap, &mhp_altmap, sizeof(mhp_altmap));
> > +	}
> > +
> > +	/* call arch's memory hotadd */
> > +	ret = arch_add_memory(nid, start, size, &params);
> > +	if (ret < 0)
> > +		goto error;
> > +
> > +	/* create memory block devices after memory was added */
> > +	ret = create_memory_block_devices(start, size, params.altmap, group);
> > +	if (ret) {
> > +		arch_remove_memory(start, size, NULL);
> > +		goto error;
> > +	}
> > +
> > +	return 0;
> > +
> > +error:
> > +	kfree(params.altmap);
> > +	return ret;
> > +}
> > +
> >  /*
> >   * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
> >   * and online/offline operations (triggered e.g. by sysfs).
> > @@ -1391,14 +1429,10 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
> >   */
> >  int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
> >  {
> > -	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
> > +	unsigned long memblock_size = memory_block_size_bytes();
> >  	enum memblock_flags memblock_flags = MEMBLOCK_NONE;
> > -	struct vmem_altmap mhp_altmap = {
> > -		.base_pfn = PHYS_PFN(res->start),
> > -		.end_pfn = PHYS_PFN(res->end),
> > -	};
> >  	struct memory_group *group = NULL;
> > -	u64 start, size;
> > +	u64 start, size, cur_start;
> >  	bool new_node = false;
> >  	int ret;
> >
> > @@ -1439,28 +1473,21 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
> >  	/*
> >  	 * Self hosted memmap array
> >  	 */
> > -	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
> > -		if (mhp_supports_memmap_on_memory(size)) {
> > -			mhp_altmap.free = memory_block_memmap_on_memory_pages();
> > -			params.altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
> > -			if (!params.altmap)
> > +	if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
> > +	    mhp_supports_memmap_on_memory(memblock_size)) {
> > +		for (cur_start = start; cur_start < start + size;
> > +		     cur_start += memblock_size) {
> > +			ret = add_memory_create_devices(nid, group, cur_start,
> > +							memblock_size,
> > +							mhp_flags);
> > +			if (ret)
> >  				goto error;
> > -
> > -			memcpy(params.altmap, &mhp_altmap, sizeof(mhp_altmap));
> >  		}
> > -		/* fallback to not using altmap */
> > -	}
> > -
> > -	/* call arch's memory hotadd */
> > -	ret = arch_add_memory(nid, start, size, &params);
> > -	if (ret < 0)
> > -		goto error_free;
> > -
> > -	/* create memory block devices after memory was added */
> > -	ret = create_memory_block_devices(start, size, params.altmap, group);
> > -	if (ret) {
> > -		arch_remove_memory(start, size, NULL);
> > -		goto error_free;
> > +	} else {
> > +		ret = add_memory_create_devices(nid, group, start, size,
> > +						mhp_flags);
> > +		if (ret)
> > +			goto error;
> >  	}
> >
> >  	if (new_node) {
> > @@ -1497,8 +1524,6 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
> >  	walk_memory_blocks(start, size, NULL, online_memory_block);
> >
> >  	return ret;
> > -error_free:
> > -	kfree(params.altmap);
> >  error:
> >  	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
> >  		memblock_remove(start, size);
> > @@ -2149,40 +2174,14 @@ void try_offline_node(int nid)
> >  }
> >  EXPORT_SYMBOL(try_offline_node);
> >
> > -static int __ref try_remove_memory(u64 start, u64 size)
> > +static void __ref __try_remove_memory(int nid, u64 start, u64 size)
> >  {
> > -	int ret;
> > -	struct memory_block *mem;
> > -	int rc = 0, nid = NUMA_NO_NODE;
> >  	struct vmem_altmap *altmap = NULL;
> > +	struct memory_block *mem;
> > +	int ret;
> >
> > -	BUG_ON(check_hotplug_memory_range(start, size));
> > -
> > -	/*
> > -	 * All memory blocks must be offlined before removing memory. Check
> > -	 * whether all memory blocks in question are offline and return error
> > -	 * if this is not the case.
> > -	 *
> > -	 * While at it, determine the nid. Note that if we'd have mixed nodes,
> > -	 * we'd only try to offline the last determined one -- which is good
> > -	 * enough for the cases we care about.
> > -	 */
> > -	rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb);
> > -	if (rc)
> > -		return rc;
> > -
> > -	/*
> > -	 * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
> > -	 * the same granularity it was added - a single memory block.
> > -	 */
> >  	ret = walk_memory_blocks(start, size, &mem, test_has_altmap_cb);
> >  	if (ret) {
> > -		if (size != memory_block_size_bytes()) {
> > -			pr_warn("Refuse to remove %#llx - %#llx,"
> > -				"wrong granularity\n",
> > -				start, start + size);
> > -			return -EINVAL;
> > -		}
> >  		altmap = mem->altmap;
> >  		/*
> >  		 * Mark altmap NULL so that we can add a debug
> > @@ -2221,6 +2220,43 @@ static int __ref try_remove_memory(u64 start, u64 size)
> >  	try_offline_node(nid);
> >
> >  	mem_hotplug_done();
> > +}
> > +
> > +static int __ref try_remove_memory(u64 start, u64 size)
> > +{
> > +	int ret, nid = NUMA_NO_NODE;
> > +
> > +	BUG_ON(check_hotplug_memory_range(start, size));
> > +
> > +	/*
> > +	 * All memory blocks must be offlined before removing memory. Check
> > +	 * whether all memory blocks in question are offline and return error
> > +	 * if this is not the case.
> > +	 *
> > +	 * While at it, determine the nid. Note that if we'd have mixed nodes,
> > +	 * we'd only try to offline the last determined one -- which is good
> > +	 * enough for the cases we care about.
> > +	 */
> > +	ret = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb);
> > +	if (ret)
> > +		return ret;
> > +
> > +	/*
> > +	 * For memmap_on_memory, the altmaps could have been added on
> > +	 * a per-memblock basis. Loop through the entire range if so,
> > +	 * and remove each memblock and its altmap.
> > +	 */
> > +	if (mhp_memmap_on_memory()) {
> > +		unsigned long memblock_size = memory_block_size_bytes();
> > +		u64 cur_start;
> > +
> > +		for (cur_start = start; cur_start < start + size;
> > +		     cur_start += memblock_size)
> > +			__try_remove_memory(nid, cur_start, memblock_size);
> > +	} else {
> > +		__try_remove_memory(nid, start, size);
> > +	}
> > +
> >  	return 0;
> >  }
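
As an aside, the eventual consumer (dax/kmem) only needs to pass the flag
through - roughly like this sketch of the kmem hookup (not the exact hunk
from that patch; error handling elided):

	rc = add_memory_driver_managed(data->mgid, range.start,
				       range_len(&range), kmem_name,
				       MHP_NID_IS_MGID | MHP_MEMMAP_ON_MEMORY);

add_memory_resource() then splits the request into memblock-sized pieces
as above.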