[PATCHv4,14/14] x86/acpi: Add support for CPU offlining for ACPI MADT wakeup method

Message ID 20231205004510.27164-15-kirill.shutemov@linux.intel.com
State New
Series x86/tdx: Add kexec support

Commit Message

Kirill A. Shutemov Dec. 5, 2023, 12:45 a.m. UTC
MADT Multiprocessor Wakeup structure version 1 brings support for CPU
offlining: BIOS provides a reset vector where the CPU has to jump to
for offlining itself. The new TEST mailbox command can be used to test
whether the CPU offlined itself, which means the BIOS has control over
the CPU and can online it again via the ACPI MADT wakeup method.

Add CPU offlining support for the ACPI MADT wakeup method by
implementing custom cpu_die(), play_dead() and stop_other_cpus() SMP
operations.

CPU offlining makes it possible to hand over secondary CPUs across
kexec, not limiting the second kernel to a single CPU.

The change conforms to the approved ACPI spec change proposal. See the
Link.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Link: https://lore.kernel.org/all/13356251.uLZWGnKmhe@kreacher
---
 arch/x86/include/asm/smp.h           |   1 +
 arch/x86/kernel/acpi/Makefile        |   2 +-
 arch/x86/kernel/acpi/madt_playdead.S |  21 ++
 arch/x86/kernel/acpi/madt_wakeup.c   | 295 +++++++++++++++++++++++++--
 arch/x86/kernel/reboot.c             |  12 +-
 include/acpi/actbl2.h                |  15 +-
 6 files changed, 321 insertions(+), 25 deletions(-)
 create mode 100644 arch/x86/kernel/acpi/madt_playdead.S
  

Comments

Kai Huang Dec. 5, 2023, 11:36 p.m. UTC | #1
> +
> +static void acpi_mp_stop_other_cpus(int wait)
> +{
> +	smp_shutdown_nonboot_cpus(smp_processor_id());
> +}

Is this and ...

+	smp_ops.stop_other_cpus = acpi_mp_stop_other_cpus;

... this below still needed?

I think the current native_stop_other_cpus() should just work given you have set
up ...

+	smp_ops.crash_play_dead = crash_acpi_mp_play_dead;

... for TDX guest?

> +
> +/* The argument is required to match type of x86_mapping_info::alloc_pgt_page */
> +static void __init *alloc_pgt_page(void *dummy)
> +{
> +	return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
> +}
> +
> +/*
> + * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
> + * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
> + * to the identity mapping and the function has to be present at the same spot in
> + * the virtual address space before and after switching page tables.
> + */
> +static int __init init_transition_pgtable(pgd_t *pgd)
> +{
> +	pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
> +	unsigned long vaddr, paddr;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte;
> +
> +	vaddr = (unsigned long)asm_acpi_mp_play_dead;
> +	pgd += pgd_index(vaddr);
> +	if (!pgd_present(*pgd)) {
> +		p4d = (p4d_t *)alloc_pgt_page(NULL);
> +		if (!p4d)
> +			return -ENOMEM;
> +		set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
> +	}
> +	p4d = p4d_offset(pgd, vaddr);
> +	if (!p4d_present(*p4d)) {
> +		pud = (pud_t *)alloc_pgt_page(NULL);
> +		if (!pud)
> +			return -ENOMEM;
> +		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
> +	}
> +	pud = pud_offset(p4d, vaddr);
> +	if (!pud_present(*pud)) {
> +		pmd = (pmd_t *)alloc_pgt_page(NULL);
> +		if (!pmd)
> +			return -ENOMEM;
> +		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
> +	}
> +	pmd = pmd_offset(pud, vaddr);
> +	if (!pmd_present(*pmd)) {
> +		pte = (pte_t *)alloc_pgt_page(NULL);
> +		if (!pte)
> +			return -ENOMEM;
> +		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
> +	}
> +	pte = pte_offset_kernel(pmd, vaddr);
> +
> +	paddr = __pa(vaddr);
> +	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
> +
> +	return 0;
> +}

Sorry for saying this late.  I think we can also use kernel_ident_mapping_init()
to do the init_transition_pgtable()?  We can set struct x86_mapping_info::offset
to __PAGE_OFFSET to do that?

Looks like set_up_temporary_mappings() in arch/x86/power/hibernate_64.c uses the same
trick.

Anyway I am not sure how many LoC (assuming it can be done) can be saved, so up to you.
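
For illustration, a sketch of what that could look like, assuming
x86_mapping_info::offset behaves as in set_up_temporary_mappings(). Note that
asm_acpi_mp_play_dead() lives in kernel text, so the offset that reproduces its
kernel virtual address would be the text-mapping one (__START_KERNEL_map -
phys_base) rather than __PAGE_OFFSET:

/* Sketch only: fold init_transition_pgtable() into kernel_ident_mapping_init() */
static int __init init_transition_pgtable(pgd_t *pgd)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page	= alloc_pgt_page,
		.page_flag	= __PAGE_KERNEL_LARGE_EXEC,
		.kernpg_flag	= _KERNPG_TABLE_NOENC,
		/* Map at the kernel-text virtual address, not 1:1 */
		.offset		= __START_KERNEL_map - phys_base,
	};
	unsigned long paddr = __pa(asm_acpi_mp_play_dead);

	return kernel_ident_mapping_init(&info, pgd, PAGE_ALIGN_DOWN(paddr),
					 PAGE_ALIGN(paddr + 1));
}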

> +
> +static void __init free_pte(pmd_t *pmd)
> +{
> +	pte_t *pte = pte_offset_kernel(pmd, 0);
> +
> +	memblock_free(pte, PAGE_SIZE);
> +}
> +
> +static void __init free_pmd(pud_t *pud)
> +{
> +	pmd_t *pmd = pmd_offset(pud, 0);
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++) {
> +		if (!pmd_present(pmd[i]))
> +		    continue;
> +
> +		if (pmd_leaf(pmd[i]))
> +		    continue;
> +
> +		free_pte(&pmd[i]);
> +	}
> +
> +	memblock_free(pmd, PAGE_SIZE);
> +}
> +
> +static void __init free_pud(p4d_t *p4d)
> +{
> +	pud_t *pud = pud_offset(p4d, 0);
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++) {
> +		if (!pud_present(pud[i]))
> +			continue;
> +
> +		if (pud_leaf(pud[i]))
> +		    continue;
> +
> +		free_pmd(&pud[i]);
> +	}
> +
> +	memblock_free(pud, PAGE_SIZE);
> +}
> +
> +static void __init free_p4d(pgd_t *pgd)
> +{
> +	p4d_t *p4d = p4d_offset(pgd, 0);
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_P4D; i++) {
> +		if (!p4d_present(p4d[i]))
> +			continue;
> +
> +		free_pud(&p4d[i]);
> +	}
> +
> +	if (pgtable_l5_enabled())
> +		memblock_free(p4d, PAGE_SIZE);
> +}
> +
> +static void __init free_pgd(pgd_t *pgd)
> +{
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PGD; i++) {
> +		if (!pgd_present(pgd[i]))
> +			continue;
> +
> +		free_p4d(&pgd[i]);
> +	}
> +
> +	memblock_free(pgd, PAGE_SIZE);
> +}

It's a little bit sad such cleanup code isn't in common code, e.g., with a 

	void (*free_pgt_page)(void *);

to allow the user to specify how to free the page table.

But this can be future job if needed.
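
For illustration, a sketch of such a common-code hook, mirroring how
alloc_pgt_page is already parameterized; kernel_ident_mapping_free() is
hypothetical here:

struct x86_mapping_info {
	void *(*alloc_pgt_page)(void *);	/* allocate buf for page table */
	void (*free_pgt_page)(void *, void *);	/* free buf for page table */
	void *context;				/* context for alloc_pgt_page */
	unsigned long page_flag;		/* page flag for PMD or PUD entry */
	unsigned long offset;			/* ident mapping offset */
	bool direct_gbpages;			/* PUD level 1GB page support */
	unsigned long kernpg_flag;		/* kernel pagetable flag override */
};

/* Would walk and free the tree like free_pgd() above, via the callback */
void kernel_ident_mapping_free(struct x86_mapping_info *info, pgd_t *pgd);

/* The MADT wakeup code would then only supply: */
static void __init free_pgt_page(void *pgt, void *dummy)
{
	memblock_free(pgt, PAGE_SIZE);
}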


[...]

>  int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
>  			      const unsigned long end)
>  {
>  	struct acpi_madt_multiproc_wakeup *mp_wake;
>  
>  	mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
> -	if (BAD_MADT_ENTRY(mp_wake, end))
> +
> +        /*
> +         * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
> +         * entry.  'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger
> +         * than the actual size of the MP wakeup entry in ACPI table because the
> +	 * 'reset_vector' is only available in the V1 MP wakeup structure.
> +         */

Space/tab issue.

> +	if (!mp_wake)
> +		return -EINVAL;
> +	if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
> +		return -EINVAL;
> +	if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
>  		return -EINVAL;
>
  
Thomas Gleixner Dec. 15, 2023, 8:29 p.m. UTC | #2
On Tue, Dec 05 2023 at 03:45, Kirill A. Shutemov wrote:

> MADT Multiprocessor Wakeup structure version 1 brings support of CPU
> offlining: BIOS provides a reset vector where the CPU has to jump to
> offline itself.

CPU has to jump to for offlining itself.

> The new TEST mailbox command can be used to test the CPU offlined
> successfully and BIOS has control over it.

test whether the CPU offlined itself which means the BIOS has control
over the CPU and can online it again via the ACPI MADT wakeup method.

> Add CPU offling support for ACPI MADT wakeup method by implementing

for the ACPI

> custom cpu_die, play_dead and stop_other_cpus SMP operations.

cpu_die(), play_dead() ...

> CPU offlining makes is possible to hand over secondary CPUs over kexec,
> not limiting the second kernel to single CPU.

to a single CPU.

> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
> index 4fab2ed454f3..3c8efba86d5c 100644
> --- a/arch/x86/include/asm/smp.h
> +++ b/arch/x86/include/asm/smp.h
> @@ -38,6 +38,7 @@ struct smp_ops {
>  	int (*cpu_disable)(void);
>  	void (*cpu_die)(unsigned int cpu);
>  	void (*play_dead)(void);
> +	void (*crash_play_dead)(void);

This new callback and the callsite change wants to be introduced in a
preparatory patch. This one is doing too many things at once, really.
  
> diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S
> new file mode 100644
> index 000000000000..68f83865a1e3
> --- /dev/null
> +++ b/arch/x86/kernel/acpi/madt_playdead.S
> @@ -0,0 +1,21 @@
> +#include <linux/linkage.h>
> +#include <asm/nospec-branch.h>
> +#include <asm/page_types.h>
> +#include <asm/processor-flags.h>
> +
> +	.text
> +	.align PAGE_SIZE

Newline please

Please document what the register arguments to this function are.

> +SYM_FUNC_START(asm_acpi_mp_play_dead)
> +	/* Turn off global entries. Following CR3 write will flush them. */
> +	movq	%cr4, %rdx
> +	andq	$~(X86_CR4_PGE), %rdx
> +	movq	%rdx, %cr4
> +
> +	/* Switch to identity mapping */
> +	movq	%rsi, %rax
> +	movq	%rax, %cr3
> +
> +	/* Jump to reset vector */
> +	ANNOTATE_RETPOLINE_SAFE
> +	jmp	*%rdi
> +SYM_FUNC_END(asm_acpi_mp_play_dead)
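
For reference: with the System V x86-64 calling convention and the prototype
quoted just below, %rdi is the physical address of the reset vector and %rsi
is the physical address of the identity-mapped page table, so a documented
header might look like this (sketch):

/*
 * asm_acpi_mp_play_dead() - Hand control of the CPU over to the BIOS
 *
 * rdi: physical address of the reset vector to jump to
 * rsi: physical address of the identity-mapped page table (PGD)
 */
SYM_FUNC_START(asm_acpi_mp_play_dead)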

> +static u64 acpi_mp_pgd __ro_after_init;
> +static u64 acpi_mp_reset_vector_paddr __ro_after_init;
> +
> +void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);

Declarations want to be in a header file.

> +static void crash_acpi_mp_play_dead(void)
> +{
> +	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr,
> +			      acpi_mp_pgd);

Pointless line break.

> +}
> +
> +static void acpi_mp_play_dead(void)
> +{
> +	play_dead_common();
> +	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr,
> +			      acpi_mp_pgd);

Ditto.

> +}
> +
> +static void acpi_mp_cpu_die(unsigned int cpu)
> +{
> +	u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
> +	unsigned long timeout;
> +
> +	/*
> +	 * Use TEST mailbox command to prove that BIOS got control over
> +	 * the CPU before declaring it dead.
> +	 *
> +	 * BIOS has to clear 'command' field of the mailbox.
> +	 */
> +	acpi_mp_wake_mailbox->apic_id = apicid;
> +	smp_store_release(&acpi_mp_wake_mailbox->command,
> +			  ACPI_MP_WAKE_COMMAND_TEST);
> +
> +	/* Don't wait longer than a second. */
> +	timeout = USEC_PER_SEC;
> +	while (READ_ONCE(acpi_mp_wake_mailbox->command) && timeout--)
> +		udelay(1);

So this waits and then does nothing if the wait fails. What's the point?
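
A minimal improvement (a sketch, not necessarily what a later revision does)
would be to at least report the failed handover once the timeout expires:

	/* Don't wait longer than a second. */
	timeout = USEC_PER_SEC;
	while (READ_ONCE(acpi_mp_wake_mailbox->command) && timeout--)
		udelay(1);

	/* Sketch: make a timed-out handover visible instead of silent */
	if (READ_ONCE(acpi_mp_wake_mailbox->command))
		pr_err("Failed to hand over CPU %d to BIOS\n", cpu);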

...
<SNIP 170 lines of pagetable muck>

Do we really need this specific hackery or is there some similar
identity mapping muck which can be generalized?

> +	smp_ops.play_dead = acpi_mp_play_dead;
> +	smp_ops.crash_play_dead = crash_acpi_mp_play_dead;
> +	smp_ops.cpu_die = acpi_mp_cpu_die;
> +	smp_ops.stop_other_cpus = acpi_mp_stop_other_cpus;
> +
> +	acpi_mp_reset_vector_paddr = reset_vector;
> +	acpi_mp_pgd = __pa(pgd);
> +
> +	return 0;
> +}
> +
>  static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
>  {
>  	if (!acpi_mp_wake_mailbox_paddr) {
> @@ -68,37 +299,63 @@ static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
>  	return 0;
>  }
>  
> +static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
> +{
> +	cpu_hotplug_disable_offlining();
> +
> +	/*
> +	 * Zero out mailbox address in the ACPI MADT wakeup structure
> +	 * to indicate that the mailbox is not usable.  This prevents
> +	 * the kexec()-ed kernel from reading a valid mailbox, which in
> +	 * turn makes the kexec()-ed kernel only be able to use the boot
> +	 * CPU.
> +	 *
> +	 * This is a Linux-specific protocol and is not reflected in the ACPI spec.
> +	 *
> +	 * acpi_mp_wake_mailbox_paddr already has the mailbox address.
> +	 * The acpi_wakeup_cpu() will use it to bring up secondary CPUs for
> +	 * the current kernel.
> +	 */
> +	mp_wake->mailbox_address = 0;
> +}

The previous patch could have split this out into a helper already, no?

> +
>  int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
>  			      const unsigned long end)
>  {
>  	struct acpi_madt_multiproc_wakeup *mp_wake;
>  
>  	mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
> -	if (BAD_MADT_ENTRY(mp_wake, end))
> +
> +        /*
> +         * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
> +         * entry.  'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger
> +         * than the actual size of the MP wakeup entry in ACPI table because the
> +	 * 'reset_vector' is only available in the V1 MP wakeup structure.
> +         */

The comment is white space damaged. Use tabs everywhere please and not
only in one line.

Thanks,

        tglx
  
Kirill A. Shutemov Dec. 22, 2023, 11:19 a.m. UTC | #3
On Tue, Dec 05, 2023 at 11:36:55PM +0000, Huang, Kai wrote:
> 
> > +
> > +static void acpi_mp_stop_other_cpus(int wait)
> > +{
> > +	smp_shutdown_nonboot_cpus(smp_processor_id());
> > +}
> 
> Is this and ...
> 
> +	smp_ops.stop_other_cpus = acpi_mp_stop_other_cpus;
> 
> ... this below still needed?
> 
> I think the current native_stop_other_cpus() should just work given you have set
> up ...
> 
> +	smp_ops.crash_play_dead = crash_acpi_mp_play_dead;
> 
> ... for TDX guest?

To make it work, stop_this_cpu() would need to be modified to use
smp_ops.crash_play_dead() instead of native_halt(). But the name of the
callback doesn't match the function, so I renamed it to
smp_ops.stop_this_cpu().
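
A sketch of what that rename implies for arch/x86/kernel/process.c (names
taken from this reply; the surrounding housekeeping is elided):

void __noreturn stop_this_cpu(void *dummy)
{
	local_irq_disable();

	/* ... APIC shutdown and cache management elided ... */

	/* Let the platform park the CPU, e.g. via the MADT reset vector */
	if (smp_ops.stop_this_cpu) {
		smp_ops.stop_this_cpu();
		unreachable();
	}

	for (;;)
		native_halt();
}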

> Sorry for saying this late.  I think we can also use kernel_ident_mapping_init()
> to do the init_transition_pgtable()?  We can set struct x86_mapping_info::offset
> to __PAGE_OFFSET to do that?
> 
> Looks like set_up_temporary_mappings() in arch/x86/power/hibernate_64.c uses the same
> trick.
> 
> Anyway I am not sure how many LoC (assuming it can be done) can be saved, so up to you.

Yeah. The benefit is not clear to me. I will leave it as is.


> 
> It's a little bit sad such cleanup code isn't in common code, e.g., with a 
> 
> 	void (*free_pgt_page)(void *);
> 
> to allow the user to specify how to free the page table.
> 
> But this can be future job if needed.

I will consider moving this cleanup in common code. And maybe fix other
users of kernel_ident_mapping_init(). Nobody seems to care to cleanup page
tables on ENOMEM.
  
Kai Huang Dec. 22, 2023, 11:38 a.m. UTC | #4
On Fri, 2023-12-22 at 14:19 +0300, kirill.shutemov@linux.intel.com wrote:
> On Tue, Dec 05, 2023 at 11:36:55PM +0000, Huang, Kai wrote:
> > 
> > > +
> > > +static void acpi_mp_stop_other_cpus(int wait)
> > > +{
> > > +	smp_shutdown_nonboot_cpus(smp_processor_id());
> > > +}
> > 
> > Is this and ...
> > 
> > +	smp_ops.stop_other_cpus = acpi_mp_stop_other_cpus;
> > 
> > ... this below still needed?
> > 
> > I think the current native_stop_other_cpus() should just work given you have set
> > up ...
> > 
> > +	smp_ops.crash_play_dead = crash_acpi_mp_play_dead;
> > 
> > ... for TDX guest?
> 
> To make it work stop_this_cpu() would need to be modified to use
> smp_ops.crash_play_dead() instead of native_halt(). But name of the
> callback doesn't match the function, so I renamed it to
> smp_ops.stop_this_cpu().

Seems reasonable to me.  Thanks.
  
Kirill A. Shutemov Dec. 22, 2023, 4:34 p.m. UTC | #5
On Fri, Dec 15, 2023 at 09:29:13PM +0100, Thomas Gleixner wrote:
> So this waits and then does nothing if the wait fails. What's the point?
> 
> ...
> <SNIP 170 lines of pagetable muck>
> 
> Do we really need this specific hackery or is there some similar
> identity mapping muck which can be generalized?

I've addressed all your feedback, but this gave me pause. Looks like
none of the kernel_ident_mapping_init() users frees memory on failure.

Is it okay to keep this part as is? I will follow up with a patchset
that fixes memory handling for all kernel_ident_mapping_init() users.
  

Patch

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4fab2ed454f3..3c8efba86d5c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -38,6 +38,7 @@  struct smp_ops {
 	int (*cpu_disable)(void);
 	void (*cpu_die)(unsigned int cpu);
 	void (*play_dead)(void);
+	void (*crash_play_dead)(void);
 
 	void (*send_call_func_ipi)(const struct cpumask *mask);
 	void (*send_call_func_single_ipi)(int cpu);
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 8c7329c88a75..37b1f28846de 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -4,7 +4,7 @@  obj-$(CONFIG_ACPI)			+= boot.o
 obj-$(CONFIG_ACPI_SLEEP)		+= sleep.o wakeup_$(BITS).o
 obj-$(CONFIG_ACPI_APEI)			+= apei.o
 obj-$(CONFIG_ACPI_CPPC_LIB)		+= cppc.o
-obj-$(CONFIG_X86_ACPI_MADT_WAKEUP)	+= madt_wakeup.o
+obj-$(CONFIG_X86_ACPI_MADT_WAKEUP)	+= madt_wakeup.o madt_playdead.o
 
 ifneq ($(CONFIG_ACPI_PROCESSOR),)
 obj-y					+= cstate.o
diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S
new file mode 100644
index 000000000000..68f83865a1e3
--- /dev/null
+++ b/arch/x86/kernel/acpi/madt_playdead.S
@@ -0,0 +1,21 @@ 
+#include <linux/linkage.h>
+#include <asm/nospec-branch.h>
+#include <asm/page_types.h>
+#include <asm/processor-flags.h>
+
+	.text
+	.align PAGE_SIZE
+
+SYM_FUNC_START(asm_acpi_mp_play_dead)
+	/* Turn off global entries. Following CR3 write will flush them. */
+	movq	%cr4, %rdx
+	andq	$~(X86_CR4_PGE), %rdx
+	movq	%rdx, %cr4
+
+	/* Switch to identity mapping */
+	movq	%rsi, %rax
+	movq	%rax, %cr3
+
+	/* Jump to reset vector */
+	ANNOTATE_RETPOLINE_SAFE
+	jmp	*%rdi
+SYM_FUNC_END(asm_acpi_mp_play_dead)
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
index 5d92d12f1042..f8cf7a048743 100644
--- a/arch/x86/kernel/acpi/madt_wakeup.c
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -1,9 +1,18 @@ 
 #include <linux/acpi.h>
 #include <linux/cpu.h>
+#include <linux/delay.h>
 #include <linux/io.h>
+#include <linux/kexec.h>
+#include <linux/memblock.h>
+#include <linux/pgtable.h>
+#include <linux/sched/hotplug.h>
 #include <asm/apic.h>
 #include <asm/barrier.h>
+#include <asm/init.h>
+#include <asm/intel_pt.h>
+#include <asm/nmi.h>
 #include <asm/processor.h>
+#include <asm/reboot.h>
 
 /* Physical address of the Multiprocessor Wakeup Structure mailbox */
 static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
@@ -11,6 +20,228 @@  static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
 /* Virtual address of the Multiprocessor Wakeup Structure mailbox */
 static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox __ro_after_init;
 
+static u64 acpi_mp_pgd __ro_after_init;
+static u64 acpi_mp_reset_vector_paddr __ro_after_init;
+
+void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);
+
+static void crash_acpi_mp_play_dead(void)
+{
+	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_play_dead(void)
+{
+	play_dead_common();
+	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_cpu_die(unsigned int cpu)
+{
+	u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
+	unsigned long timeout;
+
+	/*
+	 * Use TEST mailbox command to prove that BIOS got control over
+	 * the CPU before declaring it dead.
+	 *
+	 * BIOS has to clear 'command' field of the mailbox.
+	 */
+	acpi_mp_wake_mailbox->apic_id = apicid;
+	smp_store_release(&acpi_mp_wake_mailbox->command,
+			  ACPI_MP_WAKE_COMMAND_TEST);
+
+	/* Don't wait longer than a second. */
+	timeout = USEC_PER_SEC;
+	while (READ_ONCE(acpi_mp_wake_mailbox->command) && timeout--)
+		udelay(1);
+}
+
+static void acpi_mp_stop_other_cpus(int wait)
+{
+	smp_shutdown_nonboot_cpus(smp_processor_id());
+}
+
+/* The argument is required to match type of x86_mapping_info::alloc_pgt_page */
+static void __init *alloc_pgt_page(void *dummy)
+{
+	return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+}
+
+/*
+ * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
+ * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
+ * to the identity mapping and the function has to be present at the same spot in
+ * the virtual address space before and after switching page tables.
+ */
+static int __init init_transition_pgtable(pgd_t *pgd)
+{
+	pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
+	unsigned long vaddr, paddr;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	vaddr = (unsigned long)asm_acpi_mp_play_dead;
+	pgd += pgd_index(vaddr);
+	if (!pgd_present(*pgd)) {
+		p4d = (p4d_t *)alloc_pgt_page(NULL);
+		if (!p4d)
+			return -ENOMEM;
+		set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+	}
+	p4d = p4d_offset(pgd, vaddr);
+	if (!p4d_present(*p4d)) {
+		pud = (pud_t *)alloc_pgt_page(NULL);
+		if (!pud)
+			return -ENOMEM;
+		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+	}
+	pud = pud_offset(p4d, vaddr);
+	if (!pud_present(*pud)) {
+		pmd = (pmd_t *)alloc_pgt_page(NULL);
+		if (!pmd)
+			return -ENOMEM;
+		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+	}
+	pmd = pmd_offset(pud, vaddr);
+	if (!pmd_present(*pmd)) {
+		pte = (pte_t *)alloc_pgt_page(NULL);
+		if (!pte)
+			return -ENOMEM;
+		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+	}
+	pte = pte_offset_kernel(pmd, vaddr);
+
+	paddr = __pa(vaddr);
+	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
+
+	return 0;
+}
+
+static void __init free_pte(pmd_t *pmd)
+{
+	pte_t *pte = pte_offset_kernel(pmd, 0);
+
+	memblock_free(pte, PAGE_SIZE);
+}
+
+static void __init free_pmd(pud_t *pud)
+{
+	pmd_t *pmd = pmd_offset(pud, 0);
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		if (!pmd_present(pmd[i]))
+			continue;
+
+		if (pmd_leaf(pmd[i]))
+			continue;
+
+		free_pte(&pmd[i]);
+	}
+
+	memblock_free(pmd, PAGE_SIZE);
+}
+
+static void __init free_pud(p4d_t *p4d)
+{
+	pud_t *pud = pud_offset(p4d, 0);
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		if (!pud_present(pud[i]))
+			continue;
+
+		if (pud_leaf(pud[i]))
+			continue;
+
+		free_pmd(&pud[i]);
+	}
+
+	memblock_free(pud, PAGE_SIZE);
+}
+
+static void __init free_p4d(pgd_t *pgd)
+{
+	p4d_t *p4d = p4d_offset(pgd, 0);
+	int i;
+
+	for (i = 0; i < PTRS_PER_P4D; i++) {
+		if (!p4d_present(p4d[i]))
+			continue;
+
+		free_pud(&p4d[i]);
+	}
+
+	if (pgtable_l5_enabled())
+		memblock_free(p4d, PAGE_SIZE);
+}
+
+static void __init free_pgd(pgd_t *pgd)
+{
+	int i;
+
+	for (i = 0; i < PTRS_PER_PGD; i++) {
+		if (!pgd_present(pgd[i]))
+			continue;
+
+		free_p4d(&pgd[i]);
+	}
+
+	memblock_free(pgd, PAGE_SIZE);
+}
+
+static int __init acpi_mp_setup_reset(u64 reset_vector)
+{
+	pgd_t *pgd;
+	struct x86_mapping_info info = {
+		.alloc_pgt_page = alloc_pgt_page,
+		.page_flag      = __PAGE_KERNEL_LARGE_EXEC,
+		.kernpg_flag    = _KERNPG_TABLE_NOENC,
+	};
+
+	pgd = alloc_pgt_page(NULL);
+	if (!pgd)
+		return -ENOMEM;
+
+	for (int i = 0; i < nr_pfn_mapped; i++) {
+		unsigned long mstart, mend;
+
+		mstart = pfn_mapped[i].start << PAGE_SHIFT;
+		mend   = pfn_mapped[i].end << PAGE_SHIFT;
+		if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
+			free_pgd(pgd);
+			return -ENOMEM;
+		}
+	}
+
+	if (kernel_ident_mapping_init(&info, pgd,
+				      PAGE_ALIGN_DOWN(reset_vector),
+				      PAGE_ALIGN(reset_vector + 1))) {
+		free_pgd(pgd);
+		return -ENOMEM;
+	}
+
+	if (init_transition_pgtable(pgd)) {
+		free_pgd(pgd);
+		return -ENOMEM;
+	}
+
+	smp_ops.play_dead = acpi_mp_play_dead;
+	smp_ops.crash_play_dead = crash_acpi_mp_play_dead;
+	smp_ops.cpu_die = acpi_mp_cpu_die;
+	smp_ops.stop_other_cpus = acpi_mp_stop_other_cpus;
+
+	acpi_mp_reset_vector_paddr = reset_vector;
+	acpi_mp_pgd = __pa(pgd);
+
+	return 0;
+}
+
 static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
 {
 	if (!acpi_mp_wake_mailbox_paddr) {
@@ -68,37 +299,63 @@  static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
 	return 0;
 }
 
+static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
+{
+	cpu_hotplug_disable_offlining();
+
+	/*
+	 * Zero out mailbox address in the ACPI MADT wakeup structure
+	 * to indicate that the mailbox is not usable.  This prevents
+	 * the kexec()-ed kernel from reading a valid mailbox, which in
+	 * turn makes the kexec()-ed kernel only be able to use the boot
+	 * CPU.
+	 *
+	 * This is a Linux-specific protocol and is not reflected in the ACPI spec.
+	 *
+	 * acpi_mp_wake_mailbox_paddr already has the mailbox address.
+	 * The acpi_wakeup_cpu() will use it to bring up secondary CPUs for
+	 * the current kernel.
+	 */
+	mp_wake->mailbox_address = 0;
+}
+
 int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
 			      const unsigned long end)
 {
 	struct acpi_madt_multiproc_wakeup *mp_wake;
 
 	mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
-	if (BAD_MADT_ENTRY(mp_wake, end))
+
+	/*
+	 * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
+	 * entry.  'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger
+	 * than the actual size of the MP wakeup entry in the ACPI table because
+	 * the 'reset_vector' is only available in the V1 MP wakeup structure.
+	 */
+	if (!mp_wake)
+		return -EINVAL;
+	if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
+		return -EINVAL;
+	if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
 		return -EINVAL;
 
 	acpi_table_print_madt_entry(&header->common);
 
 	acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;
 
-	cpu_hotplug_disable_offlining();
-
-	/*
-	 * ACPI MADT doesn't allow to offline CPU after it got woke up.
-	 * It limits kexec: the second kernel won't be able to use more than
-	 * one CPU.
-	 *
-	 * Now acpi_mp_wake_mailbox_paddr already has the mailbox address.
-	 * The acpi_wakeup_cpu() will use it to bring up secondary cpus.
-	 *
-	 * Zero out mailbox address in the ACPI MADT wakeup structure to
-	 * indicate that the mailbox is not usable.  This prevents the
-	 * kexec()-ed kernel from reading a vaild mailbox, which in turn
-	 * makes the kexec()-ed kernel only be able to use the boot CPU.
-	 *
-	 * This is Linux-specific protocol and not reflected in ACPI spec.
-	 */
-	mp_wake->mailbox_address = 0;
+	if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
+	    mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
+		if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
+			pr_warn("Failed to setup MADT reset vector\n");
+			acpi_mp_disable_offlining(mp_wake);
+		}
+	} else {
+		/*
+		 * CPU offlining requires version 1 of the ACPI MADT wakeup
+		 * structure.
+		 */
+		acpi_mp_disable_offlining(mp_wake);
+	}
 
 	apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
 
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c81afffaa954..99e6ab552da0 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -878,10 +878,14 @@  static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
 	cpu_emergency_disable_virtualization();
 
 	atomic_dec(&waiting_for_crash_ipi);
-	/* Assume hlt works */
-	halt();
-	for (;;)
-		cpu_relax();
+
+	if (smp_ops.crash_play_dead) {
+		smp_ops.crash_play_dead();
+	} else {
+		halt();
+		for (;;)
+			cpu_relax();
+	}
 
 	return NMI_HANDLED;
 }
diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index 23b4cfb640fc..8348bf46a648 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -1112,8 +1112,20 @@  struct acpi_madt_multiproc_wakeup {
 	u16 version;
 	u32 reserved;		/* reserved - must be zero */
 	u64 mailbox_address;
+	u64 reset_vector;
 };
 
+/* Values for Version field above */
+
+enum acpi_madt_multiproc_wakeup_version {
+	ACPI_MADT_MP_WAKEUP_VERSION_NONE = 0,
+	ACPI_MADT_MP_WAKEUP_VERSION_V1 = 1,
+	ACPI_MADT_MP_WAKEUP_VERSION_RESERVED = 2, /* 2 and greater are reserved */
+};
+
+#define ACPI_MADT_MP_WAKEUP_SIZE_V0	16
+#define ACPI_MADT_MP_WAKEUP_SIZE_V1	24
+
 #define ACPI_MULTIPROC_WAKEUP_MB_OS_SIZE        2032
 #define ACPI_MULTIPROC_WAKEUP_MB_FIRMWARE_SIZE  2048
 
@@ -1126,7 +1138,8 @@  struct acpi_madt_multiproc_wakeup_mailbox {
 	u8 reserved_firmware[ACPI_MULTIPROC_WAKEUP_MB_FIRMWARE_SIZE];	/* reserved for firmware use */
 };
 
-#define ACPI_MP_WAKE_COMMAND_WAKEUP    1
+#define ACPI_MP_WAKE_COMMAND_WAKEUP	1
+#define ACPI_MP_WAKE_COMMAND_TEST	2
 
 /* 17: CPU Core Interrupt Controller (ACPI 6.5) */