[tip:,x86/fred] x86/ptrace: Cleanup the definition of the pt_regs structure

Message ID 170673569232.398.15041548048531772130.tip-bot2@tip-bot2
State New
Headers
Series [tip:,x86/fred] x86/ptrace: Cleanup the definition of the pt_regs structure |

Commit Message

tip-bot2 for Thomas Gleixner Jan. 31, 2024, 9:14 p.m. UTC
  The following commit has been merged into the x86/fred branch of tip:

Commit-ID:     ee63291aa8287cb7ded767d340155fe8681fc075
Gitweb:        https://git.kernel.org/tip/ee63291aa8287cb7ded767d340155fe8681fc075
Author:        Xin Li <xin3.li@intel.com>
AuthorDate:    Tue, 05 Dec 2023 02:50:02 -08:00
Committer:     Borislav Petkov (AMD) <bp@alien8.de>
CommitterDate: Wed, 31 Jan 2024 22:01:13 +01:00

x86/ptrace: Cleanup the definition of the pt_regs structure

struct pt_regs is hard to read because the member- or section-related
comments are not aligned with the members.

The 'cs' and 'ss' members of pt_regs are of type 'unsigned long' while
in reality they are only 16 bits wide. This has worked so far because the
remaining space is unused, but FRED will use the remaining bits for
other purposes.

To prepare for FRED:

  - Cleanup the formatting
  - Convert 'cs' and 'ss' to u16 and embed them into a union
    with a u64
  - Fixup the related printk() format strings

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Originally-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-14-xin3.li@intel.com
---
 arch/x86/entry/vsyscall/vsyscall_64.c |  2 +-
 arch/x86/include/asm/ptrace.h         | 48 ++++++++++++++++++--------
 arch/x86/kernel/process_64.c          |  2 +-
 3 files changed, 37 insertions(+), 15 deletions(-)
  

Comments

H. Peter Anvin Feb. 3, 2024, 11:52 p.m. UTC | #1
On January 31, 2024 1:14:52 PM PST, tip-bot2 for Xin Li <tip-bot2@linutronix.de> wrote:
>The following commit has been merged into the x86/fred branch of tip:
>
>Commit-ID:     ee63291aa8287cb7ded767d340155fe8681fc075
>Gitweb:        https://git.kernel.org/tip/ee63291aa8287cb7ded767d340155fe8681fc075
>Author:        Xin Li <xin3.li@intel.com>
>AuthorDate:    Tue, 05 Dec 2023 02:50:02 -08:00
>Committer:     Borislav Petkov (AMD) <bp@alien8.de>
>CommitterDate: Wed, 31 Jan 2024 22:01:13 +01:00
>
>x86/ptrace: Cleanup the definition of the pt_regs structure
>
>struct pt_regs is hard to read because the member or section related
>comments are not aligned with the members.
>
>The 'cs' and 'ss' members of pt_regs are type of 'unsigned long' while
>in reality they are only 16-bit wide. This works so far as the
>remaining space is unused, but FRED will use the remaining bits for
>other purposes.
>
>To prepare for FRED:
>
>  - Cleanup the formatting
>  - Convert 'cs' and 'ss' to u16 and embed them into an union
>    with a u64
>  - Fixup the related printk() format strings
>
>Suggested-by: Thomas Gleixner <tglx@linutronix.de>
>Originally-by: H. Peter Anvin (Intel) <hpa@zytor.com>
>Signed-off-by: Xin Li <xin3.li@intel.com>
>Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
>Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
>Tested-by: Shan Kang <shan.kang@intel.com>
>Link: https://lore.kernel.org/r/20231205105030.8698-14-xin3.li@intel.com
>---
> arch/x86/entry/vsyscall/vsyscall_64.c |  2 +-
> arch/x86/include/asm/ptrace.h         | 48 ++++++++++++++++++--------
> arch/x86/kernel/process_64.c          |  2 +-
> 3 files changed, 37 insertions(+), 15 deletions(-)
>
>diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
>index e0ca812..a3c0df1 100644
>--- a/arch/x86/entry/vsyscall/vsyscall_64.c
>+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
>@@ -76,7 +76,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
> 	if (!show_unhandled_signals)
> 		return;
> 
>-	printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
>+	printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n",
> 			   level, current->comm, task_pid_nr(current),
> 			   message, regs->ip, regs->cs,
> 			   regs->sp, regs->ax, regs->si, regs->di);
>diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
>index f4db78b..b268cd2 100644
>--- a/arch/x86/include/asm/ptrace.h
>+++ b/arch/x86/include/asm/ptrace.h
>@@ -57,17 +57,19 @@ struct pt_regs {
> #else /* __i386__ */
> 
> struct pt_regs {
>-/*
>- * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
>- * unless syscall needs a complete, fully filled "struct pt_regs".
>- */
>+	/*
>+	 * C ABI says these regs are callee-preserved. They aren't saved on
>+	 * kernel entry unless syscall needs a complete, fully filled
>+	 * "struct pt_regs".
>+	 */
> 	unsigned long r15;
> 	unsigned long r14;
> 	unsigned long r13;
> 	unsigned long r12;
> 	unsigned long bp;
> 	unsigned long bx;
>-/* These regs are callee-clobbered. Always saved on kernel entry. */
>+
>+	/* These regs are callee-clobbered. Always saved on kernel entry. */
> 	unsigned long r11;
> 	unsigned long r10;
> 	unsigned long r9;
>@@ -77,18 +79,38 @@ struct pt_regs {
> 	unsigned long dx;
> 	unsigned long si;
> 	unsigned long di;
>-/*
>- * On syscall entry, this is syscall#. On CPU exception, this is error code.
>- * On hw interrupt, it's IRQ number:
>- */
>+
>+	/*
>+	 * orig_ax is used on entry for:
>+	 * - the syscall number (syscall, sysenter, int80)
>+	 * - error_code stored by the CPU on traps and exceptions
>+	 * - the interrupt number for device interrupts
>+	 */
> 	unsigned long orig_ax;
>-/* Return frame for iretq */
>+
>+	/* The IRETQ return frame starts here */
> 	unsigned long ip;
>-	unsigned long cs;
>+
>+	union {
>+		/* The full 64-bit data slot containing CS */
>+		u64		csx;
>+		/* CS selector */
>+		u16		cs;
>+	};
>+
> 	unsigned long flags;
> 	unsigned long sp;
>-	unsigned long ss;
>-/* top of stack page */
>+
>+	union {
>+		/* The full 64-bit data slot containing SS */
>+		u64		ssx;
>+		/* SS selector */
>+		u16		ss;
>+	};
>+
>+	/*
>+	 * Top of stack on IDT systems.
>+	 */
> };
> 
> #endif /* !__i386__ */
>diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
>index 33b2687..0f78b58 100644
>--- a/arch/x86/kernel/process_64.c
>+++ b/arch/x86/kernel/process_64.c
>@@ -117,7 +117,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
> 
> 	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
> 	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
>-	printk("%sCS:  %04lx DS: %04x ES: %04x CR0: %016lx\n",
>+	printk("%sCS:  %04x DS: %04x ES: %04x CR0: %016lx\n",
> 		log_lvl, regs->cs, ds, es, cr0);
> 	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
> 		log_lvl, cr2, cr3, cr4);

Incidentally, the comment about callee-saved registers has long since been obsolete and is now outright wrong.

The next version of gcc (14 I think) will have an attribute to turn off saving registers which we can use for top-level C functions.
  
Xin Li (Intel) Feb. 6, 2024, 7:04 p.m. UTC | #2
On 2/3/2024 3:52 PM, H. Peter Anvin wrote:
> On January 31, 2024 1:14:52 PM PST, tip-bot2 for Xin Li <tip-bot2@linutronix.de> wrote:
>> The following commit has been merged into the x86/fred branch of tip:
>>
>> Commit-ID:     ee63291aa8287cb7ded767d340155fe8681fc075
>> Gitweb:        https://git.kernel.org/tip/ee63291aa8287cb7ded767d340155fe8681fc075
>> Author:        Xin Li <xin3.li@intel.com>
>> AuthorDate:    Tue, 05 Dec 2023 02:50:02 -08:00
>> Committer:     Borislav Petkov (AMD) <bp@alien8.de>
>> CommitterDate: Wed, 31 Jan 2024 22:01:13 +01:00
>>
>> x86/ptrace: Cleanup the definition of the pt_regs structure
>>
>> struct pt_regs is hard to read because the member or section related
>> comments are not aligned with the members.
>>
>> The 'cs' and 'ss' members of pt_regs are type of 'unsigned long' while
>> in reality they are only 16-bit wide. This works so far as the
>> remaining space is unused, but FRED will use the remaining bits for
>> other purposes.
>>
>> To prepare for FRED:
>>
>>   - Cleanup the formatting
>>   - Convert 'cs' and 'ss' to u16 and embed them into an union
>>     with a u64
>>   - Fixup the related printk() format strings
>>
>> Suggested-by: Thomas Gleixner <tglx@linutronix.de>
>> Originally-by: H. Peter Anvin (Intel) <hpa@zytor.com>
>> Signed-off-by: Xin Li <xin3.li@intel.com>
>> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
>> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
>> Tested-by: Shan Kang <shan.kang@intel.com>
>> Link: https://lore.kernel.org/r/20231205105030.8698-14-xin3.li@intel.com

[...]

>> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
>> index 33b2687..0f78b58 100644
>> --- a/arch/x86/kernel/process_64.c
>> +++ b/arch/x86/kernel/process_64.c
>> @@ -117,7 +117,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
>>
>> 	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
>> 	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
>> -	printk("%sCS:  %04lx DS: %04x ES: %04x CR0: %016lx\n",
>> +	printk("%sCS:  %04x DS: %04x ES: %04x CR0: %016lx\n",
>> 		log_lvl, regs->cs, ds, es, cr0);
>> 	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
>> 		log_lvl, cr2, cr3, cr4);
> 
> Incidentally, the comment about callee-saved registers is long since both obsolete and is now outright wrong.
> 
> The next version of gcc (14 I think) will have an attribute to turn off saving registers which we can use for top-level C functions.
> 

Forgive my ignorance, do we have an official definition for "top-level C 
functions"?

Thanks!
     Xin
  
H. Peter Anvin Feb. 6, 2024, 8:45 p.m. UTC | #3
On February 6, 2024 11:04:13 AM PST, Xin Li <xin@zytor.com> wrote:
>On 2/3/2024 3:52 PM, H. Peter Anvin wrote:
>> On January 31, 2024 1:14:52 PM PST, tip-bot2 for Xin Li <tip-bot2@linutronix.de> wrote:
>>> The following commit has been merged into the x86/fred branch of tip:
>>> 
>>> Commit-ID:     ee63291aa8287cb7ded767d340155fe8681fc075
>>> Gitweb:        https://git.kernel.org/tip/ee63291aa8287cb7ded767d340155fe8681fc075
>>> Author:        Xin Li <xin3.li@intel.com>
>>> AuthorDate:    Tue, 05 Dec 2023 02:50:02 -08:00
>>> Committer:     Borislav Petkov (AMD) <bp@alien8.de>
>>> CommitterDate: Wed, 31 Jan 2024 22:01:13 +01:00
>>> 
>>> x86/ptrace: Cleanup the definition of the pt_regs structure
>>> 
>>> struct pt_regs is hard to read because the member or section related
>>> comments are not aligned with the members.
>>> 
>>> The 'cs' and 'ss' members of pt_regs are type of 'unsigned long' while
>>> in reality they are only 16-bit wide. This works so far as the
>>> remaining space is unused, but FRED will use the remaining bits for
>>> other purposes.
>>> 
>>> To prepare for FRED:
>>> 
>>>   - Cleanup the formatting
>>>   - Convert 'cs' and 'ss' to u16 and embed them into an union
>>>     with a u64
>>>   - Fixup the related printk() format strings
>>> 
>>> Suggested-by: Thomas Gleixner <tglx@linutronix.de>
>>> Originally-by: H. Peter Anvin (Intel) <hpa@zytor.com>
>>> Signed-off-by: Xin Li <xin3.li@intel.com>
>>> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
>>> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
>>> Tested-by: Shan Kang <shan.kang@intel.com>
>>> Link: https://lore.kernel.org/r/20231205105030.8698-14-xin3.li@intel.com
>
>[...]
>
>>> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
>>> index 33b2687..0f78b58 100644
>>> --- a/arch/x86/kernel/process_64.c
>>> +++ b/arch/x86/kernel/process_64.c
>>> @@ -117,7 +117,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
>>> 
>>> 	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
>>> 	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
>>> -	printk("%sCS:  %04lx DS: %04x ES: %04x CR0: %016lx\n",
>>> +	printk("%sCS:  %04x DS: %04x ES: %04x CR0: %016lx\n",
>>> 		log_lvl, regs->cs, ds, es, cr0);
>>> 	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
>>> 		log_lvl, cr2, cr3, cr4);
>> 
>> Incidentally, the comment about callee-saved registers is long since both obsolete and is now outright wrong.
>> 
>> The next version of gcc (14 I think) will have an attribute to turn off saving registers which we can use for top-level C functions.
>> 
>
>Forgive my ignorance, do we have an official definition for "top-level C functions"?
>
>Thanks!
>    Xin
>

(Adding H.J., who did the gcc implementation of __attribute__((no_callee_saved_registers))).

The top-level C functions are the ones whose stack frames are immediately below the exception/syscall frame, i.e. the C function called from the entry assembly code and functions tailcalled from those (unless they set up a stack frame for things like memory structures passed to the called function.)

Note that the implementation should properly handle the case when calling these functions from C (accidentally, or because it is a rare case that can be validly pessimized.)
  
H.J. Lu Feb. 6, 2024, 9:10 p.m. UTC | #4
On Tue, Feb 6, 2024 at 12:45 PM H. Peter Anvin <hpa@zytor.com> wrote:
>
> On February 6, 2024 11:04:13 AM PST, Xin Li <xin@zytor.com> wrote:
> >On 2/3/2024 3:52 PM, H. Peter Anvin wrote:
> >> On January 31, 2024 1:14:52 PM PST, tip-bot2 for Xin Li <tip-bot2@linutronix.de> wrote:
> >>> The following commit has been merged into the x86/fred branch of tip:
> >>>
> >>> Commit-ID:     ee63291aa8287cb7ded767d340155fe8681fc075
> >>> Gitweb:        https://git.kernel.org/tip/ee63291aa8287cb7ded767d340155fe8681fc075
> >>> Author:        Xin Li <xin3.li@intel.com>
> >>> AuthorDate:    Tue, 05 Dec 2023 02:50:02 -08:00
> >>> Committer:     Borislav Petkov (AMD) <bp@alien8.de>
> >>> CommitterDate: Wed, 31 Jan 2024 22:01:13 +01:00
> >>>
> >>> x86/ptrace: Cleanup the definition of the pt_regs structure
> >>>
> >>> struct pt_regs is hard to read because the member or section related
> >>> comments are not aligned with the members.
> >>>
> >>> The 'cs' and 'ss' members of pt_regs are type of 'unsigned long' while
> >>> in reality they are only 16-bit wide. This works so far as the
> >>> remaining space is unused, but FRED will use the remaining bits for
> >>> other purposes.
> >>>
> >>> To prepare for FRED:
> >>>
> >>>   - Cleanup the formatting
> >>>   - Convert 'cs' and 'ss' to u16 and embed them into an union
> >>>     with a u64
> >>>   - Fixup the related printk() format strings
> >>>
> >>> Suggested-by: Thomas Gleixner <tglx@linutronix.de>
> >>> Originally-by: H. Peter Anvin (Intel) <hpa@zytor.com>
> >>> Signed-off-by: Xin Li <xin3.li@intel.com>
> >>> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> >>> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
> >>> Tested-by: Shan Kang <shan.kang@intel.com>
> >>> Link: https://lore.kernel.org/r/20231205105030.8698-14-xin3.li@intel.com
> >
> >[...]
> >
> >>> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> >>> index 33b2687..0f78b58 100644
> >>> --- a/arch/x86/kernel/process_64.c
> >>> +++ b/arch/x86/kernel/process_64.c
> >>> @@ -117,7 +117,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
> >>>
> >>>     printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
> >>>            log_lvl, fs, fsindex, gs, gsindex, shadowgs);
> >>> -   printk("%sCS:  %04lx DS: %04x ES: %04x CR0: %016lx\n",
> >>> +   printk("%sCS:  %04x DS: %04x ES: %04x CR0: %016lx\n",
> >>>             log_lvl, regs->cs, ds, es, cr0);
> >>>     printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
> >>>             log_lvl, cr2, cr3, cr4);
> >>
> >> Incidentally, the comment about callee-saved registers is long since both obsolete and is now outright wrong.
> >>
> >> The next version of gcc (14 I think) will have an attribute to turn off saving registers which we can use for top-level C functions.

__attribute__((no_callee_saved_registers)) has been added to GCC 14.

> >
> >Forgive my ignorance, do we have an official definition for "top-level C functions"?
> >
> >Thanks!
> >    Xin
> >
>
> (Adding H.J., who did the gcc implementation of __attribute__((no_callee_saved_registers))).
>
> The top level C functions are the ones whose stack frame are immediately below the exception/syscall frame, i.e. the C function called from the entry assembly code and functions tailcalled from those (unless they set up a stack frame for things like memory structures passed to the called function.)
>
> Note that the implementation should properly handle the case when calling these functions from C (accidentally, or because it is a rare case that can be validly pessimized.)

GCC 14 should handle it properly.  If not, please open a GCC bug.
  

Patch

diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index e0ca812..a3c0df1 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -76,7 +76,7 @@  static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
 	if (!show_unhandled_signals)
 		return;
 
-	printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+	printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n",
 			   level, current->comm, task_pid_nr(current),
 			   message, regs->ip, regs->cs,
 			   regs->sp, regs->ax, regs->si, regs->di);
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index f4db78b..b268cd2 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -57,17 +57,19 @@  struct pt_regs {
 #else /* __i386__ */
 
 struct pt_regs {
-/*
- * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
- * unless syscall needs a complete, fully filled "struct pt_regs".
- */
+	/*
+	 * C ABI says these regs are callee-preserved. They aren't saved on
+	 * kernel entry unless syscall needs a complete, fully filled
+	 * "struct pt_regs".
+	 */
 	unsigned long r15;
 	unsigned long r14;
 	unsigned long r13;
 	unsigned long r12;
 	unsigned long bp;
 	unsigned long bx;
-/* These regs are callee-clobbered. Always saved on kernel entry. */
+
+	/* These regs are callee-clobbered. Always saved on kernel entry. */
 	unsigned long r11;
 	unsigned long r10;
 	unsigned long r9;
@@ -77,18 +79,38 @@  struct pt_regs {
 	unsigned long dx;
 	unsigned long si;
 	unsigned long di;
-/*
- * On syscall entry, this is syscall#. On CPU exception, this is error code.
- * On hw interrupt, it's IRQ number:
- */
+
+	/*
+	 * orig_ax is used on entry for:
+	 * - the syscall number (syscall, sysenter, int80)
+	 * - error_code stored by the CPU on traps and exceptions
+	 * - the interrupt number for device interrupts
+	 */
 	unsigned long orig_ax;
-/* Return frame for iretq */
+
+	/* The IRETQ return frame starts here */
 	unsigned long ip;
-	unsigned long cs;
+
+	union {
+		/* The full 64-bit data slot containing CS */
+		u64		csx;
+		/* CS selector */
+		u16		cs;
+	};
+
 	unsigned long flags;
 	unsigned long sp;
-	unsigned long ss;
-/* top of stack page */
+
+	union {
+		/* The full 64-bit data slot containing SS */
+		u64		ssx;
+		/* SS selector */
+		u16		ss;
+	};
+
+	/*
+	 * Top of stack on IDT systems.
+	 */
 };
 
 #endif /* !__i386__ */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 33b2687..0f78b58 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -117,7 +117,7 @@  void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
 
 	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
 	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
-	printk("%sCS:  %04lx DS: %04x ES: %04x CR0: %016lx\n",
+	printk("%sCS:  %04x DS: %04x ES: %04x CR0: %016lx\n",
 		log_lvl, regs->cs, ds, es, cr0);
 	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
 		log_lvl, cr2, cr3, cr4);