[v4,2/5] LoongArch: Use la.pcrel instead of la.abs for exception handlers

Message ID 1676018856-26520-3-git-send-email-tangyouling@loongson.cn
State New
Series LoongArch: Add kernel relocation and KASLR support

Commit Message

Youling Tang Feb. 10, 2023, 8:47 a.m. UTC
  From: Xi Ruoyao <xry111@xry111.site>

This change is needed to build the kernel as a PIE; otherwise the linker
will complain.

For performance reasons, we copy the exception handlers to a dedicated
64 KB area for each CPU.  The PC-relative offsets calculated at link time
therefore become incorrect, and we need to relocate the exception
handlers after copying them.

For simplicity, we don't use the ELF R_LARCH_* relocations; instead, we
encode each relocation entry simply as (offset_in_the_handler,
symbol_addr).  For each exception handler we also record the number of
relocation entries.  This relocation information is then used to fix up
the handlers after copying them.

Signed-off-by: Xi Ruoyao <xry111@xry111.site>
---
 arch/loongarch/include/asm/inst.h       |   1 +
 arch/loongarch/include/asm/setup.h      |   6 +-
 arch/loongarch/include/asm/stackframe.h |   3 +-
 arch/loongarch/kernel/genex.S           |  40 +++++-
 arch/loongarch/kernel/traps.c           | 158 ++++++++++++++++++++----
 arch/loongarch/mm/tlb.c                 |  23 ++--
 arch/loongarch/mm/tlbex.S               |  69 +++++++++--
 7 files changed, 255 insertions(+), 45 deletions(-)
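
For readers unfamiliar with the la.pcrel expansion used above, the sketch
below (not part of the patch, with made-up addresses) illustrates how the
hi20/lo12 immediates of a pcalau12i/addi.d pair have to be recomputed
against the new PC once a handler has been copied; it mirrors the
offset_hi20 computation in reloc_handler() in the diff below.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustration only: recompute the pcalau12i/addi.d immediates for a
 * symbol address `sym` when the instruction pair now executes at `pc`.
 * Both addresses are made up.
 */
int main(void)
{
	uint64_t pc  = 0x9000000000010000ULL;	/* handler after memcpy()   */
	uint64_t sym = 0x9000000001234567ULL;	/* e.g. address of kernelsp */

	/*
	 * Round sym to the nearest 4 KB boundary so that the sign-extended
	 * lo12 added by addi.d lands exactly on sym (the same trick as the
	 * "+ 0x800" in reloc_handler()).
	 */
	int64_t  hi20 = (int64_t)(((sym + 0x800) & ~0xfffULL) - (pc & ~0xfffULL));
	uint32_t lo12 = (uint32_t)(sym & 0xfff);

	printf("pcalau12i si20 = 0x%05llx\n",
	       (unsigned long long)((hi20 >> 12) & 0xfffff));
	printf("addi.d    si12 = 0x%03x\n", lo12);
	return 0;
}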
  

Comments

Huacai Chen Feb. 10, 2023, 9:09 a.m. UTC | #1
Hi, Youling and Ruoyao,

Thank you very much for implementing the per-node exceptions. But I
want to know whether the per-node solution is really worthwhile for a
PIE kernel. So, could you please test the performance? Maybe we can
reduce the complexity if we give up the per-node solution.

Huacai

On Fri, Feb 10, 2023 at 4:47 PM Youling Tang <tangyouling@loongson.cn> wrote:
>
> From: Xi Ruoyao <xry111@xry111.site>
>
> It's needed to build the kernel as a PIE, or the linker will complain.
>
> For the consideration about performance, we copy the exception handlers
> to a dedicated 64 KB area for each CPU.  So, the PC-relative offset
> calculated at link time will be incorrect and we need to relocate the
> exception handlers after copying them.
>
> For the simplicity, we don't use the ELF R_LARCH_* relocations, but code
> an relocation entry as simply (offset_in_the_handler, symbol_addr).  For
> each exception handler, we also code the number of relocation entries.
> Then we can use the relocation information to fix up the handlers after
> copying them.
>
> Signed-off-by: Xi Ruoyao <xry111@xry111.site>
> ---
>  arch/loongarch/include/asm/inst.h       |   1 +
>  arch/loongarch/include/asm/setup.h      |   6 +-
>  arch/loongarch/include/asm/stackframe.h |   3 +-
>  arch/loongarch/kernel/genex.S           |  40 +++++-
>  arch/loongarch/kernel/traps.c           | 158 ++++++++++++++++++++----
>  arch/loongarch/mm/tlb.c                 |  23 ++--
>  arch/loongarch/mm/tlbex.S               |  69 +++++++++--
>  7 files changed, 255 insertions(+), 45 deletions(-)
>
> diff --git a/arch/loongarch/include/asm/inst.h b/arch/loongarch/include/asm/inst.h
> index 7eedd83fd0d7..426054518a3d 100644
> --- a/arch/loongarch/include/asm/inst.h
> +++ b/arch/loongarch/include/asm/inst.h
> @@ -32,6 +32,7 @@ enum reg1i20_op {
>         lu12iw_op       = 0x0a,
>         lu32id_op       = 0x0b,
>         pcaddi_op       = 0x0c,
> +       pcalau12i_op    = 0x0d,
>         pcaddu12i_op    = 0x0e,
>         pcaddu18i_op    = 0x0f,
>  };
> diff --git a/arch/loongarch/include/asm/setup.h b/arch/loongarch/include/asm/setup.h
> index 72ead58039f3..f0a2b34365f1 100644
> --- a/arch/loongarch/include/asm/setup.h
> +++ b/arch/loongarch/include/asm/setup.h
> @@ -11,6 +11,9 @@
>
>  #define VECSIZE 0x200
>
> +struct handler_reloc;
> +
> +extern struct handler_reloc *eentry_reloc[];
>  extern unsigned long eentry;
>  extern unsigned long tlbrentry;
>  extern char init_command_line[COMMAND_LINE_SIZE];
> @@ -18,7 +21,8 @@ extern void tlb_init(int cpu);
>  extern void cpu_cache_init(void);
>  extern void cache_error_setup(void);
>  extern void per_cpu_trap_init(int cpu);
> -extern void set_handler(unsigned long offset, void *addr, unsigned long len);
> +extern void set_handler(unsigned long exccode, void *addr);
>  extern void set_merr_handler(unsigned long offset, void *addr, unsigned long len);
> +extern void reloc_handler(unsigned long handler, struct handler_reloc *rel);
>
>  #endif /* __SETUP_H */
> diff --git a/arch/loongarch/include/asm/stackframe.h b/arch/loongarch/include/asm/stackframe.h
> index 7deb043ce387..bbec1e56b61b 100644
> --- a/arch/loongarch/include/asm/stackframe.h
> +++ b/arch/loongarch/include/asm/stackframe.h
> @@ -77,7 +77,8 @@
>   * new value in sp.
>   */
>         .macro  get_saved_sp docfi=0
> -       la.abs    t1, kernelsp
> +       /* The label is used for generating reloc tables for handlers */
> +514:   la.pcrel  t1, t0, kernelsp
>  #ifdef CONFIG_SMP
>         csrrd     t0, PERCPU_BASE_KS
>         LONG_ADD  t1, t1, t0
> diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S
> index 7e5c293ed89f..005a10fe5a50 100644
> --- a/arch/loongarch/kernel/genex.S
> +++ b/arch/loongarch/kernel/genex.S
> @@ -34,7 +34,7 @@ SYM_FUNC_END(__arch_cpu_idle)
>  SYM_FUNC_START(handle_vint)
>         BACKUP_T0T1
>         SAVE_ALL
> -       la.abs  t1, __arch_cpu_idle
> +0:     la.pcrel t1, t2, __arch_cpu_idle
>         LONG_L  t0, sp, PT_ERA
>         /* 32 byte rollback region */
>         ori     t0, t0, 0x1f
> @@ -43,11 +43,25 @@ SYM_FUNC_START(handle_vint)
>         LONG_S  t0, sp, PT_ERA
>  1:     move    a0, sp
>         move    a1, sp
> -       la.abs  t0, do_vint
> +2:     la.pcrel t0, t2, do_vint
>         jirl    ra, t0, 0
>         RESTORE_ALL_AND_RET
>  SYM_FUNC_END(handle_vint)
>
> +SYM_DATA_START(rel_handle_vint)
> +LONG   3
> +
> +LONG   514b - handle_vint
> +LONG   kernelsp
> +
> +LONG   0b - handle_vint
> +LONG   __arch_cpu_idle
> +
> +LONG   2b - handle_vint
> +LONG   do_vint
> +
> +SYM_DATA_END(rel_handle_vint)
> +
>  SYM_FUNC_START(except_vec_cex)
>         b       cache_parity_error
>  SYM_FUNC_END(except_vec_cex)
> @@ -72,12 +86,24 @@ SYM_FUNC_END(except_vec_cex)
>         SAVE_ALL
>         build_prep_\prep
>         move    a0, sp
> -       la.abs  t0, do_\handler
> +       667:
> +       la.pcrel t0, t1, do_\handler
>         jirl    ra, t0, 0
>         668:
>         RESTORE_ALL_AND_RET
>         SYM_FUNC_END(handle_\exception)
>         SYM_DATA(unwind_hint_\exception, .word 668b - 666b)
> +
> +       SYM_DATA_START(rel_handle_\exception)
> +       LONG    2
> +
> +       LONG    514b - 666b
> +       LONG    kernelsp
> +
> +       LONG    667b - 666b
> +       LONG    do_\handler
> +
> +       SYM_DATA_END(rel_handle_\exception)
>         .endm
>
>         BUILD_HANDLER ade ade badv
> @@ -93,6 +119,12 @@ SYM_FUNC_END(except_vec_cex)
>         BUILD_HANDLER reserved reserved none    /* others */
>
>  SYM_FUNC_START(handle_sys)
> -       la.abs  t0, handle_syscall
> +       la.pcrel t0, t1, handle_syscall
>         jr      t0
>  SYM_FUNC_END(handle_sys)
> +
> +SYM_DATA_START(rel_handle_sys)
> +LONG   1
> +LONG   0
> +LONG   handle_syscall
> +SYM_DATA_END(rel_handle_sys)
> diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c
> index c38a146a973b..7e073854f493 100644
> --- a/arch/loongarch/kernel/traps.c
> +++ b/arch/loongarch/kernel/traps.c
> @@ -62,6 +62,127 @@ extern asmlinkage void handle_reserved(void);
>  extern asmlinkage void handle_watch(void);
>  extern asmlinkage void handle_vint(void);
>
> +struct handler_reloc_entry {
> +       unsigned long offset;
> +       unsigned long sym;
> +};
> +
> +struct handler_reloc {
> +       unsigned long cnt;
> +       struct handler_reloc_entry entries[];
> +};
> +
> +extern struct handler_reloc rel_handle_tlb_load;
> +extern struct handler_reloc rel_handle_tlb_store;
> +extern struct handler_reloc rel_handle_tlb_modify;
> +extern struct handler_reloc rel_handle_tlb_protect;
> +extern struct handler_reloc rel_handle_ade;
> +extern struct handler_reloc rel_handle_ale;
> +extern struct handler_reloc rel_handle_sys;
> +extern struct handler_reloc rel_handle_bp;
> +extern struct handler_reloc rel_handle_ri;
> +extern struct handler_reloc rel_handle_fpu;
> +extern struct handler_reloc rel_handle_lsx;
> +extern struct handler_reloc rel_handle_lasx;
> +extern struct handler_reloc rel_handle_fpe;
> +extern struct handler_reloc rel_handle_lbt;
> +extern struct handler_reloc rel_handle_watch;
> +extern struct handler_reloc rel_handle_reserved;
> +extern struct handler_reloc rel_handle_vint;
> +
> +struct handler_reloc *eentry_reloc[128] = {
> +       [0] = NULL, /* merr handler */
> +       [EXCCODE_TLBL] = &rel_handle_tlb_load,
> +       [EXCCODE_TLBS] = &rel_handle_tlb_store,
> +       [EXCCODE_TLBI] = &rel_handle_tlb_load,
> +       [EXCCODE_TLBM] = &rel_handle_tlb_modify,
> +       [EXCCODE_TLBNR] = &rel_handle_tlb_protect,
> +       [EXCCODE_TLBNX] = &rel_handle_tlb_protect,
> +       [EXCCODE_TLBPE] = &rel_handle_tlb_protect,
> +       [EXCCODE_ADE] = &rel_handle_ade,
> +       [EXCCODE_ALE] = &rel_handle_ale,
> +       [EXCCODE_SYS] = &rel_handle_sys,
> +       [EXCCODE_BP] = &rel_handle_bp,
> +       [EXCCODE_INE] = &rel_handle_ri,
> +       [EXCCODE_IPE] = &rel_handle_ri,
> +       [EXCCODE_FPDIS] = &rel_handle_fpu,
> +       [EXCCODE_LSXDIS] = &rel_handle_lsx,
> +       [EXCCODE_LASXDIS] = &rel_handle_lasx,
> +       [EXCCODE_FPE] = &rel_handle_fpe,
> +       [EXCCODE_BTDIS] = &rel_handle_lbt,
> +       [EXCCODE_WATCH] = &rel_handle_watch,
> +       [(EXCCODE_WATCH + 1) ... (EXCCODE_INT_START - 1)] = &rel_handle_reserved,
> +       [EXCCODE_INT_START ... (EXCCODE_INT_END - 1)] = &rel_handle_vint,
> +};
> +
> +void reloc_handler(unsigned long handler, struct handler_reloc *rel)
> +{
> +       if (!rel)
> +               return;
> +
> +       for (unsigned long i = 0; i < rel->cnt; i++) {
> +               unsigned long pc = handler + rel->entries[i].offset;
> +               union loongarch_instruction *insn =
> +                       (union loongarch_instruction *)pc;
> +               u32 imm[4];
> +               unsigned long v = rel->entries[i].sym;
> +
> +               /* GNU as >= 2.40 uses pcalau12i for la.pcrel, but GNU ld <= 2.39
> +                * uses pcaddu12i.
> +                */
> +               if (insn->reg1i20_format.opcode == pcalau12i_op) {
> +                       /* Use s32 deliberately for sign extension. */
> +                       s32 offset_hi20 = ((v + 0x800) & ~0xfff) -
> +                                         (pc & ~0xfff);
> +                       unsigned long anchor = (pc & ~0xfff) + offset_hi20;
> +                       unsigned long offset_rem = v - anchor;
> +
> +                       imm[0] = (offset_hi20 >> 12) & 0xfffff;
> +                       imm[1] = v & 0xfff;
> +                       imm[2] = (offset_rem >> 32) & 0xfffff;
> +                       imm[3] = offset_rem >> 52;
> +               } else if (insn->reg1i20_format.opcode == pcaddu12i_op) {
> +                       /* Use s32 deliberately for sign extension. */
> +                       s32 offset_lo = v - pc;
> +                       unsigned long offset_hi = v - pc - offset_lo;
> +
> +                       imm[0] = (offset_lo >> 12) & 0xfffff;
> +                       imm[1] = offset_lo & 0xfff;
> +                       imm[2] = (offset_hi >> 32) & 0xfffff;
> +                       imm[3] = offset_hi >> 52;
> +               } else
> +                       panic("Cannot fixup la.pcrel for exception handler at %lu: unexpected instruction %d!",
> +                             pc, insn->word);
> +
> +               insn[0].reg1i20_format.immediate = imm[0];
> +               insn[1].reg2i12_format.immediate = imm[1];
> +               insn[2].reg1i20_format.immediate = imm[2];
> +               insn[3].reg2i12_format.immediate = imm[3];
> +       }
> +}
> +
> +/* Install CPU exception handler */
> +static void do_set_handler(unsigned long exccode, void *addr,
> +                          struct handler_reloc *rel)
> +{
> +       unsigned long dest_addr = eentry + exccode * VECSIZE;
> +
> +       memcpy((void *)dest_addr, addr, VECSIZE);
> +       reloc_handler(dest_addr, rel);
> +       local_flush_icache_range(dest_addr, dest_addr + VECSIZE);
> +}
> +
> +/* Install CPU exception handler, with the reloc table from eentry_reloc */
> +void set_handler(unsigned long exccode, void *addr)
> +{
> +       do_set_handler(exccode, addr, eentry_reloc[exccode]);
> +}
> +
> +static void set_handler_reserved(unsigned long exccode)
> +{
> +       do_set_handler(exccode, handle_reserved, &rel_handle_reserved);
> +}
> +
>  static void show_backtrace(struct task_struct *task, const struct pt_regs *regs,
>                            const char *loglvl, bool user)
>  {
> @@ -704,19 +825,12 @@ void per_cpu_trap_init(int cpu)
>         /* Initialise exception handlers */
>         if (cpu == 0)
>                 for (i = 0; i < 64; i++)
> -                       set_handler(i * VECSIZE, handle_reserved, VECSIZE);
> +                       set_handler_reserved(i);
>
>         tlb_init(cpu);
>         cpu_cache_init();
>  }
>
> -/* Install CPU exception handler */
> -void set_handler(unsigned long offset, void *addr, unsigned long size)
> -{
> -       memcpy((void *)(eentry + offset), addr, size);
> -       local_flush_icache_range(eentry + offset, eentry + offset + size);
> -}
> -
>  static const char panic_null_cerr[] =
>         "Trying to set NULL cache error exception handler\n";
>
> @@ -741,20 +855,20 @@ void __init trap_init(void)
>
>         /* Set interrupt vector handler */
>         for (i = EXCCODE_INT_START; i < EXCCODE_INT_END; i++)
> -               set_handler(i * VECSIZE, handle_vint, VECSIZE);
> -
> -       set_handler(EXCCODE_ADE * VECSIZE, handle_ade, VECSIZE);
> -       set_handler(EXCCODE_ALE * VECSIZE, handle_ale, VECSIZE);
> -       set_handler(EXCCODE_SYS * VECSIZE, handle_sys, VECSIZE);
> -       set_handler(EXCCODE_BP * VECSIZE, handle_bp, VECSIZE);
> -       set_handler(EXCCODE_INE * VECSIZE, handle_ri, VECSIZE);
> -       set_handler(EXCCODE_IPE * VECSIZE, handle_ri, VECSIZE);
> -       set_handler(EXCCODE_FPDIS * VECSIZE, handle_fpu, VECSIZE);
> -       set_handler(EXCCODE_LSXDIS * VECSIZE, handle_lsx, VECSIZE);
> -       set_handler(EXCCODE_LASXDIS * VECSIZE, handle_lasx, VECSIZE);
> -       set_handler(EXCCODE_FPE * VECSIZE, handle_fpe, VECSIZE);
> -       set_handler(EXCCODE_BTDIS * VECSIZE, handle_lbt, VECSIZE);
> -       set_handler(EXCCODE_WATCH * VECSIZE, handle_watch, VECSIZE);
> +               set_handler(i, handle_vint);
> +
> +       set_handler(EXCCODE_ADE, handle_ade);
> +       set_handler(EXCCODE_ALE, handle_ale);
> +       set_handler(EXCCODE_SYS, handle_sys);
> +       set_handler(EXCCODE_BP, handle_bp);
> +       set_handler(EXCCODE_INE, handle_ri);
> +       set_handler(EXCCODE_IPE, handle_ri);
> +       set_handler(EXCCODE_FPDIS, handle_fpu);
> +       set_handler(EXCCODE_LSXDIS, handle_lsx);
> +       set_handler(EXCCODE_LASXDIS, handle_lasx);
> +       set_handler(EXCCODE_FPE, handle_fpe);
> +       set_handler(EXCCODE_BTDIS, handle_lbt);
> +       set_handler(EXCCODE_WATCH, handle_watch);
>
>         cache_error_setup();
>
> diff --git a/arch/loongarch/mm/tlb.c b/arch/loongarch/mm/tlb.c
> index 8bad6b0cff59..6f70aab7202a 100644
> --- a/arch/loongarch/mm/tlb.c
> +++ b/arch/loongarch/mm/tlb.c
> @@ -253,7 +253,6 @@ static void output_pgtable_bits_defines(void)
>  #ifdef CONFIG_NUMA
>  unsigned long pcpu_handlers[NR_CPUS];
>  #endif
> -extern long exception_handlers[VECSIZE * 128 / sizeof(long)];
>
>  void setup_tlb_handler(int cpu)
>  {
> @@ -264,19 +263,20 @@ void setup_tlb_handler(int cpu)
>         if (cpu == 0) {
>                 memcpy((void *)tlbrentry, handle_tlb_refill, 0x80);
>                 local_flush_icache_range(tlbrentry, tlbrentry + 0x80);
> -               set_handler(EXCCODE_TLBI * VECSIZE, handle_tlb_load, VECSIZE);
> -               set_handler(EXCCODE_TLBL * VECSIZE, handle_tlb_load, VECSIZE);
> -               set_handler(EXCCODE_TLBS * VECSIZE, handle_tlb_store, VECSIZE);
> -               set_handler(EXCCODE_TLBM * VECSIZE, handle_tlb_modify, VECSIZE);
> -               set_handler(EXCCODE_TLBNR * VECSIZE, handle_tlb_protect, VECSIZE);
> -               set_handler(EXCCODE_TLBNX * VECSIZE, handle_tlb_protect, VECSIZE);
> -               set_handler(EXCCODE_TLBPE * VECSIZE, handle_tlb_protect, VECSIZE);
> +               set_handler(EXCCODE_TLBI, handle_tlb_load);
> +               set_handler(EXCCODE_TLBL, handle_tlb_load);
> +               set_handler(EXCCODE_TLBS, handle_tlb_store);
> +               set_handler(EXCCODE_TLBM, handle_tlb_modify);
> +               set_handler(EXCCODE_TLBNR, handle_tlb_protect);
> +               set_handler(EXCCODE_TLBNX, handle_tlb_protect);
> +               set_handler(EXCCODE_TLBPE, handle_tlb_protect);
>         }
>  #ifdef CONFIG_NUMA
>         else {
>                 void *addr;
> +               unsigned long addr_ul;
>                 struct page *page;
> -               const int vec_sz = sizeof(exception_handlers);
> +               const int vec_sz = VECSIZE * 128;
>
>                 if (pcpu_handlers[cpu])
>                         return;
> @@ -286,8 +286,11 @@ void setup_tlb_handler(int cpu)
>                         return;
>
>                 addr = page_address(page);
> +               addr_ul = (unsigned long)addr;
>                 pcpu_handlers[cpu] = (unsigned long)addr;
> -               memcpy((void *)addr, (void *)eentry, vec_sz);
> +               memcpy(addr, (void *)eentry, vec_sz);
> +               for (unsigned long i = 0; i < 128; i++)
> +                       reloc_handler(addr_ul + i * VECSIZE, eentry_reloc[i]);
>                 local_flush_icache_range((unsigned long)addr, (unsigned long)addr + vec_sz);
>                 csr_write64(pcpu_handlers[cpu], LOONGARCH_CSR_EENTRY);
>                 csr_write64(pcpu_handlers[cpu], LOONGARCH_CSR_MERRENTRY);
> diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S
> index 3dd2a9615cd9..044c2190771a 100644
> --- a/arch/loongarch/mm/tlbex.S
> +++ b/arch/loongarch/mm/tlbex.S
> @@ -39,11 +39,21 @@ SYM_FUNC_START(handle_tlb_protect)
>         move            a1, zero
>         csrrd           a2, LOONGARCH_CSR_BADV
>         REG_S           a2, sp, PT_BVADDR
> -       la.abs          t0, do_page_fault
> +1:     la.pcrel        t0, t1, do_page_fault
>         jirl            ra, t0, 0
>         RESTORE_ALL_AND_RET
>  SYM_FUNC_END(handle_tlb_protect)
>
> +SYM_DATA_START(rel_handle_tlb_protect)
> +       LONG    2
> +
> +       LONG    514b - handle_tlb_protect
> +       LONG    kernelsp
> +
> +       LONG    1b - handle_tlb_protect
> +       LONG    do_page_fault
> +SYM_DATA_END(rel_handle_tlb_protect)
> +
>  SYM_FUNC_START(handle_tlb_load)
>         csrwr           t0, EXCEPTION_KS0
>         csrwr           t1, EXCEPTION_KS1
> @@ -115,7 +125,8 @@ smp_pgtable_change_load:
>
>  #ifdef CONFIG_64BIT
>  vmalloc_load:
> -       la.abs          t1, swapper_pg_dir
> +       /* The first insn of vmalloc_done_load overwrites ra */
> +1:     la.pcrel        t1, ra, swapper_pg_dir
>         b               vmalloc_done_load
>  #endif
>
> @@ -186,10 +197,24 @@ tlb_huge_update_load:
>  nopage_tlb_load:
>         dbar            0
>         csrrd           ra, EXCEPTION_KS2
> -       la.abs          t0, tlb_do_page_fault_0
> +2:     la.pcrel        t0, t1, tlb_do_page_fault_0
>         jr              t0
>  SYM_FUNC_END(handle_tlb_load)
>
> +SYM_DATA_START(rel_handle_tlb_load)
> +#ifdef CONFIG_64BIT
> +       LONG    2
> +
> +       LONG    1b - handle_tlb_load
> +       LONG    swapper_pg_dir
> +#else
> +       LONG    1
> +#endif
> +
> +       LONG    2b - handle_tlb_load
> +       LONG    tlb_do_page_fault_0
> +SYM_DATA_END(rel_handle_tlb_load)
> +
>  SYM_FUNC_START(handle_tlb_store)
>         csrwr           t0, EXCEPTION_KS0
>         csrwr           t1, EXCEPTION_KS1
> @@ -262,7 +287,8 @@ smp_pgtable_change_store:
>
>  #ifdef CONFIG_64BIT
>  vmalloc_store:
> -       la.abs          t1, swapper_pg_dir
> +       /* The first insn of vmalloc_done_store overwrites ra */
> +1:     la.pcrel        t1, ra, swapper_pg_dir
>         b               vmalloc_done_store
>  #endif
>
> @@ -335,10 +361,24 @@ tlb_huge_update_store:
>  nopage_tlb_store:
>         dbar            0
>         csrrd           ra, EXCEPTION_KS2
> -       la.abs          t0, tlb_do_page_fault_1
> +2:     la.pcrel        t0, t1, tlb_do_page_fault_1
>         jr              t0
>  SYM_FUNC_END(handle_tlb_store)
>
> +SYM_DATA_START(rel_handle_tlb_store)
> +#ifdef CONFIG_64BIT
> +       LONG    2
> +
> +       LONG    1b - handle_tlb_store
> +       LONG    swapper_pg_dir
> +#else
> +       LONG    1
> +#endif
> +
> +       LONG    2b - handle_tlb_store
> +       LONG    tlb_do_page_fault_1
> +SYM_DATA_END(rel_handle_tlb_store)
> +
>  SYM_FUNC_START(handle_tlb_modify)
>         csrwr           t0, EXCEPTION_KS0
>         csrwr           t1, EXCEPTION_KS1
> @@ -410,7 +450,8 @@ smp_pgtable_change_modify:
>
>  #ifdef CONFIG_64BIT
>  vmalloc_modify:
> -       la.abs          t1, swapper_pg_dir
> +       /* The first insn of vmalloc_done_modify overwrites ra */
> +1:     la.pcrel        t1, ra, swapper_pg_dir
>         b               vmalloc_done_modify
>  #endif
>
> @@ -482,10 +523,24 @@ tlb_huge_update_modify:
>  nopage_tlb_modify:
>         dbar            0
>         csrrd           ra, EXCEPTION_KS2
> -       la.abs          t0, tlb_do_page_fault_1
> +2:     la.pcrel        t0, t1, tlb_do_page_fault_1
>         jr              t0
>  SYM_FUNC_END(handle_tlb_modify)
>
> +SYM_DATA_START(rel_handle_tlb_modify)
> +#ifdef CONFIG_64BIT
> +       LONG    2
> +
> +       LONG    1b - handle_tlb_modify
> +       LONG    swapper_pg_dir
> +#else
> +       LONG    1
> +#endif
> +
> +       LONG    2b - handle_tlb_modify
> +       LONG    tlb_do_page_fault_1
> +SYM_DATA_END(rel_handle_tlb_modify)
> +
>  SYM_FUNC_START(handle_tlb_refill)
>         csrwr           t0, LOONGARCH_CSR_TLBRSAVE
>         csrrd           t0, LOONGARCH_CSR_PGD
> --
> 2.37.3
>
  
Youling Tang Feb. 10, 2023, 9:18 a.m. UTC | #2
On 02/10/2023 05:09 PM, Huacai Chen wrote:
> Hi, Youling and Ruoyao,
>
> Thank you very much for implementing the per-node exceptions. But I
> want to know if the per-node solution is really worthy for a PIE
> kernel. So, could you please test the performance? Maybe we can reduce
> the complexity if we give up the per-node solution.

I will test performance on NUMA machines based on v2 and v3 patch sets.

Youling.
>
> Huacai
>
> On Fri, Feb 10, 2023 at 4:47 PM Youling Tang <tangyouling@loongson.cn> wrote:
>>
>> From: Xi Ruoyao <xry111@xry111.site>
>>
>> It's needed to build the kernel as a PIE, or the linker will complain.
>>
>> For the consideration about performance, we copy the exception handlers
>> to a dedicated 64 KB area for each CPU.  So, the PC-relative offset
>> calculated at link time will be incorrect and we need to relocate the
>> exception handlers after copying them.
>>
>> For the simplicity, we don't use the ELF R_LARCH_* relocations, but code
>> an relocation entry as simply (offset_in_the_handler, symbol_addr).  For
>> each exception handler, we also code the number of relocation entries.
>> Then we can use the relocation information to fix up the handlers after
>> copying them.
>>
>> Signed-off-by: Xi Ruoyao <xry111@xry111.site>
>> ---
>>  arch/loongarch/include/asm/inst.h       |   1 +
>>  arch/loongarch/include/asm/setup.h      |   6 +-
>>  arch/loongarch/include/asm/stackframe.h |   3 +-
>>  arch/loongarch/kernel/genex.S           |  40 +++++-
>>  arch/loongarch/kernel/traps.c           | 158 ++++++++++++++++++++----
>>  arch/loongarch/mm/tlb.c                 |  23 ++--
>>  arch/loongarch/mm/tlbex.S               |  69 +++++++++--
>>  7 files changed, 255 insertions(+), 45 deletions(-)
>>
>> diff --git a/arch/loongarch/include/asm/inst.h b/arch/loongarch/include/asm/inst.h
>> index 7eedd83fd0d7..426054518a3d 100644
>> --- a/arch/loongarch/include/asm/inst.h
>> +++ b/arch/loongarch/include/asm/inst.h
>> @@ -32,6 +32,7 @@ enum reg1i20_op {
>>         lu12iw_op       = 0x0a,
>>         lu32id_op       = 0x0b,
>>         pcaddi_op       = 0x0c,
>> +       pcalau12i_op    = 0x0d,
>>         pcaddu12i_op    = 0x0e,
>>         pcaddu18i_op    = 0x0f,
>>  };
>> diff --git a/arch/loongarch/include/asm/setup.h b/arch/loongarch/include/asm/setup.h
>> index 72ead58039f3..f0a2b34365f1 100644
>> --- a/arch/loongarch/include/asm/setup.h
>> +++ b/arch/loongarch/include/asm/setup.h
>> @@ -11,6 +11,9 @@
>>
>>  #define VECSIZE 0x200
>>
>> +struct handler_reloc;
>> +
>> +extern struct handler_reloc *eentry_reloc[];
>>  extern unsigned long eentry;
>>  extern unsigned long tlbrentry;
>>  extern char init_command_line[COMMAND_LINE_SIZE];
>> @@ -18,7 +21,8 @@ extern void tlb_init(int cpu);
>>  extern void cpu_cache_init(void);
>>  extern void cache_error_setup(void);
>>  extern void per_cpu_trap_init(int cpu);
>> -extern void set_handler(unsigned long offset, void *addr, unsigned long len);
>> +extern void set_handler(unsigned long exccode, void *addr);
>>  extern void set_merr_handler(unsigned long offset, void *addr, unsigned long len);
>> +extern void reloc_handler(unsigned long handler, struct handler_reloc *rel);
>>
>>  #endif /* __SETUP_H */
>> diff --git a/arch/loongarch/include/asm/stackframe.h b/arch/loongarch/include/asm/stackframe.h
>> index 7deb043ce387..bbec1e56b61b 100644
>> --- a/arch/loongarch/include/asm/stackframe.h
>> +++ b/arch/loongarch/include/asm/stackframe.h
>> @@ -77,7 +77,8 @@
>>   * new value in sp.
>>   */
>>         .macro  get_saved_sp docfi=0
>> -       la.abs    t1, kernelsp
>> +       /* The label is used for generating reloc tables for handlers */
>> +514:   la.pcrel  t1, t0, kernelsp
>>  #ifdef CONFIG_SMP
>>         csrrd     t0, PERCPU_BASE_KS
>>         LONG_ADD  t1, t1, t0
>> diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S
>> index 7e5c293ed89f..005a10fe5a50 100644
>> --- a/arch/loongarch/kernel/genex.S
>> +++ b/arch/loongarch/kernel/genex.S
>> @@ -34,7 +34,7 @@ SYM_FUNC_END(__arch_cpu_idle)
>>  SYM_FUNC_START(handle_vint)
>>         BACKUP_T0T1
>>         SAVE_ALL
>> -       la.abs  t1, __arch_cpu_idle
>> +0:     la.pcrel t1, t2, __arch_cpu_idle
>>         LONG_L  t0, sp, PT_ERA
>>         /* 32 byte rollback region */
>>         ori     t0, t0, 0x1f
>> @@ -43,11 +43,25 @@ SYM_FUNC_START(handle_vint)
>>         LONG_S  t0, sp, PT_ERA
>>  1:     move    a0, sp
>>         move    a1, sp
>> -       la.abs  t0, do_vint
>> +2:     la.pcrel t0, t2, do_vint
>>         jirl    ra, t0, 0
>>         RESTORE_ALL_AND_RET
>>  SYM_FUNC_END(handle_vint)
>>
>> +SYM_DATA_START(rel_handle_vint)
>> +LONG   3
>> +
>> +LONG   514b - handle_vint
>> +LONG   kernelsp
>> +
>> +LONG   0b - handle_vint
>> +LONG   __arch_cpu_idle
>> +
>> +LONG   2b - handle_vint
>> +LONG   do_vint
>> +
>> +SYM_DATA_END(rel_handle_vint)
>> +
>>  SYM_FUNC_START(except_vec_cex)
>>         b       cache_parity_error
>>  SYM_FUNC_END(except_vec_cex)
>> @@ -72,12 +86,24 @@ SYM_FUNC_END(except_vec_cex)
>>         SAVE_ALL
>>         build_prep_\prep
>>         move    a0, sp
>> -       la.abs  t0, do_\handler
>> +       667:
>> +       la.pcrel t0, t1, do_\handler
>>         jirl    ra, t0, 0
>>         668:
>>         RESTORE_ALL_AND_RET
>>         SYM_FUNC_END(handle_\exception)
>>         SYM_DATA(unwind_hint_\exception, .word 668b - 666b)
>> +
>> +       SYM_DATA_START(rel_handle_\exception)
>> +       LONG    2
>> +
>> +       LONG    514b - 666b
>> +       LONG    kernelsp
>> +
>> +       LONG    667b - 666b
>> +       LONG    do_\handler
>> +
>> +       SYM_DATA_END(rel_handle_\exception)
>>         .endm
>>
>>         BUILD_HANDLER ade ade badv
>> @@ -93,6 +119,12 @@ SYM_FUNC_END(except_vec_cex)
>>         BUILD_HANDLER reserved reserved none    /* others */
>>
>>  SYM_FUNC_START(handle_sys)
>> -       la.abs  t0, handle_syscall
>> +       la.pcrel t0, t1, handle_syscall
>>         jr      t0
>>  SYM_FUNC_END(handle_sys)
>> +
>> +SYM_DATA_START(rel_handle_sys)
>> +LONG   1
>> +LONG   0
>> +LONG   handle_syscall
>> +SYM_DATA_END(rel_handle_sys)
>> diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c
>> index c38a146a973b..7e073854f493 100644
>> --- a/arch/loongarch/kernel/traps.c
>> +++ b/arch/loongarch/kernel/traps.c
>> @@ -62,6 +62,127 @@ extern asmlinkage void handle_reserved(void);
>>  extern asmlinkage void handle_watch(void);
>>  extern asmlinkage void handle_vint(void);
>>
>> +struct handler_reloc_entry {
>> +       unsigned long offset;
>> +       unsigned long sym;
>> +};
>> +
>> +struct handler_reloc {
>> +       unsigned long cnt;
>> +       struct handler_reloc_entry entries[];
>> +};
>> +
>> +extern struct handler_reloc rel_handle_tlb_load;
>> +extern struct handler_reloc rel_handle_tlb_store;
>> +extern struct handler_reloc rel_handle_tlb_modify;
>> +extern struct handler_reloc rel_handle_tlb_protect;
>> +extern struct handler_reloc rel_handle_ade;
>> +extern struct handler_reloc rel_handle_ale;
>> +extern struct handler_reloc rel_handle_sys;
>> +extern struct handler_reloc rel_handle_bp;
>> +extern struct handler_reloc rel_handle_ri;
>> +extern struct handler_reloc rel_handle_fpu;
>> +extern struct handler_reloc rel_handle_lsx;
>> +extern struct handler_reloc rel_handle_lasx;
>> +extern struct handler_reloc rel_handle_fpe;
>> +extern struct handler_reloc rel_handle_lbt;
>> +extern struct handler_reloc rel_handle_watch;
>> +extern struct handler_reloc rel_handle_reserved;
>> +extern struct handler_reloc rel_handle_vint;
>> +
>> +struct handler_reloc *eentry_reloc[128] = {
>> +       [0] = NULL, /* merr handler */
>> +       [EXCCODE_TLBL] = &rel_handle_tlb_load,
>> +       [EXCCODE_TLBS] = &rel_handle_tlb_store,
>> +       [EXCCODE_TLBI] = &rel_handle_tlb_load,
>> +       [EXCCODE_TLBM] = &rel_handle_tlb_modify,
>> +       [EXCCODE_TLBNR] = &rel_handle_tlb_protect,
>> +       [EXCCODE_TLBNX] = &rel_handle_tlb_protect,
>> +       [EXCCODE_TLBPE] = &rel_handle_tlb_protect,
>> +       [EXCCODE_ADE] = &rel_handle_ade,
>> +       [EXCCODE_ALE] = &rel_handle_ale,
>> +       [EXCCODE_SYS] = &rel_handle_sys,
>> +       [EXCCODE_BP] = &rel_handle_bp,
>> +       [EXCCODE_INE] = &rel_handle_ri,
>> +       [EXCCODE_IPE] = &rel_handle_ri,
>> +       [EXCCODE_FPDIS] = &rel_handle_fpu,
>> +       [EXCCODE_LSXDIS] = &rel_handle_lsx,
>> +       [EXCCODE_LASXDIS] = &rel_handle_lasx,
>> +       [EXCCODE_FPE] = &rel_handle_fpe,
>> +       [EXCCODE_BTDIS] = &rel_handle_lbt,
>> +       [EXCCODE_WATCH] = &rel_handle_watch,
>> +       [(EXCCODE_WATCH + 1) ... (EXCCODE_INT_START - 1)] = &rel_handle_reserved,
>> +       [EXCCODE_INT_START ... (EXCCODE_INT_END - 1)] = &rel_handle_vint,
>> +};
>> +
>> +void reloc_handler(unsigned long handler, struct handler_reloc *rel)
>> +{
>> +       if (!rel)
>> +               return;
>> +
>> +       for (unsigned long i = 0; i < rel->cnt; i++) {
>> +               unsigned long pc = handler + rel->entries[i].offset;
>> +               union loongarch_instruction *insn =
>> +                       (union loongarch_instruction *)pc;
>> +               u32 imm[4];
>> +               unsigned long v = rel->entries[i].sym;
>> +
>> +               /* GNU as >= 2.40 uses pcalau12i for la.pcrel, but GNU ld <= 2.39
>> +                * uses pcaddu12i.
>> +                */
>> +               if (insn->reg1i20_format.opcode == pcalau12i_op) {
>> +                       /* Use s32 deliberately for sign extension. */
>> +                       s32 offset_hi20 = ((v + 0x800) & ~0xfff) -
>> +                                         (pc & ~0xfff);
>> +                       unsigned long anchor = (pc & ~0xfff) + offset_hi20;
>> +                       unsigned long offset_rem = v - anchor;
>> +
>> +                       imm[0] = (offset_hi20 >> 12) & 0xfffff;
>> +                       imm[1] = v & 0xfff;
>> +                       imm[2] = (offset_rem >> 32) & 0xfffff;
>> +                       imm[3] = offset_rem >> 52;
>> +               } else if (insn->reg1i20_format.opcode == pcaddu12i_op) {
>> +                       /* Use s32 deliberately for sign extension. */
>> +                       s32 offset_lo = v - pc;
>> +                       unsigned long offset_hi = v - pc - offset_lo;
>> +
>> +                       imm[0] = (offset_lo >> 12) & 0xfffff;
>> +                       imm[1] = offset_lo & 0xfff;
>> +                       imm[2] = (offset_hi >> 32) & 0xfffff;
>> +                       imm[3] = offset_hi >> 52;
>> +               } else
>> +                       panic("Cannot fixup la.pcrel for exception handler at %lu: unexpected instruction %d!",
>> +                             pc, insn->word);
>> +
>> +               insn[0].reg1i20_format.immediate = imm[0];
>> +               insn[1].reg2i12_format.immediate = imm[1];
>> +               insn[2].reg1i20_format.immediate = imm[2];
>> +               insn[3].reg2i12_format.immediate = imm[3];
>> +       }
>> +}
>> +
>> +/* Install CPU exception handler */
>> +static void do_set_handler(unsigned long exccode, void *addr,
>> +                          struct handler_reloc *rel)
>> +{
>> +       unsigned long dest_addr = eentry + exccode * VECSIZE;
>> +
>> +       memcpy((void *)dest_addr, addr, VECSIZE);
>> +       reloc_handler(dest_addr, rel);
>> +       local_flush_icache_range(dest_addr, dest_addr + VECSIZE);
>> +}
>> +
>> +/* Install CPU exception handler, with the reloc table from eentry_reloc */
>> +void set_handler(unsigned long exccode, void *addr)
>> +{
>> +       do_set_handler(exccode, addr, eentry_reloc[exccode]);
>> +}
>> +
>> +static void set_handler_reserved(unsigned long exccode)
>> +{
>> +       do_set_handler(exccode, handle_reserved, &rel_handle_reserved);
>> +}
>> +
>>  static void show_backtrace(struct task_struct *task, const struct pt_regs *regs,
>>                            const char *loglvl, bool user)
>>  {
>> @@ -704,19 +825,12 @@ void per_cpu_trap_init(int cpu)
>>         /* Initialise exception handlers */
>>         if (cpu == 0)
>>                 for (i = 0; i < 64; i++)
>> -                       set_handler(i * VECSIZE, handle_reserved, VECSIZE);
>> +                       set_handler_reserved(i);
>>
>>         tlb_init(cpu);
>>         cpu_cache_init();
>>  }
>>
>> -/* Install CPU exception handler */
>> -void set_handler(unsigned long offset, void *addr, unsigned long size)
>> -{
>> -       memcpy((void *)(eentry + offset), addr, size);
>> -       local_flush_icache_range(eentry + offset, eentry + offset + size);
>> -}
>> -
>>  static const char panic_null_cerr[] =
>>         "Trying to set NULL cache error exception handler\n";
>>
>> @@ -741,20 +855,20 @@ void __init trap_init(void)
>>
>>         /* Set interrupt vector handler */
>>         for (i = EXCCODE_INT_START; i < EXCCODE_INT_END; i++)
>> -               set_handler(i * VECSIZE, handle_vint, VECSIZE);
>> -
>> -       set_handler(EXCCODE_ADE * VECSIZE, handle_ade, VECSIZE);
>> -       set_handler(EXCCODE_ALE * VECSIZE, handle_ale, VECSIZE);
>> -       set_handler(EXCCODE_SYS * VECSIZE, handle_sys, VECSIZE);
>> -       set_handler(EXCCODE_BP * VECSIZE, handle_bp, VECSIZE);
>> -       set_handler(EXCCODE_INE * VECSIZE, handle_ri, VECSIZE);
>> -       set_handler(EXCCODE_IPE * VECSIZE, handle_ri, VECSIZE);
>> -       set_handler(EXCCODE_FPDIS * VECSIZE, handle_fpu, VECSIZE);
>> -       set_handler(EXCCODE_LSXDIS * VECSIZE, handle_lsx, VECSIZE);
>> -       set_handler(EXCCODE_LASXDIS * VECSIZE, handle_lasx, VECSIZE);
>> -       set_handler(EXCCODE_FPE * VECSIZE, handle_fpe, VECSIZE);
>> -       set_handler(EXCCODE_BTDIS * VECSIZE, handle_lbt, VECSIZE);
>> -       set_handler(EXCCODE_WATCH * VECSIZE, handle_watch, VECSIZE);
>> +               set_handler(i, handle_vint);
>> +
>> +       set_handler(EXCCODE_ADE, handle_ade);
>> +       set_handler(EXCCODE_ALE, handle_ale);
>> +       set_handler(EXCCODE_SYS, handle_sys);
>> +       set_handler(EXCCODE_BP, handle_bp);
>> +       set_handler(EXCCODE_INE, handle_ri);
>> +       set_handler(EXCCODE_IPE, handle_ri);
>> +       set_handler(EXCCODE_FPDIS, handle_fpu);
>> +       set_handler(EXCCODE_LSXDIS, handle_lsx);
>> +       set_handler(EXCCODE_LASXDIS, handle_lasx);
>> +       set_handler(EXCCODE_FPE, handle_fpe);
>> +       set_handler(EXCCODE_BTDIS, handle_lbt);
>> +       set_handler(EXCCODE_WATCH, handle_watch);
>>
>>         cache_error_setup();
>>
>> diff --git a/arch/loongarch/mm/tlb.c b/arch/loongarch/mm/tlb.c
>> index 8bad6b0cff59..6f70aab7202a 100644
>> --- a/arch/loongarch/mm/tlb.c
>> +++ b/arch/loongarch/mm/tlb.c
>> @@ -253,7 +253,6 @@ static void output_pgtable_bits_defines(void)
>>  #ifdef CONFIG_NUMA
>>  unsigned long pcpu_handlers[NR_CPUS];
>>  #endif
>> -extern long exception_handlers[VECSIZE * 128 / sizeof(long)];
>>
>>  void setup_tlb_handler(int cpu)
>>  {
>> @@ -264,19 +263,20 @@ void setup_tlb_handler(int cpu)
>>         if (cpu == 0) {
>>                 memcpy((void *)tlbrentry, handle_tlb_refill, 0x80);
>>                 local_flush_icache_range(tlbrentry, tlbrentry + 0x80);
>> -               set_handler(EXCCODE_TLBI * VECSIZE, handle_tlb_load, VECSIZE);
>> -               set_handler(EXCCODE_TLBL * VECSIZE, handle_tlb_load, VECSIZE);
>> -               set_handler(EXCCODE_TLBS * VECSIZE, handle_tlb_store, VECSIZE);
>> -               set_handler(EXCCODE_TLBM * VECSIZE, handle_tlb_modify, VECSIZE);
>> -               set_handler(EXCCODE_TLBNR * VECSIZE, handle_tlb_protect, VECSIZE);
>> -               set_handler(EXCCODE_TLBNX * VECSIZE, handle_tlb_protect, VECSIZE);
>> -               set_handler(EXCCODE_TLBPE * VECSIZE, handle_tlb_protect, VECSIZE);
>> +               set_handler(EXCCODE_TLBI, handle_tlb_load);
>> +               set_handler(EXCCODE_TLBL, handle_tlb_load);
>> +               set_handler(EXCCODE_TLBS, handle_tlb_store);
>> +               set_handler(EXCCODE_TLBM, handle_tlb_modify);
>> +               set_handler(EXCCODE_TLBNR, handle_tlb_protect);
>> +               set_handler(EXCCODE_TLBNX, handle_tlb_protect);
>> +               set_handler(EXCCODE_TLBPE, handle_tlb_protect);
>>         }
>>  #ifdef CONFIG_NUMA
>>         else {
>>                 void *addr;
>> +               unsigned long addr_ul;
>>                 struct page *page;
>> -               const int vec_sz = sizeof(exception_handlers);
>> +               const int vec_sz = VECSIZE * 128;
>>
>>                 if (pcpu_handlers[cpu])
>>                         return;
>> @@ -286,8 +286,11 @@ void setup_tlb_handler(int cpu)
>>                         return;
>>
>>                 addr = page_address(page);
>> +               addr_ul = (unsigned long)addr;
>>                 pcpu_handlers[cpu] = (unsigned long)addr;
>> -               memcpy((void *)addr, (void *)eentry, vec_sz);
>> +               memcpy(addr, (void *)eentry, vec_sz);
>> +               for (unsigned long i = 0; i < 128; i++)
>> +                       reloc_handler(addr_ul + i * VECSIZE, eentry_reloc[i]);
>>                 local_flush_icache_range((unsigned long)addr, (unsigned long)addr + vec_sz);
>>                 csr_write64(pcpu_handlers[cpu], LOONGARCH_CSR_EENTRY);
>>                 csr_write64(pcpu_handlers[cpu], LOONGARCH_CSR_MERRENTRY);
>> diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S
>> index 3dd2a9615cd9..044c2190771a 100644
>> --- a/arch/loongarch/mm/tlbex.S
>> +++ b/arch/loongarch/mm/tlbex.S
>> @@ -39,11 +39,21 @@ SYM_FUNC_START(handle_tlb_protect)
>>         move            a1, zero
>>         csrrd           a2, LOONGARCH_CSR_BADV
>>         REG_S           a2, sp, PT_BVADDR
>> -       la.abs          t0, do_page_fault
>> +1:     la.pcrel        t0, t1, do_page_fault
>>         jirl            ra, t0, 0
>>         RESTORE_ALL_AND_RET
>>  SYM_FUNC_END(handle_tlb_protect)
>>
>> +SYM_DATA_START(rel_handle_tlb_protect)
>> +       LONG    2
>> +
>> +       LONG    514b - handle_tlb_protect
>> +       LONG    kernelsp
>> +
>> +       LONG    1b - handle_tlb_protect
>> +       LONG    do_page_fault
>> +SYM_DATA_END(rel_handle_tlb_protect)
>> +
>>  SYM_FUNC_START(handle_tlb_load)
>>         csrwr           t0, EXCEPTION_KS0
>>         csrwr           t1, EXCEPTION_KS1
>> @@ -115,7 +125,8 @@ smp_pgtable_change_load:
>>
>>  #ifdef CONFIG_64BIT
>>  vmalloc_load:
>> -       la.abs          t1, swapper_pg_dir
>> +       /* The first insn of vmalloc_done_load overwrites ra */
>> +1:     la.pcrel        t1, ra, swapper_pg_dir
>>         b               vmalloc_done_load
>>  #endif
>>
>> @@ -186,10 +197,24 @@ tlb_huge_update_load:
>>  nopage_tlb_load:
>>         dbar            0
>>         csrrd           ra, EXCEPTION_KS2
>> -       la.abs          t0, tlb_do_page_fault_0
>> +2:     la.pcrel        t0, t1, tlb_do_page_fault_0
>>         jr              t0
>>  SYM_FUNC_END(handle_tlb_load)
>>
>> +SYM_DATA_START(rel_handle_tlb_load)
>> +#ifdef CONFIG_64BIT
>> +       LONG    2
>> +
>> +       LONG    1b - handle_tlb_load
>> +       LONG    swapper_pg_dir
>> +#else
>> +       LONG    1
>> +#endif
>> +
>> +       LONG    2b - handle_tlb_load
>> +       LONG    tlb_do_page_fault_0
>> +SYM_DATA_END(rel_handle_tlb_load)
>> +
>>  SYM_FUNC_START(handle_tlb_store)
>>         csrwr           t0, EXCEPTION_KS0
>>         csrwr           t1, EXCEPTION_KS1
>> @@ -262,7 +287,8 @@ smp_pgtable_change_store:
>>
>>  #ifdef CONFIG_64BIT
>>  vmalloc_store:
>> -       la.abs          t1, swapper_pg_dir
>> +       /* The first insn of vmalloc_done_store overwrites ra */
>> +1:     la.pcrel        t1, ra, swapper_pg_dir
>>         b               vmalloc_done_store
>>  #endif
>>
>> @@ -335,10 +361,24 @@ tlb_huge_update_store:
>>  nopage_tlb_store:
>>         dbar            0
>>         csrrd           ra, EXCEPTION_KS2
>> -       la.abs          t0, tlb_do_page_fault_1
>> +2:     la.pcrel        t0, t1, tlb_do_page_fault_1
>>         jr              t0
>>  SYM_FUNC_END(handle_tlb_store)
>>
>> +SYM_DATA_START(rel_handle_tlb_store)
>> +#ifdef CONFIG_64BIT
>> +       LONG    2
>> +
>> +       LONG    1b - handle_tlb_store
>> +       LONG    swapper_pg_dir
>> +#else
>> +       LONG    1
>> +#endif
>> +
>> +       LONG    2b - handle_tlb_store
>> +       LONG    tlb_do_page_fault_1
>> +SYM_DATA_END(rel_handle_tlb_store)
>> +
>>  SYM_FUNC_START(handle_tlb_modify)
>>         csrwr           t0, EXCEPTION_KS0
>>         csrwr           t1, EXCEPTION_KS1
>> @@ -410,7 +450,8 @@ smp_pgtable_change_modify:
>>
>>  #ifdef CONFIG_64BIT
>>  vmalloc_modify:
>> -       la.abs          t1, swapper_pg_dir
>> +       /* The first insn of vmalloc_done_modify overwrites ra */
>> +1:     la.pcrel        t1, ra, swapper_pg_dir
>>         b               vmalloc_done_modify
>>  #endif
>>
>> @@ -482,10 +523,24 @@ tlb_huge_update_modify:
>>  nopage_tlb_modify:
>>         dbar            0
>>         csrrd           ra, EXCEPTION_KS2
>> -       la.abs          t0, tlb_do_page_fault_1
>> +2:     la.pcrel        t0, t1, tlb_do_page_fault_1
>>         jr              t0
>>  SYM_FUNC_END(handle_tlb_modify)
>>
>> +SYM_DATA_START(rel_handle_tlb_modify)
>> +#ifdef CONFIG_64BIT
>> +       LONG    2
>> +
>> +       LONG    1b - handle_tlb_modify
>> +       LONG    swapper_pg_dir
>> +#else
>> +       LONG    1
>> +#endif
>> +
>> +       LONG    2b - handle_tlb_modify
>> +       LONG    tlb_do_page_fault_1
>> +SYM_DATA_END(rel_handle_tlb_modify)
>> +
>>  SYM_FUNC_START(handle_tlb_refill)
>>         csrwr           t0, LOONGARCH_CSR_TLBRSAVE
>>         csrrd           t0, LOONGARCH_CSR_PGD
>> --
>> 2.37.3
>>
  
Youling Tang Feb. 16, 2023, 2:32 a.m. UTC | #3
Hi folks,

On 02/10/2023 05:18 PM, Youling Tang wrote:
>
>
> On 02/10/2023 05:09 PM, Huacai Chen wrote:
>> Hi, Youling and Ruoyao,
>>
>> Thank you very much for implementing the per-node exceptions. But I
>> want to know if the per-node solution is really worthy for a PIE
>> kernel. So, could you please test the performance? Maybe we can reduce
>> the complexity if we give up the per-node solution.

Tested on a Loongson-3C5000L-LL machine, using a CLFS 7.3 system.

- nopernode:
   Based on the v1 patch method, with the else branch removed from
   setup_tlb_handler().

- pernode: Based on the v4 patch method.

- pie: Enable RANDOMIZE_BASE (KASLR).

- nopie: Disable RANDOMIZE_BASE and RELOCATABLE.


The UnixBench test results are as follows:

- nopernode-nopie: 3938.7

- pernode-nopie: 4062.2

- nopernode-pie: 4009.7

- pernode-pie: 4028.7

In general, `pernode` scores higher than `nopernode`, and `nopie` scores
higher than `pie` (except that nopernode-pie is higher than
nopernode-nopie, which is not as expected and may be caused by
instability of the machine).

Which approach would everyone prefer for the exception handling,
`pernode` or `nopernode`?

Youling.
  
Huacai Chen Feb. 16, 2023, 6:56 a.m. UTC | #4
On Thu, Feb 16, 2023 at 10:32 AM Youling Tang <tangyouling@loongson.cn> wrote:
>
> Hi folks,
>
> On 02/10/2023 05:18 PM, Youling Tang wrote:
> >
> >
> > On 02/10/2023 05:09 PM, Huacai Chen wrote:
> >> Hi, Youling and Ruoyao,
> >>
> >> Thank you very much for implementing the per-node exceptions. But I
> >> want to know if the per-node solution is really worthy for a PIE
> >> kernel. So, could you please test the performance? Maybe we can reduce
> >> the complexity if we give up the per-node solution.
>
> Tested on Loongson-3C5000L-LL machine, using CLFS7.3 system.
>
> - nopernode:
>    Based on the v1 patch method, and remove the else branch process in
>    setup_tlb_handler().
>
> - pernode: Based on the v4 patch method.
>
> - pie: Enable RANDOMIZE_BASE (KASLR).
>
> - nopie: Disable RANDOMIZE_BASE and RELOCATABLE.
>
>
> The UnixBench test results are as follows:
>
> - nopernode-nopie: 3938.7
>
> - pernode-nopie: 4062.2
>
> - nopernode-pie: 4009.7
>
> - pernode-pie: 4028.7
>
> In general, `pernode` is higher than `nopernode`, and `nopie` is higher
> than `pie`. (except that nopernode-pie is higher than nopernode-nopie,
> which is not as expected, which may be caused by the instability of the
> machine).
>
> Everyone is more inclined to use `pernode` or `nopernode` to implement
> in the exception handling process?
From my point of view, for the PIE kernel the performance difference
between pernode and nopernode is negligible. On the other hand, the
pernode implementation needs some compiler hacking and makes the logic
significantly more complex. So I prefer to remove the pernode
exception support.

Huacai
>
> Youling.
>
>
  
Jinyang He Feb. 16, 2023, 6:59 a.m. UTC | #5
On 2023-02-16 10:32, Youling Tang wrote:

> Hi folks,
>
> On 02/10/2023 05:18 PM, Youling Tang wrote:
>>
>>
>> On 02/10/2023 05:09 PM, Huacai Chen wrote:
>>> Hi, Youling and Ruoyao,
>>>
>>> Thank you very much for implementing the per-node exceptions. But I
>>> want to know if the per-node solution is really worthy for a PIE
>>> kernel. So, could you please test the performance? Maybe we can reduce
>>> the complexity if we give up the per-node solution.
>
> Tested on Loongson-3C5000L-LL machine, using CLFS7.3 system.
>
> - nopernode:
>   Based on the v1 patch method, and remove the else branch process in
>   setup_tlb_handler().
>
> - pernode: Based on the v4 patch method.
>
> - pie: Enable RANDOMIZE_BASE (KASLR).
>
> - nopie: Disable RANDOMIZE_BASE and RELOCATABLE.
>
>
> The UnixBench test results are as follows:
>
> - nopernode-nopie: 3938.7
>
> - pernode-nopie: 4062.2
>
> - nopernode-pie: 4009.7
>
> - pernode-pie: 4028.7
>
> In general, `pernode` is higher than `nopernode`, and `nopie` is higher
> than `pie`. (except that nopernode-pie is higher than nopernode-nopie,
> which is not as expected, which may be caused by the instability of the
> machine).
>
> Everyone is more inclined to use `pernode` or `nopernode` to implement
> in the exception handling process?
>
> Youling.

Hi, Youling,


Thanks for your test results.


I did an informal patch that keeps la.abs by treating la.abs as a macro;
it has only been tested on qemu.

To test this patch, apply [PATCH v4 1/5] and [PATCH v4 3/5] first as
prerequisites.

The following patch just provides a method; I'm busy with other things.
Hopefully it will help you simplify [PATCH v4 2/5].


Thanks,

Jinyang



diff --git a/arch/loongarch/include/asm/asmmacro.h b/arch/loongarch/include/asm/asmmacro.h
index 328bb956f241..6ebad458d662 100644
--- a/arch/loongarch/include/asm/asmmacro.h
+++ b/arch/loongarch/include/asm/asmmacro.h
@@ -667,4 +667,19 @@
      nor    \dst, \src, zero
  .endm

+.macro la.abs reg, sym
+766:
+    nop
+    nop
+    nop
+    nop
+    .pushsection ".laabs", "aw", %progbits
+768:
+    .word 768b-766b
+    parse_r regno, \reg
+    .word regno
+    .dword \sym
+    .popsection
+.endm
+
  #endif /* _ASM_ASMMACRO_H */
diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S
index d2ac26b5b22b..3b273f05be8c 100644
--- a/arch/loongarch/kernel/head.S
+++ b/arch/loongarch/kernel/head.S
@@ -86,6 +86,7 @@ SYM_CODE_START(kernel_entry)            # kernel entry point
      PTR_ADD        sp, sp, tp
      set_saved_sp    sp, t0, t1

+    bl        relocate_laabs
      bl        start_kernel
      ASM_BUG()

diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index 4344502c0b31..9f8833a2524a 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -582,3 +582,30 @@ void __init setup_arch(char **cmdline_p)

      paging_init();
  }
+
+void __init relocate_laabs(void)
+{
+    extern void *__laabs_begin;
+    extern void *__laabs_end;
+    struct laabs {
+        int offset;
+        int reg;
+        long symvalue;
+    } *p;
+
+    for (p = (void *)&__laabs_begin; (void *)p < (void *)&__laabs_end; p++)
+    {
+        int lu12iw, ori, lu32id, lu52id;
+        long v = p->symvalue;
+        int reg = p->reg;
+        int *insn = (void *)p - p->offset;
+        lu12iw = 0x14000000 | reg | (((v & 0xfffff000) >> 12) << 5);
+        ori = 0x03800000 | reg | (reg<<5) | ((v & 0xfff) << 10);
+        lu32id = 0x16000000 | reg | (((v & 0x000fffff00000000) >> 32) << 5);
+        lu52id = 0x03000000 | reg | (reg<<5) | (((v >> 52) & 0xfff) << 10);
+        insn[0] = lu12iw;
+        insn[1] = ori;
+        insn[2] = lu32id;
+        insn[3] = lu52id;
+    }
+}
diff --git a/arch/loongarch/kernel/vmlinux.lds.S b/arch/loongarch/kernel/vmlinux.lds.S
index 733b16e8d55d..4d128e089393 100644
--- a/arch/loongarch/kernel/vmlinux.lds.S
+++ b/arch/loongarch/kernel/vmlinux.lds.S
@@ -66,6 +66,13 @@ SECTIONS
          __alt_instructions_end = .;
      }

+    . = ALIGN(4);
+    .laabs : AT(ADDR(.laabs) - LOAD_OFFSET) {
+        __laabs_begin = .;
+        *(.laabs)
+        __laabs_end = .;
+    }
+
      .got : ALIGN(16) { *(.got) }
      .plt : ALIGN(16) { *(.plt) }
      .got.plt : ALIGN(16) { *(.got.plt) }
  
Xi Ruoyao Feb. 16, 2023, 7:10 a.m. UTC | #6
On Thu, 2023-02-16 at 14:59 +0800, Jinyang He wrote:
> +.macro la.abs reg, sym
> +766:
> +    nop
> +    nop
> +    nop
> +    nop

In the "formal" version we can code

lu12i.w		reg, 0
ori		reg, reg, 0
lu32i.d		reg, 0
lu52i.d		reg, reg, 0

here.  Then we only need to fix up the immediate slots, so we can avoid
using parse_r.


> +    .pushsection ".laabs", "aw", %progbits
> +768:
> +    .word 768b-766b
> +    parse_r regno, \reg
> +    .word regno
> +    .dword \sym
> +    .popsection
> +.endm
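
A minimal sketch (not from the thread) of the fixup this enables,
assuming the fixed lu12i.w/ori/lu32i.d/lu52i.d template above; the helper
names are illustrative. Because the assembler has already encoded the
destination register in each template instruction, the relocation code
only has to rewrite the immediate bitfields, so parse_r is not needed.

#include <stdint.h>

/* lu12i.w/lu32i.d keep si20 in bits [24:5]; ori/lu52i.d keep ui12 in bits [21:10]. */
static inline void patch_si20(uint32_t *insn, uint64_t val)
{
	*insn = (*insn & ~(0xfffffu << 5)) | ((uint32_t)(val & 0xfffff) << 5);
}

static inline void patch_ui12(uint32_t *insn, uint64_t val)
{
	*insn = (*insn & ~(0xfffu << 10)) | ((uint32_t)(val & 0xfff) << 10);
}

/* Make the 4-instruction template at `insn` load the 64-bit value `v`. */
static void patch_la_abs(uint32_t insn[4], uint64_t v)
{
	patch_si20(&insn[0], v >> 12);	/* lu12i.w: bits 31..12 */
	patch_ui12(&insn[1], v);	/* ori:     bits 11..0  */
	patch_si20(&insn[2], v >> 32);	/* lu32i.d: bits 51..32 */
	patch_ui12(&insn[3], v >> 52);	/* lu52i.d: bits 63..52 */
}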
  
Youling Tang Feb. 16, 2023, 8:03 a.m. UTC | #7
On 02/16/2023 03:10 PM, Xi Ruoyao wrote:
> On Thu, 2023-02-16 at 14:59 +0800, Jinyang He wrote:
>> +.macro la.abs reg, sym
>> +766:
>> +    nop
>> +    nop
>> +    nop
>> +    nop
>
> In the "formal" version we can code
>
> lu12i.w		reg, 0
> ori		reg, reg, 0
> lu32i.d		reg, 0
> lu52i.d		reg, reg, 0
>
> here.  Then we only need to fixup the immediate slot so we can avoid
> using parse_r.
>
>
>> +    .pushsection ".laabs", "aw", %progbits
>> +768:
>> +    .word 768b-766b
>> +    parse_r regno, \reg
>> +    .word regno
>> +    .dword \sym
>> +    .popsection
>> +.endm

I will try to modify a version for testing, using the following
definition: when RELOCATABLE is turned on, the "la.abs" macro is used;
otherwise, the "la.abs" pseudo-instruction is still used as before.

#ifdef CONFIG_RELOCATABLE
.macro la.abs reg, sym
lu12i.w		reg, 0
ori		reg, reg, 0
lu32i.d		reg, 0
lu52i.d		reg, reg, 0
.endm
#endif

Youling.
  
Youling Tang Feb. 16, 2023, 11:18 a.m. UTC | #8
On 02/16/2023 04:03 PM, Youling Tang wrote:
>
>
> On 02/16/2023 03:10 PM, Xi Ruoyao wrote:
>> On Thu, 2023-02-16 at 14:59 +0800, Jinyang He wrote:
>>> +.macro la.abs reg, sym
>>> +766:
>>> +    nop
>>> +    nop
>>> +    nop
>>> +    nop
>>
>> In the "formal" version we can code
>>
>> lu12i.w        reg, 0
>> ori        reg, reg, 0
>> lu32i.d        reg, 0
>> lu52i.d        reg, reg, 0
>>
>> here.  Then we only need to fixup the immediate slot so we can avoid
>> using parse_r.
>>
>>
>>> +    .pushsection ".laabs", "aw", %progbits
>>> +768:
>>> +    .word 768b-766b
>>> +    parse_r regno, \reg
>>> +    .word regno
>>> +    .dword \sym
>>> +    .popsection
>>> +.endm
>
> I will try to modify a version for testing, using the following
> definition, when the RELOCATABLE is turned on, the "la.abs macro" is
> used, otherwise the "la.abs pseudo instruction" is still used as before.
>
> #ifdef CONFIG_RELOCATABLE
> .macro la.abs reg, sym
> lu12i.w        reg, 0
> ori        reg, reg, 0
> lu32i.d        reg, 0
> lu52i.d        reg, reg, 0
> .endm
> #endif

On the basis of the v4 patch set, remove patch 2 and then add the
following patch; the test is successful on qemu.

If this method is more acceptable to everyone, I will send v5.

diff --git a/arch/loongarch/include/asm/asmmacro.h b/arch/loongarch/include/asm/asmmacro.h
index 328bb956f241..adb04ae6b208 100644
--- a/arch/loongarch/include/asm/asmmacro.h
+++ b/arch/loongarch/include/asm/asmmacro.h
@@ -667,4 +667,19 @@
         nor     \dst, \src, zero
  .endm

+#ifdef CONFIG_RELOCATABLE
+.macro la.abs reg, sym
+766:
+       lu12i.w \reg, 0
+       ori     \reg, \reg, 0
+       lu32i.d \reg, 0
+       lu52i.d \reg, \reg, 0
+       .pushsection ".laabs", "aw", %progbits
+768:
+       .dword 768b-766b
+       .dword \sym
+       .popsection
+.endm
+#endif
+
  #endif /* _ASM_ASMMACRO_H */
diff --git a/arch/loongarch/kernel/relocate.c b/arch/loongarch/kernel/relocate.c
index 7d19cc0d2185..7ad327a554f9 100644
--- a/arch/loongarch/kernel/relocate.c
+++ b/arch/loongarch/kernel/relocate.c
@@ -12,6 +12,7 @@
  #include <linux/start_kernel.h>
  #include <asm/bootinfo.h>
  #include <asm/early_ioremap.h>
+#include <asm/inst.h>
  #include <asm/sections.h>

  #define RELOCATED(x) ((void *)((long)x + reloc_offset))
@@ -45,6 +46,32 @@ static inline __init void relocate_relative(void)
         }
  }

+static inline void __init relocate_laabs(long offset)
+{
+       extern void *__laabs_begin;
+       extern void *__laabs_end;
+       struct laabs {
+               long offset;
+               long symvalue;
+       } *p;
+
+       for (p = (void *)&__laabs_begin; (void *)p < (void *)&__laabs_end; p++) {
+               long v = p->symvalue + reloc_offset;
+               union loongarch_instruction *insn = (void *)p - p->offset + offset;
+               u32 lu12iw, ori, lu32id, lu52id;
+
+               lu12iw = (v >> 12) & 0xfffff;
+               ori = v & 0xfff;
+               lu32id = (v >> 32) & 0xfffff;
+               lu52id = v >> 52;
+
+               insn[0].reg1i20_format.immediate = lu12iw;
+               insn[1].reg2i12_format.immediate = ori;
+               insn[2].reg1i20_format.immediate = lu32id;
+               insn[3].reg2i12_format.immediate = lu52id;
+    }
+}
+
  #ifdef CONFIG_RANDOMIZE_BASE
  static inline __init unsigned long rotate_xor(unsigned long hash,
                                               const void *area, size_t size)
@@ -168,8 +195,10 @@ void *__init do_kaslr(void)
                 update_reloc_offset(&reloc_offset, offset);
         }

-       if (reloc_offset)
+       if (reloc_offset) {
                 relocate_relative();
+               relocate_laabs(offset);
+       }

         return kernel_entry;
  }
@@ -181,6 +210,8 @@ void __init relocate_kernel(void)

         if (reloc_offset)
                 relocate_relative();
+
+       relocate_laabs(0);
  }

  /*
diff --git a/arch/loongarch/kernel/vmlinux.lds.S b/arch/loongarch/kernel/vmlinux.lds.S
index aec0b6567d24..0e58c68bf427 100644
--- a/arch/loongarch/kernel/vmlinux.lds.S
+++ b/arch/loongarch/kernel/vmlinux.lds.S
@@ -66,6 +66,13 @@ SECTIONS
                 __alt_instructions_end = .;
         }

+       . = ALIGN(8);
+       .laabs : AT(ADDR(.laabs) - LOAD_OFFSET) {
+               __laabs_begin = .;
+               *(.laabs)
+               __laabs_end = .;
+       }
+
         .got : ALIGN(16) { *(.got) }
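
For reference, the address split done by relocate_laabs() can be sanity-checked
on the host with a small sketch like the one below (not part of the patch; the
helper name laabs_round_trip is made up).  It splits a 64-bit address into the
four immediate fields and reassembles it following the lu12i.w / ori / lu32i.d
/ lu52i.d semantics:

#include <assert.h>
#include <stdint.h>

/* Split an address the way relocate_laabs() fills the immediate slots,
 * then reassemble it the way the patched instruction sequence would.
 */
static uint64_t laabs_round_trip(uint64_t v)
{
	uint32_t lu12iw = (v >> 12) & 0xfffff;	/* bits 12..31 */
	uint32_t ori    =  v        & 0xfff;	/* bits  0..11 */
	uint32_t lu32id = (v >> 32) & 0xfffff;	/* bits 32..51 */
	uint32_t lu52id =  v >> 52;		/* bits 52..63 */

	/* lu12i.w: 20-bit immediate << 12, sign-extended to 64 bits */
	uint64_t r = (uint64_t)lu12iw << 12;
	if (r & 0x80000000ULL)
		r |= 0xffffffff00000000ULL;
	/* ori: fills the low 12 bits (they are zero after lu12i.w) */
	r |= ori;
	/* lu32i.d sets bits 32..51, lu52i.d then sets bits 52..63 */
	r = (r & 0x00000000ffffffffULL) | ((uint64_t)lu32id << 32);
	r = (r & 0x000fffffffffffffULL) | ((uint64_t)lu52id << 52);
	return r;
}

int main(void)
{
	assert(laabs_round_trip(0x9000000001034568ULL) == 0x9000000001034568ULL);
	assert(laabs_round_trip(0x90000000fedcba98ULL) == 0x90000000fedcba98ULL);
	return 0;
}

The sign extension done by lu12i.w (and by lu32i.d) is harmless here, because
the later instructions overwrite bits 32..63 anyway.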


Youling.
  
Youling Tang Feb. 16, 2023, 11:29 a.m. UTC | #9
On 02/16/2023 07:18 PM, Youling Tang wrote:
>
> On 02/16/2023 04:03 PM, Youling Tang wrote:
>>
>>
>> On 02/16/2023 03:10 PM, Xi Ruoyao wrote:
>>> On Thu, 2023-02-16 at 14:59 +0800, Jinyang He wrote:
>>>> +.macro la.abs reg, sym
>>>> +766:
>>>> +    nop
>>>> +    nop
>>>> +    nop
>>>> +    nop
>>>
>>> In the "formal" version we can code
>>>
>>> lu12i.w        reg, 0
>>> ori        reg, reg, 0
>>> lu32i.d        reg, 0
>>> lu52i.d        reg, reg, 0
>>>
>>> here.  Then we only need to fixup the immediate slot so we can avoid
>>> using parse_r.
>>>
>>>
>>>> +    .pushsection ".laabs", "aw", %progbits
>>>> +768:
>>>> +    .word 768b-766b
>>>> +    parse_r regno, \reg
>>>> +    .word regno
>>>> +    .dword \sym
>>>> +    .popsection
>>>> +.endm
>>
>> I will try to modify a version for testing, using the following
>> definition: when RELOCATABLE is turned on, the "la.abs" macro is
>> used; otherwise, the "la.abs" pseudo-instruction is still used as before.
>>
>> #ifdef CONFIG_RELOCATABLE
>> .macro la.abs reg, sym
>> lu12i.w        reg, 0
>> ori        reg, reg, 0
>> lu32i.d        reg, 0
>> lu52i.d        reg, reg, 0
>> .endm
>> #endif
>
> On the basis of the v4 patch set, remove patch 2 and then apply the
> following changes; the test passes on QEMU.
>
> If this method is more acceptable to everyone, I will send v5.
>
> diff --git a/arch/loongarch/include/asm/asmmacro.h b/arch/loongarch/include/asm/asmmacro.h
> index 328bb956f241..adb04ae6b208 100644
> --- a/arch/loongarch/include/asm/asmmacro.h
> +++ b/arch/loongarch/include/asm/asmmacro.h
> @@ -667,4 +667,19 @@
>         nor     \dst, \src, zero
>  .endm
>
> +#ifdef CONFIG_RELOCATABLE
> +.macro la.abs reg, sym
> +766:
> +       lu12i.w \reg, 0
> +       ori     \reg, \reg, 0
> +       lu32i.d \reg, 0
> +       lu52i.d \reg, \reg, 0
> +       .pushsection ".laabs", "aw", %progbits
> +768:
> +       .dword 768b-766b
> +       .dword \sym
> +       .popsection
> +.endm
> +#endif
> +
>  #endif /* _ASM_ASMMACRO_H */
> diff --git a/arch/loongarch/kernel/relocate.c b/arch/loongarch/kernel/relocate.c
> index 7d19cc0d2185..7ad327a554f9 100644
> --- a/arch/loongarch/kernel/relocate.c
> +++ b/arch/loongarch/kernel/relocate.c
> @@ -12,6 +12,7 @@
>  #include <linux/start_kernel.h>
>  #include <asm/bootinfo.h>
>  #include <asm/early_ioremap.h>
> +#include <asm/inst.h>
>  #include <asm/sections.h>
>
>  #define RELOCATED(x) ((void *)((long)x + reloc_offset))
> @@ -45,6 +46,32 @@ static inline __init void relocate_relative(void)
>         }
>  }
>
> +static inline void __init relocate_laabs(long offset)
> +{
> +       extern void *__laabs_begin;
> +       extern void *__laabs_end;
> +       struct laabs {
> +               long offset;
> +               long symvalue;
> +       } *p;
> +
> +       for (p = (void *)&__laabs_begin; (void *)p < (void *)&__laabs_end; p++) {
> +               long v = p->symvalue + reloc_offset;
> +               union loongarch_instruction *insn = (void *)p - p->offset + offset;
> +               u32 lu12iw, ori, lu32id, lu52id;
> +
> +               lu12iw = (v >> 12) & 0xfffff;
> +               ori = v & 0xfff;
> +               lu32id = (v >> 32) & 0xfffff;
> +               lu52id = v >> 52;
> +
> +               insn[0].reg1i20_format.immediate = lu12iw;
> +               insn[1].reg2i12_format.immediate = ori;
> +               insn[2].reg1i20_format.immediate = lu32id;
> +               insn[3].reg2i12_format.immediate = lu52id;
> +       }
> +}
> +
>  #ifdef CONFIG_RANDOMIZE_BASE
>  static inline __init unsigned long rotate_xor(unsigned long hash,
>                                               const void *area, size_t size)
> @@ -168,8 +195,10 @@ void *__init do_kaslr(void)
>                 update_reloc_offset(&reloc_offset, offset);
>         }
>
> -       if (reloc_offset)
> +       if (reloc_offset) {
>                 relocate_relative();
> +               relocate_laabs(offset);
> +       }

Self review: relocate_laabs() should be called unconditionally (the la.abs
macro assembles zero immediates that always need to be filled in), so this
should read:

         if (reloc_offset)
                 relocate_relative();

         relocate_laabs(offset);

>
>         return kernel_entry;
>  }
> @@ -181,6 +210,8 @@ void __init relocate_kernel(void)
>
>         if (reloc_offset)
>                 relocate_relative();
> +
> +       relocate_laabs(0);
>  }
>
>  /*
> diff --git a/arch/loongarch/kernel/vmlinux.lds.S b/arch/loongarch/kernel/vmlinux.lds.S
> index aec0b6567d24..0e58c68bf427 100644
> --- a/arch/loongarch/kernel/vmlinux.lds.S
> +++ b/arch/loongarch/kernel/vmlinux.lds.S
> @@ -66,6 +66,13 @@ SECTIONS
>                 __alt_instructions_end = .;
>         }
>
> +       . = ALIGN(8);
> +       .laabs : AT(ADDR(.laabs) - LOAD_OFFSET) {
> +               __laabs_begin = .;
> +               *(.laabs)
> +               __laabs_end = .;
> +       }
> +
>         .got : ALIGN(16) { *(.got) }
>
>
> Youling.
>
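
As a host-side sanity check of the pcalau12i fixup arithmetic in
reloc_handler() (see the patch below), something along these lines can be
used.  It is only a sketch: the helper name is made up, it assumes la.pcrel
expands to pcalau12i / addi.d / lu32i.d / lu52i.d / add.d, and it assumes the
copied handler stays within +/-2 GiB of the target symbol:

#include <assert.h>
#include <stdint.h>

/* Compute the immediates the way reloc_handler() does for the pcalau12i
 * form, then evaluate the patched sequence for a handler copied to pc,
 * targeting symbol address v.
 */
static uint64_t pcrel_round_trip(uint64_t pc, uint64_t v)
{
	int32_t  offset_hi20 = ((v + 0x800) & ~0xfffULL) - (pc & ~0xfffULL);
	uint64_t anchor      = (pc & ~0xfffULL) + offset_hi20;
	uint64_t offset_rem  = v - anchor;

	uint32_t imm0 = (offset_hi20 >> 12) & 0xfffff;	/* pcalau12i */
	uint32_t imm1 = v & 0xfff;			/* addi.d    */
	uint32_t imm2 = (offset_rem >> 32) & 0xfffff;	/* lu32i.d   */
	uint32_t imm3 = offset_rem >> 52;		/* lu52i.d   */

	/* pcalau12i r1, imm0: r1 = (pc & ~0xfff) + sext(imm0 << 12) */
	uint64_t hi = (uint64_t)imm0 << 12;
	if (hi & 0x80000000ULL)
		hi |= 0xffffffff00000000ULL;
	uint64_t r1 = (pc & ~0xfffULL) + hi;

	/* addi.d r2, zero, imm1: sign-extended 12-bit immediate */
	uint64_t r2 = (imm1 & 0x800) ? (uint64_t)imm1 - 0x1000 : imm1;
	/* lu32i.d r2, imm2, then lu52i.d r2, r2, imm3 */
	r2 = (r2 & 0x00000000ffffffffULL) | ((uint64_t)imm2 << 32);
	r2 = (r2 & 0x000fffffffffffffULL) | ((uint64_t)imm3 << 52);

	/* add.d r1, r1, r2 */
	return r1 + r2;
}

int main(void)
{
	/* arbitrary example addresses; the second has lo12 >= 0x800 */
	assert(pcrel_round_trip(0x9000000000200030ULL, 0x9000000001034568ULL)
	       == 0x9000000001034568ULL);
	assert(pcrel_round_trip(0x9000000000200030ULL, 0x90000000002ffa98ULL)
	       == 0x90000000002ffa98ULL);
	return 0;
}

The pcaddu12i branch (emitted by older binutils) is not covered by this sketch.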
  

Patch

diff --git a/arch/loongarch/include/asm/inst.h b/arch/loongarch/include/asm/inst.h
index 7eedd83fd0d7..426054518a3d 100644
--- a/arch/loongarch/include/asm/inst.h
+++ b/arch/loongarch/include/asm/inst.h
@@ -32,6 +32,7 @@  enum reg1i20_op {
 	lu12iw_op	= 0x0a,
 	lu32id_op	= 0x0b,
 	pcaddi_op	= 0x0c,
+	pcalau12i_op	= 0x0d,
 	pcaddu12i_op	= 0x0e,
 	pcaddu18i_op	= 0x0f,
 };
diff --git a/arch/loongarch/include/asm/setup.h b/arch/loongarch/include/asm/setup.h
index 72ead58039f3..f0a2b34365f1 100644
--- a/arch/loongarch/include/asm/setup.h
+++ b/arch/loongarch/include/asm/setup.h
@@ -11,6 +11,9 @@ 
 
 #define VECSIZE 0x200
 
+struct handler_reloc;
+
+extern struct handler_reloc *eentry_reloc[];
 extern unsigned long eentry;
 extern unsigned long tlbrentry;
 extern char init_command_line[COMMAND_LINE_SIZE];
@@ -18,7 +21,8 @@  extern void tlb_init(int cpu);
 extern void cpu_cache_init(void);
 extern void cache_error_setup(void);
 extern void per_cpu_trap_init(int cpu);
-extern void set_handler(unsigned long offset, void *addr, unsigned long len);
+extern void set_handler(unsigned long exccode, void *addr);
 extern void set_merr_handler(unsigned long offset, void *addr, unsigned long len);
+extern void reloc_handler(unsigned long handler, struct handler_reloc *rel);
 
 #endif /* __SETUP_H */
diff --git a/arch/loongarch/include/asm/stackframe.h b/arch/loongarch/include/asm/stackframe.h
index 7deb043ce387..bbec1e56b61b 100644
--- a/arch/loongarch/include/asm/stackframe.h
+++ b/arch/loongarch/include/asm/stackframe.h
@@ -77,7 +77,8 @@ 
  * new value in sp.
  */
 	.macro	get_saved_sp docfi=0
-	la.abs	  t1, kernelsp
+	/* The label is used for generating reloc tables for handlers */
+514:	la.pcrel  t1, t0, kernelsp
 #ifdef CONFIG_SMP
 	csrrd	  t0, PERCPU_BASE_KS
 	LONG_ADD  t1, t1, t0
diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S
index 7e5c293ed89f..005a10fe5a50 100644
--- a/arch/loongarch/kernel/genex.S
+++ b/arch/loongarch/kernel/genex.S
@@ -34,7 +34,7 @@  SYM_FUNC_END(__arch_cpu_idle)
 SYM_FUNC_START(handle_vint)
 	BACKUP_T0T1
 	SAVE_ALL
-	la.abs	t1, __arch_cpu_idle
+0:	la.pcrel t1, t2, __arch_cpu_idle
 	LONG_L	t0, sp, PT_ERA
 	/* 32 byte rollback region */
 	ori	t0, t0, 0x1f
@@ -43,11 +43,25 @@  SYM_FUNC_START(handle_vint)
 	LONG_S	t0, sp, PT_ERA
 1:	move	a0, sp
 	move	a1, sp
-	la.abs	t0, do_vint
+2:	la.pcrel t0, t2, do_vint
 	jirl	ra, t0, 0
 	RESTORE_ALL_AND_RET
 SYM_FUNC_END(handle_vint)
 
+SYM_DATA_START(rel_handle_vint)
+LONG	3
+
+LONG	514b - handle_vint
+LONG	kernelsp
+
+LONG	0b - handle_vint
+LONG	__arch_cpu_idle
+
+LONG	2b - handle_vint
+LONG	do_vint
+
+SYM_DATA_END(rel_handle_vint)
+
 SYM_FUNC_START(except_vec_cex)
 	b	cache_parity_error
 SYM_FUNC_END(except_vec_cex)
@@ -72,12 +86,24 @@  SYM_FUNC_END(except_vec_cex)
 	SAVE_ALL
 	build_prep_\prep
 	move	a0, sp
-	la.abs	t0, do_\handler
+	667:
+	la.pcrel t0, t1, do_\handler
 	jirl	ra, t0, 0
 	668:
 	RESTORE_ALL_AND_RET
 	SYM_FUNC_END(handle_\exception)
 	SYM_DATA(unwind_hint_\exception, .word 668b - 666b)
+
+	SYM_DATA_START(rel_handle_\exception)
+	LONG	2
+
+	LONG	514b - 666b
+	LONG	kernelsp
+
+	LONG	667b - 666b
+	LONG	do_\handler
+
+	SYM_DATA_END(rel_handle_\exception)
 	.endm
 
 	BUILD_HANDLER ade ade badv
@@ -93,6 +119,12 @@  SYM_FUNC_END(except_vec_cex)
 	BUILD_HANDLER reserved reserved none	/* others */
 
 SYM_FUNC_START(handle_sys)
-	la.abs	t0, handle_syscall
+	la.pcrel t0, t1, handle_syscall
 	jr	t0
 SYM_FUNC_END(handle_sys)
+
+SYM_DATA_START(rel_handle_sys)
+LONG	1
+LONG	0
+LONG	handle_syscall
+SYM_DATA_END(rel_handle_sys)
diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c
index c38a146a973b..7e073854f493 100644
--- a/arch/loongarch/kernel/traps.c
+++ b/arch/loongarch/kernel/traps.c
@@ -62,6 +62,127 @@  extern asmlinkage void handle_reserved(void);
 extern asmlinkage void handle_watch(void);
 extern asmlinkage void handle_vint(void);
 
+struct handler_reloc_entry {
+	unsigned long offset;
+	unsigned long sym;
+};
+
+struct handler_reloc {
+	unsigned long cnt;
+	struct handler_reloc_entry entries[];
+};
+
+extern struct handler_reloc rel_handle_tlb_load;
+extern struct handler_reloc rel_handle_tlb_store;
+extern struct handler_reloc rel_handle_tlb_modify;
+extern struct handler_reloc rel_handle_tlb_protect;
+extern struct handler_reloc rel_handle_ade;
+extern struct handler_reloc rel_handle_ale;
+extern struct handler_reloc rel_handle_sys;
+extern struct handler_reloc rel_handle_bp;
+extern struct handler_reloc rel_handle_ri;
+extern struct handler_reloc rel_handle_fpu;
+extern struct handler_reloc rel_handle_lsx;
+extern struct handler_reloc rel_handle_lasx;
+extern struct handler_reloc rel_handle_fpe;
+extern struct handler_reloc rel_handle_lbt;
+extern struct handler_reloc rel_handle_watch;
+extern struct handler_reloc rel_handle_reserved;
+extern struct handler_reloc rel_handle_vint;
+
+struct handler_reloc *eentry_reloc[128] = {
+	[0] = NULL, /* merr handler */
+	[EXCCODE_TLBL] = &rel_handle_tlb_load,
+	[EXCCODE_TLBS] = &rel_handle_tlb_store,
+	[EXCCODE_TLBI] = &rel_handle_tlb_load,
+	[EXCCODE_TLBM] = &rel_handle_tlb_modify,
+	[EXCCODE_TLBNR] = &rel_handle_tlb_protect,
+	[EXCCODE_TLBNX] = &rel_handle_tlb_protect,
+	[EXCCODE_TLBPE] = &rel_handle_tlb_protect,
+	[EXCCODE_ADE] = &rel_handle_ade,
+	[EXCCODE_ALE] = &rel_handle_ale,
+	[EXCCODE_SYS] = &rel_handle_sys,
+	[EXCCODE_BP] = &rel_handle_bp,
+	[EXCCODE_INE] = &rel_handle_ri,
+	[EXCCODE_IPE] = &rel_handle_ri,
+	[EXCCODE_FPDIS] = &rel_handle_fpu,
+	[EXCCODE_LSXDIS] = &rel_handle_lsx,
+	[EXCCODE_LASXDIS] = &rel_handle_lasx,
+	[EXCCODE_FPE] = &rel_handle_fpe,
+	[EXCCODE_BTDIS] = &rel_handle_lbt,
+	[EXCCODE_WATCH] = &rel_handle_watch,
+	[(EXCCODE_WATCH + 1) ... (EXCCODE_INT_START - 1)] = &rel_handle_reserved,
+	[EXCCODE_INT_START ... (EXCCODE_INT_END - 1)] = &rel_handle_vint,
+};
+
+void reloc_handler(unsigned long handler, struct handler_reloc *rel)
+{
+	if (!rel)
+		return;
+
+	for (unsigned long i = 0; i < rel->cnt; i++) {
+		unsigned long pc = handler + rel->entries[i].offset;
+		union loongarch_instruction *insn =
+			(union loongarch_instruction *)pc;
+		u32 imm[4];
+		unsigned long v = rel->entries[i].sym;
+
+		/* GNU as >= 2.40 uses pcalau12i for la.pcrel, but GNU ld <= 2.39
+		 * uses pcaddu12i.
+		 */
+		if (insn->reg1i20_format.opcode == pcalau12i_op) {
+			/* Use s32 deliberately for sign extension. */
+			s32 offset_hi20 = ((v + 0x800) & ~0xfff) -
+					  (pc & ~0xfff);
+			unsigned long anchor = (pc & ~0xfff) + offset_hi20;
+			unsigned long offset_rem = v - anchor;
+
+			imm[0] = (offset_hi20 >> 12) & 0xfffff;
+			imm[1] = v & 0xfff;
+			imm[2] = (offset_rem >> 32) & 0xfffff;
+			imm[3] = offset_rem >> 52;
+		} else if (insn->reg1i20_format.opcode == pcaddu12i_op) {
+			/* Use s32 deliberately for sign extension. */
+			s32 offset_lo = v - pc;
+			unsigned long offset_hi = v - pc - offset_lo;
+
+			imm[0] = (offset_lo >> 12) & 0xfffff;
+			imm[1] = offset_lo & 0xfff;
+			imm[2] = (offset_hi >> 32) & 0xfffff;
+			imm[3] = offset_hi >> 52;
+		} else
+			panic("Cannot fixup la.pcrel for exception handler at 0x%lx: unexpected instruction 0x%x!",
+			      pc, insn->word);
+
+		insn[0].reg1i20_format.immediate = imm[0];
+		insn[1].reg2i12_format.immediate = imm[1];
+		insn[2].reg1i20_format.immediate = imm[2];
+		insn[3].reg2i12_format.immediate = imm[3];
+	}
+}
+
+/* Install CPU exception handler */
+static void do_set_handler(unsigned long exccode, void *addr,
+			   struct handler_reloc *rel)
+{
+	unsigned long dest_addr = eentry + exccode * VECSIZE;
+
+	memcpy((void *)dest_addr, addr, VECSIZE);
+	reloc_handler(dest_addr, rel);
+	local_flush_icache_range(dest_addr, dest_addr + VECSIZE);
+}
+
+/* Install CPU exception handler, with the reloc table from eentry_reloc */
+void set_handler(unsigned long exccode, void *addr)
+{
+	do_set_handler(exccode, addr, eentry_reloc[exccode]);
+}
+
+static void set_handler_reserved(unsigned long exccode)
+{
+	do_set_handler(exccode, handle_reserved, &rel_handle_reserved);
+}
+
 static void show_backtrace(struct task_struct *task, const struct pt_regs *regs,
 			   const char *loglvl, bool user)
 {
@@ -704,19 +825,12 @@  void per_cpu_trap_init(int cpu)
 	/* Initialise exception handlers */
 	if (cpu == 0)
 		for (i = 0; i < 64; i++)
-			set_handler(i * VECSIZE, handle_reserved, VECSIZE);
+			set_handler_reserved(i);
 
 	tlb_init(cpu);
 	cpu_cache_init();
 }
 
-/* Install CPU exception handler */
-void set_handler(unsigned long offset, void *addr, unsigned long size)
-{
-	memcpy((void *)(eentry + offset), addr, size);
-	local_flush_icache_range(eentry + offset, eentry + offset + size);
-}
-
 static const char panic_null_cerr[] =
 	"Trying to set NULL cache error exception handler\n";
 
@@ -741,20 +855,20 @@  void __init trap_init(void)
 
 	/* Set interrupt vector handler */
 	for (i = EXCCODE_INT_START; i < EXCCODE_INT_END; i++)
-		set_handler(i * VECSIZE, handle_vint, VECSIZE);
-
-	set_handler(EXCCODE_ADE * VECSIZE, handle_ade, VECSIZE);
-	set_handler(EXCCODE_ALE * VECSIZE, handle_ale, VECSIZE);
-	set_handler(EXCCODE_SYS * VECSIZE, handle_sys, VECSIZE);
-	set_handler(EXCCODE_BP * VECSIZE, handle_bp, VECSIZE);
-	set_handler(EXCCODE_INE * VECSIZE, handle_ri, VECSIZE);
-	set_handler(EXCCODE_IPE * VECSIZE, handle_ri, VECSIZE);
-	set_handler(EXCCODE_FPDIS * VECSIZE, handle_fpu, VECSIZE);
-	set_handler(EXCCODE_LSXDIS * VECSIZE, handle_lsx, VECSIZE);
-	set_handler(EXCCODE_LASXDIS * VECSIZE, handle_lasx, VECSIZE);
-	set_handler(EXCCODE_FPE * VECSIZE, handle_fpe, VECSIZE);
-	set_handler(EXCCODE_BTDIS * VECSIZE, handle_lbt, VECSIZE);
-	set_handler(EXCCODE_WATCH * VECSIZE, handle_watch, VECSIZE);
+		set_handler(i, handle_vint);
+
+	set_handler(EXCCODE_ADE, handle_ade);
+	set_handler(EXCCODE_ALE, handle_ale);
+	set_handler(EXCCODE_SYS, handle_sys);
+	set_handler(EXCCODE_BP, handle_bp);
+	set_handler(EXCCODE_INE, handle_ri);
+	set_handler(EXCCODE_IPE, handle_ri);
+	set_handler(EXCCODE_FPDIS, handle_fpu);
+	set_handler(EXCCODE_LSXDIS, handle_lsx);
+	set_handler(EXCCODE_LASXDIS, handle_lasx);
+	set_handler(EXCCODE_FPE, handle_fpe);
+	set_handler(EXCCODE_BTDIS, handle_lbt);
+	set_handler(EXCCODE_WATCH, handle_watch);
 
 	cache_error_setup();
 
diff --git a/arch/loongarch/mm/tlb.c b/arch/loongarch/mm/tlb.c
index 8bad6b0cff59..6f70aab7202a 100644
--- a/arch/loongarch/mm/tlb.c
+++ b/arch/loongarch/mm/tlb.c
@@ -253,7 +253,6 @@  static void output_pgtable_bits_defines(void)
 #ifdef CONFIG_NUMA
 unsigned long pcpu_handlers[NR_CPUS];
 #endif
-extern long exception_handlers[VECSIZE * 128 / sizeof(long)];
 
 void setup_tlb_handler(int cpu)
 {
@@ -264,19 +263,20 @@  void setup_tlb_handler(int cpu)
 	if (cpu == 0) {
 		memcpy((void *)tlbrentry, handle_tlb_refill, 0x80);
 		local_flush_icache_range(tlbrentry, tlbrentry + 0x80);
-		set_handler(EXCCODE_TLBI * VECSIZE, handle_tlb_load, VECSIZE);
-		set_handler(EXCCODE_TLBL * VECSIZE, handle_tlb_load, VECSIZE);
-		set_handler(EXCCODE_TLBS * VECSIZE, handle_tlb_store, VECSIZE);
-		set_handler(EXCCODE_TLBM * VECSIZE, handle_tlb_modify, VECSIZE);
-		set_handler(EXCCODE_TLBNR * VECSIZE, handle_tlb_protect, VECSIZE);
-		set_handler(EXCCODE_TLBNX * VECSIZE, handle_tlb_protect, VECSIZE);
-		set_handler(EXCCODE_TLBPE * VECSIZE, handle_tlb_protect, VECSIZE);
+		set_handler(EXCCODE_TLBI, handle_tlb_load);
+		set_handler(EXCCODE_TLBL, handle_tlb_load);
+		set_handler(EXCCODE_TLBS, handle_tlb_store);
+		set_handler(EXCCODE_TLBM, handle_tlb_modify);
+		set_handler(EXCCODE_TLBNR, handle_tlb_protect);
+		set_handler(EXCCODE_TLBNX, handle_tlb_protect);
+		set_handler(EXCCODE_TLBPE, handle_tlb_protect);
 	}
 #ifdef CONFIG_NUMA
 	else {
 		void *addr;
+		unsigned long addr_ul;
 		struct page *page;
-		const int vec_sz = sizeof(exception_handlers);
+		const int vec_sz = VECSIZE * 128;
 
 		if (pcpu_handlers[cpu])
 			return;
@@ -286,8 +286,11 @@  void setup_tlb_handler(int cpu)
 			return;
 
 		addr = page_address(page);
+		addr_ul = (unsigned long)addr;
 		pcpu_handlers[cpu] = (unsigned long)addr;
-		memcpy((void *)addr, (void *)eentry, vec_sz);
+		memcpy(addr, (void *)eentry, vec_sz);
+		for (unsigned long i = 0; i < 128; i++)
+			reloc_handler(addr_ul + i * VECSIZE, eentry_reloc[i]);
 		local_flush_icache_range((unsigned long)addr, (unsigned long)addr + vec_sz);
 		csr_write64(pcpu_handlers[cpu], LOONGARCH_CSR_EENTRY);
 		csr_write64(pcpu_handlers[cpu], LOONGARCH_CSR_MERRENTRY);
diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S
index 3dd2a9615cd9..044c2190771a 100644
--- a/arch/loongarch/mm/tlbex.S
+++ b/arch/loongarch/mm/tlbex.S
@@ -39,11 +39,21 @@  SYM_FUNC_START(handle_tlb_protect)
 	move		a1, zero
 	csrrd		a2, LOONGARCH_CSR_BADV
 	REG_S		a2, sp, PT_BVADDR
-	la.abs		t0, do_page_fault
+1:	la.pcrel	t0, t1, do_page_fault
 	jirl		ra, t0, 0
 	RESTORE_ALL_AND_RET
 SYM_FUNC_END(handle_tlb_protect)
 
+SYM_DATA_START(rel_handle_tlb_protect)
+	LONG	2
+
+	LONG	514b - handle_tlb_protect
+	LONG	kernelsp
+
+	LONG	1b - handle_tlb_protect
+	LONG	do_page_fault
+SYM_DATA_END(rel_handle_tlb_protect)
+
 SYM_FUNC_START(handle_tlb_load)
 	csrwr		t0, EXCEPTION_KS0
 	csrwr		t1, EXCEPTION_KS1
@@ -115,7 +125,8 @@  smp_pgtable_change_load:
 
 #ifdef CONFIG_64BIT
 vmalloc_load:
-	la.abs		t1, swapper_pg_dir
+	/* The first insn of vmalloc_done_load overwrites ra */
+1:	la.pcrel	t1, ra, swapper_pg_dir
 	b		vmalloc_done_load
 #endif
 
@@ -186,10 +197,24 @@  tlb_huge_update_load:
 nopage_tlb_load:
 	dbar		0
 	csrrd		ra, EXCEPTION_KS2
-	la.abs		t0, tlb_do_page_fault_0
+2:	la.pcrel	t0, t1, tlb_do_page_fault_0
 	jr		t0
 SYM_FUNC_END(handle_tlb_load)
 
+SYM_DATA_START(rel_handle_tlb_load)
+#ifdef CONFIG_64BIT
+	LONG	2
+
+	LONG	1b - handle_tlb_load
+	LONG	swapper_pg_dir
+#else
+	LONG	1
+#endif
+
+	LONG	2b - handle_tlb_load
+	LONG	tlb_do_page_fault_0
+SYM_DATA_END(rel_handle_tlb_load)
+
 SYM_FUNC_START(handle_tlb_store)
 	csrwr		t0, EXCEPTION_KS0
 	csrwr		t1, EXCEPTION_KS1
@@ -262,7 +287,8 @@  smp_pgtable_change_store:
 
 #ifdef CONFIG_64BIT
 vmalloc_store:
-	la.abs		t1, swapper_pg_dir
+	/* The first insn of vmalloc_done_store overwrites ra */
+1:	la.pcrel	t1, ra, swapper_pg_dir
 	b		vmalloc_done_store
 #endif
 
@@ -335,10 +361,24 @@  tlb_huge_update_store:
 nopage_tlb_store:
 	dbar		0
 	csrrd		ra, EXCEPTION_KS2
-	la.abs		t0, tlb_do_page_fault_1
+2:	la.pcrel	t0, t1, tlb_do_page_fault_1
 	jr		t0
 SYM_FUNC_END(handle_tlb_store)
 
+SYM_DATA_START(rel_handle_tlb_store)
+#ifdef CONFIG_64BIT
+	LONG	2
+
+	LONG	1b - handle_tlb_store
+	LONG	swapper_pg_dir
+#else
+	LONG	1
+#endif
+
+	LONG	2b - handle_tlb_store
+	LONG	tlb_do_page_fault_1
+SYM_DATA_END(rel_handle_tlb_store)
+
 SYM_FUNC_START(handle_tlb_modify)
 	csrwr		t0, EXCEPTION_KS0
 	csrwr		t1, EXCEPTION_KS1
@@ -410,7 +450,8 @@  smp_pgtable_change_modify:
 
 #ifdef CONFIG_64BIT
 vmalloc_modify:
-	la.abs		t1, swapper_pg_dir
+	/* The first insn of vmalloc_done_modify overwrites ra */
+1:	la.pcrel	t1, ra, swapper_pg_dir
 	b		vmalloc_done_modify
 #endif
 
@@ -482,10 +523,24 @@  tlb_huge_update_modify:
 nopage_tlb_modify:
 	dbar		0
 	csrrd		ra, EXCEPTION_KS2
-	la.abs		t0, tlb_do_page_fault_1
+2:	la.pcrel	t0, t1, tlb_do_page_fault_1
 	jr		t0
 SYM_FUNC_END(handle_tlb_modify)
 
+SYM_DATA_START(rel_handle_tlb_modify)
+#ifdef CONFIG_64BIT
+	LONG	2
+
+	LONG	1b - handle_tlb_modify
+	LONG	swapper_pg_dir
+#else
+	LONG	1
+#endif
+
+	LONG	2b - handle_tlb_modify
+	LONG	tlb_do_page_fault_1
+SYM_DATA_END(rel_handle_tlb_modify)
+
 SYM_FUNC_START(handle_tlb_refill)
 	csrwr		t0, LOONGARCH_CSR_TLBRSAVE
 	csrrd		t0, LOONGARCH_CSR_PGD