[v4,2/3] LoongArch: Add ptrace single step support

Message ID 20230217023745.20800-3-zhangqing@loongson.cn
State New
Headers
Series LoongArch: Add hardware breakpoints/watchpoints support |

Commit Message

Qing Zhang Feb. 17, 2023, 2:37 a.m. UTC
  Use the generic ptrace_resume code for PTRACE_SYSCALL, PTRACE_CONT,
PTRACE_KILL and PTRACE_SINGLESTEP. This implies defining
arch_has_single_step in  and implementing the
user_enable_single_step and user_disable_single_step functions.

LongArch has no hardware single-step register. the hardware single-step
function multiplex fetch instruction watchpoint(FWPS) and specifies that
the next instruction must trigger the watch exception by setting the mask bit.
Some scenarios use CSR.FWPS.Skip to ignore the next hit result, not to trigger
the watchpoint exception, and proceed to the next instruction.

Signed-off-by: Qing Zhang <zhangqing@loongson.cn>
---
 arch/loongarch/include/asm/inst.h      | 39 +++++++++++++++
 arch/loongarch/include/asm/loongarch.h |  3 ++
 arch/loongarch/include/asm/processor.h |  3 ++
 arch/loongarch/include/asm/ptrace.h    |  2 +
 arch/loongarch/kernel/hw_breakpoint.c  | 35 +++++++++++--
 arch/loongarch/kernel/ptrace.c         | 68 ++++++++++++++++++++++++++
 arch/loongarch/kernel/traps.c          | 34 +++++++++++--
 7 files changed, 176 insertions(+), 8 deletions(-)
  

Comments

Youling Tang Feb. 17, 2023, 9:50 a.m. UTC | #1
Hi, Qing

On 02/17/2023 10:37 AM, Qing Zhang wrote:
> Use the generic ptrace_resume code for PTRACE_SYSCALL, PTRACE_CONT,
> PTRACE_KILL and PTRACE_SINGLESTEP. This implies defining
> arch_has_single_step in  and implementing the
> user_enable_single_step and user_disable_single_step functions.
>
> LongArch has no hardware single-step register. the hardware single-step
> function multiplex fetch instruction watchpoint(FWPS) and specifies that
> the next instruction must trigger the watch exception by setting the mask bit.
> Some scenarios use CSR.FWPS.Skip to ignore the next hit result, not to trigger
> the watchpoint exception, and proceed to the next instruction.
>
> Signed-off-by: Qing Zhang <zhangqing@loongson.cn>
> ---
>  arch/loongarch/include/asm/inst.h      | 39 +++++++++++++++
>  arch/loongarch/include/asm/loongarch.h |  3 ++
>  arch/loongarch/include/asm/processor.h |  3 ++
>  arch/loongarch/include/asm/ptrace.h    |  2 +
>  arch/loongarch/kernel/hw_breakpoint.c  | 35 +++++++++++--
>  arch/loongarch/kernel/ptrace.c         | 68 ++++++++++++++++++++++++++
>  arch/loongarch/kernel/traps.c          | 34 +++++++++++--
>  7 files changed, 176 insertions(+), 8 deletions(-)
>
> diff --git a/arch/loongarch/include/asm/inst.h b/arch/loongarch/include/asm/inst.h
> index ba18ce8fbdf2..00c6f261d9a2 100644
> --- a/arch/loongarch/include/asm/inst.h
> +++ b/arch/loongarch/include/asm/inst.h
> @@ -368,6 +368,45 @@ static inline bool is_stack_alloc_ins(union loongarch_instruction *ip)
>  		is_imm12_negative(ip->reg2i12_format.immediate);
>  }
>
> +static inline bool branch_ins_target_pc(union loongarch_instruction *ip)
> +{
> +	switch (ip->reg0i26_format.opcode) {
> +	case b_op:
> +	case bl_op:
> +		if (ip->reg0i26_format.immediate_l == 0
> +		   && ip->reg0i26_format.immediate_h == 0)
> +			return false;
> +	}
> +
> +	switch (ip->reg1i21_format.opcode) {
> +	case beqz_op:
> +	case bnez_op:
> +	case bceqz_op:
> +		if (ip->reg1i21_format.immediate_l == 0
> +		   && ip->reg1i21_format.immediate_h == 0)
> +			return false;
> +	}
> +
> +	switch (ip->reg2i16_format.opcode) {
> +	case jirl_op:
> +		if (ip->reg2i16_format.rj == 0x1
> +		   && ip->reg2i16_format.rd == 0x1
LOONGARCH_GPR_RA can be used instead of 0x1.

> +		   && ip->reg2i16_format.immediate == 0)
> +			return false;
> +		break;
> +	case beq_op:
> +	case bne_op:
> +	case blt_op:
> +	case bge_op:
> +	case bltu_op:
> +	case bgeu_op:
> +		if (ip->reg2i16_format.immediate == 0)
> +			return false;
> +	}
> +
> +	return true;
> +}
> +
>  void simu_pc(struct pt_regs *regs, union loongarch_instruction insn);
>  void simu_branch(struct pt_regs *regs, union loongarch_instruction insn);
>
> diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h
> index e9aed583a064..65b7dcdea16d 100644
> --- a/arch/loongarch/include/asm/loongarch.h
> +++ b/arch/loongarch/include/asm/loongarch.h
> @@ -1055,6 +1055,9 @@ static __always_inline void iocsr_write64(u64 val, u32 reg)
>  #define LOONGARCH_CSR_DERA		0x501	/* debug era */
>  #define LOONGARCH_CSR_DESAVE		0x502	/* debug save */
>
> +#define CSR_FWPC_SKIP_SHIFT		16
> +#define CSR_FWPC_SKIP			(_ULCAST_(1) << CSR_FWPC_SKIP_SHIFT)
> +
>  /*
>   * CSR_ECFG IM
>   */
> diff --git a/arch/loongarch/include/asm/processor.h b/arch/loongarch/include/asm/processor.h
> index db060c5a976f..3ea0f1910c23 100644
> --- a/arch/loongarch/include/asm/processor.h
> +++ b/arch/loongarch/include/asm/processor.h
> @@ -131,6 +131,9 @@ struct thread_struct {
>  	struct perf_event	*hbp_break[LOONGARCH_MAX_BRP];
>  	struct perf_event	*hbp_watch[LOONGARCH_MAX_WRP];
>
> +	/* Used by ptrace single_step */
> +	unsigned long single_step;
> +
>  	/*
>  	 * FPU & vector registers, must be at last because
>  	 * they are conditionally copied at fork().
> diff --git a/arch/loongarch/include/asm/ptrace.h b/arch/loongarch/include/asm/ptrace.h
> index 58596c4f8a0f..66a0e6c480a3 100644
> --- a/arch/loongarch/include/asm/ptrace.h
> +++ b/arch/loongarch/include/asm/ptrace.h
> @@ -150,4 +150,6 @@ static inline void user_stack_pointer_set(struct pt_regs *regs,
>  	regs->regs[3] = val;
>  }
>
> +#define arch_has_single_step()		(1)
> +
>  #endif /* _ASM_PTRACE_H */
> diff --git a/arch/loongarch/kernel/hw_breakpoint.c b/arch/loongarch/kernel/hw_breakpoint.c
> index 6431cd319c32..75d3652fbe00 100644
> --- a/arch/loongarch/kernel/hw_breakpoint.c
> +++ b/arch/loongarch/kernel/hw_breakpoint.c
> @@ -153,6 +153,22 @@ static int hw_breakpoint_slot_setup(struct perf_event **slots, int max_slots,
>   */
>  void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
>  {
> +	int i;
> +	struct thread_struct *t = &tsk->thread;
> +
> +	for (i = 0; i < LOONGARCH_MAX_BRP; i++) {
> +		if (t->hbp_break[i]) {
> +			unregister_hw_breakpoint(t->hbp_break[i]);
> +			t->hbp_break[i] = NULL;
> +		}
> +	}
> +
> +	for (i = 0; i < LOONGARCH_MAX_WRP; i++) {
> +		if (t->hbp_watch[i]) {
> +			unregister_hw_breakpoint(t->hbp_watch[i]);
> +			t->hbp_watch[i] = NULL;
> +		}
> +	}
>  }
>
>  void ptrace_hw_copy_thread(struct task_struct *tsk)
> @@ -498,11 +514,20 @@ arch_initcall(arch_hw_breakpoint_init);
>  void hw_breakpoint_thread_switch(struct task_struct *next)
>  {
>  	struct pt_regs *regs = task_pt_regs(next);
> -
> -	/* Update breakpoints */
> -	update_bp_registers(regs, 1, 0);
> -	/* Update watchpoints */
> -	update_bp_registers(regs, 1, 1);
> +	u64 addr, mask;
> +
> +	if (test_bit(TIF_SINGLESTEP, &task_thread_info(next)->flags)) {
> +		addr = read_wb_reg(CSR_CFG_ADDR, 0, 0);
> +		mask = read_wb_reg(CSR_CFG_MASK, 0, 0);
> +		if ((task_pt_regs(next)->csr_era & ~mask) == (addr & ~mask))
> +			csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
> +		regs->csr_prmd |= CSR_PRMD_PWE;
> +	} else {
> +		/* Update breakpoints */
> +		update_bp_registers(regs, 1, 0);
> +		/* Update watchpoints */
> +		update_bp_registers(regs, 1, 1);
> +	}
>  }
>
>  void hw_breakpoint_pmu_read(struct perf_event *bp)
> diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c
> index bee4194177fd..52a3ee4366f4 100644
> --- a/arch/loongarch/kernel/ptrace.c
> +++ b/arch/loongarch/kernel/ptrace.c
> @@ -20,6 +20,7 @@
>  #include <linux/context_tracking.h>
>  #include <linux/elf.h>
>  #include <linux/errno.h>
> +#include <linux/hw_breakpoint.h>
>  #include <linux/mm.h>
>  #include <linux/ptrace.h>
>  #include <linux/regset.h>
> @@ -30,6 +31,7 @@
>  #include <linux/stddef.h>
>  #include <linux/seccomp.h>
>  #include <linux/uaccess.h>
> +#include <linux/thread_info.h>
>
>  #include <asm/byteorder.h>
>  #include <asm/cpu.h>
> @@ -39,6 +41,7 @@
>  #include <asm/page.h>
>  #include <asm/pgtable.h>
>  #include <asm/processor.h>
> +#include <asm/ptrace.h>
>  #include <asm/reg.h>
>  #include <asm/syscall.h>
>
> @@ -541,3 +544,68 @@ long arch_ptrace(struct task_struct *child, long request,
>
>  	return ret;
>  }
> +
> +void ptrace_triggered(struct perf_event *bp,
> +		      struct perf_sample_data *data, struct pt_regs *regs)
> +{
> +	struct perf_event_attr attr;
> +
> +	attr = bp->attr;
> +	attr.disabled = true;
> +	modify_user_hw_breakpoint(bp, &attr);
> +}
> +
> +static int set_single_step(struct task_struct *tsk, unsigned long addr)
> +{
> +	struct thread_struct *thread = &tsk->thread;
> +	struct perf_event *bp;
> +	struct perf_event_attr attr;
> +	struct arch_hw_breakpoint *info;
> +
> +	bp = thread->hbp_break[0];
> +	if (!bp) {
> +		ptrace_breakpoint_init(&attr);
> +
> +		attr.bp_addr = addr;
> +		attr.bp_len = HW_BREAKPOINT_LEN_8;
> +		attr.bp_type = HW_BREAKPOINT_X;
> +
> +		bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
> +						 NULL, tsk);
> +		if (IS_ERR(bp))
> +			return PTR_ERR(bp);
> +
> +		thread->hbp_break[0] = bp;
> +	} else {
> +		int err;
> +
> +		attr = bp->attr;
> +		attr.bp_addr = addr;
> +		/* reenable breakpoint */
> +		attr.disabled = false;
> +		err = modify_user_hw_breakpoint(bp, &attr);
> +		if (unlikely(err))
> +			return err;
> +
> +		csr_write64(attr.bp_addr, LOONGARCH_CSR_IB0ADDR);
> +	}
> +	info = counter_arch_bp(bp);
> +	info->mask = 0xffffffffffff;
If `mask` is only used for user space address mask, it should not be
fixed to 0xffffffffffff, the user space address size may have many
situations, which can be defined according to TASK_SIZE.

Use (TASK_SIZE - 1) instead.

> +
> +	return 0;
> +}
> +
> +/* ptrace API */
> +void user_enable_single_step(struct task_struct *task)
> +{
> +	struct thread_info *ti = task_thread_info(task);
> +
> +	set_single_step(task, task_pt_regs(task)->csr_era);
> +	task->thread.single_step = task_pt_regs(task)->csr_era;
> +	set_ti_thread_flag(ti, TIF_SINGLESTEP);
> +}
> +
> +void user_disable_single_step(struct task_struct *task)
> +{
> +	clear_tsk_thread_flag(task, TIF_SINGLESTEP);
> +}
> diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c
> index 2b133079e0f3..a59275a43bd1 100644
> --- a/arch/loongarch/kernel/traps.c
> +++ b/arch/loongarch/kernel/traps.c
> @@ -511,9 +511,37 @@ asmlinkage void noinstr do_watch(struct pt_regs *regs)
>  #ifdef CONFIG_HAVE_HW_BREAKPOINT
>  	irqentry_state_t state = irqentry_enter(regs);
>
> -	breakpoint_handler(regs);
> -	watchpoint_handler(regs);
> -	force_sig(SIGTRAP);
> +	if (test_tsk_thread_flag(current, TIF_SINGLESTEP)) {
> +		int llbit = (csr_read32(LOONGARCH_CSR_LLBCTL) & 0x1);
> +		unsigned long pc = regs->csr_era;
> +		union loongarch_instruction *ip = (union loongarch_instruction *)pc;
> +
> +		if (llbit) {
> +		/*
> +		 * When the ll-sc combo is encountered, it is regarded as an single
> +		 * instruction. So don't clear llbit and reset CSR.FWPS.Skip until
> +		 * the llsc execution is completed.
> +		 */
Comments should be aligned with code, similar modifications in other
places.

> +			csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
> +			csr_write32(CSR_LLBCTL_KLO, LOONGARCH_CSR_LLBCTL);
> +		} else if (pc == current->thread.single_step) {
> +		/*
> +		 * Maybe some hardware Bug caused that certain insns are occasionally
> +		 * not skipped when CSR.FWPS.Skip is set, such as fld.d/fst.d. So Singlestep
> +		 * needs to compare whether the csr_era is equal to the value of singlestep
> +		 * which last time set.
> +		 */
> +			if (branch_ins_target_pc(ip))
> +			/* Prevent rd == pc from causing single step to fail to stop */
IMO, that comment description is not accurate enough.

Youling.

> +				csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
> +		} else {
> +			force_sig(SIGTRAP);
> +		}
> +	} else {
> +		breakpoint_handler(regs);
> +		watchpoint_handler(regs);
> +		force_sig(SIGTRAP);
> +	}
>
>  	irqentry_exit(regs, state);
>  #endif
>
  
Qing Zhang Feb. 17, 2023, 10 a.m. UTC | #2
Hi, Youling

On 2023/2/17 下午5:50, Youling Tang wrote:
> Hi, Qing
> 
> On 02/17/2023 10:37 AM, Qing Zhang wrote:
>> Use the generic ptrace_resume code for PTRACE_SYSCALL, PTRACE_CONT,
>> PTRACE_KILL and PTRACE_SINGLESTEP. This implies defining
>> arch_has_single_step in  and implementing the
>> user_enable_single_step and user_disable_single_step functions.
>>
>> LongArch has no hardware single-step register. the hardware single-step
>> function multiplex fetch instruction watchpoint(FWPS) and specifies that
>> the next instruction must trigger the watch exception by setting the 
>> mask bit.
>> Some scenarios use CSR.FWPS.Skip to ignore the next hit result, not to 
>> trigger
>> the watchpoint exception, and proceed to the next instruction.
>>
>> Signed-off-by: Qing Zhang <zhangqing@loongson.cn>
>> ---
>>  arch/loongarch/include/asm/inst.h      | 39 +++++++++++++++
>>  arch/loongarch/include/asm/loongarch.h |  3 ++
>>  arch/loongarch/include/asm/processor.h |  3 ++
>>  arch/loongarch/include/asm/ptrace.h    |  2 +
>>  arch/loongarch/kernel/hw_breakpoint.c  | 35 +++++++++++--
>>  arch/loongarch/kernel/ptrace.c         | 68 ++++++++++++++++++++++++++
>>  arch/loongarch/kernel/traps.c          | 34 +++++++++++--
>>  7 files changed, 176 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/loongarch/include/asm/inst.h 
>> b/arch/loongarch/include/asm/inst.h
>> index ba18ce8fbdf2..00c6f261d9a2 100644
>> --- a/arch/loongarch/include/asm/inst.h
>> +++ b/arch/loongarch/include/asm/inst.h
>> @@ -368,6 +368,45 @@ static inline bool is_stack_alloc_ins(union 
>> loongarch_instruction *ip)
>>          is_imm12_negative(ip->reg2i12_format.immediate);
>>  }
>>
>> +static inline bool branch_ins_target_pc(union loongarch_instruction *ip)
>> +{
>> +    switch (ip->reg0i26_format.opcode) {
>> +    case b_op:
>> +    case bl_op:
>> +        if (ip->reg0i26_format.immediate_l == 0
>> +           && ip->reg0i26_format.immediate_h == 0)
>> +            return false;
>> +    }
>> +
>> +    switch (ip->reg1i21_format.opcode) {
>> +    case beqz_op:
>> +    case bnez_op:
>> +    case bceqz_op:
>> +        if (ip->reg1i21_format.immediate_l == 0
>> +           && ip->reg1i21_format.immediate_h == 0)
>> +            return false;
>> +    }
>> +
>> +    switch (ip->reg2i16_format.opcode) {
>> +    case jirl_op:
>> +        if (ip->reg2i16_format.rj == 0x1
>> +           && ip->reg2i16_format.rd == 0x1
> LOONGARCH_GPR_RA can be used instead of 0x1.
> 
>> +           && ip->reg2i16_format.immediate == 0)
>> +            return false;
>> +        break;
>> +    case beq_op:
>> +    case bne_op:
>> +    case blt_op:
>> +    case bge_op:
>> +    case bltu_op:
>> +    case bgeu_op:
>> +        if (ip->reg2i16_format.immediate == 0)
>> +            return false;
>> +    }
>> +
>> +    return true;
>> +}
>> +
>>  void simu_pc(struct pt_regs *regs, union loongarch_instruction insn);
>>  void simu_branch(struct pt_regs *regs, union loongarch_instruction 
>> insn);
>>
>> diff --git a/arch/loongarch/include/asm/loongarch.h 
>> b/arch/loongarch/include/asm/loongarch.h
>> index e9aed583a064..65b7dcdea16d 100644
>> --- a/arch/loongarch/include/asm/loongarch.h
>> +++ b/arch/loongarch/include/asm/loongarch.h
>> @@ -1055,6 +1055,9 @@ static __always_inline void iocsr_write64(u64 
>> val, u32 reg)
>>  #define LOONGARCH_CSR_DERA        0x501    /* debug era */
>>  #define LOONGARCH_CSR_DESAVE        0x502    /* debug save */
>>
>> +#define CSR_FWPC_SKIP_SHIFT        16
>> +#define CSR_FWPC_SKIP            (_ULCAST_(1) << CSR_FWPC_SKIP_SHIFT)
>> +
>>  /*
>>   * CSR_ECFG IM
>>   */
>> diff --git a/arch/loongarch/include/asm/processor.h 
>> b/arch/loongarch/include/asm/processor.h
>> index db060c5a976f..3ea0f1910c23 100644
>> --- a/arch/loongarch/include/asm/processor.h
>> +++ b/arch/loongarch/include/asm/processor.h
>> @@ -131,6 +131,9 @@ struct thread_struct {
>>      struct perf_event    *hbp_break[LOONGARCH_MAX_BRP];
>>      struct perf_event    *hbp_watch[LOONGARCH_MAX_WRP];
>>
>> +    /* Used by ptrace single_step */
>> +    unsigned long single_step;
>> +
>>      /*
>>       * FPU & vector registers, must be at last because
>>       * they are conditionally copied at fork().
>> diff --git a/arch/loongarch/include/asm/ptrace.h 
>> b/arch/loongarch/include/asm/ptrace.h
>> index 58596c4f8a0f..66a0e6c480a3 100644
>> --- a/arch/loongarch/include/asm/ptrace.h
>> +++ b/arch/loongarch/include/asm/ptrace.h
>> @@ -150,4 +150,6 @@ static inline void user_stack_pointer_set(struct 
>> pt_regs *regs,
>>      regs->regs[3] = val;
>>  }
>>
>> +#define arch_has_single_step()        (1)
>> +
>>  #endif /* _ASM_PTRACE_H */
>> diff --git a/arch/loongarch/kernel/hw_breakpoint.c 
>> b/arch/loongarch/kernel/hw_breakpoint.c
>> index 6431cd319c32..75d3652fbe00 100644
>> --- a/arch/loongarch/kernel/hw_breakpoint.c
>> +++ b/arch/loongarch/kernel/hw_breakpoint.c
>> @@ -153,6 +153,22 @@ static int hw_breakpoint_slot_setup(struct 
>> perf_event **slots, int max_slots,
>>   */
>>  void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
>>  {
>> +    int i;
>> +    struct thread_struct *t = &tsk->thread;
>> +
>> +    for (i = 0; i < LOONGARCH_MAX_BRP; i++) {
>> +        if (t->hbp_break[i]) {
>> +            unregister_hw_breakpoint(t->hbp_break[i]);
>> +            t->hbp_break[i] = NULL;
>> +        }
>> +    }
>> +
>> +    for (i = 0; i < LOONGARCH_MAX_WRP; i++) {
>> +        if (t->hbp_watch[i]) {
>> +            unregister_hw_breakpoint(t->hbp_watch[i]);
>> +            t->hbp_watch[i] = NULL;
>> +        }
>> +    }
>>  }
>>
>>  void ptrace_hw_copy_thread(struct task_struct *tsk)
>> @@ -498,11 +514,20 @@ arch_initcall(arch_hw_breakpoint_init);
>>  void hw_breakpoint_thread_switch(struct task_struct *next)
>>  {
>>      struct pt_regs *regs = task_pt_regs(next);
>> -
>> -    /* Update breakpoints */
>> -    update_bp_registers(regs, 1, 0);
>> -    /* Update watchpoints */
>> -    update_bp_registers(regs, 1, 1);
>> +    u64 addr, mask;
>> +
>> +    if (test_bit(TIF_SINGLESTEP, &task_thread_info(next)->flags)) {
>> +        addr = read_wb_reg(CSR_CFG_ADDR, 0, 0);
>> +        mask = read_wb_reg(CSR_CFG_MASK, 0, 0);
>> +        if ((task_pt_regs(next)->csr_era & ~mask) == (addr & ~mask))
>> +            csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
>> +        regs->csr_prmd |= CSR_PRMD_PWE;
>> +    } else {
>> +        /* Update breakpoints */
>> +        update_bp_registers(regs, 1, 0);
>> +        /* Update watchpoints */
>> +        update_bp_registers(regs, 1, 1);
>> +    }
>>  }
>>
>>  void hw_breakpoint_pmu_read(struct perf_event *bp)
>> diff --git a/arch/loongarch/kernel/ptrace.c 
>> b/arch/loongarch/kernel/ptrace.c
>> index bee4194177fd..52a3ee4366f4 100644
>> --- a/arch/loongarch/kernel/ptrace.c
>> +++ b/arch/loongarch/kernel/ptrace.c
>> @@ -20,6 +20,7 @@
>>  #include <linux/context_tracking.h>
>>  #include <linux/elf.h>
>>  #include <linux/errno.h>
>> +#include <linux/hw_breakpoint.h>
>>  #include <linux/mm.h>
>>  #include <linux/ptrace.h>
>>  #include <linux/regset.h>
>> @@ -30,6 +31,7 @@
>>  #include <linux/stddef.h>
>>  #include <linux/seccomp.h>
>>  #include <linux/uaccess.h>
>> +#include <linux/thread_info.h>
>>
>>  #include <asm/byteorder.h>
>>  #include <asm/cpu.h>
>> @@ -39,6 +41,7 @@
>>  #include <asm/page.h>
>>  #include <asm/pgtable.h>
>>  #include <asm/processor.h>
>> +#include <asm/ptrace.h>
>>  #include <asm/reg.h>
>>  #include <asm/syscall.h>
>>
>> @@ -541,3 +544,68 @@ long arch_ptrace(struct task_struct *child, long 
>> request,
>>
>>      return ret;
>>  }
>> +
>> +void ptrace_triggered(struct perf_event *bp,
>> +              struct perf_sample_data *data, struct pt_regs *regs)
>> +{
>> +    struct perf_event_attr attr;
>> +
>> +    attr = bp->attr;
>> +    attr.disabled = true;
>> +    modify_user_hw_breakpoint(bp, &attr);
>> +}
>> +
>> +static int set_single_step(struct task_struct *tsk, unsigned long addr)
>> +{
>> +    struct thread_struct *thread = &tsk->thread;
>> +    struct perf_event *bp;
>> +    struct perf_event_attr attr;
>> +    struct arch_hw_breakpoint *info;
>> +
>> +    bp = thread->hbp_break[0];
>> +    if (!bp) {
>> +        ptrace_breakpoint_init(&attr);
>> +
>> +        attr.bp_addr = addr;
>> +        attr.bp_len = HW_BREAKPOINT_LEN_8;
>> +        attr.bp_type = HW_BREAKPOINT_X;
>> +
>> +        bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
>> +                         NULL, tsk);
>> +        if (IS_ERR(bp))
>> +            return PTR_ERR(bp);
>> +
>> +        thread->hbp_break[0] = bp;
>> +    } else {
>> +        int err;
>> +
>> +        attr = bp->attr;
>> +        attr.bp_addr = addr;
>> +        /* reenable breakpoint */
>> +        attr.disabled = false;
>> +        err = modify_user_hw_breakpoint(bp, &attr);
>> +        if (unlikely(err))
>> +            return err;
>> +
>> +        csr_write64(attr.bp_addr, LOONGARCH_CSR_IB0ADDR);
>> +    }
>> +    info = counter_arch_bp(bp);
>> +    info->mask = 0xffffffffffff;
> If `mask` is only used for user space address mask, it should not be
> fixed to 0xffffffffffff, the user space address size may have many
> situations, which can be defined according to TASK_SIZE.
> 
> Use (TASK_SIZE - 1) instead.
> 
ok, got it.
>> +
>> +    return 0;
>> +}
>> +
>> +/* ptrace API */
>> +void user_enable_single_step(struct task_struct *task)
>> +{
>> +    struct thread_info *ti = task_thread_info(task);
>> +
>> +    set_single_step(task, task_pt_regs(task)->csr_era);
>> +    task->thread.single_step = task_pt_regs(task)->csr_era;
>> +    set_ti_thread_flag(ti, TIF_SINGLESTEP);
>> +}
>> +
>> +void user_disable_single_step(struct task_struct *task)
>> +{
>> +    clear_tsk_thread_flag(task, TIF_SINGLESTEP);
>> +}
>> diff --git a/arch/loongarch/kernel/traps.c 
>> b/arch/loongarch/kernel/traps.c
>> index 2b133079e0f3..a59275a43bd1 100644
>> --- a/arch/loongarch/kernel/traps.c
>> +++ b/arch/loongarch/kernel/traps.c
>> @@ -511,9 +511,37 @@ asmlinkage void noinstr do_watch(struct pt_regs 
>> *regs)
>>  #ifdef CONFIG_HAVE_HW_BREAKPOINT
>>      irqentry_state_t state = irqentry_enter(regs);
>>
>> -    breakpoint_handler(regs);
>> -    watchpoint_handler(regs);
>> -    force_sig(SIGTRAP);
>> +    if (test_tsk_thread_flag(current, TIF_SINGLESTEP)) {
>> +        int llbit = (csr_read32(LOONGARCH_CSR_LLBCTL) & 0x1);
>> +        unsigned long pc = regs->csr_era;
>> +        union loongarch_instruction *ip = (union 
>> loongarch_instruction *)pc;
>> +
>> +        if (llbit) {
>> +        /*
>> +         * When the ll-sc combo is encountered, it is regarded as an 
>> single
>> +         * instruction. So don't clear llbit and reset CSR.FWPS.Skip 
>> until
>> +         * the llsc execution is completed.
>> +         */
> Comments should be aligned with code, similar modifications in other
> places.
> 
>> +            csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
>> +            csr_write32(CSR_LLBCTL_KLO, LOONGARCH_CSR_LLBCTL);
>> +        } else if (pc == current->thread.single_step) {
>> +        /*
>> +         * Maybe some hardware Bug caused that certain insns are 
>> occasionally
>> +         * not skipped when CSR.FWPS.Skip is set, such as 
>> fld.d/fst.d. So Singlestep
>> +         * needs to compare whether the csr_era is equal to the value 
>> of singlestep
>> +         * which last time set.
>> +         */
>> +            if (branch_ins_target_pc(ip))
>> +            /* Prevent rd == pc from causing single step to fail to 
>> stop */
> IMO, that comment description is not accurate enough.ok, I'll add more comments.

Thanks,
-Qing
> Youling.
> 
>> +                csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
>> +        } else {
>> +            force_sig(SIGTRAP);
>> +        }
>> +    } else {
>> +        breakpoint_handler(regs);
>> +        watchpoint_handler(regs);
>> +        force_sig(SIGTRAP);
>> +    }
>>
>>      irqentry_exit(regs, state);
>>  #endif
>>
>
  
WANG Xuerui Feb. 17, 2023, 10:16 a.m. UTC | #3
On 2023/2/17 10:37, Qing Zhang wrote:
> Use the generic ptrace_resume code for PTRACE_SYSCALL, PTRACE_CONT,
> PTRACE_KILL and PTRACE_SINGLESTEP. This implies defining
> arch_has_single_step in  and implementing the
> user_enable_single_step and user_disable_single_step functions.
> 
> LongArch has no hardware single-step register. the hardware single-step

"LoongArch cannot do hardware single-stepping per se" could be more 
accurate a description. ("Hardware single-step register" could be 
ambiguous in implying the only way to achieve single-stepping would be 
via software, but in fact you explained in the next sentence that it's 
actually doable, just in an alternate way.)

> function multiplex fetch instruction watchpoint(FWPS) and specifies that
> the next instruction must trigger the watch exception by setting the mask bit.

I assume the "multiplex" is translated from "复用", in which case it's not 
used in the exact same way as in Chinese. Plus you have the subject 
wrong, "hardware single-step function" is the topic but not the actor 
doing the "multiplexing".

My take (continuing from the previous sentence I suggested):

"Instead it is achieved by configuring the instruction fetch watchpoints 
so that WPEF is triggered on the next instruction."

> Some scenarios use CSR.FWPS.Skip to ignore the next hit result, not to trigger
> the watchpoint exception, and proceed to the next instruction.

Again, wrong topicalization ("scenarios" is not the actor/subject). "In 
some scenarios CSR.FWPS.Skip is used to ..." sounds better.

And the multiple "next" usages sound unclear, you could probably reword 
this a bit. (I can't immediately understand this part so I'm unable to 
provide a suggestion.)

> 
> Signed-off-by: Qing Zhang <zhangqing@loongson.cn>
> ---
>   arch/loongarch/include/asm/inst.h      | 39 +++++++++++++++
>   arch/loongarch/include/asm/loongarch.h |  3 ++
>   arch/loongarch/include/asm/processor.h |  3 ++
>   arch/loongarch/include/asm/ptrace.h    |  2 +
>   arch/loongarch/kernel/hw_breakpoint.c  | 35 +++++++++++--
>   arch/loongarch/kernel/ptrace.c         | 68 ++++++++++++++++++++++++++
>   arch/loongarch/kernel/traps.c          | 34 +++++++++++--
>   7 files changed, 176 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/loongarch/include/asm/inst.h b/arch/loongarch/include/asm/inst.h
> index ba18ce8fbdf2..00c6f261d9a2 100644
> --- a/arch/loongarch/include/asm/inst.h
> +++ b/arch/loongarch/include/asm/inst.h
> @@ -368,6 +368,45 @@ static inline bool is_stack_alloc_ins(union loongarch_instruction *ip)
>   		is_imm12_negative(ip->reg2i12_format.immediate);
>   }
>   
> +static inline bool branch_ins_target_pc(union loongarch_instruction *ip)

This name is not faithfully reflecting its purpose, which is to check if 
the given instruction will change PC; and the reason this has to be done 
in the first place being that, single stepping would never stop due to 
current hardware erratum if the PC would stay the same after the insn's 
execution.

You may want to add some comments regarding the erratum too if you're in 
a position to disclose more details.

> +{
> +	switch (ip->reg0i26_format.opcode) {
> +	case b_op:
> +	case bl_op:
> +		if (ip->reg0i26_format.immediate_l == 0
> +		   && ip->reg0i26_format.immediate_h == 0)
> +			return false;
> +	}
> +
> +	switch (ip->reg1i21_format.opcode) {
> +	case beqz_op:
> +	case bnez_op:
> +	case bceqz_op:
> +		if (ip->reg1i21_format.immediate_l == 0
> +		   && ip->reg1i21_format.immediate_h == 0)
> +			return false;
> +	}
> +
> +	switch (ip->reg2i16_format.opcode) {
> +	case jirl_op:
> +		if (ip->reg2i16_format.rj == 0x1
> +		   && ip->reg2i16_format.rd == 0x1
Given the appearance of the other cases, I feel like the triggering 
condition of the nonterminating singlestepping is that "new PC == 
current PC" always hold. So, isn't every `rd == rj && imm == 0` case 
problematic?

> +		   && ip->reg2i16_format.immediate == 0)
> +			return false;
> +		break;
> +	case beq_op:
> +	case bne_op:
> +	case blt_op:
> +	case bge_op:
> +	case bltu_op:
> +	case bgeu_op:
> +		if (ip->reg2i16_format.immediate == 0)
> +			return false;
> +	}
> +
> +	return true;
> +}
> +
>   void simu_pc(struct pt_regs *regs, union loongarch_instruction insn);
>   void simu_branch(struct pt_regs *regs, union loongarch_instruction insn);
>   
> diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h
> index e9aed583a064..65b7dcdea16d 100644
> --- a/arch/loongarch/include/asm/loongarch.h
> +++ b/arch/loongarch/include/asm/loongarch.h
> @@ -1055,6 +1055,9 @@ static __always_inline void iocsr_write64(u64 val, u32 reg)
>   #define LOONGARCH_CSR_DERA		0x501	/* debug era */
>   #define LOONGARCH_CSR_DESAVE		0x502	/* debug save */
>   
> +#define CSR_FWPC_SKIP_SHIFT		16
> +#define CSR_FWPC_SKIP			(_ULCAST_(1) << CSR_FWPC_SKIP_SHIFT)
> +
>   /*
>    * CSR_ECFG IM
>    */
> diff --git a/arch/loongarch/include/asm/processor.h b/arch/loongarch/include/asm/processor.h
> index db060c5a976f..3ea0f1910c23 100644
> --- a/arch/loongarch/include/asm/processor.h
> +++ b/arch/loongarch/include/asm/processor.h
> @@ -131,6 +131,9 @@ struct thread_struct {
>   	struct perf_event	*hbp_break[LOONGARCH_MAX_BRP];
>   	struct perf_event	*hbp_watch[LOONGARCH_MAX_WRP];
>   
> +	/* Used by ptrace single_step */

"PTRACE_SINGLESTEP" if you want to refer to the technical name of the 
feature, or "ptrace single stepping" which is more natural language.

> +	unsigned long single_step;
> +
>   	/*
>   	 * FPU & vector registers, must be at last because
>   	 * they are conditionally copied at fork().
> diff --git a/arch/loongarch/include/asm/ptrace.h b/arch/loongarch/include/asm/ptrace.h
> index 58596c4f8a0f..66a0e6c480a3 100644
> --- a/arch/loongarch/include/asm/ptrace.h
> +++ b/arch/loongarch/include/asm/ptrace.h
> @@ -150,4 +150,6 @@ static inline void user_stack_pointer_set(struct pt_regs *regs,
>   	regs->regs[3] = val;
>   }
>   
> +#define arch_has_single_step()		(1)
> +
>   #endif /* _ASM_PTRACE_H */
> diff --git a/arch/loongarch/kernel/hw_breakpoint.c b/arch/loongarch/kernel/hw_breakpoint.c
> index 6431cd319c32..75d3652fbe00 100644
> --- a/arch/loongarch/kernel/hw_breakpoint.c
> +++ b/arch/loongarch/kernel/hw_breakpoint.c
> @@ -153,6 +153,22 @@ static int hw_breakpoint_slot_setup(struct perf_event **slots, int max_slots,
>    */
>   void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
>   {
> +	int i;
> +	struct thread_struct *t = &tsk->thread;
> +
> +	for (i = 0; i < LOONGARCH_MAX_BRP; i++) {
> +		if (t->hbp_break[i]) {
> +			unregister_hw_breakpoint(t->hbp_break[i]);
> +			t->hbp_break[i] = NULL;
> +		}
> +	}
> +
> +	for (i = 0; i < LOONGARCH_MAX_WRP; i++) {
> +		if (t->hbp_watch[i]) {
> +			unregister_hw_breakpoint(t->hbp_watch[i]);
> +			t->hbp_watch[i] = NULL;
> +		}
> +	}
>   }
>   
>   void ptrace_hw_copy_thread(struct task_struct *tsk)
> @@ -498,11 +514,20 @@ arch_initcall(arch_hw_breakpoint_init);
>   void hw_breakpoint_thread_switch(struct task_struct *next)
>   {
>   	struct pt_regs *regs = task_pt_regs(next);
> -
> -	/* Update breakpoints */
> -	update_bp_registers(regs, 1, 0);
> -	/* Update watchpoints */
> -	update_bp_registers(regs, 1, 1);
> +	u64 addr, mask;
> +
> +	if (test_bit(TIF_SINGLESTEP, &task_thread_info(next)->flags)) {
> +		addr = read_wb_reg(CSR_CFG_ADDR, 0, 0);
> +		mask = read_wb_reg(CSR_CFG_MASK, 0, 0);
> +		if ((task_pt_regs(next)->csr_era & ~mask) == (addr & ~mask))
> +			csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
> +		regs->csr_prmd |= CSR_PRMD_PWE;
> +	} else {
> +		/* Update breakpoints */
> +		update_bp_registers(regs, 1, 0);
> +		/* Update watchpoints */
> +		update_bp_registers(regs, 1, 1);
> +	}
>   }
>   
>   void hw_breakpoint_pmu_read(struct perf_event *bp)
> diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c
> index bee4194177fd..52a3ee4366f4 100644
> --- a/arch/loongarch/kernel/ptrace.c
> +++ b/arch/loongarch/kernel/ptrace.c
> @@ -20,6 +20,7 @@
>   #include <linux/context_tracking.h>
>   #include <linux/elf.h>
>   #include <linux/errno.h>
> +#include <linux/hw_breakpoint.h>
>   #include <linux/mm.h>
>   #include <linux/ptrace.h>
>   #include <linux/regset.h>
> @@ -30,6 +31,7 @@
>   #include <linux/stddef.h>
>   #include <linux/seccomp.h>
>   #include <linux/uaccess.h>
> +#include <linux/thread_info.h>
>   
>   #include <asm/byteorder.h>
>   #include <asm/cpu.h>
> @@ -39,6 +41,7 @@
>   #include <asm/page.h>
>   #include <asm/pgtable.h>
>   #include <asm/processor.h>
> +#include <asm/ptrace.h>
>   #include <asm/reg.h>
>   #include <asm/syscall.h>
>   
> @@ -541,3 +544,68 @@ long arch_ptrace(struct task_struct *child, long request,
>   
>   	return ret;
>   }
> +
> +void ptrace_triggered(struct perf_event *bp,
> +		      struct perf_sample_data *data, struct pt_regs *regs)
> +{
> +	struct perf_event_attr attr;
> +
> +	attr = bp->attr;
> +	attr.disabled = true;
> +	modify_user_hw_breakpoint(bp, &attr);
> +}
> +
> +static int set_single_step(struct task_struct *tsk, unsigned long addr)
> +{
> +	struct thread_struct *thread = &tsk->thread;
> +	struct perf_event *bp;
> +	struct perf_event_attr attr;
> +	struct arch_hw_breakpoint *info;
> +
> +	bp = thread->hbp_break[0];
> +	if (!bp) {
> +		ptrace_breakpoint_init(&attr);
> +
> +		attr.bp_addr = addr;
> +		attr.bp_len = HW_BREAKPOINT_LEN_8;
> +		attr.bp_type = HW_BREAKPOINT_X;
> +
> +		bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
> +						 NULL, tsk);
> +		if (IS_ERR(bp))
> +			return PTR_ERR(bp);
> +
> +		thread->hbp_break[0] = bp;
> +	} else {
> +		int err;
> +
> +		attr = bp->attr;
> +		attr.bp_addr = addr;
> +		/* reenable breakpoint */
> +		attr.disabled = false;
> +		err = modify_user_hw_breakpoint(bp, &attr);
> +		if (unlikely(err))
> +			return err;
> +
> +		csr_write64(attr.bp_addr, LOONGARCH_CSR_IB0ADDR);
> +	}
> +	info = counter_arch_bp(bp);
> +	info->mask = 0xffffffffffff;
> +
> +	return 0;
> +}
> +
> +/* ptrace API */
> +void user_enable_single_step(struct task_struct *task)
> +{
> +	struct thread_info *ti = task_thread_info(task);
> +
> +	set_single_step(task, task_pt_regs(task)->csr_era);
> +	task->thread.single_step = task_pt_regs(task)->csr_era;
> +	set_ti_thread_flag(ti, TIF_SINGLESTEP);
> +}
> +
> +void user_disable_single_step(struct task_struct *task)
> +{
> +	clear_tsk_thread_flag(task, TIF_SINGLESTEP);
> +}
> diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c
> index 2b133079e0f3..a59275a43bd1 100644
> --- a/arch/loongarch/kernel/traps.c
> +++ b/arch/loongarch/kernel/traps.c
> @@ -511,9 +511,37 @@ asmlinkage void noinstr do_watch(struct pt_regs *regs)
>   #ifdef CONFIG_HAVE_HW_BREAKPOINT
>   	irqentry_state_t state = irqentry_enter(regs);
>   
> -	breakpoint_handler(regs);
> -	watchpoint_handler(regs);
> -	force_sig(SIGTRAP);
> +	if (test_tsk_thread_flag(current, TIF_SINGLESTEP)) {
> +		int llbit = (csr_read32(LOONGARCH_CSR_LLBCTL) & 0x1);
> +		unsigned long pc = regs->csr_era;
> +		union loongarch_instruction *ip = (union loongarch_instruction *)pc;
> +
> +		if (llbit) {
> +		/*
> +		 * When the ll-sc combo is encountered, it is regarded as an single
> +		 * instruction. So don't clear llbit and reset CSR.FWPS.Skip until
> +		 * the llsc execution is completed.
> +		 */
> +			csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
> +			csr_write32(CSR_LLBCTL_KLO, LOONGARCH_CSR_LLBCTL);
> +		} else if (pc == current->thread.single_step) {
> +		/*
> +		 * Maybe some hardware Bug caused that certain insns are occasionally
> +		 * not skipped when CSR.FWPS.Skip is set, such as fld.d/fst.d. So Singlestep
> +		 * needs to compare whether the csr_era is equal to the value of singlestep
> +		 * which last time set.
> +		 */
> +			if (branch_ins_target_pc(ip))
> +			/* Prevent rd == pc from causing single step to fail to stop */
> +				csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
> +		} else {
> +			force_sig(SIGTRAP);
> +		}
> +	} else {
> +		breakpoint_handler(regs);
> +		watchpoint_handler(regs);
> +		force_sig(SIGTRAP);
> +	}
>   
>   	irqentry_exit(regs, state);
>   #endif
  
Jinyang He Feb. 18, 2023, 1:21 a.m. UTC | #4
On 2023-02-17 18:00, Qing Zhang wrote:
> Hi, Youling
>
> On 2023/2/17 下午5:50, Youling Tang wrote:
>> Hi, Qing
>>
>> On 02/17/2023 10:37 AM, Qing Zhang wrote:
>>> Use the generic ptrace_resume code for PTRACE_SYSCALL, PTRACE_CONT,
>>> PTRACE_KILL and PTRACE_SINGLESTEP. This implies defining
>>> arch_has_single_step in  and implementing the
>>> user_enable_single_step and user_disable_single_step functions.
>>>
>>> LongArch has no hardware single-step register. the hardware single-step
>>> function multiplex fetch instruction watchpoint(FWPS) and specifies 
>>> that
>>> the next instruction must trigger the watch exception by setting the 
>>> mask bit.
>>> Some scenarios use CSR.FWPS.Skip to ignore the next hit result, not 
>>> to trigger
>>> the watchpoint exception, and proceed to the next instruction.
>>>
>>> Signed-off-by: Qing Zhang <zhangqing@loongson.cn>
>>> ---
>>>  arch/loongarch/include/asm/inst.h      | 39 +++++++++++++++
>>>  arch/loongarch/include/asm/loongarch.h |  3 ++
>>>  arch/loongarch/include/asm/processor.h |  3 ++
>>>  arch/loongarch/include/asm/ptrace.h    |  2 +
>>>  arch/loongarch/kernel/hw_breakpoint.c  | 35 +++++++++++--
>>>  arch/loongarch/kernel/ptrace.c         | 68 ++++++++++++++++++++++++++
>>>  arch/loongarch/kernel/traps.c          | 34 +++++++++++--
>>>  7 files changed, 176 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/arch/loongarch/include/asm/inst.h 
>>> b/arch/loongarch/include/asm/inst.h
>>> index ba18ce8fbdf2..00c6f261d9a2 100644
>>> --- a/arch/loongarch/include/asm/inst.h
>>> +++ b/arch/loongarch/include/asm/inst.h
>>> @@ -368,6 +368,45 @@ static inline bool is_stack_alloc_ins(union 
>>> loongarch_instruction *ip)
>>>          is_imm12_negative(ip->reg2i12_format.immediate);
>>>  }
>>>
>>> +static inline bool branch_ins_target_pc(union loongarch_instruction 
>>> *ip)
>>> +{
>>> +    switch (ip->reg0i26_format.opcode) {
>>> +    case b_op:
>>> +    case bl_op:
>>> +        if (ip->reg0i26_format.immediate_l == 0
>>> +           && ip->reg0i26_format.immediate_h == 0)
>>> +            return false;
>>> +    }
>>> +
>>> +    switch (ip->reg1i21_format.opcode) {
>>> +    case beqz_op:
>>> +    case bnez_op:
>>> +    case bceqz_op:
>>> +        if (ip->reg1i21_format.immediate_l == 0
>>> +           && ip->reg1i21_format.immediate_h == 0)
>>> +            return false;
>>> +    }
>>> +
>>> +    switch (ip->reg2i16_format.opcode) {
>>> +    case jirl_op:
>>> +        if (ip->reg2i16_format.rj == 0x1
>>> +           && ip->reg2i16_format.rd == 0x1
>> LOONGARCH_GPR_RA can be used instead of 0x1.
>>
>>> +           && ip->reg2i16_format.immediate == 0)
>>> +            return false;
>>> +        break;
>>> +    case beq_op:
>>> +    case bne_op:
>>> +    case blt_op:
>>> +    case bge_op:
>>> +    case bltu_op:
>>> +    case bgeu_op:
>>> +        if (ip->reg2i16_format.immediate == 0)
>>> +            return false;
>>> +    }
>>> +
>>> +    return true;
>>> +}
>>> +
>>>  void simu_pc(struct pt_regs *regs, union loongarch_instruction insn);
>>>  void simu_branch(struct pt_regs *regs, union loongarch_instruction 
>>> insn);
>>>
>>> diff --git a/arch/loongarch/include/asm/loongarch.h 
>>> b/arch/loongarch/include/asm/loongarch.h
>>> index e9aed583a064..65b7dcdea16d 100644
>>> --- a/arch/loongarch/include/asm/loongarch.h
>>> +++ b/arch/loongarch/include/asm/loongarch.h
>>> @@ -1055,6 +1055,9 @@ static __always_inline void iocsr_write64(u64 
>>> val, u32 reg)
>>>  #define LOONGARCH_CSR_DERA        0x501    /* debug era */
>>>  #define LOONGARCH_CSR_DESAVE        0x502    /* debug save */
>>>
>>> +#define CSR_FWPC_SKIP_SHIFT        16
>>> +#define CSR_FWPC_SKIP            (_ULCAST_(1) << CSR_FWPC_SKIP_SHIFT)
>>> +
>>>  /*
>>>   * CSR_ECFG IM
>>>   */
>>> diff --git a/arch/loongarch/include/asm/processor.h 
>>> b/arch/loongarch/include/asm/processor.h
>>> index db060c5a976f..3ea0f1910c23 100644
>>> --- a/arch/loongarch/include/asm/processor.h
>>> +++ b/arch/loongarch/include/asm/processor.h
>>> @@ -131,6 +131,9 @@ struct thread_struct {
>>>      struct perf_event    *hbp_break[LOONGARCH_MAX_BRP];
>>>      struct perf_event    *hbp_watch[LOONGARCH_MAX_WRP];
>>>
>>> +    /* Used by ptrace single_step */
>>> +    unsigned long single_step;
>>> +
>>>      /*
>>>       * FPU & vector registers, must be at last because
>>>       * they are conditionally copied at fork().
>>> diff --git a/arch/loongarch/include/asm/ptrace.h 
>>> b/arch/loongarch/include/asm/ptrace.h
>>> index 58596c4f8a0f..66a0e6c480a3 100644
>>> --- a/arch/loongarch/include/asm/ptrace.h
>>> +++ b/arch/loongarch/include/asm/ptrace.h
>>> @@ -150,4 +150,6 @@ static inline void user_stack_pointer_set(struct 
>>> pt_regs *regs,
>>>      regs->regs[3] = val;
>>>  }
>>>
>>> +#define arch_has_single_step()        (1)
>>> +
>>>  #endif /* _ASM_PTRACE_H */
>>> diff --git a/arch/loongarch/kernel/hw_breakpoint.c 
>>> b/arch/loongarch/kernel/hw_breakpoint.c
>>> index 6431cd319c32..75d3652fbe00 100644
>>> --- a/arch/loongarch/kernel/hw_breakpoint.c
>>> +++ b/arch/loongarch/kernel/hw_breakpoint.c
>>> @@ -153,6 +153,22 @@ static int hw_breakpoint_slot_setup(struct 
>>> perf_event **slots, int max_slots,
>>>   */
>>>  void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
>>>  {
>>> +    int i;
>>> +    struct thread_struct *t = &tsk->thread;
>>> +
>>> +    for (i = 0; i < LOONGARCH_MAX_BRP; i++) {
>>> +        if (t->hbp_break[i]) {
>>> +            unregister_hw_breakpoint(t->hbp_break[i]);
>>> +            t->hbp_break[i] = NULL;
>>> +        }
>>> +    }
>>> +
>>> +    for (i = 0; i < LOONGARCH_MAX_WRP; i++) {
>>> +        if (t->hbp_watch[i]) {
>>> +            unregister_hw_breakpoint(t->hbp_watch[i]);
>>> +            t->hbp_watch[i] = NULL;
>>> +        }
>>> +    }
>>>  }
>>>
>>>  void ptrace_hw_copy_thread(struct task_struct *tsk)
>>> @@ -498,11 +514,20 @@ arch_initcall(arch_hw_breakpoint_init);
>>>  void hw_breakpoint_thread_switch(struct task_struct *next)
>>>  {
>>>      struct pt_regs *regs = task_pt_regs(next);
>>> -
>>> -    /* Update breakpoints */
>>> -    update_bp_registers(regs, 1, 0);
>>> -    /* Update watchpoints */
>>> -    update_bp_registers(regs, 1, 1);
>>> +    u64 addr, mask;
>>> +
>>> +    if (test_bit(TIF_SINGLESTEP, &task_thread_info(next)->flags)) {
>>> +        addr = read_wb_reg(CSR_CFG_ADDR, 0, 0);
>>> +        mask = read_wb_reg(CSR_CFG_MASK, 0, 0);
>>> +        if ((task_pt_regs(next)->csr_era & ~mask) == (addr & ~mask))
>>> +            csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
>>> +        regs->csr_prmd |= CSR_PRMD_PWE;
>>> +    } else {
>>> +        /* Update breakpoints */
>>> +        update_bp_registers(regs, 1, 0);
>>> +        /* Update watchpoints */
>>> +        update_bp_registers(regs, 1, 1);
>>> +    }
>>>  }
>>>
>>>  void hw_breakpoint_pmu_read(struct perf_event *bp)
>>> diff --git a/arch/loongarch/kernel/ptrace.c 
>>> b/arch/loongarch/kernel/ptrace.c
>>> index bee4194177fd..52a3ee4366f4 100644
>>> --- a/arch/loongarch/kernel/ptrace.c
>>> +++ b/arch/loongarch/kernel/ptrace.c
>>> @@ -20,6 +20,7 @@
>>>  #include <linux/context_tracking.h>
>>>  #include <linux/elf.h>
>>>  #include <linux/errno.h>
>>> +#include <linux/hw_breakpoint.h>
>>>  #include <linux/mm.h>
>>>  #include <linux/ptrace.h>
>>>  #include <linux/regset.h>
>>> @@ -30,6 +31,7 @@
>>>  #include <linux/stddef.h>
>>>  #include <linux/seccomp.h>
>>>  #include <linux/uaccess.h>
>>> +#include <linux/thread_info.h>
>>>
>>>  #include <asm/byteorder.h>
>>>  #include <asm/cpu.h>
>>> @@ -39,6 +41,7 @@
>>>  #include <asm/page.h>
>>>  #include <asm/pgtable.h>
>>>  #include <asm/processor.h>
>>> +#include <asm/ptrace.h>
>>>  #include <asm/reg.h>
>>>  #include <asm/syscall.h>
>>>
>>> @@ -541,3 +544,68 @@ long arch_ptrace(struct task_struct *child, 
>>> long request,
>>>
>>>      return ret;
>>>  }
>>> +
>>> +void ptrace_triggered(struct perf_event *bp,
>>> +              struct perf_sample_data *data, struct pt_regs *regs)
>>> +{
>>> +    struct perf_event_attr attr;
>>> +
>>> +    attr = bp->attr;
>>> +    attr.disabled = true;
>>> +    modify_user_hw_breakpoint(bp, &attr);
>>> +}
>>> +
>>> +static int set_single_step(struct task_struct *tsk, unsigned long 
>>> addr)
>>> +{
>>> +    struct thread_struct *thread = &tsk->thread;
>>> +    struct perf_event *bp;
>>> +    struct perf_event_attr attr;
>>> +    struct arch_hw_breakpoint *info;
>>> +
>>> +    bp = thread->hbp_break[0];
>>> +    if (!bp) {
>>> +        ptrace_breakpoint_init(&attr);
>>> +
>>> +        attr.bp_addr = addr;
>>> +        attr.bp_len = HW_BREAKPOINT_LEN_8;
>>> +        attr.bp_type = HW_BREAKPOINT_X;
>>> +
>>> +        bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
>>> +                         NULL, tsk);
>>> +        if (IS_ERR(bp))
>>> +            return PTR_ERR(bp);
>>> +
>>> +        thread->hbp_break[0] = bp;
>>> +    } else {
>>> +        int err;
>>> +
>>> +        attr = bp->attr;
>>> +        attr.bp_addr = addr;
>>> +        /* reenable breakpoint */
>>> +        attr.disabled = false;
>>> +        err = modify_user_hw_breakpoint(bp, &attr);
>>> +        if (unlikely(err))
>>> +            return err;
>>> +
>>> +        csr_write64(attr.bp_addr, LOONGARCH_CSR_IB0ADDR);
>>> +    }
>>> +    info = counter_arch_bp(bp);
>>> +    info->mask = 0xffffffffffff;
>> If `mask` is only used for user space address mask, it should not be
>> fixed to 0xffffffffffff, the user space address size may have many
>> situations, which can be defined according to TASK_SIZE.
>>
>> Use (TASK_SIZE - 1) instead.
>>
> ok, got it.

But according to to [1], the priority of watchpoint exception is highest
in the priority of exception which all fetching stage. Thus, we may get
the exception unexpected if set MASK to (TASK_SIZE - 1).

e.g.
     0x1000: addi.d $rd, $zero, -1 # a way to set a kernel address in rd
     0x1004: jr $rd

If singlestep is set at 0x1004, it will fetch insn in kernel address and
and get operation address error exception if we set MASK to (TASK_SIZE - 1).


[1] 
https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#exception-priority


Thanks,

Jinyang


>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +/* ptrace API */
>>> +void user_enable_single_step(struct task_struct *task)
>>> +{
>>> +    struct thread_info *ti = task_thread_info(task);
>>> +
>>> +    set_single_step(task, task_pt_regs(task)->csr_era);
>>> +    task->thread.single_step = task_pt_regs(task)->csr_era;
>>> +    set_ti_thread_flag(ti, TIF_SINGLESTEP);
>>> +}
>>> +
>>> +void user_disable_single_step(struct task_struct *task)
>>> +{
>>> +    clear_tsk_thread_flag(task, TIF_SINGLESTEP);
>>> +}
>>> diff --git a/arch/loongarch/kernel/traps.c 
>>> b/arch/loongarch/kernel/traps.c
>>> index 2b133079e0f3..a59275a43bd1 100644
>>> --- a/arch/loongarch/kernel/traps.c
>>> +++ b/arch/loongarch/kernel/traps.c
>>> @@ -511,9 +511,37 @@ asmlinkage void noinstr do_watch(struct pt_regs 
>>> *regs)
>>>  #ifdef CONFIG_HAVE_HW_BREAKPOINT
>>>      irqentry_state_t state = irqentry_enter(regs);
>>>
>>> -    breakpoint_handler(regs);
>>> -    watchpoint_handler(regs);
>>> -    force_sig(SIGTRAP);
>>> +    if (test_tsk_thread_flag(current, TIF_SINGLESTEP)) {
>>> +        int llbit = (csr_read32(LOONGARCH_CSR_LLBCTL) & 0x1);
>>> +        unsigned long pc = regs->csr_era;
>>> +        union loongarch_instruction *ip = (union 
>>> loongarch_instruction *)pc;
>>> +
>>> +        if (llbit) {
>>> +        /*
>>> +         * When the ll-sc combo is encountered, it is regarded as 
>>> an single
>>> +         * instruction. So don't clear llbit and reset 
>>> CSR.FWPS.Skip until
>>> +         * the llsc execution is completed.
>>> +         */
>> Comments should be aligned with code, similar modifications in other
>> places.
>>
>>> +            csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
>>> +            csr_write32(CSR_LLBCTL_KLO, LOONGARCH_CSR_LLBCTL);
>>> +        } else if (pc == current->thread.single_step) {
>>> +        /*
>>> +         * Maybe some hardware Bug caused that certain insns are 
>>> occasionally
>>> +         * not skipped when CSR.FWPS.Skip is set, such as 
>>> fld.d/fst.d. So Singlestep
>>> +         * needs to compare whether the csr_era is equal to the 
>>> value of singlestep
>>> +         * which last time set.
>>> +         */
>>> +            if (branch_ins_target_pc(ip))
>>> +            /* Prevent rd == pc from causing single step to fail to 
>>> stop */
>> IMO, that comment description is not accurate enough.ok, I'll add 
>> more comments.
>
> Thanks,
> -Qing
>> Youling.
>>
>>> + csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
>>> +        } else {
>>> +            force_sig(SIGTRAP);
>>> +        }
>>> +    } else {
>>> +        breakpoint_handler(regs);
>>> +        watchpoint_handler(regs);
>>> +        force_sig(SIGTRAP);
>>> +    }
>>>
>>>      irqentry_exit(regs, state);
>>>  #endif
>>>
>>
>
  
Qing Zhang Feb. 18, 2023, 6:31 a.m. UTC | #5
Hi, Xuerui

On 2023/2/17 下午6:16, WANG Xuerui wrote:
> On 2023/2/17 10:37, Qing Zhang wrote:
>> Use the generic ptrace_resume code for PTRACE_SYSCALL, PTRACE_CONT,
>> PTRACE_KILL and PTRACE_SINGLESTEP. This implies defining
>> arch_has_single_step in  and implementing the
>> user_enable_single_step and user_disable_single_step functions.
>>
>> LongArch has no hardware single-step register. the hardware single-step
> 
> "LoongArch cannot do hardware single-stepping per se" could be more 
> accurate a description. ("Hardware single-step register" could be 
> ambiguous in implying the only way to achieve single-stepping would be 
> via software, but in fact you explained in the next sentence that it's 
> actually doable, just in an alternate way.)
> 
>> function multiplex fetch instruction watchpoint(FWPS) and specifies that
>> the next instruction must trigger the watch exception by setting the 
>> mask bit.
> 
> I assume the "multiplex" is translated from "复用", in which case it's 
> not used in the exact same way as in Chinese. Plus you have the subject 
> wrong, "hardware single-step function" is the topic but not the actor 
> doing the "multiplexing".
> 
> My take (continuing from the previous sentence I suggested):
> 
> "Instead it is achieved by configuring the instruction fetch watchpoints 
> so that WPEF is triggered on the next instruction."
> 
>> Some scenarios use CSR.FWPS.Skip to ignore the next hit result, not to 
>> trigger
>> the watchpoint exception, and proceed to the next instruction.
> 
> Again, wrong topicalization ("scenarios" is not the actor/subject). "In 
> some scenarios CSR.FWPS.Skip is used to ..." sounds better.
> 
> And the multiple "next" usages sound unclear, you could probably reword 
> this a bit. (I can't immediately understand this part so I'm unable to 
> provide a suggestion.)
> 
>>
>> Signed-off-by: Qing Zhang <zhangqing@loongson.cn>
>> ---
>>   arch/loongarch/include/asm/inst.h      | 39 +++++++++++++++
>>   arch/loongarch/include/asm/loongarch.h |  3 ++
>>   arch/loongarch/include/asm/processor.h |  3 ++
>>   arch/loongarch/include/asm/ptrace.h    |  2 +
>>   arch/loongarch/kernel/hw_breakpoint.c  | 35 +++++++++++--
>>   arch/loongarch/kernel/ptrace.c         | 68 ++++++++++++++++++++++++++
>>   arch/loongarch/kernel/traps.c          | 34 +++++++++++--
>>   7 files changed, 176 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/loongarch/include/asm/inst.h 
>> b/arch/loongarch/include/asm/inst.h
>> index ba18ce8fbdf2..00c6f261d9a2 100644
>> --- a/arch/loongarch/include/asm/inst.h
>> +++ b/arch/loongarch/include/asm/inst.h
>> @@ -368,6 +368,45 @@ static inline bool is_stack_alloc_ins(union 
>> loongarch_instruction *ip)
>>           is_imm12_negative(ip->reg2i12_format.immediate);
>>   }
>> +static inline bool branch_ins_target_pc(union loongarch_instruction *ip)
> 
> This name is not faithfully reflecting its purpose, which is to check if 
> the given instruction will change PC; and the reason this has to be done 
> in the first place being that, single stepping would never stop due to 
> current hardware erratum if the PC would stay the same after the insn's 
> execution.
> 
> You may want to add some comments regarding the erratum too if you're in 
> a position to disclose more details.
> >> +{
>> +    switch (ip->reg0i26_format.opcode) {
>> +    case b_op:
>> +    case bl_op:
>> +        if (ip->reg0i26_format.immediate_l == 0
>> +           && ip->reg0i26_format.immediate_h == 0)
>> +            return false;
>> +    }
>> +
>> +    switch (ip->reg1i21_format.opcode) {
>> +    case beqz_op:
>> +    case bnez_op:
>> +    case bceqz_op:
>> +        if (ip->reg1i21_format.immediate_l == 0
>> +           && ip->reg1i21_format.immediate_h == 0)
>> +            return false;
>> +    }
>> +
>> +    switch (ip->reg2i16_format.opcode) {
>> +    case jirl_op:
>> +        if (ip->reg2i16_format.rj == 0x1
>> +           && ip->reg2i16_format.rd == 0x1
> Given the appearance of the other cases, I feel like the triggering 
> condition of the nonterminating singlestepping is that "new PC == 
> current PC" always hold. So, isn't every `rd == rj && imm == 0` case 
> problematic?
> 
We just have to think about dest_pc == current_pc,
Not every case where 'rd == rj &&imm == 0' is a problem.

current_pc = csr_era (The user mode of the Exception pc)
dest_pc = rj + offset

eg:
jirl ra,ra 0
1. When rj == ra, current_pc = dest_pc.
jirl t0,t0 0
2. When rj is another register, if the value in it is equal to ra, 
current_pc = dest_pc. The value of rj is recovered by pt_reg.

Therefore, rd does not need comparison, I modified jirl's judgment and
others in v5.

Thanks,
-Qing
>> +           && ip->reg2i16_format.immediate == 0)
>> +            return false;
>> +        break;
>> +    case beq_op:
>> +    case bne_op:
>> +    case blt_op:
>> +    case bge_op:
>> +    case bltu_op:
>> +    case bgeu_op:
>> +        if (ip->reg2i16_format.immediate == 0)
>> +            return false;
>> +    }
>> +
>> +    return true;
>> +}
>> +
>>   void simu_pc(struct pt_regs *regs, union loongarch_instruction insn);
>>   void simu_branch(struct pt_regs *regs, union loongarch_instruction 
>> insn);
>> diff --git a/arch/loongarch/include/asm/loongarch.h 
>> b/arch/loongarch/include/asm/loongarch.h
>> index e9aed583a064..65b7dcdea16d 100644
>> --- a/arch/loongarch/include/asm/loongarch.h
>> +++ b/arch/loongarch/include/asm/loongarch.h
>> @@ -1055,6 +1055,9 @@ static __always_inline void iocsr_write64(u64 
>> val, u32 reg)
>>   #define LOONGARCH_CSR_DERA        0x501    /* debug era */
>>   #define LOONGARCH_CSR_DESAVE        0x502    /* debug save */
>> +#define CSR_FWPC_SKIP_SHIFT        16
>> +#define CSR_FWPC_SKIP            (_ULCAST_(1) << CSR_FWPC_SKIP_SHIFT)
>> +
>>   /*
>>    * CSR_ECFG IM
>>    */
>> diff --git a/arch/loongarch/include/asm/processor.h 
>> b/arch/loongarch/include/asm/processor.h
>> index db060c5a976f..3ea0f1910c23 100644
>> --- a/arch/loongarch/include/asm/processor.h
>> +++ b/arch/loongarch/include/asm/processor.h
>> @@ -131,6 +131,9 @@ struct thread_struct {
>>       struct perf_event    *hbp_break[LOONGARCH_MAX_BRP];
>>       struct perf_event    *hbp_watch[LOONGARCH_MAX_WRP];
>> +    /* Used by ptrace single_step */
> 
> "PTRACE_SINGLESTEP" if you want to refer to the technical name of the 
> feature, or "ptrace single stepping" which is more natural language.
> 
>> +    unsigned long single_step;
>> +
>>       /*
>>        * FPU & vector registers, must be at last because
>>        * they are conditionally copied at fork().
>> diff --git a/arch/loongarch/include/asm/ptrace.h 
>> b/arch/loongarch/include/asm/ptrace.h
>> index 58596c4f8a0f..66a0e6c480a3 100644
>> --- a/arch/loongarch/include/asm/ptrace.h
>> +++ b/arch/loongarch/include/asm/ptrace.h
>> @@ -150,4 +150,6 @@ static inline void user_stack_pointer_set(struct 
>> pt_regs *regs,
>>       regs->regs[3] = val;
>>   }
>> +#define arch_has_single_step()        (1)
>> +
>>   #endif /* _ASM_PTRACE_H */
>> diff --git a/arch/loongarch/kernel/hw_breakpoint.c 
>> b/arch/loongarch/kernel/hw_breakpoint.c
>> index 6431cd319c32..75d3652fbe00 100644
>> --- a/arch/loongarch/kernel/hw_breakpoint.c
>> +++ b/arch/loongarch/kernel/hw_breakpoint.c
>> @@ -153,6 +153,22 @@ static int hw_breakpoint_slot_setup(struct 
>> perf_event **slots, int max_slots,
>>    */
>>   void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
>>   {
>> +    int i;
>> +    struct thread_struct *t = &tsk->thread;
>> +
>> +    for (i = 0; i < LOONGARCH_MAX_BRP; i++) {
>> +        if (t->hbp_break[i]) {
>> +            unregister_hw_breakpoint(t->hbp_break[i]);
>> +            t->hbp_break[i] = NULL;
>> +        }
>> +    }
>> +
>> +    for (i = 0; i < LOONGARCH_MAX_WRP; i++) {
>> +        if (t->hbp_watch[i]) {
>> +            unregister_hw_breakpoint(t->hbp_watch[i]);
>> +            t->hbp_watch[i] = NULL;
>> +        }
>> +    }
>>   }
>>   void ptrace_hw_copy_thread(struct task_struct *tsk)
>> @@ -498,11 +514,20 @@ arch_initcall(arch_hw_breakpoint_init);
>>   void hw_breakpoint_thread_switch(struct task_struct *next)
>>   {
>>       struct pt_regs *regs = task_pt_regs(next);
>> -
>> -    /* Update breakpoints */
>> -    update_bp_registers(regs, 1, 0);
>> -    /* Update watchpoints */
>> -    update_bp_registers(regs, 1, 1);
>> +    u64 addr, mask;
>> +
>> +    if (test_bit(TIF_SINGLESTEP, &task_thread_info(next)->flags)) {
>> +        addr = read_wb_reg(CSR_CFG_ADDR, 0, 0);
>> +        mask = read_wb_reg(CSR_CFG_MASK, 0, 0);
>> +        if ((task_pt_regs(next)->csr_era & ~mask) == (addr & ~mask))
>> +            csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
>> +        regs->csr_prmd |= CSR_PRMD_PWE;
>> +    } else {
>> +        /* Update breakpoints */
>> +        update_bp_registers(regs, 1, 0);
>> +        /* Update watchpoints */
>> +        update_bp_registers(regs, 1, 1);
>> +    }
>>   }
>>   void hw_breakpoint_pmu_read(struct perf_event *bp)
>> diff --git a/arch/loongarch/kernel/ptrace.c 
>> b/arch/loongarch/kernel/ptrace.c
>> index bee4194177fd..52a3ee4366f4 100644
>> --- a/arch/loongarch/kernel/ptrace.c
>> +++ b/arch/loongarch/kernel/ptrace.c
>> @@ -20,6 +20,7 @@
>>   #include <linux/context_tracking.h>
>>   #include <linux/elf.h>
>>   #include <linux/errno.h>
>> +#include <linux/hw_breakpoint.h>
>>   #include <linux/mm.h>
>>   #include <linux/ptrace.h>
>>   #include <linux/regset.h>
>> @@ -30,6 +31,7 @@
>>   #include <linux/stddef.h>
>>   #include <linux/seccomp.h>
>>   #include <linux/uaccess.h>
>> +#include <linux/thread_info.h>
>>   #include <asm/byteorder.h>
>>   #include <asm/cpu.h>
>> @@ -39,6 +41,7 @@
>>   #include <asm/page.h>
>>   #include <asm/pgtable.h>
>>   #include <asm/processor.h>
>> +#include <asm/ptrace.h>
>>   #include <asm/reg.h>
>>   #include <asm/syscall.h>
>> @@ -541,3 +544,68 @@ long arch_ptrace(struct task_struct *child, long 
>> request,
>>       return ret;
>>   }
>> +
>> +void ptrace_triggered(struct perf_event *bp,
>> +              struct perf_sample_data *data, struct pt_regs *regs)
>> +{
>> +    struct perf_event_attr attr;
>> +
>> +    attr = bp->attr;
>> +    attr.disabled = true;
>> +    modify_user_hw_breakpoint(bp, &attr);
>> +}
>> +
>> +static int set_single_step(struct task_struct *tsk, unsigned long addr)
>> +{
>> +    struct thread_struct *thread = &tsk->thread;
>> +    struct perf_event *bp;
>> +    struct perf_event_attr attr;
>> +    struct arch_hw_breakpoint *info;
>> +
>> +    bp = thread->hbp_break[0];
>> +    if (!bp) {
>> +        ptrace_breakpoint_init(&attr);
>> +
>> +        attr.bp_addr = addr;
>> +        attr.bp_len = HW_BREAKPOINT_LEN_8;
>> +        attr.bp_type = HW_BREAKPOINT_X;
>> +
>> +        bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
>> +                         NULL, tsk);
>> +        if (IS_ERR(bp))
>> +            return PTR_ERR(bp);
>> +
>> +        thread->hbp_break[0] = bp;
>> +    } else {
>> +        int err;
>> +
>> +        attr = bp->attr;
>> +        attr.bp_addr = addr;
>> +        /* reenable breakpoint */
>> +        attr.disabled = false;
>> +        err = modify_user_hw_breakpoint(bp, &attr);
>> +        if (unlikely(err))
>> +            return err;
>> +
>> +        csr_write64(attr.bp_addr, LOONGARCH_CSR_IB0ADDR);
>> +    }
>> +    info = counter_arch_bp(bp);
>> +    info->mask = 0xffffffffffff;
>> +
>> +    return 0;
>> +}
>> +
>> +/* ptrace API */
>> +void user_enable_single_step(struct task_struct *task)
>> +{
>> +    struct thread_info *ti = task_thread_info(task);
>> +
>> +    set_single_step(task, task_pt_regs(task)->csr_era);
>> +    task->thread.single_step = task_pt_regs(task)->csr_era;
>> +    set_ti_thread_flag(ti, TIF_SINGLESTEP);
>> +}
>> +
>> +void user_disable_single_step(struct task_struct *task)
>> +{
>> +    clear_tsk_thread_flag(task, TIF_SINGLESTEP);
>> +}
>> diff --git a/arch/loongarch/kernel/traps.c 
>> b/arch/loongarch/kernel/traps.c
>> index 2b133079e0f3..a59275a43bd1 100644
>> --- a/arch/loongarch/kernel/traps.c
>> +++ b/arch/loongarch/kernel/traps.c
>> @@ -511,9 +511,37 @@ asmlinkage void noinstr do_watch(struct pt_regs 
>> *regs)
>>   #ifdef CONFIG_HAVE_HW_BREAKPOINT
>>       irqentry_state_t state = irqentry_enter(regs);
>> -    breakpoint_handler(regs);
>> -    watchpoint_handler(regs);
>> -    force_sig(SIGTRAP);
>> +    if (test_tsk_thread_flag(current, TIF_SINGLESTEP)) {
>> +        int llbit = (csr_read32(LOONGARCH_CSR_LLBCTL) & 0x1);
>> +        unsigned long pc = regs->csr_era;
>> +        union loongarch_instruction *ip = (union 
>> loongarch_instruction *)pc;
>> +
>> +        if (llbit) {
>> +        /*
>> +         * When the ll-sc combo is encountered, it is regarded as an 
>> single
>> +         * instruction. So don't clear llbit and reset CSR.FWPS.Skip 
>> until
>> +         * the llsc execution is completed.
>> +         */
>> +            csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
>> +            csr_write32(CSR_LLBCTL_KLO, LOONGARCH_CSR_LLBCTL);
>> +        } else if (pc == current->thread.single_step) {
>> +        /*
>> +         * Maybe some hardware Bug caused that certain insns are 
>> occasionally
>> +         * not skipped when CSR.FWPS.Skip is set, such as 
>> fld.d/fst.d. So Singlestep
>> +         * needs to compare whether the csr_era is equal to the value 
>> of singlestep
>> +         * which last time set.
>> +         */
>> +            if (branch_ins_target_pc(ip))
>> +            /* Prevent rd == pc from causing single step to fail to 
>> stop */
>> +                csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
>> +        } else {
>> +            force_sig(SIGTRAP);
>> +        }
>> +    } else {
>> +        breakpoint_handler(regs);
>> +        watchpoint_handler(regs);
>> +        force_sig(SIGTRAP);
>> +    }
>>       irqentry_exit(regs, state);
>>   #endif
>
  
Qing Zhang Feb. 18, 2023, 6:38 a.m. UTC | #6
Hi, Jinyang

On 2023/2/18 上午9:21, Jinyang He wrote:
> 
> On 2023-02-17 18:00, Qing Zhang wrote:
>> Hi, Youling
>>
>> On 2023/2/17 下午5:50, Youling Tang wrote:
>>> Hi, Qing
>>>
>>> On 02/17/2023 10:37 AM, Qing Zhang wrote:
>>>> Use the generic ptrace_resume code for PTRACE_SYSCALL, PTRACE_CONT,
>>>> PTRACE_KILL and PTRACE_SINGLESTEP. This implies defining
>>>> arch_has_single_step in  and implementing the
>>>> user_enable_single_step and user_disable_single_step functions.
>>>>
>>>> LongArch has no hardware single-step register. the hardware single-step
>>>> function multiplex fetch instruction watchpoint(FWPS) and specifies 
>>>> that
>>>> the next instruction must trigger the watch exception by setting the 
>>>> mask bit.
>>>> Some scenarios use CSR.FWPS.Skip to ignore the next hit result, not 
>>>> to trigger
>>>> the watchpoint exception, and proceed to the next instruction.
>>>>
>>>> Signed-off-by: Qing Zhang <zhangqing@loongson.cn>
>>>> ---
>>>>  arch/loongarch/include/asm/inst.h      | 39 +++++++++++++++
>>>>  arch/loongarch/include/asm/loongarch.h |  3 ++
>>>>  arch/loongarch/include/asm/processor.h |  3 ++
>>>>  arch/loongarch/include/asm/ptrace.h    |  2 +
>>>>  arch/loongarch/kernel/hw_breakpoint.c  | 35 +++++++++++--
>>>>  arch/loongarch/kernel/ptrace.c         | 68 ++++++++++++++++++++++++++
>>>>  arch/loongarch/kernel/traps.c          | 34 +++++++++++--
>>>>  7 files changed, 176 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/arch/loongarch/include/asm/inst.h 
>>>> b/arch/loongarch/include/asm/inst.h
>>>> index ba18ce8fbdf2..00c6f261d9a2 100644
>>>> --- a/arch/loongarch/include/asm/inst.h
>>>> +++ b/arch/loongarch/include/asm/inst.h
>>>> @@ -368,6 +368,45 @@ static inline bool is_stack_alloc_ins(union 
>>>> loongarch_instruction *ip)
>>>>          is_imm12_negative(ip->reg2i12_format.immediate);
>>>>  }
>>>>
>>>> +static inline bool branch_ins_target_pc(union loongarch_instruction 
>>>> *ip)
>>>> +{
>>>> +    switch (ip->reg0i26_format.opcode) {
>>>> +    case b_op:
>>>> +    case bl_op:
>>>> +        if (ip->reg0i26_format.immediate_l == 0
>>>> +           && ip->reg0i26_format.immediate_h == 0)
>>>> +            return false;
>>>> +    }
>>>> +
>>>> +    switch (ip->reg1i21_format.opcode) {
>>>> +    case beqz_op:
>>>> +    case bnez_op:
>>>> +    case bceqz_op:
>>>> +        if (ip->reg1i21_format.immediate_l == 0
>>>> +           && ip->reg1i21_format.immediate_h == 0)
>>>> +            return false;
>>>> +    }
>>>> +
>>>> +    switch (ip->reg2i16_format.opcode) {
>>>> +    case jirl_op:
>>>> +        if (ip->reg2i16_format.rj == 0x1
>>>> +           && ip->reg2i16_format.rd == 0x1
>>> LOONGARCH_GPR_RA can be used instead of 0x1.
>>>
>>>> +           && ip->reg2i16_format.immediate == 0)
>>>> +            return false;
>>>> +        break;
>>>> +    case beq_op:
>>>> +    case bne_op:
>>>> +    case blt_op:
>>>> +    case bge_op:
>>>> +    case bltu_op:
>>>> +    case bgeu_op:
>>>> +        if (ip->reg2i16_format.immediate == 0)
>>>> +            return false;
>>>> +    }
>>>> +
>>>> +    return true;
>>>> +}
>>>> +
>>>>  void simu_pc(struct pt_regs *regs, union loongarch_instruction insn);
>>>>  void simu_branch(struct pt_regs *regs, union loongarch_instruction 
>>>> insn);
>>>>
>>>> diff --git a/arch/loongarch/include/asm/loongarch.h 
>>>> b/arch/loongarch/include/asm/loongarch.h
>>>> index e9aed583a064..65b7dcdea16d 100644
>>>> --- a/arch/loongarch/include/asm/loongarch.h
>>>> +++ b/arch/loongarch/include/asm/loongarch.h
>>>> @@ -1055,6 +1055,9 @@ static __always_inline void iocsr_write64(u64 
>>>> val, u32 reg)
>>>>  #define LOONGARCH_CSR_DERA        0x501    /* debug era */
>>>>  #define LOONGARCH_CSR_DESAVE        0x502    /* debug save */
>>>>
>>>> +#define CSR_FWPC_SKIP_SHIFT        16
>>>> +#define CSR_FWPC_SKIP            (_ULCAST_(1) << CSR_FWPC_SKIP_SHIFT)
>>>> +
>>>>  /*
>>>>   * CSR_ECFG IM
>>>>   */
>>>> diff --git a/arch/loongarch/include/asm/processor.h 
>>>> b/arch/loongarch/include/asm/processor.h
>>>> index db060c5a976f..3ea0f1910c23 100644
>>>> --- a/arch/loongarch/include/asm/processor.h
>>>> +++ b/arch/loongarch/include/asm/processor.h
>>>> @@ -131,6 +131,9 @@ struct thread_struct {
>>>>      struct perf_event    *hbp_break[LOONGARCH_MAX_BRP];
>>>>      struct perf_event    *hbp_watch[LOONGARCH_MAX_WRP];
>>>>
>>>> +    /* Used by ptrace single_step */
>>>> +    unsigned long single_step;
>>>> +
>>>>      /*
>>>>       * FPU & vector registers, must be at last because
>>>>       * they are conditionally copied at fork().
>>>> diff --git a/arch/loongarch/include/asm/ptrace.h 
>>>> b/arch/loongarch/include/asm/ptrace.h
>>>> index 58596c4f8a0f..66a0e6c480a3 100644
>>>> --- a/arch/loongarch/include/asm/ptrace.h
>>>> +++ b/arch/loongarch/include/asm/ptrace.h
>>>> @@ -150,4 +150,6 @@ static inline void user_stack_pointer_set(struct 
>>>> pt_regs *regs,
>>>>      regs->regs[3] = val;
>>>>  }
>>>>
>>>> +#define arch_has_single_step()        (1)
>>>> +
>>>>  #endif /* _ASM_PTRACE_H */
>>>> diff --git a/arch/loongarch/kernel/hw_breakpoint.c 
>>>> b/arch/loongarch/kernel/hw_breakpoint.c
>>>> index 6431cd319c32..75d3652fbe00 100644
>>>> --- a/arch/loongarch/kernel/hw_breakpoint.c
>>>> +++ b/arch/loongarch/kernel/hw_breakpoint.c
>>>> @@ -153,6 +153,22 @@ static int hw_breakpoint_slot_setup(struct 
>>>> perf_event **slots, int max_slots,
>>>>   */
>>>>  void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
>>>>  {
>>>> +    int i;
>>>> +    struct thread_struct *t = &tsk->thread;
>>>> +
>>>> +    for (i = 0; i < LOONGARCH_MAX_BRP; i++) {
>>>> +        if (t->hbp_break[i]) {
>>>> +            unregister_hw_breakpoint(t->hbp_break[i]);
>>>> +            t->hbp_break[i] = NULL;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    for (i = 0; i < LOONGARCH_MAX_WRP; i++) {
>>>> +        if (t->hbp_watch[i]) {
>>>> +            unregister_hw_breakpoint(t->hbp_watch[i]);
>>>> +            t->hbp_watch[i] = NULL;
>>>> +        }
>>>> +    }
>>>>  }
>>>>
>>>>  void ptrace_hw_copy_thread(struct task_struct *tsk)
>>>> @@ -498,11 +514,20 @@ arch_initcall(arch_hw_breakpoint_init);
>>>>  void hw_breakpoint_thread_switch(struct task_struct *next)
>>>>  {
>>>>      struct pt_regs *regs = task_pt_regs(next);
>>>> -
>>>> -    /* Update breakpoints */
>>>> -    update_bp_registers(regs, 1, 0);
>>>> -    /* Update watchpoints */
>>>> -    update_bp_registers(regs, 1, 1);
>>>> +    u64 addr, mask;
>>>> +
>>>> +    if (test_bit(TIF_SINGLESTEP, &task_thread_info(next)->flags)) {
>>>> +        addr = read_wb_reg(CSR_CFG_ADDR, 0, 0);
>>>> +        mask = read_wb_reg(CSR_CFG_MASK, 0, 0);
>>>> +        if ((task_pt_regs(next)->csr_era & ~mask) == (addr & ~mask))
>>>> +            csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
>>>> +        regs->csr_prmd |= CSR_PRMD_PWE;
>>>> +    } else {
>>>> +        /* Update breakpoints */
>>>> +        update_bp_registers(regs, 1, 0);
>>>> +        /* Update watchpoints */
>>>> +        update_bp_registers(regs, 1, 1);
>>>> +    }
>>>>  }
>>>>
>>>>  void hw_breakpoint_pmu_read(struct perf_event *bp)
>>>> diff --git a/arch/loongarch/kernel/ptrace.c 
>>>> b/arch/loongarch/kernel/ptrace.c
>>>> index bee4194177fd..52a3ee4366f4 100644
>>>> --- a/arch/loongarch/kernel/ptrace.c
>>>> +++ b/arch/loongarch/kernel/ptrace.c
>>>> @@ -20,6 +20,7 @@
>>>>  #include <linux/context_tracking.h>
>>>>  #include <linux/elf.h>
>>>>  #include <linux/errno.h>
>>>> +#include <linux/hw_breakpoint.h>
>>>>  #include <linux/mm.h>
>>>>  #include <linux/ptrace.h>
>>>>  #include <linux/regset.h>
>>>> @@ -30,6 +31,7 @@
>>>>  #include <linux/stddef.h>
>>>>  #include <linux/seccomp.h>
>>>>  #include <linux/uaccess.h>
>>>> +#include <linux/thread_info.h>
>>>>
>>>>  #include <asm/byteorder.h>
>>>>  #include <asm/cpu.h>
>>>> @@ -39,6 +41,7 @@
>>>>  #include <asm/page.h>
>>>>  #include <asm/pgtable.h>
>>>>  #include <asm/processor.h>
>>>> +#include <asm/ptrace.h>
>>>>  #include <asm/reg.h>
>>>>  #include <asm/syscall.h>
>>>>
>>>> @@ -541,3 +544,68 @@ long arch_ptrace(struct task_struct *child, 
>>>> long request,
>>>>
>>>>      return ret;
>>>>  }
>>>> +
>>>> +void ptrace_triggered(struct perf_event *bp,
>>>> +              struct perf_sample_data *data, struct pt_regs *regs)
>>>> +{
>>>> +    struct perf_event_attr attr;
>>>> +
>>>> +    attr = bp->attr;
>>>> +    attr.disabled = true;
>>>> +    modify_user_hw_breakpoint(bp, &attr);
>>>> +}
>>>> +
>>>> +static int set_single_step(struct task_struct *tsk, unsigned long 
>>>> addr)
>>>> +{
>>>> +    struct thread_struct *thread = &tsk->thread;
>>>> +    struct perf_event *bp;
>>>> +    struct perf_event_attr attr;
>>>> +    struct arch_hw_breakpoint *info;
>>>> +
>>>> +    bp = thread->hbp_break[0];
>>>> +    if (!bp) {
>>>> +        ptrace_breakpoint_init(&attr);
>>>> +
>>>> +        attr.bp_addr = addr;
>>>> +        attr.bp_len = HW_BREAKPOINT_LEN_8;
>>>> +        attr.bp_type = HW_BREAKPOINT_X;
>>>> +
>>>> +        bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
>>>> +                         NULL, tsk);
>>>> +        if (IS_ERR(bp))
>>>> +            return PTR_ERR(bp);
>>>> +
>>>> +        thread->hbp_break[0] = bp;
>>>> +    } else {
>>>> +        int err;
>>>> +
>>>> +        attr = bp->attr;
>>>> +        attr.bp_addr = addr;
>>>> +        /* reenable breakpoint */
>>>> +        attr.disabled = false;
>>>> +        err = modify_user_hw_breakpoint(bp, &attr);
>>>> +        if (unlikely(err))
>>>> +            return err;
>>>> +
>>>> +        csr_write64(attr.bp_addr, LOONGARCH_CSR_IB0ADDR);
>>>> +    }
>>>> +    info = counter_arch_bp(bp);
>>>> +    info->mask = 0xffffffffffff;
>>> If `mask` is only used for user space address mask, it should not be
>>> fixed to 0xffffffffffff, the user space address size may have many
>>> situations, which can be defined according to TASK_SIZE.
>>>
>>> Use (TASK_SIZE - 1) instead.
>>>
>> ok, got it.
> 
> But according to to [1], the priority of watchpoint exception is highest
> in the priority of exception which all fetching stage. Thus, we may get
> the exception unexpected if set MASK to (TASK_SIZE - 1).
> 
> e.g.
>      0x1000: addi.d $rd, $zero, -1 # a way to set a kernel address in rd
>      0x1004: jr $rd
> 
> If singlestep is set at 0x1004, it will fetch insn in kernel address and
> and get operation address error exception if we set MASK to (TASK_SIZE - 
> 1).
> 
> 
> [1] 
> https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#exception-priority 
> 
> 
In the future, breakpoints at kernel_enable_single_step will be added, 
possibly with mask set to -1.
Now, because FWPS CTRL is set to 1 for PLV0-3, mask is set to 
0xffffffffffff at user_enable_single_step.
As discussed below, user_code will be added to the breakpoint framework.

Thanks,
-Qing
> 
> Thanks,
> 
> Jinyang
> 
> 
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +/* ptrace API */
>>>> +void user_enable_single_step(struct task_struct *task)
>>>> +{
>>>> +    struct thread_info *ti = task_thread_info(task);
>>>> +
>>>> +    set_single_step(task, task_pt_regs(task)->csr_era);
>>>> +    task->thread.single_step = task_pt_regs(task)->csr_era;
>>>> +    set_ti_thread_flag(ti, TIF_SINGLESTEP);
>>>> +}
>>>> +
>>>> +void user_disable_single_step(struct task_struct *task)
>>>> +{
>>>> +    clear_tsk_thread_flag(task, TIF_SINGLESTEP);
>>>> +}
>>>> diff --git a/arch/loongarch/kernel/traps.c 
>>>> b/arch/loongarch/kernel/traps.c
>>>> index 2b133079e0f3..a59275a43bd1 100644
>>>> --- a/arch/loongarch/kernel/traps.c
>>>> +++ b/arch/loongarch/kernel/traps.c
>>>> @@ -511,9 +511,37 @@ asmlinkage void noinstr do_watch(struct pt_regs 
>>>> *regs)
>>>>  #ifdef CONFIG_HAVE_HW_BREAKPOINT
>>>>      irqentry_state_t state = irqentry_enter(regs);
>>>>
>>>> -    breakpoint_handler(regs);
>>>> -    watchpoint_handler(regs);
>>>> -    force_sig(SIGTRAP);
>>>> +    if (test_tsk_thread_flag(current, TIF_SINGLESTEP)) {
>>>> +        int llbit = (csr_read32(LOONGARCH_CSR_LLBCTL) & 0x1);
>>>> +        unsigned long pc = regs->csr_era;
>>>> +        union loongarch_instruction *ip = (union 
>>>> loongarch_instruction *)pc;
>>>> +
>>>> +        if (llbit) {
>>>> +        /*
>>>> +         * When the ll-sc combo is encountered, it is regarded as 
>>>> an single
>>>> +         * instruction. So don't clear llbit and reset 
>>>> CSR.FWPS.Skip until
>>>> +         * the llsc execution is completed.
>>>> +         */
>>> Comments should be aligned with code, similar modifications in other
>>> places.
>>>
>>>> +            csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
>>>> +            csr_write32(CSR_LLBCTL_KLO, LOONGARCH_CSR_LLBCTL);
>>>> +        } else if (pc == current->thread.single_step) {
>>>> +        /*
>>>> +         * Maybe some hardware Bug caused that certain insns are 
>>>> occasionally
>>>> +         * not skipped when CSR.FWPS.Skip is set, such as 
>>>> fld.d/fst.d. So Singlestep
>>>> +         * needs to compare whether the csr_era is equal to the 
>>>> value of singlestep
>>>> +         * which last time set.
>>>> +         */
>>>> +            if (branch_ins_target_pc(ip))
>>>> +            /* Prevent rd == pc from causing single step to fail to 
>>>> stop */
>>> IMO, that comment description is not accurate enough.ok, I'll add 
>>> more comments.
>>
>> Thanks,
>> -Qing
>>> Youling.
>>>
>>>> + csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
>>>> +        } else {
>>>> +            force_sig(SIGTRAP);
>>>> +        }
>>>> +    } else {
>>>> +        breakpoint_handler(regs);
>>>> +        watchpoint_handler(regs);
>>>> +        force_sig(SIGTRAP);
>>>> +    }
>>>>
>>>>      irqentry_exit(regs, state);
>>>>  #endif
>>>>
>>>
>>
>
  

Patch

diff --git a/arch/loongarch/include/asm/inst.h b/arch/loongarch/include/asm/inst.h
index ba18ce8fbdf2..00c6f261d9a2 100644
--- a/arch/loongarch/include/asm/inst.h
+++ b/arch/loongarch/include/asm/inst.h
@@ -368,6 +368,45 @@  static inline bool is_stack_alloc_ins(union loongarch_instruction *ip)
 		is_imm12_negative(ip->reg2i12_format.immediate);
 }
 
+static inline bool branch_ins_target_pc(union loongarch_instruction *ip)
+{
+	switch (ip->reg0i26_format.opcode) {
+	case b_op:
+	case bl_op:
+		if (ip->reg0i26_format.immediate_l == 0
+		   && ip->reg0i26_format.immediate_h == 0)
+			return false;
+	}
+
+	switch (ip->reg1i21_format.opcode) {
+	case beqz_op:
+	case bnez_op:
+	case bceqz_op:
+		if (ip->reg1i21_format.immediate_l == 0
+		   && ip->reg1i21_format.immediate_h == 0)
+			return false;
+	}
+
+	switch (ip->reg2i16_format.opcode) {
+	case jirl_op:
+		if (ip->reg2i16_format.rj == 0x1
+		   && ip->reg2i16_format.rd == 0x1
+		   && ip->reg2i16_format.immediate == 0)
+			return false;
+		break;
+	case beq_op:
+	case bne_op:
+	case blt_op:
+	case bge_op:
+	case bltu_op:
+	case bgeu_op:
+		if (ip->reg2i16_format.immediate == 0)
+			return false;
+	}
+
+	return true;
+}
+
 void simu_pc(struct pt_regs *regs, union loongarch_instruction insn);
 void simu_branch(struct pt_regs *regs, union loongarch_instruction insn);
 
diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h
index e9aed583a064..65b7dcdea16d 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -1055,6 +1055,9 @@  static __always_inline void iocsr_write64(u64 val, u32 reg)
 #define LOONGARCH_CSR_DERA		0x501	/* debug era */
 #define LOONGARCH_CSR_DESAVE		0x502	/* debug save */
 
+#define CSR_FWPC_SKIP_SHIFT		16
+#define CSR_FWPC_SKIP			(_ULCAST_(1) << CSR_FWPC_SKIP_SHIFT)
+
 /*
  * CSR_ECFG IM
  */
diff --git a/arch/loongarch/include/asm/processor.h b/arch/loongarch/include/asm/processor.h
index db060c5a976f..3ea0f1910c23 100644
--- a/arch/loongarch/include/asm/processor.h
+++ b/arch/loongarch/include/asm/processor.h
@@ -131,6 +131,9 @@  struct thread_struct {
 	struct perf_event	*hbp_break[LOONGARCH_MAX_BRP];
 	struct perf_event	*hbp_watch[LOONGARCH_MAX_WRP];
 
+	/* Used by ptrace single_step */
+	unsigned long single_step;
+
 	/*
 	 * FPU & vector registers, must be at last because
 	 * they are conditionally copied at fork().
diff --git a/arch/loongarch/include/asm/ptrace.h b/arch/loongarch/include/asm/ptrace.h
index 58596c4f8a0f..66a0e6c480a3 100644
--- a/arch/loongarch/include/asm/ptrace.h
+++ b/arch/loongarch/include/asm/ptrace.h
@@ -150,4 +150,6 @@  static inline void user_stack_pointer_set(struct pt_regs *regs,
 	regs->regs[3] = val;
 }
 
+#define arch_has_single_step()		(1)
+
 #endif /* _ASM_PTRACE_H */
diff --git a/arch/loongarch/kernel/hw_breakpoint.c b/arch/loongarch/kernel/hw_breakpoint.c
index 6431cd319c32..75d3652fbe00 100644
--- a/arch/loongarch/kernel/hw_breakpoint.c
+++ b/arch/loongarch/kernel/hw_breakpoint.c
@@ -153,6 +153,22 @@  static int hw_breakpoint_slot_setup(struct perf_event **slots, int max_slots,
  */
 void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
 {
+	int i;
+	struct thread_struct *t = &tsk->thread;
+
+	for (i = 0; i < LOONGARCH_MAX_BRP; i++) {
+		if (t->hbp_break[i]) {
+			unregister_hw_breakpoint(t->hbp_break[i]);
+			t->hbp_break[i] = NULL;
+		}
+	}
+
+	for (i = 0; i < LOONGARCH_MAX_WRP; i++) {
+		if (t->hbp_watch[i]) {
+			unregister_hw_breakpoint(t->hbp_watch[i]);
+			t->hbp_watch[i] = NULL;
+		}
+	}
 }
 
 void ptrace_hw_copy_thread(struct task_struct *tsk)
@@ -498,11 +514,20 @@  arch_initcall(arch_hw_breakpoint_init);
 void hw_breakpoint_thread_switch(struct task_struct *next)
 {
 	struct pt_regs *regs = task_pt_regs(next);
-
-	/* Update breakpoints */
-	update_bp_registers(regs, 1, 0);
-	/* Update watchpoints */
-	update_bp_registers(regs, 1, 1);
+	u64 addr, mask;
+
+	if (test_bit(TIF_SINGLESTEP, &task_thread_info(next)->flags)) {
+		addr = read_wb_reg(CSR_CFG_ADDR, 0, 0);
+		mask = read_wb_reg(CSR_CFG_MASK, 0, 0);
+		if ((task_pt_regs(next)->csr_era & ~mask) == (addr & ~mask))
+			csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
+		regs->csr_prmd |= CSR_PRMD_PWE;
+	} else {
+		/* Update breakpoints */
+		update_bp_registers(regs, 1, 0);
+		/* Update watchpoints */
+		update_bp_registers(regs, 1, 1);
+	}
 }
 
 void hw_breakpoint_pmu_read(struct perf_event *bp)
diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c
index bee4194177fd..52a3ee4366f4 100644
--- a/arch/loongarch/kernel/ptrace.c
+++ b/arch/loongarch/kernel/ptrace.c
@@ -20,6 +20,7 @@ 
 #include <linux/context_tracking.h>
 #include <linux/elf.h>
 #include <linux/errno.h>
+#include <linux/hw_breakpoint.h>
 #include <linux/mm.h>
 #include <linux/ptrace.h>
 #include <linux/regset.h>
@@ -30,6 +31,7 @@ 
 #include <linux/stddef.h>
 #include <linux/seccomp.h>
 #include <linux/uaccess.h>
+#include <linux/thread_info.h>
 
 #include <asm/byteorder.h>
 #include <asm/cpu.h>
@@ -39,6 +41,7 @@ 
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
+#include <asm/ptrace.h>
 #include <asm/reg.h>
 #include <asm/syscall.h>
 
@@ -541,3 +544,68 @@  long arch_ptrace(struct task_struct *child, long request,
 
 	return ret;
 }
+
+void ptrace_triggered(struct perf_event *bp,
+		      struct perf_sample_data *data, struct pt_regs *regs)
+{
+	struct perf_event_attr attr;
+
+	attr = bp->attr;
+	attr.disabled = true;
+	modify_user_hw_breakpoint(bp, &attr);
+}
+
+static int set_single_step(struct task_struct *tsk, unsigned long addr)
+{
+	struct thread_struct *thread = &tsk->thread;
+	struct perf_event *bp;
+	struct perf_event_attr attr;
+	struct arch_hw_breakpoint *info;
+
+	bp = thread->hbp_break[0];
+	if (!bp) {
+		ptrace_breakpoint_init(&attr);
+
+		attr.bp_addr = addr;
+		attr.bp_len = HW_BREAKPOINT_LEN_8;
+		attr.bp_type = HW_BREAKPOINT_X;
+
+		bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
+						 NULL, tsk);
+		if (IS_ERR(bp))
+			return PTR_ERR(bp);
+
+		thread->hbp_break[0] = bp;
+	} else {
+		int err;
+
+		attr = bp->attr;
+		attr.bp_addr = addr;
+		/* reenable breakpoint */
+		attr.disabled = false;
+		err = modify_user_hw_breakpoint(bp, &attr);
+		if (unlikely(err))
+			return err;
+
+		csr_write64(attr.bp_addr, LOONGARCH_CSR_IB0ADDR);
+	}
+	info = counter_arch_bp(bp);
+	info->mask = 0xffffffffffff;
+
+	return 0;
+}
+
+/* ptrace API */
+void user_enable_single_step(struct task_struct *task)
+{
+	struct thread_info *ti = task_thread_info(task);
+
+	set_single_step(task, task_pt_regs(task)->csr_era);
+	task->thread.single_step = task_pt_regs(task)->csr_era;
+	set_ti_thread_flag(ti, TIF_SINGLESTEP);
+}
+
+void user_disable_single_step(struct task_struct *task)
+{
+	clear_tsk_thread_flag(task, TIF_SINGLESTEP);
+}
diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c
index 2b133079e0f3..a59275a43bd1 100644
--- a/arch/loongarch/kernel/traps.c
+++ b/arch/loongarch/kernel/traps.c
@@ -511,9 +511,37 @@  asmlinkage void noinstr do_watch(struct pt_regs *regs)
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 	irqentry_state_t state = irqentry_enter(regs);
 
-	breakpoint_handler(regs);
-	watchpoint_handler(regs);
-	force_sig(SIGTRAP);
+	if (test_tsk_thread_flag(current, TIF_SINGLESTEP)) {
+		int llbit = (csr_read32(LOONGARCH_CSR_LLBCTL) & 0x1);
+		unsigned long pc = regs->csr_era;
+		union loongarch_instruction *ip = (union loongarch_instruction *)pc;
+
+		if (llbit) {
+		/*
+		 * When the ll-sc combo is encountered, it is regarded as an single
+		 * instruction. So don't clear llbit and reset CSR.FWPS.Skip until
+		 * the llsc execution is completed.
+		 */
+			csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
+			csr_write32(CSR_LLBCTL_KLO, LOONGARCH_CSR_LLBCTL);
+		} else if (pc == current->thread.single_step) {
+		/*
+		 * Maybe some hardware Bug caused that certain insns are occasionally
+		 * not skipped when CSR.FWPS.Skip is set, such as fld.d/fst.d. So Singlestep
+		 * needs to compare whether the csr_era is equal to the value of singlestep
+		 * which last time set.
+		 */
+			if (branch_ins_target_pc(ip))
+			/* Prevent rd == pc from causing single step to fail to stop */
+				csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
+		} else {
+			force_sig(SIGTRAP);
+		}
+	} else {
+		breakpoint_handler(regs);
+		watchpoint_handler(regs);
+		force_sig(SIGTRAP);
+	}
 
 	irqentry_exit(regs, state);
 #endif