@@ -37,7 +37,23 @@ static inline void flush_dcache_page(struct page *page)
flush_icache_mm(vma->vm_mm, 0)
#ifdef CONFIG_64BIT
-#define flush_cache_vmap(start, end) flush_tlb_kernel_range(start, end)
+extern u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
+extern char _end[];
+#define flush_cache_vmap flush_cache_vmap
+static inline void flush_cache_vmap(unsigned long start, unsigned long end)
+{
+ if (is_vmalloc_or_module_addr((void *)start)) {
+ int i;
+
+ /*
+ * We don't care if a cpu concurrently resets this value since
+ * the only place this can happen is in handle_exception() where
+ * an sfence.vma is emitted.
+ */
+ for (i = 0; i < ARRAY_SIZE(new_vmalloc); ++i)
+ new_vmalloc[i] = -1ULL;
+ }
+}
#define flush_cache_vmap_early(start, end) local_flush_tlb_kernel_range(start, end)
#endif
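
new_vmalloc acts as a bitmap with one bit per possible CPU: flush_cache_vmap() above sets every bit when a new vmalloc mapping is created, and the trap entry path (see the new_vmalloc_check macro further down) clears the current CPU's bit before emitting an sfence.vma. A minimal user-space C model of that protocol, with illustrative names (MODEL_NR_CPUS, model_*) that are not part of the patch:

#include <stdbool.h>
#include <stdint.h>

#define MODEL_NR_CPUS	128	/* stand-in for NR_CPUS */

/* One bit per possible cpu, rounded up to whole 64-bit words. */
static uint64_t model_new_vmalloc[(MODEL_NR_CPUS + 63) / 64];

/* Producer side: a new vmalloc/module mapping marks every cpu as "maybe stale". */
static void model_flush_cache_vmap(void)
{
	for (unsigned long i = 0; i < sizeof(model_new_vmalloc) / sizeof(model_new_vmalloc[0]); ++i)
		model_new_vmalloc[i] = -1ULL;
}

/* Consumer side: on a kernel-address fault, fence only if this cpu's bit is set. */
static bool model_new_vmalloc_check(unsigned int cpu)
{
	uint64_t mask = 1ULL << (cpu % 64);
	uint64_t *word = &model_new_vmalloc[cpu / 64];

	if (!(*word & mask))
		return false;		/* not explained by a new mapping: real fault */

	*word ^= mask;			/* the assembly clears the bit atomically (amoxor) */
	/* here the kernel would emit sfence.vma (if needed) and retry the access */
	return true;
}

int main(void)
{
	model_flush_cache_vmap();
	return model_new_vmalloc_check(70) ? 0 : 1;	/* expect 0: bit 6 of word 1 was set */
}
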
@@ -60,6 +60,11 @@ struct thread_info {
void *scs_base;
void *scs_sp;
#endif
+ /*
+ * Used in handle_exception() to save a0, a1 and a2 before knowing if we
+ * can access the kernel stack.
+ */
+ unsigned long a0, a1, a2;
};
#ifdef CONFIG_SHADOW_CALL_STACK
@@ -35,6 +35,8 @@ void asm_offsets(void)
OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]);
OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]);
OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]);
+
+ OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
OFFSET(TASK_TI_FLAGS, task_struct, thread_info.flags);
OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp);
@@ -42,6 +44,9 @@ void asm_offsets(void)
#ifdef CONFIG_SHADOW_CALL_STACK
OFFSET(TASK_TI_SCS_SP, task_struct, thread_info.scs_sp);
#endif
+ OFFSET(TASK_TI_A0, task_struct, thread_info.a0);
+ OFFSET(TASK_TI_A1, task_struct, thread_info.a1);
+ OFFSET(TASK_TI_A2, task_struct, thread_info.a2);
OFFSET(TASK_TI_CPU_NUM, task_struct, thread_info.cpu);
OFFSET(TASK_THREAD_F0, task_struct, thread.fstate.f[0]);
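
The TASK_TI_CPU and TASK_TI_A0/A1/A2 constants generated here are what let the assembly below address thread_info fields relative to tp. As a rough, hedged illustration of the underlying mechanism (the structure layout below is simplified and is not the real task_struct), asm-offsets boils down to emitting offsetof() values as assembler-visible constants:

#include <stddef.h>
#include <stdio.h>

/* Simplified stand-ins for the real structures, for illustration only. */
struct thread_info {
	unsigned long flags;
	int preempt_count;
	unsigned int cpu;
	unsigned long a0, a1, a2;
};

struct task_struct {
	struct thread_info thread_info;
	/* ... many other fields ... */
};

int main(void)
{
	/* The build turns values like these into constants such as TASK_TI_A0,
	 * which entry.S then uses as "REG_S a0, TASK_TI_A0(tp)". */
	printf("TASK_TI_CPU = %zu\n", offsetof(struct task_struct, thread_info.cpu));
	printf("TASK_TI_A0  = %zu\n", offsetof(struct task_struct, thread_info.a0));
	printf("TASK_TI_A2  = %zu\n", offsetof(struct task_struct, thread_info.a2));
	return 0;
}
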
@@ -19,6 +19,78 @@
.section .irqentry.text, "ax"
+.macro new_vmalloc_check
+ REG_S a0, TASK_TI_A0(tp)
+ REG_S a1, TASK_TI_A1(tp)
+ REG_S a2, TASK_TI_A2(tp)
+
+ csrr a0, CSR_CAUSE
+ /* Exclude IRQs */
+ blt a0, zero, _new_vmalloc_restore_context
+ /* Only check new_vmalloc if we are in page/protection fault */
+ li a1, EXC_LOAD_PAGE_FAULT
+ beq a0, a1, _new_vmalloc_kernel_address
+ li a1, EXC_STORE_PAGE_FAULT
+ beq a0, a1, _new_vmalloc_kernel_address
+ li a1, EXC_INST_PAGE_FAULT
+ bne a0, a1, _new_vmalloc_restore_context
+
+_new_vmalloc_kernel_address:
+ /* Is it a kernel address? */
+ csrr a0, CSR_TVAL
+ bge a0, zero, _new_vmalloc_restore_context
+
+ /* Check if a new vmalloc mapping appeared that could explain the trap */
+
+ /*
+ * Computes:
+ * a0 = &new_vmalloc[BIT_WORD(cpu)]
+ * a1 = BIT_MASK(cpu)
+ */
+ REG_L a2, TASK_TI_CPU(tp)
+ /*
+ * Compute the new_vmalloc element position:
+ * (cpu / 64) * 8 = (cpu >> 6) << 3
+ */
+ srli a1, a2, 6
+ slli a1, a1, 3
+ la a0, new_vmalloc
+ add a0, a0, a1
+ /*
+ * Compute the bit position in the new_vmalloc element:
+ * bit_pos = cpu % 64 = cpu - (cpu / 64) * 64 = cpu - ((cpu >> 6) << 6)
+ * = cpu - (((cpu >> 6) << 3) << 3)
+ */
+ slli a1, a1, 3
+ sub a1, a2, a1
+ /* Compute the "get mask": 1 << bit_pos */
+ li a2, 1
+ sll a1, a2, a1
+
+ /* Check the value of new_vmalloc for this cpu */
+ REG_L a2, 0(a0)
+ and a2, a2, a1
+ beq a2, zero, _new_vmalloc_restore_context
+
+ /* Atomically reset the current cpu bit in new_vmalloc */
+ amoxor.d a0, a1, (a0)
+
+ /* Only emit an sfence.vma if the uarch caches invalid entries */
+ ALTERNATIVE("sfence.vma", "nop", 0, RISCV_ISA_EXT_SVVPTC, 1)
+
+ REG_L a0, TASK_TI_A0(tp)
+ REG_L a1, TASK_TI_A1(tp)
+ REG_L a2, TASK_TI_A2(tp)
+ csrw CSR_SCRATCH, x0
+ sret
+
+_new_vmalloc_restore_context:
+ REG_L a0, TASK_TI_A0(tp)
+ REG_L a1, TASK_TI_A1(tp)
+ REG_L a2, TASK_TI_A2(tp)
+.endm
+
+
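
To make the address/mask arithmetic in new_vmalloc_check easier to follow, here is a hedged C rendition of what the a0/a1 computations do, using cpu = 70 as a worked value (byte_offset, bit_pos and mask are illustrative names, not identifiers from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int cpu = 70;	/* example value */

	/* a0: &new_vmalloc[cpu / 64] == new_vmalloc + ((cpu >> 6) << 3) bytes */
	unsigned long byte_offset = ((unsigned long)cpu >> 6) << 3;

	/* a1: bit_pos = cpu % 64 = cpu - ((cpu >> 6) << 6), mask = 1 << bit_pos */
	unsigned int bit_pos = cpu - ((cpu >> 6) << 6);
	uint64_t mask = 1ULL << bit_pos;

	/* cpu = 70 -> word 1 (byte offset 8), bit 6, mask 0x40 */
	printf("byte_offset=%lu bit_pos=%u mask=%#llx\n",
	       byte_offset, bit_pos, (unsigned long long)mask);
	return 0;
}
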
SYM_CODE_START(handle_exception)
/*
* If coming from userspace, preserve the user thread pointer and load
@@ -30,6 +102,18 @@ SYM_CODE_START(handle_exception)
.Lrestore_kernel_tpsp:
csrr tp, CSR_SCRATCH
+
+ /*
+ * The RISC-V kernel does not eagerly emit a sfence.vma after each
+ * new vmalloc mapping, which may result in exceptions:
+ * - if the uarch caches invalid entries, the new mapping would not be
+ * observed by the page table walker and an invalidation is needed.
+ * - if the uarch does not cache invalid entries, a reordered access
+ * could "miss" the new mapping and trap: in that case, we only need
+ * to retry the access, no sfence.vma is required.
+ */
+ new_vmalloc_check
+
REG_S sp, TASK_TI_KERNEL_SP(tp)
#ifdef CONFIG_VMAP_STACK
@@ -36,6 +36,8 @@
#include "../kernel/head.h"
+u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
+
struct kernel_mapping kernel_map __ro_after_init;
EXPORT_SYMBOL(kernel_map);
#ifdef CONFIG_XIP_KERNEL