[v3,04/13] mm/execmem, arch: convert remaining overrides of module_alloc to execmem

Message ID 20230918072955.2507221-5-rppt@kernel.org
State New
Headers
Series mm: jit/text allocator |

Commit Message

Mike Rapoport Sept. 18, 2023, 7:29 a.m. UTC
  From: "Mike Rapoport (IBM)" <rppt@kernel.org>

Extend execmem parameters to accommodate more complex overrides of
module_alloc() by architectures.

This includes specification of a fallback range required by arm, arm64
and powerpc and support for allocation of KASAN shadow required by
arm64, s390 and x86.

The core implementation of execmem_alloc() takes care of suppressing
warnings when the initial allocation fails but there is a fallback range
defined.

Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
---
 arch/arm/kernel/module.c     | 38 ++++++++++++---------
 arch/arm64/kernel/module.c   | 57 ++++++++++++++------------------
 arch/powerpc/kernel/module.c | 52 ++++++++++++++---------------
 arch/s390/kernel/module.c    | 52 +++++++++++------------------
 arch/x86/kernel/module.c     | 64 +++++++++++-------------------------
 include/linux/execmem.h      | 14 ++++++++
 mm/execmem.c                 | 43 ++++++++++++++++++++++--
 7 files changed, 167 insertions(+), 153 deletions(-)
  

Comments

Edgecombe, Rick P Oct. 4, 2023, 12:29 a.m. UTC | #1
On Mon, 2023-09-18 at 10:29 +0300, Mike Rapoport wrote:
> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
> index 5f71a0cf4399..9d37375e2f05 100644
> --- a/arch/x86/kernel/module.c
> +++ b/arch/x86/kernel/module.c
> @@ -19,6 +19,7 @@
>  #include <linux/jump_label.h>
>  #include <linux/random.h>
>  #include <linux/memory.h>
> +#include <linux/execmem.h>
>  
>  #include <asm/text-patching.h>
>  #include <asm/page.h>
> @@ -36,55 +37,30 @@ do
> {                                                        \
>  } while (0)
>  #endif
>  
> -#ifdef CONFIG_RANDOMIZE_BASE
> -static unsigned long module_load_offset;
> +static struct execmem_params execmem_params __ro_after_init = {
> +       .ranges = {
> +               [EXECMEM_DEFAULT] = {
> +                       .flags = EXECMEM_KASAN_SHADOW,
> +                       .alignment = MODULE_ALIGN,
> +               },
> +       },
> +};
>  
> -/* Mutex protects the module_load_offset. */
> -static DEFINE_MUTEX(module_kaslr_mutex);
> -
> -static unsigned long int get_module_load_offset(void)
> -{
> -       if (kaslr_enabled()) {
> -               mutex_lock(&module_kaslr_mutex);
> -               /*
> -                * Calculate the module_load_offset the first time
> this
> -                * code is called. Once calculated it stays the same
> until
> -                * reboot.
> -                */
> -               if (module_load_offset == 0)
> -                       module_load_offset =
> -                               get_random_u32_inclusive(1, 1024) *
> PAGE_SIZE;
> -               mutex_unlock(&module_kaslr_mutex);
> -       }
> -       return module_load_offset;
> -}
> -#else
> -static unsigned long int get_module_load_offset(void)
> -{
> -       return 0;
> -}
> -#endif
> -
> -void *module_alloc(unsigned long size)
> +struct execmem_params __init *execmem_arch_params(void)
>  {
> -       gfp_t gfp_mask = GFP_KERNEL;
> -       void *p;
> -
> -       if (PAGE_ALIGN(size) > MODULES_LEN)
> -               return NULL;
> +       unsigned long module_load_offset = 0;
> +       unsigned long start;
>  
> -       p = __vmalloc_node_range(size, MODULE_ALIGN,
> -                                MODULES_VADDR +
> get_module_load_offset(),
> -                                MODULES_END, gfp_mask, PAGE_KERNEL,
> -                                VM_FLUSH_RESET_PERMS |
> VM_DEFER_KMEMLEAK,
> -                                NUMA_NO_NODE,
> __builtin_return_address(0));
> +       if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled())
> +               module_load_offset =
> +                       get_random_u32_inclusive(1, 1024) *
> PAGE_SIZE;

Minor:
I think you can skip the IS_ENABLED(CONFIG_RANDOMIZE_BASE) part because
CONFIG_RANDOMIZE_MEMORY depends on CONFIG_RANDOMIZE_BASE (which is
checked in kaslr_enabled()).
  
Mike Rapoport Oct. 5, 2023, 5:28 a.m. UTC | #2
On Wed, Oct 04, 2023 at 12:29:36AM +0000, Edgecombe, Rick P wrote:
> On Mon, 2023-09-18 at 10:29 +0300, Mike Rapoport wrote:
> > diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
> > index 5f71a0cf4399..9d37375e2f05 100644
> > --- a/arch/x86/kernel/module.c
> > +++ b/arch/x86/kernel/module.c
> >
> > -void *module_alloc(unsigned long size)
> > +struct execmem_params __init *execmem_arch_params(void)
> >  {
> > -       gfp_t gfp_mask = GFP_KERNEL;
> > -       void *p;
> > -
> > -       if (PAGE_ALIGN(size) > MODULES_LEN)
> > -               return NULL;
> > +       unsigned long module_load_offset = 0;
> > +       unsigned long start;
> >  
> > -       p = __vmalloc_node_range(size, MODULE_ALIGN,
> > -                                MODULES_VADDR +
> > get_module_load_offset(),
> > -                                MODULES_END, gfp_mask, PAGE_KERNEL,
> > -                                VM_FLUSH_RESET_PERMS |
> > VM_DEFER_KMEMLEAK,
> > -                                NUMA_NO_NODE,
> > __builtin_return_address(0));
> > +       if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled())
> > +               module_load_offset =
> > +                       get_random_u32_inclusive(1, 1024) *
> > PAGE_SIZE;
> 
> Minor:
> I think you can skip the IS_ENABLED(CONFIG_RANDOMIZE_BASE) part because
> CONFIG_RANDOMIZE_MEMORY depends on CONFIG_RANDOMIZE_BASE (which is
> checked in kaslr_enabled()).

Thanks, I'll look into it.
  
Will Deacon Oct. 23, 2023, 5:14 p.m. UTC | #3
Hi Mike,

On Mon, Sep 18, 2023 at 10:29:46AM +0300, Mike Rapoport wrote:
> From: "Mike Rapoport (IBM)" <rppt@kernel.org>
> 
> Extend execmem parameters to accommodate more complex overrides of
> module_alloc() by architectures.
> 
> This includes specification of a fallback range required by arm, arm64
> and powerpc and support for allocation of KASAN shadow required by
> arm64, s390 and x86.
> 
> The core implementation of execmem_alloc() takes care of suppressing
> warnings when the initial allocation fails but there is a fallback range
> defined.
> 
> Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
> ---
>  arch/arm/kernel/module.c     | 38 ++++++++++++---------
>  arch/arm64/kernel/module.c   | 57 ++++++++++++++------------------
>  arch/powerpc/kernel/module.c | 52 ++++++++++++++---------------
>  arch/s390/kernel/module.c    | 52 +++++++++++------------------
>  arch/x86/kernel/module.c     | 64 +++++++++++-------------------------
>  include/linux/execmem.h      | 14 ++++++++
>  mm/execmem.c                 | 43 ++++++++++++++++++++++--
>  7 files changed, 167 insertions(+), 153 deletions(-)

[...]

> diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
> index dd851297596e..cd6320de1c54 100644
> --- a/arch/arm64/kernel/module.c
> +++ b/arch/arm64/kernel/module.c
> @@ -20,6 +20,7 @@
>  #include <linux/random.h>
>  #include <linux/scs.h>
>  #include <linux/vmalloc.h>
> +#include <linux/execmem.h>
>  
>  #include <asm/alternative.h>
>  #include <asm/insn.h>
> @@ -108,46 +109,38 @@ static int __init module_init_limits(void)
>  
>  	return 0;
>  }
> -subsys_initcall(module_init_limits);
>  
> -void *module_alloc(unsigned long size)
> +static struct execmem_params execmem_params __ro_after_init = {
> +	.ranges = {
> +		[EXECMEM_DEFAULT] = {
> +			.flags = EXECMEM_KASAN_SHADOW,
> +			.alignment = MODULE_ALIGN,
> +		},
> +	},
> +};
> +
> +struct execmem_params __init *execmem_arch_params(void)
>  {
> -	void *p = NULL;
> +	struct execmem_range *r = &execmem_params.ranges[EXECMEM_DEFAULT];
>  
> -	/*
> -	 * Where possible, prefer to allocate within direct branch range of the
> -	 * kernel such that no PLTs are necessary.
> -	 */

Why are you removing this comment? I think you could just move it next
to the part where we set a 128MiB range.

> -	if (module_direct_base) {
> -		p = __vmalloc_node_range(size, MODULE_ALIGN,
> -					 module_direct_base,
> -					 module_direct_base + SZ_128M,
> -					 GFP_KERNEL | __GFP_NOWARN,
> -					 PAGE_KERNEL, 0, NUMA_NO_NODE,
> -					 __builtin_return_address(0));
> -	}
> +	module_init_limits();

Hmm, this used to be run from subsys_initcall(), but now you're running
it _really_ early, before random_init(), so randomization of the module
space is no longer going to be very random if we don't have early entropy
from the firmware or the CPU, which is likely to be the case on most SoCs.

>  
> -	if (!p && module_plt_base) {
> -		p = __vmalloc_node_range(size, MODULE_ALIGN,
> -					 module_plt_base,
> -					 module_plt_base + SZ_2G,
> -					 GFP_KERNEL | __GFP_NOWARN,
> -					 PAGE_KERNEL, 0, NUMA_NO_NODE,
> -					 __builtin_return_address(0));
> -	}
> +	r->pgprot = PAGE_KERNEL;
>  
> -	if (!p) {
> -		pr_warn_ratelimited("%s: unable to allocate memory\n",
> -				    __func__);
> -	}
> +	if (module_direct_base) {
> +		r->start = module_direct_base;
> +		r->end = module_direct_base + SZ_128M;
>  
> -	if (p && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) {
> -		vfree(p);
> -		return NULL;
> +		if (module_plt_base) {
> +			r->fallback_start = module_plt_base;
> +			r->fallback_end = module_plt_base + SZ_2G;
> +		}
> +	} else if (module_plt_base) {
> +		r->start = module_plt_base;
> +		r->end = module_plt_base + SZ_2G;
>  	}
>  
> -	/* Memory is intended to be executable, reset the pointer tag. */
> -	return kasan_reset_tag(p);
> +	return &execmem_params;
>  }
>  
>  enum aarch64_reloc_op {

[...]

> diff --git a/include/linux/execmem.h b/include/linux/execmem.h
> index 44e213625053..806ad1a0088d 100644
> --- a/include/linux/execmem.h
> +++ b/include/linux/execmem.h
> @@ -32,19 +32,33 @@ enum execmem_type {
>  	EXECMEM_TYPE_MAX,
>  };
>  
> +/**
> + * enum execmem_module_flags - options for executable memory allocations
> + * @EXECMEM_KASAN_SHADOW:	allocate kasan shadow
> + */
> +enum execmem_range_flags {
> +	EXECMEM_KASAN_SHADOW	= (1 << 0),
> +};
> +
>  /**
>   * struct execmem_range - definition of a memory range suitable for code and
>   *			  related data allocations
>   * @start:	address space start
>   * @end:	address space end (inclusive)
> + * @fallback_start:	start of the range for fallback allocations
> + * @fallback_end:	end of the range for fallback allocations (inclusive)
>   * @pgprot:	permissions for memory in this address space
>   * @alignment:	alignment required for text allocations
> + * @flags:	options for memory allocations for this range
>   */
>  struct execmem_range {
>  	unsigned long   start;
>  	unsigned long   end;
> +	unsigned long   fallback_start;
> +	unsigned long   fallback_end;
>  	pgprot_t        pgprot;
>  	unsigned int	alignment;
> +	enum execmem_range_flags flags;
>  };
>  
>  /**
> diff --git a/mm/execmem.c b/mm/execmem.c
> index f25a5e064886..a8c2f44d0133 100644
> --- a/mm/execmem.c
> +++ b/mm/execmem.c
> @@ -11,12 +11,46 @@ static void *execmem_alloc(size_t size, struct execmem_range *range)
>  {
>  	unsigned long start = range->start;
>  	unsigned long end = range->end;
> +	unsigned long fallback_start = range->fallback_start;
> +	unsigned long fallback_end = range->fallback_end;
>  	unsigned int align = range->alignment;
>  	pgprot_t pgprot = range->pgprot;
> +	bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
> +	unsigned long vm_flags  = VM_FLUSH_RESET_PERMS;
> +	bool fallback  = !!fallback_start;
> +	gfp_t gfp_flags = GFP_KERNEL;
> +	void *p;
>  
> -	return __vmalloc_node_range(size, align, start, end,
> -				   GFP_KERNEL, pgprot, VM_FLUSH_RESET_PERMS,
> -				   NUMA_NO_NODE, __builtin_return_address(0));
> +	if (PAGE_ALIGN(size) > (end - start))
> +		return NULL;
> +
> +	if (kasan)
> +		vm_flags |= VM_DEFER_KMEMLEAK;

Hmm, I don't think we passed this before on arm64, should we have done?

Will
  
Mike Rapoport Oct. 26, 2023, 8:58 a.m. UTC | #4
Hi Will,

On Mon, Oct 23, 2023 at 06:14:20PM +0100, Will Deacon wrote:
> Hi Mike,
> 
> On Mon, Sep 18, 2023 at 10:29:46AM +0300, Mike Rapoport wrote:
> > From: "Mike Rapoport (IBM)" <rppt@kernel.org>
> > 
> > Extend execmem parameters to accommodate more complex overrides of
> > module_alloc() by architectures.
> > 
> > This includes specification of a fallback range required by arm, arm64
> > and powerpc and support for allocation of KASAN shadow required by
> > arm64, s390 and x86.
> > 
> > The core implementation of execmem_alloc() takes care of suppressing
> > warnings when the initial allocation fails but there is a fallback range
> > defined.
> > 
> > Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
> > ---
> >  arch/arm/kernel/module.c     | 38 ++++++++++++---------
> >  arch/arm64/kernel/module.c   | 57 ++++++++++++++------------------
> >  arch/powerpc/kernel/module.c | 52 ++++++++++++++---------------
> >  arch/s390/kernel/module.c    | 52 +++++++++++------------------
> >  arch/x86/kernel/module.c     | 64 +++++++++++-------------------------
> >  include/linux/execmem.h      | 14 ++++++++
> >  mm/execmem.c                 | 43 ++++++++++++++++++++++--
> >  7 files changed, 167 insertions(+), 153 deletions(-)
> 
> [...]
> 
> > diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
> > index dd851297596e..cd6320de1c54 100644
> > --- a/arch/arm64/kernel/module.c
> > +++ b/arch/arm64/kernel/module.c
> > @@ -20,6 +20,7 @@
> >  #include <linux/random.h>
> >  #include <linux/scs.h>
> >  #include <linux/vmalloc.h>
> > +#include <linux/execmem.h>
> >  
> >  #include <asm/alternative.h>
> >  #include <asm/insn.h>
> > @@ -108,46 +109,38 @@ static int __init module_init_limits(void)
> >  
> >  	return 0;
> >  }
> > -subsys_initcall(module_init_limits);
> >  
> > -void *module_alloc(unsigned long size)
> > +static struct execmem_params execmem_params __ro_after_init = {
> > +	.ranges = {
> > +		[EXECMEM_DEFAULT] = {
> > +			.flags = EXECMEM_KASAN_SHADOW,
> > +			.alignment = MODULE_ALIGN,
> > +		},
> > +	},
> > +};
> > +
> > +struct execmem_params __init *execmem_arch_params(void)
> >  {
> > -	void *p = NULL;
> > +	struct execmem_range *r = &execmem_params.ranges[EXECMEM_DEFAULT];
> >  
> > -	/*
> > -	 * Where possible, prefer to allocate within direct branch range of the
> > -	 * kernel such that no PLTs are necessary.
> > -	 */
> 
> Why are you removing this comment? I think you could just move it next
> to the part where we set a 128MiB range.
 
Oops, my bad. Will add it back.

> > -	if (module_direct_base) {
> > -		p = __vmalloc_node_range(size, MODULE_ALIGN,
> > -					 module_direct_base,
> > -					 module_direct_base + SZ_128M,
> > -					 GFP_KERNEL | __GFP_NOWARN,
> > -					 PAGE_KERNEL, 0, NUMA_NO_NODE,
> > -					 __builtin_return_address(0));
> > -	}
> > +	module_init_limits();
> 
> Hmm, this used to be run from subsys_initcall(), but now you're running
> it _really_ early, before random_init(), so randomization of the module
> space is no longer going to be very random if we don't have early entropy
> from the firmware or the CPU, which is likely to be the case on most SoCs.

Well, it will be as random as KASLR. Won't that be enough?
 
> > diff --git a/mm/execmem.c b/mm/execmem.c
> > index f25a5e064886..a8c2f44d0133 100644
> > --- a/mm/execmem.c
> > +++ b/mm/execmem.c
> > @@ -11,12 +11,46 @@ static void *execmem_alloc(size_t size, struct execmem_range *range)
> >  {
> >  	unsigned long start = range->start;
> >  	unsigned long end = range->end;
> > +	unsigned long fallback_start = range->fallback_start;
> > +	unsigned long fallback_end = range->fallback_end;
> >  	unsigned int align = range->alignment;
> >  	pgprot_t pgprot = range->pgprot;
> > +	bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
> > +	unsigned long vm_flags  = VM_FLUSH_RESET_PERMS;
> > +	bool fallback  = !!fallback_start;
> > +	gfp_t gfp_flags = GFP_KERNEL;
> > +	void *p;
> >  
> > -	return __vmalloc_node_range(size, align, start, end,
> > -				   GFP_KERNEL, pgprot, VM_FLUSH_RESET_PERMS,
> > -				   NUMA_NO_NODE, __builtin_return_address(0));
> > +	if (PAGE_ALIGN(size) > (end - start))
> > +		return NULL;
> > +
> > +	if (kasan)
> > +		vm_flags |= VM_DEFER_KMEMLEAK;
> 
> Hmm, I don't think we passed this before on arm64, should we have done?

It was there on arm64 before commit 8339f7d8e178 ("arm64: module: remove
old !KASAN_VMALLOC logic").
There's no need to pass VM_DEFER_KMEMLEAK when KASAN_VMALLOC is enabled and
arm64 always selects KASAN_VMALLOC with KASAN.

And for the generic case, I should have made the condition to check for
KASAN_VMALLOC as well.
 
> Will
  
Will Deacon Oct. 26, 2023, 10:24 a.m. UTC | #5
On Thu, Oct 26, 2023 at 11:58:00AM +0300, Mike Rapoport wrote:
> On Mon, Oct 23, 2023 at 06:14:20PM +0100, Will Deacon wrote:
> > On Mon, Sep 18, 2023 at 10:29:46AM +0300, Mike Rapoport wrote:
> > > diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
> > > index dd851297596e..cd6320de1c54 100644
> > > --- a/arch/arm64/kernel/module.c
> > > +++ b/arch/arm64/kernel/module.c
> > > @@ -20,6 +20,7 @@
> > >  #include <linux/random.h>
> > >  #include <linux/scs.h>
> > >  #include <linux/vmalloc.h>
> > > +#include <linux/execmem.h>
> > >  
> > >  #include <asm/alternative.h>
> > >  #include <asm/insn.h>
> > > @@ -108,46 +109,38 @@ static int __init module_init_limits(void)
> > >  
> > >  	return 0;
> > >  }
> > > -subsys_initcall(module_init_limits);
> > >  
> > > -void *module_alloc(unsigned long size)
> > > +static struct execmem_params execmem_params __ro_after_init = {
> > > +	.ranges = {
> > > +		[EXECMEM_DEFAULT] = {
> > > +			.flags = EXECMEM_KASAN_SHADOW,
> > > +			.alignment = MODULE_ALIGN,
> > > +		},
> > > +	},
> > > +};
> > > +
> > > +struct execmem_params __init *execmem_arch_params(void)
> > >  {
> > > -	void *p = NULL;
> > > +	struct execmem_range *r = &execmem_params.ranges[EXECMEM_DEFAULT];
> > >  
> > > -	/*
> > > -	 * Where possible, prefer to allocate within direct branch range of the
> > > -	 * kernel such that no PLTs are necessary.
> > > -	 */
> > 
> > Why are you removing this comment? I think you could just move it next
> > to the part where we set a 128MiB range.
>  
> Oops, my bad. Will add it back.

Thanks.

> > > -	if (module_direct_base) {
> > > -		p = __vmalloc_node_range(size, MODULE_ALIGN,
> > > -					 module_direct_base,
> > > -					 module_direct_base + SZ_128M,
> > > -					 GFP_KERNEL | __GFP_NOWARN,
> > > -					 PAGE_KERNEL, 0, NUMA_NO_NODE,
> > > -					 __builtin_return_address(0));
> > > -	}
> > > +	module_init_limits();
> > 
> > Hmm, this used to be run from subsys_initcall(), but now you're running
> > it _really_ early, before random_init(), so randomization of the module
> > space is no longer going to be very random if we don't have early entropy
> > from the firmware or the CPU, which is likely to be the case on most SoCs.
> 
> Well, it will be as random as KASLR. Won't that be enough?

I don't think that's true -- we have the 'kaslr-seed' property for KASLR,
but I'm not seeing anything like that for the module randomisation and I
also don't see why we need to set these limits so early.

Will
  
Mike Rapoport Oct. 30, 2023, 7 a.m. UTC | #6
On Thu, Oct 26, 2023 at 11:24:39AM +0100, Will Deacon wrote:
> On Thu, Oct 26, 2023 at 11:58:00AM +0300, Mike Rapoport wrote:
> > On Mon, Oct 23, 2023 at 06:14:20PM +0100, Will Deacon wrote:
> > > On Mon, Sep 18, 2023 at 10:29:46AM +0300, Mike Rapoport wrote:
> > > > diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
> > > > index dd851297596e..cd6320de1c54 100644
> > > > --- a/arch/arm64/kernel/module.c
> > > > +++ b/arch/arm64/kernel/module.c

...

> > > > -	if (module_direct_base) {
> > > > -		p = __vmalloc_node_range(size, MODULE_ALIGN,
> > > > -					 module_direct_base,
> > > > -					 module_direct_base + SZ_128M,
> > > > -					 GFP_KERNEL | __GFP_NOWARN,
> > > > -					 PAGE_KERNEL, 0, NUMA_NO_NODE,
> > > > -					 __builtin_return_address(0));
> > > > -	}
> > > > +	module_init_limits();
> > > 
> > > Hmm, this used to be run from subsys_initcall(), but now you're running
> > > it _really_ early, before random_init(), so randomization of the module
> > > space is no longer going to be very random if we don't have early entropy
> > > from the firmware or the CPU, which is likely to be the case on most SoCs.
> > 
> > Well, it will be as random as KASLR. Won't that be enough?
> 
> I don't think that's true -- we have the 'kaslr-seed' property for KASLR,
> but I'm not seeing anything like that for the module randomisation and I
> also don't see why we need to set these limits so early.

x86 needs execmem initialized before ftrace_init() so I thought it would be
best to setup execmem along with most of MM in mm_core_init().

I'll move execmem initialization for !x86 to a later point, say
core_initcall.
 
> Will
  
Will Deacon Nov. 7, 2023, 10:44 a.m. UTC | #7
On Mon, Oct 30, 2023 at 09:00:53AM +0200, Mike Rapoport wrote:
> On Thu, Oct 26, 2023 at 11:24:39AM +0100, Will Deacon wrote:
> > On Thu, Oct 26, 2023 at 11:58:00AM +0300, Mike Rapoport wrote:
> > > On Mon, Oct 23, 2023 at 06:14:20PM +0100, Will Deacon wrote:
> > > > On Mon, Sep 18, 2023 at 10:29:46AM +0300, Mike Rapoport wrote:
> > > > > diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
> > > > > index dd851297596e..cd6320de1c54 100644
> > > > > --- a/arch/arm64/kernel/module.c
> > > > > +++ b/arch/arm64/kernel/module.c
> 
> ...
> 
> > > > > -	if (module_direct_base) {
> > > > > -		p = __vmalloc_node_range(size, MODULE_ALIGN,
> > > > > -					 module_direct_base,
> > > > > -					 module_direct_base + SZ_128M,
> > > > > -					 GFP_KERNEL | __GFP_NOWARN,
> > > > > -					 PAGE_KERNEL, 0, NUMA_NO_NODE,
> > > > > -					 __builtin_return_address(0));
> > > > > -	}
> > > > > +	module_init_limits();
> > > > 
> > > > Hmm, this used to be run from subsys_initcall(), but now you're running
> > > > it _really_ early, before random_init(), so randomization of the module
> > > > space is no longer going to be very random if we don't have early entropy
> > > > from the firmware or the CPU, which is likely to be the case on most SoCs.
> > > 
> > > Well, it will be as random as KASLR. Won't that be enough?
> > 
> > I don't think that's true -- we have the 'kaslr-seed' property for KASLR,
> > but I'm not seeing anything like that for the module randomisation and I
> > also don't see why we need to set these limits so early.
> 
> x86 needs execmem initialized before ftrace_init() so I thought it would be
> best to setup execmem along with most of MM in mm_core_init().
> 
> I'll move execmem initialization for !x86 to a later point, say
> core_initcall.

Thanks, Mike.

Will
  

Patch

diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index e74d84f58b77..2c7651a2d84c 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -16,6 +16,7 @@ 
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/gfp.h>
+#include <linux/execmem.h>
 
 #include <asm/sections.h>
 #include <asm/smp_plat.h>
@@ -34,23 +35,28 @@ 
 #endif
 
 #ifdef CONFIG_MMU
-void *module_alloc(unsigned long size)
+static struct execmem_params execmem_params __ro_after_init = {
+	.ranges = {
+		[EXECMEM_DEFAULT] = {
+			.start = MODULES_VADDR,
+			.end = MODULES_END,
+			.alignment = 1,
+		},
+	},
+};
+
+struct execmem_params __init *execmem_arch_params(void)
 {
-	gfp_t gfp_mask = GFP_KERNEL;
-	void *p;
-
-	/* Silence the initial allocation */
-	if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
-		gfp_mask |= __GFP_NOWARN;
-
-	p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-	if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
-		return p;
-	return __vmalloc_node_range(size, 1,  VMALLOC_START, VMALLOC_END,
-				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
+	struct execmem_range *r = &execmem_params.ranges[EXECMEM_DEFAULT];
+
+	r->pgprot = PAGE_KERNEL_EXEC;
+
+	if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) {
+		r->fallback_start = VMALLOC_START;
+		r->fallback_end = VMALLOC_END;
+	}
+
+	return &execmem_params;
 }
 #endif
 
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index dd851297596e..cd6320de1c54 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -20,6 +20,7 @@ 
 #include <linux/random.h>
 #include <linux/scs.h>
 #include <linux/vmalloc.h>
+#include <linux/execmem.h>
 
 #include <asm/alternative.h>
 #include <asm/insn.h>
@@ -108,46 +109,38 @@  static int __init module_init_limits(void)
 
 	return 0;
 }
-subsys_initcall(module_init_limits);
 
-void *module_alloc(unsigned long size)
+static struct execmem_params execmem_params __ro_after_init = {
+	.ranges = {
+		[EXECMEM_DEFAULT] = {
+			.flags = EXECMEM_KASAN_SHADOW,
+			.alignment = MODULE_ALIGN,
+		},
+	},
+};
+
+struct execmem_params __init *execmem_arch_params(void)
 {
-	void *p = NULL;
+	struct execmem_range *r = &execmem_params.ranges[EXECMEM_DEFAULT];
 
-	/*
-	 * Where possible, prefer to allocate within direct branch range of the
-	 * kernel such that no PLTs are necessary.
-	 */
-	if (module_direct_base) {
-		p = __vmalloc_node_range(size, MODULE_ALIGN,
-					 module_direct_base,
-					 module_direct_base + SZ_128M,
-					 GFP_KERNEL | __GFP_NOWARN,
-					 PAGE_KERNEL, 0, NUMA_NO_NODE,
-					 __builtin_return_address(0));
-	}
+	module_init_limits();
 
-	if (!p && module_plt_base) {
-		p = __vmalloc_node_range(size, MODULE_ALIGN,
-					 module_plt_base,
-					 module_plt_base + SZ_2G,
-					 GFP_KERNEL | __GFP_NOWARN,
-					 PAGE_KERNEL, 0, NUMA_NO_NODE,
-					 __builtin_return_address(0));
-	}
+	r->pgprot = PAGE_KERNEL;
 
-	if (!p) {
-		pr_warn_ratelimited("%s: unable to allocate memory\n",
-				    __func__);
-	}
+	if (module_direct_base) {
+		r->start = module_direct_base;
+		r->end = module_direct_base + SZ_128M;
 
-	if (p && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) {
-		vfree(p);
-		return NULL;
+		if (module_plt_base) {
+			r->fallback_start = module_plt_base;
+			r->fallback_end = module_plt_base + SZ_2G;
+		}
+	} else if (module_plt_base) {
+		r->start = module_plt_base;
+		r->end = module_plt_base + SZ_2G;
 	}
 
-	/* Memory is intended to be executable, reset the pointer tag. */
-	return kasan_reset_tag(p);
+	return &execmem_params;
 }
 
 enum aarch64_reloc_op {
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index f6d6ae0a1692..f4dd26f693a3 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -10,6 +10,7 @@ 
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
 #include <linux/bug.h>
+#include <linux/execmem.h>
 #include <asm/module.h>
 #include <linux/uaccess.h>
 #include <asm/firmware.h>
@@ -89,39 +90,38 @@  int module_finalize(const Elf_Ehdr *hdr,
 	return 0;
 }
 
-static __always_inline void *
-__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn)
+static struct execmem_params execmem_params __ro_after_init = {
+	.ranges = {
+		[EXECMEM_DEFAULT] = {
+			.alignment = 1,
+		},
+	},
+};
+
+struct execmem_params __init *execmem_arch_params(void)
 {
 	pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC;
-	gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0);
-
-	/*
-	 * Don't do huge page allocations for modules yet until more testing
-	 * is done. STRICT_MODULE_RWX may require extra work to support this
-	 * too.
-	 */
-	return __vmalloc_node_range(size, 1, start, end, gfp, prot,
-				    VM_FLUSH_RESET_PERMS,
-				    NUMA_NO_NODE, __builtin_return_address(0));
-}
+	struct execmem_range *range = &execmem_params.ranges[EXECMEM_DEFAULT];
 
-void *module_alloc(unsigned long size)
-{
 #ifdef MODULES_VADDR
 	unsigned long limit = (unsigned long)_etext - SZ_32M;
-	void *ptr = NULL;
-
-	BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
 
 	/* First try within 32M limit from _etext to avoid branch trampolines */
-	if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit)
-		ptr = __module_alloc(size, limit, MODULES_END, true);
-
-	if (!ptr)
-		ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false);
-
-	return ptr;
+	if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) {
+		range->start = limit;
+		range->end = MODULES_END;
+		range->fallback_start = MODULES_VADDR;
+		range->fallback_end = MODULES_END;
+	} else {
+		range->start = MODULES_VADDR;
+		range->end = MODULES_END;
+	}
 #else
-	return __module_alloc(size, VMALLOC_START, VMALLOC_END, false);
+	range->start = VMALLOC_START;
+	range->end = VMALLOC_END;
 #endif
+
+	range->pgprot = prot;
+
+	return &execmem_params;
 }
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index db5561d0c233..538d5f24af66 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -37,41 +37,29 @@ 
 
 #define PLT_ENTRY_SIZE 22
 
-static unsigned long get_module_load_offset(void)
+static struct execmem_params execmem_params __ro_after_init = {
+	.ranges = {
+		[EXECMEM_DEFAULT] = {
+			.flags = EXECMEM_KASAN_SHADOW,
+			.alignment = MODULE_ALIGN,
+			.pgprot = PAGE_KERNEL,
+		},
+	},
+};
+
+struct execmem_params __init *execmem_arch_params(void)
 {
-	static DEFINE_MUTEX(module_kaslr_mutex);
-	static unsigned long module_load_offset;
-
-	if (!kaslr_enabled())
-		return 0;
-	/*
-	 * Calculate the module_load_offset the first time this code
-	 * is called. Once calculated it stays the same until reboot.
-	 */
-	mutex_lock(&module_kaslr_mutex);
-	if (!module_load_offset)
+	unsigned long module_load_offset = 0;
+	unsigned long start;
+
+	if (kaslr_enabled())
 		module_load_offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
-	mutex_unlock(&module_kaslr_mutex);
-	return module_load_offset;
-}
 
-void *module_alloc(unsigned long size)
-{
-	gfp_t gfp_mask = GFP_KERNEL;
-	void *p;
-
-	if (PAGE_ALIGN(size) > MODULES_LEN)
-		return NULL;
-	p = __vmalloc_node_range(size, MODULE_ALIGN,
-				 MODULES_VADDR + get_module_load_offset(),
-				 MODULES_END, gfp_mask, PAGE_KERNEL,
-				 VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK,
-				 NUMA_NO_NODE, __builtin_return_address(0));
-	if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
-		vfree(p);
-		return NULL;
-	}
-	return p;
+	start = MODULES_VADDR + module_load_offset;
+	execmem_params.ranges[EXECMEM_DEFAULT].start = start;
+	execmem_params.ranges[EXECMEM_DEFAULT].end = MODULES_END;
+
+	return &execmem_params;
 }
 
 #ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 5f71a0cf4399..9d37375e2f05 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -19,6 +19,7 @@ 
 #include <linux/jump_label.h>
 #include <linux/random.h>
 #include <linux/memory.h>
+#include <linux/execmem.h>
 
 #include <asm/text-patching.h>
 #include <asm/page.h>
@@ -36,55 +37,30 @@  do {							\
 } while (0)
 #endif
 
-#ifdef CONFIG_RANDOMIZE_BASE
-static unsigned long module_load_offset;
+static struct execmem_params execmem_params __ro_after_init = {
+	.ranges = {
+		[EXECMEM_DEFAULT] = {
+			.flags = EXECMEM_KASAN_SHADOW,
+			.alignment = MODULE_ALIGN,
+		},
+	},
+};
 
-/* Mutex protects the module_load_offset. */
-static DEFINE_MUTEX(module_kaslr_mutex);
-
-static unsigned long int get_module_load_offset(void)
-{
-	if (kaslr_enabled()) {
-		mutex_lock(&module_kaslr_mutex);
-		/*
-		 * Calculate the module_load_offset the first time this
-		 * code is called. Once calculated it stays the same until
-		 * reboot.
-		 */
-		if (module_load_offset == 0)
-			module_load_offset =
-				get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
-		mutex_unlock(&module_kaslr_mutex);
-	}
-	return module_load_offset;
-}
-#else
-static unsigned long int get_module_load_offset(void)
-{
-	return 0;
-}
-#endif
-
-void *module_alloc(unsigned long size)
+struct execmem_params __init *execmem_arch_params(void)
 {
-	gfp_t gfp_mask = GFP_KERNEL;
-	void *p;
-
-	if (PAGE_ALIGN(size) > MODULES_LEN)
-		return NULL;
+	unsigned long module_load_offset = 0;
+	unsigned long start;
 
-	p = __vmalloc_node_range(size, MODULE_ALIGN,
-				 MODULES_VADDR + get_module_load_offset(),
-				 MODULES_END, gfp_mask, PAGE_KERNEL,
-				 VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK,
-				 NUMA_NO_NODE, __builtin_return_address(0));
+	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled())
+		module_load_offset =
+			get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
 
-	if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
-		vfree(p);
-		return NULL;
-	}
+	start = MODULES_VADDR + module_load_offset;
+	execmem_params.ranges[EXECMEM_DEFAULT].start = start;
+	execmem_params.ranges[EXECMEM_DEFAULT].end = MODULES_END;
+	execmem_params.ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL;
 
-	return p;
+	return &execmem_params;
 }
 
 #ifdef CONFIG_X86_32
diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 44e213625053..806ad1a0088d 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -32,19 +32,33 @@  enum execmem_type {
 	EXECMEM_TYPE_MAX,
 };
 
+/**
+ * enum execmem_module_flags - options for executable memory allocations
+ * @EXECMEM_KASAN_SHADOW:	allocate kasan shadow
+ */
+enum execmem_range_flags {
+	EXECMEM_KASAN_SHADOW	= (1 << 0),
+};
+
 /**
  * struct execmem_range - definition of a memory range suitable for code and
  *			  related data allocations
  * @start:	address space start
  * @end:	address space end (inclusive)
+ * @fallback_start:	start of the range for fallback allocations
+ * @fallback_end:	end of the range for fallback allocations (inclusive)
  * @pgprot:	permissions for memory in this address space
  * @alignment:	alignment required for text allocations
+ * @flags:	options for memory allocations for this range
  */
 struct execmem_range {
 	unsigned long   start;
 	unsigned long   end;
+	unsigned long   fallback_start;
+	unsigned long   fallback_end;
 	pgprot_t        pgprot;
 	unsigned int	alignment;
+	enum execmem_range_flags flags;
 };
 
 /**
diff --git a/mm/execmem.c b/mm/execmem.c
index f25a5e064886..a8c2f44d0133 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -11,12 +11,46 @@  static void *execmem_alloc(size_t size, struct execmem_range *range)
 {
 	unsigned long start = range->start;
 	unsigned long end = range->end;
+	unsigned long fallback_start = range->fallback_start;
+	unsigned long fallback_end = range->fallback_end;
 	unsigned int align = range->alignment;
 	pgprot_t pgprot = range->pgprot;
+	bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
+	unsigned long vm_flags  = VM_FLUSH_RESET_PERMS;
+	bool fallback  = !!fallback_start;
+	gfp_t gfp_flags = GFP_KERNEL;
+	void *p;
 
-	return __vmalloc_node_range(size, align, start, end,
-				   GFP_KERNEL, pgprot, VM_FLUSH_RESET_PERMS,
-				   NUMA_NO_NODE, __builtin_return_address(0));
+	if (PAGE_ALIGN(size) > (end - start))
+		return NULL;
+
+	if (kasan)
+		vm_flags |= VM_DEFER_KMEMLEAK;
+
+	if (fallback)
+		gfp_flags |= __GFP_NOWARN;
+
+	p = __vmalloc_node_range(size, align, start, end, gfp_flags,
+				 pgprot, vm_flags, NUMA_NO_NODE,
+				 __builtin_return_address(0));
+
+	if (!p && fallback) {
+		start = fallback_start;
+		end = fallback_end;
+		gfp_flags = GFP_KERNEL;
+
+		p = __vmalloc_node_range(size, align, start, end, gfp_flags,
+					 pgprot, vm_flags, NUMA_NO_NODE,
+					 __builtin_return_address(0));
+	}
+
+	if (p && kasan &&
+	    (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) {
+		vfree(p);
+		return NULL;
+	}
+
+	return kasan_reset_tag(p);
 }
 
 void *execmem_text_alloc(enum execmem_type type, size_t size)
@@ -66,6 +100,9 @@  static void execmem_init_missing(struct execmem_params *p)
 			r->alignment = default_range->alignment;
 			r->start = default_range->start;
 			r->end = default_range->end;
+			r->flags = default_range->flags;
+			r->fallback_start = default_range->fallback_start;
+			r->fallback_end = default_range->fallback_end;
 		}
 	}
 }