[0/3] mm: replace atomic_t with percpu_ref in mempolicy.

Message ID 20221204161432.2149375-1-hezhongkun.hzk@bytedance.com
State New
Headers

Commit Message

Zhongkun He Dec. 4, 2022, 4:14 p.m. UTC
  All vma manipulation is somewhat protected by a down_read on
mmap_lock, so vma mempolicy is clear to obtain a reference.
But there is no locking in process context and have a mix
of reference counting and per-task requirements which is rather
subtle and easy to get wrong.

we would have get_vma_policy() always returning a reference
counted policy, except for static policy. For better performance,
we replace atomic_t ref with percpu_ref in mempolicy, which is
usually the performance bottleneck in hot path.

This series adjust the reference of mempolicy in process context,
which will be protected by RCU in read hot path. Besides,
task->mempolicy is also protected by task_lock(). Percpu_ref
is a good way to reduce cache line bouncing.

The mpol_get/put() can just increment or decrement the local
counter. Mpol_kill() must be called to initiate the destruction
of mempolicy. A mempolicy will be freed when the mpol_kill()
is called and the reference count decrese to zero.

Suggested-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Zhongkun He <hezhongkun.hzk@bytedance.com>
---
 include/linux/mempolicy.h      | 65 +++++++++++++++++++------------
 include/uapi/linux/mempolicy.h |  2 +-
 mm/mempolicy.c                 | 71 ++++++++++++++++++++++------------
 3 files changed, 89 insertions(+), 49 deletions(-)
  

Comments

kernel test robot Dec. 4, 2022, 6:25 p.m. UTC | #1
Hi Zhongkun,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Zhongkun-He/mm-replace-atomic_t-with-percpu_ref-in-mempolicy/20221205-001554
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20221204161432.2149375-1-hezhongkun.hzk%40bytedance.com
patch subject: [PATCH 0/3] mm: replace atomic_t with percpu_ref in mempolicy.
config: i386-randconfig-a013
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/ea5329aa9a0c20be63930f7c0fbb8aa238772851
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Zhongkun-He/mm-replace-atomic_t-with-percpu_ref-in-mempolicy/20221205-001554
        git checkout ea5329aa9a0c20be63930f7c0fbb8aa238772851
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from drivers/char/mem.c:25:
   In file included from include/linux/shmem_fs.h:7:
>> include/linux/mempolicy.h:219:1: error: non-void function does not return a value [-Werror,-Wreturn-type]
   }
   ^
   1 error generated.
--
   In file included from kernel/exit.c:38:
>> include/linux/mempolicy.h:219:1: error: non-void function does not return a value [-Werror,-Wreturn-type]
   }
   ^
   kernel/exit.c:1839:13: warning: no previous prototype for function 'abort' [-Wmissing-prototypes]
   __weak void abort(void)
               ^
   kernel/exit.c:1839:8: note: declare 'static' if the function is not intended to be used outside of this translation unit
   __weak void abort(void)
          ^
          static 
   1 warning and 1 error generated.
--
   In file included from mm/shmem.c:38:
   In file included from include/linux/hugetlb.h:30:
>> include/linux/mempolicy.h:219:1: error: non-void function does not return a value [-Werror,-Wreturn-type]
   }
   ^
   mm/shmem.c:1487:2: error: implicit declaration of function 'mpol_cond_put' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
           mpol_cond_put(vma->vm_policy);
           ^
   2 errors generated.
--
   In file included from mm/hugetlb.c:15:
>> include/linux/mempolicy.h:219:1: error: non-void function does not return a value [-Werror,-Wreturn-type]
   }
   ^
   mm/hugetlb.c:1249:2: error: implicit declaration of function 'mpol_cond_put' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
           mpol_cond_put(mpol);
           ^
   mm/hugetlb.c:2324:2: error: implicit declaration of function 'mpol_cond_put' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
           mpol_cond_put(mpol);
           ^
   mm/hugetlb.c:2360:2: error: implicit declaration of function 'mpol_cond_put' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
           mpol_cond_put(mpol);
           ^
   4 errors generated.
--
   In file included from kernel/sched/fair.c:44:
>> include/linux/mempolicy.h:219:1: error: non-void function does not return a value [-Werror,-Wreturn-type]
   }
   ^
   kernel/sched/fair.c:5794:6: warning: no previous prototype for function 'init_cfs_bandwidth' [-Wmissing-prototypes]
   void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
        ^
   kernel/sched/fair.c:5794:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
   ^
   static 
   kernel/sched/fair.c:12118:6: warning: no previous prototype for function 'free_fair_sched_group' [-Wmissing-prototypes]
   void free_fair_sched_group(struct task_group *tg) { }
        ^
   kernel/sched/fair.c:12118:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void free_fair_sched_group(struct task_group *tg) { }
   ^
   static 
   kernel/sched/fair.c:12120:5: warning: no previous prototype for function 'alloc_fair_sched_group' [-Wmissing-prototypes]
   int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
       ^
   kernel/sched/fair.c:12120:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
   ^
   static 
   kernel/sched/fair.c:12125:6: warning: no previous prototype for function 'online_fair_sched_group' [-Wmissing-prototypes]
   void online_fair_sched_group(struct task_group *tg) { }
        ^
   kernel/sched/fair.c:12125:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void online_fair_sched_group(struct task_group *tg) { }
   ^
   static 
   kernel/sched/fair.c:12127:6: warning: no previous prototype for function 'unregister_fair_sched_group' [-Wmissing-prototypes]
   void unregister_fair_sched_group(struct task_group *tg) { }
        ^
   kernel/sched/fair.c:12127:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void unregister_fair_sched_group(struct task_group *tg) { }
   ^
   static 
   5 warnings and 1 error generated.


vim +219 include/linux/mempolicy.h

   216	
   217	static inline bool mpol_tryget(struct mempolicy *pol)
   218	{
 > 219	}
   220
  
Michal Hocko Jan. 13, 2023, 4:20 p.m. UTC | #2
On Mon 05-12-22 00:14:29, Zhongkun He wrote:
> All vma manipulation is somewhat protected by a down_read on
> mmap_lock, so vma mempolicy is clear to obtain a reference.
> But there is no locking in process context and have a mix
> of reference counting and per-task requirements which is rather
> subtle and easy to get wrong.
> 
> we would have get_vma_policy() always returning a reference
> counted policy, except for static policy. For better performance,
> we replace atomic_t ref with percpu_ref in mempolicy, which is
> usually the performance bottleneck in hot path.
> 
> This series adjust the reference of mempolicy in process context,
> which will be protected by RCU in read hot path. Besides,
> task->mempolicy is also protected by task_lock(). Percpu_ref
> is a good way to reduce cache line bouncing.
> 
> The mpol_get/put() can just increment or decrement the local
> counter. Mpol_kill() must be called to initiate the destruction
> of mempolicy. A mempolicy will be freed when the mpol_kill()
> is called and the reference count decrese to zero.

This is really hard to follow. Without having the context from previous
discussions I would be completely lost. Please structure your cover
letter but also other patch in general in the form:
- what is the problem you would like to deal with
	- want to introduce pidfd_set_mempolicy because XYZ
- what stands in the way
	- mempolicy objects access constrains (reliance on operating in
	  the current context)
	- reference counting needs to be unconditional
	- why regular reference counting is not sufficient (performance)
- what is this patchset proposing
	- per cpu reference counting
	- how is it implemented
- how is the patch series structured
	- make the reference counting unconditional
	- special case static (never released) policies
	- replace standard ref counting by per-cpu reference counting
- how has this been tested?

Makes sense?

> Suggested-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Zhongkun He <hezhongkun.hzk@bytedance.com>
> ---
>  include/linux/mempolicy.h      | 65 +++++++++++++++++++------------
>  include/uapi/linux/mempolicy.h |  2 +-
>  mm/mempolicy.c                 | 71 ++++++++++++++++++++++------------
>  3 files changed, 89 insertions(+), 49 deletions(-)
> 
Is the following the cumulative diff?

> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index d232de7cdc56..9178b008eadf 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -28,12 +28,16 @@ struct mm_struct;
>   * of the current process.
>   *
>   * Locking policy for interleave:
> - * In process context there is no locking because only the process accesses
> - * its own state. All vma manipulation is somewhat protected by a down_read on
> + * percpu_ref is used to reduce cache line bouncing.
> + * In process context we should obtain a reference via mpol_get()
> + * protected by RCU.
> + * All vma manipulation is somewhat protected by a down_read on
>   * mmap_lock.
>   *
>   * Freeing policy:
> - * Mempolicy objects are reference counted.  A mempolicy will be freed when
> + * Mempolicy objects are reference counted. The mpol_get/put can just increment
> + * or decrement the local counter. Mpol_kill() must be called to initiate the
> + * destruction of mempolicy. A mempolicy will be freed when mpol_kill()/
>   * mpol_put() decrements the reference count to zero.
>   *
>   * Duplicating policy objects:
> @@ -42,43 +46,36 @@ struct mm_struct;
>   * to 1, representing the caller of mpol_dup().
>   */
>  struct mempolicy {
> -	atomic_t refcnt;
> -	unsigned short mode; 	/* See MPOL_* above */
> +	struct percpu_ref refcnt;	/* reduce cache line bouncing */
> +	unsigned short mode;	/* See MPOL_* above */
>  	unsigned short flags;	/* See set_mempolicy() MPOL_F_* above */
> +	int home_node;          /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
>  	nodemask_t nodes;	/* interleave/bind/perfer */
> -	int home_node;		/* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
>  
>  	union {
>  		nodemask_t cpuset_mems_allowed;	/* relative to these nodes */
>  		nodemask_t user_nodemask;	/* nodemask passed by user */
> +		struct rcu_head rcu;	/* used for freeing in an RCU-safe manner */
>  	} w;
>  };
>  
>  /*
> - * Support for managing mempolicy data objects (clone, copy, destroy)
> - * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
> + * Mempolicy pol need explicit unref after use, except for
> + * static policies(default_policy, preferred_node_policy).
>   */
> -
> -extern void __mpol_put(struct mempolicy *pol);
> -static inline void mpol_put(struct mempolicy *pol)
> +static inline int mpol_needs_cond_ref(struct mempolicy *pol)
>  {
> -	if (pol)
> -		__mpol_put(pol);
> +	return (pol && !(pol->flags & MPOL_F_STATIC));
>  }
>  
>  /*
> - * Does mempolicy pol need explicit unref after use?
> - * Currently only needed for shared policies.
> + * Put a mpol reference obtained via mpol_get().
>   */
> -static inline int mpol_needs_cond_ref(struct mempolicy *pol)
> -{
> -	return (pol && (pol->flags & MPOL_F_SHARED));
> -}
>  
> -static inline void mpol_cond_put(struct mempolicy *pol)
> +static inline void mpol_put(struct mempolicy *pol)
>  {
>  	if (mpol_needs_cond_ref(pol))
> -		__mpol_put(pol);
> +		percpu_ref_put(&pol->refcnt);
>  }
>  
>  extern struct mempolicy *__mpol_dup(struct mempolicy *pol);
> @@ -91,12 +88,28 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
>  
>  #define vma_policy(vma) ((vma)->vm_policy)
>  
> +/* Obtain a reference on the specified mpol */
>  static inline void mpol_get(struct mempolicy *pol)
>  {
>  	if (pol)
> -		atomic_inc(&pol->refcnt);
> +		percpu_ref_get(&pol->refcnt);
> +}
> +
> +static inline bool mpol_tryget(struct mempolicy *pol)
> +{
> +	return pol && percpu_ref_tryget(&pol->refcnt);
>  }
>  
> +/*
> + * This function initiates destruction of mempolicy.
> + */
> +static inline void mpol_kill(struct mempolicy *pol)
> +{
> +        if (pol)
> +                percpu_ref_kill(&pol->refcnt);
> +}
> +
> +
>  extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
>  static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
>  {
> @@ -197,11 +210,15 @@ static inline void mpol_put(struct mempolicy *p)
>  {
>  }
>  
> -static inline void mpol_cond_put(struct mempolicy *pol)
> +static inline void mpol_get(struct mempolicy *pol)
>  {
>  }
>  
> -static inline void mpol_get(struct mempolicy *pol)
> +static inline bool mpol_tryget(struct mempolicy *pol)
> +{
> +}
> +
> +static inline void mpol_kill(struct mempolicy *pol)
>  {
>  }
>  
> diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
> index 046d0ccba4cd..940e1a88a4da 100644
> --- a/include/uapi/linux/mempolicy.h
> +++ b/include/uapi/linux/mempolicy.h
> @@ -60,7 +60,7 @@ enum {
>   * "mode flags".  These flags are allocated from bit 0 up, as they
>   * are never OR'ed into the mode in mempolicy API arguments.
>   */
> -#define MPOL_F_SHARED  (1 << 0)	/* identify shared policies */
> +#define MPOL_F_STATIC	(1 << 0) /* identify static policies(e.g default_policy) */
>  #define MPOL_F_MOF	(1 << 3) /* this policy wants migrate on fault */
>  #define MPOL_F_MORON	(1 << 4) /* Migrate On protnone Reference On Node */
>  
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 61aa9aedb728..ee3e2ed5ef07 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -124,8 +124,8 @@ enum zone_type policy_zone = 0;
>   * run-time system-wide default policy => local allocation
>   */
>  static struct mempolicy default_policy = {
> -	.refcnt = ATOMIC_INIT(1), /* never free it */
>  	.mode = MPOL_LOCAL,
> +	.flags = MPOL_F_STATIC,
>  };
>  
>  static struct mempolicy preferred_node_policy[MAX_NUMNODES];
> @@ -158,9 +158,32 @@ int numa_map_to_online_node(int node)
>  }
>  EXPORT_SYMBOL_GPL(numa_map_to_online_node);
>  
> +/* Obtain a reference on the specified task mempolicy */
> +static mempolicy *get_task_mpol(struct task_struct *p)
> +{
> +	struct mempolicy *pol;
> +
> +	rcu_read_lock();
> +	pol = rcu_dereference(p->mempolicy);
> +
> +	if (!pol || mpol_tryget(pol))
> +		pol = NULL;
> +	rcu_read_unlock();
> +
> +	return pol;
> +}
> +
> +static void mpol_release(struct percpu_ref *ref)
> +{
> +	struct mempolicy *mpol = container_of(ref, struct mempolicy, refcnt);
> +
> +	percpu_ref_exit(ref);
> +	kfree_rcu(mpol, w.rcu);
> +}
> +
>  struct mempolicy *get_task_policy(struct task_struct *p)
>  {
> -	struct mempolicy *pol = p->mempolicy;
> +	struct mempolicy *pol = get_task_mpol(p);
>  	int node;
>  
>  	if (pol)
> @@ -296,7 +319,12 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
>  	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
>  	if (!policy)
>  		return ERR_PTR(-ENOMEM);
> -	atomic_set(&policy->refcnt, 1);
> +
> +	if (percpu_ref_init(&policy->refcnt, mpol_release, 0,
> +			GFP_KERNEL)) {
> +		kmem_cache_free(policy_cache, policy);
> +		return ERR_PTR(-ENOMEM);
> +	}
>  	policy->mode = mode;
>  	policy->flags = flags;
>  	policy->home_node = NUMA_NO_NODE;
> @@ -304,14 +332,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
>  	return policy;
>  }
>  
> -/* Slow path of a mpol destructor. */
> -void __mpol_put(struct mempolicy *p)
> -{
> -	if (!atomic_dec_and_test(&p->refcnt))
> -		return;
> -	kmem_cache_free(policy_cache, p);
> -}
> -
>  static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
>  {
>  }
> @@ -1759,14 +1779,8 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
>  		} else if (vma->vm_policy) {
>  			pol = vma->vm_policy;
>  
> -			/*
> -			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
> -			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
> -			 * count on these policies which will be dropped by
> -			 * mpol_cond_put() later
> -			 */
> -			if (mpol_needs_cond_ref(pol))
> -				mpol_get(pol);
> +			/* vma policy is protected by mmap_lock. */
> +			mpol_get(pol);
>  		}
>  	}
>  
> @@ -2423,7 +2437,13 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
>  		nodemask_t mems = cpuset_mems_allowed(current);
>  		mpol_rebind_policy(new, &mems);
>  	}
> -	atomic_set(&new->refcnt, 1);
> +
> +	if (percpu_ref_init(&new->refcnt, mpol_release, 0,
> +			GFP_KERNEL)) {
> +		kmem_cache_free(policy_cache, new);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
>  	return new;
>  }
>  
> @@ -2687,7 +2707,6 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
>  		kmem_cache_free(sn_cache, n);
>  		return NULL;
>  	}
> -	newpol->flags |= MPOL_F_SHARED;
>  	sp_node_init(n, start, end, newpol);
>  
>  	return n;
> @@ -2720,7 +2739,10 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
>  					goto alloc_new;
>  
>  				*mpol_new = *n->policy;
> -				atomic_set(&mpol_new->refcnt, 1);
> +				ret = percpu_ref_init(&mpol_new->refcnt,
> +						mpol_release, 0, GFP_KERNEL);
> +				if (ret)
> +					goto err_out;
>  				sp_node_init(n_new, end, n->end, mpol_new);
>  				n->end = start;
>  				sp_insert(sp, n_new);
> @@ -2756,7 +2778,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
>  	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
>  	if (!mpol_new)
>  		goto err_out;
> -	atomic_set(&mpol_new->refcnt, 1);
> +
>  	goto restart;
>  }
>  
> @@ -2917,7 +2939,8 @@ void __init numa_policy_init(void)
>  		preferred_node_policy[nid] = (struct mempolicy) {
>  			.refcnt = ATOMIC_INIT(1),
>  			.mode = MPOL_PREFERRED,
> -			.flags = MPOL_F_MOF | MPOL_F_MORON,
> +			.flags = MPOL_F_MOF | MPOL_F_MORON
> +				| MPOL_F_STATIC,
>  			.nodes = nodemask_of_node(nid),
>  		};
>  	}
> -- 
> 2.25.1
  
Michal Hocko Jan. 13, 2023, 4:22 p.m. UTC | #3
On Fri 13-01-23 17:20:39, Michal Hocko wrote:
> On Mon 05-12-22 00:14:29, Zhongkun He wrote:
> > All vma manipulation is somewhat protected by a down_read on
> > mmap_lock, so vma mempolicy is clear to obtain a reference.
> > But there is no locking in process context and have a mix
> > of reference counting and per-task requirements which is rather
> > subtle and easy to get wrong.
> > 
> > we would have get_vma_policy() always returning a reference
> > counted policy, except for static policy. For better performance,
> > we replace atomic_t ref with percpu_ref in mempolicy, which is
> > usually the performance bottleneck in hot path.
> > 
> > This series adjust the reference of mempolicy in process context,
> > which will be protected by RCU in read hot path. Besides,
> > task->mempolicy is also protected by task_lock(). Percpu_ref
> > is a good way to reduce cache line bouncing.
> > 
> > The mpol_get/put() can just increment or decrement the local
> > counter. Mpol_kill() must be called to initiate the destruction
> > of mempolicy. A mempolicy will be freed when the mpol_kill()
> > is called and the reference count decrese to zero.
> 
> This is really hard to follow. Without having the context from previous
> discussions I would be completely lost. Please structure your cover
> letter but also other patch in general in the form:
> - what is the problem you would like to deal with
> 	- want to introduce pidfd_set_mempolicy because XYZ
> - what stands in the way
> 	- mempolicy objects access constrains (reliance on operating in
> 	  the current context)
> 	- reference counting needs to be unconditional
> 	- why regular reference counting is not sufficient (performance)
> - what is this patchset proposing
> 	- per cpu reference counting
> 	- how is it implemented
> - how is the patch series structured
> 	- make the reference counting unconditional
> 	- special case static (never released) policies
> 	- replace standard ref counting by per-cpu reference counting

	- introduce pidfd_set_mempolicy
> - how has this been tested?
  
Michal Hocko Jan. 13, 2023, 5:04 p.m. UTC | #4
On Mon 05-12-22 00:14:29, Zhongkun He wrote:
[...]
> +/* Obtain a reference on the specified mpol */
>  static inline void mpol_get(struct mempolicy *pol)
>  {
>  	if (pol)

Shouldn't this be mpol_needs_cond_ref?

> -		atomic_inc(&pol->refcnt);
> +		percpu_ref_get(&pol->refcnt);
> +}
> +
> +static inline bool mpol_tryget(struct mempolicy *pol)
> +{
> +	return pol && percpu_ref_tryget(&pol->refcnt);
>  }
>  
> +/*
> + * This function initiates destruction of mempolicy.

This is not a useful comment. It would be much more helpful to say when
this should be called.

> + */
> +static inline void mpol_kill(struct mempolicy *pol)
> +{
> +        if (pol)
> +                percpu_ref_kill(&pol->refcnt);
> +}
> +
> +
>  extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
>  static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
>  {
> @@ -197,11 +210,15 @@ static inline void mpol_put(struct mempolicy *p)
>  {
>  }
>  
> -static inline void mpol_cond_put(struct mempolicy *pol)
> +static inline void mpol_get(struct mempolicy *pol)
>  {
>  }
>  
> -static inline void mpol_get(struct mempolicy *pol)
> +static inline bool mpol_tryget(struct mempolicy *pol)
> +{
> +}

This should return false, right?

[...]
> +/* Obtain a reference on the specified task mempolicy */

Again, this is pretty much clear from the name. It would be more useful
to explain how the pointer can be used - e.g. needs to call mpol_put
or mpol_kill depending on the calling context.

> +static mempolicy *get_task_mpol(struct task_struct *p)
> +{
> +	struct mempolicy *pol;
> +
> +	rcu_read_lock();
> +	pol = rcu_dereference(p->mempolicy);
> +
> +	if (!pol || mpol_tryget(pol))

Shouldn't be !mpol_tryget?

> +		pol = NULL;
> +	rcu_read_unlock();
> +
> +	return pol;
> +}
> +

I do not see any rcu_assign_pointer for the newly created policy so this
seems incomplete. Ditto no mpol_kill calls. I am unlikely to get into
follow up patches now. Please split up the work so that it is reviewable
more easily and then I can have a further look.

Thanks!
  
Zhongkun He Jan. 14, 2023, 4:10 p.m. UTC | #5
> On Fri 13-01-23 17:20:39, Michal Hocko wrote:
>>
>> This is really hard to follow. Without having the context from previous
>> discussions I would be completely lost. Please structure your cover
>> letter but also other patch in general in the form:
>> - what is the problem you would like to deal with
>> 	- want to introduce pidfd_set_mempolicy because XYZ
>> - what stands in the way
>> 	- mempolicy objects access constrains (reliance on operating in
>> 	  the current context)
>> 	- reference counting needs to be unconditional
>> 	- why regular reference counting is not sufficient (performance)
>> - what is this patchset proposing
>> 	- per cpu reference counting
>> 	- how is it implemented
>> - how is the patch series structured
>> 	- make the reference counting unconditional
>> 	- special case static (never released) policies
>> 	- replace standard ref counting by per-cpu reference counting
> 	- introduce pidfd_set_mempolicy
>> - how has this been tested?

Hi Michal, thanks for your review and suggestions.

I will follow the advice above to structure the letter and
split the patches smaller on next version.

Thanks.
  
Zhongkun He Jan. 14, 2023, 4:52 p.m. UTC | #6
> On Mon 05-12-22 00:14:29, Zhongkun He wrote:
> [...]
>> +/* Obtain a reference on the specified mpol */
>>   static inline void mpol_get(struct mempolicy *pol)
>>   {
>>   	if (pol)
> 
> Shouldn't this be mpol_needs_cond_ref?
> 
>> -		atomic_inc(&pol->refcnt);
>> +		percpu_ref_get(&pol->refcnt);
>> +}
>> +
>> +static inline bool mpol_tryget(struct mempolicy *pol)
>> +{
>> +	return pol && percpu_ref_tryget(&pol->refcnt);
>>   }
>>   
>> +/*
>> + * This function initiates destruction of mempolicy.
> 
> This is not a useful comment. It would be much more helpful to say when
> this should be called.
> 
>> + */
>> +static inline void mpol_kill(struct mempolicy *pol)
>> +{
>> +        if (pol)
>> +                percpu_ref_kill(&pol->refcnt);
>> +}
>> +
>> +
>>   extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
>>   static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
>>   {
>> @@ -197,11 +210,15 @@ static inline void mpol_put(struct mempolicy *p)
>>   {
>>   }
>>   
>> -static inline void mpol_cond_put(struct mempolicy *pol)
>> +static inline void mpol_get(struct mempolicy *pol)
>>   {
>>   }
>>   
>> -static inline void mpol_get(struct mempolicy *pol)
>> +static inline bool mpol_tryget(struct mempolicy *pol)
>> +{
>> +}
> 
> This should return false, right?
> 
> [...]
>> +/* Obtain a reference on the specified task mempolicy */
> 
> Again, this is pretty much clear from the name. It would be more useful
> to explain how the pointer can be used - e.g. needs to call mpol_put
> or mpol_kill depending on the calling context.
> 
>> +static mempolicy *get_task_mpol(struct task_struct *p)
>> +{
>> +	struct mempolicy *pol;
>> +
>> +	rcu_read_lock();
>> +	pol = rcu_dereference(p->mempolicy);
>> +
>> +	if (!pol || mpol_tryget(pol))
> 
> Shouldn't be !mpol_tryget?
> 
>> +		pol = NULL;
>> +	rcu_read_unlock();
>> +
>> +	return pol;
>> +}
>> +
> 
> I do not see any rcu_assign_pointer for the newly created policy so this
> seems incomplete. Ditto no mpol_kill calls. I am unlikely to get into
> follow up patches now. Please split up the work so that it is reviewable
> more easily and then I can have a further look.
> 
> Thanks!

Thanks for your review, some changes may be in other patch,i will
reorganize the patches according to the suggestions to make
things clear.

Thanks.
  

Patch

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d232de7cdc56..9178b008eadf 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -28,12 +28,16 @@  struct mm_struct;
  * of the current process.
  *
  * Locking policy for interleave:
- * In process context there is no locking because only the process accesses
- * its own state. All vma manipulation is somewhat protected by a down_read on
+ * percpu_ref is used to reduce cache line bouncing.
+ * In process context we should obtain a reference via mpol_get()
+ * protected by RCU.
+ * All vma manipulation is somewhat protected by a down_read on
  * mmap_lock.
  *
  * Freeing policy:
- * Mempolicy objects are reference counted.  A mempolicy will be freed when
+ * Mempolicy objects are reference counted. The mpol_get/put can just increment
+ * or decrement the local counter. Mpol_kill() must be called to initiate the
+ * destruction of mempolicy. A mempolicy will be freed when mpol_kill()/
  * mpol_put() decrements the reference count to zero.
  *
  * Duplicating policy objects:
@@ -42,43 +46,36 @@  struct mm_struct;
  * to 1, representing the caller of mpol_dup().
  */
 struct mempolicy {
-	atomic_t refcnt;
-	unsigned short mode; 	/* See MPOL_* above */
+	struct percpu_ref refcnt;	/* reduce cache line bouncing */
+	unsigned short mode;	/* See MPOL_* above */
 	unsigned short flags;	/* See set_mempolicy() MPOL_F_* above */
+	int home_node;          /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
 	nodemask_t nodes;	/* interleave/bind/perfer */
-	int home_node;		/* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
 
 	union {
 		nodemask_t cpuset_mems_allowed;	/* relative to these nodes */
 		nodemask_t user_nodemask;	/* nodemask passed by user */
+		struct rcu_head rcu;	/* used for freeing in an RCU-safe manner */
 	} w;
 };
 
 /*
- * Support for managing mempolicy data objects (clone, copy, destroy)
- * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
+ * Mempolicy pol need explicit unref after use, except for
+ * static policies(default_policy, preferred_node_policy).
  */
-
-extern void __mpol_put(struct mempolicy *pol);
-static inline void mpol_put(struct mempolicy *pol)
+static inline int mpol_needs_cond_ref(struct mempolicy *pol)
 {
-	if (pol)
-		__mpol_put(pol);
+	return (pol && !(pol->flags & MPOL_F_STATIC));
 }
 
 /*
- * Does mempolicy pol need explicit unref after use?
- * Currently only needed for shared policies.
+ * Put a mpol reference obtained via mpol_get().
  */
-static inline int mpol_needs_cond_ref(struct mempolicy *pol)
-{
-	return (pol && (pol->flags & MPOL_F_SHARED));
-}
 
-static inline void mpol_cond_put(struct mempolicy *pol)
+static inline void mpol_put(struct mempolicy *pol)
 {
 	if (mpol_needs_cond_ref(pol))
-		__mpol_put(pol);
+		percpu_ref_put(&pol->refcnt);
 }
 
 extern struct mempolicy *__mpol_dup(struct mempolicy *pol);
@@ -91,12 +88,28 @@  static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
 
 #define vma_policy(vma) ((vma)->vm_policy)
 
+/* Obtain a reference on the specified mpol */
 static inline void mpol_get(struct mempolicy *pol)
 {
 	if (pol)
-		atomic_inc(&pol->refcnt);
+		percpu_ref_get(&pol->refcnt);
+}
+
+static inline bool mpol_tryget(struct mempolicy *pol)
+{
+	return pol && percpu_ref_tryget(&pol->refcnt);
 }
 
+/*
+ * This function initiates destruction of mempolicy.
+ */
+static inline void mpol_kill(struct mempolicy *pol)
+{
+        if (pol)
+                percpu_ref_kill(&pol->refcnt);
+}
+
+
 extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
 static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
@@ -197,11 +210,15 @@  static inline void mpol_put(struct mempolicy *p)
 {
 }
 
-static inline void mpol_cond_put(struct mempolicy *pol)
+static inline void mpol_get(struct mempolicy *pol)
 {
 }
 
-static inline void mpol_get(struct mempolicy *pol)
+static inline bool mpol_tryget(struct mempolicy *pol)
+{
+}
+
+static inline void mpol_kill(struct mempolicy *pol)
 {
 }
 
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 046d0ccba4cd..940e1a88a4da 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -60,7 +60,7 @@  enum {
  * "mode flags".  These flags are allocated from bit 0 up, as they
  * are never OR'ed into the mode in mempolicy API arguments.
  */
-#define MPOL_F_SHARED  (1 << 0)	/* identify shared policies */
+#define MPOL_F_STATIC	(1 << 0) /* identify static policies(e.g default_policy) */
 #define MPOL_F_MOF	(1 << 3) /* this policy wants migrate on fault */
 #define MPOL_F_MORON	(1 << 4) /* Migrate On protnone Reference On Node */
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 61aa9aedb728..ee3e2ed5ef07 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -124,8 +124,8 @@  enum zone_type policy_zone = 0;
  * run-time system-wide default policy => local allocation
  */
 static struct mempolicy default_policy = {
-	.refcnt = ATOMIC_INIT(1), /* never free it */
 	.mode = MPOL_LOCAL,
+	.flags = MPOL_F_STATIC,
 };
 
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
@@ -158,9 +158,32 @@  int numa_map_to_online_node(int node)
 }
 EXPORT_SYMBOL_GPL(numa_map_to_online_node);
 
+/* Obtain a reference on the specified task mempolicy */
+static mempolicy *get_task_mpol(struct task_struct *p)
+{
+	struct mempolicy *pol;
+
+	rcu_read_lock();
+	pol = rcu_dereference(p->mempolicy);
+
+	if (!pol || mpol_tryget(pol))
+		pol = NULL;
+	rcu_read_unlock();
+
+	return pol;
+}
+
+static void mpol_release(struct percpu_ref *ref)
+{
+	struct mempolicy *mpol = container_of(ref, struct mempolicy, refcnt);
+
+	percpu_ref_exit(ref);
+	kfree_rcu(mpol, w.rcu);
+}
+
 struct mempolicy *get_task_policy(struct task_struct *p)
 {
-	struct mempolicy *pol = p->mempolicy;
+	struct mempolicy *pol = get_task_mpol(p);
 	int node;
 
 	if (pol)
@@ -296,7 +319,12 @@  static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 	if (!policy)
 		return ERR_PTR(-ENOMEM);
-	atomic_set(&policy->refcnt, 1);
+
+	if (percpu_ref_init(&policy->refcnt, mpol_release, 0,
+			GFP_KERNEL)) {
+		kmem_cache_free(policy_cache, policy);
+		return ERR_PTR(-ENOMEM);
+	}
 	policy->mode = mode;
 	policy->flags = flags;
 	policy->home_node = NUMA_NO_NODE;
@@ -304,14 +332,6 @@  static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	return policy;
 }
 
-/* Slow path of a mpol destructor. */
-void __mpol_put(struct mempolicy *p)
-{
-	if (!atomic_dec_and_test(&p->refcnt))
-		return;
-	kmem_cache_free(policy_cache, p);
-}
-
 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 {
 }
@@ -1759,14 +1779,8 @@  struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
 		} else if (vma->vm_policy) {
 			pol = vma->vm_policy;
 
-			/*
-			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
-			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
-			 * count on these policies which will be dropped by
-			 * mpol_cond_put() later
-			 */
-			if (mpol_needs_cond_ref(pol))
-				mpol_get(pol);
+			/* vma policy is protected by mmap_lock. */
+			mpol_get(pol);
 		}
 	}
 
@@ -2423,7 +2437,13 @@  struct mempolicy *__mpol_dup(struct mempolicy *old)
 		nodemask_t mems = cpuset_mems_allowed(current);
 		mpol_rebind_policy(new, &mems);
 	}
-	atomic_set(&new->refcnt, 1);
+
+	if (percpu_ref_init(&new->refcnt, mpol_release, 0,
+			GFP_KERNEL)) {
+		kmem_cache_free(policy_cache, new);
+		return ERR_PTR(-ENOMEM);
+	}
+
 	return new;
 }
 
@@ -2687,7 +2707,6 @@  static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
 		kmem_cache_free(sn_cache, n);
 		return NULL;
 	}
-	newpol->flags |= MPOL_F_SHARED;
 	sp_node_init(n, start, end, newpol);
 
 	return n;
@@ -2720,7 +2739,10 @@  static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
 					goto alloc_new;
 
 				*mpol_new = *n->policy;
-				atomic_set(&mpol_new->refcnt, 1);
+				ret = percpu_ref_init(&mpol_new->refcnt,
+						mpol_release, 0, GFP_KERNEL);
+				if (ret)
+					goto err_out;
 				sp_node_init(n_new, end, n->end, mpol_new);
 				n->end = start;
 				sp_insert(sp, n_new);
@@ -2756,7 +2778,7 @@  static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 	if (!mpol_new)
 		goto err_out;
-	atomic_set(&mpol_new->refcnt, 1);
+
 	goto restart;
 }
 
@@ -2917,7 +2939,8 @@  void __init numa_policy_init(void)
 		preferred_node_policy[nid] = (struct mempolicy) {
 			.refcnt = ATOMIC_INIT(1),
 			.mode = MPOL_PREFERRED,
-			.flags = MPOL_F_MOF | MPOL_F_MORON,
+			.flags = MPOL_F_MOF | MPOL_F_MORON
+				| MPOL_F_STATIC,
 			.nodes = nodemask_of_node(nid),
 		};
 	}