[v4,03/18] KVM: x86/mmu: Track count of pages in KVM MMU page caches globally

Message ID 20230306224127.1689967-4-vipinsh@google.com
State New
Series: NUMA aware page table allocation

Commit Message

Vipin Sharma March 6, 2023, 10:41 p.m. UTC
Create a global counter for the total number of pages available
in MMU page caches across all VMs. Add mmu_shadow_page_cache
pages to this counter.

This accounting will be used in future commits to shrink MMU caches via
KVM MMU shrinker.

Signed-off-by: Vipin Sharma <vipinsh@google.com>
---
 arch/x86/include/asm/kvm_host.h |  5 ++
 arch/x86/kvm/mmu/mmu.c          | 90 ++++++++++++++++++++++++++++-----
 arch/x86/kvm/mmu/mmu_internal.h |  2 +
 arch/x86/kvm/mmu/paging_tmpl.h  | 25 +++++----
 arch/x86/kvm/mmu/tdp_mmu.c      |  3 +-
 5 files changed, 100 insertions(+), 25 deletions(-)
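
At a glance, the change is a standard percpu_counter lifecycle wrapped
around the shadow page cache. A condensed sketch of how the counter is
used (paraphrasing this patch plus the shrinker read discussed later in
the thread; not new code, and the names are the ones in the diff below):

    static struct percpu_counter kvm_total_unused_cached_pages;

    /* Module init/exit (kvm_mmu_vendor_module_init/_exit). */
    percpu_counter_init(&kvm_total_unused_cached_pages, 0, GFP_KERNEL);
    percpu_counter_destroy(&kvm_total_unused_cached_pages);

    /* Cache top-up: add however many pages the top-up actually added. */
    percpu_counter_add(&kvm_total_unused_cached_pages,
                       cache->nobjs - orig_nobjs);

    /* Taking a page out of the cache for a new shadow page. */
    percpu_counter_dec(&kvm_total_unused_cached_pages);

    /* Freeing a whole cache. */
    percpu_counter_sub(&kvm_total_unused_cached_pages, cache->nobjs);

    /* Shrinker ->count_objects() in a later patch: exact (slow) read. */
    s64 count = percpu_counter_sum(&kvm_total_unused_cached_pages);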
  

Comments

kernel test robot March 7, 2023, 11:32 a.m. UTC | #1
Hi Vipin,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on kvm/queue]
[also build test WARNING on kvmarm/next linus/master v6.3-rc1 next-20230307]
[cannot apply to mst-vhost/linux-next kvm/linux-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Vipin-Sharma/KVM-x86-mmu-Change-KVM-mmu-shrinker-to-no-op/20230307-064510
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link:    https://lore.kernel.org/r/20230306224127.1689967-4-vipinsh%40google.com
patch subject: [Patch v4 03/18] KVM: x86/mmu: Track count of pages in KVM MMU page caches globally
config: i386-randconfig-a003-20230306 (https://download.01.org/0day-ci/archive/20230307/202303071940.sFeQ4FpU-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/511e837798da25063830276b8a3345c7601c6459
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Vipin-Sharma/KVM-x86-mmu-Change-KVM-mmu-shrinker-to-no-op/20230307-064510
        git checkout 511e837798da25063830276b8a3345c7601c6459
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=i386 olddefconfig
        make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash arch/x86/kvm/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303071940.sFeQ4FpU-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> arch/x86/kvm/mmu/mmu.c:676: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
    * Caller should hold mutex lock corresponding to cache, if available.
   arch/x86/kvm/mmu/mmu.c:693: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
    * Caller should hold mutex lock corresponding to kvm_mmu_memory_cache, if
   arch/x86/kvm/mmu/mmu.c:1404: warning: Function parameter or member 'kvm' not described in 'kvm_arch_mmu_enable_log_dirty_pt_masked'
   arch/x86/kvm/mmu/mmu.c:1404: warning: Function parameter or member 'slot' not described in 'kvm_arch_mmu_enable_log_dirty_pt_masked'
   arch/x86/kvm/mmu/mmu.c:1404: warning: Function parameter or member 'gfn_offset' not described in 'kvm_arch_mmu_enable_log_dirty_pt_masked'
   arch/x86/kvm/mmu/mmu.c:1404: warning: Function parameter or member 'mask' not described in 'kvm_arch_mmu_enable_log_dirty_pt_masked'


vim +676 arch/x86/kvm/mmu/mmu.c

   674	
   675	/**
 > 676	 * Caller should hold mutex lock corresponding to cache, if available.
   677	 */
   678	static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
   679					     int min)
   680	{
   681		int orig_nobjs, r;
   682	
   683		orig_nobjs = cache->nobjs;
   684		r = kvm_mmu_topup_memory_cache(cache, min);
   685		if (orig_nobjs != cache->nobjs)
   686			percpu_counter_add(&kvm_total_unused_cached_pages,
   687					   (cache->nobjs - orig_nobjs));
   688	
   689		return r;
   690	}
   691
  
kernel test robot March 7, 2023, 12:13 p.m. UTC | #2
Hi Vipin,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on kvm/queue]
[also build test WARNING on kvmarm/next linus/master v6.3-rc1 next-20230307]
[cannot apply to mst-vhost/linux-next kvm/linux-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Vipin-Sharma/KVM-x86-mmu-Change-KVM-mmu-shrinker-to-no-op/20230307-064510
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link:    https://lore.kernel.org/r/20230306224127.1689967-4-vipinsh%40google.com
patch subject: [Patch v4 03/18] KVM: x86/mmu: Track count of pages in KVM MMU page caches globally
config: x86_64-randconfig-a016-20230306 (https://download.01.org/0day-ci/archive/20230307/202303071922.uBuWNRnA-lkp@intel.com/config)
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/511e837798da25063830276b8a3345c7601c6459
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Vipin-Sharma/KVM-x86-mmu-Change-KVM-mmu-shrinker-to-no-op/20230307-064510
        git checkout 511e837798da25063830276b8a3345c7601c6459
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash arch/x86/kvm/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303071922.uBuWNRnA-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> arch/x86/kvm/mmu/mmu.c:676: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
    * Caller should hold mutex lock corresponding to cache, if available.
   arch/x86/kvm/mmu/mmu.c:693: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
    * Caller should hold mutex lock corresponding to kvm_mmu_memory_cache, if
   arch/x86/kvm/mmu/mmu.c:1404: warning: Function parameter or member 'kvm' not described in 'kvm_arch_mmu_enable_log_dirty_pt_masked'
   arch/x86/kvm/mmu/mmu.c:1404: warning: Function parameter or member 'slot' not described in 'kvm_arch_mmu_enable_log_dirty_pt_masked'
   arch/x86/kvm/mmu/mmu.c:1404: warning: Function parameter or member 'gfn_offset' not described in 'kvm_arch_mmu_enable_log_dirty_pt_masked'
   arch/x86/kvm/mmu/mmu.c:1404: warning: Function parameter or member 'mask' not described in 'kvm_arch_mmu_enable_log_dirty_pt_masked'


vim +676 arch/x86/kvm/mmu/mmu.c

   674	
   675	/**
 > 676	 * Caller should hold mutex lock corresponding to cache, if available.
   677	 */
   678	static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
   679					     int min)
   680	{
   681		int orig_nobjs, r;
   682	
   683		orig_nobjs = cache->nobjs;
   684		r = kvm_mmu_topup_memory_cache(cache, min);
   685		if (orig_nobjs != cache->nobjs)
   686			percpu_counter_add(&kvm_total_unused_cached_pages,
   687					   (cache->nobjs - orig_nobjs));
   688	
   689		return r;
   690	}
   691
  
Vipin Sharma March 7, 2023, 7:13 p.m. UTC | #3
On Tue, Mar 7, 2023 at 3:33 AM kernel test robot <lkp@intel.com> wrote:
>
> Hi Vipin,
>
> Thank you for the patch! Perhaps something to improve:
>
> [auto build test WARNING on kvm/queue]
> [also build test WARNING on kvmarm/next linus/master v6.3-rc1 next-20230307]
> [cannot apply to mst-vhost/linux-next kvm/linux-next]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> All warnings (new ones prefixed by >>):
>
> >> arch/x86/kvm/mmu/mmu.c:676: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst

> vim +676 arch/x86/kvm/mmu/mmu.c
>
>    674
>    675  /**
>  > 676   * Caller should hold mutex lock corresponding to cache, if available.
>    677   */

I will fix it in the next version.
  
Sean Christopherson March 7, 2023, 8:18 p.m. UTC | #4
On Tue, Mar 07, 2023, Vipin Sharma wrote:
> On Tue, Mar 7, 2023 at 3:33 AM kernel test robot <lkp@intel.com> wrote:
> >
> > Hi Vipin,
> >
> > Thank you for the patch! Perhaps something to improve:
> >
> > [auto build test WARNING on kvm/queue]
> > [also build test WARNING on kvmarm/next linus/master v6.3-rc1 next-20230307]
> > [cannot apply to mst-vhost/linux-next kvm/linux-next]
> > [If your patch is applied to the wrong git tree, kindly drop us a note.
> > And when submitting patch, we suggest to use '--base' as documented in
> > https://git-scm.com/docs/git-format-patch#_base_tree_information]
> >
> > All warnings (new ones prefixed by >>):
> >
> > >> arch/x86/kvm/mmu/mmu.c:676: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
> 
> > vim +676 arch/x86/kvm/mmu/mmu.c
> >
> >    674
> >    675  /**
> >  > 676   * Caller should hold mutex lock corresponding to cache, if available.
> >    677   */
> 
> I will fix it in the next version.

Don't bother reworking the code/comment, I will likely have feedback that results
in the demise of the comment altogether (comments that say "lock X must be held"
are almost always flawed).
  
Zhi Wang March 8, 2023, 8:33 p.m. UTC | #5
On Mon,  6 Mar 2023 14:41:12 -0800
Vipin Sharma <vipinsh@google.com> wrote:

> Create a global counter for total number of pages available
> in MMU page caches across all VMs. Add mmu_shadow_page_cache
> pages to this counter.
> 
> This accounting will be used in future commits to shrink MMU caches via
> KVM MMU shrinker.
> 
> Signed-off-by: Vipin Sharma <vipinsh@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  5 ++
>  arch/x86/kvm/mmu/mmu.c          | 90 ++++++++++++++++++++++++++++-----
>  arch/x86/kvm/mmu/mmu_internal.h |  2 +
>  arch/x86/kvm/mmu/paging_tmpl.h  | 25 +++++----
>  arch/x86/kvm/mmu/tdp_mmu.c      |  3 +-
>  5 files changed, 100 insertions(+), 25 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index ebbe692acf3f..4322c7020d5d 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -791,6 +791,11 @@ struct kvm_vcpu_arch {
>  	struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
>  	struct kvm_mmu_memory_cache mmu_page_header_cache;
>  
> +	/*
> +	 * Protect allocation and release of pages from mmu_shadow_page_cache.
> +	 */
> +	struct mutex mmu_shadow_page_cache_lock;
> +
>  	/*
>  	 * QEMU userspace and the guest each have their own FPU state.
>  	 * In vcpu_run, we switch between the user and guest FPU contexts.
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 3a452989f5cd..13f41b7ac280 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -167,6 +167,11 @@ struct kvm_shadow_walk_iterator {
>  static struct kmem_cache *pte_list_desc_cache;
>  struct kmem_cache *mmu_page_header_cache;
>  
> +/*
> + * Global count of unused pages in MMU page caches across all VMs.
> + */
> +static struct percpu_counter kvm_total_unused_cached_pages;
> +
>  static void mmu_spte_set(u64 *sptep, u64 spte);
>  
>  struct kvm_mmu_role_regs {
> @@ -667,6 +672,34 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
>  	}
>  }
>  
> +/**
> + * Caller should hold mutex lock corresponding to cache, if available.
> + */
> +static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> +				     int min)
> +{
> +	int orig_nobjs, r;
> +
> +	orig_nobjs = cache->nobjs;
> +	r = kvm_mmu_topup_memory_cache(cache, min);
> +	if (orig_nobjs != cache->nobjs)
> +		percpu_counter_add(&kvm_total_unused_cached_pages,
> +				   (cache->nobjs - orig_nobjs));
> +
> +	return r;
> +}
> +

Maybe kvm_mmu_topup_shadow_page_cache() would be better?

As a user of kvm_mmu_topup_memory_cache(), mmu_topup_memory_cache() is not
supposed to directly touch the kvm_mmu_memory_cache meta data.

The name "mmu_topup_sp_memory_cache()" seems similar to "mmu_topup_memory_cache()".
Renaming it would make its level of abstraction self-documenting.

> +/**
> + * Caller should hold mutex lock corresponding to kvm_mmu_memory_cache, if
> + * available.
> + */
> +static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache)
> +{
> +	if (cache->nobjs)
> +		percpu_counter_sub(&kvm_total_unused_cached_pages, cache->nobjs);
> +	kvm_mmu_free_memory_cache(cache);
> +}
> +
>  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>  {
>  	int r;
> @@ -676,10 +709,11 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>  				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
>  	if (r)
>  		return r;
> -	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> -				       PT64_ROOT_MAX_LEVEL);
> +
> +	r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache, PT64_ROOT_MAX_LEVEL);
>  	if (r)
>  		return r;
> +
>  	if (maybe_indirect) {
>  		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
>  					       PT64_ROOT_MAX_LEVEL);
> @@ -693,7 +727,9 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
>  {
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
> -	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> +	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
> +	mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> +	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
>  }
> @@ -2148,6 +2184,7 @@ struct shadow_page_caches {
>  	struct kvm_mmu_memory_cache *page_header_cache;
>  	struct kvm_mmu_memory_cache *shadow_page_cache;
>  	struct kvm_mmu_memory_cache *shadowed_info_cache;
> +	bool count_shadow_page_allocation;
>  };
>  
>  static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
> @@ -2159,7 +2196,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
>  	struct kvm_mmu_page *sp;
>  
>  	sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
> -	sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
> +	sp->spt = mmu_sp_memory_cache_alloc(caches->shadow_page_cache,
> +					    caches->count_shadow_page_allocation);
>  	if (!role.direct)
>  		sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
>  
> @@ -2216,6 +2254,7 @@ static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
>  		.page_header_cache = &vcpu->arch.mmu_page_header_cache,
>  		.shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
>  		.shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
> +		.count_shadow_page_allocation = true,
>  	};
>  
>  	return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
> @@ -4314,29 +4353,32 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  	if (r != RET_PF_INVALID)
>  		return r;
>  
> +	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	r = mmu_topup_memory_caches(vcpu, false);
>  	if (r)
> -		return r;
> +		goto out_page_cache_unlock;
>  
>  	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
>  	if (r != RET_PF_CONTINUE)
> -		return r;
> +		goto out_page_cache_unlock;
>  
>  	r = RET_PF_RETRY;
>  	write_lock(&vcpu->kvm->mmu_lock);
>  
>  	if (is_page_fault_stale(vcpu, fault))
> -		goto out_unlock;
> +		goto out_mmu_unlock;
>  
>  	r = make_mmu_pages_available(vcpu);
>  	if (r)
> -		goto out_unlock;
> +		goto out_mmu_unlock;
>  
>  	r = direct_map(vcpu, fault);
>  
> -out_unlock:
> +out_mmu_unlock:
>  	write_unlock(&vcpu->kvm->mmu_lock);
>  	kvm_release_pfn_clean(fault->pfn);
> +out_page_cache_unlock:
> +	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	return r;
>  }
>  
> @@ -4396,25 +4438,28 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
>  	if (r != RET_PF_INVALID)
>  		return r;
>  
> +	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);

Can you elaborate on why this lock is required? When will this lock contend?

1) Previously, mmu_topup_memory_caches() worked fine without a lock.
2) IMHO, I suspect this lock affects the parallelization of TDP MMU
fault handling.

TDP MMU fault handling is intended to be optimized for parallel fault
handling by taking a read lock and operating on the page table via atomic
operations. Multiple fault handlers can enter the TDP MMU fault path
because of read_lock(&vcpu->kvm->mmu_lock) below.

With this lock, it seems part of the benefit of parallelization is gone,
because the lock can contend earlier, above. Will this cause a performance
regression?

If the lock will not contend above, then I am not sure we need it.

>  	r = mmu_topup_memory_caches(vcpu, false);
>  	if (r)
> -		return r;
> +		goto out_page_cache_unlock;
>  
>  	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
>  	if (r != RET_PF_CONTINUE)
> -		return r;
> +		goto out_page_cache_unlock;
>  
>  	r = RET_PF_RETRY;
>  	read_lock(&vcpu->kvm->mmu_lock);
>  
>  	if (is_page_fault_stale(vcpu, fault))
> -		goto out_unlock;
> +		goto out_mmu_unlock;
>  
>  	r = kvm_tdp_mmu_map(vcpu, fault);
>  
> -out_unlock:
> +out_mmu_unlock:
>  	read_unlock(&vcpu->kvm->mmu_lock);
>  	kvm_release_pfn_clean(fault->pfn);
> +out_page_cache_unlock:
> +	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	return r;
>  }
>  #endif
> @@ -5394,6 +5439,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
>  {
>  	int r;
>  
> +	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
>  	if (r)
>  		goto out;
> @@ -5420,6 +5466,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
>  	 */
>  	static_call(kvm_x86_flush_tlb_current)(vcpu);
>  out:
> +	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	return r;
>  }
>  
> @@ -5924,6 +5971,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
>  	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
>  
>  	vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
> +	mutex_init(&vcpu->arch.mmu_shadow_page_cache_lock);
>  
>  	vcpu->arch.mmu = &vcpu->arch.root_mmu;
>  	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
> @@ -6769,12 +6817,17 @@ int kvm_mmu_vendor_module_init(void)
>  	if (!mmu_page_header_cache)
>  		goto out;
>  
> +	if (percpu_counter_init(&kvm_total_unused_cached_pages, 0, GFP_KERNEL))
> +		goto out;
> +
>  	ret = register_shrinker(&mmu_shrinker, "x86-mmu");
>  	if (ret)
> -		goto out;
> +		goto out_shrinker;
>  
>  	return 0;
>  
> +out_shrinker:
> +	percpu_counter_destroy(&kvm_total_unused_cached_pages);
>  out:
>  	mmu_destroy_caches();
>  	return ret;
> @@ -6792,6 +6845,7 @@ void kvm_mmu_vendor_module_exit(void)
>  {
>  	mmu_destroy_caches();
>  	unregister_shrinker(&mmu_shrinker);
> +	percpu_counter_destroy(&kvm_total_unused_cached_pages);
>  }
>  
>  /*
> @@ -6994,3 +7048,11 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
>  	if (kvm->arch.nx_huge_page_recovery_thread)
>  		kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
>  }
> +
> +void *mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> +				bool count_allocation)
> +{
> +	if (count_allocation && shadow_page_cache->nobjs)
> +		percpu_counter_dec(&kvm_total_unused_cached_pages);
> +	return kvm_mmu_memory_cache_alloc(shadow_page_cache);
> +}
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index cc58631e2336..798cfbf0a36b 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -338,5 +338,7 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  
>  void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>  void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> +void *mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *cache,
> +				bool count_allocation);
>  
>  #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 57f0b75c80f9..1dea9be6849d 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -821,9 +821,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  		return RET_PF_EMULATE;
>  	}
>  
> +	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	r = mmu_topup_memory_caches(vcpu, true);
>  	if (r)
> -		return r;
> +		goto out_page_cache_unlock;
>  
>  	vcpu->arch.write_fault_to_shadow_pgtable = false;
>  
> @@ -837,7 +838,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  
>  	r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
>  	if (r != RET_PF_CONTINUE)
> -		return r;
> +		goto out_page_cache_unlock;
>  
>  	/*
>  	 * Do not change pte_access if the pfn is a mmio page, otherwise
> @@ -862,16 +863,18 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  	write_lock(&vcpu->kvm->mmu_lock);
>  
>  	if (is_page_fault_stale(vcpu, fault))
> -		goto out_unlock;
> +		goto out_mmu_unlock;
>  
>  	r = make_mmu_pages_available(vcpu);
>  	if (r)
> -		goto out_unlock;
> +		goto out_mmu_unlock;
>  	r = FNAME(fetch)(vcpu, fault, &walker);
>  
> -out_unlock:
> +out_mmu_unlock:
>  	write_unlock(&vcpu->kvm->mmu_lock);
>  	kvm_release_pfn_clean(fault->pfn);
> +out_page_cache_unlock:
> +	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	return r;
>  }
>  
> @@ -897,17 +900,18 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
>  
>  	vcpu_clear_mmio_info(vcpu, gva);
>  
> +	if (!VALID_PAGE(root_hpa)) {
> +		WARN_ON(1);
> +		return;
> +	}
> +
> +	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	/*
>  	 * No need to check return value here, rmap_can_add() can
>  	 * help us to skip pte prefetch later.
>  	 */
>  	mmu_topup_memory_caches(vcpu, true);
>  
> -	if (!VALID_PAGE(root_hpa)) {
> -		WARN_ON(1);
> -		return;
> -	}
> -
>  	write_lock(&vcpu->kvm->mmu_lock);
>  	for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
>  		level = iterator.level;
> @@ -943,6 +947,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
>  			break;
>  	}
>  	write_unlock(&vcpu->kvm->mmu_lock);
> +	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  }
>  
>  /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 7c25dbf32ecc..fa6eb1e9101e 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -265,7 +265,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
>  	struct kvm_mmu_page *sp;
>  
>  	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> -	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> +	sp->spt = mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache,
> +					    true);
>  
>  	return sp;
>  }
  
Vipin Sharma March 8, 2023, 10:16 p.m. UTC | #6
On Wed, Mar 8, 2023 at 12:33 PM Zhi Wang <zhi.wang.linux@gmail.com> wrote:
>
> On Mon,  6 Mar 2023 14:41:12 -0800
> Vipin Sharma <vipinsh@google.com> wrote:
> > +/**
> > + * Caller should hold mutex lock corresponding to cache, if available.
> > + */
> > +static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> > +                                  int min)
> > +{
> > +     int orig_nobjs, r;
> > +
> > +     orig_nobjs = cache->nobjs;
> > +     r = kvm_mmu_topup_memory_cache(cache, min);
> > +     if (orig_nobjs != cache->nobjs)
> > +             percpu_counter_add(&kvm_total_unused_cached_pages,
> > +                                (cache->nobjs - orig_nobjs));
> > +
> > +     return r;
> > +}
> > +
>
> Maybe kvm_mmu_topup_shadow_page_cache() would be better?
>
> As a user of kvm_mmu_topup_memory_cache(), mmu_topup_memory_cache() is not
> supposed to directly touch the kvm_mmu_memory_cache meta data.
>
> The name "mmu_topup_sp_memory_cache()" seems similar with "mmu_topup_memory_cache()".
> Renaming it would make its level self-documenting.
>

Sounds good. I will rename it.

> > @@ -4396,25 +4438,28 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
> >       if (r != RET_PF_INVALID)
> >               return r;
> >
> > +     mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
>
> Can you elaborate more why this lock is required? When will this lock contend?

This lock is not needed in this patch. It is used in patch 4, where the
cache is freed by the MMU shrinker. Sean also mentioned this in an
internal discussion. I will move the lock to the patch where it is
actually used.

>
> 1) Previously mmu_topup_memory_caches() works fine without a lock.
> 2) IMHO I was suspecting if this lock seems affects the parallelization
> of the TDP MMU fault handling.
>
> TDP MMU fault handling is intend to be optimized for parallelization fault
> handling by taking a read lock and operating the page table via atomic
> operations. Multiple fault handling can enter the TDP MMU fault path
> because of read_lock(&vcpu->kvm->mmu_lock) below.
>
> W/ this lock, it seems the part of benefit of parallelization is gone
> because the lock can contend earlier above. Will this cause performance
> regression?

This is a per-vCPU lock; with it, each vCPU will still be able to
perform parallel fault handling without contending for the lock.

>
> If the lock will not contend above, then I am not sure if we need it.
>

Not in this patch, but in patch 4 we will need it when clearing the
cache via the MMU shrinker. I will move it to the patch where it is
actually needed.
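
For context, a rough illustration of the shrinker-side pattern this
per-vCPU mutex is meant to enable. This is only a sketch of the idea,
not the actual patch 4 code (whether the real shrinker trylocks or
blocks, and exactly what it frees, are details of that patch); the lock,
cache, and mmu_free_sp_memory_cache() names are from this patch:

    /* Hypothetical shrink_scan-style walk over all vCPU caches. */
    struct kvm *kvm;
    struct kvm_vcpu *vcpu;
    unsigned long i, freed = 0;

    mutex_lock(&kvm_lock);
    list_for_each_entry(kvm, &vm_list, vm_list) {
            kvm_for_each_vcpu(i, vcpu, kvm) {
                    /* Skip a vCPU that is busy topping up / faulting. */
                    if (!mutex_trylock(&vcpu->arch.mmu_shadow_page_cache_lock))
                            continue;
                    freed += vcpu->arch.mmu_shadow_page_cache.nobjs;
                    mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
                    mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
            }
    }
    mutex_unlock(&kvm_lock);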
  
Mingwei Zhang March 9, 2023, 5:18 a.m. UTC | #7
> >
> > 1) Previously mmu_topup_memory_caches() works fine without a lock.
> > 2) IMHO I was suspecting if this lock seems affects the parallelization
> > of the TDP MMU fault handling.
> >
> > TDP MMU fault handling is intend to be optimized for parallelization fault
> > handling by taking a read lock and operating the page table via atomic
> > operations. Multiple fault handling can enter the TDP MMU fault path
> > because of read_lock(&vcpu->kvm->mmu_lock) below.
> >
> > W/ this lock, it seems the part of benefit of parallelization is gone
> > because the lock can contend earlier above. Will this cause performance
> > regression?
> 
> This is a per vCPU lock, with this lock each vCPU will still be able
> to perform parallel fault handling without contending for lock.
> 

I am curious how effective acquiring this per-vCPU lock will be. If a
vCPU thread stays within the (host) kernel (vmx root/non-root) for the
vast majority of the time, won't the shrinker always fail to make any
progress?
  
Zhi Wang March 9, 2023, 12:52 p.m. UTC | #8
On Thu, 9 Mar 2023 05:18:11 +0000
Mingwei Zhang <mizhang@google.com> wrote:

> > >
> > > 1) Previously mmu_topup_memory_caches() works fine without a lock.
> > > 2) IMHO I was suspecting if this lock seems affects the parallelization
> > > of the TDP MMU fault handling.
> > >
> > > TDP MMU fault handling is intend to be optimized for parallelization fault
> > > handling by taking a read lock and operating the page table via atomic
> > > operations. Multiple fault handling can enter the TDP MMU fault path
> > > because of read_lock(&vcpu->kvm->mmu_lock) below.
> > >
> > > W/ this lock, it seems the part of benefit of parallelization is gone
> > > because the lock can contend earlier above. Will this cause performance
> > > regression?
> > 
> > This is a per vCPU lock, with this lock each vCPU will still be able
> > to perform parallel fault handling without contending for lock.
> > 
> 
> I am curious how effective it is by trying to accquiring this per vCPU
> lock? If a vcpu thread should stay within the (host) kernel (vmx
> root/non-root) for the vast majority of the time, isn't the shrinker
> always fail to make any progress?

IMHO the lock is to prevent the faulting path from being disturbed by the
shrinker. I guess even if a vCPU thread stays in the host kernel, the
shrinker should still be able to harvest the pages from the cache as long
as there is no flood of faults.

I am curious about the effectiveness as well. It would be nice if there
were some unit tests that people could run by themselves to see the
results: when the shrinker isn't triggered, is faulting still as effective
as before, and when the shrinker is triggered, what actually happens when
the system memory is under different levels of pressure (e.g. how much
faulting is slowed down).
  
Zhi Wang March 9, 2023, 3:37 p.m. UTC | #9
On Mon,  6 Mar 2023 14:41:12 -0800
Vipin Sharma <vipinsh@google.com> wrote:

> Create a global counter for total number of pages available
> in MMU page caches across all VMs. Add mmu_shadow_page_cache
> pages to this counter.
> 
> This accounting will be used in future commits to shrink MMU caches via
> KVM MMU shrinker.
> 
> Signed-off-by: Vipin Sharma <vipinsh@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  5 ++
>  arch/x86/kvm/mmu/mmu.c          | 90 ++++++++++++++++++++++++++++-----
>  arch/x86/kvm/mmu/mmu_internal.h |  2 +
>  arch/x86/kvm/mmu/paging_tmpl.h  | 25 +++++----
>  arch/x86/kvm/mmu/tdp_mmu.c      |  3 +-
>  5 files changed, 100 insertions(+), 25 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index ebbe692acf3f..4322c7020d5d 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -791,6 +791,11 @@ struct kvm_vcpu_arch {
>  	struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
>  	struct kvm_mmu_memory_cache mmu_page_header_cache;
>  
> +	/*
> +	 * Protect allocation and release of pages from mmu_shadow_page_cache.
> +	 */
> +	struct mutex mmu_shadow_page_cache_lock;
> +
>  	/*
>  	 * QEMU userspace and the guest each have their own FPU state.
>  	 * In vcpu_run, we switch between the user and guest FPU contexts.
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 3a452989f5cd..13f41b7ac280 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -167,6 +167,11 @@ struct kvm_shadow_walk_iterator {
>  static struct kmem_cache *pte_list_desc_cache;
>  struct kmem_cache *mmu_page_header_cache;
>  
> +/*
> + * Global count of unused pages in MMU page caches across all VMs.
> + */
> +static struct percpu_counter kvm_total_unused_cached_pages;
> +
>  static void mmu_spte_set(u64 *sptep, u64 spte);
>  
>  struct kvm_mmu_role_regs {
> @@ -667,6 +672,34 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
>  	}
>  }
>  
> +/**
> + * Caller should hold mutex lock corresponding to cache, if available.
> + */
> +static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> +				     int min)
> +{
> +	int orig_nobjs, r;
> +
> +	orig_nobjs = cache->nobjs;
> +	r = kvm_mmu_topup_memory_cache(cache, min);
> +	if (orig_nobjs != cache->nobjs)
> +		percpu_counter_add(&kvm_total_unused_cached_pages,
> +				   (cache->nobjs - orig_nobjs));
> +
> +	return r;
> +}
> +
> +/**
> + * Caller should hold mutex lock corresponding to kvm_mmu_memory_cache, if
> + * available.
> + */
> +static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache)
> +{
> +	if (cache->nobjs)
> +		percpu_counter_sub(&kvm_total_unused_cached_pages, cache->nobjs);
> +	kvm_mmu_free_memory_cache(cache);
> +}
> +
>  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>  {
>  	int r;
> @@ -676,10 +709,11 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>  				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
>  	if (r)
>  		return r;
> -	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> -				       PT64_ROOT_MAX_LEVEL);
> +
> +	r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache, PT64_ROOT_MAX_LEVEL);
>  	if (r)
>  		return r;
> +
>  	if (maybe_indirect) {
>  		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
>  					       PT64_ROOT_MAX_LEVEL);
> @@ -693,7 +727,9 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
>  {
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
> -	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> +	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
> +	mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> +	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
>  }
> @@ -2148,6 +2184,7 @@ struct shadow_page_caches {
>  	struct kvm_mmu_memory_cache *page_header_cache;
>  	struct kvm_mmu_memory_cache *shadow_page_cache;
>  	struct kvm_mmu_memory_cache *shadowed_info_cache;
> +	bool count_shadow_page_allocation;
>  };
>  
>  static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
> @@ -2159,7 +2196,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
>  	struct kvm_mmu_page *sp;
>  
>  	sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
> -	sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
> +	sp->spt = mmu_sp_memory_cache_alloc(caches->shadow_page_cache,
> +					    caches->count_shadow_page_allocation);
>  	if (!role.direct)
>  		sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
>  
> @@ -2216,6 +2254,7 @@ static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
>  		.page_header_cache = &vcpu->arch.mmu_page_header_cache,
>  		.shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
>  		.shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
> +		.count_shadow_page_allocation = true,
>  	};
>  
>  	return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
> @@ -4314,29 +4353,32 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  	if (r != RET_PF_INVALID)
>  		return r;
>  
> +	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	r = mmu_topup_memory_caches(vcpu, false);
>  	if (r)
> -		return r;
> +		goto out_page_cache_unlock;
>  
>  	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
>  	if (r != RET_PF_CONTINUE)
> -		return r;
> +		goto out_page_cache_unlock;
>  
>  	r = RET_PF_RETRY;
>  	write_lock(&vcpu->kvm->mmu_lock);
>  
>  	if (is_page_fault_stale(vcpu, fault))
> -		goto out_unlock;
> +		goto out_mmu_unlock;
>  
>  	r = make_mmu_pages_available(vcpu);
>  	if (r)
> -		goto out_unlock;
> +		goto out_mmu_unlock;
>  
>  	r = direct_map(vcpu, fault);
>  
> -out_unlock:
> +out_mmu_unlock:
>  	write_unlock(&vcpu->kvm->mmu_lock);
>  	kvm_release_pfn_clean(fault->pfn);
> +out_page_cache_unlock:
> +	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	return r;
>  }
>  
> @@ -4396,25 +4438,28 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
>  	if (r != RET_PF_INVALID)
>  		return r;
>  
> +	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	r = mmu_topup_memory_caches(vcpu, false);
>  	if (r)
> -		return r;
> +		goto out_page_cache_unlock;
>  
>  	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
>  	if (r != RET_PF_CONTINUE)
> -		return r;
> +		goto out_page_cache_unlock;
>  
>  	r = RET_PF_RETRY;
>  	read_lock(&vcpu->kvm->mmu_lock);
>  
>  	if (is_page_fault_stale(vcpu, fault))
> -		goto out_unlock;
> +		goto out_mmu_unlock;
>  
>  	r = kvm_tdp_mmu_map(vcpu, fault);
>  
> -out_unlock:
> +out_mmu_unlock:
>  	read_unlock(&vcpu->kvm->mmu_lock);
>  	kvm_release_pfn_clean(fault->pfn);
> +out_page_cache_unlock:
> +	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	return r;
>  }
>  #endif
> @@ -5394,6 +5439,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
>  {
>  	int r;
>  
> +	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
>  	if (r)
>  		goto out;
> @@ -5420,6 +5466,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
>  	 */
>  	static_call(kvm_x86_flush_tlb_current)(vcpu);
>  out:
> +	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	return r;
>  }
>  
> @@ -5924,6 +5971,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
>  	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
>  
>  	vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
> +	mutex_init(&vcpu->arch.mmu_shadow_page_cache_lock);
>  
>  	vcpu->arch.mmu = &vcpu->arch.root_mmu;
>  	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
> @@ -6769,12 +6817,17 @@ int kvm_mmu_vendor_module_init(void)
>  	if (!mmu_page_header_cache)
>  		goto out;
>  
> +	if (percpu_counter_init(&kvm_total_unused_cached_pages, 0, GFP_KERNEL))
> +		goto out;
> +
>  	ret = register_shrinker(&mmu_shrinker, "x86-mmu");
>  	if (ret)
> -		goto out;
> +		goto out_shrinker;
>  
>  	return 0;
>  
> +out_shrinker:
> +	percpu_counter_destroy(&kvm_total_unused_cached_pages);
>  out:
>  	mmu_destroy_caches();
>  	return ret;
> @@ -6792,6 +6845,7 @@ void kvm_mmu_vendor_module_exit(void)
>  {
>  	mmu_destroy_caches();
>  	unregister_shrinker(&mmu_shrinker);
> +	percpu_counter_destroy(&kvm_total_unused_cached_pages);
>  }
>  
>  /*
> @@ -6994,3 +7048,11 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
>  	if (kvm->arch.nx_huge_page_recovery_thread)
>  		kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
>  }
> +
> +void *mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> +				bool count_allocation)

Is it necessary to pass count_allocation in every call of
mmu_sp_memory_cache_alloc(), instead of reading
shadow_page_cache->count_shadow_page_allocation directly?

> +{
> +	if (count_allocation && shadow_page_cache->nobjs)
> +		percpu_counter_dec(&kvm_total_unused_cached_pages);
> +	return kvm_mmu_memory_cache_alloc(shadow_page_cache);
> +}
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index cc58631e2336..798cfbf0a36b 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -338,5 +338,7 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  
>  void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>  void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> +void *mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *cache,
> +				bool count_allocation);
>  
>  #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 57f0b75c80f9..1dea9be6849d 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -821,9 +821,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  		return RET_PF_EMULATE;
>  	}
>  
> +	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	r = mmu_topup_memory_caches(vcpu, true);
>  	if (r)
> -		return r;
> +		goto out_page_cache_unlock;
>  
>  	vcpu->arch.write_fault_to_shadow_pgtable = false;
>  
> @@ -837,7 +838,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  
>  	r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
>  	if (r != RET_PF_CONTINUE)
> -		return r;
> +		goto out_page_cache_unlock;
>  
>  	/*
>  	 * Do not change pte_access if the pfn is a mmio page, otherwise
> @@ -862,16 +863,18 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  	write_lock(&vcpu->kvm->mmu_lock);
>  
>  	if (is_page_fault_stale(vcpu, fault))
> -		goto out_unlock;
> +		goto out_mmu_unlock;
>  
>  	r = make_mmu_pages_available(vcpu);
>  	if (r)
> -		goto out_unlock;
> +		goto out_mmu_unlock;
>  	r = FNAME(fetch)(vcpu, fault, &walker);
>  
> -out_unlock:
> +out_mmu_unlock:
>  	write_unlock(&vcpu->kvm->mmu_lock);
>  	kvm_release_pfn_clean(fault->pfn);
> +out_page_cache_unlock:
> +	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	return r;
>  }
>  
> @@ -897,17 +900,18 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
>  
>  	vcpu_clear_mmio_info(vcpu, gva);
>  
> +	if (!VALID_PAGE(root_hpa)) {
> +		WARN_ON(1);
> +		return;
> +	}
> +
> +	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  	/*
>  	 * No need to check return value here, rmap_can_add() can
>  	 * help us to skip pte prefetch later.
>  	 */
>  	mmu_topup_memory_caches(vcpu, true);
>  
> -	if (!VALID_PAGE(root_hpa)) {
> -		WARN_ON(1);
> -		return;
> -	}
> -
>  	write_lock(&vcpu->kvm->mmu_lock);
>  	for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
>  		level = iterator.level;
> @@ -943,6 +947,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
>  			break;
>  	}
>  	write_unlock(&vcpu->kvm->mmu_lock);
> +	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  }
>  
>  /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 7c25dbf32ecc..fa6eb1e9101e 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -265,7 +265,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
>  	struct kvm_mmu_page *sp;
>  
>  	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> -	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> +	sp->spt = mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache,
> +					    true);
>  
>  	return sp;
>  }
  
Vipin Sharma March 9, 2023, 6:19 p.m. UTC | #10
On Thu, Mar 9, 2023 at 7:37 AM Zhi Wang <zhi.wang.linux@gmail.com> wrote:
>
> On Mon,  6 Mar 2023 14:41:12 -0800
> Vipin Sharma <vipinsh@google.com> wrote:
> >  /*
> > @@ -6994,3 +7048,11 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
> >       if (kvm->arch.nx_huge_page_recovery_thread)
> >               kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
> >  }
> > +
> > +void *mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> > +                             bool count_allocation)
>
> Is it necessary to have the control of count_allocation in every call of
> mmu_sp_memory_cache_alloc() instead of taking
> shadow_page_cache->count_shadow_page_allocation directly?
>
As you found, this is cleaned up in patch 7.
  
Vipin Sharma March 9, 2023, 7:52 p.m. UTC | #11
On Thu, Mar 9, 2023 at 4:52 AM Zhi Wang <zhi.wang.linux@gmail.com> wrote:
>
> On Thu, 9 Mar 2023 05:18:11 +0000
> Mingwei Zhang <mizhang@google.com> wrote:
>
> > > >
> > > > 1) Previously mmu_topup_memory_caches() works fine without a lock.
> > > > 2) IMHO I was suspecting if this lock seems affects the parallelization
> > > > of the TDP MMU fault handling.
> > > >
> > > > TDP MMU fault handling is intend to be optimized for parallelization fault
> > > > handling by taking a read lock and operating the page table via atomic
> > > > operations. Multiple fault handling can enter the TDP MMU fault path
> > > > because of read_lock(&vcpu->kvm->mmu_lock) below.
> > > >
> > > > W/ this lock, it seems the part of benefit of parallelization is gone
> > > > because the lock can contend earlier above. Will this cause performance
> > > > regression?
> > >
> > > This is a per vCPU lock, with this lock each vCPU will still be able
> > > to perform parallel fault handling without contending for lock.
> > >
> >
> > I am curious how effective it is by trying to accquiring this per vCPU
> > lock? If a vcpu thread should stay within the (host) kernel (vmx
> > root/non-root) for the vast majority of the time, isn't the shrinker
> > always fail to make any progress?
>
> IMHO the lock is to prevent the faulting path from being disturbed by the
> shrinker. I guess even a vCPU thread stays in the host kernel, the shrinker
> should still be able to harvest the pages from the cache as long as there is
> no faulting flood.

Yes, the lock is to prevent the faulting path from being disturbed by the
shrinker. In this new approach, the shrinker goes through each vCPU of
each VM alive on the host. All of these vCPUs collectively being in the
fault path while the shrinker is invoked seems unlikely.

Say we freed the cache while the fault path is using it: when a vCPU then
asks for a page from the now-empty cache, it dynamically allocates a page
via GFP_ATOMIC, which has a higher chance of failing if the host is
already under memory pressure. The shrinker should by default be at a
lower priority, and based on the discussion referenced in patch 1, it
seems it was of not much practical use before either.
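
For reference, the GFP_ATOMIC fallback described above lives in the
generic cache code. Roughly (paraphrased from virt/kvm/kvm_main.c; the
exact shape may differ between kernel versions), the allocation path
looks like:

    void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
    {
            void *p;

            if (WARN_ON(!mc->nobjs))
                    /* Empty cache: atomic allocation, the failure-prone
                     * case under memory pressure. */
                    p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
            else
                    /* Normal case: consume from the pre-filled cache. */
                    p = mc->objects[--mc->nobjs];
            BUG_ON(!p);
            return p;
    }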

>
> I am curious about the effectiveness as well. It would be nice there can be
> some unit tests that people can try by themselves to see the results, like
> when the shrinker isn't triggered, the faulting is still as effective as
> before and when the shrinker is triggered, what would actually happen when
> the system memory is under different pressure. (like how much the faulting
> will be slowed down)
>

I am not sure what the right test to measure this would be. My manual
testing was just to run dirty_log_perf_test with and without the
shrinker, and I didn't notice much difference. I did print some logs to
see that the shrinker was getting invoked and that caches were freed by
it, and at VM free time I verified that the page accounting is right
with patch 9 of the series.
  
David Matlack March 9, 2023, 11:53 p.m. UTC | #12
On Mon, Mar 06, 2023 at 02:41:12PM -0800, Vipin Sharma wrote:
> Create a global counter for total number of pages available
> in MMU page caches across all VMs. Add mmu_shadow_page_cache
> pages to this counter.

I think I prefer counting the objects on-demand in mmu_shrink_count(),
instead of keeping track of the count. Keeping track of the count adds
complexity to the topup/alloc paths for the sole benefit of the
shrinker. I'd rather contain that complexity to the shrinker code unless
there is a compelling reason to optimize mmu_shrink_count().

IIRC we discussed this at one point. Was there a reason to take this
approach that I'm just forgetting?
  
David Matlack March 10, 2023, 12:22 a.m. UTC | #13
On Mon, Mar 06, 2023 at 02:41:12PM -0800, Vipin Sharma wrote:
>
>  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
>  {
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
> -	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> +	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
> +	mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> +	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);

Is this lock necessary (even when the shrinker is hooked up)?
mmu_free_memory_caches() is only called when KVM fails to create a vCPU
(before it has been added to vcpu_array) or during VM destruction (after
the VM has been removed from vm_list).
  
Vipin Sharma March 10, 2023, 12:28 a.m. UTC | #14
On Thu, Mar 9, 2023 at 3:53 PM David Matlack <dmatlack@google.com> wrote:
>
> On Mon, Mar 06, 2023 at 02:41:12PM -0800, Vipin Sharma wrote:
> > Create a global counter for total number of pages available
> > in MMU page caches across all VMs. Add mmu_shadow_page_cache
> > pages to this counter.
>
> I think I prefer counting the objects on-demand in mmu_shrink_count(),
> instead of keeping track of the count. Keeping track of the count adds
> complexity to the topup/alloc paths for the sole benefit of the
> shrinker. I'd rather contain that complexity to the shrinker code unless
> there is a compelling reason to optimize mmu_shrink_count().
>
> IIRC we discussed this at one point. Was there a reason to take this
> approach that I'm just forgetting?

To count on demand, we first need to take kvm_lock and then, for each
VM, iterate through each vCPU, take its lock, and sum the object counts
in the caches. Once NUMA support is introduced later in this series, we
would have to iterate over even more caches. We can't/shouldn't use
mutex_trylock() as it will not give the correct picture, and by the time
shrink_scan is called the count can be totally different.

The scan_count() API comment says not to do any deadlock check (I don't
know what that means), and percpu_counter is very fast when we are
adding/subtracting pages, so the overhead of using it to keep a global
count is very small. Since there is not much impact to using a
percpu_counter compared to the previous approach, we ended our discussion
with keeping this per-CPU counter.
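
In other words, the cost asymmetry being relied on here is roughly (a
sketch of the API's behavior, not the lib/percpu_counter.c source):

    /* Writers stay on their own CPU and only occasionally fold into the
     * shared count, so the hot topup/alloc/free paths pay almost nothing. */
    percpu_counter_add(&kvm_total_unused_cached_pages, nr_added);

    /* Approximate read: cheap, may be off by up to the per-CPU batches. */
    s64 approx = percpu_counter_read_positive(&kvm_total_unused_cached_pages);

    /* Exact read: walks every CPU, so it is reserved for the rare
     * shrinker count path. */
    s64 exact = percpu_counter_sum(&kvm_total_unused_cached_pages);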
  
Vipin Sharma March 10, 2023, 12:36 a.m. UTC | #15
On Thu, Mar 9, 2023 at 4:22 PM David Matlack <dmatlack@google.com> wrote:
>
> On Mon, Mar 06, 2023 at 02:41:12PM -0800, Vipin Sharma wrote:
> >
> >  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
> >  {
> >       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
> > -     kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> > +     mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
> > +     mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> > +     mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>
> Is this lock necessary (even when the shrinker is hooked up)?
> mmu_free_memory_caches() is only called when KVM fails to create a vCPU
> (before it has been added to vcpu_array) or during VM destruction (after
> the VM has been removed from vm_list).

My reasoning was that if the shrinker ran just before VM destruction and
removed pages, it would reduce the nobjs variable in the cache. Then,
when the VM is being destroyed, mmu_free_sp_memory_cache() first reads
the nobjs variable to update the global counter and then frees the
cache. To be sure that the latest value is read and there is no memory
ordering issue, I used the mutex.

I discussed this with Sean offline and he pointed out that x86 is
strongly ordered and the mutex is not needed when freeing memory caches.
  
David Matlack March 10, 2023, 12:55 a.m. UTC | #16
On Thu, Mar 09, 2023 at 04:28:10PM -0800, Vipin Sharma wrote:
> On Thu, Mar 9, 2023 at 3:53 PM David Matlack <dmatlack@google.com> wrote:
> >
> > On Mon, Mar 06, 2023 at 02:41:12PM -0800, Vipin Sharma wrote:
> > > Create a global counter for total number of pages available
> > > in MMU page caches across all VMs. Add mmu_shadow_page_cache
> > > pages to this counter.
> >
> > I think I prefer counting the objects on-demand in mmu_shrink_count(),
> > instead of keeping track of the count. Keeping track of the count adds
> > complexity to the topup/alloc paths for the sole benefit of the
> > shrinker. I'd rather contain that complexity to the shrinker code unless
> > there is a compelling reason to optimize mmu_shrink_count().
> >
> > IIRC we discussed this at one point. Was there a reason to take this
> > approach that I'm just forgetting?
> 
> To count on demand, we first need to lock on kvm_lock and then for
> each VMs iterate through each vCPU, take a lock, and sum the objects
> count in caches. When the NUMA support will be introduced in this
> series then it means we have to iterate even more caches. We
> can't/shouldn't use mutex_trylock() as it will not give the correct
> picture and when shrink_scan is called count can be totally different.

Yeah good point. Hm, do we need to take the cache mutex to calculate the
count though? mmu_shrink_count() is inherently racy (something could get
freed or allocated in between count() and scan()). I don't think holding
the mutex buys us anything over just reading the count without the
mutex.

e.g.

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index df8dcb7e5de7..c80a5c52f0ea 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6739,10 +6739,20 @@ static unsigned long mmu_shrink_scan(struct shrinker *shrink,
 static unsigned long mmu_shrink_count(struct shrinker *shrink,
                                      struct shrink_control *sc)
 {
-       s64 count = percpu_counter_sum(&kvm_total_unused_cached_pages);
+       struct kvm *kvm, *next_kvm;
+       unsigned long count = 0;

-       WARN_ON(count < 0);
-       return count <= 0 ? SHRINK_EMPTY : count;
+       mutex_lock(&kvm_lock);
+       list_for_each_entry_safe(kvm, next_kvm, &vm_list, vm_list) {
+               struct kvm_vcpu *vcpu;
+               unsigned long i;
+
+               kvm_for_each_vcpu(i, vcpu, kvm)
+                       count += READ_ONCE(vcpu->arch.mmu_shadow_page_cache.nobjs);
+       }
+       mutex_unlock(&kvm_lock);
+
+       return count == 0 ? SHRINK_EMPTY : count;

 }

Then the only concern is an additional acquire of kvm_lock. But it
should be fairly quick (quicker than mmu_shrink_scan()). If we can
tolerate the kvm_lock overhead of mmu_shrink_scan(), then we should be
able to tolerate some here.

> 
> scan_count() API comment says to not do any deadlock check (I don't
> know what does that mean) and percpu_counter is very fast when we are
> adding/subtracting pages so the effect of using it to keep global
> count is very minimal. Since, there is not much impact to using
> percpu_count compared to previous one, we ended our discussion with
> keeping this per cpu counter.

Yeah it's just the code complexity of maintaining
kvm_total_unused_cached_pages that I'm hoping to avoid. We have to
create the counter, destroy it, and keep it up to date. Some
kvm_mmu_memory_caches have to update the counter, and others don't. It
just adds a lot of bookkeeping code that I'm not convinced is worth
it.
  
Vipin Sharma March 10, 2023, 1:09 a.m. UTC | #17
On Thu, Mar 9, 2023 at 4:56 PM David Matlack <dmatlack@google.com> wrote:
>
> On Thu, Mar 09, 2023 at 04:28:10PM -0800, Vipin Sharma wrote:
> > On Thu, Mar 9, 2023 at 3:53 PM David Matlack <dmatlack@google.com> wrote:
> > >
> > > On Mon, Mar 06, 2023 at 02:41:12PM -0800, Vipin Sharma wrote:
> > > > Create a global counter for total number of pages available
> > > > in MMU page caches across all VMs. Add mmu_shadow_page_cache
> > > > pages to this counter.
> > >
> > > I think I prefer counting the objects on-demand in mmu_shrink_count(),
> > > instead of keeping track of the count. Keeping track of the count adds
> > > complexity to the topup/alloc paths for the sole benefit of the
> > > shrinker. I'd rather contain that complexity to the shrinker code unless
> > > there is a compelling reason to optimize mmu_shrink_count().
> > >
> > > IIRC we discussed this at one point. Was there a reason to take this
> > > approach that I'm just forgetting?
> >
> > To count on demand, we first need to lock on kvm_lock and then for
> > each VMs iterate through each vCPU, take a lock, and sum the objects
> > count in caches. When the NUMA support will be introduced in this
> > series then it means we have to iterate even more caches. We
> > can't/shouldn't use mutex_trylock() as it will not give the correct
> > picture and when shrink_scan is called count can be totally different.
>
> Yeah good point. Hm, do we need to take the cache mutex to calculate the
> count though? mmu_shrink_count() is inherently racy (something could get
> freed or allocated in between count() and scan()). I don't think holding
> the mutex buys us anything over just reading the count without the
> mutex.
>

You are right, neither the mutex nor the percpu_counter solves the
accuracy problem with the shrinker. So, this can be removed.

> e.g.
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index df8dcb7e5de7..c80a5c52f0ea 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -6739,10 +6739,20 @@ static unsigned long mmu_shrink_scan(struct shrinker *shrink,
>  static unsigned long mmu_shrink_count(struct shrinker *shrink,
>                                       struct shrink_control *sc)
>  {
> -       s64 count = percpu_counter_sum(&kvm_total_unused_cached_pages);
> +       struct kvm *kvm, *next_kvm;
> +       unsigned long count = 0;
>
> -       WARN_ON(count < 0);
> -       return count <= 0 ? SHRINK_EMPTY : count;
> +       mutex_lock(&kvm_lock);
> +       list_for_each_entry_safe(kvm, next_kvm, &vm_list, vm_list) {
> +               struct kvm_vcpu *vcpu;
> +               unsigned long i;
> +
> +               kvm_for_each_vcpu(i, vcpu, kvm)
> +                       count += READ_ONCE(vcpu->arch.mmu_shadow_page_cache.nobjs);
> +       }
> +       mutex_unlock(&kvm_lock);
> +
> +       return count == 0 ? SHRINK_EMPTY : count;
>
>  }
>
> Then the only concern is an additional acquire of kvm_lock. But it
> should be fairly quick (quicker than mmu_shrink_scan()). If we can
> tolerate the kvm_lock overhead of mmu_shrink_scan(), then we should be
> able to tolerate some here.
>
> >
> > scan_count() API comment says not to do any deadlock check (I don't
> > know what that means), and percpu_counter is very fast when we are
> > adding/subtracting pages, so the effect of using it to keep a global
> > count is minimal. Since there is not much impact from using
> > percpu_counter compared to the previous approach, we ended our
> > discussion by keeping this per-CPU counter.
>
> Yeah it's just the code complexity of maintaining
> kvm_total_unused_cached_pages that I'm hoping to avoid. We have to
> create the counter, destroy it, and keep it up to date. Some
> kvm_mmu_memory_caches have to update the counter, and others don't. It
> just adds a lot of bookkeeping code that I'm not convinced is worth
> it.

Yeah, it will simplify the code a lot. Also, we don't need 100%
accuracy with the shrinker. I will remove this global counter in the
next version.
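
For context on the count()/scan() raciness discussed above: the
shrinker core calls count_objects() only as a sizing hint and invokes
scan_objects() separately, so the two can legitimately observe
different states. A minimal, self-contained sketch of that interface,
using a made-up toy cache rather than the KVM code, but following the
same register_shrinker()/SHRINK_EMPTY conventions as this series:

#include <linux/shrinker.h>
#include <linux/spinlock.h>

/* Illustrative only: a toy object cache protected by its own lock. */
static DEFINE_SPINLOCK(toy_cache_lock);
static unsigned long toy_cache_nobjs;

static unsigned long toy_count(struct shrinker *shrink,
			       struct shrink_control *sc)
{
	/* Approximate, lockless read; count() is only a hint. */
	unsigned long count = READ_ONCE(toy_cache_nobjs);

	return count ? count : SHRINK_EMPTY;
}

static unsigned long toy_scan(struct shrinker *shrink,
			      struct shrink_control *sc)
{
	unsigned long freed = 0;

	spin_lock(&toy_cache_lock);
	while (toy_cache_nobjs && freed < sc->nr_to_scan) {
		/* Free one cached object here. */
		toy_cache_nobjs--;
		freed++;
	}
	spin_unlock(&toy_cache_lock);

	return freed ? freed : SHRINK_STOP;
}

static struct shrinker toy_shrinker = {
	.count_objects = toy_count,
	.scan_objects = toy_scan,
	.seeks = DEFAULT_SEEKS,
};

/*
 * register_shrinker(&toy_shrinker, "toy-cache") at init,
 * unregister_shrinker(&toy_shrinker) at exit.
 */

With the global counter gone, mmu_shrink_count() can take the same
approximate-read approach over the per-vCPU caches, as in the diff
quoted above.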
  

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ebbe692acf3f..4322c7020d5d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -791,6 +791,11 @@  struct kvm_vcpu_arch {
 	struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
+	/*
+	 * Protect allocation and release of pages from mmu_shadow_page_cache.
+	 */
+	struct mutex mmu_shadow_page_cache_lock;
+
 	/*
 	 * QEMU userspace and the guest each have their own FPU state.
 	 * In vcpu_run, we switch between the user and guest FPU contexts.
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3a452989f5cd..13f41b7ac280 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -167,6 +167,11 @@  struct kvm_shadow_walk_iterator {
 static struct kmem_cache *pte_list_desc_cache;
 struct kmem_cache *mmu_page_header_cache;
 
+/*
+ * Global count of unused pages in MMU page caches across all VMs.
+ */
+static struct percpu_counter kvm_total_unused_cached_pages;
+
 static void mmu_spte_set(u64 *sptep, u64 spte);
 
 struct kvm_mmu_role_regs {
@@ -667,6 +672,34 @@  static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 	}
 }
 
+/**
+ * Caller should hold mutex lock corresponding to cache, if available.
+ */
+static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
+				     int min)
+{
+	int orig_nobjs, r;
+
+	orig_nobjs = cache->nobjs;
+	r = kvm_mmu_topup_memory_cache(cache, min);
+	if (orig_nobjs != cache->nobjs)
+		percpu_counter_add(&kvm_total_unused_cached_pages,
+				   (cache->nobjs - orig_nobjs));
+
+	return r;
+}
+
+/**
+ * Caller should hold mutex lock corresponding to kvm_mmu_memory_cache, if
+ * available.
+ */
+static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache)
+{
+	if (cache->nobjs)
+		percpu_counter_sub(&kvm_total_unused_cached_pages, cache->nobjs);
+	kvm_mmu_free_memory_cache(cache);
+}
+
 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 {
 	int r;
@@ -676,10 +709,11 @@  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
 	if (r)
 		return r;
-	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
-				       PT64_ROOT_MAX_LEVEL);
+
+	r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache, PT64_ROOT_MAX_LEVEL);
 	if (r)
 		return r;
+
 	if (maybe_indirect) {
 		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
 					       PT64_ROOT_MAX_LEVEL);
@@ -693,7 +727,9 @@  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
-	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
+	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
+	mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
+	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
 }
@@ -2148,6 +2184,7 @@  struct shadow_page_caches {
 	struct kvm_mmu_memory_cache *page_header_cache;
 	struct kvm_mmu_memory_cache *shadow_page_cache;
 	struct kvm_mmu_memory_cache *shadowed_info_cache;
+	bool count_shadow_page_allocation;
 };
 
 static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
@@ -2159,7 +2196,8 @@  static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
 	struct kvm_mmu_page *sp;
 
 	sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
-	sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
+	sp->spt = mmu_sp_memory_cache_alloc(caches->shadow_page_cache,
+					    caches->count_shadow_page_allocation);
 	if (!role.direct)
 		sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
 
@@ -2216,6 +2254,7 @@  static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
 		.page_header_cache = &vcpu->arch.mmu_page_header_cache,
 		.shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
 		.shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
+		.count_shadow_page_allocation = true,
 	};
 
 	return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
@@ -4314,29 +4353,32 @@  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	if (r != RET_PF_INVALID)
 		return r;
 
+	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
 	r = mmu_topup_memory_caches(vcpu, false);
 	if (r)
-		return r;
+		goto out_page_cache_unlock;
 
 	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
 	if (r != RET_PF_CONTINUE)
-		return r;
+		goto out_page_cache_unlock;
 
 	r = RET_PF_RETRY;
 	write_lock(&vcpu->kvm->mmu_lock);
 
 	if (is_page_fault_stale(vcpu, fault))
-		goto out_unlock;
+		goto out_mmu_unlock;
 
 	r = make_mmu_pages_available(vcpu);
 	if (r)
-		goto out_unlock;
+		goto out_mmu_unlock;
 
 	r = direct_map(vcpu, fault);
 
-out_unlock:
+out_mmu_unlock:
 	write_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(fault->pfn);
+out_page_cache_unlock:
+	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
 	return r;
 }
 
@@ -4396,25 +4438,28 @@  static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
 	if (r != RET_PF_INVALID)
 		return r;
 
+	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
 	r = mmu_topup_memory_caches(vcpu, false);
 	if (r)
-		return r;
+		goto out_page_cache_unlock;
 
 	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
 	if (r != RET_PF_CONTINUE)
-		return r;
+		goto out_page_cache_unlock;
 
 	r = RET_PF_RETRY;
 	read_lock(&vcpu->kvm->mmu_lock);
 
 	if (is_page_fault_stale(vcpu, fault))
-		goto out_unlock;
+		goto out_mmu_unlock;
 
 	r = kvm_tdp_mmu_map(vcpu, fault);
 
-out_unlock:
+out_mmu_unlock:
 	read_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(fault->pfn);
+out_page_cache_unlock:
+	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
 	return r;
 }
 #endif
@@ -5394,6 +5439,7 @@  int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
 	int r;
 
+	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
 	r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
 	if (r)
 		goto out;
@@ -5420,6 +5466,7 @@  int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	 */
 	static_call(kvm_x86_flush_tlb_current)(vcpu);
 out:
+	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
 	return r;
 }
 
@@ -5924,6 +5971,7 @@  int kvm_mmu_create(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
 
 	vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+	mutex_init(&vcpu->arch.mmu_shadow_page_cache_lock);
 
 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
 	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
@@ -6769,12 +6817,17 @@  int kvm_mmu_vendor_module_init(void)
 	if (!mmu_page_header_cache)
 		goto out;
 
+	if (percpu_counter_init(&kvm_total_unused_cached_pages, 0, GFP_KERNEL))
+		goto out;
+
 	ret = register_shrinker(&mmu_shrinker, "x86-mmu");
 	if (ret)
-		goto out;
+		goto out_shrinker;
 
 	return 0;
 
+out_shrinker:
+	percpu_counter_destroy(&kvm_total_unused_cached_pages);
 out:
 	mmu_destroy_caches();
 	return ret;
@@ -6792,6 +6845,7 @@  void kvm_mmu_vendor_module_exit(void)
 {
 	mmu_destroy_caches();
 	unregister_shrinker(&mmu_shrinker);
+	percpu_counter_destroy(&kvm_total_unused_cached_pages);
 }
 
 /*
@@ -6994,3 +7048,11 @@  void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
 	if (kvm->arch.nx_huge_page_recovery_thread)
 		kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
 }
+
+void *mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
+				bool count_allocation)
+{
+	if (count_allocation && shadow_page_cache->nobjs)
+		percpu_counter_dec(&kvm_total_unused_cached_pages);
+	return kvm_mmu_memory_cache_alloc(shadow_page_cache);
+}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index cc58631e2336..798cfbf0a36b 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -338,5 +338,7 @@  void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 
 void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+void *mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *cache,
+				bool count_allocation);
 
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 57f0b75c80f9..1dea9be6849d 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -821,9 +821,10 @@  static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 		return RET_PF_EMULATE;
 	}
 
+	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
 	r = mmu_topup_memory_caches(vcpu, true);
 	if (r)
-		return r;
+		goto out_page_cache_unlock;
 
 	vcpu->arch.write_fault_to_shadow_pgtable = false;
 
@@ -837,7 +838,7 @@  static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 
 	r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
 	if (r != RET_PF_CONTINUE)
-		return r;
+		goto out_page_cache_unlock;
 
 	/*
 	 * Do not change pte_access if the pfn is a mmio page, otherwise
@@ -862,16 +863,18 @@  static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	write_lock(&vcpu->kvm->mmu_lock);
 
 	if (is_page_fault_stale(vcpu, fault))
-		goto out_unlock;
+		goto out_mmu_unlock;
 
 	r = make_mmu_pages_available(vcpu);
 	if (r)
-		goto out_unlock;
+		goto out_mmu_unlock;
 	r = FNAME(fetch)(vcpu, fault, &walker);
 
-out_unlock:
+out_mmu_unlock:
 	write_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(fault->pfn);
+out_page_cache_unlock:
+	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
 	return r;
 }
 
@@ -897,17 +900,18 @@  static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
 
 	vcpu_clear_mmio_info(vcpu, gva);
 
+	if (!VALID_PAGE(root_hpa)) {
+		WARN_ON(1);
+		return;
+	}
+
+	mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
 	/*
 	 * No need to check return value here, rmap_can_add() can
 	 * help us to skip pte prefetch later.
 	 */
 	mmu_topup_memory_caches(vcpu, true);
 
-	if (!VALID_PAGE(root_hpa)) {
-		WARN_ON(1);
-		return;
-	}
-
 	write_lock(&vcpu->kvm->mmu_lock);
 	for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
 		level = iterator.level;
@@ -943,6 +947,7 @@  static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
 			break;
 	}
 	write_unlock(&vcpu->kvm->mmu_lock);
+	mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
 }
 
 /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7c25dbf32ecc..fa6eb1e9101e 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -265,7 +265,8 @@  static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
 	struct kvm_mmu_page *sp;
 
 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
-	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+	sp->spt = mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache,
+					    true);
 
 	return sp;
 }