[v2,11/11] arm64: ptdump: Add support for guest stage-2 pagetables dumping

Message ID 20231019144032.2943044-13-sebastianene@google.com
State New
Headers
Series arm64: ptdump: View the second stage page-tables |

Commit Message

Sebastian Ene Oct. 19, 2023, 2:40 p.m. UTC
  Register a debugfs file on guest creation to be able to view their
second translation tables with ptdump. This assumes that the host is in
control of the guest stage-2 and has direct access to the pagetables.

Signed-off-by: Sebastian Ene <sebastianene@google.com>
---
 arch/arm64/include/asm/ptdump.h | 21 +++++++--
 arch/arm64/kvm/mmu.c            |  3 ++
 arch/arm64/mm/ptdump.c          | 84 +++++++++++++++++++++++++++++++++
 arch/arm64/mm/ptdump_debugfs.c  |  5 +-
 4 files changed, 108 insertions(+), 5 deletions(-)
  

Comments

Vincent Donnefort Oct. 20, 2023, 8:40 a.m. UTC | #1
On Thu, Oct 19, 2023 at 02:40:33PM +0000, Sebastian Ene wrote:
> Register a debugfs file on guest creation to be able to view their
> second translation tables with ptdump. This assumes that the host is in
> control of the guest stage-2 and has direct access to the pagetables.

What about pKVM? The walker you wrote for the host stage-2 should be
reusable in that case?

> 
> Signed-off-by: Sebastian Ene <sebastianene@google.com>
> ---
>  arch/arm64/include/asm/ptdump.h | 21 +++++++--
>  arch/arm64/kvm/mmu.c            |  3 ++
>  arch/arm64/mm/ptdump.c          | 84 +++++++++++++++++++++++++++++++++
>  arch/arm64/mm/ptdump_debugfs.c  |  5 +-
>  4 files changed, 108 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h
> index 35b883524462..be86244d532b 100644
> --- a/arch/arm64/include/asm/ptdump.h
> +++ b/arch/arm64/include/asm/ptdump.h
> @@ -5,6 +5,8 @@
>  #ifndef __ASM_PTDUMP_H
>  #define __ASM_PTDUMP_H
>  
> +#include <asm/kvm_pgtable.h>
> +
>  #ifdef CONFIG_PTDUMP_CORE
>  
>  #include <linux/mm_types.h>
> @@ -30,14 +32,27 @@ struct ptdump_info {
>  void ptdump_walk(struct seq_file *s, struct ptdump_info *info);
>  #ifdef CONFIG_PTDUMP_DEBUGFS
>  #define EFI_RUNTIME_MAP_END	DEFAULT_MAP_WINDOW_64
> -void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name);
> +struct dentry *ptdump_debugfs_register(struct ptdump_info *info,
> +				       const char *name);
>  #else
> -static inline void ptdump_debugfs_register(struct ptdump_info *info,
> -					   const char *name) { }
> +static inline struct dentry *ptdump_debugfs_register(struct ptdump_info *info,
> +						     const char *name)
> +{
> +	return NULL;
> +}
>  #endif
>  void ptdump_check_wx(void);
>  #endif /* CONFIG_PTDUMP_CORE */
>  
> +#ifdef CONFIG_NVHE_EL2_PTDUMP_DEBUGFS
> +void ptdump_register_guest_stage2(struct kvm_pgtable *pgt, void *lock);
> +void ptdump_unregister_guest_stage2(struct kvm_pgtable *pgt);
> +#else
> +static inline void ptdump_register_guest_stage2(struct kvm_pgtable *pgt,
> +						void *lock) { }
> +static inline void ptdump_unregister_guest_stage2(struct kvm_pgtable *pgt) { }
> +#endif /* CONFIG_NVHE_EL2_PTDUMP_DEBUGFS */

I believe this should be compatible with VHE as well, that option should be
renamed.

> +
>  #ifdef CONFIG_DEBUG_WX
>  #define debug_checkwx()	ptdump_check_wx()
>  #else
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 482280fe22d7..e47988dba34d 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -11,6 +11,7 @@
>  #include <linux/sched/signal.h>
>  #include <trace/events/kvm.h>
>  #include <asm/pgalloc.h>
> +#include <asm/ptdump.h>
>  #include <asm/cacheflush.h>
>  #include <asm/kvm_arm.h>
>  #include <asm/kvm_mmu.h>
> @@ -908,6 +909,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
>  	if (err)
>  		goto out_free_pgtable;
>  
> +	ptdump_register_guest_stage2(pgt, &kvm->mmu_lock);
>  	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
>  	if (!mmu->last_vcpu_ran) {
>  		err = -ENOMEM;
> @@ -1021,6 +1023,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>  	write_unlock(&kvm->mmu_lock);
>  
>  	if (pgt) {
> +		ptdump_unregister_guest_stage2(pgt);
>  		kvm_pgtable_stage2_destroy(pgt);
>  		kfree(pgt);
>  	}
> diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
> index 4687840dcb69..facfb15468f5 100644
> --- a/arch/arm64/mm/ptdump.c
> +++ b/arch/arm64/mm/ptdump.c
> @@ -26,6 +26,7 @@
>  #include <asm/ptdump.h>
>  #include <asm/kvm_pkvm.h>
>  #include <asm/kvm_pgtable.h>
> +#include <asm/kvm_host.h>
>  
>  
>  enum address_markers_idx {
> @@ -543,6 +544,22 @@ void ptdump_check_wx(void)
>  #ifdef CONFIG_NVHE_EL2_PTDUMP_DEBUGFS
>  static struct ptdump_info stage2_kernel_ptdump_info;
>  
> +#define GUEST_NAME_LEN	(32U)
> +
> +struct ptdump_registered_guest {
> +	struct list_head		reg_list;
> +	struct ptdump_info		info;
> +	struct mm_struct		mem;
> +	struct kvm_pgtable_snapshot	snapshot;
> +	struct dentry			*dentry;
> +	rwlock_t			*lock;
> +	char				reg_name[GUEST_NAME_LEN];
> +};
> +
> +static LIST_HEAD(ptdump_guest_list);
> +static DEFINE_MUTEX(ptdump_list_lock);
> +static u16 guest_no;

This is not robust enough: if one VM starts and 65535 others are then created
and killed, guest_no overflows. The next number is 0, which is already taken.

Linux has an ID allocator to solve this problem, but I don't think this is
necessary anyway. This should simply reuse the struct kvm->debugfs_dentry.

Also probably most of the information contained in ptdump_registered_guest can
be found in struct kvm. The debugfs should then probably simply take struct kvm
for the private argument.

> +
>  static phys_addr_t ptdump_host_pa(void *addr)
>  {
>  	return __pa(addr);
> @@ -740,6 +757,73 @@ static void stage2_ptdump_walk(struct seq_file *s, struct ptdump_info *info)
>  
>  	kvm_pgtable_walk(pgtable, start_ipa, end_ipa, &walker);
>  }

[...]
  
Sebastian Ene Oct. 23, 2023, 2:45 p.m. UTC | #2
On Fri, Oct 20, 2023 at 09:40:06AM +0100, Vincent Donnefort wrote:
> On Thu, Oct 19, 2023 at 02:40:33PM +0000, Sebastian Ene wrote:
> > Register a debugfs file on guest creation to be able to view their
> > second translation tables with ptdump. This assumes that the host is in
> > control of the guest stage-2 and has direct access to the pagetables.
> 
> What about pKVM? The walker you wrote for the host stage-2 should be
> reusable in that case?
> 

Yes, once pKVM is ready upstream, the walker that duplicates the
pagetables for the host will be reused for the guests. We will have to
add a separate HVC for this that takes the guest VMID as an
argument.

> > 
> > Signed-off-by: Sebastian Ene <sebastianene@google.com>
> > ---
> >  arch/arm64/include/asm/ptdump.h | 21 +++++++--
> >  arch/arm64/kvm/mmu.c            |  3 ++
> >  arch/arm64/mm/ptdump.c          | 84 +++++++++++++++++++++++++++++++++
> >  arch/arm64/mm/ptdump_debugfs.c  |  5 +-
> >  4 files changed, 108 insertions(+), 5 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h
> > index 35b883524462..be86244d532b 100644
> > --- a/arch/arm64/include/asm/ptdump.h
> > +++ b/arch/arm64/include/asm/ptdump.h
> > @@ -5,6 +5,8 @@
> >  #ifndef __ASM_PTDUMP_H
> >  #define __ASM_PTDUMP_H
> >  
> > +#include <asm/kvm_pgtable.h>
> > +
> >  #ifdef CONFIG_PTDUMP_CORE
> >  
> >  #include <linux/mm_types.h>
> > @@ -30,14 +32,27 @@ struct ptdump_info {
> >  void ptdump_walk(struct seq_file *s, struct ptdump_info *info);
> >  #ifdef CONFIG_PTDUMP_DEBUGFS
> >  #define EFI_RUNTIME_MAP_END	DEFAULT_MAP_WINDOW_64
> > -void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name);
> > +struct dentry *ptdump_debugfs_register(struct ptdump_info *info,
> > +				       const char *name);
> >  #else
> > -static inline void ptdump_debugfs_register(struct ptdump_info *info,
> > -					   const char *name) { }
> > +static inline struct dentry *ptdump_debugfs_register(struct ptdump_info *info,
> > +						     const char *name)
> > +{
> > +	return NULL;
> > +}
> >  #endif
> >  void ptdump_check_wx(void);
> >  #endif /* CONFIG_PTDUMP_CORE */
> >  
> > +#ifdef CONFIG_NVHE_EL2_PTDUMP_DEBUGFS
> > +void ptdump_register_guest_stage2(struct kvm_pgtable *pgt, void *lock);
> > +void ptdump_unregister_guest_stage2(struct kvm_pgtable *pgt);
> > +#else
> > +static inline void ptdump_register_guest_stage2(struct kvm_pgtable *pgt,
> > +						void *lock) { }
> > +static inline void ptdump_unregister_guest_stage2(struct kvm_pgtable *pgt) { }
> > +#endif /* CONFIG_NVHE_EL2_PTDUMP_DEBUGFS */
> 
> I believe this should be compatible with VHE as well, that option should be
> renamed.
> 

Good point, I will rename this.

> > +
> >  #ifdef CONFIG_DEBUG_WX
> >  #define debug_checkwx()	ptdump_check_wx()
> >  #else
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index 482280fe22d7..e47988dba34d 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -11,6 +11,7 @@
> >  #include <linux/sched/signal.h>
> >  #include <trace/events/kvm.h>
> >  #include <asm/pgalloc.h>
> > +#include <asm/ptdump.h>
> >  #include <asm/cacheflush.h>
> >  #include <asm/kvm_arm.h>
> >  #include <asm/kvm_mmu.h>
> > @@ -908,6 +909,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
> >  	if (err)
> >  		goto out_free_pgtable;
> >  
> > +	ptdump_register_guest_stage2(pgt, &kvm->mmu_lock);
> >  	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
> >  	if (!mmu->last_vcpu_ran) {
> >  		err = -ENOMEM;
> > @@ -1021,6 +1023,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
> >  	write_unlock(&kvm->mmu_lock);
> >  
> >  	if (pgt) {
> > +		ptdump_unregister_guest_stage2(pgt);
> >  		kvm_pgtable_stage2_destroy(pgt);
> >  		kfree(pgt);
> >  	}
> > diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
> > index 4687840dcb69..facfb15468f5 100644
> > --- a/arch/arm64/mm/ptdump.c
> > +++ b/arch/arm64/mm/ptdump.c
> > @@ -26,6 +26,7 @@
> >  #include <asm/ptdump.h>
> >  #include <asm/kvm_pkvm.h>
> >  #include <asm/kvm_pgtable.h>
> > +#include <asm/kvm_host.h>
> >  
> >  
> >  enum address_markers_idx {
> > @@ -543,6 +544,22 @@ void ptdump_check_wx(void)
> >  #ifdef CONFIG_NVHE_EL2_PTDUMP_DEBUGFS
> >  static struct ptdump_info stage2_kernel_ptdump_info;
> >  
> > +#define GUEST_NAME_LEN	(32U)
> > +
> > +struct ptdump_registered_guest {
> > +	struct list_head		reg_list;
> > +	struct ptdump_info		info;
> > +	struct mm_struct		mem;
> > +	struct kvm_pgtable_snapshot	snapshot;
> > +	struct dentry			*dentry;
> > +	rwlock_t			*lock;
> > +	char				reg_name[GUEST_NAME_LEN];
> > +};
> > +
> > +static LIST_HEAD(ptdump_guest_list);
> > +static DEFINE_MUTEX(ptdump_list_lock);
> > +static u16 guest_no;
> 
> > This is not robust enough: if one VM starts and 65535 others are then created
> > and killed, guest_no overflows. The next number is 0, which is already taken.
>

Yes, I guess this should be improved. In the case you described we won't
register any debugfs file because of the name clash.

> Linux has an ID allocator to solve this problem, but I don't think this is
> necessary anyway. This should simply reuse the struct kvm->debugfs_dentry.
> 
> Also probably most of the information contained in ptdump_registered_guest can
> be found in struct kvm. The debugfs should then probably simply take struct kvm
> for the private argument.
>

I would prefer to keep it as a separate struct here, as it gives some
flexibility if we need to extend it for pKVM guest support. I think we
can drop the struct mm_struct from here.

Thanks,
Sebastian

> > +
> >  static phys_addr_t ptdump_host_pa(void *addr)
> >  {
> >  	return __pa(addr);
> > @@ -740,6 +757,73 @@ static void stage2_ptdump_walk(struct seq_file *s, struct ptdump_info *info)
> >  
> >  	kvm_pgtable_walk(pgtable, start_ipa, end_ipa, &walker);
> >  }
> 
> [...]
  

Patch

diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h
index 35b883524462..be86244d532b 100644
--- a/arch/arm64/include/asm/ptdump.h
+++ b/arch/arm64/include/asm/ptdump.h
@@ -5,6 +5,8 @@ 
 #ifndef __ASM_PTDUMP_H
 #define __ASM_PTDUMP_H
 
+#include <asm/kvm_pgtable.h>
+
 #ifdef CONFIG_PTDUMP_CORE
 
 #include <linux/mm_types.h>
@@ -30,14 +32,27 @@  struct ptdump_info {
 void ptdump_walk(struct seq_file *s, struct ptdump_info *info);
 #ifdef CONFIG_PTDUMP_DEBUGFS
 #define EFI_RUNTIME_MAP_END	DEFAULT_MAP_WINDOW_64
-void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name);
+struct dentry *ptdump_debugfs_register(struct ptdump_info *info,
+				       const char *name);
 #else
-static inline void ptdump_debugfs_register(struct ptdump_info *info,
-					   const char *name) { }
+static inline struct dentry *ptdump_debugfs_register(struct ptdump_info *info,
+						     const char *name)
+{
+	return NULL;
+}
 #endif
 void ptdump_check_wx(void);
 #endif /* CONFIG_PTDUMP_CORE */
 
+#ifdef CONFIG_NVHE_EL2_PTDUMP_DEBUGFS
+void ptdump_register_guest_stage2(struct kvm_pgtable *pgt, void *lock);
+void ptdump_unregister_guest_stage2(struct kvm_pgtable *pgt);
+#else
+static inline void ptdump_register_guest_stage2(struct kvm_pgtable *pgt,
+						void *lock) { }
+static inline void ptdump_unregister_guest_stage2(struct kvm_pgtable *pgt) { }
+#endif /* CONFIG_NVHE_EL2_PTDUMP_DEBUGFS */
+
 #ifdef CONFIG_DEBUG_WX
 #define debug_checkwx()	ptdump_check_wx()
 #else
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 482280fe22d7..e47988dba34d 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -11,6 +11,7 @@ 
 #include <linux/sched/signal.h>
 #include <trace/events/kvm.h>
 #include <asm/pgalloc.h>
+#include <asm/ptdump.h>
 #include <asm/cacheflush.h>
 #include <asm/kvm_arm.h>
 #include <asm/kvm_mmu.h>
@@ -908,6 +909,7 @@  int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
 	if (err)
 		goto out_free_pgtable;
 
+	ptdump_register_guest_stage2(pgt, &kvm->mmu_lock);
 	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
 	if (!mmu->last_vcpu_ran) {
 		err = -ENOMEM;
@@ -1021,6 +1023,7 @@  void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 	write_unlock(&kvm->mmu_lock);
 
 	if (pgt) {
+		ptdump_unregister_guest_stage2(pgt);
 		kvm_pgtable_stage2_destroy(pgt);
 		kfree(pgt);
 	}
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 4687840dcb69..facfb15468f5 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -26,6 +26,7 @@ 
 #include <asm/ptdump.h>
 #include <asm/kvm_pkvm.h>
 #include <asm/kvm_pgtable.h>
+#include <asm/kvm_host.h>
 
 
 enum address_markers_idx {
@@ -543,6 +544,22 @@  void ptdump_check_wx(void)
 #ifdef CONFIG_NVHE_EL2_PTDUMP_DEBUGFS
 static struct ptdump_info stage2_kernel_ptdump_info;
 
+#define GUEST_NAME_LEN	(32U)
+
+struct ptdump_registered_guest {
+	struct list_head		reg_list;
+	struct ptdump_info		info;
+	struct mm_struct		mem;
+	struct kvm_pgtable_snapshot	snapshot;
+	struct dentry			*dentry;
+	rwlock_t			*lock;
+	char				reg_name[GUEST_NAME_LEN];
+};
+
+static LIST_HEAD(ptdump_guest_list);
+static DEFINE_MUTEX(ptdump_list_lock);
+static u16 guest_no;
+
 static phys_addr_t ptdump_host_pa(void *addr)
 {
 	return __pa(addr);
@@ -740,6 +757,73 @@  static void stage2_ptdump_walk(struct seq_file *s, struct ptdump_info *info)
 
 	kvm_pgtable_walk(pgtable, start_ipa, end_ipa, &walker);
 }
+
+static void guest_stage2_ptdump_walk(struct seq_file *s,
+				     struct ptdump_info *info)
+{
+	struct kvm_pgtable_snapshot *snapshot = info->priv;
+	struct ptdump_registered_guest *guest;
+
+	guest = container_of(snapshot, struct ptdump_registered_guest,
+			     snapshot);
+	read_lock(guest->lock);
+	stage2_ptdump_walk(s, info);
+	read_unlock(guest->lock);
+}
+
+void ptdump_register_guest_stage2(struct kvm_pgtable *pgt, void *lock)
+{
+	struct ptdump_registered_guest *guest;
+	struct dentry *d;
+
+	if (pgt == NULL || lock == NULL)
+		return;
+
+	guest = kzalloc(sizeof(struct ptdump_registered_guest), GFP_KERNEL);
+	if (!guest)
+		return;
+
+	memcpy(&guest->snapshot.pgtable, pgt, sizeof(struct kvm_pgtable));
+	guest->info = (struct ptdump_info) {
+		.ptdump_walk		= guest_stage2_ptdump_walk,
+		.priv			= &guest->snapshot
+	};
+
+	mutex_init(&guest->info.file_lock);
+	guest->lock = lock;
+	mutex_lock(&ptdump_list_lock);
+	snprintf(guest->reg_name, GUEST_NAME_LEN,
+		 "%u_guest_stage2_page_tables", guest_no++);
+	d = ptdump_debugfs_register(&guest->info, guest->reg_name);
+	if (!d) {
+		mutex_unlock(&ptdump_list_lock);
+		goto free_entry;
+	}
+
+	guest->dentry = d;
+	list_add(&guest->reg_list, &ptdump_guest_list);
+	mutex_unlock(&ptdump_list_lock);
+	return;
+
+free_entry:
+	kfree(guest);
+}
+
+void ptdump_unregister_guest_stage2(struct kvm_pgtable *pgt)
+{
+	struct ptdump_registered_guest *guest;
+
+	mutex_lock(&ptdump_list_lock);
+	list_for_each_entry(guest, &ptdump_guest_list, reg_list) {
+		if (guest->snapshot.pgtable.pgd == pgt->pgd) {
+			list_del(&guest->reg_list);
+			debugfs_remove(guest->dentry);
+			kfree(guest);
+			break;
+		}
+	}
+	mutex_unlock(&ptdump_list_lock);
+}
 #endif /* CONFIG_NVHE_EL2_PTDUMP_DEBUGFS */
 
 static void __init ptdump_register_host_stage2(void)
diff --git a/arch/arm64/mm/ptdump_debugfs.c b/arch/arm64/mm/ptdump_debugfs.c
index 14619452dd8d..356753e27dee 100644
--- a/arch/arm64/mm/ptdump_debugfs.c
+++ b/arch/arm64/mm/ptdump_debugfs.c
@@ -49,7 +49,8 @@  static const struct file_operations ptdump_fops = {
 	.release	= ptdump_release,
 };
 
-void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name)
+struct dentry *ptdump_debugfs_register(struct ptdump_info *info,
+				       const char *name)
 {
-	debugfs_create_file(name, 0400, NULL, info, &ptdump_fops);
+	return debugfs_create_file(name, 0400, NULL, info, &ptdump_fops);
 }