[v5,4/4] KVM: arm64: Initialize the ptdump parser with stage-2 attributes

Message ID 20240207144832.1017815-6-sebastianene@google.com
State New
Headers
Series arm64: ptdump: View the second stage page-tables |

Commit Message

Sebastian Ene Feb. 7, 2024, 2:48 p.m. UTC
  Define a set of attributes used by the ptdump parser to display the
properties of a guest memory region covered by a pagetable descriptor.
Build a description of the pagetable levels and initialize the parser
with this configuration.

Signed-off-by: Sebastian Ene <sebastianene@google.com>
---
 arch/arm64/kvm/ptdump.c | 156 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)
  

Comments

Oliver Upton Feb. 13, 2024, 12:42 a.m. UTC | #1
On Wed, Feb 07, 2024 at 02:48:33PM +0000, Sebastian Ene wrote:
> Define a set of attributes used by the ptdump parser to display the
> properties of a guest memory region covered by a pagetable descriptor.
> Build a description of the pagetable levels and initialize the parser
> with this configuration.
> 
> Signed-off-by: Sebastian Ene <sebastianene@google.com>
> ---
>  arch/arm64/kvm/ptdump.c | 156 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 156 insertions(+)
> 
> diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
> index a4e984da8aa7..60725d46f17b 100644
> --- a/arch/arm64/kvm/ptdump.c
> +++ b/arch/arm64/kvm/ptdump.c
> @@ -14,6 +14,69 @@
>  #include <kvm_ptdump.h>
>  
>  
> +#define ADDR_MARKER_LEN		(2)
> +#define MARKER_MSG_LEN		(32)
> +
> +static const struct prot_bits stage2_pte_bits[] = {
> +	{
> +		.mask	= PTE_VALID,
> +		.val	= PTE_VALID,
> +		.set	= " ",
> +		.clear	= "F",
> +	}, {
> +		.mask	= KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
> +		.val	= KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
> +		.set	= "XN",
> +		.clear	= "  ",
> +	}, {
> +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> +		.set	= "R",
> +		.clear	= " ",
> +	}, {
> +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> +		.set	= "W",
> +		.clear	= " ",
> +	}, {
> +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> +		.set	= "AF",
> +		.clear	= "  ",
> +	}, {
> +		.mask	= PTE_NG,
> +		.val	= PTE_NG,
> +		.set	= "FnXS",
> +		.clear	= "  ",
> +	}, {
> +		.mask	= PTE_CONT | PTE_VALID,
> +		.val	= PTE_CONT | PTE_VALID,
> +		.set	= "CON",
> +		.clear	= "   ",
> +	}, {
> +		.mask	= PTE_TABLE_BIT,
> +		.val	= PTE_TABLE_BIT,
> +		.set	= "   ",
> +		.clear	= "BLK",

<snip>

> +	}, {
> +		.mask	= KVM_PGTABLE_PROT_SW0,
> +		.val	= KVM_PGTABLE_PROT_SW0,
> +		.set	= "SW0", /* PKVM_PAGE_SHARED_OWNED */
> +	}, {
> +		.mask   = KVM_PGTABLE_PROT_SW1,
> +		.val	= KVM_PGTABLE_PROT_SW1,
> +		.set	= "SW1", /* PKVM_PAGE_SHARED_BORROWED */
> +	}, {
> +		.mask	= KVM_PGTABLE_PROT_SW2,
> +		.val	= KVM_PGTABLE_PROT_SW2,
> +		.set	= "SW2",
> +	}, {
> +		.mask   = KVM_PGTABLE_PROT_SW3,
> +		.val	= KVM_PGTABLE_PROT_SW3,
> +		.set	= "SW3",
> +	},

</snip>

These bits are never set in a 'normal' stage-2 PTE, does it make sense
to carry descriptors for them here? In contexts where the SW bits are
used it might be more useful if the ptdump used the specific meaning of
the bit (e.g. OWNED, BORROWED, etc) instead of the generic SW%d.

That can all wait for when the pKVM bits come into play though.

> +};
> +
>  static int kvm_ptdump_guest_open(struct inode *inode, struct file *file);
>  static int kvm_ptdump_guest_show(struct seq_file *m, void *);
>  
> @@ -52,6 +115,94 @@ static int kvm_ptdump_show_common(struct seq_file *m,
>  	return kvm_pgtable_walk(pgtable, 0, BIT(pgtable->ia_bits), &walker);
>  }
>  
> +static void kvm_ptdump_build_levels(struct pg_level *level, u32 start_lvl)
> +{
> +	static const char * const level_names[] = {"PGD", "PUD", "PMD", "PTE"};
> +	u32 i = 0;
> +	u64 mask_lvl = 0;

nit: _lvl adds nothing to this, and actually confused me for a sec as
to whether the mask changed per level.

> +	if (start_lvl > 2) {
> +		pr_err("invalid start_lvl %u\n", start_lvl);
> +		return;
> +	}

Can't we get something like -EINVAL out here and fail initialization?
Otherwise breadcrumbs like this pr_err() are hard to connect to a
specific failure.

> +	for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++)
> +		mask_lvl |= stage2_pte_bits[i].mask;
> +
> +	for (i = start_lvl; i <= KVM_PGTABLE_LAST_LEVEL; i++) {
> +		level[i].name = level_names[i];
> +		level[i].num = ARRAY_SIZE(stage2_pte_bits);
> +		level[i].bits = stage2_pte_bits;
> +		level[i].mask = mask_lvl;
> +	}
> +
> +	if (start_lvl > 0)
> +		level[start_lvl].name = level_names[0];
> +}
> +
> +static int kvm_ptdump_parser_init(struct pg_state *st,
> +				  struct kvm_pgtable *pgtable,
> +				  struct seq_file *m)
> +{
> +	struct addr_marker *ipa_addr_marker;
> +	char *marker_msg;
> +	struct pg_level *level_descr;
> +	struct ptdump_range *range;
> +
> +	ipa_addr_marker = kzalloc(sizeof(struct addr_marker) * ADDR_MARKER_LEN,
> +				  GFP_KERNEL_ACCOUNT);
> +	if (!ipa_addr_marker)
> +		return -ENOMEM;
> +
> +	marker_msg = kzalloc(MARKER_MSG_LEN, GFP_KERNEL_ACCOUNT);
> +	if (!marker_msg)
> +		goto free_with_marker;
> +
> +	level_descr = kzalloc(sizeof(struct pg_level) * (KVM_PGTABLE_LAST_LEVEL + 1),
> +			      GFP_KERNEL_ACCOUNT);
> +	if (!level_descr)
> +		goto free_with_msg;
> +
> +	range = kzalloc(sizeof(struct ptdump_range) * ADDR_MARKER_LEN,
> +			GFP_KERNEL_ACCOUNT);
> +	if (!range)
> +		goto free_with_level;
> +
> +	kvm_ptdump_build_levels(level_descr, pgtable->start_level);
> +
> +	snprintf(marker_msg, MARKER_MSG_LEN, "IPA bits %2u start lvl %1d",
> +		 pgtable->ia_bits, pgtable->start_level);
> +
> +	ipa_addr_marker[0].name = marker_msg;

Is the dynamic name worth the added complexity? I see nothing wrong with
exposing additional debugfs files for simple attributes like the IPA
range and page table levels.

I know it isn't *that* much, just looking for every opportunity to
simplify further.

> +	ipa_addr_marker[1].start_address = BIT(pgtable->ia_bits);
> +	range[0].end = BIT(pgtable->ia_bits);
> +
> +	st->seq = m;
> +	st->marker = ipa_addr_marker;
> +	st->level = -1,
> +	st->pg_level = level_descr,
> +	st->ptdump.range = range;
> +	return 0;
> +
> +free_with_level:
> +	kfree(level_descr);
> +free_with_msg:
> +	kfree(marker_msg);
> +free_with_marker:
> +	kfree(ipa_addr_marker);
> +	return -ENOMEM;
> +}
> +
> +static void kvm_ptdump_parser_teardown(struct pg_state *st)
> +{
> +	const struct addr_marker *ipa_addr_marker = st->marker;
> +
> +	kfree(ipa_addr_marker[0].name);
> +	kfree(ipa_addr_marker);
> +	kfree(st->pg_level);
> +	kfree(st->ptdump.range);
> +}
> +
>  static int kvm_ptdump_guest_show(struct seq_file *m, void *)
>  {
>  	struct kvm *guest_kvm = m->private;
> @@ -59,10 +210,15 @@ static int kvm_ptdump_guest_show(struct seq_file *m, void *)
>  	struct pg_state parser_state = {0};
>  	int ret;
>  
> +	ret = kvm_ptdump_parser_init(&parser_state, mmu->pgt, m);
> +	if (ret)
> +		return ret;
> +

Can this be done at open(), or am I missing something?

>  	write_lock(&guest_kvm->mmu_lock);
>  	ret = kvm_ptdump_show_common(m, mmu->pgt, &parser_state);
>  	write_unlock(&guest_kvm->mmu_lock);
>  
> +	kvm_ptdump_parser_teardown(&parser_state);

Same question here, can this happen at close()? I guess you'll need a
struct to encapsulate pg_state and a pointer to the VM at least.

Actually, come to think of it, if you embed all of the data you need for
the walker into a structure you can just do a single allocation for it
upfront.
  
Sebastian Ene Feb. 13, 2024, 4:59 p.m. UTC | #2
On Tue, Feb 13, 2024 at 12:42:42AM +0000, Oliver Upton wrote:
> On Wed, Feb 07, 2024 at 02:48:33PM +0000, Sebastian Ene wrote:
> > Define a set of attributes used by the ptdump parser to display the
> > properties of a guest memory region covered by a pagetable descriptor.
> > Build a description of the pagetable levels and initialize the parser
> > with this configuration.
> > 
> > Signed-off-by: Sebastian Ene <sebastianene@google.com>
> > ---
> >  arch/arm64/kvm/ptdump.c | 156 ++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 156 insertions(+)
> > 
> > diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
> > index a4e984da8aa7..60725d46f17b 100644
> > --- a/arch/arm64/kvm/ptdump.c
> > +++ b/arch/arm64/kvm/ptdump.c
> > @@ -14,6 +14,69 @@
> >  #include <kvm_ptdump.h>
> >  
> >  
> > +#define ADDR_MARKER_LEN		(2)
> > +#define MARKER_MSG_LEN		(32)
> > +
> > +static const struct prot_bits stage2_pte_bits[] = {
> > +	{
> > +		.mask	= PTE_VALID,
> > +		.val	= PTE_VALID,
> > +		.set	= " ",
> > +		.clear	= "F",
> > +	}, {
> > +		.mask	= KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
> > +		.val	= KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
> > +		.set	= "XN",
> > +		.clear	= "  ",
> > +	}, {
> > +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> > +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> > +		.set	= "R",
> > +		.clear	= " ",
> > +	}, {
> > +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> > +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> > +		.set	= "W",
> > +		.clear	= " ",
> > +	}, {
> > +		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> > +		.val	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> > +		.set	= "AF",
> > +		.clear	= "  ",
> > +	}, {
> > +		.mask	= PTE_NG,
> > +		.val	= PTE_NG,
> > +		.set	= "FnXS",
> > +		.clear	= "  ",
> > +	}, {
> > +		.mask	= PTE_CONT | PTE_VALID,
> > +		.val	= PTE_CONT | PTE_VALID,
> > +		.set	= "CON",
> > +		.clear	= "   ",
> > +	}, {
> > +		.mask	= PTE_TABLE_BIT,
> > +		.val	= PTE_TABLE_BIT,
> > +		.set	= "   ",
> > +		.clear	= "BLK",
> 
> <snip>
> 
> > +	}, {
> > +		.mask	= KVM_PGTABLE_PROT_SW0,
> > +		.val	= KVM_PGTABLE_PROT_SW0,
> > +		.set	= "SW0", /* PKVM_PAGE_SHARED_OWNED */
> > +	}, {
> > +		.mask   = KVM_PGTABLE_PROT_SW1,
> > +		.val	= KVM_PGTABLE_PROT_SW1,
> > +		.set	= "SW1", /* PKVM_PAGE_SHARED_BORROWED */
> > +	}, {
> > +		.mask	= KVM_PGTABLE_PROT_SW2,
> > +		.val	= KVM_PGTABLE_PROT_SW2,
> > +		.set	= "SW2",
> > +	}, {
> > +		.mask   = KVM_PGTABLE_PROT_SW3,
> > +		.val	= KVM_PGTABLE_PROT_SW3,
> > +		.set	= "SW3",
> > +	},
> 
> </snip>
> 
> These bits are never set in a 'normal' stage-2 PTE, does it make sense
> to carry descriptors for them here? In contexts where the SW bits are
> used it might be more useful if the ptdump used the specific meaning of
> the bit (e.g. OWNED, BORROWED, etc) instead of the generic SW%d.
> 
> That can all wait for when the pKVM bits come into play though.
> 

True, I guess we don't need these bits for now. We can insert them at a
later time when we will have the pKVM support and then we will have to
use their real maning for the state tracking.

> > +};
> > +
> >  static int kvm_ptdump_guest_open(struct inode *inode, struct file *file);
> >  static int kvm_ptdump_guest_show(struct seq_file *m, void *);
> >  
> > @@ -52,6 +115,94 @@ static int kvm_ptdump_show_common(struct seq_file *m,
> >  	return kvm_pgtable_walk(pgtable, 0, BIT(pgtable->ia_bits), &walker);
> >  }
> >  
> > +static void kvm_ptdump_build_levels(struct pg_level *level, u32 start_lvl)
> > +{
> > +	static const char * const level_names[] = {"PGD", "PUD", "PMD", "PTE"};
> > +	u32 i = 0;
> > +	u64 mask_lvl = 0;
> 
> nit: _lvl adds nothing to this, and actually confused me for a sec as
> to whether the mask changed per level.
> 

I will drop it from the name to avoid the confusion.

> > +	if (start_lvl > 2) {
> > +		pr_err("invalid start_lvl %u\n", start_lvl);
> > +		return;
> > +	}
> 
> Can't we get something like -EINVAL out here and fail initialization?
> Otherwise breadcrumbs like this pr_err() are hard to connect to a
> specific failure.
>

Ok, I will add a return code for this function.

> > +	for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++)
> > +		mask_lvl |= stage2_pte_bits[i].mask;
> > +
> > +	for (i = start_lvl; i <= KVM_PGTABLE_LAST_LEVEL; i++) {
> > +		level[i].name = level_names[i];
> > +		level[i].num = ARRAY_SIZE(stage2_pte_bits);
> > +		level[i].bits = stage2_pte_bits;
> > +		level[i].mask = mask_lvl;
> > +	}
> > +
> > +	if (start_lvl > 0)
> > +		level[start_lvl].name = level_names[0];
> > +}
> > +
> > +static int kvm_ptdump_parser_init(struct pg_state *st,
> > +				  struct kvm_pgtable *pgtable,
> > +				  struct seq_file *m)
> > +{
> > +	struct addr_marker *ipa_addr_marker;
> > +	char *marker_msg;
> > +	struct pg_level *level_descr;
> > +	struct ptdump_range *range;
> > +
> > +	ipa_addr_marker = kzalloc(sizeof(struct addr_marker) * ADDR_MARKER_LEN,
> > +				  GFP_KERNEL_ACCOUNT);
> > +	if (!ipa_addr_marker)
> > +		return -ENOMEM;
> > +
> > +	marker_msg = kzalloc(MARKER_MSG_LEN, GFP_KERNEL_ACCOUNT);
> > +	if (!marker_msg)
> > +		goto free_with_marker;
> > +
> > +	level_descr = kzalloc(sizeof(struct pg_level) * (KVM_PGTABLE_LAST_LEVEL + 1),
> > +			      GFP_KERNEL_ACCOUNT);
> > +	if (!level_descr)
> > +		goto free_with_msg;
> > +
> > +	range = kzalloc(sizeof(struct ptdump_range) * ADDR_MARKER_LEN,
> > +			GFP_KERNEL_ACCOUNT);
> > +	if (!range)
> > +		goto free_with_level;
> > +
> > +	kvm_ptdump_build_levels(level_descr, pgtable->start_level);
> > +
> > +	snprintf(marker_msg, MARKER_MSG_LEN, "IPA bits %2u start lvl %1d",
> > +		 pgtable->ia_bits, pgtable->start_level);
> > +
> > +	ipa_addr_marker[0].name = marker_msg;
> 
> Is the dynamic name worth the added complexity? I see nothing wrong with
> exposing additional debugfs files for simple attributes like the IPA
> range and page table levels.
> 
> I know it isn't *that* much, just looking for every opportunity to
> simplify further.
> 

We can keep them separate, I have no strong opinion about this. I think
this was Vincent's, original suggestion to have them so I will check with
him as well.

> > +	ipa_addr_marker[1].start_address = BIT(pgtable->ia_bits);
> > +	range[0].end = BIT(pgtable->ia_bits);
> > +
> > +	st->seq = m;
> > +	st->marker = ipa_addr_marker;
> > +	st->level = -1,
> > +	st->pg_level = level_descr,
> > +	st->ptdump.range = range;
> > +	return 0;
> > +
> > +free_with_level:
> > +	kfree(level_descr);
> > +free_with_msg:
> > +	kfree(marker_msg);
> > +free_with_marker:
> > +	kfree(ipa_addr_marker);
> > +	return -ENOMEM;
> > +}
> > +
> > +static void kvm_ptdump_parser_teardown(struct pg_state *st)
> > +{
> > +	const struct addr_marker *ipa_addr_marker = st->marker;
> > +
> > +	kfree(ipa_addr_marker[0].name);
> > +	kfree(ipa_addr_marker);
> > +	kfree(st->pg_level);
> > +	kfree(st->ptdump.range);
> > +}
> > +
> >  static int kvm_ptdump_guest_show(struct seq_file *m, void *)
> >  {
> >  	struct kvm *guest_kvm = m->private;
> > @@ -59,10 +210,15 @@ static int kvm_ptdump_guest_show(struct seq_file *m, void *)
> >  	struct pg_state parser_state = {0};
> >  	int ret;
> >  
> > +	ret = kvm_ptdump_parser_init(&parser_state, mmu->pgt, m);
> > +	if (ret)
> > +		return ret;
> > +
> 
> Can this be done at open(), or am I missing something?
> 

I guess we can do this in open() but then we will have to add again that
struct that wraps some ptdump specific state tracking. It seemed a bit cleaner in
this way. What do you think ?

> >  	write_lock(&guest_kvm->mmu_lock);
> >  	ret = kvm_ptdump_show_common(m, mmu->pgt, &parser_state);
> >  	write_unlock(&guest_kvm->mmu_lock);
> >  
> > +	kvm_ptdump_parser_teardown(&parser_state);
> 
> Same question here, can this happen at close()? I guess you'll need a
> struct to encapsulate pg_state and a pointer to the VM at least.
>

Right, I tried to avoid using a separate struct as we discussed in v4.

> Actually, come to think of it, if you embed all of the data you need for
> the walker into a structure you can just do a single allocation for it
> upfront.
> 
> -- 
> Thanks,
> Oliver

Thanks for the feedback,
Seb
  
Oliver Upton Feb. 13, 2024, 5:10 p.m. UTC | #3
On Tue, Feb 13, 2024 at 04:59:01PM +0000, Sebastian Ene wrote:
> On Tue, Feb 13, 2024 at 12:42:42AM +0000, Oliver Upton wrote:
> > On Wed, Feb 07, 2024 at 02:48:33PM +0000, Sebastian Ene wrote:

[...]

> > > +
> > > +	snprintf(marker_msg, MARKER_MSG_LEN, "IPA bits %2u start lvl %1d",
> > > +		 pgtable->ia_bits, pgtable->start_level);
> > > +
> > > +	ipa_addr_marker[0].name = marker_msg;
> > 
> > Is the dynamic name worth the added complexity? I see nothing wrong with
> > exposing additional debugfs files for simple attributes like the IPA
> > range and page table levels.
> > 
> > I know it isn't *that* much, just looking for every opportunity to
> > simplify further.
> > 
> 
> We can keep them separate, I have no strong opinion about this. I think
> this was Vincent's, original suggestion to have them so I will check with
> him as well.

Well, if we get to the place where there's a single struct containing
all of the required data upfront then this becomes less of an issue.
This is useful information still, so let's see if we can go about it the
other way.

> > > +	ret = kvm_ptdump_parser_init(&parser_state, mmu->pgt, m);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > 
> > Can this be done at open(), or am I missing something?
> > 
> 
> I guess we can do this in open() but then we will have to add again that
> struct that wraps some ptdump specific state tracking. It seemed a bit cleaner in
> this way. What do you think ?

Allocating something that looks like an iterator end embedding it in
->private isn't too uncommon.

> > >  	write_lock(&guest_kvm->mmu_lock);
> > >  	ret = kvm_ptdump_show_common(m, mmu->pgt, &parser_state);
> > >  	write_unlock(&guest_kvm->mmu_lock);
> > >  
> > > +	kvm_ptdump_parser_teardown(&parser_state);
> > 
> > Same question here, can this happen at close()? I guess you'll need a
> > struct to encapsulate pg_state and a pointer to the VM at least.
> >
> 
> Right, I tried to avoid using a separate struct as we discussed in v4.

Sorry, I hope I didn't confuse you in my prior feedback.

What I had issue with was the multiple layers of function ptr / ops
structs for managing the file interface. I have zero concerns with
organizing the _data_ for the walk this way.

> > Actually, come to think of it, if you embed all of the data you need for
> > the walker into a structure you can just do a single allocation for it
> > upfront.
  
Sebastian Ene Feb. 14, 2024, 4:29 p.m. UTC | #4
On Tue, Feb 13, 2024 at 05:10:30PM +0000, Oliver Upton wrote:
> On Tue, Feb 13, 2024 at 04:59:01PM +0000, Sebastian Ene wrote:
> > On Tue, Feb 13, 2024 at 12:42:42AM +0000, Oliver Upton wrote:
> > > On Wed, Feb 07, 2024 at 02:48:33PM +0000, Sebastian Ene wrote:
> 
> [...]
> 
> > > > +
> > > > +	snprintf(marker_msg, MARKER_MSG_LEN, "IPA bits %2u start lvl %1d",
> > > > +		 pgtable->ia_bits, pgtable->start_level);
> > > > +
> > > > +	ipa_addr_marker[0].name = marker_msg;
> > > 
> > > Is the dynamic name worth the added complexity? I see nothing wrong with
> > > exposing additional debugfs files for simple attributes like the IPA
> > > range and page table levels.
> > > 
> > > I know it isn't *that* much, just looking for every opportunity to
> > > simplify further.
> > > 
> > 
> > We can keep them separate, I have no strong opinion about this. I think
> > this was Vincent's, original suggestion to have them so I will check with
> > him as well.
> 
> Well, if we get to the place where there's a single struct containing
> all of the required data upfront then this becomes less of an issue.
> This is useful information still, so let's see if we can go about it the
> other way.
> 

Allright, I will create separate entries for getting the ipa_space and the start
level which will be exposed in the vm debugfs dir.

> > > > +	ret = kvm_ptdump_parser_init(&parser_state, mmu->pgt, m);
> > > > +	if (ret)
> > > > +		return ret;
> > > > +
> > > 
> > > Can this be done at open(), or am I missing something?
> > > 
> > 
> > I guess we can do this in open() but then we will have to add again that
> > struct that wraps some ptdump specific state tracking. It seemed a bit cleaner in
> > this way. What do you think ?
> 
> Allocating something that looks like an iterator end embedding it in
> ->private isn't too uncommon.
> 

Ack, will stick to this approach in this case.

> > > >  	write_lock(&guest_kvm->mmu_lock);
> > > >  	ret = kvm_ptdump_show_common(m, mmu->pgt, &parser_state);
> > > >  	write_unlock(&guest_kvm->mmu_lock);
> > > >  
> > > > +	kvm_ptdump_parser_teardown(&parser_state);
> > > 
> > > Same question here, can this happen at close()? I guess you'll need a
> > > struct to encapsulate pg_state and a pointer to the VM at least.
> > >
> > 
> > Right, I tried to avoid using a separate struct as we discussed in v4.
> 
> Sorry, I hope I didn't confuse you in my prior feedback.
> 
> What I had issue with was the multiple layers of function ptr / ops
> structs for managing the file interface. I have zero concerns with
> organizing the _data_ for the walk this way.
>

Right, I see what you mean, thanks for the clarification.

> > > Actually, come to think of it, if you embed all of the data you need for
> > > the walker into a structure you can just do a single allocation for it
> > > upfront.
> 
> -- 
> Thanks,
> Oliver

Thanks for the feedback,
Seb
  

Patch

diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
index a4e984da8aa7..60725d46f17b 100644
--- a/arch/arm64/kvm/ptdump.c
+++ b/arch/arm64/kvm/ptdump.c
@@ -14,6 +14,69 @@ 
 #include <kvm_ptdump.h>
 
 
+#define ADDR_MARKER_LEN		(2)
+#define MARKER_MSG_LEN		(32)
+
+static const struct prot_bits stage2_pte_bits[] = {
+	{
+		.mask	= PTE_VALID,
+		.val	= PTE_VALID,
+		.set	= " ",
+		.clear	= "F",
+	}, {
+		.mask	= KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
+		.val	= KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
+		.set	= "XN",
+		.clear	= "  ",
+	}, {
+		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
+		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
+		.set	= "R",
+		.clear	= " ",
+	}, {
+		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
+		.val	= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
+		.set	= "W",
+		.clear	= " ",
+	}, {
+		.mask	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
+		.val	= KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
+		.set	= "AF",
+		.clear	= "  ",
+	}, {
+		.mask	= PTE_NG,
+		.val	= PTE_NG,
+		.set	= "FnXS",
+		.clear	= "  ",
+	}, {
+		.mask	= PTE_CONT | PTE_VALID,
+		.val	= PTE_CONT | PTE_VALID,
+		.set	= "CON",
+		.clear	= "   ",
+	}, {
+		.mask	= PTE_TABLE_BIT,
+		.val	= PTE_TABLE_BIT,
+		.set	= "   ",
+		.clear	= "BLK",
+	}, {
+		.mask	= KVM_PGTABLE_PROT_SW0,
+		.val	= KVM_PGTABLE_PROT_SW0,
+		.set	= "SW0", /* PKVM_PAGE_SHARED_OWNED */
+	}, {
+		.mask   = KVM_PGTABLE_PROT_SW1,
+		.val	= KVM_PGTABLE_PROT_SW1,
+		.set	= "SW1", /* PKVM_PAGE_SHARED_BORROWED */
+	}, {
+		.mask	= KVM_PGTABLE_PROT_SW2,
+		.val	= KVM_PGTABLE_PROT_SW2,
+		.set	= "SW2",
+	}, {
+		.mask   = KVM_PGTABLE_PROT_SW3,
+		.val	= KVM_PGTABLE_PROT_SW3,
+		.set	= "SW3",
+	},
+};
+
 static int kvm_ptdump_guest_open(struct inode *inode, struct file *file);
 static int kvm_ptdump_guest_show(struct seq_file *m, void *);
 
@@ -52,6 +115,94 @@  static int kvm_ptdump_show_common(struct seq_file *m,
 	return kvm_pgtable_walk(pgtable, 0, BIT(pgtable->ia_bits), &walker);
 }
 
+static void kvm_ptdump_build_levels(struct pg_level *level, u32 start_lvl)
+{
+	static const char * const level_names[] = {"PGD", "PUD", "PMD", "PTE"};
+	u32 i = 0;
+	u64 mask_lvl = 0;
+
+	if (start_lvl > 2) {
+		pr_err("invalid start_lvl %u\n", start_lvl);
+		return;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++)
+		mask_lvl |= stage2_pte_bits[i].mask;
+
+	for (i = start_lvl; i <= KVM_PGTABLE_LAST_LEVEL; i++) {
+		level[i].name = level_names[i];
+		level[i].num = ARRAY_SIZE(stage2_pte_bits);
+		level[i].bits = stage2_pte_bits;
+		level[i].mask = mask_lvl;
+	}
+
+	if (start_lvl > 0)
+		level[start_lvl].name = level_names[0];
+}
+
+static int kvm_ptdump_parser_init(struct pg_state *st,
+				  struct kvm_pgtable *pgtable,
+				  struct seq_file *m)
+{
+	struct addr_marker *ipa_addr_marker;
+	char *marker_msg;
+	struct pg_level *level_descr;
+	struct ptdump_range *range;
+
+	ipa_addr_marker = kzalloc(sizeof(struct addr_marker) * ADDR_MARKER_LEN,
+				  GFP_KERNEL_ACCOUNT);
+	if (!ipa_addr_marker)
+		return -ENOMEM;
+
+	marker_msg = kzalloc(MARKER_MSG_LEN, GFP_KERNEL_ACCOUNT);
+	if (!marker_msg)
+		goto free_with_marker;
+
+	level_descr = kzalloc(sizeof(struct pg_level) * (KVM_PGTABLE_LAST_LEVEL + 1),
+			      GFP_KERNEL_ACCOUNT);
+	if (!level_descr)
+		goto free_with_msg;
+
+	range = kzalloc(sizeof(struct ptdump_range) * ADDR_MARKER_LEN,
+			GFP_KERNEL_ACCOUNT);
+	if (!range)
+		goto free_with_level;
+
+	kvm_ptdump_build_levels(level_descr, pgtable->start_level);
+
+	snprintf(marker_msg, MARKER_MSG_LEN, "IPA bits %2u start lvl %1d",
+		 pgtable->ia_bits, pgtable->start_level);
+
+	ipa_addr_marker[0].name = marker_msg;
+	ipa_addr_marker[1].start_address = BIT(pgtable->ia_bits);
+	range[0].end = BIT(pgtable->ia_bits);
+
+	st->seq = m;
+	st->marker = ipa_addr_marker;
+	st->level = -1,
+	st->pg_level = level_descr,
+	st->ptdump.range = range;
+	return 0;
+
+free_with_level:
+	kfree(level_descr);
+free_with_msg:
+	kfree(marker_msg);
+free_with_marker:
+	kfree(ipa_addr_marker);
+	return -ENOMEM;
+}
+
+static void kvm_ptdump_parser_teardown(struct pg_state *st)
+{
+	const struct addr_marker *ipa_addr_marker = st->marker;
+
+	kfree(ipa_addr_marker[0].name);
+	kfree(ipa_addr_marker);
+	kfree(st->pg_level);
+	kfree(st->ptdump.range);
+}
+
 static int kvm_ptdump_guest_show(struct seq_file *m, void *)
 {
 	struct kvm *guest_kvm = m->private;
@@ -59,10 +210,15 @@  static int kvm_ptdump_guest_show(struct seq_file *m, void *)
 	struct pg_state parser_state = {0};
 	int ret;
 
+	ret = kvm_ptdump_parser_init(&parser_state, mmu->pgt, m);
+	if (ret)
+		return ret;
+
 	write_lock(&guest_kvm->mmu_lock);
 	ret = kvm_ptdump_show_common(m, mmu->pgt, &parser_state);
 	write_unlock(&guest_kvm->mmu_lock);
 
+	kvm_ptdump_parser_teardown(&parser_state);
 	return ret;
 }