[PATCHv3,09/14] x86/tdx: Account shared memory

Message ID 20231115120044.8034-10-kirill.shutemov@linux.intel.com
State New
Headers
Series x86/tdx: Add kexec support |

Commit Message

Kirill A. Shutemov Nov. 15, 2023, noon UTC
  The kernel will convert all shared memory back to private during kexec.
The direct mapping page tables will provide information on which memory
is shared.

It is extremely important to convert all shared memory. If a page is
missed, it will cause the second kernel to crash when it accesses it.

Keep track of the number of shared pages. This will allow for
cross-checking against the shared information in the direct mapping and
reporting if the shared bit is lost.

Include a debugfs interface that allows for the check to be performed at
any point.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 arch/x86/coco/tdx/tdx.c | 69 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
  

Comments

Kai Huang Nov. 21, 2023, 2:47 a.m. UTC | #1
> +static atomic_long_t nr_shared;
> +
> +static inline bool pte_decrypted(pte_t pte)
> +{
> +	return cc_mkdec(pte_val(pte)) == pte_val(pte);
> +}
> +
>  /* Called from __tdx_hypercall() for unrecoverable failure */
>  noinstr void __noreturn __tdx_hypercall_failed(void)
>  {
> @@ -820,6 +828,11 @@ static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
>  	if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc))
>  		return -EIO;
>  
> +	if (enc)
> +		atomic_long_sub(numpages, &nr_shared);
> +	else
> +		atomic_long_add(numpages, &nr_shared);
> +
>  	return 0;
>  }
>  
> @@ -895,3 +908,59 @@ void __init tdx_early_init(void)
>  
>  	pr_info("Guest detected\n");
>  }
> +
> +#ifdef CONFIG_DEBUG_FS
> +static int tdx_shared_memory_show(struct seq_file *m, void *p)
> +{
> +	unsigned long addr, end;
> +	unsigned long found = 0;
> +
> +	addr = PAGE_OFFSET;
> +	end  = PAGE_OFFSET + get_max_mapped();
> +
> +	while (addr < end) {
> +		unsigned long size;
> +		unsigned int level;
> +		pte_t *pte;
> +
> +		pte = lookup_address(addr, &level);
> +		size = page_level_size(level);
> +
> +		if (pte && pte_decrypted(*pte))
> +			found += size / PAGE_SIZE;
> +
> +		addr += size;
> +
> +		cond_resched();
> +	}
> +
> +	seq_printf(m, "Number of unshared pages in kernel page tables:  %16lu\n",
> +		   found);
> +	seq_printf(m, "Number of pages accounted as unshared:           %16ld\n",
> +		   atomic_long_read(&nr_shared));

unshared -> shared?

Btw, I am not quite sure what the purpose is of reporting the number of shared
pages both as found in the kernel page tables and as accounted by the kernel?

IIUC, there might be a slight chance that the former is different from the latter
(i.e., when the user reads this while the kernel is converting pages
simultaneously), but most of the time the user should see that they are the same.

I can see it might be helpful to report @nr_shared to the user, but how can
reporting both help the user?

That being said, I think perhaps you can split the debugfs part out into a separate
patch because it's not a mandatory part of this series but a nice-to-have.  Then
the debugfs part can be reviewed separately.
  
Kirill A. Shutemov Nov. 21, 2023, 9:42 a.m. UTC | #2
On Tue, Nov 21, 2023 at 02:47:29AM +0000, Huang, Kai wrote:
> 
> > +static atomic_long_t nr_shared;
> > +
> > +static inline bool pte_decrypted(pte_t pte)
> > +{
> > +	return cc_mkdec(pte_val(pte)) == pte_val(pte);
> > +}
> > +
> >  /* Called from __tdx_hypercall() for unrecoverable failure */
> >  noinstr void __noreturn __tdx_hypercall_failed(void)
> >  {
> > @@ -820,6 +828,11 @@ static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
> >  	if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc))
> >  		return -EIO;
> >  
> > +	if (enc)
> > +		atomic_long_sub(numpages, &nr_shared);
> > +	else
> > +		atomic_long_add(numpages, &nr_shared);
> > +
> >  	return 0;
> >  }
> >  
> > @@ -895,3 +908,59 @@ void __init tdx_early_init(void)
> >  
> >  	pr_info("Guest detected\n");
> >  }
> > +
> > +#ifdef CONFIG_DEBUG_FS
> > +static int tdx_shared_memory_show(struct seq_file *m, void *p)
> > +{
> > +	unsigned long addr, end;
> > +	unsigned long found = 0;
> > +
> > +	addr = PAGE_OFFSET;
> > +	end  = PAGE_OFFSET + get_max_mapped();
> > +
> > +	while (addr < end) {
> > +		unsigned long size;
> > +		unsigned int level;
> > +		pte_t *pte;
> > +
> > +		pte = lookup_address(addr, &level);
> > +		size = page_level_size(level);
> > +
> > +		if (pte && pte_decrypted(*pte))
> > +			found += size / PAGE_SIZE;
> > +
> > +		addr += size;
> > +
> > +		cond_resched();
> > +	}
> > +
> > +	seq_printf(m, "Number of unshared pages in kernel page tables:  %16lu\n",
> > +		   found);
> > +	seq_printf(m, "Number of pages accounted as unshared:           %16ld\n",
> > +		   atomic_long_read(&nr_shared));
> 
> unshared -> shared?

Right.

> Btw, I am not quite sure what's the purpose of reporting number of shared pages
> in both kernel page table and that the kernel is accounting?
> 
> IIUC, there might be slight chance that the former is different from the latter
> (i.e., when user reads this while the kernel is converting pages
> simultaneously), but in most of the time the user should see they are the same.
> 
> I can see it might be helpful to report @nr_shared to the user, but how can
> reporting both help the user?

It is critical to unshare *all* pages on kexec, or the second kernel will
crash at some point when it accesses a shared page as private.

This is the sanity check: if the number of shared pages in the page tables is
less than what we expected, we've lost the shared bit somewhere. And kexec will
likely be a disaster.

The ability to trigger the check at any point can help to correlate the leak
with activity.

> That being said, I think perhaps you can separate the /sysfs part as a separate
> patch because it's not a mandatory part of this series but a nice to have.  Then
> the /sysfs part can be reviewed separately. 

Okay, makes sense.
  
Kirill A. Shutemov Nov. 21, 2023, 9:49 a.m. UTC | #3
On Tue, Nov 21, 2023 at 12:42:20PM +0300, kirill.shutemov@linux.intel.com wrote:
> > That being said, I think perhaps you can separate the /sysfs part as a separate
> > patch because it's not a mandatory part of this series but a nice to have.  Then
> > the /sysfs part can be reviewed separately. 
> 
> Okay, makes sense.

Hm. Without debugfs there's nothing really left in the patch. The accounting
itself is only a few lines.

I will probably leave it as is.
  

Patch

diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 2d90043a0e91..039f81b7c172 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -5,6 +5,7 @@ 
 #define pr_fmt(fmt)     "tdx: " fmt
 
 #include <linux/cpufeature.h>
+#include <linux/debugfs.h>
 #include <linux/export.h>
 #include <linux/io.h>
 #include <asm/coco.h>
@@ -37,6 +38,13 @@ 
 
 #define TDREPORT_SUBTYPE_0	0
 
+static atomic_long_t nr_shared;
+
+static inline bool pte_decrypted(pte_t pte)
+{
+	return cc_mkdec(pte_val(pte)) == pte_val(pte);
+}
+
 /* Called from __tdx_hypercall() for unrecoverable failure */
 noinstr void __noreturn __tdx_hypercall_failed(void)
 {
@@ -820,6 +828,11 @@  static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
 	if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc))
 		return -EIO;
 
+	if (enc)
+		atomic_long_sub(numpages, &nr_shared);
+	else
+		atomic_long_add(numpages, &nr_shared);
+
 	return 0;
 }
 
@@ -895,3 +908,59 @@  void __init tdx_early_init(void)
 
 	pr_info("Guest detected\n");
 }
+
+#ifdef CONFIG_DEBUG_FS
+static int tdx_shared_memory_show(struct seq_file *m, void *p)
+{
+	unsigned long addr, end;
+	unsigned long found = 0;
+
+	addr = PAGE_OFFSET;
+	end  = PAGE_OFFSET + get_max_mapped();
+
+	while (addr < end) {
+		unsigned long size;
+		unsigned int level;
+		pte_t *pte;
+
+		pte = lookup_address(addr, &level);
+		size = page_level_size(level);
+
+		if (pte && pte_decrypted(*pte))
+			found += size / PAGE_SIZE;
+
+		addr += size;
+
+		cond_resched();
+	}
+
+	seq_printf(m, "Number of unshared pages in kernel page tables:  %16lu\n",
+		   found);
+	seq_printf(m, "Number of pages accounted as unshared:           %16ld\n",
+		   atomic_long_read(&nr_shared));
+	return 0;
+}
+
+static int tdx_shared_memory_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, tdx_shared_memory_show, NULL);
+}
+
+static const struct file_operations tdx_shared_memory_fops = {
+	.open           = tdx_shared_memory_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = single_release,
+};
+
+static __init int debug_tdx_shared_memory(void)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+		return 0;
+
+	debugfs_create_file("tdx_shared_memory", S_IRUSR, arch_debugfs_dir,
+			    NULL, &tdx_shared_memory_fops);
+	return 0;
+}
+fs_initcall(debug_tdx_shared_memory);
+#endif