[12/12] PCI: hv: Enable PCI pass-thru devices in Confidential VMs

Message ID 1666288635-72591-13-git-send-email-mikelley@microsoft.com
State New
Headers
Series Add PCI pass-thru support to Hyper-V Confidential VMs |

Commit Message

Michael Kelley (LINUX) Oct. 20, 2022, 5:57 p.m. UTC
  For PCI pass-thru devices in a Confidential VM, Hyper-V requires
that PCI config space be accessed via hypercalls.  In normal VMs,
config space accesses are trapped to the Hyper-V host and emulated.
But in a confidential VM, the host can't access guest memory to
decode the instruction for emulation, so an explicit hypercall must
be used.

Update the PCI config space access functions to use the hypercalls
when such use is indicated by Hyper-V flags.  Also, set the flag to
allow the Hyper-V PCI driver to be loaded and used in a Confidential
VM (a.k.a., "Isolation VM").  The driver has previously been hardened
against a malicious Hyper-V host[1].

[1] https://lore.kernel.org/all/20220511223207.3386-2-parri.andrea@gmail.com/

Co-developed-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Michael Kelley <mikelley@microsoft.com>
---
 drivers/hv/channel_mgmt.c           |   2 +-
 drivers/pci/controller/pci-hyperv.c | 153 +++++++++++++++++++++---------------
 2 files changed, 92 insertions(+), 63 deletions(-)
  

Comments

Boqun Feng Nov. 7, 2022, 9:41 p.m. UTC | #1
On Thu, Oct 20, 2022 at 10:57:15AM -0700, Michael Kelley wrote:
> For PCI pass-thru devices in a Confidential VM, Hyper-V requires
> that PCI config space be accessed via hypercalls.  In normal VMs,
> config space accesses are trapped to the Hyper-V host and emulated.
> But in a confidential VM, the host can't access guest memory to
> decode the instruction for emulation, so an explicit hypercall must
> be used.
> 
> Update the PCI config space access functions to use the hypercalls
> when such use is indicated by Hyper-V flags.  Also, set the flag to
> allow the Hyper-V PCI driver to be loaded and used in a Confidential
> VM (a.k.a., "Isolation VM").  The driver has previously been hardened
> against a malicious Hyper-V host[1].
> 
> [1] https://lore.kernel.org/all/20220511223207.3386-2-parri.andrea@gmail.com/
> 
> Co-developed-by: Dexuan Cui <decui@microsoft.com>
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> Signed-off-by: Michael Kelley <mikelley@microsoft.com>
> ---
>  drivers/hv/channel_mgmt.c           |   2 +-
>  drivers/pci/controller/pci-hyperv.c | 153 +++++++++++++++++++++---------------
>  2 files changed, 92 insertions(+), 63 deletions(-)
> 
> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> index 5b12040..c0f9ac2 100644
> --- a/drivers/hv/channel_mgmt.c
> +++ b/drivers/hv/channel_mgmt.c
> @@ -67,7 +67,7 @@
>  	{ .dev_type = HV_PCIE,
>  	  HV_PCIE_GUID,
>  	  .perf_device = false,
> -	  .allowed_in_isolated = false,
> +	  .allowed_in_isolated = true,
>  	},
>  
>  	/* Synthetic Frame Buffer */
> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> index 02ebf3e..9873296 100644
> --- a/drivers/pci/controller/pci-hyperv.c
> +++ b/drivers/pci/controller/pci-hyperv.c
> @@ -514,6 +514,7 @@ struct hv_pcibus_device {
>  
>  	/* Highest slot of child device with resources allocated */
>  	int wslot_res_allocated;
> +	bool use_calls; /* Use hypercalls to access mmio cfg space */
>  
>  	/* hypercall arg, must not cross page boundary */
>  	struct hv_retarget_device_interrupt retarget_msi_interrupt_params;
> @@ -1134,8 +1135,9 @@ static void hv_pci_write_mmio(phys_addr_t gpa, int size, u32 val)
>  static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
>  				     int size, u32 *val)
>  {
> +	struct hv_pcibus_device *hbus = hpdev->hbus;
> +	int offset = where + CFG_PAGE_OFFSET;
>  	unsigned long flags;
> -	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
>  
>  	/*
>  	 * If the attempt is to read the IDs or the ROM BAR, simulate that.
> @@ -1163,56 +1165,74 @@ static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
>  		 */
>  		*val = 0;
>  	} else if (where + size <= CFG_PAGE_SIZE) {
> -		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
> -		/* Choose the function to be read. (See comment above) */
> -		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
> -		/* Make sure the function was chosen before we start reading. */
> -		mb();
> -		/* Read from that function's config space. */
> -		switch (size) {
> -		case 1:
> -			*val = readb(addr);
> -			break;
> -		case 2:
> -			*val = readw(addr);
> -			break;
> -		default:
> -			*val = readl(addr);
> -			break;
> +
> +		spin_lock_irqsave(&hbus->config_lock, flags);
> +		if (hbus->use_calls) {
> +			phys_addr_t addr = hbus->mem_config->start + offset;
> +
> +			hv_pci_write_mmio(hbus->mem_config->start, 4,
> +						hpdev->desc.win_slot.slot);
> +			hv_pci_read_mmio(addr, size, val);
> +		} else {
> +			void __iomem *addr = hbus->cfg_addr + offset;
> +
> +			/* Choose the function to be read. (See comment above) */
> +			writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
> +			/* Make sure the function was chosen before reading. */
> +			mb();
> +			/* Read from that function's config space. */
> +			switch (size) {
> +			case 1:
> +				*val = readb(addr);
> +				break;
> +			case 2:
> +				*val = readw(addr);
> +				break;
> +			default:
> +				*val = readl(addr);
> +				break;
> +			}

An mb() is missing here?

>  		}
> -		/*
> -		 * Make sure the read was done before we release the spinlock
> -		 * allowing consecutive reads/writes.
> -		 */
> -		mb();
> -		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
> +		spin_unlock_irqrestore(&hbus->config_lock, flags);
>  	} else {
> -		dev_err(&hpdev->hbus->hdev->device,
> +		dev_err(&hbus->hdev->device,
>  			"Attempt to read beyond a function's config space.\n");
>  	}
>  }
>  
>  static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
>  {
> +	struct hv_pcibus_device *hbus = hpdev->hbus;
> +	u32 val;
>  	u16 ret;
>  	unsigned long flags;
> -	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET +
> -			     PCI_VENDOR_ID;
>  
> -	spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
> +	spin_lock_irqsave(&hbus->config_lock, flags);
>  
> -	/* Choose the function to be read. (See comment above) */
> -	writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
> -	/* Make sure the function was chosen before we start reading. */
> -	mb();
> -	/* Read from that function's config space. */
> -	ret = readw(addr);
> -	/*
> -	 * mb() is not required here, because the spin_unlock_irqrestore()
> -	 * is a barrier.
> -	 */
> +	if (hbus->use_calls) {
> +		phys_addr_t addr = hbus->mem_config->start +
> +					 CFG_PAGE_OFFSET + PCI_VENDOR_ID;
> +
> +		hv_pci_write_mmio(hbus->mem_config->start, 4,
> +					hpdev->desc.win_slot.slot);
> +		hv_pci_read_mmio(addr, 2, &val);
> +		ret = val;  /* Truncates to 16 bits */
> +	} else {
> +		void __iomem *addr = hbus->cfg_addr + CFG_PAGE_OFFSET +
> +					     PCI_VENDOR_ID;
> +		/* Choose the function to be read. (See comment above) */
> +		writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
> +		/* Make sure the function was chosen before we start reading. */
> +		mb();
> +		/* Read from that function's config space. */
> +		ret = readw(addr);
> +		/*
> +		 * mb() is not required here, because the
> +		 * spin_unlock_irqrestore() is a barrier.
> +		 */
> +	}
>  
> -	spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
> +	spin_unlock_irqrestore(&hbus->config_lock, flags);
>  
>  	return ret;
>  }
> @@ -1227,38 +1247,45 @@ static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
>  static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
>  				      int size, u32 val)
>  {
> +	struct hv_pcibus_device *hbus = hpdev->hbus;
> +	int offset = where + CFG_PAGE_OFFSET;
>  	unsigned long flags;
> -	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
>  
>  	if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
>  	    where + size <= PCI_CAPABILITY_LIST) {
>  		/* SSIDs and ROM BARs are read-only */
>  	} else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
> -		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
> -		/* Choose the function to be written. (See comment above) */
> -		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
> -		/* Make sure the function was chosen before we start writing. */
> -		wmb();
> -		/* Write to that function's config space. */
> -		switch (size) {
> -		case 1:
> -			writeb(val, addr);
> -			break;
> -		case 2:
> -			writew(val, addr);
> -			break;
> -		default:
> -			writel(val, addr);
> -			break;
> +		spin_lock_irqsave(&hbus->config_lock, flags);
> +
> +		if (hbus->use_calls) {
> +			phys_addr_t addr = hbus->mem_config->start + offset;
> +
> +			hv_pci_write_mmio(hbus->mem_config->start, 4,
> +						hpdev->desc.win_slot.slot);
> +			hv_pci_write_mmio(addr, size, val);
> +		} else {
> +			void __iomem *addr = hbus->cfg_addr + offset;
> +
> +			/* Choose the function to write. (See comment above) */
> +			writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
> +			/* Make sure the function was chosen before writing. */
> +			wmb();
> +			/* Write to that function's config space. */
> +			switch (size) {
> +			case 1:
> +				writeb(val, addr);
> +				break;
> +			case 2:
> +				writew(val, addr);
> +				break;
> +			default:
> +				writel(val, addr);
> +				break;
> +			}

Ditto, an mb() is needed here to align with the old code.

With these fixed, feel free to add

Reviewed-by: Boqun Feng <boqun.feng@gmail.com>

Regards,
BOqun

>  		}
> -		/*
> -		 * Make sure the write was done before we release the spinlock
> -		 * allowing consecutive reads/writes.
> -		 */
> -		mb();
> -		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
> +		spin_unlock_irqrestore(&hbus->config_lock, flags);
>  	} else {
> -		dev_err(&hpdev->hbus->hdev->device,
> +		dev_err(&hbus->hdev->device,
>  			"Attempt to write beyond a function's config space.\n");
>  	}
>  }
> @@ -3568,6 +3595,7 @@ static int hv_pci_probe(struct hv_device *hdev,
>  	hbus->bridge->domain_nr = dom;
>  #ifdef CONFIG_X86
>  	hbus->sysdata.domain = dom;
> +	hbus->use_calls = !!(ms_hyperv.hints & HV_X64_USE_MMIO_HYPERCALLS);
>  #elif defined(CONFIG_ARM64)
>  	/*
>  	 * Set the PCI bus parent to be the corresponding VMbus
> @@ -3577,6 +3605,7 @@ static int hv_pci_probe(struct hv_device *hdev,
>  	 * information to devices created on the bus.
>  	 */
>  	hbus->sysdata.parent = hdev->device.parent;
> +	hbus->use_calls = false;
>  #endif
>  
>  	hbus->hdev = hdev;
> -- 
> 1.8.3.1
> 
>
  

Patch

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 5b12040..c0f9ac2 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -67,7 +67,7 @@ 
 	{ .dev_type = HV_PCIE,
 	  HV_PCIE_GUID,
 	  .perf_device = false,
-	  .allowed_in_isolated = false,
+	  .allowed_in_isolated = true,
 	},
 
 	/* Synthetic Frame Buffer */
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 02ebf3e..9873296 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -514,6 +514,7 @@  struct hv_pcibus_device {
 
 	/* Highest slot of child device with resources allocated */
 	int wslot_res_allocated;
+	bool use_calls; /* Use hypercalls to access mmio cfg space */
 
 	/* hypercall arg, must not cross page boundary */
 	struct hv_retarget_device_interrupt retarget_msi_interrupt_params;
@@ -1134,8 +1135,9 @@  static void hv_pci_write_mmio(phys_addr_t gpa, int size, u32 val)
 static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
 				     int size, u32 *val)
 {
+	struct hv_pcibus_device *hbus = hpdev->hbus;
+	int offset = where + CFG_PAGE_OFFSET;
 	unsigned long flags;
-	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
 
 	/*
 	 * If the attempt is to read the IDs or the ROM BAR, simulate that.
@@ -1163,56 +1165,74 @@  static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
 		 */
 		*val = 0;
 	} else if (where + size <= CFG_PAGE_SIZE) {
-		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
-		/* Choose the function to be read. (See comment above) */
-		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
-		/* Make sure the function was chosen before we start reading. */
-		mb();
-		/* Read from that function's config space. */
-		switch (size) {
-		case 1:
-			*val = readb(addr);
-			break;
-		case 2:
-			*val = readw(addr);
-			break;
-		default:
-			*val = readl(addr);
-			break;
+
+		spin_lock_irqsave(&hbus->config_lock, flags);
+		if (hbus->use_calls) {
+			phys_addr_t addr = hbus->mem_config->start + offset;
+
+			hv_pci_write_mmio(hbus->mem_config->start, 4,
+						hpdev->desc.win_slot.slot);
+			hv_pci_read_mmio(addr, size, val);
+		} else {
+			void __iomem *addr = hbus->cfg_addr + offset;
+
+			/* Choose the function to be read. (See comment above) */
+			writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
+			/* Make sure the function was chosen before reading. */
+			mb();
+			/* Read from that function's config space. */
+			switch (size) {
+			case 1:
+				*val = readb(addr);
+				break;
+			case 2:
+				*val = readw(addr);
+				break;
+			default:
+				*val = readl(addr);
+				break;
+			}
 		}
-		/*
-		 * Make sure the read was done before we release the spinlock
-		 * allowing consecutive reads/writes.
-		 */
-		mb();
-		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
+		spin_unlock_irqrestore(&hbus->config_lock, flags);
 	} else {
-		dev_err(&hpdev->hbus->hdev->device,
+		dev_err(&hbus->hdev->device,
 			"Attempt to read beyond a function's config space.\n");
 	}
 }
 
 static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
 {
+	struct hv_pcibus_device *hbus = hpdev->hbus;
+	u32 val;
 	u16 ret;
 	unsigned long flags;
-	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET +
-			     PCI_VENDOR_ID;
 
-	spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
+	spin_lock_irqsave(&hbus->config_lock, flags);
 
-	/* Choose the function to be read. (See comment above) */
-	writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
-	/* Make sure the function was chosen before we start reading. */
-	mb();
-	/* Read from that function's config space. */
-	ret = readw(addr);
-	/*
-	 * mb() is not required here, because the spin_unlock_irqrestore()
-	 * is a barrier.
-	 */
+	if (hbus->use_calls) {
+		phys_addr_t addr = hbus->mem_config->start +
+					 CFG_PAGE_OFFSET + PCI_VENDOR_ID;
+
+		hv_pci_write_mmio(hbus->mem_config->start, 4,
+					hpdev->desc.win_slot.slot);
+		hv_pci_read_mmio(addr, 2, &val);
+		ret = val;  /* Truncates to 16 bits */
+	} else {
+		void __iomem *addr = hbus->cfg_addr + CFG_PAGE_OFFSET +
+					     PCI_VENDOR_ID;
+		/* Choose the function to be read. (See comment above) */
+		writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
+		/* Make sure the function was chosen before we start reading. */
+		mb();
+		/* Read from that function's config space. */
+		ret = readw(addr);
+		/*
+		 * mb() is not required here, because the
+		 * spin_unlock_irqrestore() is a barrier.
+		 */
+	}
 
-	spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
+	spin_unlock_irqrestore(&hbus->config_lock, flags);
 
 	return ret;
 }
@@ -1227,38 +1247,45 @@  static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
 static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
 				      int size, u32 val)
 {
+	struct hv_pcibus_device *hbus = hpdev->hbus;
+	int offset = where + CFG_PAGE_OFFSET;
 	unsigned long flags;
-	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
 
 	if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
 	    where + size <= PCI_CAPABILITY_LIST) {
 		/* SSIDs and ROM BARs are read-only */
 	} else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
-		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
-		/* Choose the function to be written. (See comment above) */
-		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
-		/* Make sure the function was chosen before we start writing. */
-		wmb();
-		/* Write to that function's config space. */
-		switch (size) {
-		case 1:
-			writeb(val, addr);
-			break;
-		case 2:
-			writew(val, addr);
-			break;
-		default:
-			writel(val, addr);
-			break;
+		spin_lock_irqsave(&hbus->config_lock, flags);
+
+		if (hbus->use_calls) {
+			phys_addr_t addr = hbus->mem_config->start + offset;
+
+			hv_pci_write_mmio(hbus->mem_config->start, 4,
+						hpdev->desc.win_slot.slot);
+			hv_pci_write_mmio(addr, size, val);
+		} else {
+			void __iomem *addr = hbus->cfg_addr + offset;
+
+			/* Choose the function to write. (See comment above) */
+			writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
+			/* Make sure the function was chosen before writing. */
+			wmb();
+			/* Write to that function's config space. */
+			switch (size) {
+			case 1:
+				writeb(val, addr);
+				break;
+			case 2:
+				writew(val, addr);
+				break;
+			default:
+				writel(val, addr);
+				break;
+			}
 		}
-		/*
-		 * Make sure the write was done before we release the spinlock
-		 * allowing consecutive reads/writes.
-		 */
-		mb();
-		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
+		spin_unlock_irqrestore(&hbus->config_lock, flags);
 	} else {
-		dev_err(&hpdev->hbus->hdev->device,
+		dev_err(&hbus->hdev->device,
 			"Attempt to write beyond a function's config space.\n");
 	}
 }
@@ -3568,6 +3595,7 @@  static int hv_pci_probe(struct hv_device *hdev,
 	hbus->bridge->domain_nr = dom;
 #ifdef CONFIG_X86
 	hbus->sysdata.domain = dom;
+	hbus->use_calls = !!(ms_hyperv.hints & HV_X64_USE_MMIO_HYPERCALLS);
 #elif defined(CONFIG_ARM64)
 	/*
 	 * Set the PCI bus parent to be the corresponding VMbus
@@ -3577,6 +3605,7 @@  static int hv_pci_probe(struct hv_device *hdev,
 	 * information to devices created on the bus.
 	 */
 	hbus->sysdata.parent = hdev->device.parent;
+	hbus->use_calls = false;
 #endif
 
 	hbus->hdev = hdev;