virtio_ring: validate used buffer length

Message ID 20230526063041.18359-1-jasowang@redhat.com
State New
Headers
Series virtio_ring: validate used buffer length |

Commit Message

Jason Wang May 26, 2023, 6:30 a.m. UTC
  This patch validate the used buffer length provided by the device
before trying to use it. This is done by remembering the in buffer
length in a dedicated array during virtqueue_add(), then we can fail
the virtqueue_get_buf() when we find the device is trying to give us a
used buffer length which is greater than we stored before.

This validation is disable by default via module parameter to unbreak
some existing devices since some legacy devices are known to report
buggy used length.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
Changes since V4:
- drop the flat for driver to suppress the check
- validation is disabled by default
- don't do validation for legacy device
- rebase and support virtqueue resize
---
 drivers/virtio/virtio_ring.c | 75 ++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
  

Comments

Michael S. Tsirkin May 28, 2023, 7:56 a.m. UTC | #1
On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> This patch validate

validates

> the used buffer length provided by the device
> before trying to use it.

before returning it to caller

> This is done by remembering the in buffer
> length in a dedicated array during virtqueue_add(), then we can fail
> the virtqueue_get_buf() when we find the device is trying to give us a
> used buffer length which is greater than we stored before.

than what we stored

> 
> This validation is disable

disabled

> by default via module parameter to unbreak
> some existing devices since some legacy devices are known to report
> buggy used length.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

First I'm not merging this without more data about
what is known to be broken and what is known to work well
in the commit log. And how exactly do things work if used length
is wrong?
Second what's wrong with dma_desc_extra that we already maintain?
Third motivation - it's part and parcel of the hardening effort yes?
I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before
we do more hardening. If it's irrevocably broken let's rip it out?


> ---
> Changes since V4:
> - drop the flat for driver to suppress the check
> - validation is disabled by default
> - don't do validation for legacy device
> - rebase and support virtqueue resize
> ---
>  drivers/virtio/virtio_ring.c | 75 ++++++++++++++++++++++++++++++++++++
>  1 file changed, 75 insertions(+)
> 
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 143f380baa1c..5b151605aaf8 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -15,6 +15,9 @@
>  #include <linux/spinlock.h>
>  #include <xen/xen.h>
>  
> +static bool force_used_validation = false;
> +module_param(force_used_validation, bool, 0444);
> +
>  #ifdef DEBUG
>  /* For development, we want to crash whenever the ring is screwed. */
>  #define BAD_RING(_vq, fmt, args...)				\
> @@ -105,6 +108,9 @@ struct vring_virtqueue_split {
>  	struct vring_desc_state_split *desc_state;
>  	struct vring_desc_extra *desc_extra;
>  
> +	/* Maximum in buffer length, NULL means no used validation */
> +	u32 *buflen;
> +
>  	/* DMA address and size information */
>  	dma_addr_t queue_dma_addr;
>  	size_t queue_size_in_bytes;
> @@ -145,6 +151,9 @@ struct vring_virtqueue_packed {
>  	struct vring_desc_state_packed *desc_state;
>  	struct vring_desc_extra *desc_extra;
>  
> +	/* Maximum in buffer length, NULL means no used validation */
> +	u32 *buflen;
> +
>  	/* DMA address and size information */
>  	dma_addr_t ring_dma_addr;
>  	dma_addr_t driver_event_dma_addr;
> @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
>  	unsigned int i, n, avail, descs_used, prev, err_idx;
>  	int head;
>  	bool indirect;
> +	u32 buflen = 0;
>  
>  	START_USE(vq);
>  
> @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
>  						     VRING_DESC_F_NEXT |
>  						     VRING_DESC_F_WRITE,
>  						     indirect);
> +			buflen += sg->length;
>  		}
>  	}
>  	/* Last one doesn't continue. */
> @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
>  	else
>  		vq->split.desc_state[head].indir_desc = ctx;
>  
> +	/* Store in buffer length if necessary */
> +	if (vq->split.buflen)
> +		vq->split.buflen[head] = buflen;
> +
>  	/* Put entry in available array (but don't update avail->idx until they
>  	 * do sync). */
>  	avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
> @@ -861,6 +876,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
>  		BAD_RING(vq, "id %u is not a head!\n", i);
>  		return NULL;
>  	}
> +	if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
> +		BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> +			*len, vq->split.buflen[i]);
> +		return NULL;
> +	}
>  
>  	/* detach_buf_split clears data, so grab it now. */
>  	ret = vq->split.desc_state[i].data;
> @@ -1085,10 +1105,25 @@ static void vring_free_split(struct vring_virtqueue_split *vring_split,
>  			 vring_split->queue_dma_addr,
>  			 dma_dev);
>  
> +	kfree(vring_split->buflen);
>  	kfree(vring_split->desc_state);
>  	kfree(vring_split->desc_extra);
>  }
>  
> +static bool vring_needs_used_validation(const struct virtio_device *vdev)
> +{
> +	/*
> +	 * Several legacy devices are known to produce buggy used
> +	 * length. In order to let driver work, we won't validate used
> +	 * buffer length in this case.
> +	 */
> +	if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> +		return false;
> +	if (force_used_validation)
> +		return true;
> +	return false;
> +}
> +
>  static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
>  				   struct virtio_device *vdev,
>  				   u32 num,
> @@ -1137,7 +1172,19 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
>  	vring_split->vring_align = vring_align;
>  	vring_split->may_reduce_num = may_reduce_num;
>  
> +	if (vring_needs_used_validation(vdev)) {
> +		vring_split->buflen =
> +			kmalloc_array(num, sizeof(*vring_split->buflen),
> +				      GFP_KERNEL);
> +		if (!vring_split->buflen)
> +			goto err_buflen;
> +	}
> +
>  	return 0;
> +
> +err_buflen:
> +	vring_free_split(vring_split, vdev, dma_dev);
> +	return -ENOMEM;
>  }
>  
>  static struct virtqueue *vring_create_virtqueue_split(
> @@ -1297,6 +1344,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
>  	unsigned int i, n, err_idx;
>  	u16 head, id;
>  	dma_addr_t addr;
> +	u32 buflen = 0;
>  
>  	head = vq->packed.next_avail_idx;
>  	desc = alloc_indirect_packed(total_sg, gfp);
> @@ -1325,6 +1373,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
>  			desc[i].addr = cpu_to_le64(addr);
>  			desc[i].len = cpu_to_le32(sg->length);
>  			i++;
> +			if (n >= out_sgs)
> +				buflen += sg->length;
>  		}
>  	}
>  
> @@ -1379,6 +1429,10 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
>  	vq->packed.desc_state[id].last = id;
>  	vq->packed.desc_state[id].premapped = premapped;
>  
> +	/* Store in buffer length if necessary */
> +	if (vq->packed.buflen)
> +		vq->packed.buflen[id] = buflen;
> +
>  	vq->num_added += 1;
>  
>  	pr_debug("Added buffer head %i to %p\n", head, vq);
> @@ -1416,6 +1470,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
>  	__le16 head_flags, flags;
>  	u16 head, id, prev, curr, avail_used_flags;
>  	int err;
> +	u32 buflen = 0;
>  
>  	START_USE(vq);
>  
> @@ -1498,6 +1553,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
>  					1 << VRING_PACKED_DESC_F_AVAIL |
>  					1 << VRING_PACKED_DESC_F_USED;
>  			}
> +			if (n >= out_sgs)
> +				buflen += sg->length;
>  		}
>  	}
>  
> @@ -1518,6 +1575,10 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
>  	vq->packed.desc_state[id].last = prev;
>  	vq->packed.desc_state[id].premapped = premapped;
>  
> +	/* Store in buffer length if necessary */
> +	if (vq->packed.buflen)
> +		vq->packed.buflen[id] = buflen;
> +
>  	/*
>  	 * A driver MUST NOT make the first descriptor in the list
>  	 * available before all subsequent descriptors comprising
> @@ -1718,6 +1779,11 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
>  		BAD_RING(vq, "id %u is not a head!\n", id);
>  		return NULL;
>  	}
> +	if (vq->packed.buflen && unlikely(*len > vq->packed.buflen[id])) {
> +		BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> +			*len, vq->packed.buflen[id]);
> +		return NULL;
> +	}
>  
>  	/* detach_buf_packed clears data, so grab it now. */
>  	ret = vq->packed.desc_state[id].data;
> @@ -1937,6 +2003,7 @@ static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
>  				 vring_packed->device_event_dma_addr,
>  				 dma_dev);
>  
> +	kfree(vring_packed->buflen);
>  	kfree(vring_packed->desc_state);
>  	kfree(vring_packed->desc_extra);
>  }
> @@ -1988,6 +2055,14 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
>  
>  	vring_packed->vring.num = num;
>  
> +	if (vring_needs_used_validation(vdev)) {
> +		vring_packed->buflen =
> +			kmalloc_array(num, sizeof(*vring_packed->buflen),
> +				      GFP_KERNEL);
> +		if (!vring_packed->buflen)
> +			goto err;
> +	}
> +
>  	return 0;
>  
>  err:
> -- 
> 2.25.1
  
Jason Wang May 29, 2023, 1:18 a.m. UTC | #2
On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > This patch validate
>
> validates
>
> > the used buffer length provided by the device
> > before trying to use it.
>
> before returning it to caller
>
> > This is done by remembering the in buffer
> > length in a dedicated array during virtqueue_add(), then we can fail
> > the virtqueue_get_buf() when we find the device is trying to give us a
> > used buffer length which is greater than we stored before.
>
> than what we stored
>
> >
> > This validation is disable
>
> disabled
>
> > by default via module parameter to unbreak
> > some existing devices since some legacy devices are known to report
> > buggy used length.
> >
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
>
> First I'm not merging this without more data about
> what is known to be broken and what is known to work well
> in the commit log. And how exactly do things work if used length
> is wrong?

Assuming the device is malicious, it would be very hard to answer.
Auditing and fuzzing won't cover every case. Instead of trying to seek
the answer, we can simply make sure the used in buffer length is
validated then we know we're fine or not.

> Second what's wrong with dma_desc_extra that we already maintain?
> Third motivation - it's part and parcel of the hardening effort yes?

They are different. dma_desc_extra is for a descriptor ring, but this
is for a used ring. Technically we can go back to iterate on the
descriptor ring for a legal used in buffer length. But it will have
worse performance.

> I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before
> we do more hardening. If it's irrevocably broken let's rip it out?

So the plan is

1) finish used ring validation (this had been proposed, merged and
reverted before notification hardening)
2) do notification hardening on top.

So let's leave it as is and I will do a rework after we finalize the
used ring validation.

Thanks

>
>
> > ---
> > Changes since V4:
> > - drop the flat for driver to suppress the check
> > - validation is disabled by default
> > - don't do validation for legacy device
> > - rebase and support virtqueue resize
> > ---
> >  drivers/virtio/virtio_ring.c | 75 ++++++++++++++++++++++++++++++++++++
> >  1 file changed, 75 insertions(+)
> >
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index 143f380baa1c..5b151605aaf8 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -15,6 +15,9 @@
> >  #include <linux/spinlock.h>
> >  #include <xen/xen.h>
> >
> > +static bool force_used_validation = false;
> > +module_param(force_used_validation, bool, 0444);
> > +
> >  #ifdef DEBUG
> >  /* For development, we want to crash whenever the ring is screwed. */
> >  #define BAD_RING(_vq, fmt, args...)                          \
> > @@ -105,6 +108,9 @@ struct vring_virtqueue_split {
> >       struct vring_desc_state_split *desc_state;
> >       struct vring_desc_extra *desc_extra;
> >
> > +     /* Maximum in buffer length, NULL means no used validation */
> > +     u32 *buflen;
> > +
> >       /* DMA address and size information */
> >       dma_addr_t queue_dma_addr;
> >       size_t queue_size_in_bytes;
> > @@ -145,6 +151,9 @@ struct vring_virtqueue_packed {
> >       struct vring_desc_state_packed *desc_state;
> >       struct vring_desc_extra *desc_extra;
> >
> > +     /* Maximum in buffer length, NULL means no used validation */
> > +     u32 *buflen;
> > +
> >       /* DMA address and size information */
> >       dma_addr_t ring_dma_addr;
> >       dma_addr_t driver_event_dma_addr;
> > @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> >       unsigned int i, n, avail, descs_used, prev, err_idx;
> >       int head;
> >       bool indirect;
> > +     u32 buflen = 0;
> >
> >       START_USE(vq);
> >
> > @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> >                                                    VRING_DESC_F_NEXT |
> >                                                    VRING_DESC_F_WRITE,
> >                                                    indirect);
> > +                     buflen += sg->length;
> >               }
> >       }
> >       /* Last one doesn't continue. */
> > @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> >       else
> >               vq->split.desc_state[head].indir_desc = ctx;
> >
> > +     /* Store in buffer length if necessary */
> > +     if (vq->split.buflen)
> > +             vq->split.buflen[head] = buflen;
> > +
> >       /* Put entry in available array (but don't update avail->idx until they
> >        * do sync). */
> >       avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
> > @@ -861,6 +876,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
> >               BAD_RING(vq, "id %u is not a head!\n", i);
> >               return NULL;
> >       }
> > +     if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
> > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > +                     *len, vq->split.buflen[i]);
> > +             return NULL;
> > +     }
> >
> >       /* detach_buf_split clears data, so grab it now. */
> >       ret = vq->split.desc_state[i].data;
> > @@ -1085,10 +1105,25 @@ static void vring_free_split(struct vring_virtqueue_split *vring_split,
> >                        vring_split->queue_dma_addr,
> >                        dma_dev);
> >
> > +     kfree(vring_split->buflen);
> >       kfree(vring_split->desc_state);
> >       kfree(vring_split->desc_extra);
> >  }
> >
> > +static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > +{
> > +     /*
> > +      * Several legacy devices are known to produce buggy used
> > +      * length. In order to let driver work, we won't validate used
> > +      * buffer length in this case.
> > +      */
> > +     if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > +             return false;
> > +     if (force_used_validation)
> > +             return true;
> > +     return false;
> > +}
> > +
> >  static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> >                                  struct virtio_device *vdev,
> >                                  u32 num,
> > @@ -1137,7 +1172,19 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> >       vring_split->vring_align = vring_align;
> >       vring_split->may_reduce_num = may_reduce_num;
> >
> > +     if (vring_needs_used_validation(vdev)) {
> > +             vring_split->buflen =
> > +                     kmalloc_array(num, sizeof(*vring_split->buflen),
> > +                                   GFP_KERNEL);
> > +             if (!vring_split->buflen)
> > +                     goto err_buflen;
> > +     }
> > +
> >       return 0;
> > +
> > +err_buflen:
> > +     vring_free_split(vring_split, vdev, dma_dev);
> > +     return -ENOMEM;
> >  }
> >
> >  static struct virtqueue *vring_create_virtqueue_split(
> > @@ -1297,6 +1344,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> >       unsigned int i, n, err_idx;
> >       u16 head, id;
> >       dma_addr_t addr;
> > +     u32 buflen = 0;
> >
> >       head = vq->packed.next_avail_idx;
> >       desc = alloc_indirect_packed(total_sg, gfp);
> > @@ -1325,6 +1373,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> >                       desc[i].addr = cpu_to_le64(addr);
> >                       desc[i].len = cpu_to_le32(sg->length);
> >                       i++;
> > +                     if (n >= out_sgs)
> > +                             buflen += sg->length;
> >               }
> >       }
> >
> > @@ -1379,6 +1429,10 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> >       vq->packed.desc_state[id].last = id;
> >       vq->packed.desc_state[id].premapped = premapped;
> >
> > +     /* Store in buffer length if necessary */
> > +     if (vq->packed.buflen)
> > +             vq->packed.buflen[id] = buflen;
> > +
> >       vq->num_added += 1;
> >
> >       pr_debug("Added buffer head %i to %p\n", head, vq);
> > @@ -1416,6 +1470,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> >       __le16 head_flags, flags;
> >       u16 head, id, prev, curr, avail_used_flags;
> >       int err;
> > +     u32 buflen = 0;
> >
> >       START_USE(vq);
> >
> > @@ -1498,6 +1553,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> >                                       1 << VRING_PACKED_DESC_F_AVAIL |
> >                                       1 << VRING_PACKED_DESC_F_USED;
> >                       }
> > +                     if (n >= out_sgs)
> > +                             buflen += sg->length;
> >               }
> >       }
> >
> > @@ -1518,6 +1575,10 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> >       vq->packed.desc_state[id].last = prev;
> >       vq->packed.desc_state[id].premapped = premapped;
> >
> > +     /* Store in buffer length if necessary */
> > +     if (vq->packed.buflen)
> > +             vq->packed.buflen[id] = buflen;
> > +
> >       /*
> >        * A driver MUST NOT make the first descriptor in the list
> >        * available before all subsequent descriptors comprising
> > @@ -1718,6 +1779,11 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> >               BAD_RING(vq, "id %u is not a head!\n", id);
> >               return NULL;
> >       }
> > +     if (vq->packed.buflen && unlikely(*len > vq->packed.buflen[id])) {
> > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > +                     *len, vq->packed.buflen[id]);
> > +             return NULL;
> > +     }
> >
> >       /* detach_buf_packed clears data, so grab it now. */
> >       ret = vq->packed.desc_state[id].data;
> > @@ -1937,6 +2003,7 @@ static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
> >                                vring_packed->device_event_dma_addr,
> >                                dma_dev);
> >
> > +     kfree(vring_packed->buflen);
> >       kfree(vring_packed->desc_state);
> >       kfree(vring_packed->desc_extra);
> >  }
> > @@ -1988,6 +2055,14 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
> >
> >       vring_packed->vring.num = num;
> >
> > +     if (vring_needs_used_validation(vdev)) {
> > +             vring_packed->buflen =
> > +                     kmalloc_array(num, sizeof(*vring_packed->buflen),
> > +                                   GFP_KERNEL);
> > +             if (!vring_packed->buflen)
> > +                     goto err;
> > +     }
> > +
> >       return 0;
> >
> >  err:
> > --
> > 2.25.1
>
  
Michael S. Tsirkin May 29, 2023, 10:03 a.m. UTC | #3
On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > This patch validate
> >
> > validates
> >
> > > the used buffer length provided by the device
> > > before trying to use it.
> >
> > before returning it to caller
> >
> > > This is done by remembering the in buffer
> > > length in a dedicated array during virtqueue_add(), then we can fail
> > > the virtqueue_get_buf() when we find the device is trying to give us a
> > > used buffer length which is greater than we stored before.
> >
> > than what we stored
> >
> > >
> > > This validation is disable
> >
> > disabled
> >
> > > by default via module parameter to unbreak
> > > some existing devices since some legacy devices are known to report
> > > buggy used length.
> > >
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> >
> > First I'm not merging this without more data about
> > what is known to be broken and what is known to work well
> > in the commit log. And how exactly do things work if used length
> > is wrong?
> 
> Assuming the device is malicious, it would be very hard to answer.
> Auditing and fuzzing won't cover every case. Instead of trying to seek
> the answer, we can simply make sure the used in buffer length is
> validated then we know we're fine or not.

To restate the question, you said above "some legacy devices are known
to report buggy used length". If they report buggy length then how
can things work?

> > Second what's wrong with dma_desc_extra that we already maintain?
> > Third motivation - it's part and parcel of the hardening effort yes?
> 
> They are different. dma_desc_extra is for a descriptor ring, but this
> is for a used ring. Technically we can go back to iterate on the
> descriptor ring for a legal used in buffer length. But it will have
> worse performance.

I don't really understand. We already iterate when we unmap -
all that is necessary is to subtract it from used length, if at
the end of the process it is >0 then we know used length is too
large.


> > I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before
> > we do more hardening. If it's irrevocably broken let's rip it out?
> 
> So the plan is
> 
> 1) finish used ring validation (this had been proposed, merged and
> reverted before notification hardening)
> 2) do notification hardening on top.
> 
> So let's leave it as is and I will do a rework after we finalize the
> used ring validation.
> 
> Thanks
> 
> >
> >
> > > ---
> > > Changes since V4:
> > > - drop the flat for driver to suppress the check
> > > - validation is disabled by default
> > > - don't do validation for legacy device
> > > - rebase and support virtqueue resize
> > > ---
> > >  drivers/virtio/virtio_ring.c | 75 ++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 75 insertions(+)
> > >
> > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > index 143f380baa1c..5b151605aaf8 100644
> > > --- a/drivers/virtio/virtio_ring.c
> > > +++ b/drivers/virtio/virtio_ring.c
> > > @@ -15,6 +15,9 @@
> > >  #include <linux/spinlock.h>
> > >  #include <xen/xen.h>
> > >
> > > +static bool force_used_validation = false;
> > > +module_param(force_used_validation, bool, 0444);
> > > +
> > >  #ifdef DEBUG
> > >  /* For development, we want to crash whenever the ring is screwed. */
> > >  #define BAD_RING(_vq, fmt, args...)                          \
> > > @@ -105,6 +108,9 @@ struct vring_virtqueue_split {
> > >       struct vring_desc_state_split *desc_state;
> > >       struct vring_desc_extra *desc_extra;
> > >
> > > +     /* Maximum in buffer length, NULL means no used validation */
> > > +     u32 *buflen;
> > > +
> > >       /* DMA address and size information */
> > >       dma_addr_t queue_dma_addr;
> > >       size_t queue_size_in_bytes;
> > > @@ -145,6 +151,9 @@ struct vring_virtqueue_packed {
> > >       struct vring_desc_state_packed *desc_state;
> > >       struct vring_desc_extra *desc_extra;
> > >
> > > +     /* Maximum in buffer length, NULL means no used validation */
> > > +     u32 *buflen;
> > > +
> > >       /* DMA address and size information */
> > >       dma_addr_t ring_dma_addr;
> > >       dma_addr_t driver_event_dma_addr;
> > > @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > >       unsigned int i, n, avail, descs_used, prev, err_idx;
> > >       int head;
> > >       bool indirect;
> > > +     u32 buflen = 0;
> > >
> > >       START_USE(vq);
> > >
> > > @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > >                                                    VRING_DESC_F_NEXT |
> > >                                                    VRING_DESC_F_WRITE,
> > >                                                    indirect);
> > > +                     buflen += sg->length;
> > >               }
> > >       }
> > >       /* Last one doesn't continue. */
> > > @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > >       else
> > >               vq->split.desc_state[head].indir_desc = ctx;
> > >
> > > +     /* Store in buffer length if necessary */
> > > +     if (vq->split.buflen)
> > > +             vq->split.buflen[head] = buflen;
> > > +
> > >       /* Put entry in available array (but don't update avail->idx until they
> > >        * do sync). */
> > >       avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
> > > @@ -861,6 +876,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
> > >               BAD_RING(vq, "id %u is not a head!\n", i);
> > >               return NULL;
> > >       }
> > > +     if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
> > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > +                     *len, vq->split.buflen[i]);
> > > +             return NULL;
> > > +     }
> > >
> > >       /* detach_buf_split clears data, so grab it now. */
> > >       ret = vq->split.desc_state[i].data;
> > > @@ -1085,10 +1105,25 @@ static void vring_free_split(struct vring_virtqueue_split *vring_split,
> > >                        vring_split->queue_dma_addr,
> > >                        dma_dev);
> > >
> > > +     kfree(vring_split->buflen);
> > >       kfree(vring_split->desc_state);
> > >       kfree(vring_split->desc_extra);
> > >  }
> > >
> > > +static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > +{
> > > +     /*
> > > +      * Several legacy devices are known to produce buggy used
> > > +      * length. In order to let driver work, we won't validate used
> > > +      * buffer length in this case.
> > > +      */
> > > +     if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > +             return false;
> > > +     if (force_used_validation)
> > > +             return true;
> > > +     return false;
> > > +}
> > > +
> > >  static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > >                                  struct virtio_device *vdev,
> > >                                  u32 num,
> > > @@ -1137,7 +1172,19 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > >       vring_split->vring_align = vring_align;
> > >       vring_split->may_reduce_num = may_reduce_num;
> > >
> > > +     if (vring_needs_used_validation(vdev)) {
> > > +             vring_split->buflen =
> > > +                     kmalloc_array(num, sizeof(*vring_split->buflen),
> > > +                                   GFP_KERNEL);
> > > +             if (!vring_split->buflen)
> > > +                     goto err_buflen;
> > > +     }
> > > +
> > >       return 0;
> > > +
> > > +err_buflen:
> > > +     vring_free_split(vring_split, vdev, dma_dev);
> > > +     return -ENOMEM;
> > >  }
> > >
> > >  static struct virtqueue *vring_create_virtqueue_split(
> > > @@ -1297,6 +1344,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > >       unsigned int i, n, err_idx;
> > >       u16 head, id;
> > >       dma_addr_t addr;
> > > +     u32 buflen = 0;
> > >
> > >       head = vq->packed.next_avail_idx;
> > >       desc = alloc_indirect_packed(total_sg, gfp);
> > > @@ -1325,6 +1373,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > >                       desc[i].addr = cpu_to_le64(addr);
> > >                       desc[i].len = cpu_to_le32(sg->length);
> > >                       i++;
> > > +                     if (n >= out_sgs)
> > > +                             buflen += sg->length;
> > >               }
> > >       }
> > >
> > > @@ -1379,6 +1429,10 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > >       vq->packed.desc_state[id].last = id;
> > >       vq->packed.desc_state[id].premapped = premapped;
> > >
> > > +     /* Store in buffer length if necessary */
> > > +     if (vq->packed.buflen)
> > > +             vq->packed.buflen[id] = buflen;
> > > +
> > >       vq->num_added += 1;
> > >
> > >       pr_debug("Added buffer head %i to %p\n", head, vq);
> > > @@ -1416,6 +1470,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > >       __le16 head_flags, flags;
> > >       u16 head, id, prev, curr, avail_used_flags;
> > >       int err;
> > > +     u32 buflen = 0;
> > >
> > >       START_USE(vq);
> > >
> > > @@ -1498,6 +1553,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > >                                       1 << VRING_PACKED_DESC_F_AVAIL |
> > >                                       1 << VRING_PACKED_DESC_F_USED;
> > >                       }
> > > +                     if (n >= out_sgs)
> > > +                             buflen += sg->length;
> > >               }
> > >       }
> > >
> > > @@ -1518,6 +1575,10 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > >       vq->packed.desc_state[id].last = prev;
> > >       vq->packed.desc_state[id].premapped = premapped;
> > >
> > > +     /* Store in buffer length if necessary */
> > > +     if (vq->packed.buflen)
> > > +             vq->packed.buflen[id] = buflen;
> > > +
> > >       /*
> > >        * A driver MUST NOT make the first descriptor in the list
> > >        * available before all subsequent descriptors comprising
> > > @@ -1718,6 +1779,11 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > >               BAD_RING(vq, "id %u is not a head!\n", id);
> > >               return NULL;
> > >       }
> > > +     if (vq->packed.buflen && unlikely(*len > vq->packed.buflen[id])) {
> > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > +                     *len, vq->packed.buflen[id]);
> > > +             return NULL;
> > > +     }
> > >
> > >       /* detach_buf_packed clears data, so grab it now. */
> > >       ret = vq->packed.desc_state[id].data;
> > > @@ -1937,6 +2003,7 @@ static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
> > >                                vring_packed->device_event_dma_addr,
> > >                                dma_dev);
> > >
> > > +     kfree(vring_packed->buflen);
> > >       kfree(vring_packed->desc_state);
> > >       kfree(vring_packed->desc_extra);
> > >  }
> > > @@ -1988,6 +2055,14 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
> > >
> > >       vring_packed->vring.num = num;
> > >
> > > +     if (vring_needs_used_validation(vdev)) {
> > > +             vring_packed->buflen =
> > > +                     kmalloc_array(num, sizeof(*vring_packed->buflen),
> > > +                                   GFP_KERNEL);
> > > +             if (!vring_packed->buflen)
> > > +                     goto err;
> > > +     }
> > > +
> > >       return 0;
> > >
> > >  err:
> > > --
> > > 2.25.1
> >
  
Jason Wang May 31, 2023, 1:05 a.m. UTC | #4
On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > > This patch validate
> > >
> > > validates
> > >
> > > > the used buffer length provided by the device
> > > > before trying to use it.
> > >
> > > before returning it to caller
> > >
> > > > This is done by remembering the in buffer
> > > > length in a dedicated array during virtqueue_add(), then we can fail
> > > > the virtqueue_get_buf() when we find the device is trying to give us a
> > > > used buffer length which is greater than we stored before.
> > >
> > > than what we stored
> > >
> > > >
> > > > This validation is disable
> > >
> > > disabled
> > >
> > > > by default via module parameter to unbreak
> > > > some existing devices since some legacy devices are known to report
> > > > buggy used length.
> > > >
> > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > >
> > > First I'm not merging this without more data about
> > > what is known to be broken and what is known to work well
> > > in the commit log. And how exactly do things work if used length
> > > is wrong?
> >
> > Assuming the device is malicious, it would be very hard to answer.
> > Auditing and fuzzing won't cover every case. Instead of trying to seek
> > the answer, we can simply make sure the used in buffer length is
> > validated then we know we're fine or not.
>
> To restate the question, you said above "some legacy devices are known
> to report buggy used length". If they report buggy length then how
> can things work?

The validation is disabled for legacy device (as stated in the changelog):

static bool vring_needs_used_validation(const struct virtio_device *vdev)
{
        /*
         * Several legacy devices are known to produce buggy used
         * length. In order to let driver work, we won't validate used
         * buffer length in this case.
         */
        if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
                return false;
        if (force_used_validation)
                return true;
        return false;
}

This seems to be what we've agreed in last version:

https://lore.kernel.org/all/CANLsYkxfhamUU0bb4j7y6N4_G9odKxLCjXxgXEx4SJ6_Kf+M2Q@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56

Thanks

>
> > > Second what's wrong with dma_desc_extra that we already maintain?
> > > Third motivation - it's part and parcel of the hardening effort yes?
> >
> > They are different. dma_desc_extra is for a descriptor ring, but this
> > is for a used ring. Technically we can go back to iterate on the
> > descriptor ring for a legal used in buffer length. But it will have
> > worse performance.
>
> I don't really understand. We already iterate when we unmap -
> all that is necessary is to subtract it from used length, if at
> the end of the process it is >0 then we know used length is too
> large.

Yes, but it is the job that is done in the driver level not the virtio
core. Validation in virtio core is still necessary since they're
working at different levels and it's hard to force the validation in
all drivers by codes. Last version introduces a
suppress_driver_validation to allow the driver to suppress the core
validation which seems not good, we need a way to force the
virtio_ring code to do validation before. Or such stuff could be added
on top since the validation is by default anyway.

Thanks

>
>
> > > I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before
> > > we do more hardening. If it's irrevocably broken let's rip it out?
> >
> > So the plan is
> >
> > 1) finish used ring validation (this had been proposed, merged and
> > reverted before notification hardening)
> > 2) do notification hardening on top.
> >
> > So let's leave it as is and I will do a rework after we finalize the
> > used ring validation.
> >
> > Thanks
> >
> > >
> > >
> > > > ---
> > > > Changes since V4:
> > > > - drop the flat for driver to suppress the check
> > > > - validation is disabled by default
> > > > - don't do validation for legacy device
> > > > - rebase and support virtqueue resize
> > > > ---
> > > >  drivers/virtio/virtio_ring.c | 75 ++++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 75 insertions(+)
> > > >
> > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > index 143f380baa1c..5b151605aaf8 100644
> > > > --- a/drivers/virtio/virtio_ring.c
> > > > +++ b/drivers/virtio/virtio_ring.c
> > > > @@ -15,6 +15,9 @@
> > > >  #include <linux/spinlock.h>
> > > >  #include <xen/xen.h>
> > > >
> > > > +static bool force_used_validation = false;
> > > > +module_param(force_used_validation, bool, 0444);
> > > > +
> > > >  #ifdef DEBUG
> > > >  /* For development, we want to crash whenever the ring is screwed. */
> > > >  #define BAD_RING(_vq, fmt, args...)                          \
> > > > @@ -105,6 +108,9 @@ struct vring_virtqueue_split {
> > > >       struct vring_desc_state_split *desc_state;
> > > >       struct vring_desc_extra *desc_extra;
> > > >
> > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > +     u32 *buflen;
> > > > +
> > > >       /* DMA address and size information */
> > > >       dma_addr_t queue_dma_addr;
> > > >       size_t queue_size_in_bytes;
> > > > @@ -145,6 +151,9 @@ struct vring_virtqueue_packed {
> > > >       struct vring_desc_state_packed *desc_state;
> > > >       struct vring_desc_extra *desc_extra;
> > > >
> > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > +     u32 *buflen;
> > > > +
> > > >       /* DMA address and size information */
> > > >       dma_addr_t ring_dma_addr;
> > > >       dma_addr_t driver_event_dma_addr;
> > > > @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > >       unsigned int i, n, avail, descs_used, prev, err_idx;
> > > >       int head;
> > > >       bool indirect;
> > > > +     u32 buflen = 0;
> > > >
> > > >       START_USE(vq);
> > > >
> > > > @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > >                                                    VRING_DESC_F_NEXT |
> > > >                                                    VRING_DESC_F_WRITE,
> > > >                                                    indirect);
> > > > +                     buflen += sg->length;
> > > >               }
> > > >       }
> > > >       /* Last one doesn't continue. */
> > > > @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > >       else
> > > >               vq->split.desc_state[head].indir_desc = ctx;
> > > >
> > > > +     /* Store in buffer length if necessary */
> > > > +     if (vq->split.buflen)
> > > > +             vq->split.buflen[head] = buflen;
> > > > +
> > > >       /* Put entry in available array (but don't update avail->idx until they
> > > >        * do sync). */
> > > >       avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
> > > > @@ -861,6 +876,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
> > > >               BAD_RING(vq, "id %u is not a head!\n", i);
> > > >               return NULL;
> > > >       }
> > > > +     if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
> > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > +                     *len, vq->split.buflen[i]);
> > > > +             return NULL;
> > > > +     }
> > > >
> > > >       /* detach_buf_split clears data, so grab it now. */
> > > >       ret = vq->split.desc_state[i].data;
> > > > @@ -1085,10 +1105,25 @@ static void vring_free_split(struct vring_virtqueue_split *vring_split,
> > > >                        vring_split->queue_dma_addr,
> > > >                        dma_dev);
> > > >
> > > > +     kfree(vring_split->buflen);
> > > >       kfree(vring_split->desc_state);
> > > >       kfree(vring_split->desc_extra);
> > > >  }
> > > >
> > > > +static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > > +{
> > > > +     /*
> > > > +      * Several legacy devices are known to produce buggy used
> > > > +      * length. In order to let driver work, we won't validate used
> > > > +      * buffer length in this case.
> > > > +      */
> > > > +     if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > > +             return false;
> > > > +     if (force_used_validation)
> > > > +             return true;
> > > > +     return false;
> > > > +}
> > > > +
> > > >  static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > >                                  struct virtio_device *vdev,
> > > >                                  u32 num,
> > > > @@ -1137,7 +1172,19 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > >       vring_split->vring_align = vring_align;
> > > >       vring_split->may_reduce_num = may_reduce_num;
> > > >
> > > > +     if (vring_needs_used_validation(vdev)) {
> > > > +             vring_split->buflen =
> > > > +                     kmalloc_array(num, sizeof(*vring_split->buflen),
> > > > +                                   GFP_KERNEL);
> > > > +             if (!vring_split->buflen)
> > > > +                     goto err_buflen;
> > > > +     }
> > > > +
> > > >       return 0;
> > > > +
> > > > +err_buflen:
> > > > +     vring_free_split(vring_split, vdev, dma_dev);
> > > > +     return -ENOMEM;
> > > >  }
> > > >
> > > >  static struct virtqueue *vring_create_virtqueue_split(
> > > > @@ -1297,6 +1344,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > >       unsigned int i, n, err_idx;
> > > >       u16 head, id;
> > > >       dma_addr_t addr;
> > > > +     u32 buflen = 0;
> > > >
> > > >       head = vq->packed.next_avail_idx;
> > > >       desc = alloc_indirect_packed(total_sg, gfp);
> > > > @@ -1325,6 +1373,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > >                       desc[i].addr = cpu_to_le64(addr);
> > > >                       desc[i].len = cpu_to_le32(sg->length);
> > > >                       i++;
> > > > +                     if (n >= out_sgs)
> > > > +                             buflen += sg->length;
> > > >               }
> > > >       }
> > > >
> > > > @@ -1379,6 +1429,10 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > >       vq->packed.desc_state[id].last = id;
> > > >       vq->packed.desc_state[id].premapped = premapped;
> > > >
> > > > +     /* Store in buffer length if necessary */
> > > > +     if (vq->packed.buflen)
> > > > +             vq->packed.buflen[id] = buflen;
> > > > +
> > > >       vq->num_added += 1;
> > > >
> > > >       pr_debug("Added buffer head %i to %p\n", head, vq);
> > > > @@ -1416,6 +1470,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > >       __le16 head_flags, flags;
> > > >       u16 head, id, prev, curr, avail_used_flags;
> > > >       int err;
> > > > +     u32 buflen = 0;
> > > >
> > > >       START_USE(vq);
> > > >
> > > > @@ -1498,6 +1553,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > >                                       1 << VRING_PACKED_DESC_F_AVAIL |
> > > >                                       1 << VRING_PACKED_DESC_F_USED;
> > > >                       }
> > > > +                     if (n >= out_sgs)
> > > > +                             buflen += sg->length;
> > > >               }
> > > >       }
> > > >
> > > > @@ -1518,6 +1575,10 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > >       vq->packed.desc_state[id].last = prev;
> > > >       vq->packed.desc_state[id].premapped = premapped;
> > > >
> > > > +     /* Store in buffer length if necessary */
> > > > +     if (vq->packed.buflen)
> > > > +             vq->packed.buflen[id] = buflen;
> > > > +
> > > >       /*
> > > >        * A driver MUST NOT make the first descriptor in the list
> > > >        * available before all subsequent descriptors comprising
> > > > @@ -1718,6 +1779,11 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > > >               BAD_RING(vq, "id %u is not a head!\n", id);
> > > >               return NULL;
> > > >       }
> > > > +     if (vq->packed.buflen && unlikely(*len > vq->packed.buflen[id])) {
> > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > +                     *len, vq->packed.buflen[id]);
> > > > +             return NULL;
> > > > +     }
> > > >
> > > >       /* detach_buf_packed clears data, so grab it now. */
> > > >       ret = vq->packed.desc_state[id].data;
> > > > @@ -1937,6 +2003,7 @@ static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
> > > >                                vring_packed->device_event_dma_addr,
> > > >                                dma_dev);
> > > >
> > > > +     kfree(vring_packed->buflen);
> > > >       kfree(vring_packed->desc_state);
> > > >       kfree(vring_packed->desc_extra);
> > > >  }
> > > > @@ -1988,6 +2055,14 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
> > > >
> > > >       vring_packed->vring.num = num;
> > > >
> > > > +     if (vring_needs_used_validation(vdev)) {
> > > > +             vring_packed->buflen =
> > > > +                     kmalloc_array(num, sizeof(*vring_packed->buflen),
> > > > +                                   GFP_KERNEL);
> > > > +             if (!vring_packed->buflen)
> > > > +                     goto err;
> > > > +     }
> > > > +
> > > >       return 0;
> > > >
> > > >  err:
> > > > --
> > > > 2.25.1
> > >
>
  
Michael S. Tsirkin May 31, 2023, 5:50 a.m. UTC | #5
On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote:
> On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > > > This patch validate
> > > >
> > > > validates
> > > >
> > > > > the used buffer length provided by the device
> > > > > before trying to use it.
> > > >
> > > > before returning it to caller
> > > >
> > > > > This is done by remembering the in buffer
> > > > > length in a dedicated array during virtqueue_add(), then we can fail
> > > > > the virtqueue_get_buf() when we find the device is trying to give us a
> > > > > used buffer length which is greater than we stored before.
> > > >
> > > > than what we stored
> > > >
> > > > >
> > > > > This validation is disable
> > > >
> > > > disabled
> > > >
> > > > > by default via module parameter to unbreak
> > > > > some existing devices since some legacy devices are known to report
> > > > > buggy used length.
> > > > >
> > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > >
> > > > First I'm not merging this without more data about
> > > > what is known to be broken and what is known to work well
> > > > in the commit log. And how exactly do things work if used length
> > > > is wrong?
> > >
> > > Assuming the device is malicious, it would be very hard to answer.
> > > Auditing and fuzzing won't cover every case. Instead of trying to seek
> > > the answer, we can simply make sure the used in buffer length is
> > > validated then we know we're fine or not.
> >
> > To restate the question, you said above "some legacy devices are known
> > to report buggy used length". If they report buggy length then how
> > can things work?
> 
> The validation is disabled for legacy device (as stated in the changelog):
> 
> static bool vring_needs_used_validation(const struct virtio_device *vdev)
> {
>         /*
>          * Several legacy devices are known to produce buggy used
>          * length. In order to let driver work, we won't validate used
>          * buffer length in this case.
>          */
>         if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
>                 return false;
>         if (force_used_validation)
>                 return true;
>         return false;
> }
> 
> This seems to be what we've agreed in last version:
> 
> https://lore.kernel.org/all/CANLsYkxfhamUU0bb4j7y6N4_G9odKxLCjXxgXEx4SJ6_Kf+M2Q@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56
> 
> Thanks
>

I don't get it. You wrote:

	This validation is disable
	by default via module parameter to unbreak
	some existing devices since some legacy devices are known to report
	buggy used length.

which devices? why do you need a module parameter?

 
> >
> > > > Second what's wrong with dma_desc_extra that we already maintain?
> > > > Third motivation - it's part and parcel of the hardening effort yes?
> > >
> > > They are different. dma_desc_extra is for a descriptor ring, but this
> > > is for a used ring. Technically we can go back to iterate on the
> > > descriptor ring for a legal used in buffer length. But it will have
> > > worse performance.
> >
> > I don't really understand. We already iterate when we unmap -
> > all that is necessary is to subtract it from used length, if at
> > the end of the process it is >0 then we know used length is too
> > large.
> 
> Yes, but it is the job that is done in the driver level not the virtio
> core.

What job? unmap is done in detach_buf_split and detach_buf_packed respectively.
vring_desc_extra isn't even visible outside drivers/virtio/virtio_ring.c

For drivers that do unmap at driver level - I guess they can do
validation there too.

> Validation in virtio core is still necessary since they're
> working at different levels and it's hard to force the validation in
> all drivers by codes. Last version introduces a
> suppress_driver_validation to allow the driver to suppress the core
> validation which seems not good, we need a way to force the
> virtio_ring code to do validation before.

Why do we? If driver validates length virtio_ring does not need to
validate.  If driver does not use length virtio_ring does not need to
validate. core can provide this service for the gazillion non
performance critical drivers that just want to keep things simple,
but the 4-5 critical ones can do their own validation if they want to.

> Or such stuff could be added
> on top since the validation is by default anyway.
> 
> Thanks



> >
> >
> > > > I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before
> > > > we do more hardening. If it's irrevocably broken let's rip it out?
> > >
> > > So the plan is
> > >
> > > 1) finish used ring validation (this had been proposed, merged and
> > > reverted before notification hardening)
> > > 2) do notification hardening on top.
> > >
> > > So let's leave it as is and I will do a rework after we finalize the
> > > used ring validation.
> > >
> > > Thanks
> > >
> > > >
> > > >
> > > > > ---
> > > > > Changes since V4:
> > > > > - drop the flat for driver to suppress the check
> > > > > - validation is disabled by default
> > > > > - don't do validation for legacy device
> > > > > - rebase and support virtqueue resize
> > > > > ---
> > > > >  drivers/virtio/virtio_ring.c | 75 ++++++++++++++++++++++++++++++++++++
> > > > >  1 file changed, 75 insertions(+)
> > > > >
> > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > index 143f380baa1c..5b151605aaf8 100644
> > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > @@ -15,6 +15,9 @@
> > > > >  #include <linux/spinlock.h>
> > > > >  #include <xen/xen.h>
> > > > >
> > > > > +static bool force_used_validation = false;
> > > > > +module_param(force_used_validation, bool, 0444);
> > > > > +
> > > > >  #ifdef DEBUG
> > > > >  /* For development, we want to crash whenever the ring is screwed. */
> > > > >  #define BAD_RING(_vq, fmt, args...)                          \
> > > > > @@ -105,6 +108,9 @@ struct vring_virtqueue_split {
> > > > >       struct vring_desc_state_split *desc_state;
> > > > >       struct vring_desc_extra *desc_extra;
> > > > >
> > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > +     u32 *buflen;
> > > > > +
> > > > >       /* DMA address and size information */
> > > > >       dma_addr_t queue_dma_addr;
> > > > >       size_t queue_size_in_bytes;
> > > > > @@ -145,6 +151,9 @@ struct vring_virtqueue_packed {
> > > > >       struct vring_desc_state_packed *desc_state;
> > > > >       struct vring_desc_extra *desc_extra;
> > > > >
> > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > +     u32 *buflen;
> > > > > +
> > > > >       /* DMA address and size information */
> > > > >       dma_addr_t ring_dma_addr;
> > > > >       dma_addr_t driver_event_dma_addr;
> > > > > @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > >       unsigned int i, n, avail, descs_used, prev, err_idx;
> > > > >       int head;
> > > > >       bool indirect;
> > > > > +     u32 buflen = 0;
> > > > >
> > > > >       START_USE(vq);
> > > > >
> > > > > @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > >                                                    VRING_DESC_F_NEXT |
> > > > >                                                    VRING_DESC_F_WRITE,
> > > > >                                                    indirect);
> > > > > +                     buflen += sg->length;
> > > > >               }
> > > > >       }
> > > > >       /* Last one doesn't continue. */
> > > > > @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > >       else
> > > > >               vq->split.desc_state[head].indir_desc = ctx;
> > > > >
> > > > > +     /* Store in buffer length if necessary */
> > > > > +     if (vq->split.buflen)
> > > > > +             vq->split.buflen[head] = buflen;
> > > > > +
> > > > >       /* Put entry in available array (but don't update avail->idx until they
> > > > >        * do sync). */
> > > > >       avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
> > > > > @@ -861,6 +876,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
> > > > >               BAD_RING(vq, "id %u is not a head!\n", i);
> > > > >               return NULL;
> > > > >       }
> > > > > +     if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
> > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > +                     *len, vq->split.buflen[i]);
> > > > > +             return NULL;
> > > > > +     }
> > > > >
> > > > >       /* detach_buf_split clears data, so grab it now. */
> > > > >       ret = vq->split.desc_state[i].data;
> > > > > @@ -1085,10 +1105,25 @@ static void vring_free_split(struct vring_virtqueue_split *vring_split,
> > > > >                        vring_split->queue_dma_addr,
> > > > >                        dma_dev);
> > > > >
> > > > > +     kfree(vring_split->buflen);
> > > > >       kfree(vring_split->desc_state);
> > > > >       kfree(vring_split->desc_extra);
> > > > >  }
> > > > >
> > > > > +static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > > > +{
> > > > > +     /*
> > > > > +      * Several legacy devices are known to produce buggy used
> > > > > +      * length. In order to let driver work, we won't validate used
> > > > > +      * buffer length in this case.
> > > > > +      */
> > > > > +     if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > > > +             return false;
> > > > > +     if (force_used_validation)
> > > > > +             return true;
> > > > > +     return false;
> > > > > +}
> > > > > +
> > > > >  static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > >                                  struct virtio_device *vdev,
> > > > >                                  u32 num,
> > > > > @@ -1137,7 +1172,19 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > >       vring_split->vring_align = vring_align;
> > > > >       vring_split->may_reduce_num = may_reduce_num;
> > > > >
> > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > +             vring_split->buflen =
> > > > > +                     kmalloc_array(num, sizeof(*vring_split->buflen),
> > > > > +                                   GFP_KERNEL);
> > > > > +             if (!vring_split->buflen)
> > > > > +                     goto err_buflen;
> > > > > +     }
> > > > > +
> > > > >       return 0;
> > > > > +
> > > > > +err_buflen:
> > > > > +     vring_free_split(vring_split, vdev, dma_dev);
> > > > > +     return -ENOMEM;
> > > > >  }
> > > > >
> > > > >  static struct virtqueue *vring_create_virtqueue_split(
> > > > > @@ -1297,6 +1344,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > >       unsigned int i, n, err_idx;
> > > > >       u16 head, id;
> > > > >       dma_addr_t addr;
> > > > > +     u32 buflen = 0;
> > > > >
> > > > >       head = vq->packed.next_avail_idx;
> > > > >       desc = alloc_indirect_packed(total_sg, gfp);
> > > > > @@ -1325,6 +1373,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > >                       desc[i].addr = cpu_to_le64(addr);
> > > > >                       desc[i].len = cpu_to_le32(sg->length);
> > > > >                       i++;
> > > > > +                     if (n >= out_sgs)
> > > > > +                             buflen += sg->length;
> > > > >               }
> > > > >       }
> > > > >
> > > > > @@ -1379,6 +1429,10 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > >       vq->packed.desc_state[id].last = id;
> > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > >
> > > > > +     /* Store in buffer length if necessary */
> > > > > +     if (vq->packed.buflen)
> > > > > +             vq->packed.buflen[id] = buflen;
> > > > > +
> > > > >       vq->num_added += 1;
> > > > >
> > > > >       pr_debug("Added buffer head %i to %p\n", head, vq);
> > > > > @@ -1416,6 +1470,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > >       __le16 head_flags, flags;
> > > > >       u16 head, id, prev, curr, avail_used_flags;
> > > > >       int err;
> > > > > +     u32 buflen = 0;
> > > > >
> > > > >       START_USE(vq);
> > > > >
> > > > > @@ -1498,6 +1553,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > >                                       1 << VRING_PACKED_DESC_F_AVAIL |
> > > > >                                       1 << VRING_PACKED_DESC_F_USED;
> > > > >                       }
> > > > > +                     if (n >= out_sgs)
> > > > > +                             buflen += sg->length;
> > > > >               }
> > > > >       }
> > > > >
> > > > > @@ -1518,6 +1575,10 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > >       vq->packed.desc_state[id].last = prev;
> > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > >
> > > > > +     /* Store in buffer length if necessary */
> > > > > +     if (vq->packed.buflen)
> > > > > +             vq->packed.buflen[id] = buflen;
> > > > > +
> > > > >       /*
> > > > >        * A driver MUST NOT make the first descriptor in the list
> > > > >        * available before all subsequent descriptors comprising
> > > > > @@ -1718,6 +1779,11 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > > > >               BAD_RING(vq, "id %u is not a head!\n", id);
> > > > >               return NULL;
> > > > >       }
> > > > > +     if (vq->packed.buflen && unlikely(*len > vq->packed.buflen[id])) {
> > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > +                     *len, vq->packed.buflen[id]);
> > > > > +             return NULL;
> > > > > +     }
> > > > >
> > > > >       /* detach_buf_packed clears data, so grab it now. */
> > > > >       ret = vq->packed.desc_state[id].data;
> > > > > @@ -1937,6 +2003,7 @@ static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
> > > > >                                vring_packed->device_event_dma_addr,
> > > > >                                dma_dev);
> > > > >
> > > > > +     kfree(vring_packed->buflen);
> > > > >       kfree(vring_packed->desc_state);
> > > > >       kfree(vring_packed->desc_extra);
> > > > >  }
> > > > > @@ -1988,6 +2055,14 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
> > > > >
> > > > >       vring_packed->vring.num = num;
> > > > >
> > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > +             vring_packed->buflen =
> > > > > +                     kmalloc_array(num, sizeof(*vring_packed->buflen),
> > > > > +                                   GFP_KERNEL);
> > > > > +             if (!vring_packed->buflen)
> > > > > +                     goto err;
> > > > > +     }
> > > > > +
> > > > >       return 0;
> > > > >
> > > > >  err:
> > > > > --
> > > > > 2.25.1
> > > >
> >
  
Jason Wang May 31, 2023, 7:36 a.m. UTC | #6
On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote:
> > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > > > > This patch validate
> > > > >
> > > > > validates
> > > > >
> > > > > > the used buffer length provided by the device
> > > > > > before trying to use it.
> > > > >
> > > > > before returning it to caller
> > > > >
> > > > > > This is done by remembering the in buffer
> > > > > > length in a dedicated array during virtqueue_add(), then we can fail
> > > > > > the virtqueue_get_buf() when we find the device is trying to give us a
> > > > > > used buffer length which is greater than we stored before.
> > > > >
> > > > > than what we stored
> > > > >
> > > > > >
> > > > > > This validation is disable
> > > > >
> > > > > disabled
> > > > >
> > > > > > by default via module parameter to unbreak
> > > > > > some existing devices since some legacy devices are known to report
> > > > > > buggy used length.
> > > > > >
> > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > >
> > > > > First I'm not merging this without more data about
> > > > > what is known to be broken and what is known to work well
> > > > > in the commit log. And how exactly do things work if used length
> > > > > is wrong?
> > > >
> > > > Assuming the device is malicious, it would be very hard to answer.
> > > > Auditing and fuzzing won't cover every case. Instead of trying to seek
> > > > the answer, we can simply make sure the used in buffer length is
> > > > validated then we know we're fine or not.
> > >
> > > To restate the question, you said above "some legacy devices are known
> > > to report buggy used length". If they report buggy length then how
> > > can things work?
> >
> > The validation is disabled for legacy device (as stated in the changelog):
> >
> > static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > {
> >         /*
> >          * Several legacy devices are known to produce buggy used
> >          * length. In order to let driver work, we won't validate used
> >          * buffer length in this case.
> >          */
> >         if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> >                 return false;
> >         if (force_used_validation)
> >                 return true;
> >         return false;
> > }
> >
> > This seems to be what we've agreed in last version:
> >
> > https://lore.kernel.org/all/CANLsYkxfhamUU0bb4j7y6N4_G9odKxLCjXxgXEx4SJ6_Kf+M2Q@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56
> >
> > Thanks
> >
>
> I don't get it. You wrote:
>
>         This validation is disable
>         by default via module parameter to unbreak
>         some existing devices since some legacy devices are known to report
>         buggy used length.
>
> which devices?

legacy rpmsg and vsock device (before 49d8c5ffad07) at least.

> why do you need a module parameter?

If we enable it unconditionally for modern devices, it may break some
buggy moden device (vsock without a fix as an example).

>
>
> > >
> > > > > Second what's wrong with dma_desc_extra that we already maintain?
> > > > > Third motivation - it's part and parcel of the hardening effort yes?
> > > >
> > > > They are different. dma_desc_extra is for a descriptor ring, but this
> > > > is for a used ring. Technically we can go back to iterate on the
> > > > descriptor ring for a legal used in buffer length. But it will have
> > > > worse performance.
> > >
> > > I don't really understand. We already iterate when we unmap -
> > > all that is necessary is to subtract it from used length, if at
> > > the end of the process it is >0 then we know used length is too
> > > large.
> >
> > Yes, but it is the job that is done in the driver level not the virtio
> > core.
>
> What job?

I meant the driver can do the validation since it has the knowledge of
the buffer length if it wants.

> unmap is done in detach_buf_split and detach_buf_packed respectively.
> vring_desc_extra isn't even visible outside drivers/virtio/virtio_ring.c

desc_extra doesn't contain buffer length for the case of indirect
descriptors. So we need to iterate in the descriptors when it looks
expensive if we don't need unmap.

Thanks

>
> For drivers that do unmap at driver level - I guess they can do
> validation there too.
>
> > Validation in virtio core is still necessary since they're
> > working at different levels and it's hard to force the validation in
> > all drivers by codes. Last version introduces a
> > suppress_driver_validation to allow the driver to suppress the core
> > validation which seems not good, we need a way to force the
> > virtio_ring code to do validation before.
>
> Why do we? If driver validates length virtio_ring does not need to
> validate.  If driver does not use length virtio_ring does not need to
> validate. core can provide this service for the gazillion non
> performance critical drivers that just want to keep things simple,
> but the 4-5 critical ones can do their own validation if they want to.
>
> > Or such stuff could be added
> > on top since the validation is by default anyway.
> >
> > Thanks
>
>
>
> > >
> > >
> > > > > I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before
> > > > > we do more hardening. If it's irrevocably broken let's rip it out?
> > > >
> > > > So the plan is
> > > >
> > > > 1) finish used ring validation (this had been proposed, merged and
> > > > reverted before notification hardening)
> > > > 2) do notification hardening on top.
> > > >
> > > > So let's leave it as is and I will do a rework after we finalize the
> > > > used ring validation.
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > >
> > > > > > ---
> > > > > > Changes since V4:
> > > > > > - drop the flat for driver to suppress the check
> > > > > > - validation is disabled by default
> > > > > > - don't do validation for legacy device
> > > > > > - rebase and support virtqueue resize
> > > > > > ---
> > > > > >  drivers/virtio/virtio_ring.c | 75 ++++++++++++++++++++++++++++++++++++
> > > > > >  1 file changed, 75 insertions(+)
> > > > > >
> > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > index 143f380baa1c..5b151605aaf8 100644
> > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > @@ -15,6 +15,9 @@
> > > > > >  #include <linux/spinlock.h>
> > > > > >  #include <xen/xen.h>
> > > > > >
> > > > > > +static bool force_used_validation = false;
> > > > > > +module_param(force_used_validation, bool, 0444);
> > > > > > +
> > > > > >  #ifdef DEBUG
> > > > > >  /* For development, we want to crash whenever the ring is screwed. */
> > > > > >  #define BAD_RING(_vq, fmt, args...)                          \
> > > > > > @@ -105,6 +108,9 @@ struct vring_virtqueue_split {
> > > > > >       struct vring_desc_state_split *desc_state;
> > > > > >       struct vring_desc_extra *desc_extra;
> > > > > >
> > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > +     u32 *buflen;
> > > > > > +
> > > > > >       /* DMA address and size information */
> > > > > >       dma_addr_t queue_dma_addr;
> > > > > >       size_t queue_size_in_bytes;
> > > > > > @@ -145,6 +151,9 @@ struct vring_virtqueue_packed {
> > > > > >       struct vring_desc_state_packed *desc_state;
> > > > > >       struct vring_desc_extra *desc_extra;
> > > > > >
> > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > +     u32 *buflen;
> > > > > > +
> > > > > >       /* DMA address and size information */
> > > > > >       dma_addr_t ring_dma_addr;
> > > > > >       dma_addr_t driver_event_dma_addr;
> > > > > > @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > >       unsigned int i, n, avail, descs_used, prev, err_idx;
> > > > > >       int head;
> > > > > >       bool indirect;
> > > > > > +     u32 buflen = 0;
> > > > > >
> > > > > >       START_USE(vq);
> > > > > >
> > > > > > @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > >                                                    VRING_DESC_F_NEXT |
> > > > > >                                                    VRING_DESC_F_WRITE,
> > > > > >                                                    indirect);
> > > > > > +                     buflen += sg->length;
> > > > > >               }
> > > > > >       }
> > > > > >       /* Last one doesn't continue. */
> > > > > > @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > >       else
> > > > > >               vq->split.desc_state[head].indir_desc = ctx;
> > > > > >
> > > > > > +     /* Store in buffer length if necessary */
> > > > > > +     if (vq->split.buflen)
> > > > > > +             vq->split.buflen[head] = buflen;
> > > > > > +
> > > > > >       /* Put entry in available array (but don't update avail->idx until they
> > > > > >        * do sync). */
> > > > > >       avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
> > > > > > @@ -861,6 +876,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
> > > > > >               BAD_RING(vq, "id %u is not a head!\n", i);
> > > > > >               return NULL;
> > > > > >       }
> > > > > > +     if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
> > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > +                     *len, vq->split.buflen[i]);
> > > > > > +             return NULL;
> > > > > > +     }
> > > > > >
> > > > > >       /* detach_buf_split clears data, so grab it now. */
> > > > > >       ret = vq->split.desc_state[i].data;
> > > > > > @@ -1085,10 +1105,25 @@ static void vring_free_split(struct vring_virtqueue_split *vring_split,
> > > > > >                        vring_split->queue_dma_addr,
> > > > > >                        dma_dev);
> > > > > >
> > > > > > +     kfree(vring_split->buflen);
> > > > > >       kfree(vring_split->desc_state);
> > > > > >       kfree(vring_split->desc_extra);
> > > > > >  }
> > > > > >
> > > > > > +static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > > > > +{
> > > > > > +     /*
> > > > > > +      * Several legacy devices are known to produce buggy used
> > > > > > +      * length. In order to let driver work, we won't validate used
> > > > > > +      * buffer length in this case.
> > > > > > +      */
> > > > > > +     if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > > > > +             return false;
> > > > > > +     if (force_used_validation)
> > > > > > +             return true;
> > > > > > +     return false;
> > > > > > +}
> > > > > > +
> > > > > >  static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > >                                  struct virtio_device *vdev,
> > > > > >                                  u32 num,
> > > > > > @@ -1137,7 +1172,19 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > >       vring_split->vring_align = vring_align;
> > > > > >       vring_split->may_reduce_num = may_reduce_num;
> > > > > >
> > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > +             vring_split->buflen =
> > > > > > +                     kmalloc_array(num, sizeof(*vring_split->buflen),
> > > > > > +                                   GFP_KERNEL);
> > > > > > +             if (!vring_split->buflen)
> > > > > > +                     goto err_buflen;
> > > > > > +     }
> > > > > > +
> > > > > >       return 0;
> > > > > > +
> > > > > > +err_buflen:
> > > > > > +     vring_free_split(vring_split, vdev, dma_dev);
> > > > > > +     return -ENOMEM;
> > > > > >  }
> > > > > >
> > > > > >  static struct virtqueue *vring_create_virtqueue_split(
> > > > > > @@ -1297,6 +1344,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > >       unsigned int i, n, err_idx;
> > > > > >       u16 head, id;
> > > > > >       dma_addr_t addr;
> > > > > > +     u32 buflen = 0;
> > > > > >
> > > > > >       head = vq->packed.next_avail_idx;
> > > > > >       desc = alloc_indirect_packed(total_sg, gfp);
> > > > > > @@ -1325,6 +1373,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > >                       desc[i].addr = cpu_to_le64(addr);
> > > > > >                       desc[i].len = cpu_to_le32(sg->length);
> > > > > >                       i++;
> > > > > > +                     if (n >= out_sgs)
> > > > > > +                             buflen += sg->length;
> > > > > >               }
> > > > > >       }
> > > > > >
> > > > > > @@ -1379,6 +1429,10 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > >       vq->packed.desc_state[id].last = id;
> > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > >
> > > > > > +     /* Store in buffer length if necessary */
> > > > > > +     if (vq->packed.buflen)
> > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > +
> > > > > >       vq->num_added += 1;
> > > > > >
> > > > > >       pr_debug("Added buffer head %i to %p\n", head, vq);
> > > > > > @@ -1416,6 +1470,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > >       __le16 head_flags, flags;
> > > > > >       u16 head, id, prev, curr, avail_used_flags;
> > > > > >       int err;
> > > > > > +     u32 buflen = 0;
> > > > > >
> > > > > >       START_USE(vq);
> > > > > >
> > > > > > @@ -1498,6 +1553,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > >                                       1 << VRING_PACKED_DESC_F_AVAIL |
> > > > > >                                       1 << VRING_PACKED_DESC_F_USED;
> > > > > >                       }
> > > > > > +                     if (n >= out_sgs)
> > > > > > +                             buflen += sg->length;
> > > > > >               }
> > > > > >       }
> > > > > >
> > > > > > @@ -1518,6 +1575,10 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > >       vq->packed.desc_state[id].last = prev;
> > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > >
> > > > > > +     /* Store in buffer length if necessary */
> > > > > > +     if (vq->packed.buflen)
> > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > +
> > > > > >       /*
> > > > > >        * A driver MUST NOT make the first descriptor in the list
> > > > > >        * available before all subsequent descriptors comprising
> > > > > > @@ -1718,6 +1779,11 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > > > > >               BAD_RING(vq, "id %u is not a head!\n", id);
> > > > > >               return NULL;
> > > > > >       }
> > > > > > +     if (vq->packed.buflen && unlikely(*len > vq->packed.buflen[id])) {
> > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > +                     *len, vq->packed.buflen[id]);
> > > > > > +             return NULL;
> > > > > > +     }
> > > > > >
> > > > > >       /* detach_buf_packed clears data, so grab it now. */
> > > > > >       ret = vq->packed.desc_state[id].data;
> > > > > > @@ -1937,6 +2003,7 @@ static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > >                                vring_packed->device_event_dma_addr,
> > > > > >                                dma_dev);
> > > > > >
> > > > > > +     kfree(vring_packed->buflen);
> > > > > >       kfree(vring_packed->desc_state);
> > > > > >       kfree(vring_packed->desc_extra);
> > > > > >  }
> > > > > > @@ -1988,6 +2055,14 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > >
> > > > > >       vring_packed->vring.num = num;
> > > > > >
> > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > +             vring_packed->buflen =
> > > > > > +                     kmalloc_array(num, sizeof(*vring_packed->buflen),
> > > > > > +                                   GFP_KERNEL);
> > > > > > +             if (!vring_packed->buflen)
> > > > > > +                     goto err;
> > > > > > +     }
> > > > > > +
> > > > > >       return 0;
> > > > > >
> > > > > >  err:
> > > > > > --
> > > > > > 2.25.1
> > > > >
> > >
>
  
Jason Wang May 31, 2023, 8:26 a.m. UTC | #7
On Wed, May 31, 2023 at 3:36 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote:
> > > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> > > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > > > > > This patch validate
> > > > > >
> > > > > > validates
> > > > > >
> > > > > > > the used buffer length provided by the device
> > > > > > > before trying to use it.
> > > > > >
> > > > > > before returning it to caller
> > > > > >
> > > > > > > This is done by remembering the in buffer
> > > > > > > length in a dedicated array during virtqueue_add(), then we can fail
> > > > > > > the virtqueue_get_buf() when we find the device is trying to give us a
> > > > > > > used buffer length which is greater than we stored before.
> > > > > >
> > > > > > than what we stored
> > > > > >
> > > > > > >
> > > > > > > This validation is disable
> > > > > >
> > > > > > disabled
> > > > > >
> > > > > > > by default via module parameter to unbreak
> > > > > > > some existing devices since some legacy devices are known to report
> > > > > > > buggy used length.
> > > > > > >
> > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > >
> > > > > > First I'm not merging this without more data about
> > > > > > what is known to be broken and what is known to work well
> > > > > > in the commit log. And how exactly do things work if used length
> > > > > > is wrong?
> > > > >
> > > > > Assuming the device is malicious, it would be very hard to answer.
> > > > > Auditing and fuzzing won't cover every case. Instead of trying to seek
> > > > > the answer, we can simply make sure the used in buffer length is
> > > > > validated then we know we're fine or not.
> > > >
> > > > To restate the question, you said above "some legacy devices are known
> > > > to report buggy used length". If they report buggy length then how
> > > > can things work?
> > >
> > > The validation is disabled for legacy device (as stated in the changelog):
> > >
> > > static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > {
> > >         /*
> > >          * Several legacy devices are known to produce buggy used
> > >          * length. In order to let driver work, we won't validate used
> > >          * buffer length in this case.
> > >          */
> > >         if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > >                 return false;
> > >         if (force_used_validation)
> > >                 return true;
> > >         return false;
> > > }
> > >
> > > This seems to be what we've agreed in last version:
> > >
> > > https://lore.kernel.org/all/CANLsYkxfhamUU0bb4j7y6N4_G9odKxLCjXxgXEx4SJ6_Kf+M2Q@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56
> > >
> > > Thanks
> > >
> >
> > I don't get it. You wrote:
> >
> >         This validation is disable
> >         by default via module parameter to unbreak
> >         some existing devices since some legacy devices are known to report
> >         buggy used length.
> >
> > which devices?
>
> legacy rpmsg and vsock device (before 49d8c5ffad07) at least.
>
> > why do you need a module parameter?
>
> If we enable it unconditionally for modern devices, it may break some
> buggy moden device (vsock without a fix as an example).
>
> >
> >
> > > >
> > > > > > Second what's wrong with dma_desc_extra that we already maintain?
> > > > > > Third motivation - it's part and parcel of the hardening effort yes?
> > > > >
> > > > > They are different. dma_desc_extra is for a descriptor ring, but this
> > > > > is for a used ring. Technically we can go back to iterate on the
> > > > > descriptor ring for a legal used in buffer length. But it will have
> > > > > worse performance.
> > > >
> > > > I don't really understand. We already iterate when we unmap -
> > > > all that is necessary is to subtract it from used length, if at
> > > > the end of the process it is >0 then we know used length is too
> > > > large.
> > >
> > > Yes, but it is the job that is done in the driver level not the virtio
> > > core.
> >
> > What job?
>
> I meant the driver can do the validation since it has the knowledge of
> the buffer length if it wants.
>
> > unmap is done in detach_buf_split and detach_buf_packed respectively.
> > vring_desc_extra isn't even visible outside drivers/virtio/virtio_ring.c
>
> desc_extra doesn't contain buffer length for the case of indirect
> descriptors. So we need to iterate in the descriptors when it looks
> expensive if we don't need unmap.
>
> Thanks
>
> >
> > For drivers that do unmap at driver level - I guess they can do
> > validation there too.
> >
> > > Validation in virtio core is still necessary since they're
> > > working at different levels and it's hard to force the validation in
> > > all drivers by codes. Last version introduces a
> > > suppress_driver_validation to allow the driver to suppress the core
> > > validation which seems not good, we need a way to force the
> > > virtio_ring code to do validation before.
> >
> > Why do we? If driver validates length virtio_ring does not need to
> > validate.

To be more safe, there's no guarantee that there's no bug in the driver.

>  If driver does not use length virtio_ring does not need to
> > validate.

This could be done on top assuming the validation is disabled by
default. But if the administrator wants to have belt and braces we
need to leave an option for them.

Thanks

> core can provide this service for the gazillion non
> > performance critical drivers that just want to keep things simple,
> > but the 4-5 critical ones can do their own validation if they want to.
> >
> > > Or such stuff could be added
> > > on top since the validation is by default anyway.
> > >
> > > Thanks
> >
> >
> >
> > > >
> > > >
> > > > > > I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before
> > > > > > we do more hardening. If it's irrevocably broken let's rip it out?
> > > > >
> > > > > So the plan is
> > > > >
> > > > > 1) finish used ring validation (this had been proposed, merged and
> > > > > reverted before notification hardening)
> > > > > 2) do notification hardening on top.
> > > > >
> > > > > So let's leave it as is and I will do a rework after we finalize the
> > > > > used ring validation.
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > >
> > > > > > > ---
> > > > > > > Changes since V4:
> > > > > > > - drop the flat for driver to suppress the check
> > > > > > > - validation is disabled by default
> > > > > > > - don't do validation for legacy device
> > > > > > > - rebase and support virtqueue resize
> > > > > > > ---
> > > > > > >  drivers/virtio/virtio_ring.c | 75 ++++++++++++++++++++++++++++++++++++
> > > > > > >  1 file changed, 75 insertions(+)
> > > > > > >
> > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > index 143f380baa1c..5b151605aaf8 100644
> > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > @@ -15,6 +15,9 @@
> > > > > > >  #include <linux/spinlock.h>
> > > > > > >  #include <xen/xen.h>
> > > > > > >
> > > > > > > +static bool force_used_validation = false;
> > > > > > > +module_param(force_used_validation, bool, 0444);
> > > > > > > +
> > > > > > >  #ifdef DEBUG
> > > > > > >  /* For development, we want to crash whenever the ring is screwed. */
> > > > > > >  #define BAD_RING(_vq, fmt, args...)                          \
> > > > > > > @@ -105,6 +108,9 @@ struct vring_virtqueue_split {
> > > > > > >       struct vring_desc_state_split *desc_state;
> > > > > > >       struct vring_desc_extra *desc_extra;
> > > > > > >
> > > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > > +     u32 *buflen;
> > > > > > > +
> > > > > > >       /* DMA address and size information */
> > > > > > >       dma_addr_t queue_dma_addr;
> > > > > > >       size_t queue_size_in_bytes;
> > > > > > > @@ -145,6 +151,9 @@ struct vring_virtqueue_packed {
> > > > > > >       struct vring_desc_state_packed *desc_state;
> > > > > > >       struct vring_desc_extra *desc_extra;
> > > > > > >
> > > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > > +     u32 *buflen;
> > > > > > > +
> > > > > > >       /* DMA address and size information */
> > > > > > >       dma_addr_t ring_dma_addr;
> > > > > > >       dma_addr_t driver_event_dma_addr;
> > > > > > > @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > >       unsigned int i, n, avail, descs_used, prev, err_idx;
> > > > > > >       int head;
> > > > > > >       bool indirect;
> > > > > > > +     u32 buflen = 0;
> > > > > > >
> > > > > > >       START_USE(vq);
> > > > > > >
> > > > > > > @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > >                                                    VRING_DESC_F_NEXT |
> > > > > > >                                                    VRING_DESC_F_WRITE,
> > > > > > >                                                    indirect);
> > > > > > > +                     buflen += sg->length;
> > > > > > >               }
> > > > > > >       }
> > > > > > >       /* Last one doesn't continue. */
> > > > > > > @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > >       else
> > > > > > >               vq->split.desc_state[head].indir_desc = ctx;
> > > > > > >
> > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > +     if (vq->split.buflen)
> > > > > > > +             vq->split.buflen[head] = buflen;
> > > > > > > +
> > > > > > >       /* Put entry in available array (but don't update avail->idx until they
> > > > > > >        * do sync). */
> > > > > > >       avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
> > > > > > > @@ -861,6 +876,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
> > > > > > >               BAD_RING(vq, "id %u is not a head!\n", i);
> > > > > > >               return NULL;
> > > > > > >       }
> > > > > > > +     if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
> > > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > > +                     *len, vq->split.buflen[i]);
> > > > > > > +             return NULL;
> > > > > > > +     }
> > > > > > >
> > > > > > >       /* detach_buf_split clears data, so grab it now. */
> > > > > > >       ret = vq->split.desc_state[i].data;
> > > > > > > @@ -1085,10 +1105,25 @@ static void vring_free_split(struct vring_virtqueue_split *vring_split,
> > > > > > >                        vring_split->queue_dma_addr,
> > > > > > >                        dma_dev);
> > > > > > >
> > > > > > > +     kfree(vring_split->buflen);
> > > > > > >       kfree(vring_split->desc_state);
> > > > > > >       kfree(vring_split->desc_extra);
> > > > > > >  }
> > > > > > >
> > > > > > > +static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > > > > > +{
> > > > > > > +     /*
> > > > > > > +      * Several legacy devices are known to produce buggy used
> > > > > > > +      * length. In order to let driver work, we won't validate used
> > > > > > > +      * buffer length in this case.
> > > > > > > +      */
> > > > > > > +     if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > > > > > +             return false;
> > > > > > > +     if (force_used_validation)
> > > > > > > +             return true;
> > > > > > > +     return false;
> > > > > > > +}
> > > > > > > +
> > > > > > >  static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > > >                                  struct virtio_device *vdev,
> > > > > > >                                  u32 num,
> > > > > > > @@ -1137,7 +1172,19 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > > >       vring_split->vring_align = vring_align;
> > > > > > >       vring_split->may_reduce_num = may_reduce_num;
> > > > > > >
> > > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > > +             vring_split->buflen =
> > > > > > > +                     kmalloc_array(num, sizeof(*vring_split->buflen),
> > > > > > > +                                   GFP_KERNEL);
> > > > > > > +             if (!vring_split->buflen)
> > > > > > > +                     goto err_buflen;
> > > > > > > +     }
> > > > > > > +
> > > > > > >       return 0;
> > > > > > > +
> > > > > > > +err_buflen:
> > > > > > > +     vring_free_split(vring_split, vdev, dma_dev);
> > > > > > > +     return -ENOMEM;
> > > > > > >  }
> > > > > > >
> > > > > > >  static struct virtqueue *vring_create_virtqueue_split(
> > > > > > > @@ -1297,6 +1344,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > >       unsigned int i, n, err_idx;
> > > > > > >       u16 head, id;
> > > > > > >       dma_addr_t addr;
> > > > > > > +     u32 buflen = 0;
> > > > > > >
> > > > > > >       head = vq->packed.next_avail_idx;
> > > > > > >       desc = alloc_indirect_packed(total_sg, gfp);
> > > > > > > @@ -1325,6 +1373,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > >                       desc[i].addr = cpu_to_le64(addr);
> > > > > > >                       desc[i].len = cpu_to_le32(sg->length);
> > > > > > >                       i++;
> > > > > > > +                     if (n >= out_sgs)
> > > > > > > +                             buflen += sg->length;
> > > > > > >               }
> > > > > > >       }
> > > > > > >
> > > > > > > @@ -1379,6 +1429,10 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > >       vq->packed.desc_state[id].last = id;
> > > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > > >
> > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > +     if (vq->packed.buflen)
> > > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > > +
> > > > > > >       vq->num_added += 1;
> > > > > > >
> > > > > > >       pr_debug("Added buffer head %i to %p\n", head, vq);
> > > > > > > @@ -1416,6 +1470,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > >       __le16 head_flags, flags;
> > > > > > >       u16 head, id, prev, curr, avail_used_flags;
> > > > > > >       int err;
> > > > > > > +     u32 buflen = 0;
> > > > > > >
> > > > > > >       START_USE(vq);
> > > > > > >
> > > > > > > @@ -1498,6 +1553,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > >                                       1 << VRING_PACKED_DESC_F_AVAIL |
> > > > > > >                                       1 << VRING_PACKED_DESC_F_USED;
> > > > > > >                       }
> > > > > > > +                     if (n >= out_sgs)
> > > > > > > +                             buflen += sg->length;
> > > > > > >               }
> > > > > > >       }
> > > > > > >
> > > > > > > @@ -1518,6 +1575,10 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > >       vq->packed.desc_state[id].last = prev;
> > > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > > >
> > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > +     if (vq->packed.buflen)
> > > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > > +
> > > > > > >       /*
> > > > > > >        * A driver MUST NOT make the first descriptor in the list
> > > > > > >        * available before all subsequent descriptors comprising
> > > > > > > @@ -1718,6 +1779,11 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > > > > > >               BAD_RING(vq, "id %u is not a head!\n", id);
> > > > > > >               return NULL;
> > > > > > >       }
> > > > > > > +     if (vq->packed.buflen && unlikely(*len > vq->packed.buflen[id])) {
> > > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > > +                     *len, vq->packed.buflen[id]);
> > > > > > > +             return NULL;
> > > > > > > +     }
> > > > > > >
> > > > > > >       /* detach_buf_packed clears data, so grab it now. */
> > > > > > >       ret = vq->packed.desc_state[id].data;
> > > > > > > @@ -1937,6 +2003,7 @@ static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > > >                                vring_packed->device_event_dma_addr,
> > > > > > >                                dma_dev);
> > > > > > >
> > > > > > > +     kfree(vring_packed->buflen);
> > > > > > >       kfree(vring_packed->desc_state);
> > > > > > >       kfree(vring_packed->desc_extra);
> > > > > > >  }
> > > > > > > @@ -1988,6 +2055,14 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > > >
> > > > > > >       vring_packed->vring.num = num;
> > > > > > >
> > > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > > +             vring_packed->buflen =
> > > > > > > +                     kmalloc_array(num, sizeof(*vring_packed->buflen),
> > > > > > > +                                   GFP_KERNEL);
> > > > > > > +             if (!vring_packed->buflen)
> > > > > > > +                     goto err;
> > > > > > > +     }
> > > > > > > +
> > > > > > >       return 0;
> > > > > > >
> > > > > > >  err:
> > > > > > > --
> > > > > > > 2.25.1
> > > > > >
> > > >
> >
  
Michael S. Tsirkin May 31, 2023, 9:55 a.m. UTC | #8
On Wed, May 31, 2023 at 03:36:51PM +0800, Jason Wang wrote:
> On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote:
> > > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> > > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > > > > > This patch validate
> > > > > >
> > > > > > validates
> > > > > >
> > > > > > > the used buffer length provided by the device
> > > > > > > before trying to use it.
> > > > > >
> > > > > > before returning it to caller
> > > > > >
> > > > > > > This is done by remembering the in buffer
> > > > > > > length in a dedicated array during virtqueue_add(), then we can fail
> > > > > > > the virtqueue_get_buf() when we find the device is trying to give us a
> > > > > > > used buffer length which is greater than we stored before.
> > > > > >
> > > > > > than what we stored
> > > > > >
> > > > > > >
> > > > > > > This validation is disable
> > > > > >
> > > > > > disabled
> > > > > >
> > > > > > > by default via module parameter to unbreak
> > > > > > > some existing devices since some legacy devices are known to report
> > > > > > > buggy used length.
> > > > > > >
> > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > >
> > > > > > First I'm not merging this without more data about
> > > > > > what is known to be broken and what is known to work well
> > > > > > in the commit log. And how exactly do things work if used length
> > > > > > is wrong?
> > > > >
> > > > > Assuming the device is malicious, it would be very hard to answer.
> > > > > Auditing and fuzzing won't cover every case. Instead of trying to seek
> > > > > the answer, we can simply make sure the used in buffer length is
> > > > > validated then we know we're fine or not.
> > > >
> > > > To restate the question, you said above "some legacy devices are known
> > > > to report buggy used length". If they report buggy length then how
> > > > can things work?
> > >
> > > The validation is disabled for legacy device (as stated in the changelog):
> > >
> > > static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > {
> > >         /*
> > >          * Several legacy devices are known to produce buggy used
> > >          * length. In order to let driver work, we won't validate used
> > >          * buffer length in this case.
> > >          */
> > >         if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > >                 return false;
> > >         if (force_used_validation)
> > >                 return true;
> > >         return false;
> > > }
> > >
> > > This seems to be what we've agreed in last version:
> > >
> > > https://lore.kernel.org/all/CANLsYkxfhamUU0bb4j7y6N4_G9odKxLCjXxgXEx4SJ6_Kf+M2Q@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56
> > >
> > > Thanks
> > >
> >
> > I don't get it. You wrote:
> >
> >         This validation is disable
> >         by default via module parameter to unbreak
> >         some existing devices since some legacy devices are known to report
> >         buggy used length.
> >
> > which devices?
> 
> legacy rpmsg and vsock device (before 49d8c5ffad07) at least.
> 
> > why do you need a module parameter?
> 
> If we enable it unconditionally for modern devices, it may break some
> buggy moden device (vsock without a fix as an example).

Presumably this happens because vsock does not have any inbuf at all
so it ignores the length.
We had the same with virtio net tx a long time ago.

My suggestion is then not to fail - just cap length at the dma length
set by driver. Another idea is that if dma len is 0 then don't validate
at all - driver that did not add any inbufs is not going to look
at length.

Allowing passing NULL as length and skipping validation
if len = 0 might also be a good idea.


> >
> >
> > > >
> > > > > > Second what's wrong with dma_desc_extra that we already maintain?
> > > > > > Third motivation - it's part and parcel of the hardening effort yes?
> > > > >
> > > > > They are different. dma_desc_extra is for a descriptor ring, but this
> > > > > is for a used ring. Technically we can go back to iterate on the
> > > > > descriptor ring for a legal used in buffer length. But it will have
> > > > > worse performance.
> > > >
> > > > I don't really understand. We already iterate when we unmap -
> > > > all that is necessary is to subtract it from used length, if at
> > > > the end of the process it is >0 then we know used length is too
> > > > large.
> > >
> > > Yes, but it is the job that is done in the driver level not the virtio
> > > core.
> >
> > What job?
> 
> I meant the driver can do the validation since it has the knowledge of
> the buffer length if it wants.

It does not necessarily have it - not if virtio is doing DMA
mapping.


> > unmap is done in detach_buf_split and detach_buf_packed respectively.
> > vring_desc_extra isn't even visible outside drivers/virtio/virtio_ring.c
> 
> desc_extra doesn't contain buffer length for the case of indirect
> descriptors. So we need to iterate in the descriptors when it looks
> expensive if we don't need unmap.
> 
> Thanks

Well at the moment we only don't unmap if DMA API is bypassed.  And then
we don't need to validate length either. Fundamentally, without
ACCESS_PLATFORM device is trusted.


> >
> > For drivers that do unmap at driver level - I guess they can do
> > validation there too.
> >
> > > Validation in virtio core is still necessary since they're
> > > working at different levels and it's hard to force the validation in
> > > all drivers by codes. Last version introduces a
> > > suppress_driver_validation to allow the driver to suppress the core
> > > validation which seems not good, we need a way to force the
> > > virtio_ring code to do validation before.
> >
> > Why do we? If driver validates length virtio_ring does not need to
> > validate.  If driver does not use length virtio_ring does not need to
> > validate. core can provide this service for the gazillion non
> > performance critical drivers that just want to keep things simple,
> > but the 4-5 critical ones can do their own validation if they want to.
> >
> > > Or such stuff could be added
> > > on top since the validation is by default anyway.
> > >
> > > Thanks
> >
> >
> >
> > > >
> > > >
> > > > > > I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before
> > > > > > we do more hardening. If it's irrevocably broken let's rip it out?
> > > > >
> > > > > So the plan is
> > > > >
> > > > > 1) finish used ring validation (this had been proposed, merged and
> > > > > reverted before notification hardening)
> > > > > 2) do notification hardening on top.
> > > > >
> > > > > So let's leave it as is and I will do a rework after we finalize the
> > > > > used ring validation.
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > >
> > > > > > > ---
> > > > > > > Changes since V4:
> > > > > > > - drop the flat for driver to suppress the check
> > > > > > > - validation is disabled by default
> > > > > > > - don't do validation for legacy device
> > > > > > > - rebase and support virtqueue resize
> > > > > > > ---
> > > > > > >  drivers/virtio/virtio_ring.c | 75 ++++++++++++++++++++++++++++++++++++
> > > > > > >  1 file changed, 75 insertions(+)
> > > > > > >
> > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > index 143f380baa1c..5b151605aaf8 100644
> > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > @@ -15,6 +15,9 @@
> > > > > > >  #include <linux/spinlock.h>
> > > > > > >  #include <xen/xen.h>
> > > > > > >
> > > > > > > +static bool force_used_validation = false;
> > > > > > > +module_param(force_used_validation, bool, 0444);
> > > > > > > +
> > > > > > >  #ifdef DEBUG
> > > > > > >  /* For development, we want to crash whenever the ring is screwed. */
> > > > > > >  #define BAD_RING(_vq, fmt, args...)                          \
> > > > > > > @@ -105,6 +108,9 @@ struct vring_virtqueue_split {
> > > > > > >       struct vring_desc_state_split *desc_state;
> > > > > > >       struct vring_desc_extra *desc_extra;
> > > > > > >
> > > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > > +     u32 *buflen;
> > > > > > > +
> > > > > > >       /* DMA address and size information */
> > > > > > >       dma_addr_t queue_dma_addr;
> > > > > > >       size_t queue_size_in_bytes;
> > > > > > > @@ -145,6 +151,9 @@ struct vring_virtqueue_packed {
> > > > > > >       struct vring_desc_state_packed *desc_state;
> > > > > > >       struct vring_desc_extra *desc_extra;
> > > > > > >
> > > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > > +     u32 *buflen;
> > > > > > > +
> > > > > > >       /* DMA address and size information */
> > > > > > >       dma_addr_t ring_dma_addr;
> > > > > > >       dma_addr_t driver_event_dma_addr;
> > > > > > > @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > >       unsigned int i, n, avail, descs_used, prev, err_idx;
> > > > > > >       int head;
> > > > > > >       bool indirect;
> > > > > > > +     u32 buflen = 0;
> > > > > > >
> > > > > > >       START_USE(vq);
> > > > > > >
> > > > > > > @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > >                                                    VRING_DESC_F_NEXT |
> > > > > > >                                                    VRING_DESC_F_WRITE,
> > > > > > >                                                    indirect);
> > > > > > > +                     buflen += sg->length;
> > > > > > >               }
> > > > > > >       }
> > > > > > >       /* Last one doesn't continue. */
> > > > > > > @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > >       else
> > > > > > >               vq->split.desc_state[head].indir_desc = ctx;
> > > > > > >
> > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > +     if (vq->split.buflen)
> > > > > > > +             vq->split.buflen[head] = buflen;
> > > > > > > +
> > > > > > >       /* Put entry in available array (but don't update avail->idx until they
> > > > > > >        * do sync). */
> > > > > > >       avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
> > > > > > > @@ -861,6 +876,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
> > > > > > >               BAD_RING(vq, "id %u is not a head!\n", i);
> > > > > > >               return NULL;
> > > > > > >       }
> > > > > > > +     if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
> > > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > > +                     *len, vq->split.buflen[i]);
> > > > > > > +             return NULL;
> > > > > > > +     }
> > > > > > >
> > > > > > >       /* detach_buf_split clears data, so grab it now. */
> > > > > > >       ret = vq->split.desc_state[i].data;
> > > > > > > @@ -1085,10 +1105,25 @@ static void vring_free_split(struct vring_virtqueue_split *vring_split,
> > > > > > >                        vring_split->queue_dma_addr,
> > > > > > >                        dma_dev);
> > > > > > >
> > > > > > > +     kfree(vring_split->buflen);
> > > > > > >       kfree(vring_split->desc_state);
> > > > > > >       kfree(vring_split->desc_extra);
> > > > > > >  }
> > > > > > >
> > > > > > > +static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > > > > > +{
> > > > > > > +     /*
> > > > > > > +      * Several legacy devices are known to produce buggy used
> > > > > > > +      * length. In order to let driver work, we won't validate used
> > > > > > > +      * buffer length in this case.
> > > > > > > +      */
> > > > > > > +     if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > > > > > +             return false;
> > > > > > > +     if (force_used_validation)
> > > > > > > +             return true;
> > > > > > > +     return false;
> > > > > > > +}
> > > > > > > +
> > > > > > >  static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > > >                                  struct virtio_device *vdev,
> > > > > > >                                  u32 num,
> > > > > > > @@ -1137,7 +1172,19 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > > >       vring_split->vring_align = vring_align;
> > > > > > >       vring_split->may_reduce_num = may_reduce_num;
> > > > > > >
> > > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > > +             vring_split->buflen =
> > > > > > > +                     kmalloc_array(num, sizeof(*vring_split->buflen),
> > > > > > > +                                   GFP_KERNEL);
> > > > > > > +             if (!vring_split->buflen)
> > > > > > > +                     goto err_buflen;
> > > > > > > +     }
> > > > > > > +
> > > > > > >       return 0;
> > > > > > > +
> > > > > > > +err_buflen:
> > > > > > > +     vring_free_split(vring_split, vdev, dma_dev);
> > > > > > > +     return -ENOMEM;
> > > > > > >  }
> > > > > > >
> > > > > > >  static struct virtqueue *vring_create_virtqueue_split(
> > > > > > > @@ -1297,6 +1344,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > >       unsigned int i, n, err_idx;
> > > > > > >       u16 head, id;
> > > > > > >       dma_addr_t addr;
> > > > > > > +     u32 buflen = 0;
> > > > > > >
> > > > > > >       head = vq->packed.next_avail_idx;
> > > > > > >       desc = alloc_indirect_packed(total_sg, gfp);
> > > > > > > @@ -1325,6 +1373,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > >                       desc[i].addr = cpu_to_le64(addr);
> > > > > > >                       desc[i].len = cpu_to_le32(sg->length);
> > > > > > >                       i++;
> > > > > > > +                     if (n >= out_sgs)
> > > > > > > +                             buflen += sg->length;
> > > > > > >               }
> > > > > > >       }
> > > > > > >
> > > > > > > @@ -1379,6 +1429,10 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > >       vq->packed.desc_state[id].last = id;
> > > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > > >
> > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > +     if (vq->packed.buflen)
> > > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > > +
> > > > > > >       vq->num_added += 1;
> > > > > > >
> > > > > > >       pr_debug("Added buffer head %i to %p\n", head, vq);
> > > > > > > @@ -1416,6 +1470,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > >       __le16 head_flags, flags;
> > > > > > >       u16 head, id, prev, curr, avail_used_flags;
> > > > > > >       int err;
> > > > > > > +     u32 buflen = 0;
> > > > > > >
> > > > > > >       START_USE(vq);
> > > > > > >
> > > > > > > @@ -1498,6 +1553,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > >                                       1 << VRING_PACKED_DESC_F_AVAIL |
> > > > > > >                                       1 << VRING_PACKED_DESC_F_USED;
> > > > > > >                       }
> > > > > > > +                     if (n >= out_sgs)
> > > > > > > +                             buflen += sg->length;
> > > > > > >               }
> > > > > > >       }
> > > > > > >
> > > > > > > @@ -1518,6 +1575,10 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > >       vq->packed.desc_state[id].last = prev;
> > > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > > >
> > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > +     if (vq->packed.buflen)
> > > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > > +
> > > > > > >       /*
> > > > > > >        * A driver MUST NOT make the first descriptor in the list
> > > > > > >        * available before all subsequent descriptors comprising
> > > > > > > @@ -1718,6 +1779,11 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > > > > > >               BAD_RING(vq, "id %u is not a head!\n", id);
> > > > > > >               return NULL;
> > > > > > >       }
> > > > > > > +     if (vq->packed.buflen && unlikely(*len > vq->packed.buflen[id])) {
> > > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > > +                     *len, vq->packed.buflen[id]);
> > > > > > > +             return NULL;
> > > > > > > +     }
> > > > > > >
> > > > > > >       /* detach_buf_packed clears data, so grab it now. */
> > > > > > >       ret = vq->packed.desc_state[id].data;
> > > > > > > @@ -1937,6 +2003,7 @@ static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > > >                                vring_packed->device_event_dma_addr,
> > > > > > >                                dma_dev);
> > > > > > >
> > > > > > > +     kfree(vring_packed->buflen);
> > > > > > >       kfree(vring_packed->desc_state);
> > > > > > >       kfree(vring_packed->desc_extra);
> > > > > > >  }
> > > > > > > @@ -1988,6 +2055,14 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > > >
> > > > > > >       vring_packed->vring.num = num;
> > > > > > >
> > > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > > +             vring_packed->buflen =
> > > > > > > +                     kmalloc_array(num, sizeof(*vring_packed->buflen),
> > > > > > > +                                   GFP_KERNEL);
> > > > > > > +             if (!vring_packed->buflen)
> > > > > > > +                     goto err;
> > > > > > > +     }
> > > > > > > +
> > > > > > >       return 0;
> > > > > > >
> > > > > > >  err:
> > > > > > > --
> > > > > > > 2.25.1
> > > > > >
> > > >
> >
  
Michael S. Tsirkin May 31, 2023, 10:24 a.m. UTC | #9
On Wed, May 31, 2023 at 04:26:38PM +0800, Jason Wang wrote:
> On Wed, May 31, 2023 at 3:36 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote:
> > > > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> > > > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > > > > > > This patch validate
> > > > > > >
> > > > > > > validates
> > > > > > >
> > > > > > > > the used buffer length provided by the device
> > > > > > > > before trying to use it.
> > > > > > >
> > > > > > > before returning it to caller
> > > > > > >
> > > > > > > > This is done by remembering the in buffer
> > > > > > > > length in a dedicated array during virtqueue_add(), then we can fail
> > > > > > > > the virtqueue_get_buf() when we find the device is trying to give us a
> > > > > > > > used buffer length which is greater than we stored before.
> > > > > > >
> > > > > > > than what we stored
> > > > > > >
> > > > > > > >
> > > > > > > > This validation is disable
> > > > > > >
> > > > > > > disabled
> > > > > > >
> > > > > > > > by default via module parameter to unbreak
> > > > > > > > some existing devices since some legacy devices are known to report
> > > > > > > > buggy used length.
> > > > > > > >
> > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > >
> > > > > > > First I'm not merging this without more data about
> > > > > > > what is known to be broken and what is known to work well
> > > > > > > in the commit log. And how exactly do things work if used length
> > > > > > > is wrong?
> > > > > >
> > > > > > Assuming the device is malicious, it would be very hard to answer.
> > > > > > Auditing and fuzzing won't cover every case. Instead of trying to seek
> > > > > > the answer, we can simply make sure the used in buffer length is
> > > > > > validated then we know we're fine or not.
> > > > >
> > > > > To restate the question, you said above "some legacy devices are known
> > > > > to report buggy used length". If they report buggy length then how
> > > > > can things work?
> > > >
> > > > The validation is disabled for legacy device (as stated in the changelog):
> > > >
> > > > static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > > {
> > > >         /*
> > > >          * Several legacy devices are known to produce buggy used
> > > >          * length. In order to let driver work, we won't validate used
> > > >          * buffer length in this case.
> > > >          */
> > > >         if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > >                 return false;
> > > >         if (force_used_validation)
> > > >                 return true;
> > > >         return false;
> > > > }
> > > >
> > > > This seems to be what we've agreed in last version:
> > > >
> > > > https://lore.kernel.org/all/CANLsYkxfhamUU0bb4j7y6N4_G9odKxLCjXxgXEx4SJ6_Kf+M2Q@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56
> > > >
> > > > Thanks
> > > >
> > >
> > > I don't get it. You wrote:
> > >
> > >         This validation is disable
> > >         by default via module parameter to unbreak
> > >         some existing devices since some legacy devices are known to report
> > >         buggy used length.
> > >
> > > which devices?
> >
> > legacy rpmsg and vsock device (before 49d8c5ffad07) at least.
> >
> > > why do you need a module parameter?
> >
> > If we enable it unconditionally for modern devices, it may break some
> > buggy moden device (vsock without a fix as an example).
> >
> > >
> > >
> > > > >
> > > > > > > Second what's wrong with dma_desc_extra that we already maintain?
> > > > > > > Third motivation - it's part and parcel of the hardening effort yes?
> > > > > >
> > > > > > They are different. dma_desc_extra is for a descriptor ring, but this
> > > > > > is for a used ring. Technically we can go back to iterate on the
> > > > > > descriptor ring for a legal used in buffer length. But it will have
> > > > > > worse performance.
> > > > >
> > > > > I don't really understand. We already iterate when we unmap -
> > > > > all that is necessary is to subtract it from used length, if at
> > > > > the end of the process it is >0 then we know used length is too
> > > > > large.
> > > >
> > > > Yes, but it is the job that is done in the driver level not the virtio
> > > > core.
> > >
> > > What job?
> >
> > I meant the driver can do the validation since it has the knowledge of
> > the buffer length if it wants.
> >
> > > unmap is done in detach_buf_split and detach_buf_packed respectively.
> > > vring_desc_extra isn't even visible outside drivers/virtio/virtio_ring.c
> >
> > desc_extra doesn't contain buffer length for the case of indirect
> > descriptors. So we need to iterate in the descriptors when it looks
> > expensive if we don't need unmap.
> >
> > Thanks
> >
> > >
> > > For drivers that do unmap at driver level - I guess they can do
> > > validation there too.
> > >
> > > > Validation in virtio core is still necessary since they're
> > > > working at different levels and it's hard to force the validation in
> > > > all drivers by codes. Last version introduces a
> > > > suppress_driver_validation to allow the driver to suppress the core
> > > > validation which seems not good, we need a way to force the
> > > > virtio_ring code to do validation before.
> > >
> > > Why do we? If driver validates length virtio_ring does not need to
> > > validate.
> 
> To be more safe, there's no guarantee that there's no bug in the driver.

Extra options increase testing matrix size so - there be bugs.
We need to make these decisions for (most) users.

> >  If driver does not use length virtio_ring does not need to
> > > validate.
> 
> This could be done on top assuming the validation is disabled by
> default. But if the administrator wants to have belt and braces we
> need to leave an option for them.
> 
> Thanks

No, we don't regress then fix on top.
As for mod parameter I am not impressed -
no one's going to have the time or inclination to do the requisite
testing to know whether the module parameter is safe.

> > core can provide this service for the gazillion non
> > > performance critical drivers that just want to keep things simple,
> > > but the 4-5 critical ones can do their own validation if they want to.
> > >
> > > > Or such stuff could be added
> > > > on top since the validation is by default anyway.
> > > >
> > > > Thanks
> > >
> > >
> > >
> > > > >
> > > > >
> > > > > > > I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before
> > > > > > > we do more hardening. If it's irrevocably broken let's rip it out?
> > > > > >
> > > > > > So the plan is
> > > > > >
> > > > > > 1) finish used ring validation (this had been proposed, merged and
> > > > > > reverted before notification hardening)
> > > > > > 2) do notification hardening on top.
> > > > > >
> > > > > > So let's leave it as is and I will do a rework after we finalize the
> > > > > > used ring validation.
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > >
> > > > > > >
> > > > > > > > ---
> > > > > > > > Changes since V4:
> > > > > > > > - drop the flat for driver to suppress the check
> > > > > > > > - validation is disabled by default
> > > > > > > > - don't do validation for legacy device
> > > > > > > > - rebase and support virtqueue resize
> > > > > > > > ---
> > > > > > > >  drivers/virtio/virtio_ring.c | 75 ++++++++++++++++++++++++++++++++++++
> > > > > > > >  1 file changed, 75 insertions(+)
> > > > > > > >
> > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > index 143f380baa1c..5b151605aaf8 100644
> > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > @@ -15,6 +15,9 @@
> > > > > > > >  #include <linux/spinlock.h>
> > > > > > > >  #include <xen/xen.h>
> > > > > > > >
> > > > > > > > +static bool force_used_validation = false;
> > > > > > > > +module_param(force_used_validation, bool, 0444);
> > > > > > > > +
> > > > > > > >  #ifdef DEBUG
> > > > > > > >  /* For development, we want to crash whenever the ring is screwed. */
> > > > > > > >  #define BAD_RING(_vq, fmt, args...)                          \
> > > > > > > > @@ -105,6 +108,9 @@ struct vring_virtqueue_split {
> > > > > > > >       struct vring_desc_state_split *desc_state;
> > > > > > > >       struct vring_desc_extra *desc_extra;
> > > > > > > >
> > > > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > > > +     u32 *buflen;
> > > > > > > > +
> > > > > > > >       /* DMA address and size information */
> > > > > > > >       dma_addr_t queue_dma_addr;
> > > > > > > >       size_t queue_size_in_bytes;
> > > > > > > > @@ -145,6 +151,9 @@ struct vring_virtqueue_packed {
> > > > > > > >       struct vring_desc_state_packed *desc_state;
> > > > > > > >       struct vring_desc_extra *desc_extra;
> > > > > > > >
> > > > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > > > +     u32 *buflen;
> > > > > > > > +
> > > > > > > >       /* DMA address and size information */
> > > > > > > >       dma_addr_t ring_dma_addr;
> > > > > > > >       dma_addr_t driver_event_dma_addr;
> > > > > > > > @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > > >       unsigned int i, n, avail, descs_used, prev, err_idx;
> > > > > > > >       int head;
> > > > > > > >       bool indirect;
> > > > > > > > +     u32 buflen = 0;
> > > > > > > >
> > > > > > > >       START_USE(vq);
> > > > > > > >
> > > > > > > > @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > > >                                                    VRING_DESC_F_NEXT |
> > > > > > > >                                                    VRING_DESC_F_WRITE,
> > > > > > > >                                                    indirect);
> > > > > > > > +                     buflen += sg->length;
> > > > > > > >               }
> > > > > > > >       }
> > > > > > > >       /* Last one doesn't continue. */
> > > > > > > > @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > > >       else
> > > > > > > >               vq->split.desc_state[head].indir_desc = ctx;
> > > > > > > >
> > > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > > +     if (vq->split.buflen)
> > > > > > > > +             vq->split.buflen[head] = buflen;
> > > > > > > > +
> > > > > > > >       /* Put entry in available array (but don't update avail->idx until they
> > > > > > > >        * do sync). */
> > > > > > > >       avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
> > > > > > > > @@ -861,6 +876,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
> > > > > > > >               BAD_RING(vq, "id %u is not a head!\n", i);
> > > > > > > >               return NULL;
> > > > > > > >       }
> > > > > > > > +     if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
> > > > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > > > +                     *len, vq->split.buflen[i]);
> > > > > > > > +             return NULL;
> > > > > > > > +     }
> > > > > > > >
> > > > > > > >       /* detach_buf_split clears data, so grab it now. */
> > > > > > > >       ret = vq->split.desc_state[i].data;
> > > > > > > > @@ -1085,10 +1105,25 @@ static void vring_free_split(struct vring_virtqueue_split *vring_split,
> > > > > > > >                        vring_split->queue_dma_addr,
> > > > > > > >                        dma_dev);
> > > > > > > >
> > > > > > > > +     kfree(vring_split->buflen);
> > > > > > > >       kfree(vring_split->desc_state);
> > > > > > > >       kfree(vring_split->desc_extra);
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > > > > > > +{
> > > > > > > > +     /*
> > > > > > > > +      * Several legacy devices are known to produce buggy used
> > > > > > > > +      * length. In order to let driver work, we won't validate used
> > > > > > > > +      * buffer length in this case.
> > > > > > > > +      */
> > > > > > > > +     if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > > > > > > +             return false;
> > > > > > > > +     if (force_used_validation)
> > > > > > > > +             return true;
> > > > > > > > +     return false;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > > > >                                  struct virtio_device *vdev,
> > > > > > > >                                  u32 num,
> > > > > > > > @@ -1137,7 +1172,19 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > > > >       vring_split->vring_align = vring_align;
> > > > > > > >       vring_split->may_reduce_num = may_reduce_num;
> > > > > > > >
> > > > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > > > +             vring_split->buflen =
> > > > > > > > +                     kmalloc_array(num, sizeof(*vring_split->buflen),
> > > > > > > > +                                   GFP_KERNEL);
> > > > > > > > +             if (!vring_split->buflen)
> > > > > > > > +                     goto err_buflen;
> > > > > > > > +     }
> > > > > > > > +
> > > > > > > >       return 0;
> > > > > > > > +
> > > > > > > > +err_buflen:
> > > > > > > > +     vring_free_split(vring_split, vdev, dma_dev);
> > > > > > > > +     return -ENOMEM;
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  static struct virtqueue *vring_create_virtqueue_split(
> > > > > > > > @@ -1297,6 +1344,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > > >       unsigned int i, n, err_idx;
> > > > > > > >       u16 head, id;
> > > > > > > >       dma_addr_t addr;
> > > > > > > > +     u32 buflen = 0;
> > > > > > > >
> > > > > > > >       head = vq->packed.next_avail_idx;
> > > > > > > >       desc = alloc_indirect_packed(total_sg, gfp);
> > > > > > > > @@ -1325,6 +1373,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > > >                       desc[i].addr = cpu_to_le64(addr);
> > > > > > > >                       desc[i].len = cpu_to_le32(sg->length);
> > > > > > > >                       i++;
> > > > > > > > +                     if (n >= out_sgs)
> > > > > > > > +                             buflen += sg->length;
> > > > > > > >               }
> > > > > > > >       }
> > > > > > > >
> > > > > > > > @@ -1379,6 +1429,10 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > > >       vq->packed.desc_state[id].last = id;
> > > > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > > > >
> > > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > > +     if (vq->packed.buflen)
> > > > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > > > +
> > > > > > > >       vq->num_added += 1;
> > > > > > > >
> > > > > > > >       pr_debug("Added buffer head %i to %p\n", head, vq);
> > > > > > > > @@ -1416,6 +1470,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > >       __le16 head_flags, flags;
> > > > > > > >       u16 head, id, prev, curr, avail_used_flags;
> > > > > > > >       int err;
> > > > > > > > +     u32 buflen = 0;
> > > > > > > >
> > > > > > > >       START_USE(vq);
> > > > > > > >
> > > > > > > > @@ -1498,6 +1553,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > >                                       1 << VRING_PACKED_DESC_F_AVAIL |
> > > > > > > >                                       1 << VRING_PACKED_DESC_F_USED;
> > > > > > > >                       }
> > > > > > > > +                     if (n >= out_sgs)
> > > > > > > > +                             buflen += sg->length;
> > > > > > > >               }
> > > > > > > >       }
> > > > > > > >
> > > > > > > > @@ -1518,6 +1575,10 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > >       vq->packed.desc_state[id].last = prev;
> > > > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > > > >
> > > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > > +     if (vq->packed.buflen)
> > > > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > > > +
> > > > > > > >       /*
> > > > > > > >        * A driver MUST NOT make the first descriptor in the list
> > > > > > > >        * available before all subsequent descriptors comprising
> > > > > > > > @@ -1718,6 +1779,11 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > > > > > > >               BAD_RING(vq, "id %u is not a head!\n", id);
> > > > > > > >               return NULL;
> > > > > > > >       }
> > > > > > > > +     if (vq->packed.buflen && unlikely(*len > vq->packed.buflen[id])) {
> > > > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > > > +                     *len, vq->packed.buflen[id]);
> > > > > > > > +             return NULL;
> > > > > > > > +     }
> > > > > > > >
> > > > > > > >       /* detach_buf_packed clears data, so grab it now. */
> > > > > > > >       ret = vq->packed.desc_state[id].data;
> > > > > > > > @@ -1937,6 +2003,7 @@ static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > > > >                                vring_packed->device_event_dma_addr,
> > > > > > > >                                dma_dev);
> > > > > > > >
> > > > > > > > +     kfree(vring_packed->buflen);
> > > > > > > >       kfree(vring_packed->desc_state);
> > > > > > > >       kfree(vring_packed->desc_extra);
> > > > > > > >  }
> > > > > > > > @@ -1988,6 +2055,14 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > > > >
> > > > > > > >       vring_packed->vring.num = num;
> > > > > > > >
> > > > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > > > +             vring_packed->buflen =
> > > > > > > > +                     kmalloc_array(num, sizeof(*vring_packed->buflen),
> > > > > > > > +                                   GFP_KERNEL);
> > > > > > > > +             if (!vring_packed->buflen)
> > > > > > > > +                     goto err;
> > > > > > > > +     }
> > > > > > > > +
> > > > > > > >       return 0;
> > > > > > > >
> > > > > > > >  err:
> > > > > > > > --
> > > > > > > > 2.25.1
> > > > > > >
> > > > >
> > >
  
Jason Wang June 1, 2023, 1:12 a.m. UTC | #10
On Wed, May 31, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, May 31, 2023 at 03:36:51PM +0800, Jason Wang wrote:
> > On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote:
> > > > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> > > > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > > > > > > This patch validate
> > > > > > >
> > > > > > > validates
> > > > > > >
> > > > > > > > the used buffer length provided by the device
> > > > > > > > before trying to use it.
> > > > > > >
> > > > > > > before returning it to caller
> > > > > > >
> > > > > > > > This is done by remembering the in buffer
> > > > > > > > length in a dedicated array during virtqueue_add(), then we can fail
> > > > > > > > the virtqueue_get_buf() when we find the device is trying to give us a
> > > > > > > > used buffer length which is greater than we stored before.
> > > > > > >
> > > > > > > than what we stored
> > > > > > >
> > > > > > > >
> > > > > > > > This validation is disable
> > > > > > >
> > > > > > > disabled
> > > > > > >
> > > > > > > > by default via module parameter to unbreak
> > > > > > > > some existing devices since some legacy devices are known to report
> > > > > > > > buggy used length.
> > > > > > > >
> > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > >
> > > > > > > First I'm not merging this without more data about
> > > > > > > what is known to be broken and what is known to work well
> > > > > > > in the commit log. And how exactly do things work if used length
> > > > > > > is wrong?
> > > > > >
> > > > > > Assuming the device is malicious, it would be very hard to answer.
> > > > > > Auditing and fuzzing won't cover every case. Instead of trying to seek
> > > > > > the answer, we can simply make sure the used in buffer length is
> > > > > > validated then we know we're fine or not.
> > > > >
> > > > > To restate the question, you said above "some legacy devices are known
> > > > > to report buggy used length". If they report buggy length then how
> > > > > can things work?
> > > >
> > > > The validation is disabled for legacy device (as stated in the changelog):
> > > >
> > > > static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > > {
> > > >         /*
> > > >          * Several legacy devices are known to produce buggy used
> > > >          * length. In order to let driver work, we won't validate used
> > > >          * buffer length in this case.
> > > >          */
> > > >         if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > >                 return false;
> > > >         if (force_used_validation)
> > > >                 return true;
> > > >         return false;
> > > > }
> > > >
> > > > This seems to be what we've agreed in last version:
> > > >
> > > > https://lore.kernel.org/all/CANLsYkxfhamUU0bb4j7y6N4_G9odKxLCjXxgXEx4SJ6_Kf+M2Q@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56
> > > >
> > > > Thanks
> > > >
> > >
> > > I don't get it. You wrote:
> > >
> > >         This validation is disable
> > >         by default via module parameter to unbreak
> > >         some existing devices since some legacy devices are known to report
> > >         buggy used length.
> > >
> > > which devices?
> >
> > legacy rpmsg and vsock device (before 49d8c5ffad07) at least.
> >
> > > why do you need a module parameter?
> >
> > If we enable it unconditionally for modern devices, it may break some
> > buggy moden device (vsock without a fix as an example).
>
> Presumably this happens because vsock does not have any inbuf at all
> so it ignores the length.
> We had the same with virtio net tx a long time ago.
>
> My suggestion is then not to fail - just cap length at the dma length
> set by driver. Another idea is that if dma len is 0 then don't validate
> at all - driver that did not add any inbufs is not going to look
> at length.
>
> Allowing passing NULL as length and skipping validation
> if len = 0 might also be a good idea.

The above should work for sure, but a question is, in the case of
confidential computing, if a driver detects a malicious device, is it
really good to try to keep working or warn and disable the device? Or
maybe we can have an option for the user to decide?

>
>
> > >
> > >
> > > > >
> > > > > > > Second what's wrong with dma_desc_extra that we already maintain?
> > > > > > > Third motivation - it's part and parcel of the hardening effort yes?
> > > > > >
> > > > > > They are different. dma_desc_extra is for a descriptor ring, but this
> > > > > > is for a used ring. Technically we can go back to iterate on the
> > > > > > descriptor ring for a legal used in buffer length. But it will have
> > > > > > worse performance.
> > > > >
> > > > > I don't really understand. We already iterate when we unmap -
> > > > > all that is necessary is to subtract it from used length, if at
> > > > > the end of the process it is >0 then we know used length is too
> > > > > large.
> > > >
> > > > Yes, but it is the job that is done in the driver level not the virtio
> > > > core.
> > >
> > > What job?
> >
> > I meant the driver can do the validation since it has the knowledge of
> > the buffer length if it wants.
>
> It does not necessarily have it - not if virtio is doing DMA
> mapping.

I don't see any dependencies for DMA, I mean anyhow the in buffer is
allocated by the driver, so it should know how large it is.

>
>
> > > unmap is done in detach_buf_split and detach_buf_packed respectively.
> > > vring_desc_extra isn't even visible outside drivers/virtio/virtio_ring.c
> >
> > desc_extra doesn't contain buffer length for the case of indirect
> > descriptors. So we need to iterate in the descriptors when it looks
> > expensive if we don't need unmap.
> >
> > Thanks
>
> Well at the moment we only don't unmap if DMA API is bypassed.

If I understand correctly, we will probably support premapped DMA
buffers soon (AF_XDP and page pool)?

Thanks

> And then
> we don't need to validate length either. Fundamentally, without
> ACCESS_PLATFORM device is trusted.
>
>
> > >
> > > For drivers that do unmap at driver level - I guess they can do
> > > validation there too.
> > >
> > > > Validation in virtio core is still necessary since they're
> > > > working at different levels and it's hard to force the validation in
> > > > all drivers by codes. Last version introduces a
> > > > suppress_driver_validation to allow the driver to suppress the core
> > > > validation which seems not good, we need a way to force the
> > > > virtio_ring code to do validation before.
> > >
> > > Why do we? If driver validates length virtio_ring does not need to
> > > validate.  If driver does not use length virtio_ring does not need to
> > > validate. core can provide this service for the gazillion non
> > > performance critical drivers that just want to keep things simple,
> > > but the 4-5 critical ones can do their own validation if they want to.
> > >
> > > > Or such stuff could be added
> > > > on top since the validation is by default anyway.
> > > >
> > > > Thanks
> > >
> > >
> > >
> > > > >
> > > > >
> > > > > > > I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before
> > > > > > > we do more hardening. If it's irrevocably broken let's rip it out?
> > > > > >
> > > > > > So the plan is
> > > > > >
> > > > > > 1) finish used ring validation (this had been proposed, merged and
> > > > > > reverted before notification hardening)
> > > > > > 2) do notification hardening on top.
> > > > > >
> > > > > > So let's leave it as is and I will do a rework after we finalize the
> > > > > > used ring validation.
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > >
> > > > > > >
> > > > > > > > ---
> > > > > > > > Changes since V4:
> > > > > > > > - drop the flat for driver to suppress the check
> > > > > > > > - validation is disabled by default
> > > > > > > > - don't do validation for legacy device
> > > > > > > > - rebase and support virtqueue resize
> > > > > > > > ---
> > > > > > > >  drivers/virtio/virtio_ring.c | 75 ++++++++++++++++++++++++++++++++++++
> > > > > > > >  1 file changed, 75 insertions(+)
> > > > > > > >
> > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > index 143f380baa1c..5b151605aaf8 100644
> > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > @@ -15,6 +15,9 @@
> > > > > > > >  #include <linux/spinlock.h>
> > > > > > > >  #include <xen/xen.h>
> > > > > > > >
> > > > > > > > +static bool force_used_validation = false;
> > > > > > > > +module_param(force_used_validation, bool, 0444);
> > > > > > > > +
> > > > > > > >  #ifdef DEBUG
> > > > > > > >  /* For development, we want to crash whenever the ring is screwed. */
> > > > > > > >  #define BAD_RING(_vq, fmt, args...)                          \
> > > > > > > > @@ -105,6 +108,9 @@ struct vring_virtqueue_split {
> > > > > > > >       struct vring_desc_state_split *desc_state;
> > > > > > > >       struct vring_desc_extra *desc_extra;
> > > > > > > >
> > > > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > > > +     u32 *buflen;
> > > > > > > > +
> > > > > > > >       /* DMA address and size information */
> > > > > > > >       dma_addr_t queue_dma_addr;
> > > > > > > >       size_t queue_size_in_bytes;
> > > > > > > > @@ -145,6 +151,9 @@ struct vring_virtqueue_packed {
> > > > > > > >       struct vring_desc_state_packed *desc_state;
> > > > > > > >       struct vring_desc_extra *desc_extra;
> > > > > > > >
> > > > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > > > +     u32 *buflen;
> > > > > > > > +
> > > > > > > >       /* DMA address and size information */
> > > > > > > >       dma_addr_t ring_dma_addr;
> > > > > > > >       dma_addr_t driver_event_dma_addr;
> > > > > > > > @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > > >       unsigned int i, n, avail, descs_used, prev, err_idx;
> > > > > > > >       int head;
> > > > > > > >       bool indirect;
> > > > > > > > +     u32 buflen = 0;
> > > > > > > >
> > > > > > > >       START_USE(vq);
> > > > > > > >
> > > > > > > > @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > > >                                                    VRING_DESC_F_NEXT |
> > > > > > > >                                                    VRING_DESC_F_WRITE,
> > > > > > > >                                                    indirect);
> > > > > > > > +                     buflen += sg->length;
> > > > > > > >               }
> > > > > > > >       }
> > > > > > > >       /* Last one doesn't continue. */
> > > > > > > > @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > > >       else
> > > > > > > >               vq->split.desc_state[head].indir_desc = ctx;
> > > > > > > >
> > > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > > +     if (vq->split.buflen)
> > > > > > > > +             vq->split.buflen[head] = buflen;
> > > > > > > > +
> > > > > > > >       /* Put entry in available array (but don't update avail->idx until they
> > > > > > > >        * do sync). */
> > > > > > > >       avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
> > > > > > > > @@ -861,6 +876,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
> > > > > > > >               BAD_RING(vq, "id %u is not a head!\n", i);
> > > > > > > >               return NULL;
> > > > > > > >       }
> > > > > > > > +     if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
> > > > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > > > +                     *len, vq->split.buflen[i]);
> > > > > > > > +             return NULL;
> > > > > > > > +     }
> > > > > > > >
> > > > > > > >       /* detach_buf_split clears data, so grab it now. */
> > > > > > > >       ret = vq->split.desc_state[i].data;
> > > > > > > > @@ -1085,10 +1105,25 @@ static void vring_free_split(struct vring_virtqueue_split *vring_split,
> > > > > > > >                        vring_split->queue_dma_addr,
> > > > > > > >                        dma_dev);
> > > > > > > >
> > > > > > > > +     kfree(vring_split->buflen);
> > > > > > > >       kfree(vring_split->desc_state);
> > > > > > > >       kfree(vring_split->desc_extra);
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > > > > > > +{
> > > > > > > > +     /*
> > > > > > > > +      * Several legacy devices are known to produce buggy used
> > > > > > > > +      * length. In order to let driver work, we won't validate used
> > > > > > > > +      * buffer length in this case.
> > > > > > > > +      */
> > > > > > > > +     if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > > > > > > +             return false;
> > > > > > > > +     if (force_used_validation)
> > > > > > > > +             return true;
> > > > > > > > +     return false;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > > > >                                  struct virtio_device *vdev,
> > > > > > > >                                  u32 num,
> > > > > > > > @@ -1137,7 +1172,19 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > > > >       vring_split->vring_align = vring_align;
> > > > > > > >       vring_split->may_reduce_num = may_reduce_num;
> > > > > > > >
> > > > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > > > +             vring_split->buflen =
> > > > > > > > +                     kmalloc_array(num, sizeof(*vring_split->buflen),
> > > > > > > > +                                   GFP_KERNEL);
> > > > > > > > +             if (!vring_split->buflen)
> > > > > > > > +                     goto err_buflen;
> > > > > > > > +     }
> > > > > > > > +
> > > > > > > >       return 0;
> > > > > > > > +
> > > > > > > > +err_buflen:
> > > > > > > > +     vring_free_split(vring_split, vdev, dma_dev);
> > > > > > > > +     return -ENOMEM;
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  static struct virtqueue *vring_create_virtqueue_split(
> > > > > > > > @@ -1297,6 +1344,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > > >       unsigned int i, n, err_idx;
> > > > > > > >       u16 head, id;
> > > > > > > >       dma_addr_t addr;
> > > > > > > > +     u32 buflen = 0;
> > > > > > > >
> > > > > > > >       head = vq->packed.next_avail_idx;
> > > > > > > >       desc = alloc_indirect_packed(total_sg, gfp);
> > > > > > > > @@ -1325,6 +1373,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > > >                       desc[i].addr = cpu_to_le64(addr);
> > > > > > > >                       desc[i].len = cpu_to_le32(sg->length);
> > > > > > > >                       i++;
> > > > > > > > +                     if (n >= out_sgs)
> > > > > > > > +                             buflen += sg->length;
> > > > > > > >               }
> > > > > > > >       }
> > > > > > > >
> > > > > > > > @@ -1379,6 +1429,10 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > > >       vq->packed.desc_state[id].last = id;
> > > > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > > > >
> > > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > > +     if (vq->packed.buflen)
> > > > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > > > +
> > > > > > > >       vq->num_added += 1;
> > > > > > > >
> > > > > > > >       pr_debug("Added buffer head %i to %p\n", head, vq);
> > > > > > > > @@ -1416,6 +1470,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > >       __le16 head_flags, flags;
> > > > > > > >       u16 head, id, prev, curr, avail_used_flags;
> > > > > > > >       int err;
> > > > > > > > +     u32 buflen = 0;
> > > > > > > >
> > > > > > > >       START_USE(vq);
> > > > > > > >
> > > > > > > > @@ -1498,6 +1553,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > >                                       1 << VRING_PACKED_DESC_F_AVAIL |
> > > > > > > >                                       1 << VRING_PACKED_DESC_F_USED;
> > > > > > > >                       }
> > > > > > > > +                     if (n >= out_sgs)
> > > > > > > > +                             buflen += sg->length;
> > > > > > > >               }
> > > > > > > >       }
> > > > > > > >
> > > > > > > > @@ -1518,6 +1575,10 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > >       vq->packed.desc_state[id].last = prev;
> > > > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > > > >
> > > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > > +     if (vq->packed.buflen)
> > > > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > > > +
> > > > > > > >       /*
> > > > > > > >        * A driver MUST NOT make the first descriptor in the list
> > > > > > > >        * available before all subsequent descriptors comprising
> > > > > > > > @@ -1718,6 +1779,11 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > > > > > > >               BAD_RING(vq, "id %u is not a head!\n", id);
> > > > > > > >               return NULL;
> > > > > > > >       }
> > > > > > > > +     if (vq->packed.buflen && unlikely(*len > vq->packed.buflen[id])) {
> > > > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > > > +                     *len, vq->packed.buflen[id]);
> > > > > > > > +             return NULL;
> > > > > > > > +     }
> > > > > > > >
> > > > > > > >       /* detach_buf_packed clears data, so grab it now. */
> > > > > > > >       ret = vq->packed.desc_state[id].data;
> > > > > > > > @@ -1937,6 +2003,7 @@ static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > > > >                                vring_packed->device_event_dma_addr,
> > > > > > > >                                dma_dev);
> > > > > > > >
> > > > > > > > +     kfree(vring_packed->buflen);
> > > > > > > >       kfree(vring_packed->desc_state);
> > > > > > > >       kfree(vring_packed->desc_extra);
> > > > > > > >  }
> > > > > > > > @@ -1988,6 +2055,14 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > > > >
> > > > > > > >       vring_packed->vring.num = num;
> > > > > > > >
> > > > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > > > +             vring_packed->buflen =
> > > > > > > > +                     kmalloc_array(num, sizeof(*vring_packed->buflen),
> > > > > > > > +                                   GFP_KERNEL);
> > > > > > > > +             if (!vring_packed->buflen)
> > > > > > > > +                     goto err;
> > > > > > > > +     }
> > > > > > > > +
> > > > > > > >       return 0;
> > > > > > > >
> > > > > > > >  err:
> > > > > > > > --
> > > > > > > > 2.25.1
> > > > > > >
> > > > >
> > >
>
  
Jason Wang June 1, 2023, 1:27 a.m. UTC | #11
On Wed, May 31, 2023 at 6:25 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, May 31, 2023 at 04:26:38PM +0800, Jason Wang wrote:
> > On Wed, May 31, 2023 at 3:36 PM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote:
> > > > > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> > > > > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > > > > > > > This patch validate
> > > > > > > >
> > > > > > > > validates
> > > > > > > >
> > > > > > > > > the used buffer length provided by the device
> > > > > > > > > before trying to use it.
> > > > > > > >
> > > > > > > > before returning it to caller
> > > > > > > >
> > > > > > > > > This is done by remembering the in buffer
> > > > > > > > > length in a dedicated array during virtqueue_add(), then we can fail
> > > > > > > > > the virtqueue_get_buf() when we find the device is trying to give us a
> > > > > > > > > used buffer length which is greater than we stored before.
> > > > > > > >
> > > > > > > > than what we stored
> > > > > > > >
> > > > > > > > >
> > > > > > > > > This validation is disable
> > > > > > > >
> > > > > > > > disabled
> > > > > > > >
> > > > > > > > > by default via module parameter to unbreak
> > > > > > > > > some existing devices since some legacy devices are known to report
> > > > > > > > > buggy used length.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > >
> > > > > > > > First I'm not merging this without more data about
> > > > > > > > what is known to be broken and what is known to work well
> > > > > > > > in the commit log. And how exactly do things work if used length
> > > > > > > > is wrong?
> > > > > > >
> > > > > > > Assuming the device is malicious, it would be very hard to answer.
> > > > > > > Auditing and fuzzing won't cover every case. Instead of trying to seek
> > > > > > > the answer, we can simply make sure the used in buffer length is
> > > > > > > validated then we know we're fine or not.
> > > > > >
> > > > > > To restate the question, you said above "some legacy devices are known
> > > > > > to report buggy used length". If they report buggy length then how
> > > > > > can things work?
> > > > >
> > > > > The validation is disabled for legacy device (as stated in the changelog):
> > > > >
> > > > > static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > > > {
> > > > >         /*
> > > > >          * Several legacy devices are known to produce buggy used
> > > > >          * length. In order to let driver work, we won't validate used
> > > > >          * buffer length in this case.
> > > > >          */
> > > > >         if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > > >                 return false;
> > > > >         if (force_used_validation)
> > > > >                 return true;
> > > > >         return false;
> > > > > }
> > > > >
> > > > > This seems to be what we've agreed in last version:
> > > > >
> > > > > https://lore.kernel.org/all/CANLsYkxfhamUU0bb4j7y6N4_G9odKxLCjXxgXEx4SJ6_Kf+M2Q@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56
> > > > >
> > > > > Thanks
> > > > >
> > > >
> > > > I don't get it. You wrote:
> > > >
> > > >         This validation is disable
> > > >         by default via module parameter to unbreak
> > > >         some existing devices since some legacy devices are known to report
> > > >         buggy used length.
> > > >
> > > > which devices?
> > >
> > > legacy rpmsg and vsock device (before 49d8c5ffad07) at least.
> > >
> > > > why do you need a module parameter?
> > >
> > > If we enable it unconditionally for modern devices, it may break some
> > > buggy moden device (vsock without a fix as an example).
> > >
> > > >
> > > >
> > > > > >
> > > > > > > > Second what's wrong with dma_desc_extra that we already maintain?
> > > > > > > > Third motivation - it's part and parcel of the hardening effort yes?
> > > > > > >
> > > > > > > They are different. dma_desc_extra is for a descriptor ring, but this
> > > > > > > is for a used ring. Technically we can go back to iterate on the
> > > > > > > descriptor ring for a legal used in buffer length. But it will have
> > > > > > > worse performance.
> > > > > >
> > > > > > I don't really understand. We already iterate when we unmap -
> > > > > > all that is necessary is to subtract it from used length, if at
> > > > > > the end of the process it is >0 then we know used length is too
> > > > > > large.
> > > > >
> > > > > Yes, but it is the job that is done in the driver level not the virtio
> > > > > core.
> > > >
> > > > What job?
> > >
> > > I meant the driver can do the validation since it has the knowledge of
> > > the buffer length if it wants.
> > >
> > > > unmap is done in detach_buf_split and detach_buf_packed respectively.
> > > > vring_desc_extra isn't even visible outside drivers/virtio/virtio_ring.c
> > >
> > > desc_extra doesn't contain buffer length for the case of indirect
> > > descriptors. So we need to iterate in the descriptors when it looks
> > > expensive if we don't need unmap.
> > >
> > > Thanks
> > >
> > > >
> > > > For drivers that do unmap at driver level - I guess they can do
> > > > validation there too.
> > > >
> > > > > Validation in virtio core is still necessary since they're
> > > > > working at different levels and it's hard to force the validation in
> > > > > all drivers by codes. Last version introduces a
> > > > > suppress_driver_validation to allow the driver to suppress the core
> > > > > validation which seems not good, we need a way to force the
> > > > > virtio_ring code to do validation before.
> > > >
> > > > Why do we? If driver validates length virtio_ring does not need to
> > > > validate.
> >
> > To be more safe, there's no guarantee that there's no bug in the driver.
>
> Extra options increase testing matrix size so - there be bugs.
> We need to make these decisions for (most) users.

In some cases the requirement is conflict:

1) for "legacy users", the requirement is not to break existing setups
2) for confidential users, the requirement is to detect and fail early
instead of trying to mitigate silently

If we are trying to go in the middle (for example, doing a cap) it may
end up with a result that can satisfy neither:

1) it might still break in some drivers and setups like legacy, then
we need an option to disable it
2) it won't satisfy the users that ask for security

If we had an option, the management layer can choose to provision
VM/guests with the good kernel command line then we are fine.

>
> > >  If driver does not use length virtio_ring does not need to
> > > > validate.
> >
> > This could be done on top assuming the validation is disabled by
> > default. But if the administrator wants to have belt and braces we
> > need to leave an option for them.
> >
> > Thanks
>
> No, we don't regress then fix on top.

I'm not sure I understand here. It would be very hard to say there
won't be regression for any new codes, especially virtio had already
supported several different types of devices and drivers.

> As for mod parameter I am not impressed -
> no one's going to have the time or inclination to do the requisite
> testing to know whether the module parameter is safe.

Actually, that's one advantage, most of the users don't care so the
validation is just suppressed for them. Otherwise they might get
surprised after upgrading. This is what I learnt from the past
hardening effort like this and the IRQ hardening.

For the rest like confidential vendors, they need to test and make
sure such kernel command line were provisioned to the VM.

Thanks

>
> > > core can provide this service for the gazillion non
> > > > performance critical drivers that just want to keep things simple,
> > > > but the 4-5 critical ones can do their own validation if they want to.
> > > >
> > > > > Or such stuff could be added
> > > > > on top since the validation is by default anyway.
> > > > >
> > > > > Thanks
> > > >
> > > >
> > > >
> > > > > >
> > > > > >
> > > > > > > > I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before
> > > > > > > > we do more hardening. If it's irrevocably broken let's rip it out?
> > > > > > >
> > > > > > > So the plan is
> > > > > > >
> > > > > > > 1) finish used ring validation (this had been proposed, merged and
> > > > > > > reverted before notification hardening)
> > > > > > > 2) do notification hardening on top.
> > > > > > >
> > > > > > > So let's leave it as is and I will do a rework after we finalize the
> > > > > > > used ring validation.
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > ---
> > > > > > > > > Changes since V4:
> > > > > > > > > - drop the flat for driver to suppress the check
> > > > > > > > > - validation is disabled by default
> > > > > > > > > - don't do validation for legacy device
> > > > > > > > > - rebase and support virtqueue resize
> > > > > > > > > ---
> > > > > > > > >  drivers/virtio/virtio_ring.c | 75 ++++++++++++++++++++++++++++++++++++
> > > > > > > > >  1 file changed, 75 insertions(+)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > index 143f380baa1c..5b151605aaf8 100644
> > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > @@ -15,6 +15,9 @@
> > > > > > > > >  #include <linux/spinlock.h>
> > > > > > > > >  #include <xen/xen.h>
> > > > > > > > >
> > > > > > > > > +static bool force_used_validation = false;
> > > > > > > > > +module_param(force_used_validation, bool, 0444);
> > > > > > > > > +
> > > > > > > > >  #ifdef DEBUG
> > > > > > > > >  /* For development, we want to crash whenever the ring is screwed. */
> > > > > > > > >  #define BAD_RING(_vq, fmt, args...)                          \
> > > > > > > > > @@ -105,6 +108,9 @@ struct vring_virtqueue_split {
> > > > > > > > >       struct vring_desc_state_split *desc_state;
> > > > > > > > >       struct vring_desc_extra *desc_extra;
> > > > > > > > >
> > > > > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > > > > +     u32 *buflen;
> > > > > > > > > +
> > > > > > > > >       /* DMA address and size information */
> > > > > > > > >       dma_addr_t queue_dma_addr;
> > > > > > > > >       size_t queue_size_in_bytes;
> > > > > > > > > @@ -145,6 +151,9 @@ struct vring_virtqueue_packed {
> > > > > > > > >       struct vring_desc_state_packed *desc_state;
> > > > > > > > >       struct vring_desc_extra *desc_extra;
> > > > > > > > >
> > > > > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > > > > +     u32 *buflen;
> > > > > > > > > +
> > > > > > > > >       /* DMA address and size information */
> > > > > > > > >       dma_addr_t ring_dma_addr;
> > > > > > > > >       dma_addr_t driver_event_dma_addr;
> > > > > > > > > @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > > > >       unsigned int i, n, avail, descs_used, prev, err_idx;
> > > > > > > > >       int head;
> > > > > > > > >       bool indirect;
> > > > > > > > > +     u32 buflen = 0;
> > > > > > > > >
> > > > > > > > >       START_USE(vq);
> > > > > > > > >
> > > > > > > > > @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > > > >                                                    VRING_DESC_F_NEXT |
> > > > > > > > >                                                    VRING_DESC_F_WRITE,
> > > > > > > > >                                                    indirect);
> > > > > > > > > +                     buflen += sg->length;
> > > > > > > > >               }
> > > > > > > > >       }
> > > > > > > > >       /* Last one doesn't continue. */
> > > > > > > > > @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > > > >       else
> > > > > > > > >               vq->split.desc_state[head].indir_desc = ctx;
> > > > > > > > >
> > > > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > > > +     if (vq->split.buflen)
> > > > > > > > > +             vq->split.buflen[head] = buflen;
> > > > > > > > > +
> > > > > > > > >       /* Put entry in available array (but don't update avail->idx until they
> > > > > > > > >        * do sync). */
> > > > > > > > >       avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
> > > > > > > > > @@ -861,6 +876,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
> > > > > > > > >               BAD_RING(vq, "id %u is not a head!\n", i);
> > > > > > > > >               return NULL;
> > > > > > > > >       }
> > > > > > > > > +     if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
> > > > > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > > > > +                     *len, vq->split.buflen[i]);
> > > > > > > > > +             return NULL;
> > > > > > > > > +     }
> > > > > > > > >
> > > > > > > > >       /* detach_buf_split clears data, so grab it now. */
> > > > > > > > >       ret = vq->split.desc_state[i].data;
> > > > > > > > > @@ -1085,10 +1105,25 @@ static void vring_free_split(struct vring_virtqueue_split *vring_split,
> > > > > > > > >                        vring_split->queue_dma_addr,
> > > > > > > > >                        dma_dev);
> > > > > > > > >
> > > > > > > > > +     kfree(vring_split->buflen);
> > > > > > > > >       kfree(vring_split->desc_state);
> > > > > > > > >       kfree(vring_split->desc_extra);
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > +static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > > > > > > > +{
> > > > > > > > > +     /*
> > > > > > > > > +      * Several legacy devices are known to produce buggy used
> > > > > > > > > +      * length. In order to let driver work, we won't validate used
> > > > > > > > > +      * buffer length in this case.
> > > > > > > > > +      */
> > > > > > > > > +     if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > > > > > > > +             return false;
> > > > > > > > > +     if (force_used_validation)
> > > > > > > > > +             return true;
> > > > > > > > > +     return false;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >  static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > > > > >                                  struct virtio_device *vdev,
> > > > > > > > >                                  u32 num,
> > > > > > > > > @@ -1137,7 +1172,19 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > > > > >       vring_split->vring_align = vring_align;
> > > > > > > > >       vring_split->may_reduce_num = may_reduce_num;
> > > > > > > > >
> > > > > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > > > > +             vring_split->buflen =
> > > > > > > > > +                     kmalloc_array(num, sizeof(*vring_split->buflen),
> > > > > > > > > +                                   GFP_KERNEL);
> > > > > > > > > +             if (!vring_split->buflen)
> > > > > > > > > +                     goto err_buflen;
> > > > > > > > > +     }
> > > > > > > > > +
> > > > > > > > >       return 0;
> > > > > > > > > +
> > > > > > > > > +err_buflen:
> > > > > > > > > +     vring_free_split(vring_split, vdev, dma_dev);
> > > > > > > > > +     return -ENOMEM;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > >  static struct virtqueue *vring_create_virtqueue_split(
> > > > > > > > > @@ -1297,6 +1344,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > > > >       unsigned int i, n, err_idx;
> > > > > > > > >       u16 head, id;
> > > > > > > > >       dma_addr_t addr;
> > > > > > > > > +     u32 buflen = 0;
> > > > > > > > >
> > > > > > > > >       head = vq->packed.next_avail_idx;
> > > > > > > > >       desc = alloc_indirect_packed(total_sg, gfp);
> > > > > > > > > @@ -1325,6 +1373,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > > > >                       desc[i].addr = cpu_to_le64(addr);
> > > > > > > > >                       desc[i].len = cpu_to_le32(sg->length);
> > > > > > > > >                       i++;
> > > > > > > > > +                     if (n >= out_sgs)
> > > > > > > > > +                             buflen += sg->length;
> > > > > > > > >               }
> > > > > > > > >       }
> > > > > > > > >
> > > > > > > > > @@ -1379,6 +1429,10 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > > > >       vq->packed.desc_state[id].last = id;
> > > > > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > > > > >
> > > > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > > > +     if (vq->packed.buflen)
> > > > > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > > > > +
> > > > > > > > >       vq->num_added += 1;
> > > > > > > > >
> > > > > > > > >       pr_debug("Added buffer head %i to %p\n", head, vq);
> > > > > > > > > @@ -1416,6 +1470,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > > >       __le16 head_flags, flags;
> > > > > > > > >       u16 head, id, prev, curr, avail_used_flags;
> > > > > > > > >       int err;
> > > > > > > > > +     u32 buflen = 0;
> > > > > > > > >
> > > > > > > > >       START_USE(vq);
> > > > > > > > >
> > > > > > > > > @@ -1498,6 +1553,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > > >                                       1 << VRING_PACKED_DESC_F_AVAIL |
> > > > > > > > >                                       1 << VRING_PACKED_DESC_F_USED;
> > > > > > > > >                       }
> > > > > > > > > +                     if (n >= out_sgs)
> > > > > > > > > +                             buflen += sg->length;
> > > > > > > > >               }
> > > > > > > > >       }
> > > > > > > > >
> > > > > > > > > @@ -1518,6 +1575,10 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > > >       vq->packed.desc_state[id].last = prev;
> > > > > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > > > > >
> > > > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > > > +     if (vq->packed.buflen)
> > > > > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > > > > +
> > > > > > > > >       /*
> > > > > > > > >        * A driver MUST NOT make the first descriptor in the list
> > > > > > > > >        * available before all subsequent descriptors comprising
> > > > > > > > > @@ -1718,6 +1779,11 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > > > > > > > >               BAD_RING(vq, "id %u is not a head!\n", id);
> > > > > > > > >               return NULL;
> > > > > > > > >       }
> > > > > > > > > +     if (vq->packed.buflen && unlikely(*len > vq->packed.buflen[id])) {
> > > > > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > > > > +                     *len, vq->packed.buflen[id]);
> > > > > > > > > +             return NULL;
> > > > > > > > > +     }
> > > > > > > > >
> > > > > > > > >       /* detach_buf_packed clears data, so grab it now. */
> > > > > > > > >       ret = vq->packed.desc_state[id].data;
> > > > > > > > > @@ -1937,6 +2003,7 @@ static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > > > > >                                vring_packed->device_event_dma_addr,
> > > > > > > > >                                dma_dev);
> > > > > > > > >
> > > > > > > > > +     kfree(vring_packed->buflen);
> > > > > > > > >       kfree(vring_packed->desc_state);
> > > > > > > > >       kfree(vring_packed->desc_extra);
> > > > > > > > >  }
> > > > > > > > > @@ -1988,6 +2055,14 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > > > > >
> > > > > > > > >       vring_packed->vring.num = num;
> > > > > > > > >
> > > > > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > > > > +             vring_packed->buflen =
> > > > > > > > > +                     kmalloc_array(num, sizeof(*vring_packed->buflen),
> > > > > > > > > +                                   GFP_KERNEL);
> > > > > > > > > +             if (!vring_packed->buflen)
> > > > > > > > > +                     goto err;
> > > > > > > > > +     }
> > > > > > > > > +
> > > > > > > > >       return 0;
> > > > > > > > >
> > > > > > > > >  err:
> > > > > > > > > --
> > > > > > > > > 2.25.1
> > > > > > > >
> > > > > >
> > > >
>
  
Michael S. Tsirkin June 1, 2023, 3:34 a.m. UTC | #12
On Thu, Jun 01, 2023 at 09:12:53AM +0800, Jason Wang wrote:
> On Wed, May 31, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, May 31, 2023 at 03:36:51PM +0800, Jason Wang wrote:
> > > On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote:
> > > > > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> > > > > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > > > > > > > This patch validate
> > > > > > > >
> > > > > > > > validates
> > > > > > > >
> > > > > > > > > the used buffer length provided by the device
> > > > > > > > > before trying to use it.
> > > > > > > >
> > > > > > > > before returning it to caller
> > > > > > > >
> > > > > > > > > This is done by remembering the in buffer
> > > > > > > > > length in a dedicated array during virtqueue_add(), then we can fail
> > > > > > > > > the virtqueue_get_buf() when we find the device is trying to give us a
> > > > > > > > > used buffer length which is greater than we stored before.
> > > > > > > >
> > > > > > > > than what we stored
> > > > > > > >
> > > > > > > > >
> > > > > > > > > This validation is disable
> > > > > > > >
> > > > > > > > disabled
> > > > > > > >
> > > > > > > > > by default via module parameter to unbreak
> > > > > > > > > some existing devices since some legacy devices are known to report
> > > > > > > > > buggy used length.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > >
> > > > > > > > First I'm not merging this without more data about
> > > > > > > > what is known to be broken and what is known to work well
> > > > > > > > in the commit log. And how exactly do things work if used length
> > > > > > > > is wrong?
> > > > > > >
> > > > > > > Assuming the device is malicious, it would be very hard to answer.
> > > > > > > Auditing and fuzzing won't cover every case. Instead of trying to seek
> > > > > > > the answer, we can simply make sure the used in buffer length is
> > > > > > > validated then we know we're fine or not.
> > > > > >
> > > > > > To restate the question, you said above "some legacy devices are known
> > > > > > to report buggy used length". If they report buggy length then how
> > > > > > can things work?
> > > > >
> > > > > The validation is disabled for legacy device (as stated in the changelog):
> > > > >
> > > > > static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > > > {
> > > > >         /*
> > > > >          * Several legacy devices are known to produce buggy used
> > > > >          * length. In order to let driver work, we won't validate used
> > > > >          * buffer length in this case.
> > > > >          */
> > > > >         if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > > >                 return false;
> > > > >         if (force_used_validation)
> > > > >                 return true;
> > > > >         return false;
> > > > > }
> > > > >
> > > > > This seems to be what we've agreed in last version:
> > > > >
> > > > > https://lore.kernel.org/all/CANLsYkxfhamUU0bb4j7y6N4_G9odKxLCjXxgXEx4SJ6_Kf+M2Q@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56
> > > > >
> > > > > Thanks
> > > > >
> > > >
> > > > I don't get it. You wrote:
> > > >
> > > >         This validation is disable
> > > >         by default via module parameter to unbreak
> > > >         some existing devices since some legacy devices are known to report
> > > >         buggy used length.
> > > >
> > > > which devices?
> > >
> > > legacy rpmsg and vsock device (before 49d8c5ffad07) at least.
> > >
> > > > why do you need a module parameter?
> > >
> > > If we enable it unconditionally for modern devices, it may break some
> > > buggy moden device (vsock without a fix as an example).
> >
> > Presumably this happens because vsock does not have any inbuf at all
> > so it ignores the length.
> > We had the same with virtio net tx a long time ago.
> >
> > My suggestion is then not to fail - just cap length at the dma length
> > set by driver. Another idea is that if dma len is 0 then don't validate
> > at all - driver that did not add any inbufs is not going to look
> > at length.
> >
> > Allowing passing NULL as length and skipping validation
> > if len = 0 might also be a good idea.
> 
> The above should work for sure, but a question is, in the case of
> confidential computing, if a driver detects a malicious device, is it
> really good to try to keep working or warn and disable the device? Or
> maybe we can have an option for the user to decide?

A minor bug in an unused field is not worth panicing about.

> >
> >
> > > >
> > > >
> > > > > >
> > > > > > > > Second what's wrong with dma_desc_extra that we already maintain?
> > > > > > > > Third motivation - it's part and parcel of the hardening effort yes?
> > > > > > >
> > > > > > > They are different. dma_desc_extra is for a descriptor ring, but this
> > > > > > > is for a used ring. Technically we can go back to iterate on the
> > > > > > > descriptor ring for a legal used in buffer length. But it will have
> > > > > > > worse performance.
> > > > > >
> > > > > > I don't really understand. We already iterate when we unmap -
> > > > > > all that is necessary is to subtract it from used length, if at
> > > > > > the end of the process it is >0 then we know used length is too
> > > > > > large.
> > > > >
> > > > > Yes, but it is the job that is done in the driver level not the virtio
> > > > > core.
> > > >
> > > > What job?
> > >
> > > I meant the driver can do the validation since it has the knowledge of
> > > the buffer length if it wants.
> >
> > It does not necessarily have it - not if virtio is doing DMA
> > mapping.
> 
> I don't see any dependencies for DMA, I mean anyhow the in buffer is
> allocated by the driver, so it should know how large it is.
> 
> >
> >
> > > > unmap is done in detach_buf_split and detach_buf_packed respectively.
> > > > vring_desc_extra isn't even visible outside drivers/virtio/virtio_ring.c
> > >
> > > desc_extra doesn't contain buffer length for the case of indirect
> > > descriptors. So we need to iterate in the descriptors when it looks
> > > expensive if we don't need unmap.
> > >
> > > Thanks
> >
> > Well at the moment we only don't unmap if DMA API is bypassed.
> 
> If I understand correctly, we will probably support premapped DMA
> buffers soon (AF_XDP and page pool)?
> 
> Thanks
> 
> > And then
> > we don't need to validate length either. Fundamentally, without
> > ACCESS_PLATFORM device is trusted.
> >
> >
> > > >
> > > > For drivers that do unmap at driver level - I guess they can do
> > > > validation there too.
> > > >
> > > > > Validation in virtio core is still necessary since they're
> > > > > working at different levels and it's hard to force the validation in
> > > > > all drivers by codes. Last version introduces a
> > > > > suppress_driver_validation to allow the driver to suppress the core
> > > > > validation which seems not good, we need a way to force the
> > > > > virtio_ring code to do validation before.
> > > >
> > > > Why do we? If driver validates length virtio_ring does not need to
> > > > validate.  If driver does not use length virtio_ring does not need to
> > > > validate. core can provide this service for the gazillion non
> > > > performance critical drivers that just want to keep things simple,
> > > > but the 4-5 critical ones can do their own validation if they want to.
> > > >
> > > > > Or such stuff could be added
> > > > > on top since the validation is by default anyway.
> > > > >
> > > > > Thanks
> > > >
> > > >
> > > >
> > > > > >
> > > > > >
> > > > > > > > I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before
> > > > > > > > we do more hardening. If it's irrevocably broken let's rip it out?
> > > > > > >
> > > > > > > So the plan is
> > > > > > >
> > > > > > > 1) finish used ring validation (this had been proposed, merged and
> > > > > > > reverted before notification hardening)
> > > > > > > 2) do notification hardening on top.
> > > > > > >
> > > > > > > So let's leave it as is and I will do a rework after we finalize the
> > > > > > > used ring validation.
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > ---
> > > > > > > > > Changes since V4:
> > > > > > > > > - drop the flat for driver to suppress the check
> > > > > > > > > - validation is disabled by default
> > > > > > > > > - don't do validation for legacy device
> > > > > > > > > - rebase and support virtqueue resize
> > > > > > > > > ---
> > > > > > > > >  drivers/virtio/virtio_ring.c | 75 ++++++++++++++++++++++++++++++++++++
> > > > > > > > >  1 file changed, 75 insertions(+)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > index 143f380baa1c..5b151605aaf8 100644
> > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > @@ -15,6 +15,9 @@
> > > > > > > > >  #include <linux/spinlock.h>
> > > > > > > > >  #include <xen/xen.h>
> > > > > > > > >
> > > > > > > > > +static bool force_used_validation = false;
> > > > > > > > > +module_param(force_used_validation, bool, 0444);
> > > > > > > > > +
> > > > > > > > >  #ifdef DEBUG
> > > > > > > > >  /* For development, we want to crash whenever the ring is screwed. */
> > > > > > > > >  #define BAD_RING(_vq, fmt, args...)                          \
> > > > > > > > > @@ -105,6 +108,9 @@ struct vring_virtqueue_split {
> > > > > > > > >       struct vring_desc_state_split *desc_state;
> > > > > > > > >       struct vring_desc_extra *desc_extra;
> > > > > > > > >
> > > > > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > > > > +     u32 *buflen;
> > > > > > > > > +
> > > > > > > > >       /* DMA address and size information */
> > > > > > > > >       dma_addr_t queue_dma_addr;
> > > > > > > > >       size_t queue_size_in_bytes;
> > > > > > > > > @@ -145,6 +151,9 @@ struct vring_virtqueue_packed {
> > > > > > > > >       struct vring_desc_state_packed *desc_state;
> > > > > > > > >       struct vring_desc_extra *desc_extra;
> > > > > > > > >
> > > > > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > > > > +     u32 *buflen;
> > > > > > > > > +
> > > > > > > > >       /* DMA address and size information */
> > > > > > > > >       dma_addr_t ring_dma_addr;
> > > > > > > > >       dma_addr_t driver_event_dma_addr;
> > > > > > > > > @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > > > >       unsigned int i, n, avail, descs_used, prev, err_idx;
> > > > > > > > >       int head;
> > > > > > > > >       bool indirect;
> > > > > > > > > +     u32 buflen = 0;
> > > > > > > > >
> > > > > > > > >       START_USE(vq);
> > > > > > > > >
> > > > > > > > > @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > > > >                                                    VRING_DESC_F_NEXT |
> > > > > > > > >                                                    VRING_DESC_F_WRITE,
> > > > > > > > >                                                    indirect);
> > > > > > > > > +                     buflen += sg->length;
> > > > > > > > >               }
> > > > > > > > >       }
> > > > > > > > >       /* Last one doesn't continue. */
> > > > > > > > > @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > > > >       else
> > > > > > > > >               vq->split.desc_state[head].indir_desc = ctx;
> > > > > > > > >
> > > > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > > > +     if (vq->split.buflen)
> > > > > > > > > +             vq->split.buflen[head] = buflen;
> > > > > > > > > +
> > > > > > > > >       /* Put entry in available array (but don't update avail->idx until they
> > > > > > > > >        * do sync). */
> > > > > > > > >       avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
> > > > > > > > > @@ -861,6 +876,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
> > > > > > > > >               BAD_RING(vq, "id %u is not a head!\n", i);
> > > > > > > > >               return NULL;
> > > > > > > > >       }
> > > > > > > > > +     if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
> > > > > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > > > > +                     *len, vq->split.buflen[i]);
> > > > > > > > > +             return NULL;
> > > > > > > > > +     }
> > > > > > > > >
> > > > > > > > >       /* detach_buf_split clears data, so grab it now. */
> > > > > > > > >       ret = vq->split.desc_state[i].data;
> > > > > > > > > @@ -1085,10 +1105,25 @@ static void vring_free_split(struct vring_virtqueue_split *vring_split,
> > > > > > > > >                        vring_split->queue_dma_addr,
> > > > > > > > >                        dma_dev);
> > > > > > > > >
> > > > > > > > > +     kfree(vring_split->buflen);
> > > > > > > > >       kfree(vring_split->desc_state);
> > > > > > > > >       kfree(vring_split->desc_extra);
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > +static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > > > > > > > +{
> > > > > > > > > +     /*
> > > > > > > > > +      * Several legacy devices are known to produce buggy used
> > > > > > > > > +      * length. In order to let driver work, we won't validate used
> > > > > > > > > +      * buffer length in this case.
> > > > > > > > > +      */
> > > > > > > > > +     if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > > > > > > > +             return false;
> > > > > > > > > +     if (force_used_validation)
> > > > > > > > > +             return true;
> > > > > > > > > +     return false;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >  static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > > > > >                                  struct virtio_device *vdev,
> > > > > > > > >                                  u32 num,
> > > > > > > > > @@ -1137,7 +1172,19 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > > > > >       vring_split->vring_align = vring_align;
> > > > > > > > >       vring_split->may_reduce_num = may_reduce_num;
> > > > > > > > >
> > > > > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > > > > +             vring_split->buflen =
> > > > > > > > > +                     kmalloc_array(num, sizeof(*vring_split->buflen),
> > > > > > > > > +                                   GFP_KERNEL);
> > > > > > > > > +             if (!vring_split->buflen)
> > > > > > > > > +                     goto err_buflen;
> > > > > > > > > +     }
> > > > > > > > > +
> > > > > > > > >       return 0;
> > > > > > > > > +
> > > > > > > > > +err_buflen:
> > > > > > > > > +     vring_free_split(vring_split, vdev, dma_dev);
> > > > > > > > > +     return -ENOMEM;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > >  static struct virtqueue *vring_create_virtqueue_split(
> > > > > > > > > @@ -1297,6 +1344,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > > > >       unsigned int i, n, err_idx;
> > > > > > > > >       u16 head, id;
> > > > > > > > >       dma_addr_t addr;
> > > > > > > > > +     u32 buflen = 0;
> > > > > > > > >
> > > > > > > > >       head = vq->packed.next_avail_idx;
> > > > > > > > >       desc = alloc_indirect_packed(total_sg, gfp);
> > > > > > > > > @@ -1325,6 +1373,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > > > >                       desc[i].addr = cpu_to_le64(addr);
> > > > > > > > >                       desc[i].len = cpu_to_le32(sg->length);
> > > > > > > > >                       i++;
> > > > > > > > > +                     if (n >= out_sgs)
> > > > > > > > > +                             buflen += sg->length;
> > > > > > > > >               }
> > > > > > > > >       }
> > > > > > > > >
> > > > > > > > > @@ -1379,6 +1429,10 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > > > >       vq->packed.desc_state[id].last = id;
> > > > > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > > > > >
> > > > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > > > +     if (vq->packed.buflen)
> > > > > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > > > > +
> > > > > > > > >       vq->num_added += 1;
> > > > > > > > >
> > > > > > > > >       pr_debug("Added buffer head %i to %p\n", head, vq);
> > > > > > > > > @@ -1416,6 +1470,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > > >       __le16 head_flags, flags;
> > > > > > > > >       u16 head, id, prev, curr, avail_used_flags;
> > > > > > > > >       int err;
> > > > > > > > > +     u32 buflen = 0;
> > > > > > > > >
> > > > > > > > >       START_USE(vq);
> > > > > > > > >
> > > > > > > > > @@ -1498,6 +1553,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > > >                                       1 << VRING_PACKED_DESC_F_AVAIL |
> > > > > > > > >                                       1 << VRING_PACKED_DESC_F_USED;
> > > > > > > > >                       }
> > > > > > > > > +                     if (n >= out_sgs)
> > > > > > > > > +                             buflen += sg->length;
> > > > > > > > >               }
> > > > > > > > >       }
> > > > > > > > >
> > > > > > > > > @@ -1518,6 +1575,10 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > > >       vq->packed.desc_state[id].last = prev;
> > > > > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > > > > >
> > > > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > > > +     if (vq->packed.buflen)
> > > > > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > > > > +
> > > > > > > > >       /*
> > > > > > > > >        * A driver MUST NOT make the first descriptor in the list
> > > > > > > > >        * available before all subsequent descriptors comprising
> > > > > > > > > @@ -1718,6 +1779,11 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > > > > > > > >               BAD_RING(vq, "id %u is not a head!\n", id);
> > > > > > > > >               return NULL;
> > > > > > > > >       }
> > > > > > > > > +     if (vq->packed.buflen && unlikely(*len > vq->packed.buflen[id])) {
> > > > > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > > > > +                     *len, vq->packed.buflen[id]);
> > > > > > > > > +             return NULL;
> > > > > > > > > +     }
> > > > > > > > >
> > > > > > > > >       /* detach_buf_packed clears data, so grab it now. */
> > > > > > > > >       ret = vq->packed.desc_state[id].data;
> > > > > > > > > @@ -1937,6 +2003,7 @@ static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > > > > >                                vring_packed->device_event_dma_addr,
> > > > > > > > >                                dma_dev);
> > > > > > > > >
> > > > > > > > > +     kfree(vring_packed->buflen);
> > > > > > > > >       kfree(vring_packed->desc_state);
> > > > > > > > >       kfree(vring_packed->desc_extra);
> > > > > > > > >  }
> > > > > > > > > @@ -1988,6 +2055,14 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > > > > >
> > > > > > > > >       vring_packed->vring.num = num;
> > > > > > > > >
> > > > > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > > > > +             vring_packed->buflen =
> > > > > > > > > +                     kmalloc_array(num, sizeof(*vring_packed->buflen),
> > > > > > > > > +                                   GFP_KERNEL);
> > > > > > > > > +             if (!vring_packed->buflen)
> > > > > > > > > +                     goto err;
> > > > > > > > > +     }
> > > > > > > > > +
> > > > > > > > >       return 0;
> > > > > > > > >
> > > > > > > > >  err:
> > > > > > > > > --
> > > > > > > > > 2.25.1
> > > > > > > >
> > > > > >
> > > >
> >
  
Michael S. Tsirkin June 1, 2023, 3:40 a.m. UTC | #13
On Thu, Jun 01, 2023 at 09:27:08AM +0800, Jason Wang wrote:
> On Wed, May 31, 2023 at 6:25 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, May 31, 2023 at 04:26:38PM +0800, Jason Wang wrote:
> > > On Wed, May 31, 2023 at 3:36 PM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote:
> > > > > > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> > > > > > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > > > > > > > > This patch validate
> > > > > > > > >
> > > > > > > > > validates
> > > > > > > > >
> > > > > > > > > > the used buffer length provided by the device
> > > > > > > > > > before trying to use it.
> > > > > > > > >
> > > > > > > > > before returning it to caller
> > > > > > > > >
> > > > > > > > > > This is done by remembering the in buffer
> > > > > > > > > > length in a dedicated array during virtqueue_add(), then we can fail
> > > > > > > > > > the virtqueue_get_buf() when we find the device is trying to give us a
> > > > > > > > > > used buffer length which is greater than we stored before.
> > > > > > > > >
> > > > > > > > > than what we stored
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > This validation is disable
> > > > > > > > >
> > > > > > > > > disabled
> > > > > > > > >
> > > > > > > > > > by default via module parameter to unbreak
> > > > > > > > > > some existing devices since some legacy devices are known to report
> > > > > > > > > > buggy used length.
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > >
> > > > > > > > > First I'm not merging this without more data about
> > > > > > > > > what is known to be broken and what is known to work well
> > > > > > > > > in the commit log. And how exactly do things work if used length
> > > > > > > > > is wrong?
> > > > > > > >
> > > > > > > > Assuming the device is malicious, it would be very hard to answer.
> > > > > > > > Auditing and fuzzing won't cover every case. Instead of trying to seek
> > > > > > > > the answer, we can simply make sure the used in buffer length is
> > > > > > > > validated then we know we're fine or not.
> > > > > > >
> > > > > > > To restate the question, you said above "some legacy devices are known
> > > > > > > to report buggy used length". If they report buggy length then how
> > > > > > > can things work?
> > > > > >
> > > > > > The validation is disabled for legacy device (as stated in the changelog):
> > > > > >
> > > > > > static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > > > > {
> > > > > >         /*
> > > > > >          * Several legacy devices are known to produce buggy used
> > > > > >          * length. In order to let driver work, we won't validate used
> > > > > >          * buffer length in this case.
> > > > > >          */
> > > > > >         if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > > > >                 return false;
> > > > > >         if (force_used_validation)
> > > > > >                 return true;
> > > > > >         return false;
> > > > > > }
> > > > > >
> > > > > > This seems to be what we've agreed in last version:
> > > > > >
> > > > > > https://lore.kernel.org/all/CANLsYkxfhamUU0bb4j7y6N4_G9odKxLCjXxgXEx4SJ6_Kf+M2Q@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > >
> > > > > I don't get it. You wrote:
> > > > >
> > > > >         This validation is disable
> > > > >         by default via module parameter to unbreak
> > > > >         some existing devices since some legacy devices are known to report
> > > > >         buggy used length.
> > > > >
> > > > > which devices?
> > > >
> > > > legacy rpmsg and vsock device (before 49d8c5ffad07) at least.
> > > >
> > > > > why do you need a module parameter?
> > > >
> > > > If we enable it unconditionally for modern devices, it may break some
> > > > buggy moden device (vsock without a fix as an example).
> > > >
> > > > >
> > > > >
> > > > > > >
> > > > > > > > > Second what's wrong with dma_desc_extra that we already maintain?
> > > > > > > > > Third motivation - it's part and parcel of the hardening effort yes?
> > > > > > > >
> > > > > > > > They are different. dma_desc_extra is for a descriptor ring, but this
> > > > > > > > is for a used ring. Technically we can go back to iterate on the
> > > > > > > > descriptor ring for a legal used in buffer length. But it will have
> > > > > > > > worse performance.
> > > > > > >
> > > > > > > I don't really understand. We already iterate when we unmap -
> > > > > > > all that is necessary is to subtract it from used length, if at
> > > > > > > the end of the process it is >0 then we know used length is too
> > > > > > > large.
> > > > > >
> > > > > > Yes, but it is the job that is done in the driver level not the virtio
> > > > > > core.
> > > > >
> > > > > What job?
> > > >
> > > > I meant the driver can do the validation since it has the knowledge of
> > > > the buffer length if it wants.
> > > >
> > > > > unmap is done in detach_buf_split and detach_buf_packed respectively.
> > > > > vring_desc_extra isn't even visible outside drivers/virtio/virtio_ring.c
> > > >
> > > > desc_extra doesn't contain buffer length for the case of indirect
> > > > descriptors. So we need to iterate in the descriptors when it looks
> > > > expensive if we don't need unmap.
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > For drivers that do unmap at driver level - I guess they can do
> > > > > validation there too.
> > > > >
> > > > > > Validation in virtio core is still necessary since they're
> > > > > > working at different levels and it's hard to force the validation in
> > > > > > all drivers by codes. Last version introduces a
> > > > > > suppress_driver_validation to allow the driver to suppress the core
> > > > > > validation which seems not good, we need a way to force the
> > > > > > virtio_ring code to do validation before.
> > > > >
> > > > > Why do we? If driver validates length virtio_ring does not need to
> > > > > validate.
> > >
> > > To be more safe, there's no guarantee that there's no bug in the driver.
> >
> > Extra options increase testing matrix size so - there be bugs.
> > We need to make these decisions for (most) users.
> 
> In some cases the requirement is conflict:
> 
> 1) for "legacy users", the requirement is not to break existing setups
> 2) for confidential users, the requirement is to detect and fail early
> instead of trying to mitigate silently

I think 2 is a red herring. No one wants panics/failures, it is just
tolerated in the confidential setup.

> If we are trying to go in the middle (for example, doing a cap) it may
> end up with a result that can satisfy neither:
> 
> 1) it might still break in some drivers and setups like legacy, then
> we need an option to disable it

I don't see how cap can break anything.

> 2) it won't satisfy the users that ask for security
> 
> If we had an option, the management layer can choose to provision
> VM/guests with the good kernel command line then we are fine.
> 
> >
> > > >  If driver does not use length virtio_ring does not need to
> > > > > validate.
> > >
> > > This could be done on top assuming the validation is disabled by
> > > default. But if the administrator wants to have belt and braces we
> > > need to leave an option for them.
> > >
> > > Thanks
> >
> > No, we don't regress then fix on top.
> 
> I'm not sure I understand here. It would be very hard to say there
> won't be regression for any new codes, especially virtio had already
> supported several different types of devices and drivers.

Sure. We test what we can, just don't introduce untested code or known
regressions.

> > As for mod parameter I am not impressed -
> > no one's going to have the time or inclination to do the requisite
> > testing to know whether the module parameter is safe.
> 
> Actually, that's one advantage, most of the users don't care so the
> validation is just suppressed for them. Otherwise they might get
> surprised after upgrading. This is what I learnt from the past
> hardening effort like this and the IRQ hardening.
> 
> For the rest like confidential vendors, they need to test and make
> sure such kernel command line were provisioned to the VM.
> 
> Thanks

Each new boolean option doubles QE effort.  One of the reasons
confidential seems to be successful where previous DRM schemes mostly
failed is because it allows running Linux without many changes.

> >
> > > > core can provide this service for the gazillion non
> > > > > performance critical drivers that just want to keep things simple,
> > > > > but the 4-5 critical ones can do their own validation if they want to.
> > > > >
> > > > > > Or such stuff could be added
> > > > > > on top since the validation is by default anyway.
> > > > > >
> > > > > > Thanks
> > > > >
> > > > >
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > > > I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before
> > > > > > > > > we do more hardening. If it's irrevocably broken let's rip it out?
> > > > > > > >
> > > > > > > > So the plan is
> > > > > > > >
> > > > > > > > 1) finish used ring validation (this had been proposed, merged and
> > > > > > > > reverted before notification hardening)
> > > > > > > > 2) do notification hardening on top.
> > > > > > > >
> > > > > > > > So let's leave it as is and I will do a rework after we finalize the
> > > > > > > > used ring validation.
> > > > > > > >
> > > > > > > > Thanks
> > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > ---
> > > > > > > > > > Changes since V4:
> > > > > > > > > > - drop the flat for driver to suppress the check
> > > > > > > > > > - validation is disabled by default
> > > > > > > > > > - don't do validation for legacy device
> > > > > > > > > > - rebase and support virtqueue resize
> > > > > > > > > > ---
> > > > > > > > > >  drivers/virtio/virtio_ring.c | 75 ++++++++++++++++++++++++++++++++++++
> > > > > > > > > >  1 file changed, 75 insertions(+)
> > > > > > > > > >
> > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > index 143f380baa1c..5b151605aaf8 100644
> > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > @@ -15,6 +15,9 @@
> > > > > > > > > >  #include <linux/spinlock.h>
> > > > > > > > > >  #include <xen/xen.h>
> > > > > > > > > >
> > > > > > > > > > +static bool force_used_validation = false;
> > > > > > > > > > +module_param(force_used_validation, bool, 0444);
> > > > > > > > > > +
> > > > > > > > > >  #ifdef DEBUG
> > > > > > > > > >  /* For development, we want to crash whenever the ring is screwed. */
> > > > > > > > > >  #define BAD_RING(_vq, fmt, args...)                          \
> > > > > > > > > > @@ -105,6 +108,9 @@ struct vring_virtqueue_split {
> > > > > > > > > >       struct vring_desc_state_split *desc_state;
> > > > > > > > > >       struct vring_desc_extra *desc_extra;
> > > > > > > > > >
> > > > > > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > > > > > +     u32 *buflen;
> > > > > > > > > > +
> > > > > > > > > >       /* DMA address and size information */
> > > > > > > > > >       dma_addr_t queue_dma_addr;
> > > > > > > > > >       size_t queue_size_in_bytes;
> > > > > > > > > > @@ -145,6 +151,9 @@ struct vring_virtqueue_packed {
> > > > > > > > > >       struct vring_desc_state_packed *desc_state;
> > > > > > > > > >       struct vring_desc_extra *desc_extra;
> > > > > > > > > >
> > > > > > > > > > +     /* Maximum in buffer length, NULL means no used validation */
> > > > > > > > > > +     u32 *buflen;
> > > > > > > > > > +
> > > > > > > > > >       /* DMA address and size information */
> > > > > > > > > >       dma_addr_t ring_dma_addr;
> > > > > > > > > >       dma_addr_t driver_event_dma_addr;
> > > > > > > > > > @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > > > > >       unsigned int i, n, avail, descs_used, prev, err_idx;
> > > > > > > > > >       int head;
> > > > > > > > > >       bool indirect;
> > > > > > > > > > +     u32 buflen = 0;
> > > > > > > > > >
> > > > > > > > > >       START_USE(vq);
> > > > > > > > > >
> > > > > > > > > > @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > > > > >                                                    VRING_DESC_F_NEXT |
> > > > > > > > > >                                                    VRING_DESC_F_WRITE,
> > > > > > > > > >                                                    indirect);
> > > > > > > > > > +                     buflen += sg->length;
> > > > > > > > > >               }
> > > > > > > > > >       }
> > > > > > > > > >       /* Last one doesn't continue. */
> > > > > > > > > > @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > > > > > >       else
> > > > > > > > > >               vq->split.desc_state[head].indir_desc = ctx;
> > > > > > > > > >
> > > > > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > > > > +     if (vq->split.buflen)
> > > > > > > > > > +             vq->split.buflen[head] = buflen;
> > > > > > > > > > +
> > > > > > > > > >       /* Put entry in available array (but don't update avail->idx until they
> > > > > > > > > >        * do sync). */
> > > > > > > > > >       avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
> > > > > > > > > > @@ -861,6 +876,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
> > > > > > > > > >               BAD_RING(vq, "id %u is not a head!\n", i);
> > > > > > > > > >               return NULL;
> > > > > > > > > >       }
> > > > > > > > > > +     if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
> > > > > > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > > > > > +                     *len, vq->split.buflen[i]);
> > > > > > > > > > +             return NULL;
> > > > > > > > > > +     }
> > > > > > > > > >
> > > > > > > > > >       /* detach_buf_split clears data, so grab it now. */
> > > > > > > > > >       ret = vq->split.desc_state[i].data;
> > > > > > > > > > @@ -1085,10 +1105,25 @@ static void vring_free_split(struct vring_virtqueue_split *vring_split,
> > > > > > > > > >                        vring_split->queue_dma_addr,
> > > > > > > > > >                        dma_dev);
> > > > > > > > > >
> > > > > > > > > > +     kfree(vring_split->buflen);
> > > > > > > > > >       kfree(vring_split->desc_state);
> > > > > > > > > >       kfree(vring_split->desc_extra);
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > > +static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > > > > > > > > +{
> > > > > > > > > > +     /*
> > > > > > > > > > +      * Several legacy devices are known to produce buggy used
> > > > > > > > > > +      * length. In order to let driver work, we won't validate used
> > > > > > > > > > +      * buffer length in this case.
> > > > > > > > > > +      */
> > > > > > > > > > +     if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > > > > > > > > +             return false;
> > > > > > > > > > +     if (force_used_validation)
> > > > > > > > > > +             return true;
> > > > > > > > > > +     return false;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > >  static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > > > > > >                                  struct virtio_device *vdev,
> > > > > > > > > >                                  u32 num,
> > > > > > > > > > @@ -1137,7 +1172,19 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
> > > > > > > > > >       vring_split->vring_align = vring_align;
> > > > > > > > > >       vring_split->may_reduce_num = may_reduce_num;
> > > > > > > > > >
> > > > > > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > > > > > +             vring_split->buflen =
> > > > > > > > > > +                     kmalloc_array(num, sizeof(*vring_split->buflen),
> > > > > > > > > > +                                   GFP_KERNEL);
> > > > > > > > > > +             if (!vring_split->buflen)
> > > > > > > > > > +                     goto err_buflen;
> > > > > > > > > > +     }
> > > > > > > > > > +
> > > > > > > > > >       return 0;
> > > > > > > > > > +
> > > > > > > > > > +err_buflen:
> > > > > > > > > > +     vring_free_split(vring_split, vdev, dma_dev);
> > > > > > > > > > +     return -ENOMEM;
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > >  static struct virtqueue *vring_create_virtqueue_split(
> > > > > > > > > > @@ -1297,6 +1344,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > > > > >       unsigned int i, n, err_idx;
> > > > > > > > > >       u16 head, id;
> > > > > > > > > >       dma_addr_t addr;
> > > > > > > > > > +     u32 buflen = 0;
> > > > > > > > > >
> > > > > > > > > >       head = vq->packed.next_avail_idx;
> > > > > > > > > >       desc = alloc_indirect_packed(total_sg, gfp);
> > > > > > > > > > @@ -1325,6 +1373,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > > > > >                       desc[i].addr = cpu_to_le64(addr);
> > > > > > > > > >                       desc[i].len = cpu_to_le32(sg->length);
> > > > > > > > > >                       i++;
> > > > > > > > > > +                     if (n >= out_sgs)
> > > > > > > > > > +                             buflen += sg->length;
> > > > > > > > > >               }
> > > > > > > > > >       }
> > > > > > > > > >
> > > > > > > > > > @@ -1379,6 +1429,10 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > > > > > > > > >       vq->packed.desc_state[id].last = id;
> > > > > > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > > > > > >
> > > > > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > > > > +     if (vq->packed.buflen)
> > > > > > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > > > > > +
> > > > > > > > > >       vq->num_added += 1;
> > > > > > > > > >
> > > > > > > > > >       pr_debug("Added buffer head %i to %p\n", head, vq);
> > > > > > > > > > @@ -1416,6 +1470,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > > > >       __le16 head_flags, flags;
> > > > > > > > > >       u16 head, id, prev, curr, avail_used_flags;
> > > > > > > > > >       int err;
> > > > > > > > > > +     u32 buflen = 0;
> > > > > > > > > >
> > > > > > > > > >       START_USE(vq);
> > > > > > > > > >
> > > > > > > > > > @@ -1498,6 +1553,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > > > >                                       1 << VRING_PACKED_DESC_F_AVAIL |
> > > > > > > > > >                                       1 << VRING_PACKED_DESC_F_USED;
> > > > > > > > > >                       }
> > > > > > > > > > +                     if (n >= out_sgs)
> > > > > > > > > > +                             buflen += sg->length;
> > > > > > > > > >               }
> > > > > > > > > >       }
> > > > > > > > > >
> > > > > > > > > > @@ -1518,6 +1575,10 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > > > >       vq->packed.desc_state[id].last = prev;
> > > > > > > > > >       vq->packed.desc_state[id].premapped = premapped;
> > > > > > > > > >
> > > > > > > > > > +     /* Store in buffer length if necessary */
> > > > > > > > > > +     if (vq->packed.buflen)
> > > > > > > > > > +             vq->packed.buflen[id] = buflen;
> > > > > > > > > > +
> > > > > > > > > >       /*
> > > > > > > > > >        * A driver MUST NOT make the first descriptor in the list
> > > > > > > > > >        * available before all subsequent descriptors comprising
> > > > > > > > > > @@ -1718,6 +1779,11 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > > > > > > > > >               BAD_RING(vq, "id %u is not a head!\n", id);
> > > > > > > > > >               return NULL;
> > > > > > > > > >       }
> > > > > > > > > > +     if (vq->packed.buflen && unlikely(*len > vq->packed.buflen[id])) {
> > > > > > > > > > +             BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
> > > > > > > > > > +                     *len, vq->packed.buflen[id]);
> > > > > > > > > > +             return NULL;
> > > > > > > > > > +     }
> > > > > > > > > >
> > > > > > > > > >       /* detach_buf_packed clears data, so grab it now. */
> > > > > > > > > >       ret = vq->packed.desc_state[id].data;
> > > > > > > > > > @@ -1937,6 +2003,7 @@ static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > > > > > >                                vring_packed->device_event_dma_addr,
> > > > > > > > > >                                dma_dev);
> > > > > > > > > >
> > > > > > > > > > +     kfree(vring_packed->buflen);
> > > > > > > > > >       kfree(vring_packed->desc_state);
> > > > > > > > > >       kfree(vring_packed->desc_extra);
> > > > > > > > > >  }
> > > > > > > > > > @@ -1988,6 +2055,14 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
> > > > > > > > > >
> > > > > > > > > >       vring_packed->vring.num = num;
> > > > > > > > > >
> > > > > > > > > > +     if (vring_needs_used_validation(vdev)) {
> > > > > > > > > > +             vring_packed->buflen =
> > > > > > > > > > +                     kmalloc_array(num, sizeof(*vring_packed->buflen),
> > > > > > > > > > +                                   GFP_KERNEL);
> > > > > > > > > > +             if (!vring_packed->buflen)
> > > > > > > > > > +                     goto err;
> > > > > > > > > > +     }
> > > > > > > > > > +
> > > > > > > > > >       return 0;
> > > > > > > > > >
> > > > > > > > > >  err:
> > > > > > > > > > --
> > > > > > > > > > 2.25.1
> > > > > > > > >
> > > > > > >
> > > > >
> >
  

Patch

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 143f380baa1c..5b151605aaf8 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -15,6 +15,9 @@ 
 #include <linux/spinlock.h>
 #include <xen/xen.h>
 
+static bool force_used_validation = false;
+module_param(force_used_validation, bool, 0444);
+
 #ifdef DEBUG
 /* For development, we want to crash whenever the ring is screwed. */
 #define BAD_RING(_vq, fmt, args...)				\
@@ -105,6 +108,9 @@  struct vring_virtqueue_split {
 	struct vring_desc_state_split *desc_state;
 	struct vring_desc_extra *desc_extra;
 
+	/* Maximum in buffer length, NULL means no used validation */
+	u32 *buflen;
+
 	/* DMA address and size information */
 	dma_addr_t queue_dma_addr;
 	size_t queue_size_in_bytes;
@@ -145,6 +151,9 @@  struct vring_virtqueue_packed {
 	struct vring_desc_state_packed *desc_state;
 	struct vring_desc_extra *desc_extra;
 
+	/* Maximum in buffer length, NULL means no used validation */
+	u32 *buflen;
+
 	/* DMA address and size information */
 	dma_addr_t ring_dma_addr;
 	dma_addr_t driver_event_dma_addr;
@@ -552,6 +561,7 @@  static inline int virtqueue_add_split(struct virtqueue *_vq,
 	unsigned int i, n, avail, descs_used, prev, err_idx;
 	int head;
 	bool indirect;
+	u32 buflen = 0;
 
 	START_USE(vq);
 
@@ -635,6 +645,7 @@  static inline int virtqueue_add_split(struct virtqueue *_vq,
 						     VRING_DESC_F_NEXT |
 						     VRING_DESC_F_WRITE,
 						     indirect);
+			buflen += sg->length;
 		}
 	}
 	/* Last one doesn't continue. */
@@ -675,6 +686,10 @@  static inline int virtqueue_add_split(struct virtqueue *_vq,
 	else
 		vq->split.desc_state[head].indir_desc = ctx;
 
+	/* Store in buffer length if necessary */
+	if (vq->split.buflen)
+		vq->split.buflen[head] = buflen;
+
 	/* Put entry in available array (but don't update avail->idx until they
 	 * do sync). */
 	avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
@@ -861,6 +876,11 @@  static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
 		BAD_RING(vq, "id %u is not a head!\n", i);
 		return NULL;
 	}
+	if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
+		BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
+			*len, vq->split.buflen[i]);
+		return NULL;
+	}
 
 	/* detach_buf_split clears data, so grab it now. */
 	ret = vq->split.desc_state[i].data;
@@ -1085,10 +1105,25 @@  static void vring_free_split(struct vring_virtqueue_split *vring_split,
 			 vring_split->queue_dma_addr,
 			 dma_dev);
 
+	kfree(vring_split->buflen);
 	kfree(vring_split->desc_state);
 	kfree(vring_split->desc_extra);
 }
 
+static bool vring_needs_used_validation(const struct virtio_device *vdev)
+{
+	/*
+	 * Several legacy devices are known to produce buggy used
+	 * length. In order to let driver work, we won't validate used
+	 * buffer length in this case.
+	 */
+	if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
+		return false;
+	if (force_used_validation)
+		return true;
+	return false;
+}
+
 static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
 				   struct virtio_device *vdev,
 				   u32 num,
@@ -1137,7 +1172,19 @@  static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
 	vring_split->vring_align = vring_align;
 	vring_split->may_reduce_num = may_reduce_num;
 
+	if (vring_needs_used_validation(vdev)) {
+		vring_split->buflen =
+			kmalloc_array(num, sizeof(*vring_split->buflen),
+				      GFP_KERNEL);
+		if (!vring_split->buflen)
+			goto err_buflen;
+	}
+
 	return 0;
+
+err_buflen:
+	vring_free_split(vring_split, vdev, dma_dev);
+	return -ENOMEM;
 }
 
 static struct virtqueue *vring_create_virtqueue_split(
@@ -1297,6 +1344,7 @@  static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 	unsigned int i, n, err_idx;
 	u16 head, id;
 	dma_addr_t addr;
+	u32 buflen = 0;
 
 	head = vq->packed.next_avail_idx;
 	desc = alloc_indirect_packed(total_sg, gfp);
@@ -1325,6 +1373,8 @@  static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 			desc[i].addr = cpu_to_le64(addr);
 			desc[i].len = cpu_to_le32(sg->length);
 			i++;
+			if (n >= out_sgs)
+				buflen += sg->length;
 		}
 	}
 
@@ -1379,6 +1429,10 @@  static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 	vq->packed.desc_state[id].last = id;
 	vq->packed.desc_state[id].premapped = premapped;
 
+	/* Store in buffer length if necessary */
+	if (vq->packed.buflen)
+		vq->packed.buflen[id] = buflen;
+
 	vq->num_added += 1;
 
 	pr_debug("Added buffer head %i to %p\n", head, vq);
@@ -1416,6 +1470,7 @@  static inline int virtqueue_add_packed(struct virtqueue *_vq,
 	__le16 head_flags, flags;
 	u16 head, id, prev, curr, avail_used_flags;
 	int err;
+	u32 buflen = 0;
 
 	START_USE(vq);
 
@@ -1498,6 +1553,8 @@  static inline int virtqueue_add_packed(struct virtqueue *_vq,
 					1 << VRING_PACKED_DESC_F_AVAIL |
 					1 << VRING_PACKED_DESC_F_USED;
 			}
+			if (n >= out_sgs)
+				buflen += sg->length;
 		}
 	}
 
@@ -1518,6 +1575,10 @@  static inline int virtqueue_add_packed(struct virtqueue *_vq,
 	vq->packed.desc_state[id].last = prev;
 	vq->packed.desc_state[id].premapped = premapped;
 
+	/* Store in buffer length if necessary */
+	if (vq->packed.buflen)
+		vq->packed.buflen[id] = buflen;
+
 	/*
 	 * A driver MUST NOT make the first descriptor in the list
 	 * available before all subsequent descriptors comprising
@@ -1718,6 +1779,11 @@  static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
 		BAD_RING(vq, "id %u is not a head!\n", id);
 		return NULL;
 	}
+	if (vq->packed.buflen && unlikely(*len > vq->packed.buflen[id])) {
+		BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
+			*len, vq->packed.buflen[id]);
+		return NULL;
+	}
 
 	/* detach_buf_packed clears data, so grab it now. */
 	ret = vq->packed.desc_state[id].data;
@@ -1937,6 +2003,7 @@  static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
 				 vring_packed->device_event_dma_addr,
 				 dma_dev);
 
+	kfree(vring_packed->buflen);
 	kfree(vring_packed->desc_state);
 	kfree(vring_packed->desc_extra);
 }
@@ -1988,6 +2055,14 @@  static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
 
 	vring_packed->vring.num = num;
 
+	if (vring_needs_used_validation(vdev)) {
+		vring_packed->buflen =
+			kmalloc_array(num, sizeof(*vring_packed->buflen),
+				      GFP_KERNEL);
+		if (!vring_packed->buflen)
+			goto err;
+	}
+
 	return 0;
 
 err: