[RFC,net-next,v5,11/14] vhost/vsock: implement datagram support

Message ID 20230413-b4-vsock-dgram-v5-11-581bd37fdb26@bytedance.com
State New
Headers
Series virtio/vsock: support datagrams |

Commit Message

Bobby Eshleman July 19, 2023, 12:50 a.m. UTC
  This commit implements datagram support for vhost/vsock by teaching
vhost to use the common virtio transport datagram functions.

If the virtio RX buffer is too small, then the transmission is
abandoned, the packet dropped, and EHOSTUNREACH is added to the socket's
error queue.

Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
---
 drivers/vhost/vsock.c    | 62 +++++++++++++++++++++++++++++++++++++++++++++---
 net/vmw_vsock/af_vsock.c |  5 +++-
 2 files changed, 63 insertions(+), 4 deletions(-)
  

Comments

Arseniy Krasnov July 22, 2023, 8:42 a.m. UTC | #1
On 19.07.2023 03:50, Bobby Eshleman wrote:
> This commit implements datagram support for vhost/vsock by teaching
> vhost to use the common virtio transport datagram functions.
> 
> If the virtio RX buffer is too small, then the transmission is
> abandoned, the packet dropped, and EHOSTUNREACH is added to the socket's
> error queue.
> 
> Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
> ---
>  drivers/vhost/vsock.c    | 62 +++++++++++++++++++++++++++++++++++++++++++++---
>  net/vmw_vsock/af_vsock.c |  5 +++-
>  2 files changed, 63 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> index d5d6a3c3f273..da14260c6654 100644
> --- a/drivers/vhost/vsock.c
> +++ b/drivers/vhost/vsock.c
> @@ -8,6 +8,7 @@
>   */
>  #include <linux/miscdevice.h>
>  #include <linux/atomic.h>
> +#include <linux/errqueue.h>
>  #include <linux/module.h>
>  #include <linux/mutex.h>
>  #include <linux/vmalloc.h>
> @@ -32,7 +33,8 @@
>  enum {
>  	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
>  			       (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
> -			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
> +			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET) |
> +			       (1ULL << VIRTIO_VSOCK_F_DGRAM)
>  };
>  
>  enum {
> @@ -56,6 +58,7 @@ struct vhost_vsock {
>  	atomic_t queued_replies;
>  
>  	u32 guest_cid;
> +	bool dgram_allow;
>  	bool seqpacket_allow;
>  };
>  
> @@ -86,6 +89,32 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
>  	return NULL;
>  }
>  
> +/* Claims ownership of the skb, do not free the skb after calling! */
> +static void
> +vhost_transport_error(struct sk_buff *skb, int err)
> +{
> +	struct sock_exterr_skb *serr;
> +	struct sock *sk = skb->sk;
> +	struct sk_buff *clone;
> +
> +	serr = SKB_EXT_ERR(skb);
> +	memset(serr, 0, sizeof(*serr));
> +	serr->ee.ee_errno = err;
> +	serr->ee.ee_origin = SO_EE_ORIGIN_NONE;
> +
> +	clone = skb_clone(skb, GFP_KERNEL);

May for skb which is error carrier we can use 'sock_omalloc()', not 'skb_clone()' ? TCP uses skb
allocated by this function as carriers of error structure. I guess 'skb_clone()' also clones data of origin,
but i think that there is no need in data as we insert it to error queue of the socket.

What do You think?

> +	if (!clone)
> +		return;

What will happen here 'if (!clone)' ? skb will leak as it was removed from queue?

> +
> +	if (sock_queue_err_skb(sk, clone))
> +		kfree_skb(clone);
> +
> +	sk->sk_err = err;
> +	sk_error_report(sk);
> +
> +	kfree_skb(skb);
> +}
> +
>  static void
>  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>  			    struct vhost_virtqueue *vq)
> @@ -160,9 +189,15 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>  		hdr = virtio_vsock_hdr(skb);
>  
>  		/* If the packet is greater than the space available in the
> -		 * buffer, we split it using multiple buffers.
> +		 * buffer, we split it using multiple buffers for connectible
> +		 * sockets and drop the packet for datagram sockets.
>  		 */
>  		if (payload_len > iov_len - sizeof(*hdr)) {
> +			if (le16_to_cpu(hdr->type) == VIRTIO_VSOCK_TYPE_DGRAM) {
> +				vhost_transport_error(skb, EHOSTUNREACH);
> +				continue;
> +			}
> +
>  			payload_len = iov_len - sizeof(*hdr);
>  
>  			/* As we are copying pieces of large packet's buffer to
> @@ -394,6 +429,7 @@ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
>  	return val < vq->num;
>  }
>  
> +static bool vhost_transport_dgram_allow(u32 cid, u32 port);
>  static bool vhost_transport_seqpacket_allow(u32 remote_cid);
>  
>  static struct virtio_transport vhost_transport = {
> @@ -410,7 +446,8 @@ static struct virtio_transport vhost_transport = {
>  		.cancel_pkt               = vhost_transport_cancel_pkt,
>  
>  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
> -		.dgram_allow              = virtio_transport_dgram_allow,
> +		.dgram_allow              = vhost_transport_dgram_allow,
> +		.dgram_addr_init          = virtio_transport_dgram_addr_init,
>  
>  		.stream_enqueue           = virtio_transport_stream_enqueue,
>  		.stream_dequeue           = virtio_transport_stream_dequeue,
> @@ -443,6 +480,22 @@ static struct virtio_transport vhost_transport = {
>  	.send_pkt = vhost_transport_send_pkt,
>  };
>  
> +static bool vhost_transport_dgram_allow(u32 cid, u32 port)
> +{
> +	struct vhost_vsock *vsock;
> +	bool dgram_allow = false;
> +
> +	rcu_read_lock();
> +	vsock = vhost_vsock_get(cid);
> +
> +	if (vsock)
> +		dgram_allow = vsock->dgram_allow;
> +
> +	rcu_read_unlock();
> +
> +	return dgram_allow;
> +}
> +
>  static bool vhost_transport_seqpacket_allow(u32 remote_cid)
>  {
>  	struct vhost_vsock *vsock;
> @@ -799,6 +852,9 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
>  	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
>  		vsock->seqpacket_allow = true;
>  
> +	if (features & (1ULL << VIRTIO_VSOCK_F_DGRAM))
> +		vsock->dgram_allow = true;
> +
>  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
>  		vq = &vsock->vqs[i];
>  		mutex_lock(&vq->mutex);
> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> index e73f3b2c52f1..449ed63ac2b0 100644
> --- a/net/vmw_vsock/af_vsock.c
> +++ b/net/vmw_vsock/af_vsock.c
> @@ -1427,9 +1427,12 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>  		return prot->recvmsg(sk, msg, len, flags, NULL);
>  #endif
>  
> -	if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
> +	if (unlikely(flags & MSG_OOB))
>  		return -EOPNOTSUPP;
>  
> +	if (unlikely(flags & MSG_ERRQUEUE))
> +		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 0);
> +

Sorry, but I get build error here, because SOL_VSOCK in undefined. I think it should be added to
include/linux/socket.h and to uapi files also for future use in userspace.

Also Stefano Garzarella <sgarzare@redhat.com> suggested to add define something like VSOCK_RECVERR,
in the same way as IP_RECVERR, and use it as last parameter of 'sock_recv_errqueue()'.

>  	transport = vsk->transport;
>  
>  	/* Retrieve the head sk_buff from the socket's receive queue. */
> 

Thanks, Arseniy
  
Bobby Eshleman July 26, 2023, 5:55 p.m. UTC | #2
On Sat, Jul 22, 2023 at 11:42:38AM +0300, Arseniy Krasnov wrote:
> 
> 
> On 19.07.2023 03:50, Bobby Eshleman wrote:
> > This commit implements datagram support for vhost/vsock by teaching
> > vhost to use the common virtio transport datagram functions.
> > 
> > If the virtio RX buffer is too small, then the transmission is
> > abandoned, the packet dropped, and EHOSTUNREACH is added to the socket's
> > error queue.
> > 
> > Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
> > ---
> >  drivers/vhost/vsock.c    | 62 +++++++++++++++++++++++++++++++++++++++++++++---
> >  net/vmw_vsock/af_vsock.c |  5 +++-
> >  2 files changed, 63 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> > index d5d6a3c3f273..da14260c6654 100644
> > --- a/drivers/vhost/vsock.c
> > +++ b/drivers/vhost/vsock.c
> > @@ -8,6 +8,7 @@
> >   */
> >  #include <linux/miscdevice.h>
> >  #include <linux/atomic.h>
> > +#include <linux/errqueue.h>
> >  #include <linux/module.h>
> >  #include <linux/mutex.h>
> >  #include <linux/vmalloc.h>
> > @@ -32,7 +33,8 @@
> >  enum {
> >  	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
> >  			       (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
> > -			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
> > +			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET) |
> > +			       (1ULL << VIRTIO_VSOCK_F_DGRAM)
> >  };
> >  
> >  enum {
> > @@ -56,6 +58,7 @@ struct vhost_vsock {
> >  	atomic_t queued_replies;
> >  
> >  	u32 guest_cid;
> > +	bool dgram_allow;
> >  	bool seqpacket_allow;
> >  };
> >  
> > @@ -86,6 +89,32 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
> >  	return NULL;
> >  }
> >  
> > +/* Claims ownership of the skb, do not free the skb after calling! */
> > +static void
> > +vhost_transport_error(struct sk_buff *skb, int err)
> > +{
> > +	struct sock_exterr_skb *serr;
> > +	struct sock *sk = skb->sk;
> > +	struct sk_buff *clone;
> > +
> > +	serr = SKB_EXT_ERR(skb);
> > +	memset(serr, 0, sizeof(*serr));
> > +	serr->ee.ee_errno = err;
> > +	serr->ee.ee_origin = SO_EE_ORIGIN_NONE;
> > +
> > +	clone = skb_clone(skb, GFP_KERNEL);
> 
> May for skb which is error carrier we can use 'sock_omalloc()', not 'skb_clone()' ? TCP uses skb
> allocated by this function as carriers of error structure. I guess 'skb_clone()' also clones data of origin,
> but i think that there is no need in data as we insert it to error queue of the socket.
> 
> What do You think?

IIUC skb_clone() is often used in this scenario so that the user can
retrieve the error-causing packet from the error queue.  Is there some
reason we shouldn't do this?

I'm seeing that the serr bits need to occur on the clone here, not the
original. I didn't realize the SKB_EXT_ERR() is a skb->cb cast. I'm not
actually sure how this passes the test case since ->cb isn't cloned.

> 
> > +	if (!clone)
> > +		return;
> 
> What will happen here 'if (!clone)' ? skb will leak as it was removed from queue?
> 

Ah yes, true.

> > +
> > +	if (sock_queue_err_skb(sk, clone))
> > +		kfree_skb(clone);
> > +
> > +	sk->sk_err = err;
> > +	sk_error_report(sk);
> > +
> > +	kfree_skb(skb);
> > +}
> > +
> >  static void
> >  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> >  			    struct vhost_virtqueue *vq)
> > @@ -160,9 +189,15 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> >  		hdr = virtio_vsock_hdr(skb);
> >  
> >  		/* If the packet is greater than the space available in the
> > -		 * buffer, we split it using multiple buffers.
> > +		 * buffer, we split it using multiple buffers for connectible
> > +		 * sockets and drop the packet for datagram sockets.
> >  		 */
> >  		if (payload_len > iov_len - sizeof(*hdr)) {
> > +			if (le16_to_cpu(hdr->type) == VIRTIO_VSOCK_TYPE_DGRAM) {
> > +				vhost_transport_error(skb, EHOSTUNREACH);
> > +				continue;
> > +			}
> > +
> >  			payload_len = iov_len - sizeof(*hdr);
> >  
> >  			/* As we are copying pieces of large packet's buffer to
> > @@ -394,6 +429,7 @@ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
> >  	return val < vq->num;
> >  }
> >  
> > +static bool vhost_transport_dgram_allow(u32 cid, u32 port);
> >  static bool vhost_transport_seqpacket_allow(u32 remote_cid);
> >  
> >  static struct virtio_transport vhost_transport = {
> > @@ -410,7 +446,8 @@ static struct virtio_transport vhost_transport = {
> >  		.cancel_pkt               = vhost_transport_cancel_pkt,
> >  
> >  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
> > -		.dgram_allow              = virtio_transport_dgram_allow,
> > +		.dgram_allow              = vhost_transport_dgram_allow,
> > +		.dgram_addr_init          = virtio_transport_dgram_addr_init,
> >  
> >  		.stream_enqueue           = virtio_transport_stream_enqueue,
> >  		.stream_dequeue           = virtio_transport_stream_dequeue,
> > @@ -443,6 +480,22 @@ static struct virtio_transport vhost_transport = {
> >  	.send_pkt = vhost_transport_send_pkt,
> >  };
> >  
> > +static bool vhost_transport_dgram_allow(u32 cid, u32 port)
> > +{
> > +	struct vhost_vsock *vsock;
> > +	bool dgram_allow = false;
> > +
> > +	rcu_read_lock();
> > +	vsock = vhost_vsock_get(cid);
> > +
> > +	if (vsock)
> > +		dgram_allow = vsock->dgram_allow;
> > +
> > +	rcu_read_unlock();
> > +
> > +	return dgram_allow;
> > +}
> > +
> >  static bool vhost_transport_seqpacket_allow(u32 remote_cid)
> >  {
> >  	struct vhost_vsock *vsock;
> > @@ -799,6 +852,9 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
> >  	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
> >  		vsock->seqpacket_allow = true;
> >  
> > +	if (features & (1ULL << VIRTIO_VSOCK_F_DGRAM))
> > +		vsock->dgram_allow = true;
> > +
> >  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
> >  		vq = &vsock->vqs[i];
> >  		mutex_lock(&vq->mutex);
> > diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> > index e73f3b2c52f1..449ed63ac2b0 100644
> > --- a/net/vmw_vsock/af_vsock.c
> > +++ b/net/vmw_vsock/af_vsock.c
> > @@ -1427,9 +1427,12 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
> >  		return prot->recvmsg(sk, msg, len, flags, NULL);
> >  #endif
> >  
> > -	if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
> > +	if (unlikely(flags & MSG_OOB))
> >  		return -EOPNOTSUPP;
> >  
> > +	if (unlikely(flags & MSG_ERRQUEUE))
> > +		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 0);
> > +
> 
> Sorry, but I get build error here, because SOL_VSOCK in undefined. I think it should be added to
> include/linux/socket.h and to uapi files also for future use in userspace.
> 

Strange, I built each patch individually without issue. My base is
netdev/main with your SOL_VSOCK patch applied. I will look today and see
if I'm missing something.

> Also Stefano Garzarella <sgarzare@redhat.com> suggested to add define something like VSOCK_RECVERR,
> in the same way as IP_RECVERR, and use it as last parameter of 'sock_recv_errqueue()'.
> 

Got it, thanks.

> >  	transport = vsk->transport;
> >  
> >  	/* Retrieve the head sk_buff from the socket's receive queue. */
> > 
> 
> Thanks, Arseniy

Thanks,
Bobby
  
Michael S. Tsirkin July 26, 2023, 6:40 p.m. UTC | #3
On Wed, Jul 19, 2023 at 12:50:15AM +0000, Bobby Eshleman wrote:
> This commit implements datagram support for vhost/vsock by teaching
> vhost to use the common virtio transport datagram functions.
> 
> If the virtio RX buffer is too small, then the transmission is
> abandoned, the packet dropped, and EHOSTUNREACH is added to the socket's
> error queue.
> 
> Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>

EHOSTUNREACH?


> ---
>  drivers/vhost/vsock.c    | 62 +++++++++++++++++++++++++++++++++++++++++++++---
>  net/vmw_vsock/af_vsock.c |  5 +++-
>  2 files changed, 63 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> index d5d6a3c3f273..da14260c6654 100644
> --- a/drivers/vhost/vsock.c
> +++ b/drivers/vhost/vsock.c
> @@ -8,6 +8,7 @@
>   */
>  #include <linux/miscdevice.h>
>  #include <linux/atomic.h>
> +#include <linux/errqueue.h>
>  #include <linux/module.h>
>  #include <linux/mutex.h>
>  #include <linux/vmalloc.h>
> @@ -32,7 +33,8 @@
>  enum {
>  	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
>  			       (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
> -			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
> +			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET) |
> +			       (1ULL << VIRTIO_VSOCK_F_DGRAM)
>  };
>  
>  enum {
> @@ -56,6 +58,7 @@ struct vhost_vsock {
>  	atomic_t queued_replies;
>  
>  	u32 guest_cid;
> +	bool dgram_allow;
>  	bool seqpacket_allow;
>  };
>  
> @@ -86,6 +89,32 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
>  	return NULL;
>  }
>  
> +/* Claims ownership of the skb, do not free the skb after calling! */
> +static void
> +vhost_transport_error(struct sk_buff *skb, int err)
> +{
> +	struct sock_exterr_skb *serr;
> +	struct sock *sk = skb->sk;
> +	struct sk_buff *clone;
> +
> +	serr = SKB_EXT_ERR(skb);
> +	memset(serr, 0, sizeof(*serr));
> +	serr->ee.ee_errno = err;
> +	serr->ee.ee_origin = SO_EE_ORIGIN_NONE;
> +
> +	clone = skb_clone(skb, GFP_KERNEL);
> +	if (!clone)
> +		return;
> +
> +	if (sock_queue_err_skb(sk, clone))
> +		kfree_skb(clone);
> +
> +	sk->sk_err = err;
> +	sk_error_report(sk);
> +
> +	kfree_skb(skb);
> +}
> +
>  static void
>  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>  			    struct vhost_virtqueue *vq)
> @@ -160,9 +189,15 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>  		hdr = virtio_vsock_hdr(skb);
>  
>  		/* If the packet is greater than the space available in the
> -		 * buffer, we split it using multiple buffers.
> +		 * buffer, we split it using multiple buffers for connectible
> +		 * sockets and drop the packet for datagram sockets.
>  		 */

won't this break things like recently proposed zerocopy?
I think splitup has to be supported for all types.


>  		if (payload_len > iov_len - sizeof(*hdr)) {
> +			if (le16_to_cpu(hdr->type) == VIRTIO_VSOCK_TYPE_DGRAM) {
> +				vhost_transport_error(skb, EHOSTUNREACH);
> +				continue;
> +			}
> +
>  			payload_len = iov_len - sizeof(*hdr);
>  
>  			/* As we are copying pieces of large packet's buffer to
> @@ -394,6 +429,7 @@ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
>  	return val < vq->num;
>  }
>  
> +static bool vhost_transport_dgram_allow(u32 cid, u32 port);
>  static bool vhost_transport_seqpacket_allow(u32 remote_cid);
>  
>  static struct virtio_transport vhost_transport = {
> @@ -410,7 +446,8 @@ static struct virtio_transport vhost_transport = {
>  		.cancel_pkt               = vhost_transport_cancel_pkt,
>  
>  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
> -		.dgram_allow              = virtio_transport_dgram_allow,
> +		.dgram_allow              = vhost_transport_dgram_allow,
> +		.dgram_addr_init          = virtio_transport_dgram_addr_init,
>  
>  		.stream_enqueue           = virtio_transport_stream_enqueue,
>  		.stream_dequeue           = virtio_transport_stream_dequeue,
> @@ -443,6 +480,22 @@ static struct virtio_transport vhost_transport = {
>  	.send_pkt = vhost_transport_send_pkt,
>  };
>  
> +static bool vhost_transport_dgram_allow(u32 cid, u32 port)
> +{
> +	struct vhost_vsock *vsock;
> +	bool dgram_allow = false;
> +
> +	rcu_read_lock();
> +	vsock = vhost_vsock_get(cid);
> +
> +	if (vsock)
> +		dgram_allow = vsock->dgram_allow;
> +
> +	rcu_read_unlock();
> +
> +	return dgram_allow;
> +}
> +
>  static bool vhost_transport_seqpacket_allow(u32 remote_cid)
>  {
>  	struct vhost_vsock *vsock;
> @@ -799,6 +852,9 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
>  	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
>  		vsock->seqpacket_allow = true;
>  
> +	if (features & (1ULL << VIRTIO_VSOCK_F_DGRAM))
> +		vsock->dgram_allow = true;
> +
>  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
>  		vq = &vsock->vqs[i];
>  		mutex_lock(&vq->mutex);
> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> index e73f3b2c52f1..449ed63ac2b0 100644
> --- a/net/vmw_vsock/af_vsock.c
> +++ b/net/vmw_vsock/af_vsock.c
> @@ -1427,9 +1427,12 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>  		return prot->recvmsg(sk, msg, len, flags, NULL);
>  #endif
>  
> -	if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
> +	if (unlikely(flags & MSG_OOB))
>  		return -EOPNOTSUPP;
>  
> +	if (unlikely(flags & MSG_ERRQUEUE))
> +		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 0);
> +
>  	transport = vsk->transport;
>  
>  	/* Retrieve the head sk_buff from the socket's receive queue. */
> 
> -- 
> 2.30.2
  
Arseniy Krasnov July 27, 2023, 8 a.m. UTC | #4
On 26.07.2023 20:55, Bobby Eshleman wrote:
> On Sat, Jul 22, 2023 at 11:42:38AM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 19.07.2023 03:50, Bobby Eshleman wrote:
>>> This commit implements datagram support for vhost/vsock by teaching
>>> vhost to use the common virtio transport datagram functions.
>>>
>>> If the virtio RX buffer is too small, then the transmission is
>>> abandoned, the packet dropped, and EHOSTUNREACH is added to the socket's
>>> error queue.
>>>
>>> Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>>> ---
>>>  drivers/vhost/vsock.c    | 62 +++++++++++++++++++++++++++++++++++++++++++++---
>>>  net/vmw_vsock/af_vsock.c |  5 +++-
>>>  2 files changed, 63 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>> index d5d6a3c3f273..da14260c6654 100644
>>> --- a/drivers/vhost/vsock.c
>>> +++ b/drivers/vhost/vsock.c
>>> @@ -8,6 +8,7 @@
>>>   */
>>>  #include <linux/miscdevice.h>
>>>  #include <linux/atomic.h>
>>> +#include <linux/errqueue.h>
>>>  #include <linux/module.h>
>>>  #include <linux/mutex.h>
>>>  #include <linux/vmalloc.h>
>>> @@ -32,7 +33,8 @@
>>>  enum {
>>>  	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
>>>  			       (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
>>> -			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
>>> +			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET) |
>>> +			       (1ULL << VIRTIO_VSOCK_F_DGRAM)
>>>  };
>>>  
>>>  enum {
>>> @@ -56,6 +58,7 @@ struct vhost_vsock {
>>>  	atomic_t queued_replies;
>>>  
>>>  	u32 guest_cid;
>>> +	bool dgram_allow;
>>>  	bool seqpacket_allow;
>>>  };
>>>  
>>> @@ -86,6 +89,32 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
>>>  	return NULL;
>>>  }
>>>  
>>> +/* Claims ownership of the skb, do not free the skb after calling! */
>>> +static void
>>> +vhost_transport_error(struct sk_buff *skb, int err)
>>> +{
>>> +	struct sock_exterr_skb *serr;
>>> +	struct sock *sk = skb->sk;
>>> +	struct sk_buff *clone;
>>> +
>>> +	serr = SKB_EXT_ERR(skb);
>>> +	memset(serr, 0, sizeof(*serr));
>>> +	serr->ee.ee_errno = err;
>>> +	serr->ee.ee_origin = SO_EE_ORIGIN_NONE;
>>> +
>>> +	clone = skb_clone(skb, GFP_KERNEL);
>>
>> May for skb which is error carrier we can use 'sock_omalloc()', not 'skb_clone()' ? TCP uses skb
>> allocated by this function as carriers of error structure. I guess 'skb_clone()' also clones data of origin,
>> but i think that there is no need in data as we insert it to error queue of the socket.
>>
>> What do You think?
> 
> IIUC skb_clone() is often used in this scenario so that the user can
> retrieve the error-causing packet from the error queue.  Is there some
> reason we shouldn't do this?
> 
> I'm seeing that the serr bits need to occur on the clone here, not the
> original. I didn't realize the SKB_EXT_ERR() is a skb->cb cast. I'm not
> actually sure how this passes the test case since ->cb isn't cloned.

Ah yes, sorry, You are right, I just confused this case with zerocopy completion
handling - there we allocate "empty" skb which carries completion metadata in its
'cb' field.

Hm, but can't we just reinsert current skb (update it's 'cb' as 'sock_exterr_skb')
to error queue of the socket without cloning it ?

Thanks, Arseniy

> 
>>
>>> +	if (!clone)
>>> +		return;
>>
>> What will happen here 'if (!clone)' ? skb will leak as it was removed from queue?
>>
> 
> Ah yes, true.
> 
>>> +
>>> +	if (sock_queue_err_skb(sk, clone))
>>> +		kfree_skb(clone);
>>> +
>>> +	sk->sk_err = err;
>>> +	sk_error_report(sk);
>>> +
>>> +	kfree_skb(skb);
>>> +}
>>> +
>>>  static void
>>>  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>>>  			    struct vhost_virtqueue *vq)
>>> @@ -160,9 +189,15 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>>>  		hdr = virtio_vsock_hdr(skb);
>>>  
>>>  		/* If the packet is greater than the space available in the
>>> -		 * buffer, we split it using multiple buffers.
>>> +		 * buffer, we split it using multiple buffers for connectible
>>> +		 * sockets and drop the packet for datagram sockets.
>>>  		 */
>>>  		if (payload_len > iov_len - sizeof(*hdr)) {
>>> +			if (le16_to_cpu(hdr->type) == VIRTIO_VSOCK_TYPE_DGRAM) {
>>> +				vhost_transport_error(skb, EHOSTUNREACH);
>>> +				continue;
>>> +			}
>>> +
>>>  			payload_len = iov_len - sizeof(*hdr);
>>>  
>>>  			/* As we are copying pieces of large packet's buffer to
>>> @@ -394,6 +429,7 @@ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
>>>  	return val < vq->num;
>>>  }
>>>  
>>> +static bool vhost_transport_dgram_allow(u32 cid, u32 port);
>>>  static bool vhost_transport_seqpacket_allow(u32 remote_cid);
>>>  
>>>  static struct virtio_transport vhost_transport = {
>>> @@ -410,7 +446,8 @@ static struct virtio_transport vhost_transport = {
>>>  		.cancel_pkt               = vhost_transport_cancel_pkt,
>>>  
>>>  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
>>> -		.dgram_allow              = virtio_transport_dgram_allow,
>>> +		.dgram_allow              = vhost_transport_dgram_allow,
>>> +		.dgram_addr_init          = virtio_transport_dgram_addr_init,
>>>  
>>>  		.stream_enqueue           = virtio_transport_stream_enqueue,
>>>  		.stream_dequeue           = virtio_transport_stream_dequeue,
>>> @@ -443,6 +480,22 @@ static struct virtio_transport vhost_transport = {
>>>  	.send_pkt = vhost_transport_send_pkt,
>>>  };
>>>  
>>> +static bool vhost_transport_dgram_allow(u32 cid, u32 port)
>>> +{
>>> +	struct vhost_vsock *vsock;
>>> +	bool dgram_allow = false;
>>> +
>>> +	rcu_read_lock();
>>> +	vsock = vhost_vsock_get(cid);
>>> +
>>> +	if (vsock)
>>> +		dgram_allow = vsock->dgram_allow;
>>> +
>>> +	rcu_read_unlock();
>>> +
>>> +	return dgram_allow;
>>> +}
>>> +
>>>  static bool vhost_transport_seqpacket_allow(u32 remote_cid)
>>>  {
>>>  	struct vhost_vsock *vsock;
>>> @@ -799,6 +852,9 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
>>>  	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
>>>  		vsock->seqpacket_allow = true;
>>>  
>>> +	if (features & (1ULL << VIRTIO_VSOCK_F_DGRAM))
>>> +		vsock->dgram_allow = true;
>>> +
>>>  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
>>>  		vq = &vsock->vqs[i];
>>>  		mutex_lock(&vq->mutex);
>>> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>>> index e73f3b2c52f1..449ed63ac2b0 100644
>>> --- a/net/vmw_vsock/af_vsock.c
>>> +++ b/net/vmw_vsock/af_vsock.c
>>> @@ -1427,9 +1427,12 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>>>  		return prot->recvmsg(sk, msg, len, flags, NULL);
>>>  #endif
>>>  
>>> -	if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
>>> +	if (unlikely(flags & MSG_OOB))
>>>  		return -EOPNOTSUPP;
>>>  
>>> +	if (unlikely(flags & MSG_ERRQUEUE))
>>> +		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 0);
>>> +
>>
>> Sorry, but I get build error here, because SOL_VSOCK in undefined. I think it should be added to
>> include/linux/socket.h and to uapi files also for future use in userspace.
>>
> 
> Strange, I built each patch individually without issue. My base is
> netdev/main with your SOL_VSOCK patch applied. I will look today and see
> if I'm missing something.
> 
>> Also Stefano Garzarella <sgarzare@redhat.com> suggested to add define something like VSOCK_RECVERR,
>> in the same way as IP_RECVERR, and use it as last parameter of 'sock_recv_errqueue()'.
>>
> 
> Got it, thanks.
> 
>>>  	transport = vsk->transport;
>>>  
>>>  	/* Retrieve the head sk_buff from the socket's receive queue. */
>>>
>>
>> Thanks, Arseniy
> 
> Thanks,
> Bobby
  
Arseniy Krasnov July 27, 2023, 8:04 a.m. UTC | #5
On 26.07.2023 20:55, Bobby Eshleman wrote:
> On Sat, Jul 22, 2023 at 11:42:38AM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 19.07.2023 03:50, Bobby Eshleman wrote:
>>> This commit implements datagram support for vhost/vsock by teaching
>>> vhost to use the common virtio transport datagram functions.
>>>
>>> If the virtio RX buffer is too small, then the transmission is
>>> abandoned, the packet dropped, and EHOSTUNREACH is added to the socket's
>>> error queue.
>>>
>>> Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>>> ---
>>>  drivers/vhost/vsock.c    | 62 +++++++++++++++++++++++++++++++++++++++++++++---
>>>  net/vmw_vsock/af_vsock.c |  5 +++-
>>>  2 files changed, 63 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>> index d5d6a3c3f273..da14260c6654 100644
>>> --- a/drivers/vhost/vsock.c
>>> +++ b/drivers/vhost/vsock.c
>>> @@ -8,6 +8,7 @@
>>>   */
>>>  #include <linux/miscdevice.h>
>>>  #include <linux/atomic.h>
>>> +#include <linux/errqueue.h>
>>>  #include <linux/module.h>
>>>  #include <linux/mutex.h>
>>>  #include <linux/vmalloc.h>
>>> @@ -32,7 +33,8 @@
>>>  enum {
>>>  	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
>>>  			       (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
>>> -			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
>>> +			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET) |
>>> +			       (1ULL << VIRTIO_VSOCK_F_DGRAM)
>>>  };
>>>  
>>>  enum {
>>> @@ -56,6 +58,7 @@ struct vhost_vsock {
>>>  	atomic_t queued_replies;
>>>  
>>>  	u32 guest_cid;
>>> +	bool dgram_allow;
>>>  	bool seqpacket_allow;
>>>  };
>>>  
>>> @@ -86,6 +89,32 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
>>>  	return NULL;
>>>  }
>>>  
>>> +/* Claims ownership of the skb, do not free the skb after calling! */
>>> +static void
>>> +vhost_transport_error(struct sk_buff *skb, int err)
>>> +{
>>> +	struct sock_exterr_skb *serr;
>>> +	struct sock *sk = skb->sk;
>>> +	struct sk_buff *clone;
>>> +
>>> +	serr = SKB_EXT_ERR(skb);
>>> +	memset(serr, 0, sizeof(*serr));
>>> +	serr->ee.ee_errno = err;
>>> +	serr->ee.ee_origin = SO_EE_ORIGIN_NONE;
>>> +
>>> +	clone = skb_clone(skb, GFP_KERNEL);
>>
>> May for skb which is error carrier we can use 'sock_omalloc()', not 'skb_clone()' ? TCP uses skb
>> allocated by this function as carriers of error structure. I guess 'skb_clone()' also clones data of origin,
>> but i think that there is no need in data as we insert it to error queue of the socket.
>>
>> What do You think?
> 
> IIUC skb_clone() is often used in this scenario so that the user can
> retrieve the error-causing packet from the error queue.  Is there some
> reason we shouldn't do this?
> 
> I'm seeing that the serr bits need to occur on the clone here, not the
> original. I didn't realize the SKB_EXT_ERR() is a skb->cb cast. I'm not
> actually sure how this passes the test case since ->cb isn't cloned.
> 
>>
>>> +	if (!clone)
>>> +		return;
>>
>> What will happen here 'if (!clone)' ? skb will leak as it was removed from queue?
>>
> 
> Ah yes, true.
> 
>>> +
>>> +	if (sock_queue_err_skb(sk, clone))
>>> +		kfree_skb(clone);
>>> +
>>> +	sk->sk_err = err;
>>> +	sk_error_report(sk);
>>> +
>>> +	kfree_skb(skb);
>>> +}
>>> +
>>>  static void
>>>  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>>>  			    struct vhost_virtqueue *vq)
>>> @@ -160,9 +189,15 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>>>  		hdr = virtio_vsock_hdr(skb);
>>>  
>>>  		/* If the packet is greater than the space available in the
>>> -		 * buffer, we split it using multiple buffers.
>>> +		 * buffer, we split it using multiple buffers for connectible
>>> +		 * sockets and drop the packet for datagram sockets.
>>>  		 */
>>>  		if (payload_len > iov_len - sizeof(*hdr)) {
>>> +			if (le16_to_cpu(hdr->type) == VIRTIO_VSOCK_TYPE_DGRAM) {
>>> +				vhost_transport_error(skb, EHOSTUNREACH);
>>> +				continue;
>>> +			}
>>> +
>>>  			payload_len = iov_len - sizeof(*hdr);
>>>  
>>>  			/* As we are copying pieces of large packet's buffer to
>>> @@ -394,6 +429,7 @@ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
>>>  	return val < vq->num;
>>>  }
>>>  
>>> +static bool vhost_transport_dgram_allow(u32 cid, u32 port);
>>>  static bool vhost_transport_seqpacket_allow(u32 remote_cid);
>>>  
>>>  static struct virtio_transport vhost_transport = {
>>> @@ -410,7 +446,8 @@ static struct virtio_transport vhost_transport = {
>>>  		.cancel_pkt               = vhost_transport_cancel_pkt,
>>>  
>>>  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
>>> -		.dgram_allow              = virtio_transport_dgram_allow,
>>> +		.dgram_allow              = vhost_transport_dgram_allow,
>>> +		.dgram_addr_init          = virtio_transport_dgram_addr_init,
>>>  
>>>  		.stream_enqueue           = virtio_transport_stream_enqueue,
>>>  		.stream_dequeue           = virtio_transport_stream_dequeue,
>>> @@ -443,6 +480,22 @@ static struct virtio_transport vhost_transport = {
>>>  	.send_pkt = vhost_transport_send_pkt,
>>>  };
>>>  
>>> +static bool vhost_transport_dgram_allow(u32 cid, u32 port)
>>> +{
>>> +	struct vhost_vsock *vsock;
>>> +	bool dgram_allow = false;
>>> +
>>> +	rcu_read_lock();
>>> +	vsock = vhost_vsock_get(cid);
>>> +
>>> +	if (vsock)
>>> +		dgram_allow = vsock->dgram_allow;
>>> +
>>> +	rcu_read_unlock();
>>> +
>>> +	return dgram_allow;
>>> +}
>>> +
>>>  static bool vhost_transport_seqpacket_allow(u32 remote_cid)
>>>  {
>>>  	struct vhost_vsock *vsock;
>>> @@ -799,6 +852,9 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
>>>  	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
>>>  		vsock->seqpacket_allow = true;
>>>  
>>> +	if (features & (1ULL << VIRTIO_VSOCK_F_DGRAM))
>>> +		vsock->dgram_allow = true;
>>> +
>>>  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
>>>  		vq = &vsock->vqs[i];
>>>  		mutex_lock(&vq->mutex);
>>> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>>> index e73f3b2c52f1..449ed63ac2b0 100644
>>> --- a/net/vmw_vsock/af_vsock.c
>>> +++ b/net/vmw_vsock/af_vsock.c
>>> @@ -1427,9 +1427,12 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>>>  		return prot->recvmsg(sk, msg, len, flags, NULL);
>>>  #endif
>>>  
>>> -	if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
>>> +	if (unlikely(flags & MSG_OOB))
>>>  		return -EOPNOTSUPP;
>>>  
>>> +	if (unlikely(flags & MSG_ERRQUEUE))
>>> +		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 0);
>>> +
>>
>> Sorry, but I get build error here, because SOL_VSOCK in undefined. I think it should be added to
>> include/linux/socket.h and to uapi files also for future use in userspace.
>>
> 
> Strange, I built each patch individually without issue. My base is
> netdev/main with your SOL_VSOCK patch applied. I will look today and see
> if I'm missing something.

I see, this is difference, because i'm trying to run this patchset on the last net-next (as it is
supposed to be merged to net-next). I guess You should add this define anyway when You be ready to
be merged to net-next (I really don't know which SOL_VSOCK will be merged first - "Your" or "my" :) )

Thanks, Arseniy

> 
>> Also Stefano Garzarella <sgarzare@redhat.com> suggested to add define something like VSOCK_RECVERR,
>> in the same way as IP_RECVERR, and use it as last parameter of 'sock_recv_errqueue()'.
>>
> 
> Got it, thanks.
> 
>>>  	transport = vsk->transport;
>>>  
>>>  	/* Retrieve the head sk_buff from the socket's receive queue. */
>>>
>>
>> Thanks, Arseniy
> 
> Thanks,
> Bobby
  
Bobby Eshleman Aug. 1, 2023, 4:26 a.m. UTC | #6
On Wed, Jul 26, 2023 at 02:40:22PM -0400, Michael S. Tsirkin wrote:
> On Wed, Jul 19, 2023 at 12:50:15AM +0000, Bobby Eshleman wrote:
> > This commit implements datagram support for vhost/vsock by teaching
> > vhost to use the common virtio transport datagram functions.
> > 
> > If the virtio RX buffer is too small, then the transmission is
> > abandoned, the packet dropped, and EHOSTUNREACH is added to the socket's
> > error queue.
> > 
> > Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
> 
> EHOSTUNREACH?
> 

Yes, in the v4 thread we decided to try to mimic UDP/ICMP behavior when
IP packets are lost.

If an IP packet is dropped and the full UDP segment is not assembled,
then ICMP_TIME_EXCEEDED ICMP_EXC_FRAGTIME is sent. The sending stack
propagates this up the socket as EHOSTUNREACH. ENOBUFS/ENOMEM is already
used for local buffers, so EHOSTUNREACH distinctly points to the remote
end of the flow as well.

> 
> > ---
> >  drivers/vhost/vsock.c    | 62 +++++++++++++++++++++++++++++++++++++++++++++---
> >  net/vmw_vsock/af_vsock.c |  5 +++-
> >  2 files changed, 63 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> > index d5d6a3c3f273..da14260c6654 100644
> > --- a/drivers/vhost/vsock.c
> > +++ b/drivers/vhost/vsock.c
> > @@ -8,6 +8,7 @@
> >   */
> >  #include <linux/miscdevice.h>
> >  #include <linux/atomic.h>
> > +#include <linux/errqueue.h>
> >  #include <linux/module.h>
> >  #include <linux/mutex.h>
> >  #include <linux/vmalloc.h>
> > @@ -32,7 +33,8 @@
> >  enum {
> >  	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
> >  			       (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
> > -			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
> > +			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET) |
> > +			       (1ULL << VIRTIO_VSOCK_F_DGRAM)
> >  };
> >  
> >  enum {
> > @@ -56,6 +58,7 @@ struct vhost_vsock {
> >  	atomic_t queued_replies;
> >  
> >  	u32 guest_cid;
> > +	bool dgram_allow;
> >  	bool seqpacket_allow;
> >  };
> >  
> > @@ -86,6 +89,32 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
> >  	return NULL;
> >  }
> >  
> > +/* Claims ownership of the skb, do not free the skb after calling! */
> > +static void
> > +vhost_transport_error(struct sk_buff *skb, int err)
> > +{
> > +	struct sock_exterr_skb *serr;
> > +	struct sock *sk = skb->sk;
> > +	struct sk_buff *clone;
> > +
> > +	serr = SKB_EXT_ERR(skb);
> > +	memset(serr, 0, sizeof(*serr));
> > +	serr->ee.ee_errno = err;
> > +	serr->ee.ee_origin = SO_EE_ORIGIN_NONE;
> > +
> > +	clone = skb_clone(skb, GFP_KERNEL);
> > +	if (!clone)
> > +		return;
> > +
> > +	if (sock_queue_err_skb(sk, clone))
> > +		kfree_skb(clone);
> > +
> > +	sk->sk_err = err;
> > +	sk_error_report(sk);
> > +
> > +	kfree_skb(skb);
> > +}
> > +
> >  static void
> >  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> >  			    struct vhost_virtqueue *vq)
> > @@ -160,9 +189,15 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> >  		hdr = virtio_vsock_hdr(skb);
> >  
> >  		/* If the packet is greater than the space available in the
> > -		 * buffer, we split it using multiple buffers.
> > +		 * buffer, we split it using multiple buffers for connectible
> > +		 * sockets and drop the packet for datagram sockets.
> >  		 */
> 
> won't this break things like recently proposed zerocopy?
> I think splitup has to be supported for all types.
> 
> 
> >  		if (payload_len > iov_len - sizeof(*hdr)) {
> > +			if (le16_to_cpu(hdr->type) == VIRTIO_VSOCK_TYPE_DGRAM) {
> > +				vhost_transport_error(skb, EHOSTUNREACH);
> > +				continue;
> > +			}
> > +
> >  			payload_len = iov_len - sizeof(*hdr);
> >  
> >  			/* As we are copying pieces of large packet's buffer to
> > @@ -394,6 +429,7 @@ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
> >  	return val < vq->num;
> >  }
> >  
> > +static bool vhost_transport_dgram_allow(u32 cid, u32 port);
> >  static bool vhost_transport_seqpacket_allow(u32 remote_cid);
> >  
> >  static struct virtio_transport vhost_transport = {
> > @@ -410,7 +446,8 @@ static struct virtio_transport vhost_transport = {
> >  		.cancel_pkt               = vhost_transport_cancel_pkt,
> >  
> >  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
> > -		.dgram_allow              = virtio_transport_dgram_allow,
> > +		.dgram_allow              = vhost_transport_dgram_allow,
> > +		.dgram_addr_init          = virtio_transport_dgram_addr_init,
> >  
> >  		.stream_enqueue           = virtio_transport_stream_enqueue,
> >  		.stream_dequeue           = virtio_transport_stream_dequeue,
> > @@ -443,6 +480,22 @@ static struct virtio_transport vhost_transport = {
> >  	.send_pkt = vhost_transport_send_pkt,
> >  };
> >  
> > +static bool vhost_transport_dgram_allow(u32 cid, u32 port)
> > +{
> > +	struct vhost_vsock *vsock;
> > +	bool dgram_allow = false;
> > +
> > +	rcu_read_lock();
> > +	vsock = vhost_vsock_get(cid);
> > +
> > +	if (vsock)
> > +		dgram_allow = vsock->dgram_allow;
> > +
> > +	rcu_read_unlock();
> > +
> > +	return dgram_allow;
> > +}
> > +
> >  static bool vhost_transport_seqpacket_allow(u32 remote_cid)
> >  {
> >  	struct vhost_vsock *vsock;
> > @@ -799,6 +852,9 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
> >  	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
> >  		vsock->seqpacket_allow = true;
> >  
> > +	if (features & (1ULL << VIRTIO_VSOCK_F_DGRAM))
> > +		vsock->dgram_allow = true;
> > +
> >  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
> >  		vq = &vsock->vqs[i];
> >  		mutex_lock(&vq->mutex);
> > diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> > index e73f3b2c52f1..449ed63ac2b0 100644
> > --- a/net/vmw_vsock/af_vsock.c
> > +++ b/net/vmw_vsock/af_vsock.c
> > @@ -1427,9 +1427,12 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
> >  		return prot->recvmsg(sk, msg, len, flags, NULL);
> >  #endif
> >  
> > -	if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
> > +	if (unlikely(flags & MSG_OOB))
> >  		return -EOPNOTSUPP;
> >  
> > +	if (unlikely(flags & MSG_ERRQUEUE))
> > +		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 0);
> > +
> >  	transport = vsk->transport;
> >  
> >  	/* Retrieve the head sk_buff from the socket's receive queue. */
> > 
> > -- 
> > 2.30.2
> 
> _______________________________________________
> Virtualization mailing list
> Virtualization@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/virtualization
  
Bobby Eshleman Aug. 2, 2023, 7:28 p.m. UTC | #7
On Wed, Jul 26, 2023 at 02:40:22PM -0400, Michael S. Tsirkin wrote:
> On Wed, Jul 19, 2023 at 12:50:15AM +0000, Bobby Eshleman wrote:
> > This commit implements datagram support for vhost/vsock by teaching
> > vhost to use the common virtio transport datagram functions.
> > 
> > If the virtio RX buffer is too small, then the transmission is
> > abandoned, the packet dropped, and EHOSTUNREACH is added to the socket's
> > error queue.
> > 
> > Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
> 
> EHOSTUNREACH?
> 
> 
> > ---
> >  drivers/vhost/vsock.c    | 62 +++++++++++++++++++++++++++++++++++++++++++++---
> >  net/vmw_vsock/af_vsock.c |  5 +++-
> >  2 files changed, 63 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> > index d5d6a3c3f273..da14260c6654 100644
> > --- a/drivers/vhost/vsock.c
> > +++ b/drivers/vhost/vsock.c
> > @@ -8,6 +8,7 @@
> >   */
> >  #include <linux/miscdevice.h>
> >  #include <linux/atomic.h>
> > +#include <linux/errqueue.h>
> >  #include <linux/module.h>
> >  #include <linux/mutex.h>
> >  #include <linux/vmalloc.h>
> > @@ -32,7 +33,8 @@
> >  enum {
> >  	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
> >  			       (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
> > -			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
> > +			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET) |
> > +			       (1ULL << VIRTIO_VSOCK_F_DGRAM)
> >  };
> >  
> >  enum {
> > @@ -56,6 +58,7 @@ struct vhost_vsock {
> >  	atomic_t queued_replies;
> >  
> >  	u32 guest_cid;
> > +	bool dgram_allow;
> >  	bool seqpacket_allow;
> >  };
> >  
> > @@ -86,6 +89,32 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
> >  	return NULL;
> >  }
> >  
> > +/* Claims ownership of the skb, do not free the skb after calling! */
> > +static void
> > +vhost_transport_error(struct sk_buff *skb, int err)
> > +{
> > +	struct sock_exterr_skb *serr;
> > +	struct sock *sk = skb->sk;
> > +	struct sk_buff *clone;
> > +
> > +	serr = SKB_EXT_ERR(skb);
> > +	memset(serr, 0, sizeof(*serr));
> > +	serr->ee.ee_errno = err;
> > +	serr->ee.ee_origin = SO_EE_ORIGIN_NONE;
> > +
> > +	clone = skb_clone(skb, GFP_KERNEL);
> > +	if (!clone)
> > +		return;
> > +
> > +	if (sock_queue_err_skb(sk, clone))
> > +		kfree_skb(clone);
> > +
> > +	sk->sk_err = err;
> > +	sk_error_report(sk);
> > +
> > +	kfree_skb(skb);
> > +}
> > +
> >  static void
> >  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> >  			    struct vhost_virtqueue *vq)
> > @@ -160,9 +189,15 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> >  		hdr = virtio_vsock_hdr(skb);
> >  
> >  		/* If the packet is greater than the space available in the
> > -		 * buffer, we split it using multiple buffers.
> > +		 * buffer, we split it using multiple buffers for connectible
> > +		 * sockets and drop the packet for datagram sockets.
> >  		 */
> 
> won't this break things like recently proposed zerocopy?
> I think splitup has to be supported for all types.
> 

Could you elaborate? Is there something about zerocopy that would
prohibit the transport from dropping a datagram?

> 
> >  		if (payload_len > iov_len - sizeof(*hdr)) {
> > +			if (le16_to_cpu(hdr->type) == VIRTIO_VSOCK_TYPE_DGRAM) {
> > +				vhost_transport_error(skb, EHOSTUNREACH);
> > +				continue;
> > +			}
> > +
> >  			payload_len = iov_len - sizeof(*hdr);
> >  
> >  			/* As we are copying pieces of large packet's buffer to
> > @@ -394,6 +429,7 @@ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
> >  	return val < vq->num;
> >  }
> >  
> > +static bool vhost_transport_dgram_allow(u32 cid, u32 port);
> >  static bool vhost_transport_seqpacket_allow(u32 remote_cid);
> >  
> >  static struct virtio_transport vhost_transport = {
> > @@ -410,7 +446,8 @@ static struct virtio_transport vhost_transport = {
> >  		.cancel_pkt               = vhost_transport_cancel_pkt,
> >  
> >  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
> > -		.dgram_allow              = virtio_transport_dgram_allow,
> > +		.dgram_allow              = vhost_transport_dgram_allow,
> > +		.dgram_addr_init          = virtio_transport_dgram_addr_init,
> >  
> >  		.stream_enqueue           = virtio_transport_stream_enqueue,
> >  		.stream_dequeue           = virtio_transport_stream_dequeue,
> > @@ -443,6 +480,22 @@ static struct virtio_transport vhost_transport = {
> >  	.send_pkt = vhost_transport_send_pkt,
> >  };
> >  
> > +static bool vhost_transport_dgram_allow(u32 cid, u32 port)
> > +{
> > +	struct vhost_vsock *vsock;
> > +	bool dgram_allow = false;
> > +
> > +	rcu_read_lock();
> > +	vsock = vhost_vsock_get(cid);
> > +
> > +	if (vsock)
> > +		dgram_allow = vsock->dgram_allow;
> > +
> > +	rcu_read_unlock();
> > +
> > +	return dgram_allow;
> > +}
> > +
> >  static bool vhost_transport_seqpacket_allow(u32 remote_cid)
> >  {
> >  	struct vhost_vsock *vsock;
> > @@ -799,6 +852,9 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
> >  	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
> >  		vsock->seqpacket_allow = true;
> >  
> > +	if (features & (1ULL << VIRTIO_VSOCK_F_DGRAM))
> > +		vsock->dgram_allow = true;
> > +
> >  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
> >  		vq = &vsock->vqs[i];
> >  		mutex_lock(&vq->mutex);
> > diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> > index e73f3b2c52f1..449ed63ac2b0 100644
> > --- a/net/vmw_vsock/af_vsock.c
> > +++ b/net/vmw_vsock/af_vsock.c
> > @@ -1427,9 +1427,12 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
> >  		return prot->recvmsg(sk, msg, len, flags, NULL);
> >  #endif
> >  
> > -	if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
> > +	if (unlikely(flags & MSG_OOB))
> >  		return -EOPNOTSUPP;
> >  
> > +	if (unlikely(flags & MSG_ERRQUEUE))
> > +		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 0);
> > +
> >  	transport = vsk->transport;
> >  
> >  	/* Retrieve the head sk_buff from the socket's receive queue. */
> > 
> > -- 
> > 2.30.2
> 
> _______________________________________________
> Virtualization mailing list
> Virtualization@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/virtualization
  
Bobby Eshleman Aug. 2, 2023, 9:23 p.m. UTC | #8
On Thu, Jul 27, 2023 at 11:00:55AM +0300, Arseniy Krasnov wrote:
> 
> 
> On 26.07.2023 20:55, Bobby Eshleman wrote:
> > On Sat, Jul 22, 2023 at 11:42:38AM +0300, Arseniy Krasnov wrote:
> >>
> >>
> >> On 19.07.2023 03:50, Bobby Eshleman wrote:
> >>> This commit implements datagram support for vhost/vsock by teaching
> >>> vhost to use the common virtio transport datagram functions.
> >>>
> >>> If the virtio RX buffer is too small, then the transmission is
> >>> abandoned, the packet dropped, and EHOSTUNREACH is added to the socket's
> >>> error queue.
> >>>
> >>> Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
> >>> ---
> >>>  drivers/vhost/vsock.c    | 62 +++++++++++++++++++++++++++++++++++++++++++++---
> >>>  net/vmw_vsock/af_vsock.c |  5 +++-
> >>>  2 files changed, 63 insertions(+), 4 deletions(-)
> >>>
> >>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> >>> index d5d6a3c3f273..da14260c6654 100644
> >>> --- a/drivers/vhost/vsock.c
> >>> +++ b/drivers/vhost/vsock.c
> >>> @@ -8,6 +8,7 @@
> >>>   */
> >>>  #include <linux/miscdevice.h>
> >>>  #include <linux/atomic.h>
> >>> +#include <linux/errqueue.h>
> >>>  #include <linux/module.h>
> >>>  #include <linux/mutex.h>
> >>>  #include <linux/vmalloc.h>
> >>> @@ -32,7 +33,8 @@
> >>>  enum {
> >>>  	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
> >>>  			       (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
> >>> -			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
> >>> +			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET) |
> >>> +			       (1ULL << VIRTIO_VSOCK_F_DGRAM)
> >>>  };
> >>>  
> >>>  enum {
> >>> @@ -56,6 +58,7 @@ struct vhost_vsock {
> >>>  	atomic_t queued_replies;
> >>>  
> >>>  	u32 guest_cid;
> >>> +	bool dgram_allow;
> >>>  	bool seqpacket_allow;
> >>>  };
> >>>  
> >>> @@ -86,6 +89,32 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
> >>>  	return NULL;
> >>>  }
> >>>  
> >>> +/* Claims ownership of the skb, do not free the skb after calling! */
> >>> +static void
> >>> +vhost_transport_error(struct sk_buff *skb, int err)
> >>> +{
> >>> +	struct sock_exterr_skb *serr;
> >>> +	struct sock *sk = skb->sk;
> >>> +	struct sk_buff *clone;
> >>> +
> >>> +	serr = SKB_EXT_ERR(skb);
> >>> +	memset(serr, 0, sizeof(*serr));
> >>> +	serr->ee.ee_errno = err;
> >>> +	serr->ee.ee_origin = SO_EE_ORIGIN_NONE;
> >>> +
> >>> +	clone = skb_clone(skb, GFP_KERNEL);
> >>
> >> May for skb which is error carrier we can use 'sock_omalloc()', not 'skb_clone()' ? TCP uses skb
> >> allocated by this function as carriers of error structure. I guess 'skb_clone()' also clones data of origin,
> >> but i think that there is no need in data as we insert it to error queue of the socket.
> >>
> >> What do You think?
> > 
> > IIUC skb_clone() is often used in this scenario so that the user can
> > retrieve the error-causing packet from the error queue.  Is there some
> > reason we shouldn't do this?
> > 
> > I'm seeing that the serr bits need to occur on the clone here, not the
> > original. I didn't realize the SKB_EXT_ERR() is a skb->cb cast. I'm not
> > actually sure how this passes the test case since ->cb isn't cloned.
> 
> Ah yes, sorry, You are right, I just confused this case with zerocopy completion
> handling - there we allocate "empty" skb which carries completion metadata in its
> 'cb' field.
> 
> Hm, but can't we just reinsert current skb (update it's 'cb' as 'sock_exterr_skb')
> to error queue of the socket without cloning it ?
> 
> Thanks, Arseniy
> 

I just assumed other socket types used skb_clone() for some reason
unknown to me and I didn't want to deviate.

If it is fine to just use the skb directly, then I am happy to make that
change.

Best,
Bobby

> > 
> >>
> >>> +	if (!clone)
> >>> +		return;
> >>
> >> What will happen here 'if (!clone)' ? skb will leak as it was removed from queue?
> >>
> > 
> > Ah yes, true.
> > 
> >>> +
> >>> +	if (sock_queue_err_skb(sk, clone))
> >>> +		kfree_skb(clone);
> >>> +
> >>> +	sk->sk_err = err;
> >>> +	sk_error_report(sk);
> >>> +
> >>> +	kfree_skb(skb);
> >>> +}
> >>> +
> >>>  static void
> >>>  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> >>>  			    struct vhost_virtqueue *vq)
> >>> @@ -160,9 +189,15 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> >>>  		hdr = virtio_vsock_hdr(skb);
> >>>  
> >>>  		/* If the packet is greater than the space available in the
> >>> -		 * buffer, we split it using multiple buffers.
> >>> +		 * buffer, we split it using multiple buffers for connectible
> >>> +		 * sockets and drop the packet for datagram sockets.
> >>>  		 */
> >>>  		if (payload_len > iov_len - sizeof(*hdr)) {
> >>> +			if (le16_to_cpu(hdr->type) == VIRTIO_VSOCK_TYPE_DGRAM) {
> >>> +				vhost_transport_error(skb, EHOSTUNREACH);
> >>> +				continue;
> >>> +			}
> >>> +
> >>>  			payload_len = iov_len - sizeof(*hdr);
> >>>  
> >>>  			/* As we are copying pieces of large packet's buffer to
> >>> @@ -394,6 +429,7 @@ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
> >>>  	return val < vq->num;
> >>>  }
> >>>  
> >>> +static bool vhost_transport_dgram_allow(u32 cid, u32 port);
> >>>  static bool vhost_transport_seqpacket_allow(u32 remote_cid);
> >>>  
> >>>  static struct virtio_transport vhost_transport = {
> >>> @@ -410,7 +446,8 @@ static struct virtio_transport vhost_transport = {
> >>>  		.cancel_pkt               = vhost_transport_cancel_pkt,
> >>>  
> >>>  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
> >>> -		.dgram_allow              = virtio_transport_dgram_allow,
> >>> +		.dgram_allow              = vhost_transport_dgram_allow,
> >>> +		.dgram_addr_init          = virtio_transport_dgram_addr_init,
> >>>  
> >>>  		.stream_enqueue           = virtio_transport_stream_enqueue,
> >>>  		.stream_dequeue           = virtio_transport_stream_dequeue,
> >>> @@ -443,6 +480,22 @@ static struct virtio_transport vhost_transport = {
> >>>  	.send_pkt = vhost_transport_send_pkt,
> >>>  };
> >>>  
> >>> +static bool vhost_transport_dgram_allow(u32 cid, u32 port)
> >>> +{
> >>> +	struct vhost_vsock *vsock;
> >>> +	bool dgram_allow = false;
> >>> +
> >>> +	rcu_read_lock();
> >>> +	vsock = vhost_vsock_get(cid);
> >>> +
> >>> +	if (vsock)
> >>> +		dgram_allow = vsock->dgram_allow;
> >>> +
> >>> +	rcu_read_unlock();
> >>> +
> >>> +	return dgram_allow;
> >>> +}
> >>> +
> >>>  static bool vhost_transport_seqpacket_allow(u32 remote_cid)
> >>>  {
> >>>  	struct vhost_vsock *vsock;
> >>> @@ -799,6 +852,9 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
> >>>  	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
> >>>  		vsock->seqpacket_allow = true;
> >>>  
> >>> +	if (features & (1ULL << VIRTIO_VSOCK_F_DGRAM))
> >>> +		vsock->dgram_allow = true;
> >>> +
> >>>  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
> >>>  		vq = &vsock->vqs[i];
> >>>  		mutex_lock(&vq->mutex);
> >>> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> >>> index e73f3b2c52f1..449ed63ac2b0 100644
> >>> --- a/net/vmw_vsock/af_vsock.c
> >>> +++ b/net/vmw_vsock/af_vsock.c
> >>> @@ -1427,9 +1427,12 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
> >>>  		return prot->recvmsg(sk, msg, len, flags, NULL);
> >>>  #endif
> >>>  
> >>> -	if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
> >>> +	if (unlikely(flags & MSG_OOB))
> >>>  		return -EOPNOTSUPP;
> >>>  
> >>> +	if (unlikely(flags & MSG_ERRQUEUE))
> >>> +		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 0);
> >>> +
> >>
> >> Sorry, but I get build error here, because SOL_VSOCK in undefined. I think it should be added to
> >> include/linux/socket.h and to uapi files also for future use in userspace.
> >>
> > 
> > Strange, I built each patch individually without issue. My base is
> > netdev/main with your SOL_VSOCK patch applied. I will look today and see
> > if I'm missing something.
> > 
> >> Also Stefano Garzarella <sgarzare@redhat.com> suggested to add define something like VSOCK_RECVERR,
> >> in the same way as IP_RECVERR, and use it as last parameter of 'sock_recv_errqueue()'.
> >>
> > 
> > Got it, thanks.
> > 
> >>>  	transport = vsk->transport;
> >>>  
> >>>  	/* Retrieve the head sk_buff from the socket's receive queue. */
> >>>
> >>
> >> Thanks, Arseniy
> > 
> > Thanks,
> > Bobby
  
Arseniy Krasnov Aug. 4, 2023, 1:16 p.m. UTC | #9
On 03.08.2023 00:23, Bobby Eshleman wrote:
> On Thu, Jul 27, 2023 at 11:00:55AM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 26.07.2023 20:55, Bobby Eshleman wrote:
>>> On Sat, Jul 22, 2023 at 11:42:38AM +0300, Arseniy Krasnov wrote:
>>>>
>>>>
>>>> On 19.07.2023 03:50, Bobby Eshleman wrote:
>>>>> This commit implements datagram support for vhost/vsock by teaching
>>>>> vhost to use the common virtio transport datagram functions.
>>>>>
>>>>> If the virtio RX buffer is too small, then the transmission is
>>>>> abandoned, the packet dropped, and EHOSTUNREACH is added to the socket's
>>>>> error queue.
>>>>>
>>>>> Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>>>>> ---
>>>>>  drivers/vhost/vsock.c    | 62 +++++++++++++++++++++++++++++++++++++++++++++---
>>>>>  net/vmw_vsock/af_vsock.c |  5 +++-
>>>>>  2 files changed, 63 insertions(+), 4 deletions(-)
>>>>>
>>>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>>>> index d5d6a3c3f273..da14260c6654 100644
>>>>> --- a/drivers/vhost/vsock.c
>>>>> +++ b/drivers/vhost/vsock.c
>>>>> @@ -8,6 +8,7 @@
>>>>>   */
>>>>>  #include <linux/miscdevice.h>
>>>>>  #include <linux/atomic.h>
>>>>> +#include <linux/errqueue.h>
>>>>>  #include <linux/module.h>
>>>>>  #include <linux/mutex.h>
>>>>>  #include <linux/vmalloc.h>
>>>>> @@ -32,7 +33,8 @@
>>>>>  enum {
>>>>>  	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
>>>>>  			       (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
>>>>> -			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
>>>>> +			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET) |
>>>>> +			       (1ULL << VIRTIO_VSOCK_F_DGRAM)
>>>>>  };
>>>>>  
>>>>>  enum {
>>>>> @@ -56,6 +58,7 @@ struct vhost_vsock {
>>>>>  	atomic_t queued_replies;
>>>>>  
>>>>>  	u32 guest_cid;
>>>>> +	bool dgram_allow;
>>>>>  	bool seqpacket_allow;
>>>>>  };
>>>>>  
>>>>> @@ -86,6 +89,32 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
>>>>>  	return NULL;
>>>>>  }
>>>>>  
>>>>> +/* Claims ownership of the skb, do not free the skb after calling! */
>>>>> +static void
>>>>> +vhost_transport_error(struct sk_buff *skb, int err)
>>>>> +{
>>>>> +	struct sock_exterr_skb *serr;
>>>>> +	struct sock *sk = skb->sk;
>>>>> +	struct sk_buff *clone;
>>>>> +
>>>>> +	serr = SKB_EXT_ERR(skb);
>>>>> +	memset(serr, 0, sizeof(*serr));
>>>>> +	serr->ee.ee_errno = err;
>>>>> +	serr->ee.ee_origin = SO_EE_ORIGIN_NONE;
>>>>> +
>>>>> +	clone = skb_clone(skb, GFP_KERNEL);
>>>>
>>>> May for skb which is error carrier we can use 'sock_omalloc()', not 'skb_clone()' ? TCP uses skb
>>>> allocated by this function as carriers of error structure. I guess 'skb_clone()' also clones data of origin,
>>>> but i think that there is no need in data as we insert it to error queue of the socket.
>>>>
>>>> What do You think?
>>>
>>> IIUC skb_clone() is often used in this scenario so that the user can
>>> retrieve the error-causing packet from the error queue.  Is there some
>>> reason we shouldn't do this?
>>>
>>> I'm seeing that the serr bits need to occur on the clone here, not the
>>> original. I didn't realize the SKB_EXT_ERR() is a skb->cb cast. I'm not
>>> actually sure how this passes the test case since ->cb isn't cloned.
>>
>> Ah yes, sorry, You are right, I just confused this case with zerocopy completion
>> handling - there we allocate "empty" skb which carries completion metadata in its
>> 'cb' field.
>>
>> Hm, but can't we just reinsert current skb (update it's 'cb' as 'sock_exterr_skb')
>> to error queue of the socket without cloning it ?
>>
>> Thanks, Arseniy
>>
> 
> I just assumed other socket types used skb_clone() for some reason
> unknown to me and I didn't want to deviate.
> 
> If it is fine to just use the skb directly, then I am happy to make that
> change.

Agree, it is better to use behaviour from already implemented sockets.
I also found, that ICMP clones skb in this way:

https://elixir.bootlin.com/linux/latest/source/net/ipv4/ip_sockglue.c#L412
skb = skb_clone(skb, GFP_ATOMIC);

I guess there is some sense beyond 'skb = skb_clone(skb)'...

Thanks, Arseniy

> 
> Best,
> Bobby
> 
>>>
>>>>
>>>>> +	if (!clone)
>>>>> +		return;
>>>>
>>>> What will happen here 'if (!clone)' ? skb will leak as it was removed from queue?
>>>>
>>>
>>> Ah yes, true.
>>>
>>>>> +
>>>>> +	if (sock_queue_err_skb(sk, clone))
>>>>> +		kfree_skb(clone);
>>>>> +
>>>>> +	sk->sk_err = err;
>>>>> +	sk_error_report(sk);
>>>>> +
>>>>> +	kfree_skb(skb);
>>>>> +}
>>>>> +
>>>>>  static void
>>>>>  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>>>>>  			    struct vhost_virtqueue *vq)
>>>>> @@ -160,9 +189,15 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>>>>>  		hdr = virtio_vsock_hdr(skb);
>>>>>  
>>>>>  		/* If the packet is greater than the space available in the
>>>>> -		 * buffer, we split it using multiple buffers.
>>>>> +		 * buffer, we split it using multiple buffers for connectible
>>>>> +		 * sockets and drop the packet for datagram sockets.
>>>>>  		 */
>>>>>  		if (payload_len > iov_len - sizeof(*hdr)) {
>>>>> +			if (le16_to_cpu(hdr->type) == VIRTIO_VSOCK_TYPE_DGRAM) {
>>>>> +				vhost_transport_error(skb, EHOSTUNREACH);
>>>>> +				continue;
>>>>> +			}
>>>>> +
>>>>>  			payload_len = iov_len - sizeof(*hdr);
>>>>>  
>>>>>  			/* As we are copying pieces of large packet's buffer to
>>>>> @@ -394,6 +429,7 @@ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
>>>>>  	return val < vq->num;
>>>>>  }
>>>>>  
>>>>> +static bool vhost_transport_dgram_allow(u32 cid, u32 port);
>>>>>  static bool vhost_transport_seqpacket_allow(u32 remote_cid);
>>>>>  
>>>>>  static struct virtio_transport vhost_transport = {
>>>>> @@ -410,7 +446,8 @@ static struct virtio_transport vhost_transport = {
>>>>>  		.cancel_pkt               = vhost_transport_cancel_pkt,
>>>>>  
>>>>>  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
>>>>> -		.dgram_allow              = virtio_transport_dgram_allow,
>>>>> +		.dgram_allow              = vhost_transport_dgram_allow,
>>>>> +		.dgram_addr_init          = virtio_transport_dgram_addr_init,
>>>>>  
>>>>>  		.stream_enqueue           = virtio_transport_stream_enqueue,
>>>>>  		.stream_dequeue           = virtio_transport_stream_dequeue,
>>>>> @@ -443,6 +480,22 @@ static struct virtio_transport vhost_transport = {
>>>>>  	.send_pkt = vhost_transport_send_pkt,
>>>>>  };
>>>>>  
>>>>> +static bool vhost_transport_dgram_allow(u32 cid, u32 port)
>>>>> +{
>>>>> +	struct vhost_vsock *vsock;
>>>>> +	bool dgram_allow = false;
>>>>> +
>>>>> +	rcu_read_lock();
>>>>> +	vsock = vhost_vsock_get(cid);
>>>>> +
>>>>> +	if (vsock)
>>>>> +		dgram_allow = vsock->dgram_allow;
>>>>> +
>>>>> +	rcu_read_unlock();
>>>>> +
>>>>> +	return dgram_allow;
>>>>> +}
>>>>> +
>>>>>  static bool vhost_transport_seqpacket_allow(u32 remote_cid)
>>>>>  {
>>>>>  	struct vhost_vsock *vsock;
>>>>> @@ -799,6 +852,9 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
>>>>>  	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
>>>>>  		vsock->seqpacket_allow = true;
>>>>>  
>>>>> +	if (features & (1ULL << VIRTIO_VSOCK_F_DGRAM))
>>>>> +		vsock->dgram_allow = true;
>>>>> +
>>>>>  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
>>>>>  		vq = &vsock->vqs[i];
>>>>>  		mutex_lock(&vq->mutex);
>>>>> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>>>>> index e73f3b2c52f1..449ed63ac2b0 100644
>>>>> --- a/net/vmw_vsock/af_vsock.c
>>>>> +++ b/net/vmw_vsock/af_vsock.c
>>>>> @@ -1427,9 +1427,12 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>>>>>  		return prot->recvmsg(sk, msg, len, flags, NULL);
>>>>>  #endif
>>>>>  
>>>>> -	if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
>>>>> +	if (unlikely(flags & MSG_OOB))
>>>>>  		return -EOPNOTSUPP;
>>>>>  
>>>>> +	if (unlikely(flags & MSG_ERRQUEUE))
>>>>> +		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 0);
>>>>> +
>>>>
>>>> Sorry, but I get build error here, because SOL_VSOCK in undefined. I think it should be added to
>>>> include/linux/socket.h and to uapi files also for future use in userspace.
>>>>
>>>
>>> Strange, I built each patch individually without issue. My base is
>>> netdev/main with your SOL_VSOCK patch applied. I will look today and see
>>> if I'm missing something.
>>>
>>>> Also Stefano Garzarella <sgarzare@redhat.com> suggested to add define something like VSOCK_RECVERR,
>>>> in the same way as IP_RECVERR, and use it as last parameter of 'sock_recv_errqueue()'.
>>>>
>>>
>>> Got it, thanks.
>>>
>>>>>  	transport = vsk->transport;
>>>>>  
>>>>>  	/* Retrieve the head sk_buff from the socket's receive queue. */
>>>>>
>>>>
>>>> Thanks, Arseniy
>>>
>>> Thanks,
>>> Bobby
  

Patch

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index d5d6a3c3f273..da14260c6654 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -8,6 +8,7 @@ 
  */
 #include <linux/miscdevice.h>
 #include <linux/atomic.h>
+#include <linux/errqueue.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/vmalloc.h>
@@ -32,7 +33,8 @@ 
 enum {
 	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
 			       (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
-			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
+			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET) |
+			       (1ULL << VIRTIO_VSOCK_F_DGRAM)
 };
 
 enum {
@@ -56,6 +58,7 @@  struct vhost_vsock {
 	atomic_t queued_replies;
 
 	u32 guest_cid;
+	bool dgram_allow;
 	bool seqpacket_allow;
 };
 
@@ -86,6 +89,32 @@  static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
 	return NULL;
 }
 
+/* Claims ownership of the skb, do not free the skb after calling! */
+static void
+vhost_transport_error(struct sk_buff *skb, int err)
+{
+	struct sock_exterr_skb *serr;
+	struct sock *sk = skb->sk;
+	struct sk_buff *clone;
+
+	serr = SKB_EXT_ERR(skb);
+	memset(serr, 0, sizeof(*serr));
+	serr->ee.ee_errno = err;
+	serr->ee.ee_origin = SO_EE_ORIGIN_NONE;
+
+	clone = skb_clone(skb, GFP_KERNEL);
+	if (!clone)
+		return;
+
+	if (sock_queue_err_skb(sk, clone))
+		kfree_skb(clone);
+
+	sk->sk_err = err;
+	sk_error_report(sk);
+
+	kfree_skb(skb);
+}
+
 static void
 vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			    struct vhost_virtqueue *vq)
@@ -160,9 +189,15 @@  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 		hdr = virtio_vsock_hdr(skb);
 
 		/* If the packet is greater than the space available in the
-		 * buffer, we split it using multiple buffers.
+		 * buffer, we split it using multiple buffers for connectible
+		 * sockets and drop the packet for datagram sockets.
 		 */
 		if (payload_len > iov_len - sizeof(*hdr)) {
+			if (le16_to_cpu(hdr->type) == VIRTIO_VSOCK_TYPE_DGRAM) {
+				vhost_transport_error(skb, EHOSTUNREACH);
+				continue;
+			}
+
 			payload_len = iov_len - sizeof(*hdr);
 
 			/* As we are copying pieces of large packet's buffer to
@@ -394,6 +429,7 @@  static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
 	return val < vq->num;
 }
 
+static bool vhost_transport_dgram_allow(u32 cid, u32 port);
 static bool vhost_transport_seqpacket_allow(u32 remote_cid);
 
 static struct virtio_transport vhost_transport = {
@@ -410,7 +446,8 @@  static struct virtio_transport vhost_transport = {
 		.cancel_pkt               = vhost_transport_cancel_pkt,
 
 		.dgram_enqueue            = virtio_transport_dgram_enqueue,
-		.dgram_allow              = virtio_transport_dgram_allow,
+		.dgram_allow              = vhost_transport_dgram_allow,
+		.dgram_addr_init          = virtio_transport_dgram_addr_init,
 
 		.stream_enqueue           = virtio_transport_stream_enqueue,
 		.stream_dequeue           = virtio_transport_stream_dequeue,
@@ -443,6 +480,22 @@  static struct virtio_transport vhost_transport = {
 	.send_pkt = vhost_transport_send_pkt,
 };
 
+static bool vhost_transport_dgram_allow(u32 cid, u32 port)
+{
+	struct vhost_vsock *vsock;
+	bool dgram_allow = false;
+
+	rcu_read_lock();
+	vsock = vhost_vsock_get(cid);
+
+	if (vsock)
+		dgram_allow = vsock->dgram_allow;
+
+	rcu_read_unlock();
+
+	return dgram_allow;
+}
+
 static bool vhost_transport_seqpacket_allow(u32 remote_cid)
 {
 	struct vhost_vsock *vsock;
@@ -799,6 +852,9 @@  static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
 	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
 		vsock->seqpacket_allow = true;
 
+	if (features & (1ULL << VIRTIO_VSOCK_F_DGRAM))
+		vsock->dgram_allow = true;
+
 	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
 		vq = &vsock->vqs[i];
 		mutex_lock(&vq->mutex);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index e73f3b2c52f1..449ed63ac2b0 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1427,9 +1427,12 @@  int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 		return prot->recvmsg(sk, msg, len, flags, NULL);
 #endif
 
-	if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
+	if (unlikely(flags & MSG_OOB))
 		return -EOPNOTSUPP;
 
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 0);
+
 	transport = vsk->transport;
 
 	/* Retrieve the head sk_buff from the socket's receive queue. */