[bpf-next] bpf: Export rx queue info for reuseport ebpf prog

Message ID 20230525033757.47483-1-jdamato@fastly.com
State New
Series [bpf-next] bpf: Export rx queue info for reuseport ebpf prog

Commit Message

Joe Damato May 25, 2023, 3:37 a.m. UTC
  BPF_PROG_TYPE_SK_REUSEPORT / sk_reuseport ebpf programs do not have
access to the queue_mapping or napi_id of the incoming skb. Having
this information can help ebpf progs determine which listen socket to
select.

This patch exposes both queue_mapping and napi_id so that
sk_reuseport ebpf programs can use this information to direct incoming
connections to the correct listen socket in the SOCKMAP.

For example:

A multi-threaded userland program with several threads accepting client
connections via a reuseport listen socket group might want to direct
incoming connections from specific receive queues (or NAPI IDs) to specific
listen sockets to maximize locality or for use with epoll busy-poll.
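
For illustration, here is a minimal sketch of such a program built on the
fields this patch adds; the SOCKMAP name and the one-socket-per-RX-queue
layout are hypothetical:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_SOCKMAP);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, __u64);
} reuseport_map SEC(".maps");

SEC("sk_reuseport")
int select_by_rx_queue(struct sk_reuseport_md *ctx)
{
	/* Hypothetical layout: one listen socket per RX queue,
	 * keyed by the queue index.
	 */
	__u32 key = ctx->rx_queue_mapping;

	/* Best effort: if no socket is selected at this key, returning
	 * SK_PASS falls back to the default reuseport selection.
	 */
	bpf_sk_select_reuseport(ctx, &reuseport_map, &key, 0);
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";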

Signed-off-by: Joe Damato <jdamato@fastly.com>
---
 include/uapi/linux/bpf.h |  2 ++
 net/core/filter.c        | 10 ++++++++++
 2 files changed, 12 insertions(+)
  

Comments

Yonghong Song May 25, 2023, 5:26 a.m. UTC | #1
On 5/24/23 8:37 PM, Joe Damato wrote:
> BPF_PROG_TYPE_SK_REUSEPORT / sk_reuseport ebpf programs do not have
> access to the queue_mapping or napi_id of the incoming skb. Having
> this information can help ebpf progs determine which listen socket to
> select.
> 
> This patch exposes both queue_mapping and napi_id so that
> sk_reuseport ebpf programs can use this information to direct incoming
> connections to the correct listen socket in the SOCKMAP.
> 
> For example:
> 
> A multi-threaded userland program with several threads accepting client
> connections via a reuseport listen socket group might want to direct
> incoming connections from specific receive queues (or NAPI IDs) to specific
> listen sockets to maximize locality or for use with epoll busy-poll.
> 
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
>   include/uapi/linux/bpf.h |  2 ++
>   net/core/filter.c        | 10 ++++++++++
>   2 files changed, 12 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 9273c654743c..31560b506535 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -6286,6 +6286,8 @@ struct sk_reuseport_md {
>   	 */
>   	__u32 eth_protocol;
>   	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
> +	__u32 rx_queue_mapping; /* Rx queue associated with the skb */
> +	__u32 napi_id;          /* napi id associated with the skb */
>   	__u32 bind_inany;	/* Is sock bound to an INANY address? */
>   	__u32 hash;		/* A hash of the packet 4 tuples */

This won't work. You will need to append to the end of the data structure
to keep backward compatibility.
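
For illustration, appending would look roughly like this (assuming
migrating_sk is still the last member of struct sk_reuseport_md):

struct sk_reuseport_md {
	/* ... existing fields, offsets unchanged ... */
	__bpf_md_ptr(struct bpf_sock *, sk);
	__bpf_md_ptr(struct bpf_sock *, migrating_sk);
	__u32 rx_queue_mapping;	/* new fields appended at the end */
	__u32 napi_id;
};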

Also, recent kernels have a kfunc, bpf_cast_to_kern_ctx(), which converts
a ctx to a kernel ctx; you can then use tracing-coding-style access to
read those fields. In this particular case, you can do

    struct sk_reuseport_kern *kctx = bpf_cast_to_kern_ctx(ctx);

We have

struct sk_reuseport_kern {
         struct sk_buff *skb;
         struct sock *sk;
         struct sock *selected_sk;
         struct sock *migrating_sk;
         void *data_end;
         u32 hash;
         u32 reuseport_id;
         bool bind_inany;
};

Through the sk and skb pointers, you should be able to access the fields
presented in this patch. You can access more fields too.

So using bpf_cast_to_kern_ctx(), there is no need for more uapi changes.
Please give it a try.
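
For reference, a minimal sketch of that approach, assuming a kernel with
bpf_cast_to_kern_ctx(), vmlinux.h/CO-RE, and CONFIG_NET_RX_BUSY_POLL (for
skb->napi_id); the program logic is illustrative:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

void *bpf_cast_to_kern_ctx(void *) __ksym;

SEC("sk_reuseport")
int select_by_napi_id(struct sk_reuseport_md *ctx)
{
	struct sk_reuseport_kern *kctx = bpf_cast_to_kern_ctx(ctx);

	/* kctx is a trusted kernel pointer, so the skb fields can be
	 * read directly, with no uapi changes needed.
	 */
	__u32 queue = kctx->skb->queue_mapping;
	__u32 napi = kctx->skb->napi_id;

	bpf_printk("rxq %u napi %u", queue, napi);

	/* ... use queue/napi as the key for bpf_sk_select_reuseport() ... */
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";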

>   	/* When reuse->migrating_sk is NULL, it is selecting a sk for the
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 968139f4a1ac..71826e1ef7dc 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -11134,6 +11134,8 @@ sk_reuseport_is_valid_access(int off, int size,
>   	case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
>   	case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
>   	case bpf_ctx_range(struct sk_reuseport_md, len):
> +	case bpf_ctx_range(struct sk_reuseport_md, rx_queue_mapping):
> +	case bpf_ctx_range(struct sk_reuseport_md, napi_id):
>   		bpf_ctx_record_field_size(info, size_default);
>   		return bpf_ctx_narrow_access_ok(off, size, size_default);
>   
> @@ -11183,6 +11185,14 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
>   		SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
>   		break;
>   
> +	case offsetof(struct sk_reuseport_md, rx_queue_mapping):
> +		SK_REUSEPORT_LOAD_SKB_FIELD(queue_mapping);
> +		break;
> +
> +	case offsetof(struct sk_reuseport_md, napi_id):
> +		SK_REUSEPORT_LOAD_SKB_FIELD(napi_id);
> +		break;
> +
>   	case offsetof(struct sk_reuseport_md, ip_protocol):
>   		SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol);
>   		break;
  
Joe Damato May 25, 2023, 2:35 p.m. UTC | #2
On Wed, May 24, 2023 at 10:26:32PM -0700, Yonghong Song wrote:
> 
> 
> On 5/24/23 8:37 PM, Joe Damato wrote:
> >BPF_PROG_TYPE_SK_REUSEPORT / sk_reuseport ebpf programs do not have
> >access to the queue_mapping or napi_id of the incoming skb. Having
> >this information can help ebpf progs determine which listen socket to
> >select.
> >
> >This patch exposes both queue_mapping and napi_id so that
> >sk_reuseport ebpf programs can use this information to direct incoming
> >connections to the correct listen socket in the SOCKMAP.
> >
> >For example:
> >
> >A multi-threaded userland program with several threads accepting client
> >connections via a reuseport listen socket group might want to direct
> >incoming connections from specific receive queues (or NAPI IDs) to specific
> >listen sockets to maximize locality or for use with epoll busy-poll.
> >
> >Signed-off-by: Joe Damato <jdamato@fastly.com>
> >---
> >  include/uapi/linux/bpf.h |  2 ++
> >  net/core/filter.c        | 10 ++++++++++
> >  2 files changed, 12 insertions(+)
> >
> >diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >index 9273c654743c..31560b506535 100644
> >--- a/include/uapi/linux/bpf.h
> >+++ b/include/uapi/linux/bpf.h
> >@@ -6286,6 +6286,8 @@ struct sk_reuseport_md {
> >  	 */
> >  	__u32 eth_protocol;
> >  	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
> >+	__u32 rx_queue_mapping; /* Rx queue associated with the skb */
> >+	__u32 napi_id;          /* napi id associated with the skb */
> >  	__u32 bind_inany;	/* Is sock bound to an INANY address? */
> >  	__u32 hash;		/* A hash of the packet 4 tuples */
> 
> This won't work. You will need to append to the end of the data structure
> to keep backward compatibility.
> 
> Also, recent kernels have a kfunc, bpf_cast_to_kern_ctx(), which converts
> a ctx to a kernel ctx; you can then use tracing-coding-style access to
> read those fields. In this particular case, you can do
> 
>    struct sk_reuseport_kern *kctx = bpf_cast_to_kern_ctx(ctx);
> 
> We have
> 
> struct sk_reuseport_kern {
>         struct sk_buff *skb;
>         struct sock *sk;
>         struct sock *selected_sk;
>         struct sock *migrating_sk;
>         void *data_end;
>         u32 hash;
>         u32 reuseport_id;
>         bool bind_inany;
> };
> 
> Through the sk and skb pointers, you should be able to access the fields
> presented in this patch. You can access more fields too.
> 
> So using bpf_cast_to_kern_ctx(), there is no need for more uapi changes.
> Please give it a try.

Thanks! I was looking at an LTS kernel tree that didn't have
bpf_cast_to_kern_ctx; this is very helpful and definitely a better way to
go.

Sorry for the noise.
  

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9273c654743c..31560b506535 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6286,6 +6286,8 @@  struct sk_reuseport_md {
 	 */
 	__u32 eth_protocol;
 	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
+	__u32 rx_queue_mapping; /* Rx queue associated with the skb */
+	__u32 napi_id;          /* napi id associated with the skb */
 	__u32 bind_inany;	/* Is sock bound to an INANY address? */
 	__u32 hash;		/* A hash of the packet 4 tuples */
 	/* When reuse->migrating_sk is NULL, it is selecting a sk for the
diff --git a/net/core/filter.c b/net/core/filter.c
index 968139f4a1ac..71826e1ef7dc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -11134,6 +11134,8 @@  sk_reuseport_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
 	case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
 	case bpf_ctx_range(struct sk_reuseport_md, len):
+	case bpf_ctx_range(struct sk_reuseport_md, rx_queue_mapping):
+	case bpf_ctx_range(struct sk_reuseport_md, napi_id):
 		bpf_ctx_record_field_size(info, size_default);
 		return bpf_ctx_narrow_access_ok(off, size, size_default);
 
@@ -11183,6 +11185,14 @@  static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
 		SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
 		break;
 
+	case offsetof(struct sk_reuseport_md, rx_queue_mapping):
+		SK_REUSEPORT_LOAD_SKB_FIELD(queue_mapping);
+		break;
+
+	case offsetof(struct sk_reuseport_md, napi_id):
+		SK_REUSEPORT_LOAD_SKB_FIELD(napi_id);
+		break;
+
 	case offsetof(struct sk_reuseport_md, ip_protocol):
 		SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol);
 		break;