[bpf-next,4/7] netfilter: bpf: Support BPF_F_NETFILTER_IP_DEFRAG in netfilter link

Message ID 242c66138bf4ec8aa26b29d736fb48242b4164ce.1687819413.git.dxu@dxuuu.xyz
State New
Headers
Series Support defragmenting IPv(4|6) packets in BPF |

Commit Message

Daniel Xu June 26, 2023, 11:02 p.m. UTC
  This commit adds support for enabling IP defrag using pre-existing
netfilter defrag support. Basically all the flag does is bump a refcnt
while the link is active. Checks are also added to ensure the prog
requesting defrag support is run _after_ netfilter defrag hooks.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
---
 include/uapi/linux/bpf.h       |   5 ++
 net/netfilter/nf_bpf_link.c    | 108 +++++++++++++++++++++++++++++----
 tools/include/uapi/linux/bpf.h |   5 ++
 3 files changed, 107 insertions(+), 11 deletions(-)
  

Comments

Florian Westphal June 27, 2023, 11:12 a.m. UTC | #1
Daniel Xu <dxu@dxuuu.xyz> wrote:
> +static int bpf_nf_enable_defrag(struct bpf_nf_link *link)
> +{
> +	int err;
> +
> +	switch (link->hook_ops.pf) {
> +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
> +	case NFPROTO_IPV4:
> +		const struct nf_defrag_v4_hook *v4_hook;
> +
> +		err = request_module("nf_defrag_ipv4");
> +		if (err)
> +			return err;
> +
> +		rcu_read_lock();
> +		v4_hook = rcu_dereference(nf_defrag_v4_hook);
> +		err = v4_hook->enable(link->net);
> +		rcu_read_unlock();

I'd reverse this: first try rcu_dereference(), then modprobe
if that returned NULL.

> +static void bpf_nf_disable_defrag(struct bpf_nf_link *link)
> +{
> +	switch (link->hook_ops.pf) {
> +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
> +	case NFPROTO_IPV4:
> +		const struct nf_defrag_v4_hook *v4_hook;
> +
> +		rcu_read_lock();
> +		v4_hook = rcu_dereference(nf_defrag_v4_hook);
> +		v4_hook->disable(link->net);
> +		rcu_read_unlock();

if (v4_hook)
	v4_hook->disable()

Else we get trouble on manual 'rmmod'.

> +	/* make sure conntrack confirm is always last */
> +	prio = attr->link_create.netfilter.priority;
> +	if (prio == NF_IP_PRI_FIRST)
> +		return -ERANGE;  /* sabotage_in and other warts */
> +	else if (prio == NF_IP_PRI_LAST)
> +		return -ERANGE;  /* e.g. conntrack confirm */
> +	else if ((attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) &&
> +		 (prio > NF_IP_PRI_FIRST && prio <= NF_IP_PRI_CONNTRACK_DEFRAG))
> +		return -ERANGE;  /* cannot use defrag if prog runs before nf_defrag */

You could elide the (prio > NF_IP_PRI_FIRST) check; it's already handled by
the first conditional.  Otherwise this looks good to me.
  
Daniel Xu June 27, 2023, 3:35 p.m. UTC | #2
On Tue, Jun 27, 2023 at 01:12:48PM +0200, Florian Westphal wrote:
> Daniel Xu <dxu@dxuuu.xyz> wrote:
> > +static int bpf_nf_enable_defrag(struct bpf_nf_link *link)
> > +{
> > +	int err;
> > +
> > +	switch (link->hook_ops.pf) {
> > +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
> > +	case NFPROTO_IPV4:
> > +		const struct nf_defrag_v4_hook *v4_hook;
> > +
> > +		err = request_module("nf_defrag_ipv4");
> > +		if (err)
> > +			return err;
> > +
> > +		rcu_read_lock();
> > +		v4_hook = rcu_dereference(nf_defrag_v4_hook);
> > +		err = v4_hook->enable(link->net);
> > +		rcu_read_unlock();
> 
> I'd reverse this: first try rcu_dereference(), then modprobe
> if that returned NULL.

Ack.

> 
> > +static void bpf_nf_disable_defrag(struct bpf_nf_link *link)
> > +{
> > +	switch (link->hook_ops.pf) {
> > +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
> > +	case NFPROTO_IPV4:
> > +		const struct nf_defrag_v4_hook *v4_hook;
> > +
> > +		rcu_read_lock();
> > +		v4_hook = rcu_dereference(nf_defrag_v4_hook);
> > +		v4_hook->disable(link->net);
> > +		rcu_read_unlock();
> 
> if (v4_hook)
> 	v4_hook->disable()
> 
> Else we get trouble on manual 'rmmod'.

Ah good catch, thanks.

> 
> > +	/* make sure conntrack confirm is always last */
> > +	prio = attr->link_create.netfilter.priority;
> > +	if (prio == NF_IP_PRI_FIRST)
> > +		return -ERANGE;  /* sabotage_in and other warts */
> > +	else if (prio == NF_IP_PRI_LAST)
> > +		return -ERANGE;  /* e.g. conntrack confirm */
> > +	else if ((attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) &&
> > +		 (prio > NF_IP_PRI_FIRST && prio <= NF_IP_PRI_CONNTRACK_DEFRAG))
> > +		return -ERANGE;  /* cannot use defrag if prog runs before nf_defrag */
> 
> You could elide the (prio > NF_IP_PRI_FIRST) check; it's already handled by
> the first conditional.  Otherwise this looks good to me.
> 

Ah, right. It's INT_MIN.


Thanks,
Daniel
  

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 60a9d59beeab..04ac77481583 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1170,6 +1170,11 @@  enum bpf_link_type {
  */
 #define BPF_F_KPROBE_MULTI_RETURN	(1U << 0)
 
+/* link_create.netfilter.flags used in LINK_CREATE command for
+ * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation.
+ */
+#define BPF_F_NETFILTER_IP_DEFRAG (1U << 0)
+
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
  * the following extensions:
  *
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
index c36da56d756f..a8015dbce12a 100644
--- a/net/netfilter/nf_bpf_link.c
+++ b/net/netfilter/nf_bpf_link.c
@@ -1,6 +1,7 @@ 
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/bpf.h>
 #include <linux/filter.h>
+#include <linux/kmod.h>
 #include <linux/netfilter.h>
 
 #include <net/netfilter/nf_bpf_link.h>
@@ -23,8 +24,77 @@  struct bpf_nf_link {
 	struct nf_hook_ops hook_ops;
 	struct net *net;
 	u32 dead;
+	bool defrag;
 };
 
+static int bpf_nf_enable_defrag(struct bpf_nf_link *link)
+{
+	int err;
+
+	switch (link->hook_ops.pf) {
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
+	case NFPROTO_IPV4:
+		const struct nf_defrag_v4_hook *v4_hook;
+
+		err = request_module("nf_defrag_ipv4");
+		if (err)
+			return err;
+
+		rcu_read_lock();
+		v4_hook = rcu_dereference(nf_defrag_v4_hook);
+		err = v4_hook->enable(link->net);
+		rcu_read_unlock();
+
+		return err;
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	case NFPROTO_IPV6:
+		const struct nf_defrag_v6_hook *v6_hook;
+
+		err = request_module("nf_defrag_ipv6_hooks");
+		if (err)
+			return err;
+
+		rcu_read_lock();
+		v6_hook = rcu_dereference(nf_defrag_v6_hook);
+		err = v6_hook->enable(link->net);
+		rcu_read_unlock();
+
+		return err;
+#endif
+	default:
+		return -EAFNOSUPPORT;
+	}
+}
+
+static void bpf_nf_disable_defrag(struct bpf_nf_link *link)
+{
+	switch (link->hook_ops.pf) {
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
+	case NFPROTO_IPV4:
+		const struct nf_defrag_v4_hook *v4_hook;
+
+		rcu_read_lock();
+		v4_hook = rcu_dereference(nf_defrag_v4_hook);
+		v4_hook->disable(link->net);
+		rcu_read_unlock();
+
+		break;
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	case NFPROTO_IPV6:
+		const struct nf_defrag_v6_hook *v6_hook;
+
+		rcu_read_lock();
+		v6_hook = rcu_dereference(nf_defrag_v6_hook);
+		v6_hook->disable(link->net);
+		rcu_read_unlock();
+
+		break;
+	}
+#endif
+}
+
 static void bpf_nf_link_release(struct bpf_link *link)
 {
 	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
@@ -37,6 +107,9 @@  static void bpf_nf_link_release(struct bpf_link *link)
 	 */
 	if (!cmpxchg(&nf_link->dead, 0, 1))
 		nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
+
+	if (nf_link->defrag)
+		bpf_nf_disable_defrag(nf_link);
 }
 
 static void bpf_nf_link_dealloc(struct bpf_link *link)
@@ -92,6 +165,8 @@  static const struct bpf_link_ops bpf_nf_link_lops = {
 
 static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr)
 {
+	int prio;
+
 	switch (attr->link_create.netfilter.pf) {
 	case NFPROTO_IPV4:
 	case NFPROTO_IPV6:
@@ -102,19 +177,18 @@  static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr)
 		return -EAFNOSUPPORT;
 	}
 
-	if (attr->link_create.netfilter.flags)
+	if (attr->link_create.netfilter.flags & ~BPF_F_NETFILTER_IP_DEFRAG)
 		return -EOPNOTSUPP;
 
-	/* make sure conntrack confirm is always last.
-	 *
-	 * In the future, if userspace can e.g. request defrag, then
-	 * "defrag_requested && prio before NF_IP_PRI_CONNTRACK_DEFRAG"
-	 * should fail.
-	 */
-	switch (attr->link_create.netfilter.priority) {
-	case NF_IP_PRI_FIRST: return -ERANGE; /* sabotage_in and other warts */
-	case NF_IP_PRI_LAST: return -ERANGE; /* e.g. conntrack confirm */
-	}
+	/* make sure conntrack confirm is always last */
+	prio = attr->link_create.netfilter.priority;
+	if (prio == NF_IP_PRI_FIRST)
+		return -ERANGE;  /* sabotage_in and other warts */
+	else if (prio == NF_IP_PRI_LAST)
+		return -ERANGE;  /* e.g. conntrack confirm */
+	else if ((attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) &&
+		 (prio > NF_IP_PRI_FIRST && prio <= NF_IP_PRI_CONNTRACK_DEFRAG))
+		return -ERANGE;  /* cannot use defrag if prog runs before nf_defrag */
 
 	return 0;
 }
@@ -156,6 +230,18 @@  int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 		return err;
 	}
 
+	if (attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) {
+		err = bpf_nf_enable_defrag(link);
+		if (err) {
+			bpf_link_cleanup(&link_primer);
+			return err;
+		}
+		/* only mark defrag enabled if enabling succeeds so cleanup path
+		 * doesn't disable without a corresponding enable
+		 */
+		link->defrag = true;
+	}
+
 	err = nf_register_net_hook(net, &link->hook_ops);
 	if (err) {
 		bpf_link_cleanup(&link_primer);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 60a9d59beeab..04ac77481583 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1170,6 +1170,11 @@  enum bpf_link_type {
  */
 #define BPF_F_KPROBE_MULTI_RETURN	(1U << 0)
 
+/* link_create.netfilter.flags used in LINK_CREATE command for
+ * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation.
+ */
+#define BPF_F_NETFILTER_IP_DEFRAG (1U << 0)
+
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
  * the following extensions:
  *