[RFC,1/4] bpf: add cgroup device guard to flag a cgroup device prog

Message ID 20230814-devcg_guard-v1-1-654971ab88b1@aisec.fraunhofer.de
State New
Headers
Series bpf: cgroup device guard for non-initial user namespace |

Commit Message

Michael Weiß Aug. 14, 2023, 2:26 p.m. UTC
  Introduce the BPF_F_CGROUP_DEVICE_GUARD flag for BPF_PROG_LOAD
which allows to set a cgroup device program to be a device guard.
Later this may be used to guard actions on device nodes in
non-initial userns. For this reason we provide the helper function
cgroup_bpf_device_guard_enabled() to check if a task has a cgroups
device program which is a device guard in its effective set of bpf
programs.

Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
---
 include/linux/bpf-cgroup.h     |  7 +++++++
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       |  5 +++++
 kernel/bpf/cgroup.c            | 30 ++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  5 ++++-
 tools/include/uapi/linux/bpf.h |  5 +++++
 6 files changed, 52 insertions(+), 1 deletion(-)
  

Comments

Alexander Mikhalitsyn Aug. 14, 2023, 3:54 p.m. UTC | #1
On Mon, Aug 14, 2023 at 4:32 PM Michael Weiß
<michael.weiss@aisec.fraunhofer.de> wrote:
>
> Introduce the BPF_F_CGROUP_DEVICE_GUARD flag for BPF_PROG_LOAD
> which allows to set a cgroup device program to be a device guard.
> Later this may be used to guard actions on device nodes in
> non-initial userns. For this reason we provide the helper function
> cgroup_bpf_device_guard_enabled() to check if a task has a cgroups
> device program which is a device guard in its effective set of bpf
> programs.
>
> Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>

Hi Michael!

Thanks for working on this. It's also very useful for the LXC system
containers project.

> ---
>  include/linux/bpf-cgroup.h     |  7 +++++++
>  include/linux/bpf.h            |  1 +
>  include/uapi/linux/bpf.h       |  5 +++++
>  kernel/bpf/cgroup.c            | 30 ++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c           |  5 ++++-
>  tools/include/uapi/linux/bpf.h |  5 +++++
>  6 files changed, 52 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 57e9e109257e..112b6093f9fd 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -184,6 +184,8 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
>         return array != &bpf_empty_prog_array.hdr;
>  }
>
> +bool cgroup_bpf_device_guard_enabled(struct task_struct *task);
> +
>  /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
>  #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)                            \
>  ({                                                                           \
> @@ -476,6 +478,11 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
>         return 0;
>  }
>
> +static bool cgroup_bpf_device_guard_enabled(struct task_struct *task)
> +{
> +       return false;
> +}
> +
>  #define cgroup_bpf_enabled(atype) (0)
>  #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; })
>  #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; })
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index f58895830ada..313cce8aee05 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1384,6 +1384,7 @@ struct bpf_prog_aux {
>         bool sleepable;
>         bool tail_call_reachable;
>         bool xdp_has_frags;
> +       bool cgroup_device_guard;
>         /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
>         const struct btf_type *attach_func_proto;
>         /* function name for valid attach_btf_id */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 60a9d59beeab..3be57f7957b1 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1165,6 +1165,11 @@ enum bpf_link_type {
>   */
>  #define BPF_F_XDP_DEV_BOUND_ONLY       (1U << 6)
>
> +/* If BPF_F_CGROUP_DEVICE_GUARD is used in BPF_PROG_LOAD command, the loaded
> + * program will be allowed to guard device access inside user namespaces.
> + */
> +#define BPF_F_CGROUP_DEVICE_GUARD      (1U << 7)
> +
>  /* link_create.kprobe_multi.flags used in LINK_CREATE command for
>   * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
>   */
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 5b2741aa0d9b..230693ca4cdb 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -2505,6 +2505,36 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
>  const struct bpf_prog_ops cg_sockopt_prog_ops = {
>  };
>
> +bool
> +cgroup_bpf_device_guard_enabled(struct task_struct *task)
> +{
> +       bool ret;
> +       const struct bpf_prog_array *array;
> +       const struct bpf_prog_array_item *item;
> +       const struct bpf_prog *prog;
> +       struct cgroup *cgrp = task_dfl_cgroup(task);
> +
> +       ret = false;
> +
> +       array = rcu_access_pointer(cgrp->bpf.effective[CGROUP_DEVICE]);
> +       if (array == &bpf_empty_prog_array.hdr)
> +               return ret;
> +
> +       mutex_lock(&cgroup_mutex);

-> cgroup_lock();

> +       array = rcu_dereference_protected(cgrp->bpf.effective[CGROUP_DEVICE],
> +                                             lockdep_is_held(&cgroup_mutex));
> +       item = &array->items[0];
> +       while ((prog = READ_ONCE(item->prog))) {
> +               if (prog->aux->cgroup_device_guard) {
> +                       ret = true;
> +                       break;
> +               }
> +               item++;
> +       }
> +       mutex_unlock(&cgroup_mutex);

Don't we want to make this function specific to "current" task? This
allows to make locking lighter like in
__cgroup_bpf_check_dev_permission
https://github.com/torvalds/linux/blob/2ccdd1b13c591d306f0401d98dedc4bdcd02b421/kernel/bpf/cgroup.c#L1531
Here we have only RCU read lock.

AFAIK, cgroup_mutex is a heavily-contended lock.

Kind regards,
Alex

> +       return ret;
> +}
> +
>  /* Common helpers for cgroup hooks. */
>  const struct bpf_func_proto *
>  cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index a2aef900519c..33ea67c702c1 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2564,7 +2564,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
>                                  BPF_F_SLEEPABLE |
>                                  BPF_F_TEST_RND_HI32 |
>                                  BPF_F_XDP_HAS_FRAGS |
> -                                BPF_F_XDP_DEV_BOUND_ONLY))
> +                                BPF_F_XDP_DEV_BOUND_ONLY |
> +                                BPF_F_CGROUP_DEVICE_GUARD))
>                 return -EINVAL;
>
>         if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
> @@ -2651,6 +2652,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
>         prog->aux->dev_bound = !!attr->prog_ifindex;
>         prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
>         prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
> +       prog->aux->cgroup_device_guard =
> +               attr->prog_flags & BPF_F_CGROUP_DEVICE_GUARD;
>
>         err = security_bpf_prog_alloc(prog->aux);
>         if (err)
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 60a9d59beeab..3be57f7957b1 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -1165,6 +1165,11 @@ enum bpf_link_type {
>   */
>  #define BPF_F_XDP_DEV_BOUND_ONLY       (1U << 6)
>
> +/* If BPF_F_CGROUP_DEVICE_GUARD is used in BPF_PROG_LOAD command, the loaded
> + * program will be allowed to guard device access inside user namespaces.
> + */
> +#define BPF_F_CGROUP_DEVICE_GUARD      (1U << 7)
> +
>  /* link_create.kprobe_multi.flags used in LINK_CREATE command for
>   * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
>   */
>
> --
> 2.30.2
>
  
Christian Brauner Aug. 15, 2023, 8:59 a.m. UTC | #2
On Mon, Aug 14, 2023 at 04:26:09PM +0200, Michael Weiß wrote:
> Introduce the BPF_F_CGROUP_DEVICE_GUARD flag for BPF_PROG_LOAD
> which allows to set a cgroup device program to be a device guard.

Currently we block access to devices unconditionally in may_open_dev().
Anything that's mounted by an unprivileged containers will get
SB_I_NODEV set in s_i_flags.

Then we currently mediate device access in:

* inode_permission()
  -> devcgroup_inode_permission()
* vfs_mknod()
  -> devcgroup_inode_mknod()
* blkdev_get_by_dev() // sget()/sget_fc(), other ways to open block devices and friends
  -> devcgroup_check_permission()
* drivers/gpu/drm/amd/amdkfd // weird restrictions on showing gpu info afaict
  -> devcgroup_check_permission()

All your new flag does is to bypass that SB_I_NODEV check afaict and let
it proceed to the devcgroup_*() checks for the vfs layer.

But I don't get the semantics yet.
Is that a flag which is set on BPF_PROG_TYPE_CGROUP_DEVICE programs or
is that a flag on random bpf programs? It looks like it would be the
latter but design-wise I would expect this to be a property of the
device program itself.
  
Michael Weiß Aug. 17, 2023, 3:47 p.m. UTC | #3
On 15.08.23 10:59, Christian Brauner wrote:
> On Mon, Aug 14, 2023 at 04:26:09PM +0200, Michael Weiß wrote:
>> Introduce the BPF_F_CGROUP_DEVICE_GUARD flag for BPF_PROG_LOAD
>> which allows to set a cgroup device program to be a device guard.
> 
> Currently we block access to devices unconditionally in may_open_dev().
> Anything that's mounted by an unprivileged containers will get
> SB_I_NODEV set in s_i_flags.
> 
> Then we currently mediate device access in:
> 
> * inode_permission()
>   -> devcgroup_inode_permission()
> * vfs_mknod()
>   -> devcgroup_inode_mknod()
> * blkdev_get_by_dev() // sget()/sget_fc(), other ways to open block devices and friends
>   -> devcgroup_check_permission()
> * drivers/gpu/drm/amd/amdkfd // weird restrictions on showing gpu info afaict
>   -> devcgroup_check_permission()
> 
> All your new flag does is to bypass that SB_I_NODEV check afaict and let
> it proceed to the devcgroup_*() checks for the vfs layer.

Yes. In an early version, I had the check in super.c to avoid setting the
SB_I_NODEV on mount. I thought it would be a less invasive change to do both
checks in one source file. But from an architecture point of view it would be
better that we do it there. Should we?

> 
> But I don't get the semantics yet.
> Is that a flag which is set on BPF_PROG_TYPE_CGROUP_DEVICE programs or
> is that a flag on random bpf programs? It looks like it would be the
> latter but design-wise I would expect this to be a property of the
> device program itself.

Yes it's a flag on the bpf program which could be set during BPF_PROG_LOAD.
This was straight forward to be implemented similarly to the BPF_F_XDP_*
flags.

Cheers,
Michael
  

Patch

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 57e9e109257e..112b6093f9fd 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -184,6 +184,8 @@  static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 	return array != &bpf_empty_prog_array.hdr;
 }
 
+bool cgroup_bpf_device_guard_enabled(struct task_struct *task);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
@@ -476,6 +478,11 @@  static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 	return 0;
 }
 
+static bool cgroup_bpf_device_guard_enabled(struct task_struct *task)
+{
+	return false;
+}
+
 #define cgroup_bpf_enabled(atype) (0)
 #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f58895830ada..313cce8aee05 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1384,6 +1384,7 @@  struct bpf_prog_aux {
 	bool sleepable;
 	bool tail_call_reachable;
 	bool xdp_has_frags;
+	bool cgroup_device_guard;
 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
 	const struct btf_type *attach_func_proto;
 	/* function name for valid attach_btf_id */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 60a9d59beeab..3be57f7957b1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1165,6 +1165,11 @@  enum bpf_link_type {
  */
 #define BPF_F_XDP_DEV_BOUND_ONLY	(1U << 6)
 
+/* If BPF_F_CGROUP_DEVICE_GUARD is used in BPF_PROG_LOAD command, the loaded
+ * program will be allowed to guard device access inside user namespaces.
+ */
+#define BPF_F_CGROUP_DEVICE_GUARD	(1U << 7)
+
 /* link_create.kprobe_multi.flags used in LINK_CREATE command for
  * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
  */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 5b2741aa0d9b..230693ca4cdb 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -2505,6 +2505,36 @@  const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
 const struct bpf_prog_ops cg_sockopt_prog_ops = {
 };
 
+bool
+cgroup_bpf_device_guard_enabled(struct task_struct *task)
+{
+	bool ret;
+	const struct bpf_prog_array *array;
+	const struct bpf_prog_array_item *item;
+	const struct bpf_prog *prog;
+	struct cgroup *cgrp = task_dfl_cgroup(task);
+
+	ret = false;
+
+	array = rcu_access_pointer(cgrp->bpf.effective[CGROUP_DEVICE]);
+	if (array == &bpf_empty_prog_array.hdr)
+		return ret;
+
+	mutex_lock(&cgroup_mutex);
+	array = rcu_dereference_protected(cgrp->bpf.effective[CGROUP_DEVICE],
+					      lockdep_is_held(&cgroup_mutex));
+	item = &array->items[0];
+	while ((prog = READ_ONCE(item->prog))) {
+		if (prog->aux->cgroup_device_guard) {
+			ret = true;
+			break;
+		}
+		item++;
+	}
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+
 /* Common helpers for cgroup hooks. */
 const struct bpf_func_proto *
 cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a2aef900519c..33ea67c702c1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2564,7 +2564,8 @@  static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 				 BPF_F_SLEEPABLE |
 				 BPF_F_TEST_RND_HI32 |
 				 BPF_F_XDP_HAS_FRAGS |
-				 BPF_F_XDP_DEV_BOUND_ONLY))
+				 BPF_F_XDP_DEV_BOUND_ONLY |
+				 BPF_F_CGROUP_DEVICE_GUARD))
 		return -EINVAL;
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -2651,6 +2652,8 @@  static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	prog->aux->dev_bound = !!attr->prog_ifindex;
 	prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
 	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
+	prog->aux->cgroup_device_guard =
+		attr->prog_flags & BPF_F_CGROUP_DEVICE_GUARD;
 
 	err = security_bpf_prog_alloc(prog->aux);
 	if (err)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 60a9d59beeab..3be57f7957b1 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1165,6 +1165,11 @@  enum bpf_link_type {
  */
 #define BPF_F_XDP_DEV_BOUND_ONLY	(1U << 6)
 
+/* If BPF_F_CGROUP_DEVICE_GUARD is used in BPF_PROG_LOAD command, the loaded
+ * program will be allowed to guard device access inside user namespaces.
+ */
+#define BPF_F_CGROUP_DEVICE_GUARD	(1U << 7)
+
 /* link_create.kprobe_multi.flags used in LINK_CREATE command for
  * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
  */