[net-next,v3,3/3] eventpoll: Add epoll ioctl for epoll_params

Message ID 20240125225704.12781-4-jdamato@fastly.com
State New
Headers
Series Per epoll context busy poll support |

Commit Message

Joe Damato Jan. 25, 2024, 10:56 p.m. UTC
  Add an ioctl for getting and setting epoll_params. User programs can use
this ioctl to get and set the busy poll usec time or packet budget
params for a specific epoll context.

Signed-off-by: Joe Damato <jdamato@fastly.com>
---
 .../userspace-api/ioctl/ioctl-number.rst      |  1 +
 fs/eventpoll.c                                | 64 +++++++++++++++++++
 include/uapi/linux/eventpoll.h                | 12 ++++
 3 files changed, 77 insertions(+)
  

Comments

Greg KH Jan. 25, 2024, 11:20 p.m. UTC | #1
On Thu, Jan 25, 2024 at 10:56:59PM +0000, Joe Damato wrote:
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -6,6 +6,8 @@
>   *  Davide Libenzi <davidel@xmailserver.org>
>   */
>  
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +

Why this addition?  You do not add any pr_*() calls in this patch at all
that I can see.

thanks,

greg k-h
  
Greg KH Jan. 25, 2024, 11:21 p.m. UTC | #2
On Thu, Jan 25, 2024 at 10:56:59PM +0000, Joe Damato wrote:
> +struct epoll_params {
> +	u64 busy_poll_usecs;
> +	u16 busy_poll_budget;
> +
> +	/* for future fields */
> +	u8 data[118];
> +} EPOLL_PACKED;

variables that cross the user/kernel boundry need to be __u64, __u16,
and __u8 here.

And why 118?

thanks,

greg k-h
  
Greg KH Jan. 25, 2024, 11:22 p.m. UTC | #3
On Thu, Jan 25, 2024 at 10:56:59PM +0000, Joe Damato wrote:
> Add an ioctl for getting and setting epoll_params. User programs can use
> this ioctl to get and set the busy poll usec time or packet budget
> params for a specific epoll context.
> 
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
>  .../userspace-api/ioctl/ioctl-number.rst      |  1 +
>  fs/eventpoll.c                                | 64 +++++++++++++++++++
>  include/uapi/linux/eventpoll.h                | 12 ++++
>  3 files changed, 77 insertions(+)
> 
> diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
> index 457e16f06e04..b33918232f78 100644
> --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> @@ -309,6 +309,7 @@ Code  Seq#    Include File                                           Comments
>  0x89  0B-DF  linux/sockios.h
>  0x89  E0-EF  linux/sockios.h                                         SIOCPROTOPRIVATE range
>  0x89  F0-FF  linux/sockios.h                                         SIOCDEVPRIVATE range
> +0x8A  00-1F  linux/eventpoll.h
>  0x8B  all    linux/wireless.h
>  0x8C  00-3F                                                          WiNRADiO driver
>                                                                       <http://www.winradio.com.au/>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 40bd97477b91..73ae886efb8a 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -6,6 +6,8 @@
>   *  Davide Libenzi <davidel@xmailserver.org>
>   */
>  
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
>  #include <linux/init.h>
>  #include <linux/kernel.h>
>  #include <linux/sched/signal.h>
> @@ -37,6 +39,7 @@
>  #include <linux/seq_file.h>
>  #include <linux/compat.h>
>  #include <linux/rculist.h>
> +#include <linux/capability.h>
>  #include <net/busy_poll.h>
>  
>  /*
> @@ -495,6 +498,39 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
>  	ep->napi_id = napi_id;
>  }
>  
> +static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
> +				  unsigned long arg)
> +{
> +	struct eventpoll *ep;
> +	struct epoll_params epoll_params;
> +	void __user *uarg = (void __user *) arg;
> +
> +	ep = file->private_data;
> +
> +	switch (cmd) {
> +	case EPIOCSPARAMS:
> +		if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params)))
> +			return -EFAULT;
> +
> +		if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT &&
> +		    !capable(CAP_NET_ADMIN))
> +			return -EPERM;
> +
> +		ep->busy_poll_usecs = epoll_params.busy_poll_usecs;
> +		ep->busy_poll_budget = epoll_params.busy_poll_budget;
> +		return 0;
> +	case EPIOCGPARAMS:
> +		memset(&epoll_params, 0, sizeof(epoll_params));
> +		epoll_params.busy_poll_usecs = ep->busy_poll_usecs;
> +		epoll_params.busy_poll_budget = ep->busy_poll_budget;
> +		if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params)))
> +			return -EFAULT;
> +		return 0;
> +	default:
> +		return -ENOIOCTLCMD;
> +	}
> +}
> +
>  #else
>  
>  static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
> @@ -510,6 +546,12 @@ static inline bool ep_busy_loop_on(struct eventpoll *ep)
>  {
>  	return false;
>  }
> +
> +static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
> +				  unsigned long arg)
> +{
> +	return -EOPNOTSUPP;
> +}
>  #endif /* CONFIG_NET_RX_BUSY_POLL */
>  
>  /*
> @@ -869,6 +911,26 @@ static void ep_clear_and_put(struct eventpoll *ep)
>  		ep_free(ep);
>  }
>  
> +static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> +{
> +	int ret;
> +
> +	if (!is_file_epoll(file))
> +		return -EINVAL;
> +
> +	switch (cmd) {
> +	case EPIOCSPARAMS:
> +	case EPIOCGPARAMS:
> +		ret = ep_eventpoll_bp_ioctl(file, cmd, arg);
> +		break;
> +	default:
> +		ret = -EINVAL;
> +		break;
> +	}
> +
> +	return ret;
> +}
> +
>  static int ep_eventpoll_release(struct inode *inode, struct file *file)
>  {
>  	struct eventpoll *ep = file->private_data;
> @@ -975,6 +1037,8 @@ static const struct file_operations eventpoll_fops = {
>  	.release	= ep_eventpoll_release,
>  	.poll		= ep_eventpoll_poll,
>  	.llseek		= noop_llseek,
> +	.unlocked_ioctl	= ep_eventpoll_ioctl,
> +	.compat_ioctl   = compat_ptr_ioctl,
>  };
>  
>  /*
> diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
> index cfbcc4cc49ac..8eb0fdbce995 100644
> --- a/include/uapi/linux/eventpoll.h
> +++ b/include/uapi/linux/eventpoll.h
> @@ -85,4 +85,16 @@ struct epoll_event {
>  	__u64 data;
>  } EPOLL_PACKED;
>  
> +struct epoll_params {
> +	u64 busy_poll_usecs;
> +	u16 busy_poll_budget;
> +
> +	/* for future fields */
> +	u8 data[118];

You forgot to validate that "data" is set to 0, which means that this
would be useless.  Why have this at all?

thanks,

greg k-h
  
Joe Damato Jan. 26, 2024, 12:04 a.m. UTC | #4
On Thu, Jan 25, 2024 at 03:20:37PM -0800, Greg Kroah-Hartman wrote:
> On Thu, Jan 25, 2024 at 10:56:59PM +0000, Joe Damato wrote:
> > --- a/fs/eventpoll.c
> > +++ b/fs/eventpoll.c
> > @@ -6,6 +6,8 @@
> >   *  Davide Libenzi <davidel@xmailserver.org>
> >   */
> >  
> > +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> > +
> 
> Why this addition?  You do not add any pr_*() calls in this patch at all
> that I can see.

Thanks, I've removed this for the next version. It was a remnant from a
previous version.
  
Joe Damato Jan. 26, 2024, 12:07 a.m. UTC | #5
On Thu, Jan 25, 2024 at 03:22:34PM -0800, Greg Kroah-Hartman wrote:
> On Thu, Jan 25, 2024 at 10:56:59PM +0000, Joe Damato wrote:
> > Add an ioctl for getting and setting epoll_params. User programs can use
> > this ioctl to get and set the busy poll usec time or packet budget
> > params for a specific epoll context.
> > 
> > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > ---
> >  .../userspace-api/ioctl/ioctl-number.rst      |  1 +
> >  fs/eventpoll.c                                | 64 +++++++++++++++++++
> >  include/uapi/linux/eventpoll.h                | 12 ++++
> >  3 files changed, 77 insertions(+)
> > 
> > diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
> > index 457e16f06e04..b33918232f78 100644
> > --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> > +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> > @@ -309,6 +309,7 @@ Code  Seq#    Include File                                           Comments
> >  0x89  0B-DF  linux/sockios.h
> >  0x89  E0-EF  linux/sockios.h                                         SIOCPROTOPRIVATE range
> >  0x89  F0-FF  linux/sockios.h                                         SIOCDEVPRIVATE range
> > +0x8A  00-1F  linux/eventpoll.h
> >  0x8B  all    linux/wireless.h
> >  0x8C  00-3F                                                          WiNRADiO driver
> >                                                                       <http://www.winradio.com.au/>
> > diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> > index 40bd97477b91..73ae886efb8a 100644
> > --- a/fs/eventpoll.c
> > +++ b/fs/eventpoll.c
> > @@ -6,6 +6,8 @@
> >   *  Davide Libenzi <davidel@xmailserver.org>
> >   */
> >  
> > +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> > +
> >  #include <linux/init.h>
> >  #include <linux/kernel.h>
> >  #include <linux/sched/signal.h>
> > @@ -37,6 +39,7 @@
> >  #include <linux/seq_file.h>
> >  #include <linux/compat.h>
> >  #include <linux/rculist.h>
> > +#include <linux/capability.h>
> >  #include <net/busy_poll.h>
> >  
> >  /*
> > @@ -495,6 +498,39 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
> >  	ep->napi_id = napi_id;
> >  }
> >  
> > +static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
> > +				  unsigned long arg)
> > +{
> > +	struct eventpoll *ep;
> > +	struct epoll_params epoll_params;
> > +	void __user *uarg = (void __user *) arg;
> > +
> > +	ep = file->private_data;
> > +
> > +	switch (cmd) {
> > +	case EPIOCSPARAMS:
> > +		if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params)))
> > +			return -EFAULT;
> > +
> > +		if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT &&
> > +		    !capable(CAP_NET_ADMIN))
> > +			return -EPERM;
> > +
> > +		ep->busy_poll_usecs = epoll_params.busy_poll_usecs;
> > +		ep->busy_poll_budget = epoll_params.busy_poll_budget;
> > +		return 0;
> > +	case EPIOCGPARAMS:
> > +		memset(&epoll_params, 0, sizeof(epoll_params));
> > +		epoll_params.busy_poll_usecs = ep->busy_poll_usecs;
> > +		epoll_params.busy_poll_budget = ep->busy_poll_budget;
> > +		if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params)))
> > +			return -EFAULT;
> > +		return 0;
> > +	default:
> > +		return -ENOIOCTLCMD;
> > +	}
> > +}
> > +
> >  #else
> >  
> >  static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
> > @@ -510,6 +546,12 @@ static inline bool ep_busy_loop_on(struct eventpoll *ep)
> >  {
> >  	return false;
> >  }
> > +
> > +static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
> > +				  unsigned long arg)
> > +{
> > +	return -EOPNOTSUPP;
> > +}
> >  #endif /* CONFIG_NET_RX_BUSY_POLL */
> >  
> >  /*
> > @@ -869,6 +911,26 @@ static void ep_clear_and_put(struct eventpoll *ep)
> >  		ep_free(ep);
> >  }
> >  
> > +static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> > +{
> > +	int ret;
> > +
> > +	if (!is_file_epoll(file))
> > +		return -EINVAL;
> > +
> > +	switch (cmd) {
> > +	case EPIOCSPARAMS:
> > +	case EPIOCGPARAMS:
> > +		ret = ep_eventpoll_bp_ioctl(file, cmd, arg);
> > +		break;
> > +	default:
> > +		ret = -EINVAL;
> > +		break;
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> >  static int ep_eventpoll_release(struct inode *inode, struct file *file)
> >  {
> >  	struct eventpoll *ep = file->private_data;
> > @@ -975,6 +1037,8 @@ static const struct file_operations eventpoll_fops = {
> >  	.release	= ep_eventpoll_release,
> >  	.poll		= ep_eventpoll_poll,
> >  	.llseek		= noop_llseek,
> > +	.unlocked_ioctl	= ep_eventpoll_ioctl,
> > +	.compat_ioctl   = compat_ptr_ioctl,
> >  };
> >  
> >  /*
> > diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
> > index cfbcc4cc49ac..8eb0fdbce995 100644
> > --- a/include/uapi/linux/eventpoll.h
> > +++ b/include/uapi/linux/eventpoll.h
> > @@ -85,4 +85,16 @@ struct epoll_event {
> >  	__u64 data;
> >  } EPOLL_PACKED;
> >  
> > +struct epoll_params {
> > +	u64 busy_poll_usecs;
> > +	u16 busy_poll_budget;
> > +
> > +	/* for future fields */
> > +	u8 data[118];
> 
> You forgot to validate that "data" is set to 0, which means that this
> would be useless.  Why have this at all?

I included this because I (probably incorrectly) thought that there should
be some extra space in the struct for future additions if needed.

I am not sure if that is a recommended practice for this sort of thing or
not, but if it is I can add some validation.

Thanks for your time and effort in reviewing my code.
  
Joe Damato Jan. 26, 2024, 12:11 a.m. UTC | #6
On Thu, Jan 25, 2024 at 03:21:46PM -0800, Greg Kroah-Hartman wrote:
> On Thu, Jan 25, 2024 at 10:56:59PM +0000, Joe Damato wrote:
> > +struct epoll_params {
> > +	u64 busy_poll_usecs;
> > +	u16 busy_poll_budget;
> > +
> > +	/* for future fields */
> > +	u8 data[118];
> > +} EPOLL_PACKED;
> 
> variables that cross the user/kernel boundry need to be __u64, __u16,
> and __u8 here.

I'll make that change for the next version, thank you.

> And why 118?

I chose this arbitrarily. I figured that a 128 byte struct would support 16
u64s in the event that other fields needed to be added in the future. 118
is what was left after the existing fields. There's almost certainly a
better way to do this - or perhaps it is unnecessary as per your other
message.

I am not sure if leaving extra space in the struct is a recommended
practice for ioctls or not - I thought I noticed some code that did and
some that didn't in the kernel so I err'd on the side of leaving the space
and probably did it in the worst way possible.

Thanks,
Joe
  
Greg KH Jan. 26, 2024, 12:23 a.m. UTC | #7
On Thu, Jan 25, 2024 at 04:11:28PM -0800, Joe Damato wrote:
> On Thu, Jan 25, 2024 at 03:21:46PM -0800, Greg Kroah-Hartman wrote:
> > On Thu, Jan 25, 2024 at 10:56:59PM +0000, Joe Damato wrote:
> > > +struct epoll_params {
> > > +	u64 busy_poll_usecs;
> > > +	u16 busy_poll_budget;
> > > +
> > > +	/* for future fields */
> > > +	u8 data[118];
> > > +} EPOLL_PACKED;
> > 
> > variables that cross the user/kernel boundry need to be __u64, __u16,
> > and __u8 here.
> 
> I'll make that change for the next version, thank you.
> 
> > And why 118?
> 
> I chose this arbitrarily. I figured that a 128 byte struct would support 16
> u64s in the event that other fields needed to be added in the future. 118
> is what was left after the existing fields. There's almost certainly a
> better way to do this - or perhaps it is unnecessary as per your other
> message.
> 
> I am not sure if leaving extra space in the struct is a recommended
> practice for ioctls or not - I thought I noticed some code that did and
> some that didn't in the kernel so I err'd on the side of leaving the space
> and probably did it in the worst way possible.

It's not really a good idea unless you know exactly what you are going
to do with it.  Why not just have a new ioctl if you need new
information in the future?  That's simpler, right?

thanks,

greg k-h
  
Joe Damato Jan. 26, 2024, 2:36 a.m. UTC | #8
On Thu, Jan 25, 2024 at 04:23:58PM -0800, Greg Kroah-Hartman wrote:
> On Thu, Jan 25, 2024 at 04:11:28PM -0800, Joe Damato wrote:
> > On Thu, Jan 25, 2024 at 03:21:46PM -0800, Greg Kroah-Hartman wrote:
> > > On Thu, Jan 25, 2024 at 10:56:59PM +0000, Joe Damato wrote:
> > > > +struct epoll_params {
> > > > +	u64 busy_poll_usecs;
> > > > +	u16 busy_poll_budget;
> > > > +
> > > > +	/* for future fields */
> > > > +	u8 data[118];
> > > > +} EPOLL_PACKED;
> > > 
> > > variables that cross the user/kernel boundry need to be __u64, __u16,
> > > and __u8 here.
> > 
> > I'll make that change for the next version, thank you.
> > 
> > > And why 118?
> > 
> > I chose this arbitrarily. I figured that a 128 byte struct would support 16
> > u64s in the event that other fields needed to be added in the future. 118
> > is what was left after the existing fields. There's almost certainly a
> > better way to do this - or perhaps it is unnecessary as per your other
> > message.
> > 
> > I am not sure if leaving extra space in the struct is a recommended
> > practice for ioctls or not - I thought I noticed some code that did and
> > some that didn't in the kernel so I err'd on the side of leaving the space
> > and probably did it in the worst way possible.
> 
> It's not really a good idea unless you know exactly what you are going
> to do with it.  Why not just have a new ioctl if you need new
> information in the future?  That's simpler, right?

Sure, that makes sense to me. I'll remove it in the v4 alongside the other
changes you've requested.

Thanks for your time and patience reviewing my code. I greatly appreciate
your helpful comments and feedback.

Thanks,
Joe
  
Arnd Bergmann Jan. 26, 2024, 6:16 a.m. UTC | #9
On Fri, Jan 26, 2024, at 03:36, Joe Damato wrote:
> On Thu, Jan 25, 2024 at 04:23:58PM -0800, Greg Kroah-Hartman wrote:
>> On Thu, Jan 25, 2024 at 04:11:28PM -0800, Joe Damato wrote:
>> > On Thu, Jan 25, 2024 at 03:21:46PM -0800, Greg Kroah-Hartman wrote:
>> > > On Thu, Jan 25, 2024 at 10:56:59PM +0000, Joe Damato wrote:
>> > > > +struct epoll_params {
>> > > > +	u64 busy_poll_usecs;
>> > > > +	u16 busy_poll_budget;
>> > > > +
>> > > > +	/* for future fields */
>> > > > +	u8 data[118];
>> > > > +} EPOLL_PACKED;
>> > > 
>
> Sure, that makes sense to me. I'll remove it in the v4 alongside the other
> changes you've requested.
>
> Thanks for your time and patience reviewing my code. I greatly appreciate
> your helpful comments and feedback.

Note that you should still pad the structure to its normal
alignment. On non-x86 targets this would currently mean a
multiple of 64 bits.

I would suggest dropping the EPOLL_PACKED here entirely and
just using a fully aligned structure on all architectures, like

struct epoll_params {
      __aligned_u64 busy_poll_usecs;
      __u16 busy_poll_budget;
      __u8 __pad[6];
};

The explicit padding can help avoid leaking stack data when
a structure is copied back from kernel to userspace, so I would
just always use it in ioctl data structures.

      Arnd
  
Christian Brauner Jan. 26, 2024, 10:07 a.m. UTC | #10
On Thu, Jan 25, 2024 at 06:36:30PM -0800, Joe Damato wrote:
> On Thu, Jan 25, 2024 at 04:23:58PM -0800, Greg Kroah-Hartman wrote:
> > On Thu, Jan 25, 2024 at 04:11:28PM -0800, Joe Damato wrote:
> > > On Thu, Jan 25, 2024 at 03:21:46PM -0800, Greg Kroah-Hartman wrote:
> > > > On Thu, Jan 25, 2024 at 10:56:59PM +0000, Joe Damato wrote:
> > > > > +struct epoll_params {
> > > > > +	u64 busy_poll_usecs;
> > > > > +	u16 busy_poll_budget;
> > > > > +
> > > > > +	/* for future fields */
> > > > > +	u8 data[118];
> > > > > +} EPOLL_PACKED;
> > > > 
> > > > variables that cross the user/kernel boundry need to be __u64, __u16,
> > > > and __u8 here.
> > > 
> > > I'll make that change for the next version, thank you.
> > > 
> > > > And why 118?
> > > 
> > > I chose this arbitrarily. I figured that a 128 byte struct would support 16
> > > u64s in the event that other fields needed to be added in the future. 118
> > > is what was left after the existing fields. There's almost certainly a
> > > better way to do this - or perhaps it is unnecessary as per your other
> > > message.
> > > 
> > > I am not sure if leaving extra space in the struct is a recommended
> > > practice for ioctls or not - I thought I noticed some code that did and
> > > some that didn't in the kernel so I err'd on the side of leaving the space
> > > and probably did it in the worst way possible.
> > 
> > It's not really a good idea unless you know exactly what you are going
> > to do with it.  Why not just have a new ioctl if you need new
> > information in the future?  That's simpler, right?
> 
> Sure, that makes sense to me. I'll remove it in the v4 alongside the other
> changes you've requested.

Fwiw, we do support extensible ioctls since they encode the size. Take a
look at kernel/seccomp.c. It's a clean extensible interface built on top
of the copy_struct_from_user() pattern we added for system calls
(openat(), clone3() etc.):

static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
                                 unsigned long arg)
{
        struct seccomp_filter *filter = file->private_data;
        void __user *buf = (void __user *)arg;

        /* Fixed-size ioctls */
        switch (cmd) {
        case SECCOMP_IOCTL_NOTIF_RECV:
                return seccomp_notify_recv(filter, buf);
        case SECCOMP_IOCTL_NOTIF_SEND:
                return seccomp_notify_send(filter, buf);
        case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
        case SECCOMP_IOCTL_NOTIF_ID_VALID:
                return seccomp_notify_id_valid(filter, buf);
        case SECCOMP_IOCTL_NOTIF_SET_FLAGS:
                return seccomp_notify_set_flags(filter, arg);
        }

        /* Extensible Argument ioctls */
#define EA_IOCTL(cmd)   ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
        switch (EA_IOCTL(cmd)) {
        case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
                return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd));
        default:
                return -EINVAL;
        }
}

static long seccomp_notify_addfd(struct seccomp_filter *filter,
                                 struct seccomp_notif_addfd __user *uaddfd,
                                 unsigned int size)
{
        struct seccomp_notif_addfd addfd;
        struct seccomp_knotif *knotif;
        struct seccomp_kaddfd kaddfd;
        int ret;

        BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
        BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);

        if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE)
                return -EINVAL;

        ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
        if (ret)
                return ret;
  
Joe Damato Jan. 26, 2024, 4:52 p.m. UTC | #11
On Fri, Jan 26, 2024 at 11:07:36AM +0100, Christian Brauner wrote:
> On Thu, Jan 25, 2024 at 06:36:30PM -0800, Joe Damato wrote:
> > On Thu, Jan 25, 2024 at 04:23:58PM -0800, Greg Kroah-Hartman wrote:
> > > On Thu, Jan 25, 2024 at 04:11:28PM -0800, Joe Damato wrote:
> > > > On Thu, Jan 25, 2024 at 03:21:46PM -0800, Greg Kroah-Hartman wrote:
> > > > > On Thu, Jan 25, 2024 at 10:56:59PM +0000, Joe Damato wrote:
> > > > > > +struct epoll_params {
> > > > > > +	u64 busy_poll_usecs;
> > > > > > +	u16 busy_poll_budget;
> > > > > > +
> > > > > > +	/* for future fields */
> > > > > > +	u8 data[118];
> > > > > > +} EPOLL_PACKED;
> > > > > 
> > > > > variables that cross the user/kernel boundry need to be __u64, __u16,
> > > > > and __u8 here.
> > > > 
> > > > I'll make that change for the next version, thank you.
> > > > 
> > > > > And why 118?
> > > > 
> > > > I chose this arbitrarily. I figured that a 128 byte struct would support 16
> > > > u64s in the event that other fields needed to be added in the future. 118
> > > > is what was left after the existing fields. There's almost certainly a
> > > > better way to do this - or perhaps it is unnecessary as per your other
> > > > message.
> > > > 
> > > > I am not sure if leaving extra space in the struct is a recommended
> > > > practice for ioctls or not - I thought I noticed some code that did and
> > > > some that didn't in the kernel so I err'd on the side of leaving the space
> > > > and probably did it in the worst way possible.
> > > 
> > > It's not really a good idea unless you know exactly what you are going
> > > to do with it.  Why not just have a new ioctl if you need new
> > > information in the future?  That's simpler, right?
> > 
> > Sure, that makes sense to me. I'll remove it in the v4 alongside the other
> > changes you've requested.
> 
> Fwiw, we do support extensible ioctls since they encode the size. Take a
> look at kernel/seccomp.c. It's a clean extensible interface built on top
> of the copy_struct_from_user() pattern we added for system calls
> (openat(), clone3() etc.):
> 
> static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
>                                  unsigned long arg)
> {
>         struct seccomp_filter *filter = file->private_data;
>         void __user *buf = (void __user *)arg;
> 
>         /* Fixed-size ioctls */
>         switch (cmd) {
>         case SECCOMP_IOCTL_NOTIF_RECV:
>                 return seccomp_notify_recv(filter, buf);
>         case SECCOMP_IOCTL_NOTIF_SEND:
>                 return seccomp_notify_send(filter, buf);
>         case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
>         case SECCOMP_IOCTL_NOTIF_ID_VALID:
>                 return seccomp_notify_id_valid(filter, buf);
>         case SECCOMP_IOCTL_NOTIF_SET_FLAGS:
>                 return seccomp_notify_set_flags(filter, arg);
>         }
> 
>         /* Extensible Argument ioctls */
> #define EA_IOCTL(cmd)   ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
>         switch (EA_IOCTL(cmd)) {
>         case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
>                 return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd));
>         default:
>                 return -EINVAL;
>         }
> }
> 
> static long seccomp_notify_addfd(struct seccomp_filter *filter,
>                                  struct seccomp_notif_addfd __user *uaddfd,
>                                  unsigned int size)
> {
>         struct seccomp_notif_addfd addfd;
>         struct seccomp_knotif *knotif;
>         struct seccomp_kaddfd kaddfd;
>         int ret;
> 
>         BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
>         BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);
> 
>         if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE)
>                 return -EINVAL;
> 
>         ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
>         if (ret)
>                 return ret;

Thanks; that's a really helpful note and example.

I'm inclined to believe that new fields probably won't be needed for a
while, but if they are: an extensible ioctl could be added in the future
to deal with that problem at that point.

Thanks,
Joe
  
David Laight Jan. 27, 2024, 2:51 p.m. UTC | #12
From: Arnd Bergmann
> Sent: 26 January 2024 06:16
> 
> On Fri, Jan 26, 2024, at 03:36, Joe Damato wrote:
> > On Thu, Jan 25, 2024 at 04:23:58PM -0800, Greg Kroah-Hartman wrote:
> >> On Thu, Jan 25, 2024 at 04:11:28PM -0800, Joe Damato wrote:
> >> > On Thu, Jan 25, 2024 at 03:21:46PM -0800, Greg Kroah-Hartman wrote:
> >> > > On Thu, Jan 25, 2024 at 10:56:59PM +0000, Joe Damato wrote:
> >> > > > +struct epoll_params {
> >> > > > +	u64 busy_poll_usecs;
> >> > > > +	u16 busy_poll_budget;
> >> > > > +
> >> > > > +	/* for future fields */
> >> > > > +	u8 data[118];
> >> > > > +} EPOLL_PACKED;
> >> > >
> >
> > Sure, that makes sense to me. I'll remove it in the v4 alongside the other
> > changes you've requested.
> >
> > Thanks for your time and patience reviewing my code. I greatly appreciate
> > your helpful comments and feedback.
> 
> Note that you should still pad the structure to its normal
> alignment. On non-x86 targets this would currently mean a
> multiple of 64 bits.
> 
> I would suggest dropping the EPOLL_PACKED here entirely and
> just using a fully aligned structure on all architectures, like
> 
> struct epoll_params {
>       __aligned_u64 busy_poll_usecs;
>       __u16 busy_poll_budget;
>       __u8 __pad[6];
> };
> 
> The explicit padding can help avoid leaking stack data when
> a structure is copied back from kernel to userspace, so I would
> just always use it in ioctl data structures.

Or just use 32bit types for both fields.
Both values need erroring before they get that large.
32bit of usec is about 20 seconds.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
  
kernel test robot Jan. 28, 2024, 11:24 a.m. UTC | #13
Hi Joe,

kernel test robot noticed the following build errors:

[auto build test ERROR on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Joe-Damato/eventpoll-support-busy-poll-per-epoll-instance/20240126-070250
base:   net-next/main
patch link:    https://lore.kernel.org/r/20240125225704.12781-4-jdamato%40fastly.com
patch subject: [PATCH net-next v3 3/3] eventpoll: Add epoll ioctl for epoll_params
config: i386-buildonly-randconfig-002-20240127 (https://download.01.org/0day-ci/archive/20240128/202401281917.WeFbZE56-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240128/202401281917.WeFbZE56-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202401281917.WeFbZE56-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from <command-line>:
>> ./usr/include/linux/eventpoll.h:89:9: error: unknown type name 'u64'
      89 |         u64 busy_poll_usecs;
         |         ^~~
>> ./usr/include/linux/eventpoll.h:90:9: error: unknown type name 'u16'
      90 |         u16 busy_poll_budget;
         |         ^~~
>> ./usr/include/linux/eventpoll.h:93:9: error: unknown type name 'u8'
      93 |         u8 data[118];
         |         ^~
  

Patch

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 457e16f06e04..b33918232f78 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -309,6 +309,7 @@  Code  Seq#    Include File                                           Comments
 0x89  0B-DF  linux/sockios.h
 0x89  E0-EF  linux/sockios.h                                         SIOCPROTOPRIVATE range
 0x89  F0-FF  linux/sockios.h                                         SIOCDEVPRIVATE range
+0x8A  00-1F  linux/eventpoll.h
 0x8B  all    linux/wireless.h
 0x8C  00-3F                                                          WiNRADiO driver
                                                                      <http://www.winradio.com.au/>
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 40bd97477b91..73ae886efb8a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -6,6 +6,8 @@ 
  *  Davide Libenzi <davidel@xmailserver.org>
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/sched/signal.h>
@@ -37,6 +39,7 @@ 
 #include <linux/seq_file.h>
 #include <linux/compat.h>
 #include <linux/rculist.h>
+#include <linux/capability.h>
 #include <net/busy_poll.h>
 
 /*
@@ -495,6 +498,39 @@  static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 	ep->napi_id = napi_id;
 }
 
+static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
+				  unsigned long arg)
+{
+	struct eventpoll *ep;
+	struct epoll_params epoll_params;
+	void __user *uarg = (void __user *) arg;
+
+	ep = file->private_data;
+
+	switch (cmd) {
+	case EPIOCSPARAMS:
+		if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params)))
+			return -EFAULT;
+
+		if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT &&
+		    !capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		ep->busy_poll_usecs = epoll_params.busy_poll_usecs;
+		ep->busy_poll_budget = epoll_params.busy_poll_budget;
+		return 0;
+	case EPIOCGPARAMS:
+		memset(&epoll_params, 0, sizeof(epoll_params));
+		epoll_params.busy_poll_usecs = ep->busy_poll_usecs;
+		epoll_params.busy_poll_budget = ep->busy_poll_budget;
+		if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params)))
+			return -EFAULT;
+		return 0;
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
 #else
 
 static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
@@ -510,6 +546,12 @@  static inline bool ep_busy_loop_on(struct eventpoll *ep)
 {
 	return false;
 }
+
+static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
+				  unsigned long arg)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
 /*
@@ -869,6 +911,26 @@  static void ep_clear_and_put(struct eventpoll *ep)
 		ep_free(ep);
 }
 
+static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int ret;
+
+	if (!is_file_epoll(file))
+		return -EINVAL;
+
+	switch (cmd) {
+	case EPIOCSPARAMS:
+	case EPIOCGPARAMS:
+		ret = ep_eventpoll_bp_ioctl(file, cmd, arg);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
 static int ep_eventpoll_release(struct inode *inode, struct file *file)
 {
 	struct eventpoll *ep = file->private_data;
@@ -975,6 +1037,8 @@  static const struct file_operations eventpoll_fops = {
 	.release	= ep_eventpoll_release,
 	.poll		= ep_eventpoll_poll,
 	.llseek		= noop_llseek,
+	.unlocked_ioctl	= ep_eventpoll_ioctl,
+	.compat_ioctl   = compat_ptr_ioctl,
 };
 
 /*
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index cfbcc4cc49ac..8eb0fdbce995 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -85,4 +85,16 @@  struct epoll_event {
 	__u64 data;
 } EPOLL_PACKED;
 
+struct epoll_params {
+	u64 busy_poll_usecs;
+	u16 busy_poll_budget;
+
+	/* for future fields */
+	u8 data[118];
+} EPOLL_PACKED;
+
+#define EPOLL_IOC_TYPE 0x8A
+#define EPIOCSPARAMS _IOW(EPOLL_IOC_TYPE, 0x01, struct epoll_params)
+#define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params)
+
 #endif /* _UAPI_LINUX_EVENTPOLL_H */