gpiolib: cdev: fix NULL-pointer dereferences

Message ID 20221125153257.528826-1-brgl@bgdev.pl
State New
Headers
Series gpiolib: cdev: fix NULL-pointer dereferences |

Commit Message

Bartosz Golaszewski Nov. 25, 2022, 3:32 p.m. UTC
  From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>

There are several places where we can crash the kernel by requesting
lines, unbinding the GPIO device, then calling any of the system calls
relevant to the GPIO character device's annonymous file descriptors:
ioctl(), read(), poll().

While I observed it with the GPIO simulator, it will also happen for any
of the GPIO devices that can be hot-unplugged - for instance any HID GPIO
expander (e.g. CP2112).

This affects both v1 and v2 uAPI.

Fix this by simply checking if the GPIO chip pointer is not NULL.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpiolib-cdev.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
  

Comments

Kent Gibson Nov. 25, 2022, 4:24 p.m. UTC | #1
On Fri, Nov 25, 2022 at 04:32:57PM +0100, Bartosz Golaszewski wrote:
> From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
> 
> There are several places where we can crash the kernel by requesting
> lines, unbinding the GPIO device, then calling any of the system calls
> relevant to the GPIO character device's annonymous file descriptors:
> ioctl(), read(), poll().
> 
> While I observed it with the GPIO simulator, it will also happen for any
> of the GPIO devices that can be hot-unplugged - for instance any HID GPIO
> expander (e.g. CP2112).
> 
> This affects both v1 and v2 uAPI.
> 
> Fix this by simply checking if the GPIO chip pointer is not NULL.
> 

Fixes: ??

And split for v1 and v2 as the Fixes for those will differ?

> Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
> ---
>  drivers/gpio/gpiolib-cdev.c | 15 +++++++++++++++
>  1 file changed, 15 insertions(+)
> 
> diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c
> index 0cb6b468f364..d5632742942a 100644
> --- a/drivers/gpio/gpiolib-cdev.c
> +++ b/drivers/gpio/gpiolib-cdev.c
> @@ -201,6 +201,9 @@ static long linehandle_ioctl(struct file *file, unsigned int cmd,
>  	unsigned int i;
>  	int ret;
>  
> +	if (!lh->gdev->chip)
> +		return -ENODEV;
> +

Is there anything to prevent the chip being removed by another thread
between this check and subsequent usage?

Cheers,
Kent.

>  	switch (cmd) {
>  	case GPIOHANDLE_GET_LINE_VALUES_IOCTL:
>  		/* NOTE: It's okay to read values of output lines */
> @@ -1384,6 +1387,9 @@ static long linereq_ioctl(struct file *file, unsigned int cmd,
>  	struct linereq *lr = file->private_data;
>  	void __user *ip = (void __user *)arg;
>  
> +	if (!lr->gdev->chip)
> +		return -ENODEV;
> +
>  	switch (cmd) {
>  	case GPIO_V2_LINE_GET_VALUES_IOCTL:
>  		return linereq_get_values(lr, ip);
> @@ -1716,6 +1722,9 @@ static __poll_t lineevent_poll(struct file *file,
>  	struct lineevent_state *le = file->private_data;
>  	__poll_t events = 0;
>  
> +	if (!le->gdev->chip)
> +		return -ENODEV;
> +
>  	poll_wait(file, &le->wait, wait);
>  
>  	if (!kfifo_is_empty_spinlocked_noirqsave(&le->events, &le->wait.lock))
> @@ -1740,6 +1749,9 @@ static ssize_t lineevent_read(struct file *file,
>  	ssize_t ge_size;
>  	int ret;
>  
> +	if (!le->gdev->chip)
> +		return -ENODEV;
> +
>  	/*
>  	 * When compatible system call is being used the struct gpioevent_data,
>  	 * in case of at least ia32, has different size due to the alignment
> @@ -1821,6 +1833,9 @@ static long lineevent_ioctl(struct file *file, unsigned int cmd,
>  	void __user *ip = (void __user *)arg;
>  	struct gpiohandle_data ghd;
>  
> +	if (!le->gdev->chip)
> +		return -ENODEV;
> +
>  	/*
>  	 * We can get the value for an event line but not set it,
>  	 * because it is input by definition.
> -- 
> 2.37.2
>
  
Bartosz Golaszewski Nov. 25, 2022, 4:48 p.m. UTC | #2
On Fri, Nov 25, 2022 at 5:24 PM Kent Gibson <warthog618@gmail.com> wrote:
>
> On Fri, Nov 25, 2022 at 04:32:57PM +0100, Bartosz Golaszewski wrote:
> > From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
> >
> > There are several places where we can crash the kernel by requesting
> > lines, unbinding the GPIO device, then calling any of the system calls
> > relevant to the GPIO character device's annonymous file descriptors:
> > ioctl(), read(), poll().
> >
> > While I observed it with the GPIO simulator, it will also happen for any
> > of the GPIO devices that can be hot-unplugged - for instance any HID GPIO
> > expander (e.g. CP2112).
> >
> > This affects both v1 and v2 uAPI.
> >
> > Fix this by simply checking if the GPIO chip pointer is not NULL.
> >
>
> Fixes: ??
>
> And split for v1 and v2 as the Fixes for those will differ?
>
> > Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
> > ---
> >  drivers/gpio/gpiolib-cdev.c | 15 +++++++++++++++
> >  1 file changed, 15 insertions(+)
> >
> > diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c
> > index 0cb6b468f364..d5632742942a 100644
> > --- a/drivers/gpio/gpiolib-cdev.c
> > +++ b/drivers/gpio/gpiolib-cdev.c
> > @@ -201,6 +201,9 @@ static long linehandle_ioctl(struct file *file, unsigned int cmd,
> >       unsigned int i;
> >       int ret;
> >
> > +     if (!lh->gdev->chip)
> > +             return -ENODEV;
> > +
>
> Is there anything to prevent the chip being removed by another thread
> between this check and subsequent usage?
>

Eh... not really, no. The issue we have here seems to be the same as
the one Laurent Pinchart described back in 2015[1] and revisited
during his 2022 linux plumbers presentation[2], except he blamed it on
devres, whereas I think the problem is much deeper and devres has
nothing to do with it.

Ideally we'd need a global fortifying of the driver model against
hot-unplug related device unbinding.

After a quick glance at the relevant code, I think we'd need to add
some flag to struct file for the vfs to check on any fs operation and
return an error early if user-space tries to operate on an fd
associated with a removed device. I'm not sure yet if that's feasible
globally or even the right solution at all - just brainstorming here.

Then at the subsystem level, the GPIO device struct would need a lock
that would be taken by every user-space operation AND the code
unregistering the device so that we don't do what you described (i.e.
if there's a thread doing a read(), then let's wait until it returns
before we drop the device).

This wouldn't fix the case in which the same situation happened in a
kernel driver but crashing the kernel from within is a much lesser
offense than allowing user-space to crash it.

So this patch is just papering over for now I suppose.

Bart

[1] https://lkml.org/lkml/2015/7/14/741
[2] https://www.youtube.com/watch?v=kW8LHWlJPTU
  
Andy Shevchenko Nov. 25, 2022, 5:56 p.m. UTC | #3
On Fri, Nov 25, 2022 at 05:48:02PM +0100, Bartosz Golaszewski wrote:
> On Fri, Nov 25, 2022 at 5:24 PM Kent Gibson <warthog618@gmail.com> wrote:

...

> Then at the subsystem level, the GPIO device struct would need a lock
> that would be taken by every user-space operation AND the code
> unregistering the device so that we don't do what you described (i.e.
> if there's a thread doing a read(), then let's wait until it returns
> before we drop the device).

It's called a reference counting, basically you need to get device and then
put when it makes sense.

> This wouldn't fix the case in which the same situation happened in a
> kernel driver but crashing the kernel from within is a much lesser
> offense than allowing user-space to crash it.
  
Andy Shevchenko Nov. 25, 2022, 6:02 p.m. UTC | #4
On Fri, Nov 25, 2022 at 07:56:10PM +0200, Andy Shevchenko wrote:
> On Fri, Nov 25, 2022 at 05:48:02PM +0100, Bartosz Golaszewski wrote:

...

An off topic here, can we have this [1] being applied meanwhile?

[1]: https://lore.kernel.org/linux-gpio/d1e86166-b2b0-a235-fe9e-be9ee4d93290@huawei.com/
  
Bartosz Golaszewski Nov. 25, 2022, 9:03 p.m. UTC | #5
On Fri, Nov 25, 2022 at 6:56 PM Andy Shevchenko
<andriy.shevchenko@linux.intel.com> wrote:
>
> On Fri, Nov 25, 2022 at 05:48:02PM +0100, Bartosz Golaszewski wrote:
> > On Fri, Nov 25, 2022 at 5:24 PM Kent Gibson <warthog618@gmail.com> wrote:
>
> ...
>
> > Then at the subsystem level, the GPIO device struct would need a lock
> > that would be taken by every user-space operation AND the code
> > unregistering the device so that we don't do what you described (i.e.
> > if there's a thread doing a read(), then let's wait until it returns
> > before we drop the device).
>
> It's called a reference counting, basically you need to get device and then
> put when it makes sense.
>

Andy: I am aware of struct device reference counting but this isn't
it. You can count references all you want, but when I disconnect my
CP2112, the USB bus calls gpiochip_remove(), struct gpio_chip * inside
struct gpio_device is set to NULL and while the underlying struct
device itself is still alive, the GPIO chip is no longer usable.

Reference counting won't help because the device is no longer there,
so this behavior is correct but there's an issue with user-space still
being able to hold certain resources and we need to make sure that
when it tries to use them, we return an error instead of crashing.

I think that a good solution is to make sure, we cannot set gdev->gc
to NULL as long as there are user-space operations in progress. After
all, it's better to try to send a USB request to an unplugged device
than to dereference a NULL pointer. To that end, we could have a
user-space lock that would also be taken by gpiochip_remove().

But this is still a per-subsystem solution. Most other subsystems
suffer from the same issue.

Bartosz
  
Andy Shevchenko Nov. 25, 2022, 9:33 p.m. UTC | #6
On Fri, Nov 25, 2022 at 10:03:06PM +0100, Bartosz Golaszewski wrote:
> On Fri, Nov 25, 2022 at 6:56 PM Andy Shevchenko
> <andriy.shevchenko@linux.intel.com> wrote:
> > On Fri, Nov 25, 2022 at 05:48:02PM +0100, Bartosz Golaszewski wrote:
> > > On Fri, Nov 25, 2022 at 5:24 PM Kent Gibson <warthog618@gmail.com> wrote:

...

> > > Then at the subsystem level, the GPIO device struct would need a lock
> > > that would be taken by every user-space operation AND the code
> > > unregistering the device so that we don't do what you described (i.e.
> > > if there's a thread doing a read(), then let's wait until it returns
> > > before we drop the device).
> >
> > It's called a reference counting, basically you need to get device and then
> > put when it makes sense.
> 
> Andy: I am aware of struct device reference counting but this isn't
> it. You can count references all you want, but when I disconnect my
> CP2112, the USB bus calls gpiochip_remove(), struct gpio_chip * inside
> struct gpio_device is set to NULL and while the underlying struct
> device itself is still alive, the GPIO chip is no longer usable.
> 
> Reference counting won't help because the device is no longer there,
> so this behavior is correct but there's an issue with user-space still
> being able to hold certain resources and we need to make sure that
> when it tries to use them, we return an error instead of crashing.

Thank you for the detailed explanation of the case.

> I think that a good solution is to make sure, we cannot set gdev->gc
> to NULL as long as there are user-space operations in progress. After
> all, it's better to try to send a USB request to an unplugged device
> than to dereference a NULL pointer. To that end, we could have a
> user-space lock that would also be taken by gpiochip_remove().
> 
> But this is still a per-subsystem solution. Most other subsystems
> suffer from the same issue.

Yeah, many subsystems are not ready for hotplug...
  
Kent Gibson Nov. 26, 2022, 12:39 a.m. UTC | #7
On Fri, Nov 25, 2022 at 10:03:06PM +0100, Bartosz Golaszewski wrote:
> On Fri, Nov 25, 2022 at 6:56 PM Andy Shevchenko
> <andriy.shevchenko@linux.intel.com> wrote:
> >
> > On Fri, Nov 25, 2022 at 05:48:02PM +0100, Bartosz Golaszewski wrote:
> > > On Fri, Nov 25, 2022 at 5:24 PM Kent Gibson <warthog618@gmail.com> wrote:
> >
> > ...
> >
> > > Then at the subsystem level, the GPIO device struct would need a lock
> > > that would be taken by every user-space operation AND the code
> > > unregistering the device so that we don't do what you described (i.e.
> > > if there's a thread doing a read(), then let's wait until it returns
> > > before we drop the device).
> >
> > It's called a reference counting, basically you need to get device and then
> > put when it makes sense.
> >
> 
> Andy: I am aware of struct device reference counting but this isn't
> it. You can count references all you want, but when I disconnect my
> CP2112, the USB bus calls gpiochip_remove(), struct gpio_chip * inside
> struct gpio_device is set to NULL and while the underlying struct
> device itself is still alive, the GPIO chip is no longer usable.
> 
> Reference counting won't help because the device is no longer there,
> so this behavior is correct but there's an issue with user-space still
> being able to hold certain resources and we need to make sure that
> when it tries to use them, we return an error instead of crashing.
> 
> I think that a good solution is to make sure, we cannot set gdev->gc
> to NULL as long as there are user-space operations in progress. After
> all, it's better to try to send a USB request to an unplugged device
> than to dereference a NULL pointer. To that end, we could have a
> user-space lock that would also be taken by gpiochip_remove().
> 

This is basically the answer I was hoping for - that there is some
barrier in place to prevent chip removal while an ioctl is active.
Then the check makes total sense - it is ensuring that the chip wasn't
removed before the ioctl began and the barrier went up.

On the other end, the caller of gpiochip_remove() needs to be prepared
to gracefully fail calls on the chip until gpiochip_remove() returns.
You would hope that is already the case...

> But this is still a per-subsystem solution. Most other subsystems
> suffer from the same issue.
> 

Does that prevent us addressing the problem in gpio until a more general
solution comes along?

Anyway, I'm basically ok with your patch as a first step, as it greatly
reduces the chances of triggering the fault, but it is only a band-aid
over a larger issue and a more complete solution would be preferable.
Without that, highlight in the checkin comment that it is not a complete
fix.

Cheers,
Kent.
  
kernel test robot Nov. 26, 2022, 4:02 a.m. UTC | #8
Hi Bartosz,

I love your patch! Perhaps something to improve:

[auto build test WARNING on brgl/gpio/for-next]
[also build test WARNING on linus/master v6.1-rc6 next-20221125]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Bartosz-Golaszewski/gpiolib-cdev-fix-NULL-pointer-dereferences/20221125-233356
base:   https://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git gpio/for-next
patch link:    https://lore.kernel.org/r/20221125153257.528826-1-brgl%40bgdev.pl
patch subject: [PATCH] gpiolib: cdev: fix NULL-pointer dereferences
config: powerpc-randconfig-s051-20221124
compiler: powerpc-linux-gcc (GCC) 12.1.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # apt-get install sparse
        # sparse version: v0.6.4-39-gce1a6720-dirty
        # https://github.com/intel-lab-lkp/linux/commit/79d8e683d5df83bf0b25c6a1bf6755ba20266099
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Bartosz-Golaszewski/gpiolib-cdev-fix-NULL-pointer-dereferences/20221125-233356
        git checkout 79d8e683d5df83bf0b25c6a1bf6755ba20266099
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=powerpc SHELL=/bin/bash drivers/gpio/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

sparse warnings: (new ones prefixed by >>)
>> drivers/gpio/gpiolib-cdev.c:1726:24: sparse: sparse: incorrect type in return expression (different base types) @@     expected restricted __poll_t @@     got int @@
   drivers/gpio/gpiolib-cdev.c:1726:24: sparse:     expected restricted __poll_t
   drivers/gpio/gpiolib-cdev.c:1726:24: sparse:     got int

vim +1726 drivers/gpio/gpiolib-cdev.c

  1714	
  1715	#define GPIOEVENT_REQUEST_VALID_FLAGS \
  1716		(GPIOEVENT_REQUEST_RISING_EDGE | \
  1717		GPIOEVENT_REQUEST_FALLING_EDGE)
  1718	
  1719	static __poll_t lineevent_poll(struct file *file,
  1720				       struct poll_table_struct *wait)
  1721	{
  1722		struct lineevent_state *le = file->private_data;
  1723		__poll_t events = 0;
  1724	
  1725		if (!le->gdev->chip)
> 1726			return -ENODEV;
  1727	
  1728		poll_wait(file, &le->wait, wait);
  1729	
  1730		if (!kfifo_is_empty_spinlocked_noirqsave(&le->events, &le->wait.lock))
  1731			events = EPOLLIN | EPOLLRDNORM;
  1732	
  1733		return events;
  1734	}
  1735
  

Patch

diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c
index 0cb6b468f364..d5632742942a 100644
--- a/drivers/gpio/gpiolib-cdev.c
+++ b/drivers/gpio/gpiolib-cdev.c
@@ -201,6 +201,9 @@  static long linehandle_ioctl(struct file *file, unsigned int cmd,
 	unsigned int i;
 	int ret;
 
+	if (!lh->gdev->chip)
+		return -ENODEV;
+
 	switch (cmd) {
 	case GPIOHANDLE_GET_LINE_VALUES_IOCTL:
 		/* NOTE: It's okay to read values of output lines */
@@ -1384,6 +1387,9 @@  static long linereq_ioctl(struct file *file, unsigned int cmd,
 	struct linereq *lr = file->private_data;
 	void __user *ip = (void __user *)arg;
 
+	if (!lr->gdev->chip)
+		return -ENODEV;
+
 	switch (cmd) {
 	case GPIO_V2_LINE_GET_VALUES_IOCTL:
 		return linereq_get_values(lr, ip);
@@ -1716,6 +1722,9 @@  static __poll_t lineevent_poll(struct file *file,
 	struct lineevent_state *le = file->private_data;
 	__poll_t events = 0;
 
+	if (!le->gdev->chip)
+		return -ENODEV;
+
 	poll_wait(file, &le->wait, wait);
 
 	if (!kfifo_is_empty_spinlocked_noirqsave(&le->events, &le->wait.lock))
@@ -1740,6 +1749,9 @@  static ssize_t lineevent_read(struct file *file,
 	ssize_t ge_size;
 	int ret;
 
+	if (!le->gdev->chip)
+		return -ENODEV;
+
 	/*
 	 * When compatible system call is being used the struct gpioevent_data,
 	 * in case of at least ia32, has different size due to the alignment
@@ -1821,6 +1833,9 @@  static long lineevent_ioctl(struct file *file, unsigned int cmd,
 	void __user *ip = (void __user *)arg;
 	struct gpiohandle_data ghd;
 
+	if (!le->gdev->chip)
+		return -ENODEV;
+
 	/*
 	 * We can get the value for an event line but not set it,
 	 * because it is input by definition.