[5/6] driver core: Add __alloc_size hint to devm allocators

Message ID 20221101223321.1326815-5-keescook@chromium.org
State New
Headers
Series slab: Provide full coverage for __alloc_size attribute

Commit Message

Kees Cook Nov. 1, 2022, 10:33 p.m. UTC
  Mark the devm_*alloc()-family of allocations with appropriate
__alloc_size()/__realloc_size() hints so the compiler can attempt to
reason about buffer lengths from allocations.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Nishanth Menon <nm@ti.com>
Cc: Michael Kelley <mikelley@microsoft.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Won Chung <wonchung@google.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20221029074734.gonna.276-kees@kernel.org
---
This is already in -next, but I'm including it here again to avoid any
confusion about this series landing (or being tested) via another tree.
---
 include/linux/device.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
  

Comments

Yongqin Liu Feb. 1, 2023, 7:36 a.m. UTC | #1
Hi, Kees

This change causes "Kernel panic - not syncing: BRK handler: Fatal exception"
for the android-mainline based hikey960 build. With this commit reverted,
the build boots to the homescreen without any problem.
Do you have any idea what is going wrong, or any suggestions?

Here is part of the kernel panic log:

    [    9.479878][  T122] ueventd: Loading module
/vendor/lib/modules/spi-pl022.ko with args ''
    [    9.480276][  T115] apexd-bootstrap: Pre-allocated loop device 29
    [    9.480517][  T123] ueventd: LoadWithAliases was unable to load
of:Nhi3660_i2sT(null)Chisilicon,hi3660-i2s-1.0
    [    9.480632][  T121] Unexpected kernel BRK exception at EL1
    [    9.480637][  T121] Internal error: BRK handler:
00000000f2000001 [#1] PREEMPT SMP
    [    9.480644][  T121] Modules linked in: cpufreq_dt(E+)
hisi_thermal(E+) phy_hi3660_usb3(E) btqca(E) hi6421_pmic_core(E)
btbcm(E) spi_pl022(E) hi3660_mailbox(E) i2c_designware_platform(E)
mali_kbase(OE) dw_mmc_k3(E) bluetooth(E) dw_mmc_pltfm(E) dw_mmc(E)
kirin_drm(E) rfkill(E) kirin_dsi(E) i2c_designware_core(E) k3dma(E)
drm_dma_helper(E) cma_heap(E) system_heap(E)
    [    9.480688][  T121] CPU: 4 PID: 121 Comm: ueventd Tainted: G
       OE      6.2.0-rc6-mainline-14196-g1d9f94ec75b9 #1
    [    9.480694][  T121] Hardware name: HiKey960 (DT)
    [    9.480697][  T121] pstate: 20400005 (nzCv daif +PAN -UAO -TCO
-DIT -SSBS BTYPE=--)
    [    9.480703][  T121] pc : hi3660_thermal_probe+0x6c/0x74 [hisi_thermal]
    [    9.480722][  T121] lr : hi3660_thermal_probe+0x38/0x74 [hisi_thermal]
    [    9.480733][  T121] sp : ffffffc00aa13700
    [    9.480735][  T121] x29: ffffffc00aa13700 x28: 0000007ff8ae8531
x27: 00000000000008c0
    [    9.480743][  T121] x26: ffffffc00aa2a300 x25: ffffffc00aa2ab40
x24: 000000000000001d
    [    9.480749][  T121] x23: ffffffc00a29d000 x22: 0000000000000000
x21: ffffff8001fa4a80
    [    9.480755][  T121] x20: 0000000000000001 x19: ffffff8001fa4a80
x18: ffffffc00a8810b0
    [    9.480761][  T121] x17: 000000007ab542f2 x16: 000000007ab542f2
x15: ffffffc00aa01000
    [    9.480767][  T121] x14: ffffffc00966f250 x13: ffffffc0b58f9000
x12: ffffffc00a055f10
    [    9.480771][  T123] ueventd: LoadWithAliases was unable to load
cpu:type:aarch64:feature:,0000,0001,0002,0003,0004,0005,0006,0007,000B
    [    9.480773][  T121]
    [    9.480774][  T121] x11: 0000000000000000 x10: 0000000000000001
x9 : 0000000100000000
    [    9.480780][  T123] ueventd:
    [    9.480780][  T121] x8 : ffffffc0044154cb x7 : 0000000000000000
x6 : 000000000000003f
    [    9.480786][  T121] x5 : 0000000000000020 x4 : ffffffc0098db323
x3 : ffffff801aeb62c0
    [    9.480792][  T121] x2 : ffffff801aeb62c0 x1 : 0000000000000000
x0 : ffffff8001fa4c80
    [    9.480798][  T121] Call trace:
    [    9.480801][  T121]  hi3660_thermal_probe+0x6c/0x74 [hisi_thermal]
    [    9.480813][  T121]  hisi_thermal_probe+0xbc/0x284 [hisi_thermal]
    [    9.480823][  T121]  platform_probe+0xcc/0xf8
    [    9.480836][  T121]  really_probe+0x19c/0x390
    [    9.480842][  T121]  __driver_probe_device+0xc0/0xf0
    [    9.480848][  T121]  driver_probe_device+0x4c/0x228
    [    9.480853][  T121]  __driver_attach+0x110/0x1e0
    [    9.480858][  T121]  bus_for_each_dev+0xa0/0xf4
    [    9.480864][  T121]  driver_attach+0x2c/0x40
    [    9.480868][  T121]  bus_add_driver+0x118/0x208
    [    9.480873][  T121]  driver_register+0x80/0x124
    [    9.480878][  T121]  __platform_driver_register+0x2c/0x40
    [    9.480884][  T121]  init_module+0x28/0xfe4 [hisi_thermal]
    [    9.480895][  T121]  do_one_initcall+0xe4/0x334
    [    9.480902][  T121]  do_init_module+0x50/0x1f0
    [    9.480909][  T121]  load_module+0x1034/0x1204
    [    9.480914][  T121]  __arm64_sys_finit_module+0xc8/0x11c
    [    9.480919][  T121]  invoke_syscall+0x60/0x130
    [    9.480926][  T121]  el0_svc_common+0xbc/0x100
    [    9.480931][  T121]  do_el0_svc+0x38/0xc4
    [    9.480937][  T121]  el0_svc+0x34/0xc4
    [    9.480945][  T121]  el0t_64_sync_handler+0x8c/0xfc
    [    9.480950][  T121]  el0t_64_sync+0x1a4/0x1a8
    [    9.480957][  T121] Code: 91132d08 b9001814 f9000013 f9000808 (d4200020)
    [    9.480960][  T121] ---[ end trace 0000000000000000 ]---
    [    9.482201][   T72] dwmmc_k3 ff37f000.dwmmc1: IDMAC supports
64-bit address mode.
    [    9.482225][   T72] dwmmc_k3 ff37f000.dwmmc1: Using internal
DMA controller.
    [    9.482232][   T72] dwmmc_k3 ff37f000.dwmmc1: Version ID is 270a
    [    9.482261][   T72] dwmmc_k3 ff37f000.dwmmc1: DW MMC controller
at irq 72,32 bit host data width,128 deep fifo
    [    9.482406][  T117] cpu cpu0: EM: created perf domain
    [    9.482677][  T118] ueventd: Loaded kernel module
/vendor/lib/modules/btqca.ko
    [    9.482745][  T118] ueventd: Loading module
/vendor/lib/modules/hci_uart.ko with args ''
    [    9.483117][  T117] cpu cpu4: EM: created perf domain
    [    9.483767][  T117] ueventd: Loaded kernel module
/vendor/lib/modules/cpufreq-dt.ko
    [    9.484265][   T72] dwmmc_k3 ff37f000.dwmmc1: fifo-depth
property not found, using value of FIFOTH register as default
    [    9.484326][  T117] ueventd: LoadWithAliases was unable to load
cpu:type:aarch64:feature:,0000,0001,0002,0003,0004,0005,0006,0007,000B
    [    9.484335][  T117] ueventd:
    [    9.486508][   T72] dwmmc_k3 ff37f000.dwmmc1: IDMAC supports
64-bit address mode.
    [    9.486564][   T72] dwmmc_k3 ff37f000.dwmmc1: Using internal
DMA controller.
    [    9.486572][   T72] dwmmc_k3 ff37f000.dwmmc1: Version ID is 270a
    [    9.486620][   T72] dwmmc_k3 ff37f000.dwmmc1: DW MMC controller
at irq 72,32 bit host data width,64 deep fifo
    [    9.488281][  T121] Kernel panic - not syncing: BRK handler:
Fatal exception

For the full serial console log, please check here:
    http://ix.io/4mLg

Thanks,
Yongqin Liu
On Wed, 2 Nov 2022 at 06:34, Kees Cook <keescook@chromium.org> wrote:
>
> Mark the devm_*alloc()-family of allocations with appropriate
> __alloc_size()/__realloc_size() hints so the compiler can attempt to
> reason about buffer lengths from allocations.
>
> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Cc: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Jason Gunthorpe <jgg@ziepe.ca>
> Cc: Nishanth Menon <nm@ti.com>
> Cc: Michael Kelley <mikelley@microsoft.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Won Chung <wonchung@google.com>
> Signed-off-by: Kees Cook <keescook@chromium.org>
> Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Link: https://lore.kernel.org/r/20221029074734.gonna.276-kees@kernel.org
> ---
> This is already in -next, but I'm including it here again to avoid any
> confusion about this series landing (or being tested) via another tree.
> ---
>  include/linux/device.h | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/device.h b/include/linux/device.h
> index 424b55df0272..5e4cd857e74f 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -197,9 +197,9 @@ void devres_remove_group(struct device *dev, void *id);
>  int devres_release_group(struct device *dev, void *id);
>
>  /* managed devm_k.alloc/kfree for device drivers */
> -void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __malloc;
> +void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __alloc_size(2);
>  void *devm_krealloc(struct device *dev, void *ptr, size_t size,
> -                   gfp_t gfp) __must_check;
> +                   gfp_t gfp) __must_check __realloc_size(3);
>  __printf(3, 0) char *devm_kvasprintf(struct device *dev, gfp_t gfp,
>                                      const char *fmt, va_list ap) __malloc;
>  __printf(3, 4) char *devm_kasprintf(struct device *dev, gfp_t gfp,
> @@ -226,7 +226,8 @@ static inline void *devm_kcalloc(struct device *dev,
>  void devm_kfree(struct device *dev, const void *p);
>  char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc;
>  const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp);
> -void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp);
> +void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp)
> +       __realloc_size(3);
>
>  unsigned long devm_get_free_pages(struct device *dev,
>                                   gfp_t gfp_mask, unsigned int order);
> --
> 2.34.1
>
  
John Stultz Feb. 1, 2023, 8:11 a.m. UTC | #2
On Tue, Jan 31, 2023 at 11:36 PM Yongqin Liu <yongqin.liu@linaro.org> wrote:
>
> Hi, Kees
>
> This change causes "Kernel panic - not syncing: BRK handler: Fatal exception"
> for the android-mainline based hikey960 build, with this commit reverted,
> there is no problem for the build to boot to the homescreen.
> Not sure if you have any idea about it and give some suggestions.
>
> Here is part of the kernel panic log:
>
>     [    9.479878][  T122] ueventd: Loading module
> /vendor/lib/modules/spi-pl022.ko with args ''
>     [    9.480276][  T115] apexd-bootstrap: Pre-allocated loop device 29
>     [    9.480517][  T123] ueventd: LoadWithAliases was unable to load
> of:Nhi3660_i2sT(null)Chisilicon,hi3660-i2s-1.0
>     [    9.480632][  T121] Unexpected kernel BRK exception at EL1
>     [    9.480637][  T121] Internal error: BRK handler:
> 00000000f2000001 [#1] PREEMPT SMP
>     [    9.480644][  T121] Modules linked in: cpufreq_dt(E+)
> hisi_thermal(E+) phy_hi3660_usb3(E) btqca(E) hi6421_pmic_core(E)
> btbcm(E) spi_pl022(E) hi3660_mailbox(E) i2c_designware_platform(E)
> mali_kbase(OE) dw_mmc_k3(E) bluetooth(E) dw_mmc_pltfm(E) dw_mmc(E)
> kirin_drm(E) rfkill(E) kirin_dsi(E) i2c_designware_core(E) k3dma(E)
> drm_dma_helper(E) cma_heap(E) system_heap(E)
>     [    9.480688][  T121] CPU: 4 PID: 121 Comm: ueventd Tainted: G
>        OE      6.2.0-rc6-mainline-14196-g1d9f94ec75b9 #1
>     [    9.480694][  T121] Hardware name: HiKey960 (DT)
>     [    9.480697][  T121] pstate: 20400005 (nzCv daif +PAN -UAO -TCO
> -DIT -SSBS BTYPE=--)
>     [    9.480703][  T121] pc : hi3660_thermal_probe+0x6c/0x74 [hisi_thermal]
>     [    9.480722][  T121] lr : hi3660_thermal_probe+0x38/0x74 [hisi_thermal]
>     [    9.480733][  T121] sp : ffffffc00aa13700
>     [    9.480735][  T121] x29: ffffffc00aa13700 x28: 0000007ff8ae8531
> x27: 00000000000008c0
>     [    9.480743][  T121] x26: ffffffc00aa2a300 x25: ffffffc00aa2ab40
> x24: 000000000000001d
>     [    9.480749][  T121] x23: ffffffc00a29d000 x22: 0000000000000000
> x21: ffffff8001fa4a80
>     [    9.480755][  T121] x20: 0000000000000001 x19: ffffff8001fa4a80
> x18: ffffffc00a8810b0
>     [    9.480761][  T121] x17: 000000007ab542f2 x16: 000000007ab542f2
> x15: ffffffc00aa01000
>     [    9.480767][  T121] x14: ffffffc00966f250 x13: ffffffc0b58f9000
> x12: ffffffc00a055f10
>     [    9.480771][  T123] ueventd: LoadWithAliases was unable to load
> cpu:type:aarch64:feature:,0000,0001,0002,0003,0004,0005,0006,0007,000B
>     [    9.480773][  T121]
>     [    9.480774][  T121] x11: 0000000000000000 x10: 0000000000000001
> x9 : 0000000100000000
>     [    9.480780][  T123] ueventd:
>     [    9.480780][  T121] x8 : ffffffc0044154cb x7 : 0000000000000000
> x6 : 000000000000003f
>     [    9.480786][  T121] x5 : 0000000000000020 x4 : ffffffc0098db323
> x3 : ffffff801aeb62c0
>     [    9.480792][  T121] x2 : ffffff801aeb62c0 x1 : 0000000000000000
> x0 : ffffff8001fa4c80
>     [    9.480798][  T121] Call trace:
>     [    9.480801][  T121]  hi3660_thermal_probe+0x6c/0x74 [hisi_thermal]
>     [    9.480813][  T121]  hisi_thermal_probe+0xbc/0x284 [hisi_thermal]


Taking a look here, it looks pretty obvious:
  https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/thermal/hisi_thermal.c#n414

data->nr_sensors = 1;
data->sensor = devm_kzalloc(dev, sizeof(*data->sensor) *
   data->nr_sensors, GFP_KERNEL);

Here as nr_sensors=1, we allocate only one structure for the array.
But then below that, we modify two entries, writing past the valid
array, and corrupting data when writing the second sensor values.

data->sensor[0].id = HI3660_BIG_SENSOR;
data->sensor[0].irq_name = "tsensor_a73";
data->sensor[0].data = data;

data->sensor[1].id = HI3660_LITTLE_SENSOR;
data->sensor[1].irq_name = "tsensor_a53";
data->sensor[1].data = data;

I suspect nr_sensors needs to be set to 2.

Nice work, Kees!

thanks
-john
  
John Stultz Feb. 1, 2023, 8:16 a.m. UTC | #3
On Wed, Feb 1, 2023 at 12:11 AM John Stultz <jstultz@google.com> wrote:
> On Tue, Jan 31, 2023 at 11:36 PM Yongqin Liu <yongqin.liu@linaro.org> wrote:
> >
> > Hi, Kees
> >
> > This change causes "Kernel panic - not syncing: BRK handler: Fatal exception"
> > for the android-mainline based hikey960 build, with this commit reverted,
> > there is no problem for the build to boot to the homescreen.
> > Not sure if you have any idea about it and give some suggestions.
> >
> > Here is part of the kernel panic log:
> >
> >     [    9.479878][  T122] ueventd: Loading module
> > /vendor/lib/modules/spi-pl022.ko with args ''
> >     [    9.480276][  T115] apexd-bootstrap: Pre-allocated loop device 29
> >     [    9.480517][  T123] ueventd: LoadWithAliases was unable to load
> > of:Nhi3660_i2sT(null)Chisilicon,hi3660-i2s-1.0
> >     [    9.480632][  T121] Unexpected kernel BRK exception at EL1
> >     [    9.480637][  T121] Internal error: BRK handler:
> > 00000000f2000001 [#1] PREEMPT SMP
> >     [    9.480644][  T121] Modules linked in: cpufreq_dt(E+)
> > hisi_thermal(E+) phy_hi3660_usb3(E) btqca(E) hi6421_pmic_core(E)
> > btbcm(E) spi_pl022(E) hi3660_mailbox(E) i2c_designware_platform(E)
> > mali_kbase(OE) dw_mmc_k3(E) bluetooth(E) dw_mmc_pltfm(E) dw_mmc(E)
> > kirin_drm(E) rfkill(E) kirin_dsi(E) i2c_designware_core(E) k3dma(E)
> > drm_dma_helper(E) cma_heap(E) system_heap(E)
> >     [    9.480688][  T121] CPU: 4 PID: 121 Comm: ueventd Tainted: G
> >        OE      6.2.0-rc6-mainline-14196-g1d9f94ec75b9 #1
> >     [    9.480694][  T121] Hardware name: HiKey960 (DT)
> >     [    9.480697][  T121] pstate: 20400005 (nzCv daif +PAN -UAO -TCO
> > -DIT -SSBS BTYPE=--)
> >     [    9.480703][  T121] pc : hi3660_thermal_probe+0x6c/0x74 [hisi_thermal]
> >     [    9.480722][  T121] lr : hi3660_thermal_probe+0x38/0x74 [hisi_thermal]
> >     [    9.480733][  T121] sp : ffffffc00aa13700
> >     [    9.480735][  T121] x29: ffffffc00aa13700 x28: 0000007ff8ae8531
> > x27: 00000000000008c0
> >     [    9.480743][  T121] x26: ffffffc00aa2a300 x25: ffffffc00aa2ab40
> > x24: 000000000000001d
> >     [    9.480749][  T121] x23: ffffffc00a29d000 x22: 0000000000000000
> > x21: ffffff8001fa4a80
> >     [    9.480755][  T121] x20: 0000000000000001 x19: ffffff8001fa4a80
> > x18: ffffffc00a8810b0
> >     [    9.480761][  T121] x17: 000000007ab542f2 x16: 000000007ab542f2
> > x15: ffffffc00aa01000
> >     [    9.480767][  T121] x14: ffffffc00966f250 x13: ffffffc0b58f9000
> > x12: ffffffc00a055f10
> >     [    9.480771][  T123] ueventd: LoadWithAliases was unable to load
> > cpu:type:aarch64:feature:,0000,0001,0002,0003,0004,0005,0006,0007,000B
> >     [    9.480773][  T121]
> >     [    9.480774][  T121] x11: 0000000000000000 x10: 0000000000000001
> > x9 : 0000000100000000
> >     [    9.480780][  T123] ueventd:
> >     [    9.480780][  T121] x8 : ffffffc0044154cb x7 : 0000000000000000
> > x6 : 000000000000003f
> >     [    9.480786][  T121] x5 : 0000000000000020 x4 : ffffffc0098db323
> > x3 : ffffff801aeb62c0
> >     [    9.480792][  T121] x2 : ffffff801aeb62c0 x1 : 0000000000000000
> > x0 : ffffff8001fa4c80
> >     [    9.480798][  T121] Call trace:
> >     [    9.480801][  T121]  hi3660_thermal_probe+0x6c/0x74 [hisi_thermal]
> >     [    9.480813][  T121]  hisi_thermal_probe+0xbc/0x284 [hisi_thermal]
>
>
> Taking a look here, it looks pretty obvious:
>   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/thermal/hisi_thermal.c#n414
>
> data->nr_sensors = 1;
> data->sensor = devm_kzalloc(dev, sizeof(*data->sensor) *
>    data->nr_sensors, GFP_KERNEL);
>
> Here as nr_sensors=1, we allocate only one structure for the array.
> But then below that, we modify two entries, writing past the valid
> array, and corrupting data when writing the second sensor values.
>
> data->sensor[0].id = HI3660_BIG_SENSOR;
> data->sensor[0].irq_name = "tsensor_a73";
> data->sensor[0].data = data;
>
> data->sensor[1].id = HI3660_LITTLE_SENSOR;
> data->sensor[1].irq_name = "tsensor_a53";
> data->sensor[1].data = data;
>
> I suspect nr_sensors needs to be set to 2.

Looks like the bug was introduced here:
  https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7d3a2a2bbadb4bf5856ed394ba09b8fbb7a80460

But that change seems to imply the dual zones weren't fully supported
at the time. I'm not sure if that's changed in the meantime, so
removing the second sensor writes may potentially be a better fix.

thanks
-john
  
Andy Shevchenko Feb. 1, 2023, 6:41 p.m. UTC | #4
On Wed, Feb 01, 2023 at 12:11:41AM -0800, John Stultz wrote:
> On Tue, Jan 31, 2023 at 11:36 PM Yongqin Liu <yongqin.liu@linaro.org> wrote:

...

> data->nr_sensors = 1;
> data->sensor = devm_kzalloc(dev, sizeof(*data->sensor) *
>    data->nr_sensors, GFP_KERNEL);

Side note: This should use devm_kcalloc().
  
Kees Cook Feb. 2, 2023, 5:18 p.m. UTC | #5
On Wed, Feb 01, 2023 at 12:11:41AM -0800, John Stultz wrote:
> On Tue, Jan 31, 2023 at 11:36 PM Yongqin Liu <yongqin.liu@linaro.org> wrote:
> >
> > Hi, Kees
> >
> > This change causes "Kernel panic - not syncing: BRK handler: Fatal exception"
> > for the android-mainline based hikey960 build, with this commit reverted,
> > there is no problem for the build to boot to the homescreen.
> > Not sure if you have any idea about it and give some suggestions.
> >
> > Here is part of the kernel panic log:
> >
> >     [    9.479878][  T122] ueventd: Loading module
> > /vendor/lib/modules/spi-pl022.ko with args ''
> >     [    9.480276][  T115] apexd-bootstrap: Pre-allocated loop device 29
> >     [    9.480517][  T123] ueventd: LoadWithAliases was unable to load
> > of:Nhi3660_i2sT(null)Chisilicon,hi3660-i2s-1.0
> >     [    9.480632][  T121] Unexpected kernel BRK exception at EL1
> >     [    9.480637][  T121] Internal error: BRK handler:
> > 00000000f2000001 [#1] PREEMPT SMP
> >     [    9.480644][  T121] Modules linked in: cpufreq_dt(E+)
> > hisi_thermal(E+) phy_hi3660_usb3(E) btqca(E) hi6421_pmic_core(E)
> > btbcm(E) spi_pl022(E) hi3660_mailbox(E) i2c_designware_platform(E)
> > mali_kbase(OE) dw_mmc_k3(E) bluetooth(E) dw_mmc_pltfm(E) dw_mmc(E)
> > kirin_drm(E) rfkill(E) kirin_dsi(E) i2c_designware_core(E) k3dma(E)
> > drm_dma_helper(E) cma_heap(E) system_heap(E)
> >     [    9.480688][  T121] CPU: 4 PID: 121 Comm: ueventd Tainted: G
> >        OE      6.2.0-rc6-mainline-14196-g1d9f94ec75b9 #1
> >     [    9.480694][  T121] Hardware name: HiKey960 (DT)
> >     [    9.480697][  T121] pstate: 20400005 (nzCv daif +PAN -UAO -TCO
> > -DIT -SSBS BTYPE=--)
> >     [    9.480703][  T121] pc : hi3660_thermal_probe+0x6c/0x74 [hisi_thermal]
> >     [    9.480722][  T121] lr : hi3660_thermal_probe+0x38/0x74 [hisi_thermal]
> >     [    9.480733][  T121] sp : ffffffc00aa13700
> >     [    9.480735][  T121] x29: ffffffc00aa13700 x28: 0000007ff8ae8531
> > x27: 00000000000008c0
> >     [    9.480743][  T121] x26: ffffffc00aa2a300 x25: ffffffc00aa2ab40
> > x24: 000000000000001d
> >     [    9.480749][  T121] x23: ffffffc00a29d000 x22: 0000000000000000
> > x21: ffffff8001fa4a80
> >     [    9.480755][  T121] x20: 0000000000000001 x19: ffffff8001fa4a80
> > x18: ffffffc00a8810b0
> >     [    9.480761][  T121] x17: 000000007ab542f2 x16: 000000007ab542f2
> > x15: ffffffc00aa01000
> >     [    9.480767][  T121] x14: ffffffc00966f250 x13: ffffffc0b58f9000
> > x12: ffffffc00a055f10
> >     [    9.480771][  T123] ueventd: LoadWithAliases was unable to load
> > cpu:type:aarch64:feature:,0000,0001,0002,0003,0004,0005,0006,0007,000B
> >     [    9.480773][  T121]
> >     [    9.480774][  T121] x11: 0000000000000000 x10: 0000000000000001
> > x9 : 0000000100000000
> >     [    9.480780][  T123] ueventd:
> >     [    9.480780][  T121] x8 : ffffffc0044154cb x7 : 0000000000000000
> > x6 : 000000000000003f
> >     [    9.480786][  T121] x5 : 0000000000000020 x4 : ffffffc0098db323
> > x3 : ffffff801aeb62c0
> >     [    9.480792][  T121] x2 : ffffff801aeb62c0 x1 : 0000000000000000
> > x0 : ffffff8001fa4c80
> >     [    9.480798][  T121] Call trace:
> >     [    9.480801][  T121]  hi3660_thermal_probe+0x6c/0x74 [hisi_thermal]
> >     [    9.480813][  T121]  hisi_thermal_probe+0xbc/0x284 [hisi_thermal]
> 
> 
> Taking a look here, it looks pretty obvious:
>   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/thermal/hisi_thermal.c#n414
> 
> data->nr_sensors = 1;
> data->sensor = devm_kzalloc(dev, sizeof(*data->sensor) *
>    data->nr_sensors, GFP_KERNEL);
> 
> Here as nr_sensors=1, we allocate only one structure for the array.
> But then below that, we modify two entries, writing past the valid
> array, and corrupting data when writing the second sensor values.
> 
> data->sensor[0].id = HI3660_BIG_SENSOR;
> data->sensor[0].irq_name = "tsensor_a73";
> data->sensor[0].data = data;
> 
> data->sensor[1].id = HI3660_LITTLE_SENSOR;
> data->sensor[1].irq_name = "tsensor_a53";
> data->sensor[1].data = data;
> 
> I suspect nr_sensors needs to be set to 2.
> 
> Nice work, Kees!

Yay for compilers! :)

Was a patch sent to fix this driver?
  
John Stultz Feb. 2, 2023, 6:56 p.m. UTC | #6
On Thu, Feb 2, 2023 at 9:18 AM Kees Cook <keescook@chromium.org> wrote:
> On Wed, Feb 01, 2023 at 12:11:41AM -0800, John Stultz wrote:
> > On Tue, Jan 31, 2023 at 11:36 PM Yongqin Liu <yongqin.liu@linaro.org> wrote:
> > > This change causes "Kernel panic - not syncing: BRK handler: Fatal exception"
> > > for the android-mainline based hikey960 build, with this commit reverted,
> > > there is no problem for the build to boot to the homescreen.
> > > Not sure if you have any idea about it and give some suggestions.
> > >
> > > Here is part of the kernel panic log:
...
> > Here as nr_sensors=1, we allocate only one structure for the array.
> > But then below that, we modify two entries, writing past the valid
> > array, and corrupting data when writing the second sensor values.
> >
> > data->sensor[0].id = HI3660_BIG_SENSOR;
> > data->sensor[0].irq_name = "tsensor_a73";
> > data->sensor[0].data = data;
> >
> > data->sensor[1].id = HI3660_LITTLE_SENSOR;
> > data->sensor[1].irq_name = "tsensor_a53";
> > data->sensor[1].data = data;
> >
> > I suspect nr_sensors needs to be set to 2.
> >
> > Nice work, Kees!
>
> Yay for compilers! :)

Well, I know it's not trivial to make the compilers catch these
things, so yay for you and others putting in all the effort on this as
well.

That said, making sense of the error message isn't completely trivial
either. I've been seeing a few cases recently of some of the new
compiler tooling (I pinged you earlier on a CFI one) causing errors
that developers aren't really sure how to address.  I know sometimes
it's not easy to surface the errors with context to what was wrong,
but at the risk of intense bike shedding, is there some way to provide
something like "Likely array bounds error" instead of just "BRK
handler: Fatal exception"?

> Was a patch sent to fix this driver?

I think YongQin is looking into it (either setting the nr_sensors
value to 2 or dropping the second sensor access).

thanks
-john
  
Kees Cook Feb. 2, 2023, 7:10 p.m. UTC | #7
On Thu, Feb 02, 2023 at 10:56:29AM -0800, John Stultz wrote:
> That said, making sense of the error message isn't completely trivial
> either. I've been seeing a few cases recently of some of the new
> compiler tooling (I pinged you earlier on a CFI one) causing errors
> that developers aren't really sure how to address.  I know sometimes
> it's not easy to surface the errors with context to what was wrong,
> but at the risk of intense bike shedding, is there some way to provide
> something like "Likely array bounds error" instead of just "BRK
> handler: Fatal exception"?

Yeah, this is a result of the size trade-off that resulted in config
CONFIG_UBSAN_TRAP -- there ends up being no message about what went
wrong. I'd really like to have cleaner handling of this -- perhaps what
was done for KCFI could be applied to UBSAN as well, though this is an
area I don't know well myself. (i.e. encoding "this was a UBSAN trap"
in the trap itself.)

Sami or Ard, is this something that could be improved for arm64?
  
Ard Biesheuvel Feb. 2, 2023, 7:20 p.m. UTC | #8
On Thu, 2 Feb 2023 at 20:10, Kees Cook <keescook@chromium.org> wrote:
>
> On Thu, Feb 02, 2023 at 10:56:29AM -0800, John Stultz wrote:
> > That said, making sense of the error message isn't completely trivial
> > either. I've been seeing a few cases recently of some of the new
> > compiler tooling (I pinged you earlier on a CFI one) causing errors
> > that developers aren't really sure how to address.  I know sometimes
> > it's not easy to surface the errors with context to what was wrong,
> > but at the risk of intense bike shedding, is there some way to provide
> > something like "Likely array bounds error" instead of just "BRK
> > handler: Fatal exception"?
>
> Yeah, this is a result of the size trade-off that resulted in config
> CONFIG_UBSAN_TRAP -- there ends up being no message about what went
> wrong. I'd really like to have cleaner handling of this -- perhaps what
> was done for KCFI could be applied to UBSAN as well, though this is an
> area I don't know well myself. (i.e. encoding "this was a UBSAN trap"
> in the trap itself.)
>
> Sami or Ard, is this something that could be improved for arm64?
>

-ENOCONTEXT, so I am going to assume this is about runtime
instrumentation that needs some kind of 'panic' function which it will
invoke if some condition is met that should never occur?

We already use brk with different immediate values in the opcode, so
the arch layer already has what we need. Is this a limitation in the
compiler, perhaps, where it always emits the same brk opcode?
  
Nick Desaulniers Feb. 2, 2023, 7:31 p.m. UTC | #9
On Thu, Feb 2, 2023 at 11:20 AM Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Thu, 2 Feb 2023 at 20:10, Kees Cook <keescook@chromium.org> wrote:
> >
> > On Thu, Feb 02, 2023 at 10:56:29AM -0800, John Stultz wrote:
> > > That said, making sense of the error message isn't completely trivial
> > > either. I've been seeing a few cases recently of some of the new
> > > compiler tooling (I pinged you earlier on a CFI one) causing errors
> > > that developers aren't really sure how to address.  I know sometimes
> > > it's not easy to surface the errors with context to what was wrong,
> > > but at the risk of intense bike shedding, is there some way to provide
> > > something like "Likely array bounds error" instead of just "BRK
> > > handler: Fatal exception"?
> >
> > Yeah, this is a result of the size trade-off that resulted in config
> > CONFIG_UBSAN_TRAP -- there ends up being no message about what went
> > wrong. I'd really like to have cleaner handling of this -- perhaps what
> > was done for KCFI could be applied to UBSAN as well, though this is an
> > area I don't know well myself. (i.e. encoding "this was a UBSAN trap"
> > in the trap itself.)
> >
> > Sami or Ard, is this something that could be improved for arm64?
> >
>
> -ENOCONTEXT, so I am going to assume this is about runtime
> instrumentation that needs some kind of 'panic' function which it will
> invoke if some condition is met that should never occur?
>
> We already use brk with different immediate values in the opcode, so
> the arch layer already has what we need. Is this a limitation in the
> compiler, perhaps, where it always emits the same brk opcode?

Yeah, we'd need to update both the compiler to produce the encoding,
and the kernel to recognize the encoding and do something special.
  
Sami Tolvanen Feb. 2, 2023, 7:49 p.m. UTC | #10
On Thu, Feb 2, 2023 at 11:31 AM Nick Desaulniers
<ndesaulniers@google.com> wrote:
>
> On Thu, Feb 2, 2023 at 11:20 AM Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > On Thu, 2 Feb 2023 at 20:10, Kees Cook <keescook@chromium.org> wrote:
> > >
> > > On Thu, Feb 02, 2023 at 10:56:29AM -0800, John Stultz wrote:
> > > > That said, making sense of the error message isn't completely trivial
> > > > either. I've been seeing a few cases recently of some of the new
> > > > compiler tooling (I pinged you earlier on a CFI one) causing errors
> > > > that developers aren't really sure how to address.  I know sometimes
> > > > it's not easy to surface the errors with context to what was wrong,
> > > > but at the risk of intense bike shedding, is there some way to provide
> > > > something like "Likely array bounds error" instead of just "BRK
> > > > handler: Fatal exception"?
> > >
> > > Yeah, this is a result of the size trade-off that resulted in config
> > > CONFIG_UBSAN_TRAP -- there ends up being no message about what went
> > > wrong. I'd really like to have cleaner handling of this -- perhaps what
> > > was done for KCFI could be applied to UBSAN as well, though this is an
> > > area I don't know well myself. (i.e. encoding "this was a UBSAN trap"
> > > in the trap itself.)
> > >
> > > Sami or Ard, is this something that could be improved for arm64?
> > >
> >
> > -ENOCONTEXT, so I am going to assume this is about runtime
> > instrumentation that needs some kind of 'panic' function which it will
> > invoke if some condition is met that should never occur?
> >
> > We already use brk with different immediate values in the opcode, so
> > the arch layer already has what we need. Is this a limitation in the
> > compiler, perhaps, where it always emits the same brk opcode?
>
> Yeah, we'd need to update both the compiler to produce the encoding,
> and the kernel to recognize the encoding and do something special.

A quick look at Clang's source code suggests that Intrinsic::ubsantrap
already accepts the handler ID (from the SanitizerHandler enum) as an
argument and the arm64 LLVM back-end appears to encode the value as an
immediate for the brk instruction. I didn't confirm that this actually
works, but perhaps we just need to teach the kernel about the possible
values?

Sami
  
Kees Cook Feb. 2, 2023, 7:53 p.m. UTC | #11
On Thu, Feb 02, 2023 at 11:49:42AM -0800, Sami Tolvanen wrote:
> On Thu, Feb 2, 2023 at 11:31 AM Nick Desaulniers
> <ndesaulniers@google.com> wrote:
> >
> > On Thu, Feb 2, 2023 at 11:20 AM Ard Biesheuvel <ardb@kernel.org> wrote:
> > >
> > > On Thu, 2 Feb 2023 at 20:10, Kees Cook <keescook@chromium.org> wrote:
> > > >
> > > > On Thu, Feb 02, 2023 at 10:56:29AM -0800, John Stultz wrote:
> > > > > That said, making sense of the error message isn't completely trivial
> > > > > either. I've been seeing a few cases recently of some of the new
> > > > > compiler tooling (I pinged you earlier on a CFI one) causing errors
> > > > > that developers aren't really sure how to address.  I know sometimes
> > > > > it's not easy to surface the errors with context to what was wrong,
> > > > > but at the risk of intense bike shedding, is there some way to provide
> > > > > something like "Likely array bounds error" instead of just "BRK
> > > > > handler: Fatal exception"?
> > > >
> > > > Yeah, this is a result of the size trade-off that resulted in config
> > > > CONFIG_UBSAN_TRAP -- there ends up being no message about what went
> > > > wrong. I'd really like to have cleaner handling of this -- perhaps what
> > > > was done for KCFI could be applied to UBSAN as well, though this is an
> > > > area I don't know well myself. (i.e. encoding "this was a UBSAN trap"
> > > > in the trap itself.)
> > > >
> > > > Sami or Ard, is this something that could be improved for arm64?
> > > >
> > >
> > > -ENOCONTEXT, so I am going to assume this is about runtime
> > > instrumentation that needs some kind of 'panic' function which it will
> > > invoke if some condition is met that should never occur?
> > >
> > > We already use brk with different immediate values in the opcode, so
> > > the arch layer already has what we need. Is this a limitation in the
> > > compiler, perhaps, where it always emits the same brk opcode?
> >
> > Yeah, we'd need to update both the compiler to produce the encoding,
> > and the kernel to recognize the encoding and do something special.
> 
> A quick look at Clang's source code suggests that Intrinsic::ubsantrap
> already accepts the handler ID (from the SanitizerHandler enum) as an
> argument and the arm64 LLVM back-end appears to encode the value as an
> immediate for the brk instruction. I didn't confirm that this actually
> works, but perhaps we just need to teach the kernel about the possible
> values?

Oh excellent. Yeah, if that's all that's needed here that would be
great. What are the values?
  
Sami Tolvanen Feb. 2, 2023, 8:11 p.m. UTC | #12
On Thu, Feb 2, 2023 at 11:53 AM Kees Cook <keescook@chromium.org> wrote:
>
> On Thu, Feb 02, 2023 at 11:49:42AM -0800, Sami Tolvanen wrote:
> > A quick look at Clang's source code suggests that Intrinsic::ubsantrap
> > already accepts the handler ID (from the SanitizerHandler enum) as an
> > argument and the arm64 LLVM back-end appears to encode the value as an
> > immediate for the brk instruction. I didn't confirm that this actually
> > works, but perhaps we just need to teach the kernel about the possible
> > values?
>
> Oh excellent. Yeah, if that's all that's needed here that would be
> great. What are the values?

The arm64 brk immediate encoding seems to be "ubsantrap arg | 'U' << 8":

https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AArch64/AArch64InstrInfo.td#L7571

The argument values come from the SanitizerHandler enum, which is
populated from this list:

https://github.com/llvm/llvm-project/blob/main/clang/lib/CodeGen/CodeGenFunction.h#L113

Therefore, according to the tests, for ubsantrap(12) we'll get brk
#0x550c, for example:

https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/AArch64/ubsantrap.ll

Sami
  
Kees Cook Feb. 2, 2023, 8:43 p.m. UTC | #13
On Thu, Feb 02, 2023 at 12:11:47PM -0800, Sami Tolvanen wrote:
> On Thu, Feb 2, 2023 at 11:53 AM Kees Cook <keescook@chromium.org> wrote:
> >
> > On Thu, Feb 02, 2023 at 11:49:42AM -0800, Sami Tolvanen wrote:
> > > A quick look at Clang's source code suggests that Intrinsic::ubsantrap
> > > already accepts the handler ID (from the SanitizerHandler enum) as an
> > > argument and the arm64 LLVM back-end appears to encode the value as an
> > > immediate for the brk instruction. I didn't confirm that this actually
> > > works, but perhaps we just need to teach the kernel about the possible
> > > values?
> >
> > Oh excellent. Yeah, if that's all that's needed here that would be
> > great. What are the values?
> 
> The arm64 brk immediate encoding seems to be "ubsantrap arg | 'U' << 8":
> 
> https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AArch64/AArch64InstrInfo.td#L7571
> 
> The argument values come from the SanitizerHandler enum, which is
> populated from this list:
> 
> https://github.com/llvm/llvm-project/blob/main/clang/lib/CodeGen/CodeGenFunction.h#L113
> 
> Therefore, according to the tests, for ubsantrap(12) we'll get brk
> #0x550c, for example:
> 
> https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/AArch64/ubsantrap.ll

So the absolute minimal handler would look like this:

diff --git a/arch/arm64/include/asm/brk-imm.h b/arch/arm64/include/asm/brk-imm.h
index 6e000113e508..3f0f0d03268b 100644
--- a/arch/arm64/include/asm/brk-imm.h
+++ b/arch/arm64/include/asm/brk-imm.h
@@ -28,6 +28,8 @@
 #define BUG_BRK_IMM			0x800
 #define KASAN_BRK_IMM			0x900
 #define KASAN_BRK_MASK			0x0ff
+#define UBSAN_BRK_IMM			0x5500
+#define UBSAN_BRK_MASK			0x00ff
 
 #define CFI_BRK_IMM_TARGET		GENMASK(4, 0)
 #define CFI_BRK_IMM_TYPE		GENMASK(9, 5)
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 4c0caa589e12..36b917d8fa5f 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -1074,6 +1074,18 @@ static struct break_hook kasan_break_hook = {
 };
 #endif
 
+#ifdef CONFIG_UBSAN_TRAP
+static int ubsan_handler(struct pt_regs *regs, unsigned long esr)
+{
+	die("Oops - UBSAN", regs, esr);
+}
+
+static struct break_hook ubsan_break_hook = {
+	.fn	= ubsan_handler,
+	.imm	= UBSAN_BRK_IMM,
+	.mask	= UBSAN_BRK_MASK,
+};
+#endif
 
 #define esr_comment(esr) ((esr) & ESR_ELx_BRK64_ISS_COMMENT_MASK)
 
@@ -1091,6 +1103,10 @@ int __init early_brk64(unsigned long addr, unsigned long esr,
 #ifdef CONFIG_KASAN_SW_TAGS
 	if ((esr_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM)
 		return kasan_handler(regs, esr) != DBG_HOOK_HANDLED;
+#endif
+#ifdef CONFIG_UBSAN_TRAP
+	if ((esr_comment(esr) & ~UBSAN_BRK_MASK) == UBSAN_BRK_IMM)
+		return ubsan_handler(regs, esr) != DBG_HOOK_HANDLED;
 #endif
 	return bug_handler(regs, esr) != DBG_HOOK_HANDLED;
 }
@@ -1104,6 +1120,9 @@ void __init trap_init(void)
 	register_kernel_break_hook(&fault_break_hook);
 #ifdef CONFIG_KASAN_SW_TAGS
 	register_kernel_break_hook(&kasan_break_hook);
+#endif
+#ifdef CONFIG_UBSAN_TRAP
+	register_kernel_break_hook(&ubsan_break_hook);
 #endif
 	debug_traps_init();
 }

But we could expand ubsan_handler() to extract the SanitizerHandler enum
value and report which UBSAN check was hit...
  

Patch

diff --git a/include/linux/device.h b/include/linux/device.h
index 424b55df0272..5e4cd857e74f 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -197,9 +197,9 @@  void devres_remove_group(struct device *dev, void *id);
 int devres_release_group(struct device *dev, void *id);
 
 /* managed devm_k.alloc/kfree for device drivers */
-void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __malloc;
+void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __alloc_size(2);
 void *devm_krealloc(struct device *dev, void *ptr, size_t size,
-		    gfp_t gfp) __must_check;
+		    gfp_t gfp) __must_check __realloc_size(3);
 __printf(3, 0) char *devm_kvasprintf(struct device *dev, gfp_t gfp,
 				     const char *fmt, va_list ap) __malloc;
 __printf(3, 4) char *devm_kasprintf(struct device *dev, gfp_t gfp,
@@ -226,7 +226,8 @@  static inline void *devm_kcalloc(struct device *dev,
 void devm_kfree(struct device *dev, const void *p);
 char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc;
 const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp);
-void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp);
+void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp)
+	__realloc_size(3);
 
 unsigned long devm_get_free_pages(struct device *dev,
 				  gfp_t gfp_mask, unsigned int order);