[v3,3/7] mm/vmalloc.c: allow vread() to read out vm_map_ram areas

Message ID 20230113031921.64716-4-bhe@redhat.com
State New
Series mm/vmalloc.c: allow vread() to read out vm_map_ram areas

Commit Message

Baoquan He Jan. 13, 2023, 3:19 a.m. UTC
  Currently, vread() can read out vmalloc areas which are associated with
a vm_struct. However, this doesn't work for areas created by the
vm_map_ram() interface because they have no associated vm_struct, so in
vread() these areas are all skipped.

Here, add a new function vmap_ram_vread() to read out vm_map_ram areas.
An area created directly with the vm_map_ram() interface can be handled
like the other normal vmap areas with aligned_vread(), while areas which
are further subdivided and managed with vmap_block need to be carefully
read out as page-aligned small regions, with the holes zero-filled.

Signed-off-by: Baoquan He <bhe@redhat.com>
---
 mm/vmalloc.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 73 insertions(+), 7 deletions(-)
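
For context, a minimal sketch of the behavior being fixed. The 'if (!va->vm)'
check is lifted from the pre-patch vread() (see the hunk at the bottom of this
page); the surrounding loop is abbreviated:

	list_for_each_entry_from(va, &vmap_area_list, list) {
		...
		if (!va->vm)	/* vm_map_ram() areas have no vm_struct, so the
				 * pre-patch code skipped them and callers saw
				 * the range as a zero-filled hole */
			continue;
		...
	}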
  

Comments

Dan Carpenter Jan. 14, 2023, 7:57 a.m. UTC | #1
Hi Baoquan,

url:    https://github.com/intel-lab-lkp/linux/commits/Baoquan-He/mm-vmalloc-c-add-used_map-into-vmap_block-to-track-space-of-vmap_block/20230113-112149
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20230113031921.64716-4-bhe%40redhat.com
patch subject: [PATCH v3 3/7] mm/vmalloc.c: allow vread() to read out vm_map_ram areas
config: i386-randconfig-m021
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <error27@gmail.com>

smatch warnings:
mm/vmalloc.c:3682 vread() error: we previously assumed 'vm' could be null (see line 3664)

vim +/vm +3682 mm/vmalloc.c

^1da177e4c3f415 Linus Torvalds          2005-04-16  3630  long vread(char *buf, char *addr, unsigned long count)
^1da177e4c3f415 Linus Torvalds          2005-04-16  3631  {
e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3632  	struct vmap_area *va;
e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3633  	struct vm_struct *vm;
^1da177e4c3f415 Linus Torvalds          2005-04-16  3634  	char *vaddr, *buf_start = buf;
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3635  	unsigned long buflen = count;
129dbdf298d7383 Baoquan He              2023-01-13  3636  	unsigned long n, size, flags;
^1da177e4c3f415 Linus Torvalds          2005-04-16  3637  
4aff1dc4fb3a5a3 Andrey Konovalov        2022-03-24  3638  	addr = kasan_reset_tag(addr);
4aff1dc4fb3a5a3 Andrey Konovalov        2022-03-24  3639  
^1da177e4c3f415 Linus Torvalds          2005-04-16  3640  	/* Don't allow overflow */
^1da177e4c3f415 Linus Torvalds          2005-04-16  3641  	if ((unsigned long) addr + count < count)
^1da177e4c3f415 Linus Torvalds          2005-04-16  3642  		count = -(unsigned long) addr;
^1da177e4c3f415 Linus Torvalds          2005-04-16  3643  
e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3644  	spin_lock(&vmap_area_lock);
f181234a5a21fd0 Chen Wandun             2021-09-02  3645  	va = find_vmap_area_exceed_addr((unsigned long)addr);
f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3646  	if (!va)
f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3647  		goto finished;
f181234a5a21fd0 Chen Wandun             2021-09-02  3648  
f181234a5a21fd0 Chen Wandun             2021-09-02  3649  	/* no intersects with alive vmap_area */
f181234a5a21fd0 Chen Wandun             2021-09-02  3650  	if ((unsigned long)addr + count <= va->va_start)
f181234a5a21fd0 Chen Wandun             2021-09-02  3651  		goto finished;
f181234a5a21fd0 Chen Wandun             2021-09-02  3652  
f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3653  	list_for_each_entry_from(va, &vmap_area_list, list) {
e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3654  		if (!count)
e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3655  			break;
e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3656  
129dbdf298d7383 Baoquan He              2023-01-13  3657  		vm = va->vm;
129dbdf298d7383 Baoquan He              2023-01-13  3658  		flags = va->flags & VMAP_FLAGS_MASK;
129dbdf298d7383 Baoquan He              2023-01-13  3659  
129dbdf298d7383 Baoquan He              2023-01-13  3660  		if (!vm && !flags)
                                                                            ^^^
vm can be NULL if a flag in VMAP_FLAGS_MASK is set.

e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3661  			continue;
e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3662  
129dbdf298d7383 Baoquan He              2023-01-13  3663  		vaddr = (char *) va->va_start;
129dbdf298d7383 Baoquan He              2023-01-13 @3664  		size = vm ? get_vm_area_size(vm) : va_size(va);
                                                                               ^^

129dbdf298d7383 Baoquan He              2023-01-13  3665  
129dbdf298d7383 Baoquan He              2023-01-13  3666  		if (addr >= vaddr + size)
^1da177e4c3f415 Linus Torvalds          2005-04-16  3667  			continue;
^1da177e4c3f415 Linus Torvalds          2005-04-16  3668  		while (addr < vaddr) {
^1da177e4c3f415 Linus Torvalds          2005-04-16  3669  			if (count == 0)
^1da177e4c3f415 Linus Torvalds          2005-04-16  3670  				goto finished;
^1da177e4c3f415 Linus Torvalds          2005-04-16  3671  			*buf = '\0';
^1da177e4c3f415 Linus Torvalds          2005-04-16  3672  			buf++;
^1da177e4c3f415 Linus Torvalds          2005-04-16  3673  			addr++;
^1da177e4c3f415 Linus Torvalds          2005-04-16  3674  			count--;
^1da177e4c3f415 Linus Torvalds          2005-04-16  3675  		}
129dbdf298d7383 Baoquan He              2023-01-13  3676  		n = vaddr + size - addr;
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3677  		if (n > count)
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3678  			n = count;
129dbdf298d7383 Baoquan He              2023-01-13  3679  
129dbdf298d7383 Baoquan He              2023-01-13  3680  		if (flags & VMAP_RAM)

assume VMAP_RAM is not set

129dbdf298d7383 Baoquan He              2023-01-13  3681  			vmap_ram_vread(buf, addr, n, flags);
129dbdf298d7383 Baoquan He              2023-01-13 @3682  		else if (!(vm->flags & VM_IOREMAP))
                                                                                   ^^^^^^^^^
Unchecked dereference.  Should this be "flags" instead of "vm->flags"?

d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3683  			aligned_vread(buf, addr, n);
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3684  		else /* IOREMAP area is treated as memory hole */
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3685  			memset(buf, 0, n);
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3686  		buf += n;
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3687  		addr += n;
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3688  		count -= n;
^1da177e4c3f415 Linus Torvalds          2005-04-16  3689  	}
^1da177e4c3f415 Linus Torvalds          2005-04-16  3690  finished:
e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3691  	spin_unlock(&vmap_area_lock);
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3692  
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3693  	if (buf == buf_start)
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3694  		return 0;
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3695  	/* zero-fill memory holes */
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3696  	if (buf != buf_start + buflen)
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3697  		memset(buf, 0, buflen - (buf - buf_start));
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3698  
d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3699  	return buflen;
^1da177e4c3f415 Linus Torvalds          2005-04-16  3700  }
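
Reduced to its essentials, the pattern smatch flags above looks like this (an
illustrative condensation of the listing, not new code):

	size = vm ? get_vm_area_size(vm) : va_size(va);	/* line 3664: testing vm
							 * tells smatch that vm
							 * may be NULL */
	...
	if (flags & VMAP_RAM)
		vmap_ram_vread(buf, addr, n, flags);
	else if (!(vm->flags & VM_IOREMAP))		/* line 3682: vm is
							 * dereferenced without
							 * a NULL check */
		aligned_vread(buf, addr, n);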
  
Baoquan He Jan. 15, 2023, 2:08 p.m. UTC | #2
Hi Dan,

On 01/14/23 at 10:57am, Dan Carpenter wrote:
> Hi Baoquan,
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Baoquan-He/mm-vmalloc-c-add-used_map-into-vmap_block-to-track-space-of-vmap_block/20230113-112149
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
> patch link:    https://lore.kernel.org/r/20230113031921.64716-4-bhe%40redhat.com
> patch subject: [PATCH v3 3/7] mm/vmalloc.c: allow vread() to read out vm_map_ram areas
> config: i386-randconfig-m021
> compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
> 
> If you fix the issue, kindly add following tag where applicable
> | Reported-by: kernel test robot <lkp@intel.com>
> | Reported-by: Dan Carpenter <error27@gmail.com>
> 
> smatch warnings:
> mm/vmalloc.c:3682 vread() error: we previously assumed 'vm' could be null (see line 3664)

Thanks for checking this. I went through the code flow again, and I
personally think the issue and risk pointed out cannot occur. Please see
the comments below.

> 
> vim +/vm +3682 mm/vmalloc.c
> 
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3630  long vread(char *buf, char *addr, unsigned long count)
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3631  {
> e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3632  	struct vmap_area *va;
> e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3633  	struct vm_struct *vm;
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3634  	char *vaddr, *buf_start = buf;
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3635  	unsigned long buflen = count;
> 129dbdf298d7383 Baoquan He              2023-01-13  3636  	unsigned long n, size, flags;
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3637  
> 4aff1dc4fb3a5a3 Andrey Konovalov        2022-03-24  3638  	addr = kasan_reset_tag(addr);
> 4aff1dc4fb3a5a3 Andrey Konovalov        2022-03-24  3639  
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3640  	/* Don't allow overflow */
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3641  	if ((unsigned long) addr + count < count)
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3642  		count = -(unsigned long) addr;
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3643  
> e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3644  	spin_lock(&vmap_area_lock);
> f181234a5a21fd0 Chen Wandun             2021-09-02  3645  	va = find_vmap_area_exceed_addr((unsigned long)addr);
> f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3646  	if (!va)
> f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3647  		goto finished;
> f181234a5a21fd0 Chen Wandun             2021-09-02  3648  
> f181234a5a21fd0 Chen Wandun             2021-09-02  3649  	/* no intersects with alive vmap_area */
> f181234a5a21fd0 Chen Wandun             2021-09-02  3650  	if ((unsigned long)addr + count <= va->va_start)
> f181234a5a21fd0 Chen Wandun             2021-09-02  3651  		goto finished;
> f181234a5a21fd0 Chen Wandun             2021-09-02  3652  
> f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3653  	list_for_each_entry_from(va, &vmap_area_list, list) {
> e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3654  		if (!count)
> e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3655  			break;
> e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3656  
> 129dbdf298d7383 Baoquan He              2023-01-13  3657  		vm = va->vm;
> 129dbdf298d7383 Baoquan He              2023-01-13  3658  		flags = va->flags & VMAP_FLAGS_MASK;
> 129dbdf298d7383 Baoquan He              2023-01-13  3659  
> 129dbdf298d7383 Baoquan He              2023-01-13  3660  		if (!vm && !flags)
>                                                                             ^^^
> vm can be NULL if a flag in VMAP_FLAGS_MASK is set.
> 
> e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3661  			continue;

Right, after the 'continue;' line, only two cases can happen when we get
here: (vm != NULL) is true, or (va->flags & VMAP_RAM) is true.

> e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3662  
> 129dbdf298d7383 Baoquan He              2023-01-13  3663  		vaddr = (char *) va->va_start;
> 129dbdf298d7383 Baoquan He              2023-01-13 @3664  		size = vm ? get_vm_area_size(vm) : va_size(va);
>                                                                                ^^
> 
> 129dbdf298d7383 Baoquan He              2023-01-13  3665  
> 129dbdf298d7383 Baoquan He              2023-01-13  3666  		if (addr >= vaddr + size)
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3667  			continue;
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3668  		while (addr < vaddr) {
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3669  			if (count == 0)
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3670  				goto finished;
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3671  			*buf = '\0';
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3672  			buf++;
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3673  			addr++;
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3674  			count--;
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3675  		}
> 129dbdf298d7383 Baoquan He              2023-01-13  3676  		n = vaddr + size - addr;
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3677  		if (n > count)
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3678  			n = count;
> 129dbdf298d7383 Baoquan He              2023-01-13  3679  
> 129dbdf298d7383 Baoquan He              2023-01-13  3680  		if (flags & VMAP_RAM)
> 
> assume VMAP_RAM is not set

OK, then vm is not NULL.
> 
> 129dbdf298d7383 Baoquan He              2023-01-13  3681  			vmap_ram_vread(buf, addr, n, flags);
> 129dbdf298d7383 Baoquan He              2023-01-13 @3682  		else if (!(vm->flags & VM_IOREMAP))
>                                                                                    ^^^^^^^^^
> Unchecked dereference.  Should this be "flags" instead of "vm->flags"?

Thus, here in the 'else if', vm is not NULL. And in this 'else if' we
intend to check vm->flags. I don't see an issue or risk here. Please
help check whether I've missed anything.

Thanks
Baoquan

> 
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3683  			aligned_vread(buf, addr, n);
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3684  		else /* IOREMAP area is treated as memory hole */
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3685  			memset(buf, 0, n);
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3686  		buf += n;
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3687  		addr += n;
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3688  		count -= n;
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3689  	}
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3690  finished:
> e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3691  	spin_unlock(&vmap_area_lock);
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3692  
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3693  	if (buf == buf_start)
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3694  		return 0;
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3695  	/* zero-fill memory holes */
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3696  	if (buf != buf_start + buflen)
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3697  		memset(buf, 0, buflen - (buf - buf_start));
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3698  
> d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3699  	return buflen;
> ^1da177e4c3f415 Linus Torvalds          2005-04-16  3700  }
> 
> -- 
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests
>
  
Uladzislau Rezki Jan. 16, 2023, 11:50 a.m. UTC | #3
On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> Currently, vread() can read out vmalloc areas which are associated with
> a vm_struct. However, this doesn't work for areas created by the
> vm_map_ram() interface because they have no associated vm_struct, so in
> vread() these areas are all skipped.
> 
> Here, add a new function vmap_ram_vread() to read out vm_map_ram areas.
> An area created directly with the vm_map_ram() interface can be handled
> like the other normal vmap areas with aligned_vread(), while areas which
> are further subdivided and managed with vmap_block need to be carefully
> read out as page-aligned small regions, with the holes zero-filled.
> 
> Signed-off-by: Baoquan He <bhe@redhat.com>
> ---
>  mm/vmalloc.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 73 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index ab4825050b5c..13875bc41e27 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -3544,6 +3544,65 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
>  	return copied;
>  }
>  
> +static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
> +{
> +	char *start;
> +	struct vmap_block *vb;
> +	unsigned long offset;
> +	unsigned int rs, re, n;
> +
> +	/*
> +	 * If it's area created by vm_map_ram() interface directly, but
> +	 * not further subdividing and delegating management to vmap_block,
> +	 * handle it here.
> +	 */
> +	if (!(flags & VMAP_BLOCK)) {
> +		aligned_vread(buf, addr, count);
> +		return;
> +	}
> +
> +	/*
> +	 * Area is split into regions and tracked with vmap_block, read out
> +	 * each region and zero fill the hole between regions.
> +	 */
> +	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> +
> +	spin_lock(&vb->lock);
> +	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
>
CPU-X may invoke free_vmap_block() while we take vb->lock and do
some manipulations on a vb that might already have been freed by the
RCU core.

Should we also protect this with rcu_read_lock() here?

--
Uladzislau Rezki
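
The interleaving being asked about, as a hypothetical timeline (illustration
only, not code from the series):

	/*
	 *   CPU-X                                  CPU-Y, in vread()
	 *   -----                                  -----------------
	 *   free_vmap_block(vb)
	 *     xa_erase(&vmap_blocks, idx);
	 *                                          vb = xa_load(&vmap_blocks, idx);
	 *                                          spin_lock(&vb->lock);  <-- vb may be
	 *     kfree_rcu(vb, rcu_head);                 NULL or already freed here
	 */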
  
Dan Carpenter Jan. 16, 2023, 1:08 p.m. UTC | #4
On Sun, Jan 15, 2023 at 10:08:55PM +0800, Baoquan He wrote:
> > f181234a5a21fd0 Chen Wandun             2021-09-02  3650  	if ((unsigned long)addr + count <= va->va_start)
> > f181234a5a21fd0 Chen Wandun             2021-09-02  3651  		goto finished;
> > f181234a5a21fd0 Chen Wandun             2021-09-02  3652  
> > f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3653  	list_for_each_entry_from(va, &vmap_area_list, list) {
> > e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3654  		if (!count)
> > e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3655  			break;
> > e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3656  
> > 129dbdf298d7383 Baoquan He              2023-01-13  3657  		vm = va->vm;
> > 129dbdf298d7383 Baoquan He              2023-01-13  3658  		flags = va->flags & VMAP_FLAGS_MASK;
> > 129dbdf298d7383 Baoquan He              2023-01-13  3659  
> > 129dbdf298d7383 Baoquan He              2023-01-13  3660  		if (!vm && !flags)
> >                                                                             ^^^
> > vm can be NULL if a flag in VMAP_FLAGS_MASK is set.
> > 
> > e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3661  			continue;
> 
> > Right, after the 'continue;' line, only two cases can happen when we
> > get here: (vm != NULL) is true, or (va->flags & VMAP_RAM) is true.
>

You're saying VMAP_RAM, but strictly speaking the code is checking
VMAP_FLAGS_MASK and not VMAP_RAM.

+#define VMAP_RAM               0x1 /* indicates vm_map_ram area*/
+#define VMAP_BLOCK             0x2 /* mark out the vmap_block sub-type*/
+#define VMAP_FLAGS_MASK                0x3

If we assume that vm is NULL, VMAP_BLOCK is set, and VMAP_RAM is clear,
then it would lead to a NULL dereference.  There might be reasons why
that combination is impossible outside the function, but we can't tell
from the information we have here.

Which is fine, outside information is a common reason for false
positives with this check.  But I was just concerned about the mix of
VMAP_FLAGS_MASK and VMAP_RAM.

regards,
dan carpenter

 
> > e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3662  
> > 129dbdf298d7383 Baoquan He              2023-01-13  3663  		vaddr = (char *) va->va_start;
> > 129dbdf298d7383 Baoquan He              2023-01-13 @3664  		size = vm ? get_vm_area_size(vm) : va_size(va);
> >                                                                                ^^
> > 
> > 129dbdf298d7383 Baoquan He              2023-01-13  3665  
> > 129dbdf298d7383 Baoquan He              2023-01-13  3666  		if (addr >= vaddr + size)
> > ^1da177e4c3f415 Linus Torvalds          2005-04-16  3667  			continue;
> > ^1da177e4c3f415 Linus Torvalds          2005-04-16  3668  		while (addr < vaddr) {
> > ^1da177e4c3f415 Linus Torvalds          2005-04-16  3669  			if (count == 0)
> > ^1da177e4c3f415 Linus Torvalds          2005-04-16  3670  				goto finished;
> > ^1da177e4c3f415 Linus Torvalds          2005-04-16  3671  			*buf = '\0';
> > ^1da177e4c3f415 Linus Torvalds          2005-04-16  3672  			buf++;
> > ^1da177e4c3f415 Linus Torvalds          2005-04-16  3673  			addr++;
> > ^1da177e4c3f415 Linus Torvalds          2005-04-16  3674  			count--;
> > ^1da177e4c3f415 Linus Torvalds          2005-04-16  3675  		}
> > 129dbdf298d7383 Baoquan He              2023-01-13  3676  		n = vaddr + size - addr;
> > d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3677  		if (n > count)
> > d0107eb07320b5d KAMEZAWA Hiroyuki       2009-09-21  3678  			n = count;
> > 129dbdf298d7383 Baoquan He              2023-01-13  3679  
> > 129dbdf298d7383 Baoquan He              2023-01-13  3680  		if (flags & VMAP_RAM)
> > 
> > assume VMAP_RAM is not set
> 
> OK, then vm is not NULL.
> > 
> > 129dbdf298d7383 Baoquan He              2023-01-13  3681  			vmap_ram_vread(buf, addr, n, flags);
> > 129dbdf298d7383 Baoquan He              2023-01-13 @3682  		else if (!(vm->flags & VM_IOREMAP))
> >                                                                                    ^^^^^^^^^
> > Unchecked dereference.  Should this be "flags" instead of "vm->flags"?
> 
> Thus, here in the 'else if', vm is not NULL. And in this 'else if' we
> intend to check vm->flags. I don't see an issue or risk here. Please
> help check whether I've missed anything.
> 
> Thanks
> Baoquan
>
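
To make the combination discussed above concrete, here is a hedged sketch;
whether this state is actually reachable depends on an invariant established
outside vread() (see the follow-up below):

	/* flag bits quoted above, from this series: */
	#define VMAP_RAM	0x1	/* indicates vm_map_ram area */
	#define VMAP_BLOCK	0x2	/* mark out the vmap_block sub-type */
	#define VMAP_FLAGS_MASK	0x3

	/* the hypothetical state smatch cannot rule out from vread() alone: */
	vm = NULL;
	flags = VMAP_BLOCK;			/* passes the (!vm && !flags) filter */
	...
	if (flags & VMAP_RAM)			/* false: VMAP_RAM is clear */
		vmap_ram_vread(buf, addr, n, flags);
	else if (!(vm->flags & VM_IOREMAP))	/* would dereference vm == NULL */
		aligned_vread(buf, addr, n);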
  
Matthew Wilcox Jan. 16, 2023, 7:01 p.m. UTC | #5
On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> +	spin_lock(&vb->lock);
> +	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> +		spin_unlock(&vb->lock);
> +		memset(buf, 0, count);
> +		return;
> +	}
> +	for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
> +		if (!count)
> +			break;
> +		start = vmap_block_vaddr(vb->va->va_start, rs);
> +		while (addr < start) {
> +			if (count == 0)
> +				break;
> +			*buf = '\0';
> +			buf++;
> +			addr++;
> +			count--;
> +		}
> +		/*it could start reading from the middle of used region*/
> +		offset = offset_in_page(addr);
> +		n = ((re - rs + 1) << PAGE_SHIFT) - offset;
> +		if (n > count)
> +			n = count;
> +		aligned_vread(buf, start+offset, n);

The whole vread() interface is rather suboptimal.  The only user is proc,
which is trying to copy to userspace.  But the vread() interface copies
to a kernel address, so kcore has to copy to a bounce buffer.  That makes
this spinlock work, but the price is that we can't copy to a user address
in the future.  Ideally, read_kcore() would be kcore_read_iter() and
we'd pass an iov_iter into vread().  vread() would then need to use a
mutex rather than a spinlock.

I don't think this needs to be done now, but if someone's looking for
a project ...
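
A hedged sketch of the interface change suggested above; the name vread_iter()
and its exact signature are illustrative assumptions, not part of this series:

	/* hypothetical iov_iter-based replacement (assumption): */
	long vread_iter(struct iov_iter *iter, const char *addr, size_t count);

	/* read_kcore(), reworked as a read_iter handler, could then copy
	 * straight to user memory with no bounce buffer; since such copies
	 * can fault and sleep, vread_iter() would need a mutex instead of
	 * the vmap_area spinlock, as noted above. */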
  
Lorenzo Stoakes Jan. 16, 2023, 7:48 p.m. UTC | #6
On Mon, Jan 16, 2023 at 07:01:33PM +0000, Matthew Wilcox wrote:
> On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> > +	spin_lock(&vb->lock);
> > +	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> > +		spin_unlock(&vb->lock);
> > +		memset(buf, 0, count);
> > +		return;
> > +	}
> > +	for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
> > +		if (!count)
> > +			break;
> > +		start = vmap_block_vaddr(vb->va->va_start, rs);
> > +		while (addr < start) {
> > +			if (count == 0)
> > +				break;
> > +			*buf = '\0';
> > +			buf++;
> > +			addr++;
> > +			count--;
> > +		}
> > +		/*it could start reading from the middle of used region*/
> > +		offset = offset_in_page(addr);
> > +		n = ((re - rs + 1) << PAGE_SHIFT) - offset;
> > +		if (n > count)
> > +			n = count;
> > +		aligned_vread(buf, start+offset, n);
>
> The whole vread() interface is rather suboptimal.  The only user is proc,
> which is trying to copy to userspace.  But the vread() interface copies
> to a kernel address, so kcore has to copy to a bounce buffer.  That makes
> this spinlock work, but the price is that we can't copy to a user address
> in the future.  Ideally, read_kcore() would be kcore_read_iter() and
> we'd pass an iov_iter into vread().  vread() would then need to use a
> mutex rather than a spinlock.
>
> I don't think this needs to be done now, but if someone's looking for
> a project ...

Interesting! I may take a look at this if I get the time :)
  
Baoquan He Jan. 18, 2023, 2:17 a.m. UTC | #7
On 01/16/23 at 04:08pm, Dan Carpenter wrote:
> On Sun, Jan 15, 2023 at 10:08:55PM +0800, Baoquan He wrote:
> > > f181234a5a21fd0 Chen Wandun             2021-09-02  3650  	if ((unsigned long)addr + count <= va->va_start)
> > > f181234a5a21fd0 Chen Wandun             2021-09-02  3651  		goto finished;
> > > f181234a5a21fd0 Chen Wandun             2021-09-02  3652  
> > > f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3653  	list_for_each_entry_from(va, &vmap_area_list, list) {
> > > e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3654  		if (!count)
> > > e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3655  			break;
> > > e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3656  
> > > 129dbdf298d7383 Baoquan He              2023-01-13  3657  		vm = va->vm;
> > > 129dbdf298d7383 Baoquan He              2023-01-13  3658  		flags = va->flags & VMAP_FLAGS_MASK;
> > > 129dbdf298d7383 Baoquan He              2023-01-13  3659  
> > > 129dbdf298d7383 Baoquan He              2023-01-13  3660  		if (!vm && !flags)
> > >                                                                             ^^^
> > > vm can be NULL if a flag in VMAP_FLAGS_MASK is set.
> > > 
> > > e81ce85f960c2e2 Joonsoo Kim             2013-04-29  3661  			continue;
> > 
> > Right, after the 'continue;' line, only two cases can happen when we
> > get here: (vm != NULL) is true, or (va->flags & VMAP_RAM) is true.
> >
> 
> You're saying VMAP_RAM, but strictly speaking the code is checking
> VMAP_FLAGS_MASK and not VMAP_RAM.
> 
> +#define VMAP_RAM               0x1 /* indicates vm_map_ram area*/
> +#define VMAP_BLOCK             0x2 /* mark out the vmap_block sub-type*/
> +#define VMAP_FLAGS_MASK                0x3
> 
> If we assume that vm is NULL, VMAP_BLOCK is set, and VMAP_RAM is clear,
> then it would lead to a NULL dereference.  There might be reasons why
> that combination is impossible outside the function, but we can't tell
> from the information we have here.

VMAP_BLOCK is never set alone; when needed, it is always set together
with VMAP_RAM.

> 
> Which is fine, outside information is a common reason for false
> positives with this check.  But I was just concerned about the mix of
> VMAP_FLAGS_MASK and VMAP_RAM.

Thanks, I see your point now; I will consider how to improve it.
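
A hedged sketch of the invariant relied on above, as established earlier in
this series (simplified; exact call sites and argument lists abbreviated):

	/* vm_map_ram(), for sizes too large for a vmap_block: */
	va = alloc_vmap_area(size, ..., VMAP_RAM);

	/* new_vmap_block(), for the small per-CPU allocations: */
	va = alloc_vmap_area(VMAP_BLOCK_SIZE, ..., VMAP_RAM | VMAP_BLOCK);

	/* So VMAP_BLOCK never appears without VMAP_RAM, which is exactly the
	 * outside information smatch lacks when analyzing vread() alone. */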
  
Baoquan He Jan. 19, 2023, 9:52 a.m. UTC | #8
On 01/16/23 at 12:50pm, Uladzislau Rezki wrote:
> On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> > Currently, vread() can read out vmalloc areas which are associated with
> > a vm_struct. However, this doesn't work for areas created by the
> > vm_map_ram() interface because they have no associated vm_struct, so in
> > vread() these areas are all skipped.
> > 
> > Here, add a new function vmap_ram_vread() to read out vm_map_ram areas.
> > An area created directly with the vm_map_ram() interface can be handled
> > like the other normal vmap areas with aligned_vread(), while areas which
> > are further subdivided and managed with vmap_block need to be carefully
> > read out as page-aligned small regions, with the holes zero-filled.
> > 
> > Signed-off-by: Baoquan He <bhe@redhat.com>
> > ---
> >  mm/vmalloc.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++-----
> >  1 file changed, 73 insertions(+), 7 deletions(-)
> > 
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index ab4825050b5c..13875bc41e27 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -3544,6 +3544,65 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
> >  	return copied;
> >  }
> >  
> > +static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
> > +{
> > +	char *start;
> > +	struct vmap_block *vb;
> > +	unsigned long offset;
> > +	unsigned int rs, re, n;
> > +
> > +	/*
> > +	 * If it's area created by vm_map_ram() interface directly, but
> > +	 * not further subdividing and delegating management to vmap_block,
> > +	 * handle it here.
> > +	 */
> > +	if (!(flags & VMAP_BLOCK)) {
> > +		aligned_vread(buf, addr, count);
> > +		return;
> > +	}
> > +
> > +	/*
> > +	 * Area is split into regions and tracked with vmap_block, read out
> > +	 * each region and zero fill the hole between regions.
> > +	 */
> > +	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> > +
> > +	spin_lock(&vb->lock);
> > +	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> >
> CPU-X may invoke free_vmap_block() while we take vb->lock and do
> some manipulations on a vb that might already have been freed by the
> RCU core.
>
> Should we also protect this with rcu_read_lock() here?

I just went over the vb and vbq code again, and it seems we don't need
rcu_read_lock() here. The RCU lock is needed when operating on the
vmap_block_queue->free list. I don't see a race between the vb access
here and the RCU list additions or removals on vmap_block_queue->free.
If I've missed some race window between them, please help point it out.

However, when I check free_vmap_block(), I do find a risk. As you said,
CPU-X may invoke free_vmap_block() and execute xa_erase() to remove the vb
from the vmap_blocks tree. Then vread() comes into vmap_ram_vread() and
calls xa_load(), which would return NULL. I should check the returned vb
in vmap_ram_vread().


static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
{
......
	if (!(flags & VMAP_BLOCK)) {
		aligned_vread(buf, addr, count);
		return;
	}

	/*
	 * Area is split into regions and tracked with vmap_block, read out
	 * each region and zero fill the hole between regions.
	 */
	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
	if (!vb)    <-- vb needs to be checked here to avoid accessing a vb erased from the vmap_blocks tree
		memset(buf, 0, count);
......
}
  
Baoquan He Jan. 19, 2023, 12:48 p.m. UTC | #9
On 01/19/23 at 05:52pm, Baoquan He wrote:
> On 01/16/23 at 12:50pm, Uladzislau Rezki wrote:
> > On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> > > Currently, vread() can read out vmalloc areas which are associated with
> > > a vm_struct. However, this doesn't work for areas created by the
> > > vm_map_ram() interface because they have no associated vm_struct, so in
> > > vread() these areas are all skipped.
> > > 
> > > Here, add a new function vmap_ram_vread() to read out vm_map_ram areas.
> > > An area created directly with the vm_map_ram() interface can be handled
> > > like the other normal vmap areas with aligned_vread(), while areas which
> > > are further subdivided and managed with vmap_block need to be carefully
> > > read out as page-aligned small regions, with the holes zero-filled.
> > > 
> > > Signed-off-by: Baoquan He <bhe@redhat.com>
> > > ---
> > >  mm/vmalloc.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++-----
> > >  1 file changed, 73 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > index ab4825050b5c..13875bc41e27 100644
> > > --- a/mm/vmalloc.c
> > > +++ b/mm/vmalloc.c
> > > @@ -3544,6 +3544,65 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
> > >  	return copied;
> > >  }
> > >  
> > > +static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
> > > +{
> > > +	char *start;
> > > +	struct vmap_block *vb;
> > > +	unsigned long offset;
> > > +	unsigned int rs, re, n;
> > > +
> > > +	/*
> > > +	 * If it's area created by vm_map_ram() interface directly, but
> > > +	 * not further subdividing and delegating management to vmap_block,
> > > +	 * handle it here.
> > > +	 */
> > > +	if (!(flags & VMAP_BLOCK)) {
> > > +		aligned_vread(buf, addr, count);
> > > +		return;
> > > +	}
> > > +
> > > +	/*
> > > +	 * Area is split into regions and tracked with vmap_block, read out
> > > +	 * each region and zero fill the hole between regions.
> > > +	 */
> > > +	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> > > +
> > > +	spin_lock(&vb->lock);
> > > +	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> > >
> > CPU-X may invoke free_vmap_block() while we take vb->lock and do
> > some manipulations on a vb that might already have been freed by the
> > RCU core.
> > 
> > Should we also protect this with rcu_read_lock() here?
> 
> > I just went over the vb and vbq code again, and it seems we don't need
> > rcu_read_lock() here. The RCU lock is needed when operating on the
> > vmap_block_queue->free list. I don't see a race between the vb access
> > here and the RCU list additions or removals on vmap_block_queue->free.
> > If I've missed some race window between them, please help point it out.
> 
> However, when I check free_vmap_block(), I do find a risk. As you said,

I forgot to add details about why there's no race between free_vmap_block()
and vmap_ram_vread(): we have taken vmap_area_lock at the beginning of
vread(). So, apart from the missing check of the value returned from
xa_load(), free_vmap_block() is either blocked waiting for vmap_area_lock
before calling unlink_va(), or has finished calling unlink_va() to remove
the vmap_area from the vmap_area_root tree. In either case, no race happens.

> CPU-X may invoke free_vmap_block() and execute xa_erase() to remove the vb
> from the vmap_blocks tree. Then vread() comes into vmap_ram_vread() and
> calls xa_load(), which would return NULL. I should check the returned vb
> in vmap_ram_vread().
> 
> 
> static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
> {
> ......
> 	if (!(flags & VMAP_BLOCK)) {
> 		aligned_vread(buf, addr, count);
> 		return;
> 	}
> 
> 	/*
> 	 * Area is split into regions and tracked with vmap_block, read out
> 	 * each region and zero fill the hole between regions.
> 	 */
> 	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> 	if (!vb)    <-- vb needs to be checked here to avoid accessing a vb erased from the vmap_blocks tree
> 		memset(buf, 0, count);
> ......
> }
>
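
For reference, a hedged sketch of the ordering argued above, condensed from
free_vmap_block() as of this series (details may differ):

	static void free_vmap_block(struct vmap_block *vb)
	{
		struct vmap_block *tmp;

		/* after this, xa_load() in vmap_ram_vread() returns NULL */
		tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
		BUG_ON(tmp != vb);

		/* takes vmap_area_lock to unlink the vmap_area, so this
		 * serializes against vread(), which holds that lock */
		free_vmap_area_noflush(vb->va);
		kfree_rcu(vb, rcu_head);
	}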
  
Uladzislau Rezki Jan. 20, 2023, 11:54 a.m. UTC | #10
>
> On 01/19/23 at 05:52pm, Baoquan He wrote:
> > On 01/16/23 at 12:50pm, Uladzislau Rezki wrote:
> > > On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> > > > Currently, vread() can read out vmalloc areas which are associated with
> > > > a vm_struct. However, this doesn't work for areas created by the
> > > > vm_map_ram() interface because they have no associated vm_struct, so in
> > > > vread() these areas are all skipped.
> > > >
> > > > Here, add a new function vmap_ram_vread() to read out vm_map_ram areas.
> > > > An area created directly with the vm_map_ram() interface can be handled
> > > > like the other normal vmap areas with aligned_vread(), while areas which
> > > > are further subdivided and managed with vmap_block need to be carefully
> > > > read out as page-aligned small regions, with the holes zero-filled.
> > > >
> > > > Signed-off-by: Baoquan He <bhe@redhat.com>
> > > > ---
> > > >  mm/vmalloc.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++-----
> > > >  1 file changed, 73 insertions(+), 7 deletions(-)
> > > >
> > > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > > index ab4825050b5c..13875bc41e27 100644
> > > > --- a/mm/vmalloc.c
> > > > +++ b/mm/vmalloc.c
> > > > @@ -3544,6 +3544,65 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
> > > >   return copied;
> > > >  }
> > > >
> > > > +static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
> > > > +{
> > > > + char *start;
> > > > + struct vmap_block *vb;
> > > > + unsigned long offset;
> > > > + unsigned int rs, re, n;
> > > > +
> > > > + /*
> > > > +  * If it's area created by vm_map_ram() interface directly, but
> > > > +  * not further subdividing and delegating management to vmap_block,
> > > > +  * handle it here.
> > > > +  */
> > > > + if (!(flags & VMAP_BLOCK)) {
> > > > +         aligned_vread(buf, addr, count);
> > > > +         return;
> > > > + }
> > > > +
> > > > + /*
> > > > +  * Area is split into regions and tracked with vmap_block, read out
> > > > +  * each region and zero fill the hole between regions.
> > > > +  */
> > > > + vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> > > > +
> > > > + spin_lock(&vb->lock);
> > > > + if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> > > >
> > > CPU-X may invoke free_vmap_block() while we take vb->lock and do
> > > some manipulations on a vb that might already have been freed by the
> > > RCU core.
> > >
> > > Should we also protect this with rcu_read_lock() here?
> >
> > I just went over the vb and vbq code again, and it seems we don't need
> > rcu_read_lock() here. The RCU lock is needed when operating on the
> > vmap_block_queue->free list. I don't see a race between the vb access
> > here and the RCU list additions or removals on vmap_block_queue->free.
> > If I've missed some race window between them, please help point it out.
> >
> > However, when I check free_vmap_block(), I do find a risk. As you said,
>
> I forgot to add details about why there's no race between free_vmap_block()
> and vmap_ram_vread(): we have taken vmap_area_lock at the beginning of
> vread(). So, apart from the missing check of the value returned from
> xa_load(), free_vmap_block() is either blocked waiting for vmap_area_lock
> before calling unlink_va(), or has finished calling unlink_va() to remove
> the vmap_area from the vmap_area_root tree. In either case, no race happens.
>
Agreed. xa_load()'s return value should be checked, because it can be that
there is no vmap_block associated with an address if xa_erase() was done
earlier.

--
Uladzislau Rezki
  

Patch

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ab4825050b5c..13875bc41e27 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3544,6 +3544,65 @@  static int aligned_vread(char *buf, char *addr, unsigned long count)
 	return copied;
 }
 
+static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
+{
+	char *start;
+	struct vmap_block *vb;
+	unsigned long offset;
+	unsigned int rs, re, n;
+
+	/*
+	 * If it's area created by vm_map_ram() interface directly, but
+	 * not further subdividing and delegating management to vmap_block,
+	 * handle it here.
+	 */
+	if (!(flags & VMAP_BLOCK)) {
+		aligned_vread(buf, addr, count);
+		return;
+	}
+
+	/*
+	 * Area is split into regions and tracked with vmap_block, read out
+	 * each region and zero fill the hole between regions.
+	 */
+	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
+
+	spin_lock(&vb->lock);
+	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
+		spin_unlock(&vb->lock);
+		memset(buf, 0, count);
+		return;
+	}
+	for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
+		if (!count)
+			break;
+		start = vmap_block_vaddr(vb->va->va_start, rs);
+		while (addr < start) {
+			if (count == 0)
+				break;
+			*buf = '\0';
+			buf++;
+			addr++;
+			count--;
+		}
+		/*it could start reading from the middle of used region*/
+		offset = offset_in_page(addr);
+		n = ((re - rs + 1) << PAGE_SHIFT) - offset;
+		if (n > count)
+			n = count;
+		aligned_vread(buf, start+offset, n);
+
+		buf += n;
+		addr += n;
+		count -= n;
+	}
+	spin_unlock(&vb->lock);
+
+	/* zero-fill the left dirty or free regions */
+	if (count)
+		memset(buf, 0, count);
+}
+
 /**
  * vread() - read vmalloc area in a safe way.
  * @buf:     buffer for reading data
@@ -3574,7 +3633,7 @@  long vread(char *buf, char *addr, unsigned long count)
 	struct vm_struct *vm;
 	char *vaddr, *buf_start = buf;
 	unsigned long buflen = count;
-	unsigned long n;
+	unsigned long n, size, flags;
 
 	addr = kasan_reset_tag(addr);
 
@@ -3595,12 +3654,16 @@  long vread(char *buf, char *addr, unsigned long count)
 		if (!count)
 			break;
 
-		if (!va->vm)
+		vm = va->vm;
+		flags = va->flags & VMAP_FLAGS_MASK;
+
+		if (!vm && !flags)
 			continue;
 
-		vm = va->vm;
-		vaddr = (char *) vm->addr;
-		if (addr >= vaddr + get_vm_area_size(vm))
+		vaddr = (char *) va->va_start;
+		size = vm ? get_vm_area_size(vm) : va_size(va);
+
+		if (addr >= vaddr + size)
 			continue;
 		while (addr < vaddr) {
 			if (count == 0)
@@ -3610,10 +3673,13 @@  long vread(char *buf, char *addr, unsigned long count)
 			addr++;
 			count--;
 		}
-		n = vaddr + get_vm_area_size(vm) - addr;
+		n = vaddr + size - addr;
 		if (n > count)
 			n = count;
-		if (!(vm->flags & VM_IOREMAP))
+
+		if (flags & VMAP_RAM)
+			vmap_ram_vread(buf, addr, n, flags);
+		else if (!(vm->flags & VM_IOREMAP))
 			aligned_vread(buf, addr, n);
 		else /* IOREMAP area is treated as memory hole */
 			memset(buf, 0, n);
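
Finally, a hedged sketch of how the new path gets exercised; the driver-side
names are illustrative and the kcore path is simplified:

	/* a driver creates a vm_map_ram area (no vm_struct behind it): */
	void *p = vm_map_ram(pages, nr_pages, NUMA_NO_NODE);

	/* reading /proc/kcore over that range now goes:
	 *   read_kcore() -> vread(kernel_buf, p, len)
	 *     -> vmap_ram_vread(), which copies the used regions via
	 *        aligned_vread() and zero-fills the holes, instead of
	 *        skipping the whole area as before. */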