[RFC,3/3] mm/vmalloc.c: allow vread() to read out vm_map_ram areas

Message ID 20221109033535.269229-4-bhe@redhat.com
State New
Headers
Series mm/vmalloc.c: allow vread() to read out vm_map_ram areas |

Commit Message

Baoquan He Nov. 9, 2022, 3:35 a.m. UTC
  Currently, vread() can read out vmalloc areas which is associated with
a vm_struct. While this doesn't work for areas created by vm_map_ram()
interface because it doesn't allocate a vm_struct. Then in vread(),
these areas will be skipped.

Here, add a new function vb_vread() to read out areas managed by
vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags
and handle  them respectively.

Stephen Brennan <stephen.s.brennan@oracle.com>
Signed-off-by: Baoquan He <bhe@redhat.com>
Link: https://lore.kernel.org/all/87ilk6gos2.fsf@oracle.com/T/#u
---
 mm/vmalloc.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 51 insertions(+), 6 deletions(-)
  

Comments

Stephen Brennan Nov. 10, 2022, 12:59 a.m. UTC | #1
Baoquan He <bhe@redhat.com> writes:
> Currently, vread() can read out vmalloc areas which is associated with
> a vm_struct. While this doesn't work for areas created by vm_map_ram()
> interface because it doesn't allocate a vm_struct. Then in vread(),
> these areas will be skipped.
>
> Here, add a new function vb_vread() to read out areas managed by
> vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags
> and handle  them respectively.
>
> Stephen Brennan <stephen.s.brennan@oracle.com>
> Signed-off-by: Baoquan He <bhe@redhat.com>
> Link: https://lore.kernel.org/all/87ilk6gos2.fsf@oracle.com/T/#u
> ---
>  mm/vmalloc.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 51 insertions(+), 6 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 41d82dc07e13..5a8d5659bfb0 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -3518,6 +3518,46 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
>  	return copied;
>  }
>  
> +static void vb_vread(char *buf, char *addr, int count)
> +{
> +	char *start;
> +	struct vmap_block *vb;
> +	unsigned long offset;
> +	unsigned int rs, re, n;
> +
> +	offset = ((unsigned long)addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
> +	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> +
> +	spin_lock(&vb->lock);
> +	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> +		spin_unlock(&vb->lock);
> +		memset(buf, 0, count);
> +		return;
> +	}
> +	for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
> +		if (!count)
> +			break;
> +		start = vmap_block_vaddr(vb->va->va_start, rs);
> +		if (addr < start) {
> +			if (count == 0)
> +				break;
> +			*buf = '\0';
> +			buf++;
> +			addr++;
> +			count--;
> +		}
> +		n = (re - rs + 1) << PAGE_SHIFT;
> +		if (n > count)
> +			n = count;
> +		aligned_vread(buf, start, n);
> +
> +		buf += n;
> +		addr += n;
> +		count -= n;
> +	}
> +	spin_unlock(&vb->lock);
> +}
> +
>  /**
>   * vread() - read vmalloc area in a safe way.
>   * @buf:     buffer for reading data
> @@ -3548,7 +3588,7 @@ long vread(char *buf, char *addr, unsigned long count)
>  	struct vm_struct *vm;
>  	char *vaddr, *buf_start = buf;
>  	unsigned long buflen = count;
> -	unsigned long n;
> +	unsigned long n, size;
>  
>  	addr = kasan_reset_tag(addr);
>  
> @@ -3569,12 +3609,14 @@ long vread(char *buf, char *addr, unsigned long count)
>  		if (!count)
>  			break;
>  
> -		if (!va->vm)
> +		if (!(va->flags & VMAP_RAM) && !va->vm)
>  			continue;
>  
>  		vm = va->vm;
> -		vaddr = (char *) vm->addr;
> -		if (addr >= vaddr + get_vm_area_size(vm))
> +		vaddr = (char *) va->va_start;
> +		size = vm ? get_vm_area_size(vm) : va_size(va);

Hi Baoquan,

Thanks for working on this. I tested your patches out by using drgn to
debug /proc/kcore. I have a kernel module[1] to do a vm_map_ram() call
and print the virtual address to the kernel log so I can try to read
that memory address in drgn. When I did this test, I got a panic on the
above line of code.

[  167.101113] BUG: kernel NULL pointer dereference, address: 0000000000000013
[  167.104538] #PF: supervisor read access in kernel mode
[  167.106446] #PF: error_code(0x0000) - not-present page
[  167.108474] PGD 0 P4D 0
[  167.109311] Oops: 0000 [#1] PREEMPT SMP NOPTI
[  167.111727] CPU: 3 PID: 7647 Comm: drgn Kdump: loaded Tainted: G           OE      6.1.0-rc4.bugvreadtest.el8.dev02.x86_64 #1
[  167.115076] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.5.1 06/16/2021
[  167.117348] RIP: 0010:vread+0xaf/0x210
[  167.118345] Code: 86 3e 01 00 00 48 85 db 0f 84 35 01 00 00 49 8d 47 28 48 3d 10 f8 a7 8f 0f 84 25 01 00 00 4d 89 f4 49 8b 57 38 48 85 d2 74 21 <48> 8b 42 10 f6 42 18 40 48 89 d6 49 8b 0f 48 8d b8 00 f0 ff ff 48
[  167.123776] RSP: 0018:ffffaeb380a1fb90 EFLAGS: 00010206
[  167.125669] RAX: ffff9853a1397b28 RBX: 0000000000000040 RCX: 0000000000000000
[  167.128401] RDX: 0000000000000003 RSI: 0000000000000000 RDI: 0000000000000000
[  167.130948] RBP: ffffaeb382400000 R08: 0000000000000000 R09: 0000000000000000
[  167.133372] R10: 0000000000000000 R11: 0000000000000000 R12: ffff985385877000
[  167.135397] R13: 0000000000000040 R14: ffff985385877000 R15: ffff9853a1397b00
[  167.137533] FS:  00007f71eae33b80(0000) GS:ffff9856afd80000(0000) knlGS:0000000000000000
[  167.140210] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  167.142440] CR2: 0000000000000013 CR3: 000000012048a000 CR4: 00000000003506e0
[  167.144640] Call Trace:
[  167.145494]  <TASK>
[  167.146263]  read_kcore+0x33a/0xa30
[  167.147392]  ? remove_entity_load_avg+0x2e/0x70
[  167.148425]  ? _raw_spin_unlock_irqrestore+0x11/0x60
[  167.150657]  ? __wake_up_common_lock+0x8b/0xd0
[  167.152261]  ? tty_set_termios+0x211/0x280
[  167.153397]  ? set_termios+0x16b/0x1d0
[  167.154698]  ? _raw_spin_unlock+0xe/0x40
[  167.155737]  ? wp_page_reuse+0x60/0x80
[  167.157138]  ? do_wp_page+0x169/0x3a0
[  167.158752]  ? pmd_pfn+0x9/0x50
[  167.159645]  ? __handle_mm_fault+0x3b0/0x690
[  167.160837]  ? inode_security+0x22/0x60
[  167.161761]  proc_reg_read+0x5a/0xb0
[  167.162777]  vfs_read+0xa7/0x320
[  167.163512]  ? handle_mm_fault+0xb6/0x2c0
[  167.164400]  __x64_sys_pread64+0x9c/0xd0
[  167.165763]  do_syscall_64+0x3f/0xa0
[  167.167610]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[  167.169951] RIP: 0033:0x7f71e9c123d7

I debugged the resulting core dump and found the reason:

>>> stack_trace = prog.crashed_thread().stack_trace()
>>> stack_trace
#0  crash_setup_regs (./arch/x86/include/asm/kexec.h:95:3)
#1  __crash_kexec (kernel/kexec_core.c:974:4)
#2  panic (kernel/panic.c:330:3)
#3  oops_end (arch/x86/kernel/dumpstack.c:379:3)
#4  page_fault_oops (arch/x86/mm/fault.c:729:2)
#5  handle_page_fault (arch/x86/mm/fault.c:1519:3)
#6  exc_page_fault (arch/x86/mm/fault.c:1575:2)
#7  asm_exc_page_fault+0x26/0x2b (./arch/x86/include/asm/idtentry.h:570)
#8  get_vm_area_size (./include/linux/vmalloc.h:203:14)
#9  vread (mm/vmalloc.c:3617:15)
#10 read_kcore (fs/proc/kcore.c:510:4)
#11 pde_read (fs/proc/inode.c:316:10)
#12 proc_reg_read (fs/proc/inode.c:328:8)
#13 vfs_read (fs/read_write.c:468:9)
#14 ksys_pread64 (fs/read_write.c:665:10)
#15 __do_sys_pread64 (fs/read_write.c:675:9)
#16 __se_sys_pread64 (fs/read_write.c:672:1)
#17 __x64_sys_pread64 (fs/read_write.c:672:1)
#18 do_syscall_x64 (arch/x86/entry/common.c:50:14)
#19 do_syscall_64 (arch/x86/entry/common.c:80:7)
#20 entry_SYSCALL_64+0x9f/0x19b (arch/x86/entry/entry_64.S:120)
#21 0x7f71e9c123d7
>>> stack_trace[9]["va"]
*(struct vmap_area *)0xffff9853a1397b00 = {
        .va_start = (unsigned long)18446654684740452352,
        .va_end = (unsigned long)18446654684741500928,
        .rb_node = (struct rb_node){
                .__rb_parent_color = (unsigned long)18446630083335569168,
                .rb_right = (struct rb_node *)0x0,
                .rb_left = (struct rb_node *)0x0,
        },
        .list = (struct list_head){
                .next = (struct list_head *)0xffff98538c403f28,
                .prev = (struct list_head *)0xffff98538c54e1e8,
        },
        .subtree_max_size = (unsigned long)3,
        .vm = (struct vm_struct *)0x3,
        .flags = (unsigned long)3,
}

Since flags is in a union, it shadows "vm" and causes the condition to
be true, and then get_vm_area_size() tries to follow the pointer defined
by flags. I'm not sure if the fix is to have flags be a separate field
inside vmap_area, or to have more careful handling in the vread path.

Thanks,
Stephen

> +
> +		if (addr >= vaddr + size)
>  			continue;
>  		while (addr < vaddr) {
>  			if (count == 0)
> @@ -3584,10 +3626,13 @@ long vread(char *buf, char *addr, unsigned long count)
>  			addr++;
>  			count--;
>  		}
> -		n = vaddr + get_vm_area_size(vm) - addr;
> +		n = vaddr + size - addr;
>  		if (n > count)
>  			n = count;
> -		if (!(vm->flags & VM_IOREMAP))
> +
> +		if ((va->flags & (VMAP_RAM|VMAP_BLOCK)) == (VMAP_RAM|VMAP_BLOCK))
> +			vb_vread(buf, addr, n);
> +		else if ((va->flags & VMAP_RAM) || !(vm->flags & VM_IOREMAP))
>  			aligned_vread(buf, addr, n);
>  		else /* IOREMAP area is treated as memory hole */
>  			memset(buf, 0, n);
> -- 
> 2.34.1
  
Baoquan He Nov. 10, 2022, 10:23 a.m. UTC | #2
On 11/09/22 at 04:59pm, Stephen Brennan wrote:
......  
> > @@ -3569,12 +3609,14 @@ long vread(char *buf, char *addr, unsigned long count)
> >  		if (!count)
> >  			break;
> >  
> > -		if (!va->vm)
> > +		if (!(va->flags & VMAP_RAM) && !va->vm)
> >  			continue;
> >  
> >  		vm = va->vm;
> > -		vaddr = (char *) vm->addr;
> > -		if (addr >= vaddr + get_vm_area_size(vm))
> > +		vaddr = (char *) va->va_start;
> > +		size = vm ? get_vm_area_size(vm) : va_size(va);
> 
> Hi Baoquan,
> 
> Thanks for working on this. I tested your patches out by using drgn to
> debug /proc/kcore. I have a kernel module[1] to do a vm_map_ram() call
> and print the virtual address to the kernel log so I can try to read
> that memory address in drgn. When I did this test, I got a panic on the
> above line of code.
......
> Since flags is in a union, it shadows "vm" and causes the condition to
> be true, and then get_vm_area_size() tries to follow the pointer defined
> by flags. I'm not sure if the fix is to have flags be a separate field
> inside vmap_area, or to have more careful handling in the vread path.

Sorry, my bad. Thanks for testing this and catching the error, Stephen.

About the fix, both way are fine to me. I made a draft fix based on the
current patchset. To me, adding flags in a separate field makes code
easier, but cost extra memory. I will see what other people say about
this, firstly if the solution is acceptable, then reusing the union
field or adding anohter flags.

Could you try below code to see if it works?

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5a8d5659bfb0..78cae59170d8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1890,6 +1890,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
 
 #define VMAP_RAM	0x1
 #define VMAP_BLOCK	0x2
+#define VMAP_FLAGS_MASK	0x3
 
 struct vmap_block_queue {
 	spinlock_t lock;
@@ -3588,7 +3589,7 @@ long vread(char *buf, char *addr, unsigned long count)
 	struct vm_struct *vm;
 	char *vaddr, *buf_start = buf;
 	unsigned long buflen = count;
-	unsigned long n, size;
+	unsigned long n, size, flags;
 
 	addr = kasan_reset_tag(addr);
 
@@ -3609,12 +3610,14 @@ long vread(char *buf, char *addr, unsigned long count)
 		if (!count)
 			break;
 
-		if (!(va->flags & VMAP_RAM) && !va->vm)
+		if (!va->vm)
 			continue;
 
+		flags = va->flags & VMAP_FLAGS_MASK;
 		vm = va->vm;
+
 		vaddr = (char *) va->va_start;
-		size = vm ? get_vm_area_size(vm) : va_size(va);
+		size = flags ? va_size(va) : get_vm_area_size(vm);
 
 		if (addr >= vaddr + size)
 			continue;
@@ -3630,9 +3633,9 @@ long vread(char *buf, char *addr, unsigned long count)
 		if (n > count)
 			n = count;
 
-		if ((va->flags & (VMAP_RAM|VMAP_BLOCK)) == (VMAP_RAM|VMAP_BLOCK))
+		if ((flags & (VMAP_RAM|VMAP_BLOCK)) == (VMAP_RAM|VMAP_BLOCK))
 			vb_vread(buf, addr, n);
-		else if ((va->flags & VMAP_RAM) || !(vm->flags & VM_IOREMAP))
+		else if ((flags & VMAP_RAM) || !(vm->flags & VM_IOREMAP))
 			aligned_vread(buf, addr, n);
 		else /* IOREMAP area is treated as memory hole */
 			memset(buf, 0, n);
  
Stephen Brennan Nov. 10, 2022, 6:48 p.m. UTC | #3
Baoquan He <bhe@redhat.com> writes:

> On 11/09/22 at 04:59pm, Stephen Brennan wrote:
> ......  
>> > @@ -3569,12 +3609,14 @@ long vread(char *buf, char *addr, unsigned long count)
>> >  		if (!count)
>> >  			break;
>> >  
>> > -		if (!va->vm)
>> > +		if (!(va->flags & VMAP_RAM) && !va->vm)
>> >  			continue;
>> >  
>> >  		vm = va->vm;
>> > -		vaddr = (char *) vm->addr;
>> > -		if (addr >= vaddr + get_vm_area_size(vm))
>> > +		vaddr = (char *) va->va_start;
>> > +		size = vm ? get_vm_area_size(vm) : va_size(va);
>> 
>> Hi Baoquan,
>> 
>> Thanks for working on this. I tested your patches out by using drgn to
>> debug /proc/kcore. I have a kernel module[1] to do a vm_map_ram() call
>> and print the virtual address to the kernel log so I can try to read
>> that memory address in drgn. When I did this test, I got a panic on the
>> above line of code.
> ......
>> Since flags is in a union, it shadows "vm" and causes the condition to
>> be true, and then get_vm_area_size() tries to follow the pointer defined
>> by flags. I'm not sure if the fix is to have flags be a separate field
>> inside vmap_area, or to have more careful handling in the vread path.
>
> Sorry, my bad. Thanks for testing this and catching the error, Stephen.
>
> About the fix, both way are fine to me. I made a draft fix based on the
> current patchset. To me, adding flags in a separate field makes code
> easier, but cost extra memory. I will see what other people say about
> this, firstly if the solution is acceptable, then reusing the union
> field or adding anohter flags.
>
> Could you try below code to see if it works?

I tried the patch below and it worked for me: reading from vm_map_ram()
regions in drgn was fine, gave me the correct values, and I also tested
reads which overlapped the beginning and end of the region.

That said (and I don't know the vmalloc code well at all), I wonder
whether you can be confident that the lower 2 bits of the va->vm pointer
are always clear? It looks like it comes from kmalloc, and so it should
be aligned, but can slab red zones mess up that alignment?

I also tested out this patch on top of yours, to be a bit more cautious.
I think we can be confident that the remaining bits are zero when used
as flags, and non-zero when used as a pointer, so you can test them to
avoid any overlap. But it's probably too cautious.

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 78cae59170d8..911974f32b23 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3613,7 +3613,7 @@ long vread(char *buf, char *addr, unsigned long count)
                if (!va->vm)
                        continue;

-               flags = va->flags & VMAP_FLAGS_MASK;
+               flags = (va->flags & ~VMAP_FLAGS_MASK) ? 0 : (va->flags & VMAP_FLAGS_MASK);
                vm = va->vm;

                vaddr = (char *) va->va_start;
  
Baoquan He Nov. 14, 2022, 10:06 a.m. UTC | #4
On 11/10/22 at 10:48am, Stephen Brennan wrote:
> Baoquan He <bhe@redhat.com> writes:
> 
> > On 11/09/22 at 04:59pm, Stephen Brennan wrote:
> > ......  
> >> > @@ -3569,12 +3609,14 @@ long vread(char *buf, char *addr, unsigned long count)
> >> >  		if (!count)
> >> >  			break;
> >> >  
> >> > -		if (!va->vm)
> >> > +		if (!(va->flags & VMAP_RAM) && !va->vm)
> >> >  			continue;
> >> >  
> >> >  		vm = va->vm;
> >> > -		vaddr = (char *) vm->addr;
> >> > -		if (addr >= vaddr + get_vm_area_size(vm))
> >> > +		vaddr = (char *) va->va_start;
> >> > +		size = vm ? get_vm_area_size(vm) : va_size(va);
> >> 
> >> Hi Baoquan,
> >> 
> >> Thanks for working on this. I tested your patches out by using drgn to
> >> debug /proc/kcore. I have a kernel module[1] to do a vm_map_ram() call
> >> and print the virtual address to the kernel log so I can try to read
> >> that memory address in drgn. When I did this test, I got a panic on the
> >> above line of code.
> > ......
> >> Since flags is in a union, it shadows "vm" and causes the condition to
> >> be true, and then get_vm_area_size() tries to follow the pointer defined
> >> by flags. I'm not sure if the fix is to have flags be a separate field
> >> inside vmap_area, or to have more careful handling in the vread path.
> >
> > Sorry, my bad. Thanks for testing this and catching the error, Stephen.
> >
> > About the fix, both way are fine to me. I made a draft fix based on the
> > current patchset. To me, adding flags in a separate field makes code
> > easier, but cost extra memory. I will see what other people say about
> > this, firstly if the solution is acceptable, then reusing the union
> > field or adding anohter flags.
> >
> > Could you try below code to see if it works?
> 
> I tried the patch below and it worked for me: reading from vm_map_ram()
> regions in drgn was fine, gave me the correct values, and I also tested
> reads which overlapped the beginning and end of the region.

Thanks a lot for the testing.

> 
> That said (and I don't know the vmalloc code well at all), I wonder
> whether you can be confident that the lower 2 bits of the va->vm pointer
> are always clear? It looks like it comes from kmalloc, and so it should
> be aligned, but can slab red zones mess up that alignment?

Hmm, it should be OK. I am also worried about the other places of va->vm
checking. I will check code again to see if there's risk in the case you
mentioned. I may change to add another ->flags field into vmap_area in
v2 post.

> 
> I also tested out this patch on top of yours, to be a bit more cautious.
> I think we can be confident that the remaining bits are zero when used
> as flags, and non-zero when used as a pointer, so you can test them to
> avoid any overlap. But it's probably too cautious.
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 78cae59170d8..911974f32b23 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -3613,7 +3613,7 @@ long vread(char *buf, char *addr, unsigned long count)
>                 if (!va->vm)
>                         continue;
> 
> -               flags = va->flags & VMAP_FLAGS_MASK;
> +               flags = (va->flags & ~VMAP_FLAGS_MASK) ? 0 : (va->flags & VMAP_FLAGS_MASK);
>                 vm = va->vm;
> 
>                 vaddr = (char *) va->va_start;
>
  
Matthew Wilcox Nov. 18, 2022, 8:01 a.m. UTC | #5
On Wed, Nov 09, 2022 at 11:35:34AM +0800, Baoquan He wrote:
> Currently, vread() can read out vmalloc areas which is associated with
> a vm_struct. While this doesn't work for areas created by vm_map_ram()
> interface because it doesn't allocate a vm_struct. Then in vread(),
> these areas will be skipped.
> 
> Here, add a new function vb_vread() to read out areas managed by
> vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags
> and handle  them respectively.

i don't understand how this deals with the original problem identified,
that the vread() can race with an unmap.
  
Baoquan He Nov. 23, 2022, 3:38 a.m. UTC | #6
On 11/18/22 at 08:01am, Matthew Wilcox wrote:
> On Wed, Nov 09, 2022 at 11:35:34AM +0800, Baoquan He wrote:
> > Currently, vread() can read out vmalloc areas which is associated with
> > a vm_struct. While this doesn't work for areas created by vm_map_ram()
> > interface because it doesn't allocate a vm_struct. Then in vread(),
> > these areas will be skipped.
> > 
> > Here, add a new function vb_vread() to read out areas managed by
> > vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags
> > and handle  them respectively.
> 
> i don't understand how this deals with the original problem identified,
> that the vread() can race with an unmap.

Thanks for checking.

I wrote a paragraph, then realized I misunderstood your concern. You are
saying the comment from Uladzislau about my original draft patch, right?
Paste the link of Uladzislau's reply here in case other people want to
know the background:
https://lore.kernel.org/all/Y1uKSmgURNEa3nQu@pc636/T/#u

When Stephen raised the issue originally, I posted a draft patch as
below trying to fix it:
https://lore.kernel.org/all/Y1pHTj2wuhoWmeV3@MiWiFi-R3L-srv/T/#u

In above draft patch, I tried to differentiate normal vmalloc area and
vm_map_ram area with the fact that vmalloc area is associated with a
vm_struct, while vm_map_ram area has ->vm as NULL. And I thought their
only difference is normal vmalloc area has guard page, so its size need
consider the guard page; while vm_map_ram area has no guard page, only
consider its own actual size. Uladzislau's comment reminded me I was
wrong. And the things we need handle are beyond that.

Currently there are three kinds of vmalloc areas in kernel:

1) normal vmalloc areas, associated with a vm_struct, this is allocated 
in __get_vm_area_node(). When freeing, it set ->vm to NULL
firstly, then unmap and free vmap_area, see remove_vm_area().

2) areas allocated via vm_map_ram() and size is larger than
VMAP_MAX_ALLOC. The entire area is not associated with vm_struct, and
freed at one time in vm_unmap_ram() with unmapping and freeing vmap_area;

3) areas allocated via vm_map_ram(), then delegate to vb_alloc() when
size <= VMAP_MAX_ALLOC. Its vmap_area is allocated at one time with
VMAP_BLOCK_SIZE big, and split and used later through vb_alloc(), freed
via vb_free(). When the entire area is dirty, it will be unmapped and
freed.

Based on above facts, we need add flags to differentiate the normal
vmalloc area from the vm_map_ram area, namely area 1) and 2). And we
also need flags to differentiate the area 2) and 3). Because area 3) are
pieces of a entire vmap_area, vb_free() will unmap the piece of area and
set the part dirty, but the entire vmap_area will kept there. So when we
will read area 3), we need take vb->lock and only read out the still
mapped part, but not dirty or free part of the vmap_area.

Thanks
Baoquan
  
Matthew Wilcox Nov. 23, 2022, 1:24 p.m. UTC | #7
On Wed, Nov 23, 2022 at 11:38:54AM +0800, Baoquan He wrote:
> On 11/18/22 at 08:01am, Matthew Wilcox wrote:
> > On Wed, Nov 09, 2022 at 11:35:34AM +0800, Baoquan He wrote:
> > > Currently, vread() can read out vmalloc areas which is associated with
> > > a vm_struct. While this doesn't work for areas created by vm_map_ram()
> > > interface because it doesn't allocate a vm_struct. Then in vread(),
> > > these areas will be skipped.
> > > 
> > > Here, add a new function vb_vread() to read out areas managed by
> > > vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags
> > > and handle  them respectively.
> > 
> > i don't understand how this deals with the original problem identified,
> > that the vread() can race with an unmap.
> 
> Thanks for checking.
> 
> I wrote a paragraph, then realized I misunderstood your concern. You are
> saying the comment from Uladzislau about my original draft patch, right?
> Paste the link of Uladzislau's reply here in case other people want to
> know the background:
> https://lore.kernel.org/all/Y1uKSmgURNEa3nQu@pc636/T/#u
> 
> When Stephen raised the issue originally, I posted a draft patch as
> below trying to fix it:
> https://lore.kernel.org/all/Y1pHTj2wuhoWmeV3@MiWiFi-R3L-srv/T/#u
> 
> In above draft patch, I tried to differentiate normal vmalloc area and
> vm_map_ram area with the fact that vmalloc area is associated with a
> vm_struct, while vm_map_ram area has ->vm as NULL. And I thought their
> only difference is normal vmalloc area has guard page, so its size need
> consider the guard page; while vm_map_ram area has no guard page, only
> consider its own actual size. Uladzislau's comment reminded me I was
> wrong. And the things we need handle are beyond that.
> 
> Currently there are three kinds of vmalloc areas in kernel:
> 
> 1) normal vmalloc areas, associated with a vm_struct, this is allocated 
> in __get_vm_area_node(). When freeing, it set ->vm to NULL
> firstly, then unmap and free vmap_area, see remove_vm_area().
> 
> 2) areas allocated via vm_map_ram() and size is larger than
> VMAP_MAX_ALLOC. The entire area is not associated with vm_struct, and
> freed at one time in vm_unmap_ram() with unmapping and freeing vmap_area;
> 
> 3) areas allocated via vm_map_ram(), then delegate to vb_alloc() when
> size <= VMAP_MAX_ALLOC. Its vmap_area is allocated at one time with
> VMAP_BLOCK_SIZE big, and split and used later through vb_alloc(), freed
> via vb_free(). When the entire area is dirty, it will be unmapped and
> freed.
> 
> Based on above facts, we need add flags to differentiate the normal
> vmalloc area from the vm_map_ram area, namely area 1) and 2). And we
> also need flags to differentiate the area 2) and 3). Because area 3) are
> pieces of a entire vmap_area, vb_free() will unmap the piece of area and
> set the part dirty, but the entire vmap_area will kept there. So when we
> will read area 3), we need take vb->lock and only read out the still
> mapped part, but not dirty or free part of the vmap_area.

I don't think you understand the problem.

Task A:			Task B:		Task C:
p = vm_map_ram()
			vread(p);
			... preempted ...
vm_unmap_ram(p);
					q = vm_map_ram();
			vread continues

If C has reused the address space allocated by A, task B is now reading
the memory mapped by task C instead of task A.  If it hasn't, it's now
trying to read from unmapped, and quite possibly freed memory.  Which
might have been allocated by task D.

Unless there's some kind of reference count so that B knows that both
the address range and the underlying memory can't be freed while it's
in the middle of the vread(), this is just unsafe.
  
Baoquan He Nov. 24, 2022, 9:52 a.m. UTC | #8
On 11/23/22 at 01:24pm, Matthew Wilcox wrote:
> On Wed, Nov 23, 2022 at 11:38:54AM +0800, Baoquan He wrote:
> > On 11/18/22 at 08:01am, Matthew Wilcox wrote:
> > > On Wed, Nov 09, 2022 at 11:35:34AM +0800, Baoquan He wrote:
> > > > Currently, vread() can read out vmalloc areas which is associated with
> > > > a vm_struct. While this doesn't work for areas created by vm_map_ram()
> > > > interface because it doesn't allocate a vm_struct. Then in vread(),
> > > > these areas will be skipped.
> > > > 
> > > > Here, add a new function vb_vread() to read out areas managed by
> > > > vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags
> > > > and handle  them respectively.
> > > 
> > > i don't understand how this deals with the original problem identified,
> > > that the vread() can race with an unmap.
> > 
> > Thanks for checking.
> > 
> > I wrote a paragraph, then realized I misunderstood your concern. You are
> > saying the comment from Uladzislau about my original draft patch, right?
> > Paste the link of Uladzislau's reply here in case other people want to
> > know the background:
> > https://lore.kernel.org/all/Y1uKSmgURNEa3nQu@pc636/T/#u
> > 
> > When Stephen raised the issue originally, I posted a draft patch as
> > below trying to fix it:
> > https://lore.kernel.org/all/Y1pHTj2wuhoWmeV3@MiWiFi-R3L-srv/T/#u
> > 
> > In above draft patch, I tried to differentiate normal vmalloc area and
> > vm_map_ram area with the fact that vmalloc area is associated with a
> > vm_struct, while vm_map_ram area has ->vm as NULL. And I thought their
> > only difference is normal vmalloc area has guard page, so its size need
> > consider the guard page; while vm_map_ram area has no guard page, only
> > consider its own actual size. Uladzislau's comment reminded me I was
> > wrong. And the things we need handle are beyond that.
> > 
> > Currently there are three kinds of vmalloc areas in kernel:
> > 
> > 1) normal vmalloc areas, associated with a vm_struct, this is allocated 
> > in __get_vm_area_node(). When freeing, it set ->vm to NULL
> > firstly, then unmap and free vmap_area, see remove_vm_area().
> > 
> > 2) areas allocated via vm_map_ram() and size is larger than
> > VMAP_MAX_ALLOC. The entire area is not associated with vm_struct, and
> > freed at one time in vm_unmap_ram() with unmapping and freeing vmap_area;
> > 
> > 3) areas allocated via vm_map_ram(), then delegate to vb_alloc() when
> > size <= VMAP_MAX_ALLOC. Its vmap_area is allocated at one time with
> > VMAP_BLOCK_SIZE big, and split and used later through vb_alloc(), freed
> > via vb_free(). When the entire area is dirty, it will be unmapped and
> > freed.
> > 
> > Based on above facts, we need add flags to differentiate the normal
> > vmalloc area from the vm_map_ram area, namely area 1) and 2). And we
> > also need flags to differentiate the area 2) and 3). Because area 3) are
> > pieces of a entire vmap_area, vb_free() will unmap the piece of area and
> > set the part dirty, but the entire vmap_area will kept there. So when we
> > will read area 3), we need take vb->lock and only read out the still
> > mapped part, but not dirty or free part of the vmap_area.
> 
> I don't think you understand the problem.
> 
> Task A:			Task B:		Task C:
> p = vm_map_ram()
> 			vread(p);
> 			... preempted ...
> vm_unmap_ram(p);
> 					q = vm_map_ram();
> 			vread continues



> 
> If C has reused the address space allocated by A, task B is now reading
> the memory mapped by task C instead of task A.  If it hasn't, it's now
> trying to read from unmapped, and quite possibly freed memory.  Which
> might have been allocated by task D.

Hmm, it may not be like that.

Firstly, I would remind that vread() takes vmap_area_lock during the
whole reading process. Before this patchset, the vread() and other area
manipulation will have below status:
1) __get_vm_area_node() could only finish insert_vmap_area(), then free
the vmap_area_lock, then warting;
2) __get_vm_area_node() finishs setup_vmalloc_vm()
  2.1) doing mapping but not finished;
  2.2) clear_vm_uninitialized_flag() is called after mapping is done;
3) remove_vm_area() is called to set -> = NULL, then free vmap_area_lock;

Task A:			   Task B:		     Task C:
p = __get_vm_area_node()
remove_vm_area(p);
			   vread(p);

			   vread end 
					     q = __get_vm_area_node();

So, as you can see, the checking "if (!va->vm)" in vread() will filter
out vmap_area:
a) areas if only insert_vmap_area() is called, but ->vm is still NULL; 
b) areas if remove_vm_area() is called to clear ->vm to NULL;
c) areas created through vm_map_ram() since its ->vm is always NULL;

Means vread() will read out vmap_area:
d) areas if setup_vmalloc_vm() is called;
  1) mapping is done on areas, e.g clear_vm_uninitialized_flag() is
       called;
  2) mapping is being handled, just after returning from setup_vmalloc_vm();


******* after this patchset applied:

Task A:			Task B:		Task C:
p = vm_map_ram()
vm_unmap_ram(p);
			vread(p);
                         vb_vread()
			vread end 

					q = vm_map_ram();

With this patchset applied, other than normal areas, for the
vm_map_ram() areas:
1) In vm_map_ram(), set vmap_area->flags = VMAP_RAM when vmap_area_lock
   is taken; In vm_unmap_ram(), clear it wiht "va->flags &= ~VMAP_RAM"
   when vmap_area_lock is taken;
2) If vmap_block, set va->flags = VMAP_RAM|VMAP_BLOCK; And set
   vmap_block->used_map to track the used region, filter out the dirty
   and free region;
3) In vb_vread(), we take vb->lock to avoid reading out dirty regions.

Please help point out what is wrong or I missed.

> 
> Unless there's some kind of reference count so that B knows that both
> the address range and the underlying memory can't be freed while it's
> in the middle of the vread(), this is just unsafe.
>
  
Uladzislau Rezki Nov. 30, 2022, 1:06 p.m. UTC | #9
On Thu, Nov 24, 2022 at 05:52:34PM +0800, Baoquan He wrote:
> On 11/23/22 at 01:24pm, Matthew Wilcox wrote:
> > On Wed, Nov 23, 2022 at 11:38:54AM +0800, Baoquan He wrote:
> > > On 11/18/22 at 08:01am, Matthew Wilcox wrote:
> > > > On Wed, Nov 09, 2022 at 11:35:34AM +0800, Baoquan He wrote:
> > > > > Currently, vread() can read out vmalloc areas which is associated with
> > > > > a vm_struct. While this doesn't work for areas created by vm_map_ram()
> > > > > interface because it doesn't allocate a vm_struct. Then in vread(),
> > > > > these areas will be skipped.
> > > > > 
> > > > > Here, add a new function vb_vread() to read out areas managed by
> > > > > vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags
> > > > > and handle  them respectively.
> > > > 
> > > > i don't understand how this deals with the original problem identified,
> > > > that the vread() can race with an unmap.
> > > 
> > > Thanks for checking.
> > > 
> > > I wrote a paragraph, then realized I misunderstood your concern. You are
> > > saying the comment from Uladzislau about my original draft patch, right?
> > > Paste the link of Uladzislau's reply here in case other people want to
> > > know the background:
> > > https://lore.kernel.org/all/Y1uKSmgURNEa3nQu@pc636/T/#u
> > > 
> > > When Stephen raised the issue originally, I posted a draft patch as
> > > below trying to fix it:
> > > https://lore.kernel.org/all/Y1pHTj2wuhoWmeV3@MiWiFi-R3L-srv/T/#u
> > > 
> > > In above draft patch, I tried to differentiate normal vmalloc area and
> > > vm_map_ram area with the fact that vmalloc area is associated with a
> > > vm_struct, while vm_map_ram area has ->vm as NULL. And I thought their
> > > only difference is normal vmalloc area has guard page, so its size need
> > > consider the guard page; while vm_map_ram area has no guard page, only
> > > consider its own actual size. Uladzislau's comment reminded me I was
> > > wrong. And the things we need handle are beyond that.
> > > 
> > > Currently there are three kinds of vmalloc areas in kernel:
> > > 
> > > 1) normal vmalloc areas, associated with a vm_struct, this is allocated 
> > > in __get_vm_area_node(). When freeing, it set ->vm to NULL
> > > firstly, then unmap and free vmap_area, see remove_vm_area().
> > > 
> > > 2) areas allocated via vm_map_ram() and size is larger than
> > > VMAP_MAX_ALLOC. The entire area is not associated with vm_struct, and
> > > freed at one time in vm_unmap_ram() with unmapping and freeing vmap_area;
> > > 
> > > 3) areas allocated via vm_map_ram(), then delegate to vb_alloc() when
> > > size <= VMAP_MAX_ALLOC. Its vmap_area is allocated at one time with
> > > VMAP_BLOCK_SIZE big, and split and used later through vb_alloc(), freed
> > > via vb_free(). When the entire area is dirty, it will be unmapped and
> > > freed.
> > > 
> > > Based on above facts, we need add flags to differentiate the normal
> > > vmalloc area from the vm_map_ram area, namely area 1) and 2). And we
> > > also need flags to differentiate the area 2) and 3). Because area 3) are
> > > pieces of a entire vmap_area, vb_free() will unmap the piece of area and
> > > set the part dirty, but the entire vmap_area will kept there. So when we
> > > will read area 3), we need take vb->lock and only read out the still
> > > mapped part, but not dirty or free part of the vmap_area.
> > 
> > I don't think you understand the problem.
> > 
> > Task A:			Task B:		Task C:
> > p = vm_map_ram()
> > 			vread(p);
> > 			... preempted ...
> > vm_unmap_ram(p);
> > 					q = vm_map_ram();
> > 			vread continues
> 
> 
> 
> > 
> > If C has reused the address space allocated by A, task B is now reading
> > the memory mapped by task C instead of task A.  If it hasn't, it's now
> > trying to read from unmapped, and quite possibly freed memory.  Which
> > might have been allocated by task D.
> 
> Hmm, it may not be like that.
> 
> Firstly, I would remind that vread() takes vmap_area_lock during the
> whole reading process. Before this patchset, the vread() and other area
> manipulation will have below status:
> 1) __get_vm_area_node() could only finish insert_vmap_area(), then free
> the vmap_area_lock, then warting;
> 2) __get_vm_area_node() finishs setup_vmalloc_vm()
>   2.1) doing mapping but not finished;
>   2.2) clear_vm_uninitialized_flag() is called after mapping is done;
> 3) remove_vm_area() is called to set -> = NULL, then free vmap_area_lock;
> 
> Task A:			   Task B:		     Task C:
> p = __get_vm_area_node()
> remove_vm_area(p);
> 			   vread(p);
> 
> 			   vread end 
> 					     q = __get_vm_area_node();
> 
> So, as you can see, the checking "if (!va->vm)" in vread() will filter
> out vmap_area:
> a) areas if only insert_vmap_area() is called, but ->vm is still NULL; 
> b) areas if remove_vm_area() is called to clear ->vm to NULL;
> c) areas created through vm_map_ram() since its ->vm is always NULL;
> 
> Means vread() will read out vmap_area:
> d) areas if setup_vmalloc_vm() is called;
>   1) mapping is done on areas, e.g clear_vm_uninitialized_flag() is
>        called;
>   2) mapping is being handled, just after returning from setup_vmalloc_vm();
> 
> 
> ******* after this patchset applied:
> 
> Task A:			Task B:		Task C:
> p = vm_map_ram()
> vm_unmap_ram(p);
> 			vread(p);
>                          vb_vread()
> 			vread end 
> 
> 					q = vm_map_ram();
> 
> With this patchset applied, other than normal areas, for the
> vm_map_ram() areas:
> 1) In vm_map_ram(), set vmap_area->flags = VMAP_RAM when vmap_area_lock
>    is taken; In vm_unmap_ram(), clear it wiht "va->flags &= ~VMAP_RAM"
>    when vmap_area_lock is taken;
> 2) If vmap_block, set va->flags = VMAP_RAM|VMAP_BLOCK; And set
>    vmap_block->used_map to track the used region, filter out the dirty
>    and free region;
> 3) In vb_vread(), we take vb->lock to avoid reading out dirty regions.
> 
> Please help point out what is wrong or I missed.
> 
One thing is we still can read-out un-mapped pages, i.e. a text instead:

<snip>
static void vb_free(unsigned long addr, unsigned long size)
{
	unsigned long offset;
	unsigned int order;
	struct vmap_block *vb;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

	flush_cache_vunmap(addr, addr + size);

	order = get_order(size);
	offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
	vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));

	vunmap_range_noflush(addr, addr + size);

	if (debug_pagealloc_enabled_static())
		flush_tlb_kernel_range(addr, addr + size);

	spin_lock(&vb->lock);
...
<snip>

or am i missing something? Is it a problem? It might be. Another thing
it would be good if you upload a new patchset so it is easier to review
it.

Thanks!

--
Uladzislau Rezki
  
Baoquan He Dec. 1, 2022, 4:46 a.m. UTC | #10
On 11/30/22 at 02:06pm, Uladzislau Rezki wrote:
> On Thu, Nov 24, 2022 at 05:52:34PM +0800, Baoquan He wrote:
......
> > > I don't think you understand the problem.
> > > 
> > > Task A:			Task B:		Task C:
> > > p = vm_map_ram()
> > > 			vread(p);
> > > 			... preempted ...
> > > vm_unmap_ram(p);
> > > 					q = vm_map_ram();
> > > 			vread continues
> > 
> > 
> > 
> > > 
> > > If C has reused the address space allocated by A, task B is now reading
> > > the memory mapped by task C instead of task A.  If it hasn't, it's now
> > > trying to read from unmapped, and quite possibly freed memory.  Which
> > > might have been allocated by task D.
> > 
> > Hmm, it may not be like that.
> > 
> > Firstly, I would remind that vread() takes vmap_area_lock during the
> > whole reading process. Before this patchset, the vread() and other area
> > manipulation will have below status:
> > 1) __get_vm_area_node() could only finish insert_vmap_area(), then free
> > the vmap_area_lock, then warting;
> > 2) __get_vm_area_node() finishs setup_vmalloc_vm()
> >   2.1) doing mapping but not finished;
> >   2.2) clear_vm_uninitialized_flag() is called after mapping is done;
> > 3) remove_vm_area() is called to set -> = NULL, then free vmap_area_lock;
> > 
> > Task A:			   Task B:		     Task C:
> > p = __get_vm_area_node()
> > remove_vm_area(p);
> > 			   vread(p);
> > 
> > 			   vread end 
> > 					     q = __get_vm_area_node();
> > 
> > So, as you can see, the checking "if (!va->vm)" in vread() will filter
> > out vmap_area:
> > a) areas if only insert_vmap_area() is called, but ->vm is still NULL; 
> > b) areas if remove_vm_area() is called to clear ->vm to NULL;
> > c) areas created through vm_map_ram() since its ->vm is always NULL;
> > 
> > Means vread() will read out vmap_area:
> > d) areas if setup_vmalloc_vm() is called;
> >   1) mapping is done on areas, e.g clear_vm_uninitialized_flag() is
> >        called;
> >   2) mapping is being handled, just after returning from setup_vmalloc_vm();
> > 
> > 
> > ******* after this patchset applied:
> > 
> > Task A:			Task B:		Task C:
> > p = vm_map_ram()
> > vm_unmap_ram(p);
> > 			vread(p);
> >                          vb_vread()
> > 			vread end 
> > 
> > 					q = vm_map_ram();
> > 
> > With this patchset applied, other than normal areas, for the
> > vm_map_ram() areas:
> > 1) In vm_map_ram(), set vmap_area->flags = VMAP_RAM when vmap_area_lock
> >    is taken; In vm_unmap_ram(), clear it wiht "va->flags &= ~VMAP_RAM"
> >    when vmap_area_lock is taken;
> > 2) If vmap_block, set va->flags = VMAP_RAM|VMAP_BLOCK; And set
> >    vmap_block->used_map to track the used region, filter out the dirty
> >    and free region;
> > 3) In vb_vread(), we take vb->lock to avoid reading out dirty regions.
> > 
> > Please help point out what is wrong or I missed.
> > 
> One thing is we still can read-out un-mapped pages, i.e. a text instead:
> 
> <snip>
> static void vb_free(unsigned long addr, unsigned long size)
> {
> 	unsigned long offset;
> 	unsigned int order;
> 	struct vmap_block *vb;
> 
> 	BUG_ON(offset_in_page(size));
> 	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
> 
> 	flush_cache_vunmap(addr, addr + size);
> 
> 	order = get_order(size);
> 	offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
> 	vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
> 
> 	vunmap_range_noflush(addr, addr + size);
> 
> 	if (debug_pagealloc_enabled_static())
> 		flush_tlb_kernel_range(addr, addr + size);
> 
> 	spin_lock(&vb->lock);
> ...
> <snip>
> 
> or am i missing something? Is it a problem? It might be. Another thing
> it would be good if you upload a new patchset so it is easier to review
> it.

Thanks for checking.

Please check patch 1, vmap_block->used_map is introduced to track the
vb regions allocation and free via vb_alloc/vb_free(). The vb->used_map
only set for pages being used, the dirty and free regions are all
cleared. In the added vb_vread() of patch 3, vb->lock is required and
taken during the whole vb vmap reading, and only page of regions set in
vb->used_map will be read out.

So if vb_free() is called, and vb->used_map is cleared away, it won't
be read out in vb_vread(). If vb_free() is requiring vb->lock and waiting,
the region hasn't been unmapped and can be read out.

@@ -2114,6 +2118,9 @@ static void vb_free(unsigned long addr, unsigned long size)
        order = get_order(size);
        offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
        vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
+       spin_lock(&vb->lock);
+       bitmap_clear(vb->used_map, offset, (1UL << order));
+       spin_unlock(&vb->lock);
 
        vunmap_range_noflush(addr, addr + size);

I will work out a formal patchset for reviewing, will post and CC all
reviewers.

Thanks
Baoquan
  

Patch

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 41d82dc07e13..5a8d5659bfb0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3518,6 +3518,46 @@  static int aligned_vread(char *buf, char *addr, unsigned long count)
 	return copied;
 }
 
+static void vb_vread(char *buf, char *addr, int count)
+{
+	char *start;
+	struct vmap_block *vb;
+	unsigned long offset;
+	unsigned int rs, re, n;
+
+	offset = ((unsigned long)addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
+	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
+
+	spin_lock(&vb->lock);
+	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
+		spin_unlock(&vb->lock);
+		memset(buf, 0, count);
+		return;
+	}
+	for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
+		if (!count)
+			break;
+		start = vmap_block_vaddr(vb->va->va_start, rs);
+		if (addr < start) {
+			if (count == 0)
+				break;
+			*buf = '\0';
+			buf++;
+			addr++;
+			count--;
+		}
+		n = (re - rs + 1) << PAGE_SHIFT;
+		if (n > count)
+			n = count;
+		aligned_vread(buf, start, n);
+
+		buf += n;
+		addr += n;
+		count -= n;
+	}
+	spin_unlock(&vb->lock);
+}
+
 /**
  * vread() - read vmalloc area in a safe way.
  * @buf:     buffer for reading data
@@ -3548,7 +3588,7 @@  long vread(char *buf, char *addr, unsigned long count)
 	struct vm_struct *vm;
 	char *vaddr, *buf_start = buf;
 	unsigned long buflen = count;
-	unsigned long n;
+	unsigned long n, size;
 
 	addr = kasan_reset_tag(addr);
 
@@ -3569,12 +3609,14 @@  long vread(char *buf, char *addr, unsigned long count)
 		if (!count)
 			break;
 
-		if (!va->vm)
+		if (!(va->flags & VMAP_RAM) && !va->vm)
 			continue;
 
 		vm = va->vm;
-		vaddr = (char *) vm->addr;
-		if (addr >= vaddr + get_vm_area_size(vm))
+		vaddr = (char *) va->va_start;
+		size = vm ? get_vm_area_size(vm) : va_size(va);
+
+		if (addr >= vaddr + size)
 			continue;
 		while (addr < vaddr) {
 			if (count == 0)
@@ -3584,10 +3626,13 @@  long vread(char *buf, char *addr, unsigned long count)
 			addr++;
 			count--;
 		}
-		n = vaddr + get_vm_area_size(vm) - addr;
+		n = vaddr + size - addr;
 		if (n > count)
 			n = count;
-		if (!(vm->flags & VM_IOREMAP))
+
+		if ((va->flags & (VMAP_RAM|VMAP_BLOCK)) == (VMAP_RAM|VMAP_BLOCK))
+			vb_vread(buf, addr, n);
+		else if ((va->flags & VMAP_RAM) || !(vm->flags & VM_IOREMAP))
 			aligned_vread(buf, addr, n);
 		else /* IOREMAP area is treated as memory hole */
 			memset(buf, 0, n);