[v3] dax: enable dax fault handler to report VM_FAULT_HWPOISON

Message ID 20230505011747.956945-1-jane.chu@oracle.com
State New
Headers
Series [v3] dax: enable dax fault handler to report VM_FAULT_HWPOISON |

Commit Message

Jane Chu May 5, 2023, 1:17 a.m. UTC
  When multiple processes mmap() a dax file, then at some point,
a process issues a 'load' and consumes a hwpoison, the process
receives a SIGBUS with si_code = BUS_MCEERR_AR and with si_lsb
set for the poison scope. Soon after, any other process issues
a 'load' to the poisoned page (that is unmapped from the kernel
side by memory_failure), it receives a SIGBUS with
si_code = BUS_ADRERR and without valid si_lsb.

This is confusing to user, and is different from page fault due
to poison in RAM memory, also some helpful information is lost.

Channel dax backend driver's poison detection to the filesystem
such that instead of reporting VM_FAULT_SIGBUS, it could report
VM_FAULT_HWPOISON.

Change from v2:
  Convert -EHWPOISON to -EIO to prevent EHWPOISON errno from leaking
out to block read(2) - suggested by Matthew.

Signed-off-by: Jane Chu <jane.chu@oracle.com>
---
 drivers/nvdimm/pmem.c | 2 +-
 fs/dax.c              | 4 ++--
 include/linux/mm.h    | 2 ++
 3 files changed, 5 insertions(+), 3 deletions(-)
  

Comments

Dan Williams May 5, 2023, 2:32 a.m. UTC | #1
Jane Chu wrote:
> When multiple processes mmap() a dax file, then at some point,
> a process issues a 'load' and consumes a hwpoison, the process
> receives a SIGBUS with si_code = BUS_MCEERR_AR and with si_lsb
> set for the poison scope. Soon after, any other process issues
> a 'load' to the poisoned page (that is unmapped from the kernel
> side by memory_failure), it receives a SIGBUS with
> si_code = BUS_ADRERR and without valid si_lsb.
> 
> This is confusing to user, and is different from page fault due
> to poison in RAM memory, also some helpful information is lost.
> 
> Channel dax backend driver's poison detection to the filesystem
> such that instead of reporting VM_FAULT_SIGBUS, it could report
> VM_FAULT_HWPOISON.

I do think it is interesting that this will start returning SIGBUS with
BUS_MCEERR_AR for stores whereas it is only signalled for loads in the
direct consumption path, but I can't think of a scenario where that
would confuse existing software.

> Change from v2:
>   Convert -EHWPOISON to -EIO to prevent EHWPOISON errno from leaking
> out to block read(2) - suggested by Matthew.

For next time these kind of changelog notes belong...

> Signed-off-by: Jane Chu <jane.chu@oracle.com>
> ---

...here after the "---".

>  drivers/nvdimm/pmem.c | 2 +-
>  fs/dax.c              | 4 ++--
>  include/linux/mm.h    | 2 ++
>  3 files changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index ceea55f621cc..46e094e56159 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
>  		long actual_nr;
>  
>  		if (mode != DAX_RECOVERY_WRITE)
> -			return -EIO;
> +			return -EHWPOISON;
>  
>  		/*
>  		 * Set the recovery stride is set to kernel page size because
> diff --git a/fs/dax.c b/fs/dax.c
> index 2ababb89918d..18f9598951f1 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1498,7 +1498,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
>  
>  		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
>  				DAX_ACCESS, &kaddr, NULL);
> -		if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
> +		if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
>  			map_len = dax_direct_access(dax_dev, pgoff,
>  					PHYS_PFN(size), DAX_RECOVERY_WRITE,
>  					&kaddr, NULL);
> @@ -1506,7 +1506,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
>  				recovery = true;
>  		}
>  		if (map_len < 0) {
> -			ret = map_len;
> +			ret = (map_len == -EHWPOISON) ? -EIO : map_len;

This fixup leaves out several other places where EHWPOISON could leak as
the errno for read(2)/write(2). I think I want to see something like
this:

diff --git a/fs/dax.c b/fs/dax.c
index 2ababb89918d..ec17f9977bcb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1077,14 +1077,13 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
-static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
-               size_t size, void **kaddr, pfn_t *pfnp)
+static int __dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
+                                    size_t size, void **kaddr, pfn_t *pfnp)
 {
        pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
-       int id, rc = 0;
        long length;
+       int rc = 0;
 
-       id = dax_read_lock();
        length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
                                   DAX_ACCESS, kaddr, pfnp);
        if (length < 0) {
@@ -1113,6 +1112,36 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
        return rc;
 }
 
+static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
+                                  size_t size, void **kaddr, pfn_t *pfnp)
+{
+
+       int id;
+
+       id = dax_read_lock();
+       rc = __dax_iomap_direct_access(iomap, pos, size, kaddr, pfnp);
+       dax_read_unlock(id);
+
+       /* don't leak a memory access error code to I/O syscalls */
+       if (rc == -EHWPOISON)
+               return -EIO;
+       return rc;
+}
+
+static int dax_fault_direct_access(const struct iomap *iomap, loff_t pos,
+                                  size_t size, void **kaddr, pfn_t *pfnp)
+{
+
+       int id;
+
+       id = dax_read_lock();
+       rc = __dax_iomap_direct_access(iomap, pos, size, kaddr, pfnp);
+       dax_read_unlock(id);
+
+       /* -EHWPOISON return ok */
+       return rc;
+}
+
 /**
  * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
  * by copying the data before and after the range to be written.
@@ -1682,7 +1711,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
                return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
        }
 
-       err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
+       err = dax_fault_direct_access(iomap, pos, size, &kaddr, &pfn);
        if (err)
                return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
 


...and then convert all other callers of dax_direct_access() in that
file such that they are either calling:

dax_iomap_direct_access(): if caller wants EHWPOISON translated to -EIO and is ok
			   with internal locking
dax_fault_direct_access(): if caller wants EHWPOISON passed through and is 
			   ok with internal locking
__dax_iomap_direct_access(): if the caller wants to do its own EHWPOISON
			     translation and locking

>  			break;
>  		}
>  
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 1f79667824eb..e4c974587659 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3217,6 +3217,8 @@ static inline vm_fault_t vmf_error(int err)
>  {
>  	if (err == -ENOMEM)
>  		return VM_FAULT_OOM;
> +	else if (err == -EHWPOISON)
> +		return VM_FAULT_HWPOISON;
>  	return VM_FAULT_SIGBUS;
>  }
>  
> -- 
> 2.18.4
>
  
Jane Chu May 9, 2023, 1:12 a.m. UTC | #2
On 5/4/2023 7:32 PM, Dan Williams wrote:
> Jane Chu wrote:
>> When multiple processes mmap() a dax file, then at some point,
>> a process issues a 'load' and consumes a hwpoison, the process
>> receives a SIGBUS with si_code = BUS_MCEERR_AR and with si_lsb
>> set for the poison scope. Soon after, any other process issues
>> a 'load' to the poisoned page (that is unmapped from the kernel
>> side by memory_failure), it receives a SIGBUS with
>> si_code = BUS_ADRERR and without valid si_lsb.
>>
>> This is confusing to user, and is different from page fault due
>> to poison in RAM memory, also some helpful information is lost.
>>
>> Channel dax backend driver's poison detection to the filesystem
>> such that instead of reporting VM_FAULT_SIGBUS, it could report
>> VM_FAULT_HWPOISON.
> 
> I do think it is interesting that this will start returning SIGBUS with
> BUS_MCEERR_AR for stores whereas it is only signalled for loads in the
> direct consumption path, but I can't think of a scenario where that
> would confuse existing software.

Yes indeed, nice catch, thank you!

> 
>> Change from v2:
>>    Convert -EHWPOISON to -EIO to prevent EHWPOISON errno from leaking
>> out to block read(2) - suggested by Matthew.
> 
> For next time these kind of changelog notes belong...
> 
>> Signed-off-by: Jane Chu <jane.chu@oracle.com>
>> ---
> 
> ...here after the "---".

I'll move the change log to a cover letter.

> 
>>   drivers/nvdimm/pmem.c | 2 +-
>>   fs/dax.c              | 4 ++--
>>   include/linux/mm.h    | 2 ++
>>   3 files changed, 5 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
>> index ceea55f621cc..46e094e56159 100644
>> --- a/drivers/nvdimm/pmem.c
>> +++ b/drivers/nvdimm/pmem.c
>> @@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
>>   		long actual_nr;
>>   
>>   		if (mode != DAX_RECOVERY_WRITE)
>> -			return -EIO;
>> +			return -EHWPOISON;
>>   
>>   		/*
>>   		 * Set the recovery stride is set to kernel page size because
>> diff --git a/fs/dax.c b/fs/dax.c
>> index 2ababb89918d..18f9598951f1 100644
>> --- a/fs/dax.c
>> +++ b/fs/dax.c
>> @@ -1498,7 +1498,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
>>   
>>   		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
>>   				DAX_ACCESS, &kaddr, NULL);
>> -		if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
>> +		if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
>>   			map_len = dax_direct_access(dax_dev, pgoff,
>>   					PHYS_PFN(size), DAX_RECOVERY_WRITE,
>>   					&kaddr, NULL);
>> @@ -1506,7 +1506,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
>>   				recovery = true;
>>   		}
>>   		if (map_len < 0) {
>> -			ret = map_len;
>> +			ret = (map_len == -EHWPOISON) ? -EIO : map_len;
> 
> This fixup leaves out several other places where EHWPOISON could leak as
> the errno for read(2)/write(2). I think I want to see something like
> this:
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index 2ababb89918d..ec17f9977bcb 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1077,14 +1077,13 @@ int dax_writeback_mapping_range(struct address_space *mapping,
>   }
>   EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
>   
> -static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
> -               size_t size, void **kaddr, pfn_t *pfnp)
> +static int __dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
> +                                    size_t size, void **kaddr, pfn_t *pfnp)
>   {
>          pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
> -       int id, rc = 0;
>          long length;
> +       int rc = 0;
>   
> -       id = dax_read_lock();
>          length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
>                                     DAX_ACCESS, kaddr, pfnp);
>          if (length < 0) {
> @@ -1113,6 +1112,36 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
>          return rc;
>   }
>   
> +static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
> +                                  size_t size, void **kaddr, pfn_t *pfnp)
> +{
> +
> +       int id;
> +
> +       id = dax_read_lock();
> +       rc = __dax_iomap_direct_access(iomap, pos, size, kaddr, pfnp);
> +       dax_read_unlock(id);
> +
> +       /* don't leak a memory access error code to I/O syscalls */
> +       if (rc == -EHWPOISON)
> +               return -EIO;
> +       return rc;
> +}
> +
> +static int dax_fault_direct_access(const struct iomap *iomap, loff_t pos,
> +                                  size_t size, void **kaddr, pfn_t *pfnp)
> +{
> +
> +       int id;
> +
> +       id = dax_read_lock();
> +       rc = __dax_iomap_direct_access(iomap, pos, size, kaddr, pfnp);
> +       dax_read_unlock(id);
> +
> +       /* -EHWPOISON return ok */
> +       return rc;
> +}
> +
>   /**
>    * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
>    * by copying the data before and after the range to be written.
> @@ -1682,7 +1711,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
>                  return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
>          }
>   
> -       err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
> +       err = dax_fault_direct_access(iomap, pos, size, &kaddr, &pfn);
>          if (err)
>                  return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
>   
> 
> 
> ...and then convert all other callers of dax_direct_access() in that
> file such that they are either calling:
> 
> dax_iomap_direct_access(): if caller wants EHWPOISON translated to -EIO and is ok
> 			   with internal locking
> dax_fault_direct_access(): if caller wants EHWPOISON passed through and is
> 			   ok with internal locking
> __dax_iomap_direct_access(): if the caller wants to do its own EHWPOISON
> 			     translation and locking

Got it.  I examined all callers of dax_direct_access() and found a 
couple move places that need the errno conversion.
I'd like to introduce a helper mem2blk_err(err) for that. It could make
the code more self explanatory.

Thanks,
-jane

> 
>>   			break;
>>   		}
>>   
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 1f79667824eb..e4c974587659 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -3217,6 +3217,8 @@ static inline vm_fault_t vmf_error(int err)
>>   {
>>   	if (err == -ENOMEM)
>>   		return VM_FAULT_OOM;
>> +	else if (err == -EHWPOISON)
>> +		return VM_FAULT_HWPOISON;
>>   	return VM_FAULT_SIGBUS;
>>   }
>>   
>> -- 
>> 2.18.4
>>
> 
>
  

Patch

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index ceea55f621cc..46e094e56159 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -260,7 +260,7 @@  __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 		long actual_nr;
 
 		if (mode != DAX_RECOVERY_WRITE)
-			return -EIO;
+			return -EHWPOISON;
 
 		/*
 		 * Set the recovery stride is set to kernel page size because
diff --git a/fs/dax.c b/fs/dax.c
index 2ababb89918d..18f9598951f1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1498,7 +1498,7 @@  static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 
 		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
 				DAX_ACCESS, &kaddr, NULL);
-		if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
+		if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
 			map_len = dax_direct_access(dax_dev, pgoff,
 					PHYS_PFN(size), DAX_RECOVERY_WRITE,
 					&kaddr, NULL);
@@ -1506,7 +1506,7 @@  static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 				recovery = true;
 		}
 		if (map_len < 0) {
-			ret = map_len;
+			ret = (map_len == -EHWPOISON) ? -EIO : map_len;
 			break;
 		}
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1f79667824eb..e4c974587659 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3217,6 +3217,8 @@  static inline vm_fault_t vmf_error(int err)
 {
 	if (err == -ENOMEM)
 		return VM_FAULT_OOM;
+	else if (err == -EHWPOISON)
+		return VM_FAULT_HWPOISON;
 	return VM_FAULT_SIGBUS;
 }