[RFC,2/2] mm: Use pte markers for swap errors
Commit Message
PTE markers are an ideal mechanism for things like SWP_SWAPIN_ERROR. Using a
whole swap entry type for this purpose is overkill, especially since we
already have PTE markers. Define a new PTE marker bit for swapin errors and
use it in place of the dedicated swap entry type. Then we can safely drop
SWP_SWAPIN_ERROR and give one device slot back to swap.
We used to have SWP_SWAPIN_ERROR carry the page pfn as part of the swap
entry, but it was never used. Nor do I see how it could be useful, because
a swapin failure is normally caused by a bad swap device rather than by a
bad page. Drop the pfn as well.
Signed-off-by: Peter Xu <peterx@redhat.com>
---
include/linux/swap.h | 6 +-----
include/linux/swapops.h | 26 ++++++++++++++------------
mm/memory.c | 6 ++++--
mm/shmem.c | 2 +-
mm/swapfile.c | 2 +-
5 files changed, 21 insertions(+), 21 deletions(-)
Comments
On 24.10.22 22:48, Peter Xu wrote:
> PTE markers are ideal mechanism for things like SWP_SWAPIN_ERROR. Using a
> whole swap entry type for this purpose can be an overkill, especially if we
> already have PTE markers. Define a new bit for swapin error and replace it
> with pte markers. Then we can safely drop SWP_SWAPIN_ERROR and give one
> device slot back to swap.
>
> We used to have SWP_SWAPIN_ERROR taking the page pfn as part of the swap
> entry, but it's never used. Neither do I see how it can be useful because
> normally the swapin failure should not be caused by a bad page but bad swap
> device. Drop it alongside.
[...]
>
> -#define PTE_MARKER_UFFD_WP BIT(0)
> -#define PTE_MARKER_MASK (PTE_MARKER_UFFD_WP)
> +#define PTE_MARKER_UFFD_WP BIT(0)
> +#define PTE_MARKER_SWAP_ERROR BIT(1)
I'd suggest keeping the term SWAPIN. An error happened during swapin,
which is why the page is corrupted.
(I remember that we discussed naming details in the original series and
SWAPIN was the conclusion)
On Tue, Oct 25, 2022 at 05:31:43PM +0200, David Hildenbrand wrote:
> > -#define PTE_MARKER_UFFD_WP BIT(0)
> > -#define PTE_MARKER_MASK (PTE_MARKER_UFFD_WP)
> > +#define PTE_MARKER_UFFD_WP BIT(0)
> > +#define PTE_MARKER_SWAP_ERROR BIT(1)
>
> I'd suggest to keep the term SWAPIN. An error happened during swapin, which
> is why the page is corrupted.
Sure thing.
On 2022/10/25 4:48, Peter Xu wrote:
> PTE markers are ideal mechanism for things like SWP_SWAPIN_ERROR. Using a
> whole swap entry type for this purpose can be an overkill, especially if we
> already have PTE markers. Define a new bit for swapin error and replace it
> with pte markers. Then we can safely drop SWP_SWAPIN_ERROR and give one
> device slot back to swap.
>
> We used to have SWP_SWAPIN_ERROR taking the page pfn as part of the swap
> entry, but it's never used. Neither do I see how it can be useful because
> normally the swapin failure should not be caused by a bad page but bad swap
> device. Drop it alongside.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
Thanks for doing this. I have been waiting for this for a "long" time. ;)
With PTE_MARKER_SWAP_ERROR renamed as David proposed, this
patch looks good to me.
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Thanks,
Miaohe Lin
@@ -55,10 +55,6 @@ static inline int current_is_kswapd(void)
* actions on faults.
*/
-#define SWP_SWAPIN_ERROR_NUM 1
-#define SWP_SWAPIN_ERROR (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
- SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \
- SWP_PTE_MARKER_NUM)
/*
* PTE markers are used to persist information onto PTEs that otherwise
* should be a none pte. As its name "PTE" hints, it should only be
@@ -121,7 +117,7 @@ static inline int current_is_kswapd(void)
#define MAX_SWAPFILES \
((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \
- SWP_PTE_MARKER_NUM - SWP_SWAPIN_ERROR_NUM)
+ SWP_PTE_MARKER_NUM)
/*
* Magic header for a swap area. The first part of the union is
@@ -162,16 +162,6 @@ static inline void *swp_to_radix_entry(swp_entry_t entry)
return xa_mk_value(entry.val);
}
-static inline swp_entry_t make_swapin_error_entry(struct page *page)
-{
- return swp_entry(SWP_SWAPIN_ERROR, page_to_pfn(page));
-}
-
-static inline int is_swapin_error_entry(swp_entry_t entry)
-{
- return swp_type(entry) == SWP_SWAPIN_ERROR;
-}
-
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
@@ -409,8 +399,9 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry)
typedef unsigned long pte_marker;
-#define PTE_MARKER_UFFD_WP BIT(0)
-#define PTE_MARKER_MASK (PTE_MARKER_UFFD_WP)
+#define PTE_MARKER_UFFD_WP BIT(0)
+#define PTE_MARKER_SWAP_ERROR BIT(1)
+#define PTE_MARKER_MASK (BIT(2) - 1)
static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
{
@@ -437,6 +428,17 @@ static inline pte_t make_pte_marker(pte_marker marker)
return swp_entry_to_pte(make_pte_marker_entry(marker));
}
+static inline swp_entry_t make_swapin_error_entry(void)
+{
+ return make_pte_marker_entry(PTE_MARKER_SWAP_ERROR);
+}
+
+static inline int is_swapin_error_entry(swp_entry_t entry)
+{
+ return is_pte_marker_entry(entry) &&
+ (pte_marker_get(entry) & PTE_MARKER_SWAP_ERROR);
+}
+
/*
* This is a special version to check pte_none() just to cover the case when
* the pte is a pte marker. It existed because in many cases the pte marker
@@ -3705,6 +3705,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
if (WARN_ON_ONCE(!marker))
return VM_FAULT_SIGBUS;
+ /* Higher priority than uffd-wp when data corrupted */
+ if (marker & PTE_MARKER_SWAP_ERROR)
+ return VM_FAULT_SIGBUS;
+
if (pte_marker_entry_uffd_wp(entry))
return pte_marker_handle_uffd_wp(vmf);
@@ -3764,8 +3768,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
put_page(vmf->page);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
- } else if (is_swapin_error_entry(entry)) {
- ret = VM_FAULT_SIGBUS;
} else if (is_pte_marker_entry(entry)) {
ret = handle_pte_marker(vmf);
} else {
@@ -1682,7 +1682,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
swp_entry_t swapin_error;
void *old;
- swapin_error = make_swapin_error_entry(&folio->page);
+ swapin_error = make_swapin_error_entry();
old = xa_cmpxchg_irq(&mapping->i_pages, index,
swp_to_radix_entry(swap),
swp_to_radix_entry(swapin_error), 0);
@@ -1781,7 +1781,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
pte_t pteval;
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
- pteval = swp_entry_to_pte(make_swapin_error_entry(page));
+ pteval = swp_entry_to_pte(make_swapin_error_entry());
set_pte_at(vma->vm_mm, addr, pte, pteval);
swap_free(entry);
ret = 0;