[2/2] mm: zswap.c: remove RB tree

Message ID 20240117-zswap-xarray-v1-2-6daa86c08fae@kernel.org
State New
Headers
Series RFC: zswap tree use xarray instead of RB tree |

Commit Message

Chris Li Jan. 18, 2024, 3:05 a.m. UTC
  Remove the RB tree code and the RB tree data structure
from zswap.

The xarray insert and erase code have been updated to
use the XAS version of the API to cache the lookup before
the final xarray store.

The zswap tree spinlock hasn't been removed yet due to
other usage outside of the zswap tree. The zswap xarray
function should work fine with its internal lock on RCU
without the zswap tree lock.

This removes the RB node inside the zswap entry, saving
about three pointers in size.  Considering the extra
overhead of xarray lookup tables, this should have some
net saving in terms of memory overhead in zswap if the
index is dense.

The zswap RB tree spin lock is still there to protect
the zswap entry. Expect the follow up change to merge
the RB tree spin lock with the xarray lock.
---
 mm/zswap.c | 98 +++++++++++++++++++++++---------------------------------------
 1 file changed, 36 insertions(+), 62 deletions(-)
  

Comments

Yosry Ahmed Jan. 18, 2024, 6:35 a.m. UTC | #1
> @@ -493,45 +471,47 @@ static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
>  static int zswap_insert(struct zswap_tree *tree, struct zswap_entry *entry,
>                         struct zswap_entry **dupentry)
>  {
> -       struct rb_root *root = &tree->rbroot;
> -       struct rb_node **link = &root->rb_node, *parent = NULL;
> -       struct zswap_entry *myentry, *old;
> -       pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
> -
> -
> -       while (*link) {
> -               parent = *link;
> -               myentry = rb_entry(parent, struct zswap_entry, rbnode);
> -               myentry_offset = swp_offset(myentry->swpentry);
> -               if (myentry_offset > entry_offset)
> -                       link = &(*link)->rb_left;
> -               else if (myentry_offset < entry_offset)
> -                       link = &(*link)->rb_right;
> -               else {
> -                       old = xa_load(&tree->xarray, entry_offset);
> -                       BUG_ON(old != myentry);
> -                       *dupentry = myentry;
> +       struct zswap_entry *e;
> +       pgoff_t offset = swp_offset(entry->swpentry);
> +       XA_STATE(xas, &tree->xarray, offset);
> +
> +       do {
> +               xas_lock_irq(&xas);
> +               do {
> +                       e = xas_load(&xas);
> +                       if (xa_is_zero(e))
> +                               e = NULL;
> +               } while (xas_retry(&xas, e));
> +               if (xas_valid(&xas) && e) {
> +                       xas_unlock_irq(&xas);
> +                       *dupentry = e;
>                         return -EEXIST;
>                 }
> -       }
> -       rb_link_node(&entry->rbnode, parent, link);
> -       rb_insert_color(&entry->rbnode, root);
> -       old = xa_store(&tree->xarray, entry_offset, entry, GFP_KERNEL);
> -       return 0;
> +               xas_store(&xas, entry);
> +               xas_unlock_irq(&xas);
> +       } while (xas_nomem(&xas, GFP_KERNEL));
> +       return xas_error(&xas);

I think using the xas_* APIs can be avoided here. The only reason we
need it is that we want to check if there's an existing entry first,
and return -EEXIST. However, in that case, the caller will replace it
anyway (and do some operations on the dupentry):

while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
        WARN_ON(1);
        zswap_duplicate_entry++;
        zswap_invalidate_entry(tree, dupentry);
}

So I think we can do something like this in zswap_insert() instead:

dupentry = xa_store(..);
if (WARN_ON(dupentry)) {
        zswap_duplicate_entry++;
        zswap_invalidate_entry(tree, dupentry);
}

WDYT?

>  }
>
>  static bool zswap_erase(struct zswap_tree *tree, struct zswap_entry *entry)
>  {
> +       struct zswap_entry *e;
>         pgoff_t offset = swp_offset(entry->swpentry);
> -       if (!RB_EMPTY_NODE(&entry->rbnode)) {
> -               struct zswap_entry *old;
> -               old = xa_erase(&tree->xarray, offset);
> -               BUG_ON(old != entry);
> -               rb_erase(&entry->rbnode, &tree->rbroot);
> -               RB_CLEAR_NODE(&entry->rbnode);
> -               return true;
> -       }
> -       return false;
> +       XA_STATE(xas, &tree->xarray, offset);
> +
> +       do {
> +               xas_lock_irq(&xas);
> +               do {
> +                       e = xas_load(&xas);
> +               } while (xas_retry(&xas, e));
> +               if (xas_valid(&xas) && e != entry) {
> +                       xas_unlock_irq(&xas);
> +                       return false;
> +               }
> +               xas_store(&xas, NULL);
> +               xas_unlock_irq(&xas);
> +       } while (xas_nomem(&xas, GFP_KERNEL));
> +       return !xas_error(&xas);
>  }

Same here, I think we just want:

return !!xa_erase(..);

>
>  static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
> @@ -583,7 +563,6 @@ static void zswap_entry_put(struct zswap_tree *tree,
>
>         WARN_ON_ONCE(refcount < 0);
>         if (refcount == 0) {
> -               WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
>                 zswap_free_entry(entry);
>         }

nit: the braces are no longer needed here
  
Yosry Ahmed Jan. 18, 2024, 7:35 p.m. UTC | #2
On Wed, Jan 17, 2024 at 10:35 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> > @@ -493,45 +471,47 @@ static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
> >  static int zswap_insert(struct zswap_tree *tree, struct zswap_entry *entry,
> >                         struct zswap_entry **dupentry)
> >  {
> > -       struct rb_root *root = &tree->rbroot;
> > -       struct rb_node **link = &root->rb_node, *parent = NULL;
> > -       struct zswap_entry *myentry, *old;
> > -       pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
> > -
> > -
> > -       while (*link) {
> > -               parent = *link;
> > -               myentry = rb_entry(parent, struct zswap_entry, rbnode);
> > -               myentry_offset = swp_offset(myentry->swpentry);
> > -               if (myentry_offset > entry_offset)
> > -                       link = &(*link)->rb_left;
> > -               else if (myentry_offset < entry_offset)
> > -                       link = &(*link)->rb_right;
> > -               else {
> > -                       old = xa_load(&tree->xarray, entry_offset);
> > -                       BUG_ON(old != myentry);
> > -                       *dupentry = myentry;
> > +       struct zswap_entry *e;
> > +       pgoff_t offset = swp_offset(entry->swpentry);
> > +       XA_STATE(xas, &tree->xarray, offset);
> > +
> > +       do {
> > +               xas_lock_irq(&xas);
> > +               do {
> > +                       e = xas_load(&xas);
> > +                       if (xa_is_zero(e))
> > +                               e = NULL;
> > +               } while (xas_retry(&xas, e));
> > +               if (xas_valid(&xas) && e) {
> > +                       xas_unlock_irq(&xas);
> > +                       *dupentry = e;
> >                         return -EEXIST;
> >                 }
> > -       }
> > -       rb_link_node(&entry->rbnode, parent, link);
> > -       rb_insert_color(&entry->rbnode, root);
> > -       old = xa_store(&tree->xarray, entry_offset, entry, GFP_KERNEL);
> > -       return 0;
> > +               xas_store(&xas, entry);
> > +               xas_unlock_irq(&xas);
> > +       } while (xas_nomem(&xas, GFP_KERNEL));
> > +       return xas_error(&xas);
>
> I think using the xas_* APIs can be avoided here. The only reason we
> need it is that we want to check if there's an existing entry first,
> and return -EEXIST. However, in that case, the caller will replace it
> anyway (and do some operations on the dupentry):
>
> while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
>         WARN_ON(1);
>         zswap_duplicate_entry++;
>         zswap_invalidate_entry(tree, dupentry);
> }
>
> So I think we can do something like this in zswap_insert() instead:
>
> dupentry = xa_store(..);
> if (WARN_ON(dupentry)) {
>         zswap_duplicate_entry++;
>         zswap_invalidate_entry(tree, dupentry);
> }

If this is doable, I think we can return xa_store(..) and keep the
logic in the caller. I think there's a chance
zswap_{search/insert/erase} may end up being very thin wrappers around
xa_{load/store/erase}, and we may be able to remove them completely.
Let's see how it goes.
  
Chris Li Jan. 19, 2024, 5:43 a.m. UTC | #3
Hi Yosry,

On Wed, Jan 17, 2024 at 10:35 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> > @@ -493,45 +471,47 @@ static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
> >  static int zswap_insert(struct zswap_tree *tree, struct zswap_entry *entry,
> >                         struct zswap_entry **dupentry)
> >  {
> > -       struct rb_root *root = &tree->rbroot;
> > -       struct rb_node **link = &root->rb_node, *parent = NULL;
> > -       struct zswap_entry *myentry, *old;
> > -       pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
> > -
> > -
> > -       while (*link) {
> > -               parent = *link;
> > -               myentry = rb_entry(parent, struct zswap_entry, rbnode);
> > -               myentry_offset = swp_offset(myentry->swpentry);
> > -               if (myentry_offset > entry_offset)
> > -                       link = &(*link)->rb_left;
> > -               else if (myentry_offset < entry_offset)
> > -                       link = &(*link)->rb_right;
> > -               else {
> > -                       old = xa_load(&tree->xarray, entry_offset);
> > -                       BUG_ON(old != myentry);
> > -                       *dupentry = myentry;
> > +       struct zswap_entry *e;
> > +       pgoff_t offset = swp_offset(entry->swpentry);
> > +       XA_STATE(xas, &tree->xarray, offset);
> > +
> > +       do {
> > +               xas_lock_irq(&xas);
> > +               do {
> > +                       e = xas_load(&xas);
> > +                       if (xa_is_zero(e))
> > +                               e = NULL;
> > +               } while (xas_retry(&xas, e));
> > +               if (xas_valid(&xas) && e) {
> > +                       xas_unlock_irq(&xas);
> > +                       *dupentry = e;
> >                         return -EEXIST;
> >                 }
> > -       }
> > -       rb_link_node(&entry->rbnode, parent, link);
> > -       rb_insert_color(&entry->rbnode, root);
> > -       old = xa_store(&tree->xarray, entry_offset, entry, GFP_KERNEL);
> > -       return 0;
> > +               xas_store(&xas, entry);
> > +               xas_unlock_irq(&xas);
> > +       } while (xas_nomem(&xas, GFP_KERNEL));
> > +       return xas_error(&xas);
>
> I think using the xas_* APIs can be avoided here. The only reason we
> need it is that we want to check if there's an existing entry first,
> and return -EEXIST. However, in that case, the caller will replace it
> anyway (and do some operations on the dupentry):

We might be able to do that for the insert case if we don't mind changing the
code behavior a bit. My original intent is to keep close to the
original zswap code and not stir the pot too much for the xarray
replacement. We can always make more adjustment once the RB tree is
gone.

> while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
>         WARN_ON(1);
>         zswap_duplicate_entry++;
>         zswap_invalidate_entry(tree, dupentry);
> }
>
> So I think we can do something like this in zswap_insert() instead:
>
> dupentry = xa_store(..);
> if (WARN_ON(dupentry)) {
>         zswap_duplicate_entry++;
>         zswap_invalidate_entry(tree, dupentry);
> }
>
> WDYT?
>
> >  }
> >
> >  static bool zswap_erase(struct zswap_tree *tree, struct zswap_entry *entry)
> >  {
> > +       struct zswap_entry *e;
> >         pgoff_t offset = swp_offset(entry->swpentry);
> > -       if (!RB_EMPTY_NODE(&entry->rbnode)) {
> > -               struct zswap_entry *old;
> > -               old = xa_erase(&tree->xarray, offset);
> > -               BUG_ON(old != entry);
> > -               rb_erase(&entry->rbnode, &tree->rbroot);
> > -               RB_CLEAR_NODE(&entry->rbnode);
> > -               return true;
> > -       }
> > -       return false;
> > +       XA_STATE(xas, &tree->xarray, offset);
> > +
> > +       do {
> > +               xas_lock_irq(&xas);
> > +               do {
> > +                       e = xas_load(&xas);
> > +               } while (xas_retry(&xas, e));
> > +               if (xas_valid(&xas) && e != entry) {
> > +                       xas_unlock_irq(&xas);
> > +                       return false;
> > +               }
> > +               xas_store(&xas, NULL);
> > +               xas_unlock_irq(&xas);
> > +       } while (xas_nomem(&xas, GFP_KERNEL));
> > +       return !xas_error(&xas);
> >  }
>
> Same here, I think we just want:
>
> return !!xa_erase(..);

For the erase case it is tricky.
The current zswap code does not erase an entry if the tree entry at
the same offset has been changed. It should be fine if the new entry
is NULL; basically, some race has removed the entry already. However, if
the entry is not NULL, then force resetting it to NULL will change
behavior compared to the current.

From reading the zswap code, I am not sure this is impossible in
theory: someone races to remove the zswap entry, then
reclaim converts that page back to the zswap, with a new entry. By the
time the zswap_erase() tries to erase the current entry, the entry has
changed to a new entry. It seems the right thing to do in this
unlikely case is that we should skip the force erase and drop the
current entry, assuming someone else has already invalidated it. Don't
touch the entry in the tree, we assume it is a newer generation.

If we can reason the above is impossible to happen, then the force
erase and reset the entry to NULL should be fine(Famous last word).
Note that this is a behavior change, so I would like to separate it out
from the drop-in replacement patch (keeping the original behavior).

Chris




>
> >
> >  static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
> > @@ -583,7 +563,6 @@ static void zswap_entry_put(struct zswap_tree *tree,
> >
> >         WARN_ON_ONCE(refcount < 0);
> >         if (refcount == 0) {
> > -               WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
> >                 zswap_free_entry(entry);
> >         }
>
> nit: the braces are no longer needed here
>
  
Chris Li Jan. 19, 2024, 5:49 a.m. UTC | #4
On Thu, Jan 18, 2024 at 11:36 AM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Wed, Jan 17, 2024 at 10:35 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > > @@ -493,45 +471,47 @@ static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
> > >  static int zswap_insert(struct zswap_tree *tree, struct zswap_entry *entry,
> > >                         struct zswap_entry **dupentry)
> > >  {
> > > -       struct rb_root *root = &tree->rbroot;
> > > -       struct rb_node **link = &root->rb_node, *parent = NULL;
> > > -       struct zswap_entry *myentry, *old;
> > > -       pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
> > > -
> > > -
> > > -       while (*link) {
> > > -               parent = *link;
> > > -               myentry = rb_entry(parent, struct zswap_entry, rbnode);
> > > -               myentry_offset = swp_offset(myentry->swpentry);
> > > -               if (myentry_offset > entry_offset)
> > > -                       link = &(*link)->rb_left;
> > > -               else if (myentry_offset < entry_offset)
> > > -                       link = &(*link)->rb_right;
> > > -               else {
> > > -                       old = xa_load(&tree->xarray, entry_offset);
> > > -                       BUG_ON(old != myentry);
> > > -                       *dupentry = myentry;
> > > +       struct zswap_entry *e;
> > > +       pgoff_t offset = swp_offset(entry->swpentry);
> > > +       XA_STATE(xas, &tree->xarray, offset);
> > > +
> > > +       do {
> > > +               xas_lock_irq(&xas);
> > > +               do {
> > > +                       e = xas_load(&xas);
> > > +                       if (xa_is_zero(e))
> > > +                               e = NULL;
> > > +               } while (xas_retry(&xas, e));
> > > +               if (xas_valid(&xas) && e) {
> > > +                       xas_unlock_irq(&xas);
> > > +                       *dupentry = e;
> > >                         return -EEXIST;
> > >                 }
> > > -       }
> > > -       rb_link_node(&entry->rbnode, parent, link);
> > > -       rb_insert_color(&entry->rbnode, root);
> > > -       old = xa_store(&tree->xarray, entry_offset, entry, GFP_KERNEL);
> > > -       return 0;
> > > +               xas_store(&xas, entry);
> > > +               xas_unlock_irq(&xas);
> > > +       } while (xas_nomem(&xas, GFP_KERNEL));
> > > +       return xas_error(&xas);
> >
> > I think using the xas_* APIs can be avoided here. The only reason we
> > need it is that we want to check if there's an existing entry first,
> > and return -EEXIST. However, in that case, the caller will replace it
> > anyway (and do some operations on the dupentry):
> >
> > while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
> >         WARN_ON(1);
> >         zswap_duplicate_entry++;
> >         zswap_invalidate_entry(tree, dupentry);
> > }
> >
> > So I think we can do something like this in zswap_insert() instead:
> >
> > dupentry = xa_store(..);
> > if (WARN_ON(dupentry)) {
> >         zswap_duplicate_entry++;
> >         zswap_invalidate_entry(tree, dupentry);
> > }
>
> If this is doable, I think we can return xa_store(..) and keep the
> logic in the caller. I think there's a chance
> zswap_{search/insert/erase} may end up being very thin wrappers around
> xa_{load/store/erase}, and we may be able to remove them completely.
> Let's see how it goes.
>

See my other email about erasing an entry raced into a new entry. That
is the part I am not certain.
Anyway, when zswap adopts folios, it might need to handle multi-index
entries, and we will be back to using the XAS API.

Chris
  
Yosry Ahmed Jan. 19, 2024, 7:36 p.m. UTC | #5
On Thu, Jan 18, 2024 at 9:43 PM Chris Li <chrisl@kernel.org> wrote:
>
> Hi Yosry,
>
> On Wed, Jan 17, 2024 at 10:35 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > > @@ -493,45 +471,47 @@ static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
> > >  static int zswap_insert(struct zswap_tree *tree, struct zswap_entry *entry,
> > >                         struct zswap_entry **dupentry)
> > >  {
> > > -       struct rb_root *root = &tree->rbroot;
> > > -       struct rb_node **link = &root->rb_node, *parent = NULL;
> > > -       struct zswap_entry *myentry, *old;
> > > -       pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
> > > -
> > > -
> > > -       while (*link) {
> > > -               parent = *link;
> > > -               myentry = rb_entry(parent, struct zswap_entry, rbnode);
> > > -               myentry_offset = swp_offset(myentry->swpentry);
> > > -               if (myentry_offset > entry_offset)
> > > -                       link = &(*link)->rb_left;
> > > -               else if (myentry_offset < entry_offset)
> > > -                       link = &(*link)->rb_right;
> > > -               else {
> > > -                       old = xa_load(&tree->xarray, entry_offset);
> > > -                       BUG_ON(old != myentry);
> > > -                       *dupentry = myentry;
> > > +       struct zswap_entry *e;
> > > +       pgoff_t offset = swp_offset(entry->swpentry);
> > > +       XA_STATE(xas, &tree->xarray, offset);
> > > +
> > > +       do {
> > > +               xas_lock_irq(&xas);
> > > +               do {
> > > +                       e = xas_load(&xas);
> > > +                       if (xa_is_zero(e))
> > > +                               e = NULL;
> > > +               } while (xas_retry(&xas, e));
> > > +               if (xas_valid(&xas) && e) {
> > > +                       xas_unlock_irq(&xas);
> > > +                       *dupentry = e;
> > >                         return -EEXIST;
> > >                 }
> > > -       }
> > > -       rb_link_node(&entry->rbnode, parent, link);
> > > -       rb_insert_color(&entry->rbnode, root);
> > > -       old = xa_store(&tree->xarray, entry_offset, entry, GFP_KERNEL);
> > > -       return 0;
> > > +               xas_store(&xas, entry);
> > > +               xas_unlock_irq(&xas);
> > > +       } while (xas_nomem(&xas, GFP_KERNEL));
> > > +       return xas_error(&xas);
> >
> > I think using the xas_* APIs can be avoided here. The only reason we
> > need it is that we want to check if there's an existing entry first,
> > and return -EEXIST. However, in that case, the caller will replace it
> > anyway (and do some operations on the dupentry):
>
> We might be able to do that for the insert case if we don't mind changing the
> code behavior a bit. My original intent is to keep close to the
> original zswap code and not stir the pot too much for the xarray
> replacement. We can always make more adjustment once the RB tree is
> gone.

I don't see how this changes code behavior though. The current code in
zswap_store() will do the following:

- Hold the tree lock to make sure no one else modifies it.
- Try to insert, check if there is already a dupentry at the index and
return -EEXIST.
- Warn, increment zswap_duplicate_entry, and invalidate the dupentry.
- Try to insert again (this should always succeed since we are holding
the lock).

What I am proposing is:
- zswap_xa_insert() is a thin wrapper around xa_store() (or we can
remove it completely).
- zswap_store() does the following:
  - Use zswap_xa_insert() and check if there is a returned dupentry.
  - Warn, increment zswap_duplicate_entry, and invalidate the dupentry.

Either way, we always place the entry we have in the tree, and if
there is a dupentry we warn and invalidate it. If anything, the latter
is more straightforward.

Am I missing something?

> >
> > >  }
> > >
> > >  static bool zswap_erase(struct zswap_tree *tree, struct zswap_entry *entry)
> > >  {
> > > +       struct zswap_entry *e;
> > >         pgoff_t offset = swp_offset(entry->swpentry);
> > > -       if (!RB_EMPTY_NODE(&entry->rbnode)) {
> > > -               struct zswap_entry *old;
> > > -               old = xa_erase(&tree->xarray, offset);
> > > -               BUG_ON(old != entry);
> > > -               rb_erase(&entry->rbnode, &tree->rbroot);
> > > -               RB_CLEAR_NODE(&entry->rbnode);
> > > -               return true;
> > > -       }
> > > -       return false;
> > > +       XA_STATE(xas, &tree->xarray, offset);
> > > +
> > > +       do {
> > > +               xas_lock_irq(&xas);
> > > +               do {
> > > +                       e = xas_load(&xas);
> > > +               } while (xas_retry(&xas, e));
> > > +               if (xas_valid(&xas) && e != entry) {
> > > +                       xas_unlock_irq(&xas);
> > > +                       return false;
> > > +               }
> > > +               xas_store(&xas, NULL);
> > > +               xas_unlock_irq(&xas);
> > > +       } while (xas_nomem(&xas, GFP_KERNEL));
> > > +       return !xas_error(&xas);
> > >  }
> >
> > Same here, I think we just want:
> >
> > return !!xa_erase(..);
>
> For the erase case it is tricky.
> The current zswap code does not erase an entry if the tree entry at
> the same offset has been changed. It should be fine if the new entry
> is NULL. Basically some race to remove the entry already. However, if
> the entry is not NULL, then force resetting it to NULL will change
> behavior compared to the current.

I see, very good point. I think we can use xa_cmpxchg() and pass in NULL?
  
Yosry Ahmed Jan. 19, 2024, 7:37 p.m. UTC | #6
> > If this is doable, I think we can return xa_store(..) and keep the
> > logic in the caller. I think there's a chance
> > zswap_{search/insert/erase} may end up being very thin wrappers around
> > xa_{load/store/erase}, and we may be able to remove them completely.
> > Let's see how it goes.
> >
>
> See my other email about erasing an entry raced into a new entry. That
> is the part I am not certain.
> Anyway, when zswap adopts folios, it might need to handle multi-index
> entries, and we will be back to using the XAS API.

Handling large folios in zswap is a much larger topic that involves a
lot more than this xa_* vs. xas_* apis dispute. Let's not worry about
this for now.
  
Chris Li Jan. 19, 2024, 9:31 p.m. UTC | #7
On Fri, Jan 19, 2024 at 11:37 AM Yosry Ahmed <yosryahmed@google.com> wrote:
> > > I think using the xas_* APIs can be avoided here. The only reason we
> > > need it is that we want to check if there's an existing entry first,
> > > and return -EEXIST. However, in that case, the caller will replace it
> > > anyway (and do some operations on the dupentry):
> >
> > We might be able to for the insert case if we don't mind changing the
> > code behavior a bit. My original intent is to keep close to the
> > original zswap code and not stir the pot too much for the xarray
> > replacement. We can always make more adjustment once the RB tree is
> > gone.
>
> I don't see how this changes code behavior though. The current code in
> zswap_store() will do the following:

I am referring to the log and update counter happening after the zswap
mapping was updated. Maybe nobody actually cares about that behavior
difference. In my mind, there is a difference.

>
> - Hold the tree lock to make sure no one else modifies it.
> - Try to insert, check if there is already a dupentry at the index and
> return -EEXIST.
> - Warn, increment zswap_duplicate_entry, and invalidate the dupentry.
> - Try to insert again (this should always succeed since we are holding
> the lock).
>
> What I am proposing is:
> - zswap_xa_insert() is a thin wrapper around xa_store() (or we can
> remove it completely).
> - zswap_store() does the following:
>   - Use zswap_xa_insert() and check if there is a returned dupentry.
>   - Warn, increment zswap_duplicate_entry, and invalidate the dupentry.
>
> Either way, we always place the entry we have in the tree, and if
> there is a dupentry we warn and invalidate it. If anything, the latter
> is more straightforward.
>
> Am I missing something?

No, that is what I would do if I have to use the xa_* API.


>
> > >
> > > >  }
> > > >
> > > >  static bool zswap_erase(struct zswap_tree *tree, struct zswap_entry *entry)
> > > >  {
> > > > +       struct zswap_entry *e;
> > > >         pgoff_t offset = swp_offset(entry->swpentry);
> > > > -       if (!RB_EMPTY_NODE(&entry->rbnode)) {
> > > > -               struct zswap_entry *old;
> > > > -               old = xa_erase(&tree->xarray, offset);
> > > > -               BUG_ON(old != entry);
> > > > -               rb_erase(&entry->rbnode, &tree->rbroot);
> > > > -               RB_CLEAR_NODE(&entry->rbnode);
> > > > -               return true;
> > > > -       }
> > > > -       return false;
> > > > +       XA_STATE(xas, &tree->xarray, offset);
> > > > +
> > > > +       do {
> > > > +               xas_lock_irq(&xas);
> > > > +               do {
> > > > +                       e = xas_load(&xas);
> > > > +               } while (xas_retry(&xas, e));
> > > > +               if (xas_valid(&xas) && e != entry) {
> > > > +                       xas_unlock_irq(&xas);
> > > > +                       return false;
> > > > +               }
> > > > +               xas_store(&xas, NULL);
> > > > +               xas_unlock_irq(&xas);
> > > > +       } while (xas_nomem(&xas, GFP_KERNEL));
> > > > +       return !xas_error(&xas);
> > > >  }
> > >
> > > Same here, I think we just want:
> > >
> > > return !!xa_erase(..);
> >
> > For the erase case it is tricky.
> > The current zswap code does not erase an entry if the tree entry at
> > the same offset has been changed. It should be fine if the new entry
> > is NULL. Basically some race to remove the entry already. However, if
> > the entry is not NULL, then force resetting it to NULL will change
> > behavior compared to the current.
>
> I see, very good point. I think we can use xa_cmpxchg() and pass in NULL?
>
That is certainly possible. Thanks for bringing it up.
Let me try to combine the tree->lock with xarray lock first. If
xa_cmpxchg() can simplify the result there, I will use it.


> Handling large folios in zswap is a much larger topic that involves a
> lot more than this xa_* vs. xas_* apis dispute. Let's not worry about
> this for now.

Ack. One more reason to use the XAS interface is that zswap currently
does multiple lookups on a typical zswap_load(). It first looks up the
entry by offset (lookup one). Then, after the folio is installed in the
swap cache, it performs another lookup to delete the entry (lookup
two). Using XAS might be able to cache the
node location for the second lookup to avoid the full node walk. That
is not in my current patch and can be a later improvement patch as
well.

Chris
  
Yosry Ahmed Jan. 19, 2024, 9:44 p.m. UTC | #8
On Fri, Jan 19, 2024 at 1:32 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Fri, Jan 19, 2024 at 11:37 AM Yosry Ahmed <yosryahmed@google.com> wrote:
> > > > I think using the xas_* APIs can be avoided here. The only reason we
> > > > need it is that we want to check if there's an existing entry first,
> > > > and return -EEXIST. However, in that case, the caller will replace it
> > > > anyway (and do some operations on the dupentry):
> > >
> > > We might be able to do that for the insert case if we don't mind changing the
> > > code behavior a bit. My original intent is to keep close to the
> > > original zswap code and not stir the pot too much for the xarray
> > > replacement. We can always make more adjustment once the RB tree is
> > > gone.
> >
> > I don't see how this changes code behavior though. The current code in
> > zswap_store() will do the following:
>
> I am referring to the log and update counter happening after the zswap
> mapping was updated. Maybe nobody actually cares about that behavior
> difference. In my mind, there is a difference.

I don't think it matters tbh, certainly not worth the more complicated
implementation.

> > > > >  static bool zswap_erase(struct zswap_tree *tree, struct zswap_entry *entry)
> > > > >  {
> > > > > +       struct zswap_entry *e;
> > > > >         pgoff_t offset = swp_offset(entry->swpentry);
> > > > > -       if (!RB_EMPTY_NODE(&entry->rbnode)) {
> > > > > -               struct zswap_entry *old;
> > > > > -               old = xa_erase(&tree->xarray, offset);
> > > > > -               BUG_ON(old != entry);
> > > > > -               rb_erase(&entry->rbnode, &tree->rbroot);
> > > > > -               RB_CLEAR_NODE(&entry->rbnode);
> > > > > -               return true;
> > > > > -       }
> > > > > -       return false;
> > > > > +       XA_STATE(xas, &tree->xarray, offset);
> > > > > +
> > > > > +       do {
> > > > > +               xas_lock_irq(&xas);
> > > > > +               do {
> > > > > +                       e = xas_load(&xas);
> > > > > +               } while (xas_retry(&xas, e));
> > > > > +               if (xas_valid(&xas) && e != entry) {
> > > > > +                       xas_unlock_irq(&xas);
> > > > > +                       return false;
> > > > > +               }
> > > > > +               xas_store(&xas, NULL);
> > > > > +               xas_unlock_irq(&xas);
> > > > > +       } while (xas_nomem(&xas, GFP_KERNEL));
> > > > > +       return !xas_error(&xas);
> > > > >  }
> > > >
> > > > Same here, I think we just want:
> > > >
> > > > return !!xa_erase(..);
> > >
> > > For the erase case it is tricky.
> > > The current zswap code does not erase an entry if the tree entry at
> > > the same offset has been changed. It should be fine if the new entry
> > > is NULL. Basically some race to remove the entry already. However, if
> > > the entry is not NULL, then force resetting it to NULL will change
> > > behavior compared to the current.
> >
> > I see, very good point. I think we can use xa_cmpxchg() and pass in NULL?
> >
> That is certainly possible. Thanks for bringing it up.
> Let me try to combine the tree->lock with xarray lock first. If
> xa_cmpxchg() can simplify the result there, I will use it.

SGTM.

> > Handling large folios in zswap is a much larger topic that involves a
> > lot more than this xa_* vs. xas_* apis dispute. Let's not worry about
> > this for now.
>
> Ack. One more reason to use the XAS interface is that zswap currently
> does multiple lookups on typical zswap_load(). It finds entries by
> offset, for the entry (lookup one). Then after folio install to swap
> cache, it deletes the entry, it will perform another lookup to
> delete the entry (look up two). Using XAS might be able to cache the
> node location for the second lookup to avoid the full node walk. That
> is not in my current patch and can be a later improvement patch as
> well.

One more straightforward optimization we can do with the xas_* API is
to cache the lookup done in zswap_load() and reuse it when doing
invalidations for exclusive loads.

For the initial implementation, let's keep it simple and try to use
the xa_* APIs where possible.
  

Patch

diff --git a/mm/zswap.c b/mm/zswap.c
index a40b0076722b..555d5608d401 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -197,7 +197,6 @@  struct zswap_pool {
  * This structure contains the metadata for tracking a single compressed
  * page within zswap.
  *
- * rbnode - links the entry into red-black tree for the appropriate swap type
  * swpentry - associated swap entry, the offset indexes into the red-black tree
  * refcount - the number of outstanding reference to the entry. This is needed
  *            to protect against premature freeing of the entry by code
@@ -215,7 +214,6 @@  struct zswap_pool {
  * lru - handle to the pool's lru used to evict pages.
  */
 struct zswap_entry {
-	struct rb_node rbnode;
 	swp_entry_t swpentry;
 	int refcount;
 	unsigned int length;
@@ -234,7 +232,6 @@  struct zswap_entry {
  * - the refcount field of each entry in the tree
  */
 struct zswap_tree {
-	struct rb_root rbroot;
 	struct xarray xarray;
 	spinlock_t lock;
 };
@@ -357,7 +354,6 @@  static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
 	if (!entry)
 		return NULL;
 	entry->refcount = 1;
-	RB_CLEAR_NODE(&entry->rbnode);
 	return entry;
 }
 
@@ -465,25 +461,7 @@  static void zswap_lru_putback(struct list_lru *list_lru,
 **********************************/
 static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
 {
-	struct rb_node *node = tree->rbroot.rb_node;
-	struct zswap_entry *entry;
-	pgoff_t entry_offset;
-
-	while (node) {
-		entry = rb_entry(node, struct zswap_entry, rbnode);
-		entry_offset = swp_offset(entry->swpentry);
-		if (entry_offset > offset)
-			node = node->rb_left;
-		else if (entry_offset < offset)
-			node = node->rb_right;
-		else {
-			struct zswap_entry *e = xa_load(&tree->xarray, offset);
-
-			BUG_ON(entry != e);
-			return entry;
-		}
-	}
-	return NULL;
+	return xa_load(&tree->xarray, offset);
 }
 
 /*
@@ -493,45 +471,47 @@  static struct zswap_entry *zswap_search(struct zswap_tree *tree, pgoff_t offset)
 static int zswap_insert(struct zswap_tree *tree, struct zswap_entry *entry,
 			struct zswap_entry **dupentry)
 {
-	struct rb_root *root = &tree->rbroot;
-	struct rb_node **link = &root->rb_node, *parent = NULL;
-	struct zswap_entry *myentry, *old;
-	pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
-
-
-	while (*link) {
-		parent = *link;
-		myentry = rb_entry(parent, struct zswap_entry, rbnode);
-		myentry_offset = swp_offset(myentry->swpentry);
-		if (myentry_offset > entry_offset)
-			link = &(*link)->rb_left;
-		else if (myentry_offset < entry_offset)
-			link = &(*link)->rb_right;
-		else {
-			old = xa_load(&tree->xarray, entry_offset);
-			BUG_ON(old != myentry);
-			*dupentry = myentry;
+	struct zswap_entry *e;
+	pgoff_t offset = swp_offset(entry->swpentry);
+	XA_STATE(xas, &tree->xarray, offset);
+
+	do {
+		xas_lock_irq(&xas);
+		do {
+			e = xas_load(&xas);
+			if (xa_is_zero(e))
+				e = NULL;
+		} while (xas_retry(&xas, e));
+		if (xas_valid(&xas) && e) {
+			xas_unlock_irq(&xas);
+			*dupentry = e;
 			return -EEXIST;
 		}
-	}
-	rb_link_node(&entry->rbnode, parent, link);
-	rb_insert_color(&entry->rbnode, root);
-	old = xa_store(&tree->xarray, entry_offset, entry, GFP_KERNEL);
-	return 0;
+		xas_store(&xas, entry);
+		xas_unlock_irq(&xas);
+	} while (xas_nomem(&xas, GFP_KERNEL));
+	return xas_error(&xas);
 }
 
 static bool zswap_erase(struct zswap_tree *tree, struct zswap_entry *entry)
 {
+	struct zswap_entry *e;
 	pgoff_t offset = swp_offset(entry->swpentry);
-	if (!RB_EMPTY_NODE(&entry->rbnode)) {
-		struct zswap_entry *old;
-		old = xa_erase(&tree->xarray, offset);
-		BUG_ON(old != entry);
-		rb_erase(&entry->rbnode, &tree->rbroot);
-		RB_CLEAR_NODE(&entry->rbnode);
-		return true;
-	}
-	return false;
+	XA_STATE(xas, &tree->xarray, offset);
+
+	do {
+		xas_lock_irq(&xas);
+		do {
+			e = xas_load(&xas);
+		} while (xas_retry(&xas, e));
+		if (xas_valid(&xas) && e != entry) {
+			xas_unlock_irq(&xas);
+			return false;
+		}
+		xas_store(&xas, NULL);
+		xas_unlock_irq(&xas);
+	} while (xas_nomem(&xas, GFP_KERNEL));
+	return !xas_error(&xas);
 }
 
 static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
@@ -583,7 +563,6 @@  static void zswap_entry_put(struct zswap_tree *tree,
 
 	WARN_ON_ONCE(refcount < 0);
 	if (refcount == 0) {
-		WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
 		zswap_free_entry(entry);
 	}
 }
@@ -1799,7 +1778,6 @@  void zswap_swapon(int type)
 		return;
 	}
 
-	tree->rbroot = RB_ROOT;
 	xa_init(&tree->xarray);
 	spin_lock_init(&tree->lock);
 	zswap_trees[type] = tree;
@@ -1808,7 +1786,7 @@  void zswap_swapon(int type)
 void zswap_swapoff(int type)
 {
 	struct zswap_tree *tree = zswap_trees[type];
-	struct zswap_entry *entry, *e, *n;
+	struct zswap_entry *e;
 	XA_STATE(xas, tree ? &tree->xarray : NULL, 0);
 
 	if (!tree)
@@ -1820,10 +1798,6 @@  void zswap_swapoff(int type)
 	xas_for_each(&xas, e, ULONG_MAX)
 		zswap_invalidate_entry(tree, e);
 
-	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
-		BUG_ON(entry);
-
-	tree->rbroot = RB_ROOT;
 	spin_unlock(&tree->lock);
 	kfree(tree);
 	zswap_trees[type] = NULL;