zsmalloc: fix a race with deferred_handles storing
Commit Message
Currently, there is a race between zs_free() and zs_reclaim_page():
zs_reclaim_page() finds a handle to an allocated object, but before the
eviction happens, an independent zs_free() call to the same handle could
come in and overwrite the object value stored at the handle with
the last deferred handle. When zs_reclaim_page() finally gets to call
the eviction handler, it will see an invalid object value (i.e the
previous deferred handle instead of the original object value).
This race happens quite infrequently. We only managed to produce it with
out-of-tree developmental code that triggers zsmalloc writeback with a
much higher frequency than usual.
This patch fixes this race by storing the deferred handle in the object
header instead. We differentiate the deferred handle from the other two
cases (handle for allocated object, and linkage for free object) with a
new tag. If zspage reclamation succeeds, we will free these deferred
handles by walking through the zspage objects. On the other hand, if
zspage reclamation fails, we reconstruct the zspage freelist (with the
deferred handle tag and allocated tag) before trying again with the
reclamation.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
mm/zsmalloc.c | 237 +++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 205 insertions(+), 32 deletions(-)
Comments
On Tue, Jan 10, 2023 at 3:17 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> Currently, there is a race between zs_free() and zs_reclaim_page():
> zs_reclaim_page() finds a handle to an allocated object, but before the
> eviction happens, an independent zs_free() call to the same handle could
> come in and overwrite the object value stored at the handle with
> the last deferred handle. When zs_reclaim_page() finally gets to call
> the eviction handler, it will see an invalid object value (i.e the
> previous deferred handle instead of the original object value).
>
> This race happens quite infrequently. We only managed to produce it with
> out-of-tree developmental code that triggers zsmalloc writeback with a
> much higher frequency than usual.
>
> This patch fixes this race by storing the deferred handle in the object
> header instead. We differentiate the deferred handle from the other two
> cases (handle for allocated object, and linkage for free object) with a
> new tag. If zspage reclamation succeeds, we will free these deferred
> handles by walking through the zspage objects. On the other hand, if
> zspage reclamation fails, we reconstruct the zspage freelist (with the
> deferred handle tag and allocated tag) before trying again with the
> reclamation.
>
Fixes: 9997bc017549 ("zsmalloc: implement writeback mechanism for zsmalloc")
I forgot to add the fix tag to this patch.
> Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Nhat Pham <nphamcs@gmail.com>
> ---
> mm/zsmalloc.c | 237 +++++++++++++++++++++++++++++++++++++++++++-------
> 1 file changed, 205 insertions(+), 32 deletions(-)
>
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 9445bee6b014..b4b40ef98703 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -113,7 +113,23 @@
> * have room for two bit at least.
> */
> #define OBJ_ALLOCATED_TAG 1
> -#define OBJ_TAG_BITS 1
> +
> +#ifdef CONFIG_ZPOOL
> +/*
> + * The second least-significant bit in the object's header identifies if the
> + * value stored at the header is a deferred handle from the last reclaim
> + * attempt.
> + *
> + * As noted above, this is valid because we have room for two bits.
> + */
> +#define OBJ_DEFERRED_HANDLE_TAG 2
> +#define OBJ_TAG_BITS 2
> +#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)
> +#else
> +#define OBJ_TAG_BITS 1
> +#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
> +#endif /* CONFIG_ZPOOL */
> +
> #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
> #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
>
> @@ -222,6 +238,12 @@ struct link_free {
> * Handle of allocated object.
> */
> unsigned long handle;
> +#ifdef CONFIG_ZPOOL
> + /*
> + * Deferred handle of a reclaimed object.
> + */
> + unsigned long deferred_handle;
> +#endif
> };
> };
>
> @@ -272,8 +294,6 @@ struct zspage {
> /* links the zspage to the lru list in the pool */
> struct list_head lru;
> bool under_reclaim;
> - /* list of unfreed handles whose objects have been reclaimed */
> - unsigned long *deferred_handles;
> #endif
>
> struct zs_pool *pool;
> @@ -897,7 +917,8 @@ static unsigned long handle_to_obj(unsigned long handle)
> return *(unsigned long *)handle;
> }
>
> -static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
> +static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
> + int tag)
> {
> unsigned long handle;
> struct zspage *zspage = get_zspage(page);
> @@ -908,13 +929,27 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
> } else
> handle = *(unsigned long *)obj;
>
> - if (!(handle & OBJ_ALLOCATED_TAG))
> + if (!(handle & tag))
> return false;
>
> - *phandle = handle & ~OBJ_ALLOCATED_TAG;
> + /* Clear all tags before returning the handle */
> + *phandle = handle & ~OBJ_TAG_MASK;
> return true;
> }
>
> +static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
> +{
> + return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);
> +}
> +
> +#ifdef CONFIG_ZPOOL
> +static bool obj_stores_deferred_handle(struct page *page, void *obj,
> + unsigned long *phandle)
> +{
> + return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG);
> +}
> +#endif
> +
> static void reset_page(struct page *page)
> {
> __ClearPageMovable(page);
> @@ -946,22 +981,36 @@ static int trylock_zspage(struct zspage *zspage)
> }
>
> #ifdef CONFIG_ZPOOL
> +static unsigned long find_deferred_handle_obj(struct size_class *class,
> + struct page *page, int *obj_idx);
> +
> /*
> * Free all the deferred handles whose objects are freed in zs_free.
> */
> -static void free_handles(struct zs_pool *pool, struct zspage *zspage)
> +static void free_handles(struct zs_pool *pool, struct size_class *class,
> + struct zspage *zspage)
> {
> - unsigned long handle = (unsigned long)zspage->deferred_handles;
> + int obj_idx = 0;
> + struct page *page = get_first_page(zspage);
> + unsigned long handle;
>
> - while (handle) {
> - unsigned long nxt_handle = handle_to_obj(handle);
> + while (1) {
> + handle = find_deferred_handle_obj(class, page, &obj_idx);
> + if (!handle) {
> + page = get_next_page(page);
> + if (!page)
> + break;
> + obj_idx = 0;
> + continue;
> + }
>
> cache_free_handle(pool, handle);
> - handle = nxt_handle;
> + obj_idx++;
> }
> }
> #else
> -static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {}
> +static inline void free_handles(struct zs_pool *pool, struct size_class *class,
> + struct zspage *zspage) {}
> #endif
>
> static void __free_zspage(struct zs_pool *pool, struct size_class *class,
> @@ -979,7 +1028,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
> VM_BUG_ON(fg != ZS_EMPTY);
>
> /* Free all deferred handles from zs_free */
> - free_handles(pool, zspage);
> + free_handles(pool, class, zspage);
>
> next = page = get_first_page(zspage);
> do {
> @@ -1067,7 +1116,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
> #ifdef CONFIG_ZPOOL
> INIT_LIST_HEAD(&zspage->lru);
> zspage->under_reclaim = false;
> - zspage->deferred_handles = NULL;
> #endif
>
> set_freeobj(zspage, 0);
> @@ -1568,7 +1616,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
> }
> EXPORT_SYMBOL_GPL(zs_malloc);
>
> -static void obj_free(int class_size, unsigned long obj)
> +static void obj_free(int class_size, unsigned long obj, unsigned long *handle)
> {
> struct link_free *link;
> struct zspage *zspage;
> @@ -1582,15 +1630,29 @@ static void obj_free(int class_size, unsigned long obj)
> zspage = get_zspage(f_page);
>
> vaddr = kmap_atomic(f_page);
> -
> - /* Insert this object in containing zspage's freelist */
> link = (struct link_free *)(vaddr + f_offset);
> - if (likely(!ZsHugePage(zspage)))
> - link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
> - else
> - f_page->index = 0;
> +
> + if (handle) {
> +#ifdef CONFIG_ZPOOL
> + /* Stores the (deferred) handle in the object's header */
> + *handle |= OBJ_DEFERRED_HANDLE_TAG;
> + *handle &= ~OBJ_ALLOCATED_TAG;
> +
> + if (likely(!ZsHugePage(zspage)))
> + link->deferred_handle = *handle;
> + else
> + f_page->index = *handle;
> +#endif
> + } else {
> + /* Insert this object in containing zspage's freelist */
> + if (likely(!ZsHugePage(zspage)))
> + link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
> + else
> + f_page->index = 0;
> + set_freeobj(zspage, f_objidx);
> + }
> +
> kunmap_atomic(vaddr);
> - set_freeobj(zspage, f_objidx);
> mod_zspage_inuse(zspage, -1);
> }
>
> @@ -1615,7 +1677,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
> zspage = get_zspage(f_page);
> class = zspage_class(pool, zspage);
>
> - obj_free(class->size, obj);
> class_stat_dec(class, OBJ_USED, 1);
>
> #ifdef CONFIG_ZPOOL
> @@ -1624,15 +1685,15 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
> * Reclaim needs the handles during writeback. It'll free
> * them along with the zspage when it's done with them.
> *
> - * Record current deferred handle at the memory location
> - * whose address is given by handle.
> + * Record current deferred handle in the object's header.
> */
> - record_obj(handle, (unsigned long)zspage->deferred_handles);
> - zspage->deferred_handles = (unsigned long *)handle;
> + obj_free(class->size, obj, &handle);
> spin_unlock(&pool->lock);
> return;
> }
> #endif
> + obj_free(class->size, obj, NULL);
> +
> fullness = fix_fullness_group(class, zspage);
> if (fullness == ZS_EMPTY)
> free_zspage(pool, class, zspage);
> @@ -1713,11 +1774,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
> }
>
> /*
> - * Find alloced object in zspage from index object and
> + * Find object with a certain tag in zspage from index object and
> * return handle.
> */
> -static unsigned long find_alloced_obj(struct size_class *class,
> - struct page *page, int *obj_idx)
> +static unsigned long find_tagged_obj(struct size_class *class,
> + struct page *page, int *obj_idx, int tag)
> {
> unsigned int offset;
> int index = *obj_idx;
> @@ -1728,7 +1789,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
> offset += class->size * index;
>
> while (offset < PAGE_SIZE) {
> - if (obj_allocated(page, addr + offset, &handle))
> + if (obj_tagged(page, addr + offset, &handle, tag))
> break;
>
> offset += class->size;
> @@ -1742,6 +1803,28 @@ static unsigned long find_alloced_obj(struct size_class *class,
> return handle;
> }
>
> +/*
> + * Find alloced object in zspage from index object and
> + * return handle.
> + */
> +static unsigned long find_alloced_obj(struct size_class *class,
> + struct page *page, int *obj_idx)
> +{
> + return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);
> +}
> +
> +#ifdef CONFIG_ZPOOL
> +/*
> + * Find object storing a deferred handle in header in zspage from index object
> + * and return handle.
> + */
> +static unsigned long find_deferred_handle_obj(struct size_class *class,
> + struct page *page, int *obj_idx)
> +{
> + return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG);
> +}
> +#endif
> +
> struct zs_compact_control {
> /* Source spage for migration which could be a subpage of zspage */
> struct page *s_page;
> @@ -1784,7 +1867,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
> zs_object_copy(class, free_obj, used_obj);
> obj_idx++;
> record_obj(handle, free_obj);
> - obj_free(class->size, used_obj);
> + obj_free(class->size, used_obj, NULL);
> }
>
> /* Remember last position in this iteration */
> @@ -2478,6 +2561,90 @@ void zs_destroy_pool(struct zs_pool *pool)
> EXPORT_SYMBOL_GPL(zs_destroy_pool);
>
> #ifdef CONFIG_ZPOOL
> +static void restore_freelist(struct zs_pool *pool, struct size_class *class,
> + struct zspage *zspage)
> +{
> + unsigned int obj_idx = 0;
> + unsigned long handle, off = 0; /* off is within-page offset */
> + struct page *page = get_first_page(zspage);
> + struct link_free *prev_free = NULL;
> + void *prev_page_vaddr = NULL;
> +
> + /* in case no free object found */
> + set_freeobj(zspage, (unsigned int)(-1UL));
> +
> + while (page) {
> + void *vaddr = kmap_atomic(page);
> + struct page *next_page;
> +
> + while (off < PAGE_SIZE) {
> + void *obj_addr = vaddr + off;
> +
> + /* skip allocated object */
> + if (obj_allocated(page, obj_addr, &handle)) {
> + obj_idx++;
> + off += class->size;
> + continue;
> + }
> +
> + /* free deferred handle from reclaim attempt */
> + if (obj_stores_deferred_handle(page, obj_addr, &handle))
> + cache_free_handle(pool, handle);
> +
> + if (prev_free)
> + prev_free->next = obj_idx << OBJ_TAG_BITS;
> + else /* first free object found */
> + set_freeobj(zspage, obj_idx);
> +
> + prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free);
> + /* if last free object in a previous page, need to unmap */
> + if (prev_page_vaddr) {
> + kunmap_atomic(prev_page_vaddr);
> + prev_page_vaddr = NULL;
> + }
> +
> + obj_idx++;
> + off += class->size;
> + }
> +
> + /*
> + * Handle the last (full or partial) object on this page.
> + */
> + next_page = get_next_page(page);
> + if (next_page) {
> + if (!prev_free || prev_page_vaddr) {
> + /*
> + * There is no free object in this page, so we can safely
> + * unmap it.
> + */
> + kunmap_atomic(vaddr);
> + } else {
> + /* update prev_page_vaddr since prev_free is on this page */
> + prev_page_vaddr = vaddr;
> + }
> + } else { /* this is the last page */
> + if (prev_free) {
> + /*
> + * Reset OBJ_TAG_BITS bit to last link to tell
> + * whether it's allocated object or not.
> + */
> + prev_free->next = -1UL << OBJ_TAG_BITS;
> + }
> +
> + /* unmap previous page (if not done yet) */
> + if (prev_page_vaddr) {
> + kunmap_atomic(prev_page_vaddr);
> + prev_page_vaddr = NULL;
> + }
> +
> + kunmap_atomic(vaddr);
> + }
> +
> + page = next_page;
> + off %= PAGE_SIZE;
> + }
> +}
> +
> static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
> {
> int i, obj_idx, ret = 0;
> @@ -2561,6 +2728,12 @@ static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
> return 0;
> }
>
> + /*
> + * Eviction fails on one of the handles, so we need to restore zspage.
> + * We need to rebuild its freelist (and free stored deferred handles),
> + * put it back to the correct size class, and add it to the LRU list.
> + */
> + restore_freelist(pool, class, zspage);
> putback_zspage(class, zspage);
> list_add(&zspage->lru, &pool->lru);
> unlock_zspage(zspage);
> --
> 2.30.2
>
On (23/01/10 15:17), Nhat Pham wrote:
> +#ifdef CONFIG_ZPOOL
> +/*
> + * The second least-significant bit in the object's header identifies if the
> + * value stored at the header is a deferred handle from the last reclaim
> + * attempt.
> + *
> + * As noted above, this is valid because we have room for two bits.
> + */
> +#define OBJ_DEFERRED_HANDLE_TAG 2
> +#define OBJ_TAG_BITS 2
> +#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)
> +#else
> +#define OBJ_TAG_BITS 1
> +#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
> +#endif /* CONFIG_ZPOOL */
> +
> #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
> #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
>
> @@ -222,6 +238,12 @@ struct link_free {
> * Handle of allocated object.
> */
> unsigned long handle;
> +#ifdef CONFIG_ZPOOL
> + /*
> + * Deferred handle of a reclaimed object.
> + */
> + unsigned long deferred_handle;
> +#endif
A nit:
Do we really need to have that #ifdef and add a member to anon uion?
I see that we use ->deferred_handle only in one place, so I'm not sure
if it makes code any simpler.
On (23/01/10 15:17), Nhat Pham wrote:
[..]
> #ifdef CONFIG_ZPOOL
> +static void restore_freelist(struct zs_pool *pool, struct size_class *class,
> + struct zspage *zspage)
> +{
> + unsigned int obj_idx = 0;
> + unsigned long handle, off = 0; /* off is within-page offset */
> + struct page *page = get_first_page(zspage);
> + struct link_free *prev_free = NULL;
> + void *prev_page_vaddr = NULL;
> +
> + /* in case no free object found */
> + set_freeobj(zspage, (unsigned int)(-1UL));
I'm not following this. I see how -1UL works for link_free, but this
cast of -1UL to 4 bytes looks suspicious.
> + while (page) {
> + void *vaddr = kmap_atomic(page);
> + struct page *next_page;
> +
> + while (off < PAGE_SIZE) {
> + void *obj_addr = vaddr + off;
> +
> + /* skip allocated object */
> + if (obj_allocated(page, obj_addr, &handle)) {
> + obj_idx++;
> + off += class->size;
> + continue;
> + }
> +
> + /* free deferred handle from reclaim attempt */
> + if (obj_stores_deferred_handle(page, obj_addr, &handle))
> + cache_free_handle(pool, handle);
> +
> + if (prev_free)
> + prev_free->next = obj_idx << OBJ_TAG_BITS;
> + else /* first free object found */
> + set_freeobj(zspage, obj_idx);
> +
> + prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free);
> + /* if last free object in a previous page, need to unmap */
> + if (prev_page_vaddr) {
> + kunmap_atomic(prev_page_vaddr);
> + prev_page_vaddr = NULL;
> + }
> +
> + obj_idx++;
> + off += class->size;
> + }
> +
> + /*
> + * Handle the last (full or partial) object on this page.
> + */
> + next_page = get_next_page(page);
> + if (next_page) {
> + if (!prev_free || prev_page_vaddr) {
> + /*
> + * There is no free object in this page, so we can safely
> + * unmap it.
> + */
> + kunmap_atomic(vaddr);
> + } else {
> + /* update prev_page_vaddr since prev_free is on this page */
> + prev_page_vaddr = vaddr;
> + }
A polite and gentle nit: I'd appreciate it if we honored kernel coding
styles in zsmalloc a little bit more. Comments, function declarations, etc.
I'm personally very happy with https://github.com/vivien/vim-linux-coding-style
On Tue, Jan 31, 2023 at 5:41 PM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> On (23/01/10 15:17), Nhat Pham wrote:
> [..]
> > #ifdef CONFIG_ZPOOL
> > +static void restore_freelist(struct zs_pool *pool, struct size_class *class,
> > + struct zspage *zspage)
> > +{
> > + unsigned int obj_idx = 0;
> > + unsigned long handle, off = 0; /* off is within-page offset */
> > + struct page *page = get_first_page(zspage);
> > + struct link_free *prev_free = NULL;
> > + void *prev_page_vaddr = NULL;
> > +
> > + /* in case no free object found */
> > + set_freeobj(zspage, (unsigned int)(-1UL));
>
> I'm not following this. I see how -1UL works for link_free, but this
> cast of -1UL to 4 bytes looks suspicious.
(resending this since I forgot to forward this to other recipients)
It is a bit convoluted indeed. But the idea is that for the last object,
the last link is given by:
link->next = -1UL << OBJ_TAG_BITS
And at malloc time, we update freeobj as follows
set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
Which means the freeobj value would be set to something like this:
(-1UL << OBJ_TAG_BITS) >> OBJ_TAG_BITS
I want to emulate this here (i.e in the case we have no free object).
As for the casting, I believe set_freeobj requires an unsigned int for
the second field.
Alternatively, to be 100% safe, we can do something like this:
(unsigned int)((-1UL << OBJ_TAG_BITS) >> OBJ_TAG_BITS)
But I think I got the same result as just (unsigned int)(-1UL)
when I printed out these two values - feel free to
fact check me on this of course.
Let me know what you think about this, or if you have a
cleaner/safer way to handle this edge case :)
>
> > + while (page) {
> > + void *vaddr = kmap_atomic(page);
> > + struct page *next_page;
> > +
> > + while (off < PAGE_SIZE) {
> > + void *obj_addr = vaddr + off;
> > +
> > + /* skip allocated object */
> > + if (obj_allocated(page, obj_addr, &handle)) {
> > + obj_idx++;
> > + off += class->size;
> > + continue;
> > + }
> > +
> > + /* free deferred handle from reclaim attempt */
> > + if (obj_stores_deferred_handle(page, obj_addr, &handle))
> > + cache_free_handle(pool, handle);
> > +
> > + if (prev_free)
> > + prev_free->next = obj_idx << OBJ_TAG_BITS;
> > + else /* first free object found */
> > + set_freeobj(zspage, obj_idx);
> > +
> > + prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free);
> > + /* if last free object in a previous page, need to unmap */
> > + if (prev_page_vaddr) {
> > + kunmap_atomic(prev_page_vaddr);
> > + prev_page_vaddr = NULL;
> > + }
> > +
> > + obj_idx++;
> > + off += class->size;
> > + }
> > +
> > + /*
> > + * Handle the last (full or partial) object on this page.
> > + */
> > + next_page = get_next_page(page);
> > + if (next_page) {
> > + if (!prev_free || prev_page_vaddr) {
> > + /*
> > + * There is no free object in this page, so we can safely
> > + * unmap it.
> > + */
> > + kunmap_atomic(vaddr);
> > + } else {
> > + /* update prev_page_vaddr since prev_free is on this page */
> > + prev_page_vaddr = vaddr;
> > + }
>
> A polite and gentle nit: I'd appreciate it if we honored kernel coding
> styles in zsmalloc a little bit more. Comments, function declarations, etc.
> I'm personally very happy with https://github.com/vivien/vim-linux-coding-style
On (23/01/31 18:28), Nhat Pham wrote:
> > On (23/01/10 15:17), Nhat Pham wrote:
> > [..]
> > > #ifdef CONFIG_ZPOOL
> > > +static void restore_freelist(struct zs_pool *pool, struct size_class *class,
> > > + struct zspage *zspage)
> > > +{
> > > + unsigned int obj_idx = 0;
> > > + unsigned long handle, off = 0; /* off is within-page offset */
> > > + struct page *page = get_first_page(zspage);
> > > + struct link_free *prev_free = NULL;
> > > + void *prev_page_vaddr = NULL;
> > > +
> > > + /* in case no free object found */
> > > + set_freeobj(zspage, (unsigned int)(-1UL));
> >
> > I'm not following this. I see how -1UL works for link_free, but this
> > cast of -1UL to 4 bytes looks suspicious.
>
> (resending this since I forgot to forward this to other recipients)
>
> It is a bit convoluted indeed. But the idea is that for the last object,
> the last link is given by:
>
> link->next = -1UL << OBJ_TAG_BITS
>
> And at malloc time, we update freeobj as follows
> set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
>
> Which means the freeobj value would be set to something like this:
> (-1UL << OBJ_TAG_BITS) >> OBJ_TAG_BITS
Oh, good point. I see what you did there.
> I want to emulate this here (i.e in the case we have no free object).
Makes sense.
> As for the casting, I believe set_freeobj requires an unsigned int for
> the second field.
>
> Alternatively, to be 100% safe, we can do something like this:
> (unsigned int)((-1UL << OBJ_TAG_BITS) >> OBJ_TAG_BITS)
>
> But I think I got the same result as just (unsigned int)(-1UL)
Yeah, I guess they should be the same, as we take the lower 4 bytes
only.
@@ -113,7 +113,23 @@
* have room for two bit at least.
*/
#define OBJ_ALLOCATED_TAG 1
-#define OBJ_TAG_BITS 1
+
+#ifdef CONFIG_ZPOOL
+/*
+ * The second least-significant bit in the object's header identifies if the
+ * value stored at the header is a deferred handle from the last reclaim
+ * attempt.
+ *
+ * As noted above, this is valid because we have room for two bits.
+ */
+#define OBJ_DEFERRED_HANDLE_TAG 2
+#define OBJ_TAG_BITS 2
+#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)
+#else
+#define OBJ_TAG_BITS 1
+#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
+#endif /* CONFIG_ZPOOL */
+
#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
@@ -222,6 +238,12 @@ struct link_free {
* Handle of allocated object.
*/
unsigned long handle;
+#ifdef CONFIG_ZPOOL
+ /*
+ * Deferred handle of a reclaimed object.
+ */
+ unsigned long deferred_handle;
+#endif
};
};
@@ -272,8 +294,6 @@ struct zspage {
/* links the zspage to the lru list in the pool */
struct list_head lru;
bool under_reclaim;
- /* list of unfreed handles whose objects have been reclaimed */
- unsigned long *deferred_handles;
#endif
struct zs_pool *pool;
@@ -897,7 +917,8 @@ static unsigned long handle_to_obj(unsigned long handle)
return *(unsigned long *)handle;
}
-static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
+static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
+ int tag)
{
unsigned long handle;
struct zspage *zspage = get_zspage(page);
@@ -908,13 +929,27 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
} else
handle = *(unsigned long *)obj;
- if (!(handle & OBJ_ALLOCATED_TAG))
+ if (!(handle & tag))
return false;
- *phandle = handle & ~OBJ_ALLOCATED_TAG;
+ /* Clear all tags before returning the handle */
+ *phandle = handle & ~OBJ_TAG_MASK;
return true;
}
+static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
+{
+ return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);
+}
+
+#ifdef CONFIG_ZPOOL
+static bool obj_stores_deferred_handle(struct page *page, void *obj,
+ unsigned long *phandle)
+{
+ return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG);
+}
+#endif
+
static void reset_page(struct page *page)
{
__ClearPageMovable(page);
@@ -946,22 +981,36 @@ static int trylock_zspage(struct zspage *zspage)
}
#ifdef CONFIG_ZPOOL
+static unsigned long find_deferred_handle_obj(struct size_class *class,
+ struct page *page, int *obj_idx);
+
/*
* Free all the deferred handles whose objects are freed in zs_free.
*/
-static void free_handles(struct zs_pool *pool, struct zspage *zspage)
+static void free_handles(struct zs_pool *pool, struct size_class *class,
+ struct zspage *zspage)
{
- unsigned long handle = (unsigned long)zspage->deferred_handles;
+ int obj_idx = 0;
+ struct page *page = get_first_page(zspage);
+ unsigned long handle;
- while (handle) {
- unsigned long nxt_handle = handle_to_obj(handle);
+ while (1) {
+ handle = find_deferred_handle_obj(class, page, &obj_idx);
+ if (!handle) {
+ page = get_next_page(page);
+ if (!page)
+ break;
+ obj_idx = 0;
+ continue;
+ }
cache_free_handle(pool, handle);
- handle = nxt_handle;
+ obj_idx++;
}
}
#else
-static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {}
+static inline void free_handles(struct zs_pool *pool, struct size_class *class,
+ struct zspage *zspage) {}
#endif
static void __free_zspage(struct zs_pool *pool, struct size_class *class,
@@ -979,7 +1028,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
VM_BUG_ON(fg != ZS_EMPTY);
/* Free all deferred handles from zs_free */
- free_handles(pool, zspage);
+ free_handles(pool, class, zspage);
next = page = get_first_page(zspage);
do {
@@ -1067,7 +1116,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
#ifdef CONFIG_ZPOOL
INIT_LIST_HEAD(&zspage->lru);
zspage->under_reclaim = false;
- zspage->deferred_handles = NULL;
#endif
set_freeobj(zspage, 0);
@@ -1568,7 +1616,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(zs_malloc);
-static void obj_free(int class_size, unsigned long obj)
+static void obj_free(int class_size, unsigned long obj, unsigned long *handle)
{
struct link_free *link;
struct zspage *zspage;
@@ -1582,15 +1630,29 @@ static void obj_free(int class_size, unsigned long obj)
zspage = get_zspage(f_page);
vaddr = kmap_atomic(f_page);
-
- /* Insert this object in containing zspage's freelist */
link = (struct link_free *)(vaddr + f_offset);
- if (likely(!ZsHugePage(zspage)))
- link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
- else
- f_page->index = 0;
+
+ if (handle) {
+#ifdef CONFIG_ZPOOL
+ /* Stores the (deferred) handle in the object's header */
+ *handle |= OBJ_DEFERRED_HANDLE_TAG;
+ *handle &= ~OBJ_ALLOCATED_TAG;
+
+ if (likely(!ZsHugePage(zspage)))
+ link->deferred_handle = *handle;
+ else
+ f_page->index = *handle;
+#endif
+ } else {
+ /* Insert this object in containing zspage's freelist */
+ if (likely(!ZsHugePage(zspage)))
+ link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
+ else
+ f_page->index = 0;
+ set_freeobj(zspage, f_objidx);
+ }
+
kunmap_atomic(vaddr);
- set_freeobj(zspage, f_objidx);
mod_zspage_inuse(zspage, -1);
}
@@ -1615,7 +1677,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
zspage = get_zspage(f_page);
class = zspage_class(pool, zspage);
- obj_free(class->size, obj);
class_stat_dec(class, OBJ_USED, 1);
#ifdef CONFIG_ZPOOL
@@ -1624,15 +1685,15 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
* Reclaim needs the handles during writeback. It'll free
* them along with the zspage when it's done with them.
*
- * Record current deferred handle at the memory location
- * whose address is given by handle.
+ * Record current deferred handle in the object's header.
*/
- record_obj(handle, (unsigned long)zspage->deferred_handles);
- zspage->deferred_handles = (unsigned long *)handle;
+ obj_free(class->size, obj, &handle);
spin_unlock(&pool->lock);
return;
}
#endif
+ obj_free(class->size, obj, NULL);
+
fullness = fix_fullness_group(class, zspage);
if (fullness == ZS_EMPTY)
free_zspage(pool, class, zspage);
@@ -1713,11 +1774,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
}
/*
- * Find alloced object in zspage from index object and
+ * Find object with a certain tag in zspage from index object and
* return handle.
*/
-static unsigned long find_alloced_obj(struct size_class *class,
- struct page *page, int *obj_idx)
+static unsigned long find_tagged_obj(struct size_class *class,
+ struct page *page, int *obj_idx, int tag)
{
unsigned int offset;
int index = *obj_idx;
@@ -1728,7 +1789,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
offset += class->size * index;
while (offset < PAGE_SIZE) {
- if (obj_allocated(page, addr + offset, &handle))
+ if (obj_tagged(page, addr + offset, &handle, tag))
break;
offset += class->size;
@@ -1742,6 +1803,28 @@ static unsigned long find_alloced_obj(struct size_class *class,
return handle;
}
+/*
+ * Find alloced object in zspage from index object and
+ * return handle.
+ */
+static unsigned long find_alloced_obj(struct size_class *class,
+ struct page *page, int *obj_idx)
+{
+ return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);
+}
+
+#ifdef CONFIG_ZPOOL
+/*
+ * Find object storing a deferred handle in header in zspage from index object
+ * and return handle.
+ */
+static unsigned long find_deferred_handle_obj(struct size_class *class,
+ struct page *page, int *obj_idx)
+{
+ return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG);
+}
+#endif
+
struct zs_compact_control {
/* Source spage for migration which could be a subpage of zspage */
struct page *s_page;
@@ -1784,7 +1867,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
zs_object_copy(class, free_obj, used_obj);
obj_idx++;
record_obj(handle, free_obj);
- obj_free(class->size, used_obj);
+ obj_free(class->size, used_obj, NULL);
}
/* Remember last position in this iteration */
@@ -2478,6 +2561,90 @@ void zs_destroy_pool(struct zs_pool *pool)
EXPORT_SYMBOL_GPL(zs_destroy_pool);
#ifdef CONFIG_ZPOOL
+static void restore_freelist(struct zs_pool *pool, struct size_class *class,
+ struct zspage *zspage)
+{
+ unsigned int obj_idx = 0;
+ unsigned long handle, off = 0; /* off is within-page offset */
+ struct page *page = get_first_page(zspage);
+ struct link_free *prev_free = NULL;
+ void *prev_page_vaddr = NULL;
+
+ /* in case no free object found */
+ set_freeobj(zspage, (unsigned int)(-1UL));
+
+ while (page) {
+ void *vaddr = kmap_atomic(page);
+ struct page *next_page;
+
+ while (off < PAGE_SIZE) {
+ void *obj_addr = vaddr + off;
+
+ /* skip allocated object */
+ if (obj_allocated(page, obj_addr, &handle)) {
+ obj_idx++;
+ off += class->size;
+ continue;
+ }
+
+ /* free deferred handle from reclaim attempt */
+ if (obj_stores_deferred_handle(page, obj_addr, &handle))
+ cache_free_handle(pool, handle);
+
+ if (prev_free)
+ prev_free->next = obj_idx << OBJ_TAG_BITS;
+ else /* first free object found */
+ set_freeobj(zspage, obj_idx);
+
+ prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free);
+ /* if last free object in a previous page, need to unmap */
+ if (prev_page_vaddr) {
+ kunmap_atomic(prev_page_vaddr);
+ prev_page_vaddr = NULL;
+ }
+
+ obj_idx++;
+ off += class->size;
+ }
+
+ /*
+ * Handle the last (full or partial) object on this page.
+ */
+ next_page = get_next_page(page);
+ if (next_page) {
+ if (!prev_free || prev_page_vaddr) {
+ /*
+ * There is no free object in this page, so we can safely
+ * unmap it.
+ */
+ kunmap_atomic(vaddr);
+ } else {
+ /* update prev_page_vaddr since prev_free is on this page */
+ prev_page_vaddr = vaddr;
+ }
+ } else { /* this is the last page */
+ if (prev_free) {
+ /*
+ * Reset OBJ_TAG_BITS bit to last link to tell
+ * whether it's allocated object or not.
+ */
+ prev_free->next = -1UL << OBJ_TAG_BITS;
+ }
+
+ /* unmap previous page (if not done yet) */
+ if (prev_page_vaddr) {
+ kunmap_atomic(prev_page_vaddr);
+ prev_page_vaddr = NULL;
+ }
+
+ kunmap_atomic(vaddr);
+ }
+
+ page = next_page;
+ off %= PAGE_SIZE;
+ }
+}
+
static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
{
int i, obj_idx, ret = 0;
@@ -2561,6 +2728,12 @@ static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
return 0;
}
+ /*
+ * Eviction fails on one of the handles, so we need to restore zspage.
+ * We need to rebuild its freelist (and free stored deferred handles),
+ * put it back to the correct size class, and add it to the LRU list.
+ */
+ restore_freelist(pool, class, zspage);
putback_zspage(class, zspage);
list_add(&zspage->lru, &pool->lru);
unlock_zspage(zspage);