[PATCHv4,4/9] zram: Introduce recompress sysfs knob

Message ID 20221018045533.2396670-5-senozhatsky@chromium.org
State New
Headers
Series zram: Support multiple compression streams |

Commit Message

Sergey Senozhatsky Oct. 18, 2022, 4:55 a.m. UTC
  Allow zram to recompress (using secondary compression streams)
pages. We support the following modes:

1) IDLE pages recompression is activated by `idle` mode

	echo idle > /sys/block/zram0/recompress

2) Since there may be many idle pages, user-space may pass a size
watermark value (in bytes) and we will recompress only IDLE pages
of equal or greater size:

	echo 888 > /sys/block/zram0/recompress

3) HUGE pages recompression is activated by `huge` mode

	echo huge > /sys/block/zram0/recompress

4) HUGE_IDLE pages recompression is activated by `huge_idle` mode

	echo huge_idle > /sys/block/zram0/recompress

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
 drivers/block/zram/Kconfig    |  15 +++
 drivers/block/zram/zram_drv.c | 196 +++++++++++++++++++++++++++++++++-
 drivers/block/zram/zram_drv.h |   2 +
 3 files changed, 210 insertions(+), 3 deletions(-)
  

Comments

Minchan Kim Nov. 2, 2022, 9:06 p.m. UTC | #1
On Tue, Oct 18, 2022 at 01:55:28PM +0900, Sergey Senozhatsky wrote:
> Allow zram to recompress (using secondary compression streams)
> pages. We support three modes:
> 
> 1) IDLE pages recompression is activated by `idle` mode
> 
> 	echo idle > /sys/block/zram0/recompress
> 
> 2) Since there may be many idle pages user-space may pass a size
> watermark value (in bytes) and we will recompress IDLE pages only
> of equal or greater size:
> 
> 	echo 888 > /sys/block/zram0/recompress

Hmm, how about having a separate knob for the threshold?

recompress_threshold

With that, we could make recompress 888 and idle/huge
as well as only 888.

  echo 888 > /sys/block/zram0/recompress_threshold
  echo 1 > /sys/block/zram0/recompress

  or

  echo 888 > /sys/block/zram0/recompress_threshold
  echo idle > /sys/block/zram0/recompress

or we can introduce the threshold with action item.
  
  echo "idle 888" > /sys/block/zram0/recompress
  echo "huge 888" > /sys/block/zram0/recompress
  echo "normal 888" > /sys/block/zram0/recompress

> 
> 3) HUGE pages recompression is activated by `huge` mode
> 
> 	echo huge > /sys/block/zram0/recompress
> 
> 4) HUGE_IDLE pages recompression is activated by `huge_idle` mode
> 
> 	echo huge_idle > /sys/block/zram0/recompress
> 
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> ---
>  drivers/block/zram/Kconfig    |  15 +++
>  drivers/block/zram/zram_drv.c | 196 +++++++++++++++++++++++++++++++++-
>  drivers/block/zram/zram_drv.h |   2 +
>  3 files changed, 210 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
> index d4100b0c083e..3e00656a6f8a 100644
> --- a/drivers/block/zram/Kconfig
> +++ b/drivers/block/zram/Kconfig
> @@ -78,3 +78,18 @@ config ZRAM_MEMORY_TRACKING
>  	  /sys/kernel/debug/zram/zramX/block_state.
>  
>  	  See Documentation/admin-guide/blockdev/zram.rst for more information.
> +
> +config ZRAM_MULTI_COMP
> +	bool "Enable multiple per-CPU compression streams"

per-CPU is an implementation detail. Let's not mention it.

> +	depends on ZRAM
> +	help
> +	This will enable per-CPU multi-compression streams, so that ZRAM

      indentation

> +	can re-compress IDLE/huge pages, using a potentially slower but
> +	more effective compression algorithm. Note, that IDLE page support
> +	requires ZRAM_MEMORY_TRACKING.
> +
> +          echo TIMEOUT > /sys/block/zramX/idle
> +          echo SIZE > /sys/block/zramX/recompress
> +
> +	SIZE (in bytes) parameter sets the object size watermark: idle
> +	objects that are of a smaller size will not get recompressed.
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 94c62d7ea818..da11560ecf70 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -1282,6 +1282,12 @@ static void zram_free_page(struct zram *zram, size_t index)
>  		atomic64_dec(&zram->stats.huge_pages);
>  	}
>  
> +	if (zram_test_flag(zram, index, ZRAM_RECOMP))
> +		zram_clear_flag(zram, index, ZRAM_RECOMP);
> +
> +	if (zram_test_flag(zram, index, ZRAM_RECOMP_SKIP))
> +		zram_clear_flag(zram, index, ZRAM_RECOMP_SKIP);

Let's squeeze the comp algo index into the meta area since we have
some room for the bits. Then we could remove the two
recomp-specific flags?

I am thinking of making the feature work on incoming pages on the
fly, not only via the manual recompress knob, so it would be good
if we could make the interface abstract enough to work something
like this (I may be missing a reason why we couldn't; I need more
time to look into it)

zram_bvec_write:

    for (i = 0; i < MAX_COMP_ALGO; i++) {
        zstrm = zcomp_stream_get(i);
        zcomp_compress(src, &comp_len);
        if (comp_len > threshold) {
            zcomp_stream_put(i);
            continue;
        }
        break;
    }

zram_bvec_read:
    algo_idx = zram_get_algo_idx(zram, index);
    zstrm = zcomp_stream_get(zram, algo_idx);
    zcomp_decompress(zstrm);
    zcomp_stream_put(zram, algo_idx);


> +
>  	if (zram_test_flag(zram, index, ZRAM_WB)) {
>  		zram_clear_flag(zram, index, ZRAM_WB);
>  		free_block_bdev(zram, zram_get_element(zram, index));
> @@ -1343,6 +1349,7 @@ static int zram_read_from_zspool(struct zram *zram, struct page *page,
>  	unsigned long handle;
>  	unsigned int size;
>  	void *src, *dst;
> +	u32 idx;
>  	int ret;
>  
>  	handle = zram_get_handle(zram, index);
> @@ -1359,8 +1366,13 @@ static int zram_read_from_zspool(struct zram *zram, struct page *page,
>  
>  	size = zram_get_obj_size(zram, index);
>  
> -	if (size != PAGE_SIZE)
> -		zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_ZCOMP]);
> +	if (size != PAGE_SIZE) {
> +		idx = ZRAM_PRIMARY_ZCOMP;
> +		if (zram_test_flag(zram, index, ZRAM_RECOMP))
> +			idx = ZRAM_SECONDARY_ZCOMP;
> +
> +		zstrm = zcomp_stream_get(zram->comps[idx]);
> +	}
>  
>  	src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
>  	if (size == PAGE_SIZE) {
> @@ -1372,7 +1384,7 @@ static int zram_read_from_zspool(struct zram *zram, struct page *page,
>  		dst = kmap_atomic(page);
>  		ret = zcomp_decompress(zstrm, src, size, dst);
>  		kunmap_atomic(dst);
> -		zcomp_stream_put(zram->comps[ZRAM_PRIMARY_ZCOMP]);
> +		zcomp_stream_put(zram->comps[idx]);
>  	}
>  	zs_unmap_object(zram->mem_pool, handle);
>  	return ret;
> @@ -1603,6 +1615,182 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
>  	return ret;
>  }
>  
> +#ifdef CONFIG_ZRAM_MULTI_COMP
> +/*
> + * This function will decompress (unless it's ZRAM_HUGE) the page and then
> + * attempt to compress it using secondary compression algorithm (which is
> + * potentially more effective).
> + *
> + * Corresponding ZRAM slot should be locked.
> + */
> +static int zram_recompress(struct zram *zram, u32 index, struct page *page,
> +			   int size_watermark)
> +{
> +	unsigned long handle_prev;
> +	unsigned long handle_next;
> +	unsigned int comp_len_next;
> +	unsigned int comp_len_prev;

How about orig_handle and new_handle with orig_comp_len and new_comp_len?

> +	struct zcomp_strm *zstrm;
> +	void *src, *dst;
> +	int ret;
> +
> +	handle_prev = zram_get_handle(zram, index);
> +	if (!handle_prev)
> +		return -EINVAL;
> +
> +	comp_len_prev = zram_get_obj_size(zram, index);
> +	/*
> +	 * Do not recompress objects that are already "small enough".
> +	 */
> +	if (comp_len_prev < size_watermark)
> +		return 0;
> +
> +	ret = zram_read_from_zspool(zram, page, index);
> +	if (ret)
> +		return ret;
> +
> +	zstrm = zcomp_stream_get(zram->comps[ZRAM_SECONDARY_ZCOMP]);
> +	src = kmap_atomic(page);
> +	ret = zcomp_compress(zstrm, src, &comp_len_next);
> +	kunmap_atomic(src);
> +
> +	/*
> +	 * Either a compression error or we failed to compressed the object
> +	 * in a way that will save us memory. Mark the object so that we
> +	 * don't attempt to re-compress it again (RECOMP_SKIP).
> +	 */
> +	if (comp_len_next >= huge_class_size ||
> +	    comp_len_next >= comp_len_prev ||
> +	    ret) {
> +		zram_set_flag(zram, index, ZRAM_RECOMP_SKIP);
> +		zram_clear_flag(zram, index, ZRAM_IDLE);
> +		zcomp_stream_put(zram->comps[ZRAM_SECONDARY_ZCOMP]);
> +		return ret;
> +	}
> +
> +	/*
> +	 * No direct reclaim (slow path) for handle allocation and no
> +	 * re-compression attempt (unlike in __zram_bvec_write()) since
> +	 * we already have stored that object in zsmalloc. If we cannot
> +	 * alloc memory for recompressed object then we bail out and
> +	 * simply keep the old (existing) object in zsmalloc.
> +	 */
> +	handle_next = zs_malloc(zram->mem_pool, comp_len_next,
> +				__GFP_KSWAPD_RECLAIM |
> +				__GFP_NOWARN |
> +				__GFP_HIGHMEM |
> +				__GFP_MOVABLE);
> +	if (IS_ERR((void *)handle_next)) {
> +		zcomp_stream_put(zram->comps[ZRAM_SECONDARY_ZCOMP]);
> +		return PTR_ERR((void *)handle_next);
> +	}
> +
> +	dst = zs_map_object(zram->mem_pool, handle_next, ZS_MM_WO);
> +	memcpy(dst, zstrm->buffer, comp_len_next);
> +	zcomp_stream_put(zram->comps[ZRAM_SECONDARY_ZCOMP]);
> +
> +	zs_unmap_object(zram->mem_pool, handle_next);
> +
> +	zram_free_page(zram, index);
> +	zram_set_handle(zram, index, handle_next);
> +	zram_set_obj_size(zram, index, comp_len_next);
> +
> +	zram_set_flag(zram, index, ZRAM_RECOMP);
> +	atomic64_add(comp_len_next, &zram->stats.compr_data_size);
> +	atomic64_inc(&zram->stats.pages_stored);
> +
> +	return 0;
> +}
> +
> +#define RECOMPRESS_IDLE		(1 << 0)
> +#define RECOMPRESS_HUGE		(1 << 1)
> +
> +static ssize_t recompress_store(struct device *dev,
> +				struct device_attribute *attr,
> +				const char *buf, size_t len)
> +{
> +	struct zram *zram = dev_to_zram(dev);
> +	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
> +	unsigned long index;
> +	struct page *page;
> +	ssize_t ret;
> +	int mode, size_watermark = 0;
> +
> +	if (sysfs_streq(buf, "idle")) {
> +		mode = RECOMPRESS_IDLE;
> +	} else if (sysfs_streq(buf, "huge")) {
> +		mode = RECOMPRESS_HUGE;
> +	} else if (sysfs_streq(buf, "huge_idle")) {
> +		mode = RECOMPRESS_IDLE | RECOMPRESS_HUGE;
> +	} else {
> +		/*
> +		 * We will re-compress only idle objects equal or greater
> +		 * in size than watermark.
> +		 */
> +		ret = kstrtoint(buf, 10, &size_watermark);
> +		if (ret)
> +			return ret;
> +		mode = RECOMPRESS_IDLE;
> +	}
> +
> +	if (size_watermark > PAGE_SIZE)

nit: How about threshold instead of watermark? 

> +		return -EINVAL;
> +
> +	down_read(&zram->init_lock);
> +	if (!init_done(zram)) {
> +		ret = -EINVAL;
> +		goto release_init_lock;
> +	}
> +
> +	page = alloc_page(GFP_KERNEL);
> +	if (!page) {
> +		ret = -ENOMEM;
> +		goto release_init_lock;
> +	}
> +
> +	ret = len;
> +	for (index = 0; index < nr_pages; index++) {
> +		int err = 0;
> +
> +		zram_slot_lock(zram, index);
> +
> +		if (!zram_allocated(zram, index))
> +			goto next;
> +
> +		if (mode & RECOMPRESS_IDLE &&
> +		    !zram_test_flag(zram, index, ZRAM_IDLE))
> +			goto next;
> +
> +		if (mode & RECOMPRESS_HUGE &&
> +		    !zram_test_flag(zram, index, ZRAM_HUGE))
> +			goto next;
> +
> +		if (zram_test_flag(zram, index, ZRAM_WB) ||
> +		    zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
> +		    zram_test_flag(zram, index, ZRAM_SAME) ||
> +		    zram_test_flag(zram, index, ZRAM_RECOMP) ||
> +		    zram_test_flag(zram, index, ZRAM_RECOMP_SKIP))
> +			goto next;
> +
> +		err = zram_recompress(zram, index, page, size_watermark);
> +next:
> +		zram_slot_unlock(zram, index);
> +		if (err) {
> +			ret = err;
> +			break;
> +		}
> +
> +		cond_resched();
> +	}
> +
> +	__free_page(page);
> +
> +release_init_lock:
> +	up_read(&zram->init_lock);
> +	return ret;
> +}
> +#endif
> +
>  /*
>   * zram_bio_discard - handler on discard request
>   * @index: physical block index in PAGE_SIZE units
> @@ -1983,6 +2171,7 @@ static DEVICE_ATTR_RW(writeback_limit_enable);
>  #endif
>  #ifdef CONFIG_ZRAM_MULTI_COMP
>  static DEVICE_ATTR_RW(recomp_algorithm);
> +static DEVICE_ATTR_WO(recompress);
>  #endif
>  
>  static struct attribute *zram_disk_attrs[] = {
> @@ -2009,6 +2198,7 @@ static struct attribute *zram_disk_attrs[] = {
>  	&dev_attr_debug_stat.attr,
>  #ifdef CONFIG_ZRAM_MULTI_COMP
>  	&dev_attr_recomp_algorithm.attr,
> +	&dev_attr_recompress.attr,
>  #endif
>  	NULL,
>  };
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index 4044ddbb2326..09b9ceb5dfa3 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -49,6 +49,8 @@ enum zram_pageflags {
>  	ZRAM_UNDER_WB,	/* page is under writeback */
>  	ZRAM_HUGE,	/* Incompressible page */
>  	ZRAM_IDLE,	/* not accessed page since last idle marking */
> +	ZRAM_RECOMP,	/* page was recompressed */
> +	ZRAM_RECOMP_SKIP, /* secondary algorithm cannot compress this page */
>  
>  	__NR_ZRAM_PAGEFLAGS,
>  };
> -- 
> 2.38.0.413.g74048e4d9e-goog
> 
>
  
Sergey Senozhatsky Nov. 3, 2022, 3:25 a.m. UTC | #2
On (22/11/02 14:06), Minchan Kim wrote:
> On Tue, Oct 18, 2022 at 01:55:28PM +0900, Sergey Senozhatsky wrote:
> > Allow zram to recompress (using secondary compression streams)
> > pages. We support three modes:
> > 
> > 1) IDLE pages recompression is activated by `idle` mode
> > 
> > 	echo idle > /sys/block/zram0/recompress
> > 
> > 2) Since there may be many idle pages user-space may pass a size
> > watermark value (in bytes) and we will recompress IDLE pages only
> > of equal or greater size:
> > 
> > 	echo 888 > /sys/block/zram0/recompress
> 
> Hmm, how about having seperate knob for threshold?

Per my understanding this threshold can change quite often,
depending on memory pressure and so on. So we may force
user-space to issue more syscalls, without any gain in
simplicity.

> recompress_threshold
> 
> With that, we could make rescompress 888 and idle/huge
> as well as only 888.
> 
>   echo 888 > /sys/block/zram0/recompress_threshold
>   echo 1 > /sys/block/zram0/recompress
> 
>   or
> 
>   echo 888 > /sys/block/zram0/recompress_threshold
>   echo idle > /sys/block/zram0/recompress
> 
> or we can introduce the threshold with action item.
>   
>   echo "idle 888" > /sys/block/zram0/recompress
>   echo "huge 888" > /sys/block/zram0/recompress
>   echo "normal 888" > /sys/block/zram0/recompress

I like the latter one, when threshold is an optional argument.
I probably would even go a bit further and add keywords:

	type=STRING threshold=INT

> > 3) HUGE pages recompression is activated by `huge` mode
> > 
> > 	echo huge > /sys/block/zram0/recompress
> > 
> > 4) HUGE_IDLE pages recompression is activated by `huge_idle` mode
> > 
> > 	echo huge_idle > /sys/block/zram0/recompress
> > 
> > Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> > ---
> >  drivers/block/zram/Kconfig    |  15 +++
> >  drivers/block/zram/zram_drv.c | 196 +++++++++++++++++++++++++++++++++-
> >  drivers/block/zram/zram_drv.h |   2 +
> >  3 files changed, 210 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
> > index d4100b0c083e..3e00656a6f8a 100644
> > --- a/drivers/block/zram/Kconfig
> > +++ b/drivers/block/zram/Kconfig
> > @@ -78,3 +78,18 @@ config ZRAM_MEMORY_TRACKING
> >  	  /sys/kernel/debug/zram/zramX/block_state.
> >  
> >  	  See Documentation/admin-guide/blockdev/zram.rst for more information.
> > +
> > +config ZRAM_MULTI_COMP
> > +	bool "Enable multiple per-CPU compression streams"
> 
> per-CPU is implementation detail. Let's do not mention it.

OK.

> > +	depends on ZRAM
> > +	help
> > +	This will enable per-CPU multi-compression streams, so that ZRAM
> 
>       indentation

OK. A question: does this matter? I don't see any problems in menuconfig.

> > +	can re-compress IDLE/huge pages, using a potentially slower but
> > +	more effective compression algorithm. Note, that IDLE page support
> > +	requires ZRAM_MEMORY_TRACKING.
> > +
> > +          echo TIMEOUT > /sys/block/zramX/idle
> > +          echo SIZE > /sys/block/zramX/recompress
> > +
> > +	SIZE (in bytes) parameter sets the object size watermark: idle
> > +	objects that are of a smaller size will not get recompressed.
> > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > index 94c62d7ea818..da11560ecf70 100644
> > --- a/drivers/block/zram/zram_drv.c
> > +++ b/drivers/block/zram/zram_drv.c
> > @@ -1282,6 +1282,12 @@ static void zram_free_page(struct zram *zram, size_t index)
> >  		atomic64_dec(&zram->stats.huge_pages);
> >  	}
> >  
> > +	if (zram_test_flag(zram, index, ZRAM_RECOMP))
> > +		zram_clear_flag(zram, index, ZRAM_RECOMP);
> > +
> > +	if (zram_test_flag(zram, index, ZRAM_RECOMP_SKIP))
> > +		zram_clear_flag(zram, index, ZRAM_RECOMP_SKIP);
> 
> Let's squeeze the comp algo index into meta area since we have
> some rooms for the bits. Then can we could remove the specific
> recomp two flags?

What is meta area?

> I am thinking the feature to work incoming pages on the fly,
> not only for recompress manual knob so it would be good
> if we could make the interface abstract to work something
> like this(I may miss something why we couldn't. I need more
> time to look into then)
>
> zram_bvec_write:
> 
>     for (i = 0; i < MAX_COMP_ALGO; i++) {
>         zstrm = zcomp_stream_get(i);
>         zcomp_compress(src, &comp_len);
>         if (comp_len > threshold) {
>             zcomp_stream_put(i);
>             continue;
>         }
>         break;
>     }
> 
> zram_bvec_read:
>     algo_idx = zram_get_algo_idx(zram, index);
>     zstrm = zcomp_stream_get(zram, algo_idx);
>     zcomp_decompress(zstrm);
>     zcomp_stream_put(zram, algo_idx);

Hmm. This is something that should not be enabled by default.
N compressions per every stored page is very very CPU and
power intensive. We definitely want a way to have recompression
as a user-space event, which gives all sorts of flexibility and
extensibility. ZRAM doesn't (and should not) know about too many
things, so ZRAM can't make good decisions (and probably should not
try). User-space can make good decisions on the other hand.

So recompression for us is not something that happens all the time,
unconditionally. It's something that happens sometimes, depending on
the situation on the host.

[..]
> > +static int zram_recompress(struct zram *zram, u32 index, struct page *page,
> > +			   int size_watermark)
> > +{
> > +	unsigned long handle_prev;
> > +	unsigned long handle_next;
> > +	unsigned int comp_len_next;
> > +	unsigned int comp_len_prev;
> 
> How about orig_handle and new_nandle with orig_comp_len and new_comp_len?

No opinion. Can we have prev and next? :)

[..]
> > +	if (sysfs_streq(buf, "idle")) {
> > +		mode = RECOMPRESS_IDLE;
> > +	} else if (sysfs_streq(buf, "huge")) {
> > +		mode = RECOMPRESS_HUGE;
> > +	} else if (sysfs_streq(buf, "huge_idle")) {
> > +		mode = RECOMPRESS_IDLE | RECOMPRESS_HUGE;
> > +	} else {
> > +		/*
> > +		 * We will re-compress only idle objects equal or greater
> > +		 * in size than watermark.
> > +		 */
> > +		ret = kstrtoint(buf, 10, &size_watermark);
> > +		if (ret)
> > +			return ret;
> > +		mode = RECOMPRESS_IDLE;
> > +	}
> > +
> > +	if (size_watermark > PAGE_SIZE)
> 
> nit: How about threshold instead of watermark? 

OK. MM uses watermarks everywhere so I just used the same term.
Can change to threshold.
  
Sergey Senozhatsky Nov. 3, 2022, 6:03 a.m. UTC | #3
On (22/11/03 12:25), Sergey Senozhatsky wrote:
> > or we can introduce the threshold with action item.
> >   
> >   echo "idle 888" > /sys/block/zram0/recompress
> >   echo "huge 888" > /sys/block/zram0/recompress
> >   echo "normal 888" > /sys/block/zram0/recompress
> 
> I like the latter one, when threshold is an optional argument.
> I probably would even go a bit further and add keywords:
> 
> 	type=STRING threshold=INT

E.g. recompress support for type= and optional threshold=

We kind of don't have a use case of type=normal, as it is an equivalent
of no type. So we have huge, idle, huge_idle and no param means all
pages (which is sort of logical). threshold is optional.

---
 drivers/block/zram/zram_drv.c | 55 ++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9a614253de07..12f03745baf9 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1688,7 +1688,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
  * Corresponding ZRAM slot should be locked.
  */
 static int zram_recompress(struct zram *zram, u32 index, struct page *page,
-			   int size_watermark)
+			   int size_threshold)
 {
 	unsigned long handle_prev;
 	unsigned long handle_next;
@@ -1708,7 +1708,7 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page,
 	/*
 	 * Do not recompress objects that are already "small enough".
 	 */
-	if (comp_len_prev < size_watermark)
+	if (comp_len_prev < size_threshold)
 		return 0;
 
 	ret = zram_read_from_zspool(zram, page, index);
@@ -1780,29 +1780,42 @@ static ssize_t recompress_store(struct device *dev,
 {
 	struct zram *zram = dev_to_zram(dev);
 	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
+	char *args, *param, *val;
 	unsigned long index;
 	struct page *page;
 	ssize_t ret;
-	int mode, size_watermark = 0;
-
-	if (sysfs_streq(buf, "idle")) {
-		mode = RECOMPRESS_IDLE;
-	} else if (sysfs_streq(buf, "huge")) {
-		mode = RECOMPRESS_HUGE;
-	} else if (sysfs_streq(buf, "huge_idle")) {
-		mode = RECOMPRESS_IDLE | RECOMPRESS_HUGE;
-	} else {
-		/*
-		 * We will re-compress only idle objects equal or greater
-		 * in size than watermark.
-		 */
-		ret = kstrtoint(buf, 10, &size_watermark);
-		if (ret)
-			return ret;
-		mode = RECOMPRESS_IDLE;
+	int mode = 0, size_threshold = 0;
+
+	args = skip_spaces(buf);
+	while (*args) {
+		args = next_arg(args, &param, &val);
+
+		if (!*val)
+			return -EINVAL;
+
+		if (!strcmp(param, "type")) {
+			if (!strcmp(val, "idle"))
+				mode = RECOMPRESS_IDLE;
+			if (!strcmp(val, "huge"))
+				mode = RECOMPRESS_HUGE;
+			if (!strcmp(val, "huge_idle"))
+				mode = RECOMPRESS_IDLE | RECOMPRESS_HUGE;
+			continue;
+		}
+
+		if (!strcmp(param, "threshold")) {
+			/*
+			 * We will re-compress only idle objects equal or
+			 * greater in size than watermark.
+			 */
+			ret = kstrtoint(val, 10, &size_threshold);
+			if (ret)
+				return ret;
+			continue;
+		}
 	}
 
-	if (size_watermark > PAGE_SIZE)
+	if (size_threshold > PAGE_SIZE)
 		return -EINVAL;
 
 	down_read(&zram->init_lock);
@@ -1841,7 +1854,7 @@ static ssize_t recompress_store(struct device *dev,
 		    zram_test_flag(zram, index, ZRAM_RECOMP_SKIP))
 			goto next;
 
-		err = zram_recompress(zram, index, page, size_watermark);
+		err = zram_recompress(zram, index, page, size_threshold);
 next:
 		zram_slot_unlock(zram, index);
 		if (err) {
  
Minchan Kim Nov. 3, 2022, 5 p.m. UTC | #4
On Thu, Nov 03, 2022 at 12:25:43PM +0900, Sergey Senozhatsky wrote:
> On (22/11/02 14:06), Minchan Kim wrote:
> > On Tue, Oct 18, 2022 at 01:55:28PM +0900, Sergey Senozhatsky wrote:
> > > Allow zram to recompress (using secondary compression streams)
> > > pages. We support three modes:
> > > 
> > > 1) IDLE pages recompression is activated by `idle` mode
> > > 
> > > 	echo idle > /sys/block/zram0/recompress
> > > 
> > > 2) Since there may be many idle pages user-space may pass a size
> > > watermark value (in bytes) and we will recompress IDLE pages only
> > > of equal or greater size:
> > > 
> > > 	echo 888 > /sys/block/zram0/recompress
> > 
> > Hmm, how about having seperate knob for threshold?
> 
> Per-my understanding this threshold can change quite often,
> depending on memory pressure and so on. So we may force
> user-space to issues more syscalls, without any gain in
> simplicity.

Sorry, I didn't understand your point. Let me clarify my idea.
If we have a separate knob for the recompress threshold, it could
work like this.

# recompress any compressed pages which is greater than 888 bytes.
echo 888 > /sys/block/zram0/recompress_threshold

# try to compress any pages greater than the threshold with the
# following algorithm.

echo "type=lzo priority=1" > /sys/block/zram0/recompress_algo
echo "type=zstd priority=2" > /sys/block/zram0/recompress_algo
echo "type=deflate priority=3" > /sys/block/zram0/recompress_algo

> 
> > recompress_threshold
> > 
> > With that, we could make rescompress 888 and idle/huge
> > as well as only 888.
> > 
> >   echo 888 > /sys/block/zram0/recompress_threshold
> >   echo 1 > /sys/block/zram0/recompress
> > 
> >   or
> > 
> >   echo 888 > /sys/block/zram0/recompress_threshold
> >   echo idle > /sys/block/zram0/recompress
> > 
> > or we can introduce the threshold with action item.
> >   
> >   echo "idle 888" > /sys/block/zram0/recompress
> >   echo "huge 888" > /sys/block/zram0/recompress
> >   echo "normal 888" > /sys/block/zram0/recompress
> 
> I like the latter one, when threshold is an optional argument.
> I probably would even go a bit further and add keywords:
> 
> 	type=STRING threshold=INT

Yeah, keywords would be better. Let's discuss whether we need an
optional threshold argument for each algo, or just one threshold
for every secondary algorithm, or both.

> 
> > > 3) HUGE pages recompression is activated by `huge` mode
> > > 
> > > 	echo huge > /sys/block/zram0/recompress
> > > 
> > > 4) HUGE_IDLE pages recompression is activated by `huge_idle` mode
> > > 
> > > 	echo huge_idle > /sys/block/zram0/recompress
> > > 
> > > Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> > > ---
> > >  drivers/block/zram/Kconfig    |  15 +++
> > >  drivers/block/zram/zram_drv.c | 196 +++++++++++++++++++++++++++++++++-
> > >  drivers/block/zram/zram_drv.h |   2 +
> > >  3 files changed, 210 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
> > > index d4100b0c083e..3e00656a6f8a 100644
> > > --- a/drivers/block/zram/Kconfig
> > > +++ b/drivers/block/zram/Kconfig
> > > @@ -78,3 +78,18 @@ config ZRAM_MEMORY_TRACKING
> > >  	  /sys/kernel/debug/zram/zramX/block_state.
> > >  
> > >  	  See Documentation/admin-guide/blockdev/zram.rst for more information.
> > > +
> > > +config ZRAM_MULTI_COMP
> > > +	bool "Enable multiple per-CPU compression streams"
> > 
> > per-CPU is implementation detail. Let's do not mention it.
> 
> OK.
> 
> > > +	depends on ZRAM
> > > +	help
> > > +	This will enable per-CPU multi-compression streams, so that ZRAM
> > 
> >       indentation
> 
> OK. A question: does this matter? I don't see any problems in menuconfig.
> 
> > > +	can re-compress IDLE/huge pages, using a potentially slower but
> > > +	more effective compression algorithm. Note, that IDLE page support
> > > +	requires ZRAM_MEMORY_TRACKING.
> > > +
> > > +          echo TIMEOUT > /sys/block/zramX/idle
> > > +          echo SIZE > /sys/block/zramX/recompress
> > > +
> > > +	SIZE (in bytes) parameter sets the object size watermark: idle
> > > +	objects that are of a smaller size will not get recompressed.
> > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > index 94c62d7ea818..da11560ecf70 100644
> > > --- a/drivers/block/zram/zram_drv.c
> > > +++ b/drivers/block/zram/zram_drv.c
> > > @@ -1282,6 +1282,12 @@ static void zram_free_page(struct zram *zram, size_t index)
> > >  		atomic64_dec(&zram->stats.huge_pages);
> > >  	}
> > >  
> > > +	if (zram_test_flag(zram, index, ZRAM_RECOMP))
> > > +		zram_clear_flag(zram, index, ZRAM_RECOMP);
> > > +
> > > +	if (zram_test_flag(zram, index, ZRAM_RECOMP_SKIP))
> > > +		zram_clear_flag(zram, index, ZRAM_RECOMP_SKIP);
> > 
> > Let's squeeze the comp algo index into meta area since we have
> > some rooms for the bits. Then can we could remove the specific
> > recomp two flags?
> 
> What is meta area?

zram->table[index].flags

If we squeeze the algorithm index, we could work like this
without ZRAM_RECOMP_SKIP.

read_block_state
    zram_algo_idx(zram, index) > 0 ? 'r' : '.');

zram_read_from_zpool
    if (zram_algo_idx(zram, idx) != 0)
        idx = 1;

zram_recompress
    ..
    we don't need to set ZRAM_RECOMP since every meta will have the algo
    idx.

zram_free_page
    zram_clear_algo(zram, index);

recompress_store
    int algo_idx = zram_algo_idx(zram, index);

    if (algo_idx == max_algo_idx)
        goto next

> 
> > I am thinking the feature to work incoming pages on the fly,
> > not only for recompress manual knob so it would be good
> > if we could make the interface abstract to work something
> > like this(I may miss something why we couldn't. I need more
> > time to look into then)
> >
> > zram_bvec_write:
> > 
> >     for (i = 0; i < MAX_COMP_ALGO; i++) {
> >         zstrm = zcomp_stream_get(i);
> >         zcomp_compress(src, &comp_len);
> >         if (comp_len > threshold) {
> >             zcomp_stream_put(i);
> >             continue;
> >         }
> >         break;
> >     }
> > 
> > zram_bvec_read:
> >     algo_idx = zram_get_algo_idx(zram, index);
> >     zstrm = zcomp_stream_get(zram, algo_idx);
> >     zcomp_decompress(zstrm);
> >     zcomp_stream_put(zram, algo_idx);
> 
> Hmm. This is something that should not be enabled by default.

Exactly. I don't mean to enable by default, either.

> N compressions per every stored page is very very CPU and
> power intensive. We definitely want a way to have recompression
> as a user-space event, which gives all sorts of flexibility and
> extensibility. ZRAM doesn't (and should not) know about too many
> things, so ZRAM can't make good decisions (and probably should not
> try). User-space can make good decisions on the other hand.
> 
> So recompression for us is not something that happens all the time,
> unconditionally. It's something that happens sometimes, depending on
> the situation on the host.

Totally agree. I am not saying we should enable the feature by
default, but we should at least consider it for the future. I have
something in mind that would be useful later.

> 
> [..]
> > > +static int zram_recompress(struct zram *zram, u32 index, struct page *page,
> > > +			   int size_watermark)
> > > +{
> > > +	unsigned long handle_prev;
> > > +	unsigned long handle_next;
> > > +	unsigned int comp_len_next;
> > > +	unsigned int comp_len_prev;
> > 
> > How about orig_handle and new_nandle with orig_comp_len and new_comp_len?
> 
> No opinion. Can we have prev and next? :)

prev and next give the impression of a position, something like a
list. orig and new give the impression of stale and fresh.

We are doing the latter here.
  
Sergey Senozhatsky Nov. 4, 2022, 3:48 a.m. UTC | #5
On (22/11/03 10:00), Minchan Kim wrote:
[..]
> > Per-my understanding this threshold can change quite often,
> > depending on memory pressure and so on. So we may force
> > user-space to issues more syscalls, without any gain in
> > simplicity.
> 
> Sorry, didn't understand your point. Let me clarify my idea.
> If we have separate knob for recompress thresh hold, we could
> work like this.
> 
> # recompress any compressed pages which is greater than 888 bytes.
> echo 888 > /sys/block/zram0/recompress_threshold
> 
> # try to compress any pages greather than threshold with following
> # algorithm.
> 
> echo "type=lzo priority=1" > /sys/block/zram0/recompress_algo
> echo "type=zstd priority=2" > /sys/block/zram0/recompress_algo
> echo "type=deflate priority=3" > /sys/block/zram0/recompress_algo

OK. We can always add more sysfs knobs and make threshold a global
per-device value.

I think I prefer the approach when threshold is part of the current
recompress context, not something derived form another context. That
is, when all values (page type, threshold, possibly algorithm index)
are submitted by user-space for this particular recompression

	echo "type=huge threshold=3000 ..." > recompress

If threshold is a global value that is applied to all recompress calls
then how does user-space say no-threshold? For instance, when it wants
to recompress only huge pages. It probably still needs to supply something
like threshold=0. So my personal preference for now - keep threshold
as a context dependent value.

Another thing that I like about threshold= being context dependent
is that then we don't need to protect recompression against concurrent
global threshold modifications with lock and so on. It keeps things
simpler.

[..]
> > > Let's squeeze the comp algo index into meta area since we have
> > > some rooms for the bits. Then can we could remove the specific
> > > recomp two flags?
> > 
> > What is meta area?
> 
> zram->table[index].flags
> 
> If we squeeze the algorithm index, we could work like this
> without ZRAM_RECOMP_SKIP.

We still need ZRAM_RECOMP_SKIP. Recompression may fail to compress the
object further: sometimes we can get a recompressed object that is larger
than the original one, sometimes of the same size, sometimes of a smaller
size but still belonging to the same size class, which doesn't save us
any memory. Without ZRAM_RECOMP_SKIP we will continue re-compressing
objects that are incompressible (in a way that saves us memory in
zsmalloc) by any of the ZRAM's algorithms.

> read_block_state
>     zram_algo_idx(zram, index) > 0 ? 'r' : '.');
> 
> zram_read_from_zpool
>     if (zram_algo_idx(zram, idx) != 0)
>         idx = 1;

As an idea, maybe we can store everything re-compression related
in a dedicated meta field? SKIP flag, algorithm ID, etc.

We don't have too many bits left in ->flags on 32-bit systems. We
currently probably need at least 3 bits - one for RECOMP_SKIP and at
least 2 for the algorithm ID. 2 bits for the algorithm ID put us into a
situation where we can have only 00, 01, 10, 11 as IDs, that is a
maximum of 3 recompress algorithms: 00 is the primary one and the rest
are alternative ones. A maximum of 3 re-compression algorithms sounds
like a reasonable max value to me. Yeah, maybe we can use flags bits for it.

[..]
> > > zram_bvec_read:
> > >     algo_idx = zram_get_algo_idx(zram, index);
> > >     zstrm = zcomp_stream_get(zram, algo_idx);
> > >     zcomp_decompress(zstrm);
> > >     zcomp_stream_put(zram, algo_idx);
> > 
> > Hmm. This is something that should not be enabled by default.
> 
> Exactly. I don't mean to enable by default, either.

OK.

> > N compressions per every stored page is very very CPU and
> > power intensive. We definitely want a way to have recompression
> > as a user-space event, which gives all sorts of flexibility and
> > extensibility. ZRAM doesn't (and should not) know about too many
> > things, so ZRAM can't make good decisions (and probably should not
> > try). User-space can make good decisions on the other hand.
> > 
> > So recompression for us is not something that happens all the time,
> > unconditionally. It's something that happens sometimes, depending on
> > the situation on the host.
> 
> Totally agree. I am not saying we should enable the feature by default
> but at lesat consider it for the future. I have something in mind to
> be useful later.

OK.

> > [..]
> > > > +static int zram_recompress(struct zram *zram, u32 index, struct page *page,
> > > > +			   int size_watermark)
> > > > +{
> > > > +	unsigned long handle_prev;
> > > > +	unsigned long handle_next;
> > > > +	unsigned int comp_len_next;
> > > > +	unsigned int comp_len_prev;
> > > 
> > > How about orig_handle and new_nandle with orig_comp_len and new_comp_len?
> > 
> > No opinion. Can we have prev and next? :)
> 
> prev and next gives the impression position something like list.
> orig and new gives the impression stale and fresh.
> 
> We are doing latter here.

Yeah, like I said in the internal email, this will make rebasing harder
on my side, because this breaks a patch from Alexey and then breaks a
higher order zspages patch series. It's a very old series and we already
have quite a few patches depending on it.
  
Sergey Senozhatsky Nov. 4, 2022, 7:12 a.m. UTC | #6
On (22/11/04 12:48), Sergey Senozhatsky wrote:
> > read_block_state
> >     zram_algo_idx(zram, index) > 0 ? 'r' : '.');
> > 
> > zram_read_from_zpool
> >     if (zram_algo_idx(zram, idx) != 0)
> >         idx = 1;
> 
> As an idea, maybe we can store everything re-compression related
> in a dedicated meta field? SKIP flag, algorithm ID, etc.

That's just an idea.

Something like this

---
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index bdfc9bf0bdd5..c011d0f145f6 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -49,8 +49,6 @@ enum zram_pageflags {
        ZRAM_UNDER_WB,  /* page is under writeback */
        ZRAM_HUGE,      /* Incompressible page */
        ZRAM_IDLE,      /* not accessed page since last idle marking */
-       ZRAM_RECOMP,    /* page was recompressed */
-       ZRAM_RECOMP_SKIP, /* secondary algorithm cannot compress this page */
 
        __NR_ZRAM_PAGEFLAGS,
 };
@@ -64,6 +62,11 @@ struct zram_table_entry {
                unsigned long element;
        };
        unsigned long flags;
+#ifdef CONFIG_ZRAM_MULTI_COMP
+       unsigned int incompressible:1;
+       unsigned int priority:2;
+#endif
+
 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
        ktime_t ac_time;
 #endif
---

The reason I'm thinking about it is that we have flags bits that are
used only when particular .config options are enabled. Without those
options we just waste bits.

Recompression is one such thing. Another one is writeback.

---
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index c011d0f145f6..7076ec209e79 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -45,8 +45,6 @@ enum zram_pageflags {
        /* zram slot is locked */
        ZRAM_LOCK = ZRAM_FLAG_SHIFT,
        ZRAM_SAME,      /* Page consists the same element */
-       ZRAM_WB,        /* page is stored on backing_device */
-       ZRAM_UNDER_WB,  /* page is under writeback */
        ZRAM_HUGE,      /* Incompressible page */
        ZRAM_IDLE,      /* not accessed page since last idle marking */
 
@@ -68,6 +66,8 @@ struct zram_table_entry {
 #endif
 
 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
+       unsigned int under_writeback:1;
+       unsigned int written_back:1;
        ktime_t ac_time;
 #endif
 };
---

So we can use ->flags only for things that are independent of .config
  
Sergey Senozhatsky Nov. 4, 2022, 7:53 a.m. UTC | #7
On (22/11/03 10:00), Minchan Kim wrote:
> zram->table[index].flags
> 
> If we squeeze the algorithm index, we could work like this
> without ZRAM_RECOMP_SKIP.

Something like this?

Allocate two ->flags bits for priority and one bit for RECOMP_SKIP.
Two priority bits let us have 3 alternative algorithms (01 10 11)
plus one default (00). So 4 in total.

---
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 12f03745baf9..af0ff58087ca 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -127,6 +127,19 @@ static size_t zram_get_obj_size(struct zram *zram, u32 index)
        return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1);
 }
 
+static inline void zram_set_priority(struct zram *zram, u32 index, u32 prio)
+{
+       prio &= ZRAM_COMP_PRIORITY_MASK;
+       zram->table[index].flags &= (prio << ZRAM_COMP_PRIORITY_1);
+}
+
+static inline u32 zram_get_priority(struct zram *zram, u32 index)
+{
+       u32 prio = zram->table[index].flags;
+
+       return prio & ZRAM_COMP_PRIORITY_MASK;
+}
+
 static void zram_set_obj_size(struct zram *zram,
                                        u32 index, size_t size)
 {
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index bdfc9bf0bdd5..33e52c5a9a90 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -40,6 +40,8 @@
  */
 #define ZRAM_FLAG_SHIFT (PAGE_SHIFT + 1)
 
+#define ZRAM_COMP_PRIORITY_MASK  0x3
+
 /* Flags for zram pages (table[page_no].flags) */
 enum zram_pageflags {
        /* zram slot is locked */
@@ -49,8 +51,9 @@ enum zram_pageflags {
        ZRAM_UNDER_WB,  /* page is under writeback */
        ZRAM_HUGE,      /* Incompressible page */
        ZRAM_IDLE,      /* not accessed page since last idle marking */
-       ZRAM_RECOMP,    /* page was recompressed */
        ZRAM_RECOMP_SKIP, /* secondary algorithm cannot compress this page */
+       ZRAM_COMP_PRIORITY_1,
+       ZRAM_COMP_PRIORITY_2,
 
        __NR_ZRAM_PAGEFLAGS,
 };
  
Sergey Senozhatsky Nov. 4, 2022, 8:08 a.m. UTC | #8
On (22/11/04 16:53), Sergey Senozhatsky wrote:
[..]
> +static inline void zram_set_priority(struct zram *zram, u32 index, u32 prio)
> +{
> +       prio &= ZRAM_COMP_PRIORITY_MASK;
> +       zram->table[index].flags &= (prio << ZRAM_COMP_PRIORITY_1);
> +}

Uh... Something like this, sorry.

+static inline void zram_set_priority(struct zram *zram, u32 index, u32 prio)
+{
+       prio &= ZRAM_RECOMP_PRIO_MASK;
+       zram->table[index].flags &= ~(ZRAM_RECOMP_PRIO_MASK <<
+                                    ZRAM_RECOMP_PRIORITY_1);
+       zram->table[index].flags |= (prio << ZRAM_RECOMP_PRIORITY_1);
+}
  
Minchan Kim Nov. 4, 2022, 5:27 p.m. UTC | #9
On Fri, Nov 04, 2022 at 12:48:42PM +0900, Sergey Senozhatsky wrote:
> On (22/11/03 10:00), Minchan Kim wrote:
> [..]
> > > Per-my understanding this threshold can change quite often,
> > > depending on memory pressure and so on. So we may force
> > > user-space to issues more syscalls, without any gain in
> > > simplicity.
> > 
> > Sorry, didn't understand your point. Let me clarify my idea.
> > If we have separate knob for recompress thresh hold, we could
> > work like this.
> > 
> > # recompress any compressed pages which is greater than 888 bytes.
> > echo 888 > /sys/block/zram0/recompress_threshold
> > 
> > # try to compress any pages greather than threshold with following
> > # algorithm.
> > 
> > echo "type=lzo priority=1" > /sys/block/zram0/recompress_algo
> > echo "type=zstd priority=2" > /sys/block/zram0/recompress_algo
> > echo "type=deflate priority=3" > /sys/block/zram0/recompress_algo
> 
> OK. We can always add more sysfs knobs and make threshold a global
> per-device value.
> 
> I think I prefer the approach when threshold is part of the current
> recompress context, not something derived form another context. That
> is, when all values (page type, threshold, possibly algorithm index)
> are submitted by user-space for this particular recompression
> 
> 	echo "type=huge threshold=3000 ..." > recompress
> 
> If threshold is a global value that is applied to all recompress calls
> then how does user-space say no-threshold? For instance, when it wants
> to recompress only huge pages. It probably still needs to supply something
> like threshold=0. So my personal preference for now - keep threshold
> as a context dependent value.
> 
> Another thing that I like about threshold= being context dependent
> is that then we don't need to protect recompression against concurrent
> global threshold modifications with lock and so on. It keeps things
> simpler.

Sure. Let's go with per-algo threshold.

> 
> [..]
> > > > Let's squeeze the comp algo index into meta area since we have
> > > > some rooms for the bits. Then can we could remove the specific
> > > > recomp two flags?
> > > 
> > > What is meta area?
> > 
> > zram->table[index].flags
> > 
> > If we squeeze the algorithm index, we could work like this
> > without ZRAM_RECOMP_SKIP.
> 
> We still need ZRAM_RECOMP_SKIP. Recompression may fail to compress
> object further: sometimes we can get recompressed object that is larger
> than the original one, sometimes of the same size, sometimes of a smaller
> size but still belonging to the same size class, which doesn't save us
> any memory. Without ZRAM_RECOMP_SKIP we will continue re-compressing

Indeed.

> objects that are in-compressible (in a way that saves us memory in
> zsmalloc) by any of the ZRAM's algorithms.
> 
> > read_block_state
> >     zram_algo_idx(zram, index) > 0 ? 'r' : '.');
> > 
> > zram_read_from_zpool
> >     if (zram_algo_idx(zram, idx) != 0)
> >         idx = 1;
> 
> As an idea, maybe we can store everything re-compression related
> in a dedicated meta field? SKIP flag, algorithm ID, etc.
> 
> We don't have too many bits left in ->flags on 32-bit systems. We
> currently probably need at least 3 bits - one for RECOMP_SKIP and at
> least 2 for algorithm ID. 2 bits for algorithm ID put us into situation
> that we can have only 00, 01, 10, 11 as IDs, that is maximum 3 recompress
> algorithms: 00 is the primary one and the rest are alternative ones.
> Maximum 3 re-compression algorithms sounds like a reasonable max value to
> me. Yeah, maybe we can use flags bits for it.

If possible, let's go with those three bits in the flags, since we could
factor them out into a dedicated field anytime later, as it's not ABI.
  
Minchan Kim Nov. 4, 2022, 5:47 p.m. UTC | #10
On Fri, Nov 04, 2022 at 04:53:13PM +0900, Sergey Senozhatsky wrote:
> On (22/11/03 10:00), Minchan Kim wrote:
> > zram->table[index].flags
> > 
> > If we squeeze the algorithm index, we could work like this
> > without ZRAM_RECOMP_SKIP.
> 
> Something like this?
> 
> Allocate two ->flags bits for priority and one bit for RECOMP_SKIP.
> Two priority bits let us to have 3 alternative algorithms (01 10 11)
> plus one default (00). So 4 in total.

Looks good!
  
Minchan Kim Nov. 4, 2022, 5:53 p.m. UTC | #11
On Fri, Nov 04, 2022 at 04:12:13PM +0900, Sergey Senozhatsky wrote:
> On (22/11/04 12:48), Sergey Senozhatsky wrote:
> > > read_block_state
> > >     zram_algo_idx(zram, index) > 0 ? 'r' : '.');
> > > 
> > > zram_read_from_zpool
> > >     if (zram_algo_idx(zram, idx) != 0)
> > >         idx = 1;
> > 
> > As an idea, maybe we can store everything re-compression related
> > in a dedicated meta field? SKIP flag, algorithm ID, etc.
> 
> That's just an idea.
> 
> Something like this
> 
> ---
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index bdfc9bf0bdd5..c011d0f145f6 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -49,8 +49,6 @@ enum zram_pageflags {
>         ZRAM_UNDER_WB,  /* page is under writeback */
>         ZRAM_HUGE,      /* Incompressible page */
>         ZRAM_IDLE,      /* not accessed page since last idle marking */
> -       ZRAM_RECOMP,    /* page was recompressed */
> -       ZRAM_RECOMP_SKIP, /* secondary algorithm cannot compress this page */
>  
>         __NR_ZRAM_PAGEFLAGS,
>  };
> @@ -64,6 +62,11 @@ struct zram_table_entry {
>                 unsigned long element;
>         };
>         unsigned long flags;
> +#ifdef CONFIG_ZRAM_MULTI_COMP
> +       unsigned int incompressible:1;
> +       unsigned int priority:2;
> +#endif
> +
>  #ifdef CONFIG_ZRAM_MEMORY_TRACKING
>         ktime_t ac_time;
>  #endif
> ---
> 
> The reason I'm thinking about it is that we have flags bits that are
> used only when particular .config options are enabled. Without those
> options we just waste bits.
> 
> Recompression is one such thing. Another one is writeback.
> 
> ---
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index c011d0f145f6..7076ec209e79 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -45,8 +45,6 @@ enum zram_pageflags {
>         /* zram slot is locked */
>         ZRAM_LOCK = ZRAM_FLAG_SHIFT,
>         ZRAM_SAME,      /* Page consists the same element */
> -       ZRAM_WB,        /* page is stored on backing_device */
> -       ZRAM_UNDER_WB,  /* page is under writeback */
>         ZRAM_HUGE,      /* Incompressible page */
>         ZRAM_IDLE,      /* not accessed page since last idle marking */
>  
> @@ -68,6 +66,8 @@ struct zram_table_entry {
>  #endif
>  
>  #ifdef CONFIG_ZRAM_MEMORY_TRACKING
> +       unsigned int under_writeback:1;
> +       unsigned int written_back:1;
>         ktime_t ac_time;
>  #endif
>  };
> ---
> 
> So we can use ->flags only for things that are independent of .config

Couldn't we move them into a dedicated field (introducing one more field
of overhead in the meta area) later, once we run out of the spare bits in
the flags? So I'd like to squeeze the bits into the flags.
  
Sergey Senozhatsky Nov. 4, 2022, 11:22 p.m. UTC | #12
On (22/11/04 10:27), Minchan Kim wrote:
> > objects that are in-compressible (in a way that saves us memory in
> > zsmalloc) by any of the ZRAM's algorithms.
> > 
> > > read_block_state
> > >     zram_algo_idx(zram, index) > 0 ? 'r' : '.');
> > > 
> > > zram_read_from_zpool
> > >     if (zram_algo_idx(zram, idx) != 0)
> > >         idx = 1;
> > 
> > As an idea, maybe we can store everything re-compression related
> > in a dedicated meta field? SKIP flag, algorithm ID, etc.
> > 
> > We don't have too many bits left in ->flags on 32-bit systems. We
> > currently probably need at least 3 bits - one for RECOMP_SKIP and at
> > least 2 for algorithm ID. 2 bits for algorithm ID put us into situation
> > that we can have only 00, 01, 10, 11 as IDs, that is maximum 3 recompress
> > algorithms: 00 is the primary one and the rest are alternative ones.
> > Maximum 3 re-compression algorithms sounds like a reasonable max value to
> > me. Yeah, maybe we can use flags bits for it.
> 
> If possbile, let's go with those three bits into flags since we could
> factor them out into dedicated field, anytime later since it's not ABI.

Ack.
  

Patch

diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index d4100b0c083e..3e00656a6f8a 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -78,3 +78,18 @@  config ZRAM_MEMORY_TRACKING
 	  /sys/kernel/debug/zram/zramX/block_state.
 
 	  See Documentation/admin-guide/blockdev/zram.rst for more information.
+
+config ZRAM_MULTI_COMP
+	bool "Enable multiple per-CPU compression streams"
+	depends on ZRAM
+	help
+	This will enable per-CPU multi-compression streams, so that ZRAM
+	can re-compress IDLE/huge pages, using a potentially slower but
+	more effective compression algorithm. Note that IDLE page support
+	requires ZRAM_MEMORY_TRACKING.
+
+          echo TIMEOUT > /sys/block/zramX/idle
+          echo SIZE > /sys/block/zramX/recompress
+
+	SIZE (in bytes) parameter sets the object size watermark: idle
+	objects that are of a smaller size will not get recompressed.
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 94c62d7ea818..da11560ecf70 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1282,6 +1282,12 @@  static void zram_free_page(struct zram *zram, size_t index)
 		atomic64_dec(&zram->stats.huge_pages);
 	}
 
+	if (zram_test_flag(zram, index, ZRAM_RECOMP))
+		zram_clear_flag(zram, index, ZRAM_RECOMP);
+
+	if (zram_test_flag(zram, index, ZRAM_RECOMP_SKIP))
+		zram_clear_flag(zram, index, ZRAM_RECOMP_SKIP);
+
 	if (zram_test_flag(zram, index, ZRAM_WB)) {
 		zram_clear_flag(zram, index, ZRAM_WB);
 		free_block_bdev(zram, zram_get_element(zram, index));
@@ -1343,6 +1349,7 @@  static int zram_read_from_zspool(struct zram *zram, struct page *page,
 	unsigned long handle;
 	unsigned int size;
 	void *src, *dst;
+	u32 idx;
 	int ret;
 
 	handle = zram_get_handle(zram, index);
@@ -1359,8 +1366,13 @@  static int zram_read_from_zspool(struct zram *zram, struct page *page,
 
 	size = zram_get_obj_size(zram, index);
 
-	if (size != PAGE_SIZE)
-		zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_ZCOMP]);
+	if (size != PAGE_SIZE) {
+		idx = ZRAM_PRIMARY_ZCOMP;
+		if (zram_test_flag(zram, index, ZRAM_RECOMP))
+			idx = ZRAM_SECONDARY_ZCOMP;
+
+		zstrm = zcomp_stream_get(zram->comps[idx]);
+	}
 
 	src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
 	if (size == PAGE_SIZE) {
@@ -1372,7 +1384,7 @@  static int zram_read_from_zspool(struct zram *zram, struct page *page,
 		dst = kmap_atomic(page);
 		ret = zcomp_decompress(zstrm, src, size, dst);
 		kunmap_atomic(dst);
-		zcomp_stream_put(zram->comps[ZRAM_PRIMARY_ZCOMP]);
+		zcomp_stream_put(zram->comps[idx]);
 	}
 	zs_unmap_object(zram->mem_pool, handle);
 	return ret;
@@ -1603,6 +1615,182 @@  static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
 	return ret;
 }
 
+#ifdef CONFIG_ZRAM_MULTI_COMP
+/*
+ * This function will decompress (unless it's ZRAM_HUGE) the page and then
+ * attempt to compress it using secondary compression algorithm (which is
+ * potentially more effective).
+ *
+ * Corresponding ZRAM slot should be locked.
+ */
+static int zram_recompress(struct zram *zram, u32 index, struct page *page,
+			   int size_watermark)
+{
+	unsigned long handle_prev;
+	unsigned long handle_next;
+	unsigned int comp_len_next;
+	unsigned int comp_len_prev;
+	struct zcomp_strm *zstrm;
+	void *src, *dst;
+	int ret;
+
+	handle_prev = zram_get_handle(zram, index);
+	if (!handle_prev)
+		return -EINVAL;
+
+	comp_len_prev = zram_get_obj_size(zram, index);
+	/*
+	 * Do not recompress objects that are already "small enough".
+	 */
+	if (comp_len_prev < size_watermark)
+		return 0;
+
+	ret = zram_read_from_zspool(zram, page, index);
+	if (ret)
+		return ret;
+
+	zstrm = zcomp_stream_get(zram->comps[ZRAM_SECONDARY_ZCOMP]);
+	src = kmap_atomic(page);
+	ret = zcomp_compress(zstrm, src, &comp_len_next);
+	kunmap_atomic(src);
+
+	/*
+	 * Either a compression error or we failed to compress the object
+	 * in a way that will save us memory. Mark the object so that we
+	 * don't attempt to re-compress it again (RECOMP_SKIP).
+	 */
+	if (comp_len_next >= huge_class_size ||
+	    comp_len_next >= comp_len_prev ||
+	    ret) {
+		zram_set_flag(zram, index, ZRAM_RECOMP_SKIP);
+		zram_clear_flag(zram, index, ZRAM_IDLE);
+		zcomp_stream_put(zram->comps[ZRAM_SECONDARY_ZCOMP]);
+		return ret;
+	}
+
+	/*
+	 * No direct reclaim (slow path) for handle allocation and no
+	 * re-compression attempt (unlike in __zram_bvec_write()) since
+	 * we already have stored that object in zsmalloc. If we cannot
+	 * alloc memory for recompressed object then we bail out and
+	 * simply keep the old (existing) object in zsmalloc.
+	 */
+	handle_next = zs_malloc(zram->mem_pool, comp_len_next,
+				__GFP_KSWAPD_RECLAIM |
+				__GFP_NOWARN |
+				__GFP_HIGHMEM |
+				__GFP_MOVABLE);
+	if (IS_ERR((void *)handle_next)) {
+		zcomp_stream_put(zram->comps[ZRAM_SECONDARY_ZCOMP]);
+		return PTR_ERR((void *)handle_next);
+	}
+
+	dst = zs_map_object(zram->mem_pool, handle_next, ZS_MM_WO);
+	memcpy(dst, zstrm->buffer, comp_len_next);
+	zcomp_stream_put(zram->comps[ZRAM_SECONDARY_ZCOMP]);
+
+	zs_unmap_object(zram->mem_pool, handle_next);
+
+	zram_free_page(zram, index);
+	zram_set_handle(zram, index, handle_next);
+	zram_set_obj_size(zram, index, comp_len_next);
+
+	zram_set_flag(zram, index, ZRAM_RECOMP);
+	atomic64_add(comp_len_next, &zram->stats.compr_data_size);
+	atomic64_inc(&zram->stats.pages_stored);
+
+	return 0;
+}
+
+#define RECOMPRESS_IDLE		(1 << 0)
+#define RECOMPRESS_HUGE		(1 << 1)
+
+static ssize_t recompress_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
+	unsigned long index;
+	struct page *page;
+	ssize_t ret;
+	int mode, size_watermark = 0;
+
+	if (sysfs_streq(buf, "idle")) {
+		mode = RECOMPRESS_IDLE;
+	} else if (sysfs_streq(buf, "huge")) {
+		mode = RECOMPRESS_HUGE;
+	} else if (sysfs_streq(buf, "huge_idle")) {
+		mode = RECOMPRESS_IDLE | RECOMPRESS_HUGE;
+	} else {
+		/*
+		 * We will re-compress only idle objects equal or greater
+		 * in size than watermark.
+		 */
+		ret = kstrtoint(buf, 10, &size_watermark);
+		if (ret)
+			return ret;
+		mode = RECOMPRESS_IDLE;
+	}
+
+	if (size_watermark > PAGE_SIZE)
+		return -EINVAL;
+
+	down_read(&zram->init_lock);
+	if (!init_done(zram)) {
+		ret = -EINVAL;
+		goto release_init_lock;
+	}
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		ret = -ENOMEM;
+		goto release_init_lock;
+	}
+
+	ret = len;
+	for (index = 0; index < nr_pages; index++) {
+		int err = 0;
+
+		zram_slot_lock(zram, index);
+
+		if (!zram_allocated(zram, index))
+			goto next;
+
+		if (mode & RECOMPRESS_IDLE &&
+		    !zram_test_flag(zram, index, ZRAM_IDLE))
+			goto next;
+
+		if (mode & RECOMPRESS_HUGE &&
+		    !zram_test_flag(zram, index, ZRAM_HUGE))
+			goto next;
+
+		if (zram_test_flag(zram, index, ZRAM_WB) ||
+		    zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
+		    zram_test_flag(zram, index, ZRAM_SAME) ||
+		    zram_test_flag(zram, index, ZRAM_RECOMP) ||
+		    zram_test_flag(zram, index, ZRAM_RECOMP_SKIP))
+			goto next;
+
+		err = zram_recompress(zram, index, page, size_watermark);
+next:
+		zram_slot_unlock(zram, index);
+		if (err) {
+			ret = err;
+			break;
+		}
+
+		cond_resched();
+	}
+
+	__free_page(page);
+
+release_init_lock:
+	up_read(&zram->init_lock);
+	return ret;
+}
+#endif
+
 /*
  * zram_bio_discard - handler on discard request
  * @index: physical block index in PAGE_SIZE units
@@ -1983,6 +2171,7 @@  static DEVICE_ATTR_RW(writeback_limit_enable);
 #endif
 #ifdef CONFIG_ZRAM_MULTI_COMP
 static DEVICE_ATTR_RW(recomp_algorithm);
+static DEVICE_ATTR_WO(recompress);
 #endif
 
 static struct attribute *zram_disk_attrs[] = {
@@ -2009,6 +2198,7 @@  static struct attribute *zram_disk_attrs[] = {
 	&dev_attr_debug_stat.attr,
 #ifdef CONFIG_ZRAM_MULTI_COMP
 	&dev_attr_recomp_algorithm.attr,
+	&dev_attr_recompress.attr,
 #endif
 	NULL,
 };
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 4044ddbb2326..09b9ceb5dfa3 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -49,6 +49,8 @@  enum zram_pageflags {
 	ZRAM_UNDER_WB,	/* page is under writeback */
 	ZRAM_HUGE,	/* Incompressible page */
 	ZRAM_IDLE,	/* not accessed page since last idle marking */
+	ZRAM_RECOMP,	/* page was recompressed */
+	ZRAM_RECOMP_SKIP, /* secondary algorithm cannot compress this page */
 
 	__NR_ZRAM_PAGEFLAGS,
 };