[v1,2/5] lib/bitmap: Introduce bitmap_scatter() and bitmap_gather() helpers

Message ID 20230926052007.3917389-3-andriy.shevchenko@linux.intel.com
State New
Headers
Series bitmap: get rid of bitmap_remap() and bitmap_biremap() uses |

Commit Message

Andy Shevchenko Sept. 26, 2023, 5:20 a.m. UTC
  These helpers are the optimized versions of the bitmap_remap()
where one of the bitmaps (source or destination) is of sequential bits.

See more in the kernel documentation of the helpers.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/bitmap.h |  9 ++++++
 lib/bitmap.c           | 70 ++++++++++++++++++++++++++++++++++++++++++
 lib/test_bitmap.c      | 23 ++++++++++++++
 3 files changed, 102 insertions(+)
  

Comments

Yury Norov Sept. 27, 2023, 12:25 a.m. UTC | #1
On Tue, Sep 26, 2023 at 08:20:04AM +0300, Andy Shevchenko wrote:
> These helpers are the optimized versions of the bitmap_remap()
> where one of the bitmaps (source or destination) is of sequential bits.

If so, can you add a test that makes sure that new API is consistent
with the old bitmap_remap? And also provide numbers how well are they
optimized, comparing to bitmap_remap.
 
> See more in the kernel documentation of the helpers.

I grepped the whole kernel, not only Documentation directory, and found
nothing...

> Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
> ---
>  include/linux/bitmap.h |  9 ++++++
>  lib/bitmap.c           | 70 ++++++++++++++++++++++++++++++++++++++++++
>  lib/test_bitmap.c      | 23 ++++++++++++++
>  3 files changed, 102 insertions(+)
> 
> diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
> index 1516ff979315..87013b9a7dd8 100644
> --- a/include/linux/bitmap.h
> +++ b/include/linux/bitmap.h
> @@ -60,6 +60,8 @@ struct device;
>   *  bitmap_shift_left(dst, src, n, nbits)       *dst = *src << n
>   *  bitmap_cut(dst, src, first, n, nbits)       Cut n bits from first, copy rest
>   *  bitmap_replace(dst, old, new, mask, nbits)  *dst = (*old & ~(*mask)) | (*new & *mask)
> + *  bitmap_scatter(dst, src, mask, nbits)	*dst = map(dense, sparse)(src)
> + *  bitmap_gather(dst, src, mask, nbits)	*dst = map(sparse, dense)(src)
>   *  bitmap_remap(dst, src, old, new, nbits)     *dst = map(old, new)(src)
>   *  bitmap_bitremap(oldbit, old, new, nbits)    newbit = map(old, new)(oldbit)
>   *  bitmap_onto(dst, orig, relmap, nbits)       *dst = orig relative to relmap
> @@ -208,6 +210,12 @@ int bitmap_parselist(const char *buf, unsigned long *maskp,
>  			int nmaskbits);
>  int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
>  			unsigned long *dst, int nbits);
> +
> +unsigned int bitmap_scatter(unsigned long *dst, const unsigned long *src,
> +		const unsigned long *mask, unsigned int nbits);
> +unsigned int bitmap_gather(unsigned long *dst, const unsigned long *src,
> +		const unsigned long *mask, unsigned int nbits);
> +
>  void bitmap_remap(unsigned long *dst, const unsigned long *src,
>  		const unsigned long *old, const unsigned long *new, unsigned int nbits);
>  int bitmap_bitremap(int oldbit,
> @@ -216,6 +224,7 @@ void bitmap_onto(unsigned long *dst, const unsigned long *orig,
>  		const unsigned long *relmap, unsigned int bits);
>  void bitmap_fold(unsigned long *dst, const unsigned long *orig,
>  		unsigned int sz, unsigned int nbits);
> +
>  int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
>  void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
>  int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
> diff --git a/lib/bitmap.c b/lib/bitmap.c
> index 935e0f96e785..31cfc7846aae 100644
> --- a/lib/bitmap.c
> +++ b/lib/bitmap.c
> @@ -942,6 +942,76 @@ int bitmap_parse(const char *start, unsigned int buflen,
>  }
>  EXPORT_SYMBOL(bitmap_parse);
>  
> +/**
> + * bitmap_scatter - Scatter a bitmap according to the given mask
> + * @dst: scattered bitmap
> + * @src: gathered bitmap
> + * @mask: bits to assign to in the scattered bitmap
> + * @nbits: number of bits in each of these bitmaps
> + *
> + * Scatters bitmap with sequential bits according to the given @mask.
> + *
> + * Example:
> + * If @src bitmap = 0x005a, with @mask = 0x1313, @dst will be 0x0302.
> + *
> + * Or in binary form
> + * @src			@mask			@dst
> + * 0000000001011010	0001001100010011	0000001100000010
> + *
> + * (Bits 0, 1, 2, 3, 4, 5 are copied to the bits 0, 1, 4, 8, 9, 12)
> + *
> + * Returns: the weight of the @mask.

Returning a weight of the mask is somewhat non-trivial... To me it
would be logical to return a weight of destination, for example...

But I see that in the following patch you're using the returned value.
Maybe add a few words to advocate that?

> + */
> +unsigned int bitmap_scatter(unsigned long *dst, const unsigned long *src,
> +			    const unsigned long *mask, unsigned int nbits)
> +{
> +	unsigned int bit;
> +	int n = 0;

Is n signed for purpose? I think it should be consistent with
return value.

> +
> +	bitmap_zero(dst, nbits);
> +
> +	for_each_set_bit(bit, mask, nbits)
> +		__assign_bit(bit, dst, test_bit(n++, src));
> +
> +	return n;
> +}
> +EXPORT_SYMBOL(bitmap_scatter);
> +
> +/**
> + * bitmap_gather - Gather a bitmap according to given mask
> + * @dst: gathered bitmap
> + * @src: scattered bitmap
> + * @mask: bits to extract from in the scattered bitmap
> + * @nbits: number of bits in each of these bitmaps
> + *
> + * Gathers bitmap with sparse bits according to the given @mask.
> + *
> + * Example:
> + * If @src bitmap = 0x0302, with @mask = 0x1313, @dst will be 0x001a.

Not sure about others, but to me hex representation is quite useless,
moreover it's followed by binary one.

> + * Or in binary form
> + * @src			@mask			@dst
> + * 0000001100000010	0001001100010011	0000000000011010
> + *
> + * (Bits 0, 1, 4, 8, 9, 12 are copied to the bits 0, 1, 2, 3, 4, 5)
> + *
> + * Returns: the weight of the @mask.
> + */

It looks like those are designed complement to each other. Is that
true? If so, can you make your example showing that
        scatter -> gather -> scatter
would restore the original bitmap?

If I'm wrong, can you please underline that they are not complement,
and why?

> +unsigned int bitmap_gather(unsigned long *dst, const unsigned long *src,
> +			   const unsigned long *mask, unsigned int nbits)
> +{
> +	unsigned int bit;
> +	int n = 0;
> +
> +	bitmap_zero(dst, nbits);
> +
> +	for_each_set_bit(bit, mask, nbits)
> +		__assign_bit(n++, dst, test_bit(bit, src));
> +
> +	return n;
> +}
> +EXPORT_SYMBOL(bitmap_gather);

I feel like they should reside in header, because they are quite a small
functions indeed, and they would benefit from compile-time optimizations
without bloating the kernel.

Moreover, you are using them in patch #3 on 64-bit bitmaps, which
would benefit from small_const_nbits() optimization.

> +
>  /**
>   * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap
>   *	@buf: pointer to a bitmap
> diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
> index 1f2dc7fef17f..f43a07679998 100644
> --- a/lib/test_bitmap.c
> +++ b/lib/test_bitmap.c
> @@ -50,6 +50,9 @@ static const unsigned long exp2[] __initconst = {
>  static const unsigned long exp2_to_exp3_mask[] __initconst = {
>  	BITMAP_FROM_U64(0x008000020020212eULL),
>  };
> +static const unsigned long exp2_to_exp3_maskg[] __initconst = {
> +	BITMAP_FROM_U64(0x00000000000001ffULL),
> +};
>  /* exp3_0_1 = (exp2[0] & ~exp2_to_exp3_mask) | (exp2[1] & exp2_to_exp3_mask) */
>  static const unsigned long exp3_0_1[] __initconst = {
>  	BITMAP_FROM_U64(0x33b3333311313137ULL),
> @@ -357,6 +360,25 @@ static void __init test_replace(void)
>  	expect_eq_bitmap(bmap, exp3_1_0, nbits);
>  }
>  
> +static void __init test_bitmap_sg(void)
> +{
> +	unsigned int nbits = 64;
> +	DECLARE_BITMAP(bmap, 1024);

Can you make it 1000? That way we'll test non-aligned case.

> +	unsigned int w;
> +
> +	bitmap_zero(bmap, 1024);
> +	w = bitmap_gather(bmap, exp2_to_exp3_mask, exp2_to_exp3_mask, nbits);
> +	expect_eq_uint(bitmap_weight(exp2_to_exp3_mask, nbits), w);
> +	expect_eq_uint(bitmap_weight(bmap, 1024), w);
> +	expect_eq_bitmap(bmap, exp2_to_exp3_maskg, nbits);
> +
> +	bitmap_zero(bmap, 1024);
> +	w = bitmap_scatter(bmap, exp2_to_exp3_maskg, exp2_to_exp3_mask, nbits);
> +	expect_eq_uint(bitmap_weight(exp2_to_exp3_maskg, nbits), w);
> +	expect_eq_uint(bitmap_weight(bmap, 1024), w);
> +	expect_eq_bitmap(bmap, exp2_to_exp3_mask, nbits);

Would be interesting to compare bitmap scatter/gather performance
against bitmap_remap.

> +}
> +
>  #define PARSE_TIME	0x1
>  #define NO_LEN		0x2
>  
> @@ -1228,6 +1250,7 @@ static void __init selftest(void)
>  	test_fill_set();
>  	test_copy();
>  	test_replace();
> +	test_bitmap_sg();
>  	test_bitmap_arr32();
>  	test_bitmap_arr64();
>  	test_bitmap_parse();
> -- 
> 2.40.0.1.gaa8946217a0b
  
Yury Norov Sept. 27, 2023, 2:10 a.m. UTC | #2
> > +unsigned int bitmap_gather(unsigned long *dst, const unsigned long *src,
> > +			   const unsigned long *mask, unsigned int nbits)
> > +{
> > +	unsigned int bit;
> > +	int n = 0;
> > +
> > +	bitmap_zero(dst, nbits);
> > +
> > +	for_each_set_bit(bit, mask, nbits)
> > +		__assign_bit(n++, dst, test_bit(bit, src));
> > +
> > +	return n;
> > +}
> > +EXPORT_SYMBOL(bitmap_gather);

So, if mask is 0b01, and src is 0b10, the output will be 0b00.
To me it sounds like you've gathered nothing, while the intention
was to gather all source bits to bit #0. This is my understanding
of the word 'gather', and this is how bitmap_remap() works.

bitmap_remap() handles it by wrapping around 0:
        set_bit(find_nth_bit(new, nbits, n % w), dst);

In your case, it may look like:
        n = off = 0;
        while (1) {
                off += n;
                n = 0;
        	for_each_set_bit(bit, mask, nbits) {
                        if (bit + off >= nbits)
                                return;
        		__assign_bit(n++, dst, test_bit(bit + off, src));
                }
        }

(Not tested, except that on piece of paper.)

If you claim you're replacing bitmap_remap(), you should correctly handle
the above case; when src == dst; when mask is empty, and probably more...

Thanks,
Yury
  
Andy Shevchenko Sept. 27, 2023, 12:02 p.m. UTC | #3
On Tue, Sep 26, 2023 at 05:25:13PM -0700, Yury Norov wrote:
> On Tue, Sep 26, 2023 at 08:20:04AM +0300, Andy Shevchenko wrote:
> > These helpers are the optimized versions of the bitmap_remap()
> > where one of the bitmaps (source or destination) is of sequential bits.
> 
> If so, can you add a test that makes sure that new API is consistent
> with the old bitmap_remap? And also provide numbers how well are they
> optimized, comparing to bitmap_remap.

It's impossible. bitmap_remap() is universal, these APIs only for the specific
domain.

> > See more in the kernel documentation of the helpers.
> 
> I grepped the whole kernel, not only Documentation directory, and found
> nothing...

It's added in this patch in the format of kernel doc.

...

> > + * Returns: the weight of the @mask.
> 
> Returning a weight of the mask is somewhat non-trivial... To me it
> would be logical to return a weight of destination, for example...

> But I see that in the following patch you're using the returned value.
> Maybe add a few words to advocate that?


I'll look into it again, maybe dst would work as well, I don't remember why
I have chosen mask. Maybe because it's invariant here, dunno.

...

> > +	int n = 0;
> 
> Is n signed for purpose? I think it should be consistent with
> return value.

OK. No purpose there. Perhaps it's a leftover from the first experiments
on the implementation of these APIs.

...

> > + * Example:
> > + * If @src bitmap = 0x0302, with @mask = 0x1313, @dst will be 0x001a.
> 
> Not sure about others, but to me hex representation is quite useless,
> moreover it's followed by binary one.

Somebody is better at hex, somebody at binary one, I would leave both.

> > + * Or in binary form
> > + * @src			@mask			@dst
> > + * 0000001100000010	0001001100010011	0000000000011010
> > + *
> > + * (Bits 0, 1, 4, 8, 9, 12 are copied to the bits 0, 1, 2, 3, 4, 5)
> > + *
> > + * Returns: the weight of the @mask.
> > + */
> 
> It looks like those are designed complement to each other. Is that
> true? If so, can you make your example showing that
>         scatter -> gather -> scatter
> would restore the original bitmap?

It looks like you stopped reading documentation somewhere on the middle.
The two APIs are documented with the same example which makes it clear
that they are data-loss transformations.

Do you need something like this to be added (in both documentations):

  The bitmap_scatter(), when executed over the @dst bitmap, will
  restore the @src one if the @mask is kept the same, see the example
  in the function description.

?

> If I'm wrong, can you please underline that they are not complement,
> and why?

No, you are not.

...

> I feel like they should reside in header, because they are quite a small
> functions indeed, and they would benefit from compile-time optimizations
> without bloating the kernel.
> 
> Moreover, you are using them in patch #3 on 64-bit bitmaps, which
> would benefit from small_const_nbits() optimization.

I can move them into header.

...

> > +	DECLARE_BITMAP(bmap, 1024);

> Can you make it 1000? That way we'll test non-aligned case.

Sure. But it's not related to the patch. It will test bitmap_weight() and not
the new APIs, so, whatever is bigger 64 will suffice the purpose and won't
anyhow affect the newly added APIs.

...

> Would be interesting to compare bitmap scatter/gather performance
> against bitmap_remap.

Do you have a code in mind? I can incorporate it.

Again, you should understand that it's only applicable to the certain cases,
otherwise it makes no sense (it's like comparing performance of ffs() on
a single word against find_first_bit() on the arbitrary amount of words).
  
Andy Shevchenko Sept. 27, 2023, 12:10 p.m. UTC | #4
On Tue, Sep 26, 2023 at 07:10:33PM -0700, Yury Norov wrote:

...

> So, if mask is 0b01, and src is 0b10, the output will be 0b00.

Correct. This how it must work.

> To me it sounds like you've gathered nothing, while the intention
> was to gather all source bits to bit #0.

No, the idea is to gather bits positions of which are provided by a mask
from source to a destination, where the positions are sequential.

> This is my understanding
> of the word 'gather',

You should get the mask meaning. It's not the bit provider, it's a bit
positions provider.

>	and this is how bitmap_remap() works.

It's NOT a replacement of bitmap_remap(). It's specifically written in the
commit message that domain of these APIs is when @old or @new (in case of
bitmap_remap() API) equals 2^n - 1, where n is amount of bits we consider.

...

> If you claim you're replacing bitmap_remap(),
> you should correctly handle
> the above case; when src == dst; when mask is empty, and probably more...

I don't care about corner cases of bitmap_remap(), and we can solve the issue
when it comes. Currently there is no issue with the all users that need these
API as they use different addresses.
  
Yury Norov Oct. 2, 2023, 4:06 a.m. UTC | #5
On Wed, Sep 27, 2023 at 03:02:34PM +0300, Andy Shevchenko wrote:

[...]

> > It looks like those are designed complement to each other. Is that
> > true? If so, can you make your example showing that
> >         scatter -> gather -> scatter
> > would restore the original bitmap?
> 
> It looks like you stopped reading documentation somewhere on the middle.

What a wonderful week of strong statements... Whatever...

> The two APIs are documented with the same example which makes it clear
> that they are data-loss transformations.
> 
> Do you need something like this to be added (in both documentations):
> 
>   The bitmap_scatter(), when executed over the @dst bitmap, will
>   restore the @src one if the @mask is kept the same, see the example
>   in the function description.
> 
> ?
> 
> > If I'm wrong, can you please underline that they are not complement,
> > and why?
> 
> No, you are not.

I should be confused even more. You're saying that I'm not wrong here, and few
lines above you're saying it's a data loss...

I don't mind this new 3-liners, but I'd like you to have a well better
wording and testing around them because those bitmap_scatter/gather are
all about performance, readability and usability.

To begin with, the whole name of the series: "get rid of bitmap_remap() and
bitmap_biremap() uses" is wrong because the functions are still here, and are
still used.

Even worse, instead of getting rid of some useless code, you're
bloating the kernel with something that duplicates existing
functionality.

This is an example of a series that 'gets rid of' something for true:

https://yhbt.net/lore/all/20230925023817.782509-7-yury.norov@gmail.com/T/

(And unfortunately it's still unreviewed.)

But I understand your motivation, and as I already said, I like this
series in general. So let's please figure out a better wording before
moving forward?

Below are some my of thought.

1. Stop saying that you're getting rid of something when you're not.
   I'd suggest something like: "add simple and verbose alternatives to
   bitmap_remap(), and use them where appropriate".

2. Don't blame people considering a parameter named 'mask' as a mask.
   I mean this sentence: 

   > You should get the mask meaning. It's not the bit provider, it's a bit
   > positions provider.

   If it's a 'bit position provider', please give it a proper name,
   for example 'pos'. I'd suggest something like:
        unsigned long bitmap_scatter(unsigned long *dst, unsigned long *pos,
                                                          unsigned long *val)

3. If you're saying that new API is a simplified replacement for
   something, I'd like to see the full contract, i.e. explicit list of all
   simplifications and limitations implied:
   - val == dst is not handled;
   - when 'pos' is empty, val is not copied to dst;
   - new API doesn't wrap around 0, like bitmap_remap() does;
   - set bits in 'val' are not copied to 'dst' when not in 'pos' (?)'
   - anything else else?

4. Similarly to previous item, I'd like to have explicit understanding
   and examples where and how bitmap_remap may be replaced. You're
   only saying that it is possible when either 'new' or 'old' are
   dense. Is that the only case? Can you add a test that explicitly
   checks that bitmap_remap and bitmap_scatter/gather produce the same
   output. Something like this:
        bitmap_remap(dst1, val, dense_map, pos, nbits);
        bitmap_scatter(dst2, val, pos, nbits);
        check_eq_bitmap(dst1, dst2, nbits);

5. Can you add a picture like this to explain the algorithm even more:

        mask:    ...v..vv..v...vv
        bits:    0000001100000010
        1.          ^  ^^  ^    0
        2.          |  ||  |   10
        3.          |  ||  +> 010
        4.          |  |+--> 1010
        5.          |  +--> 11010
        6.          +----> 011010
        gather:  ..........011010

5. Regarding my confusion, I had to draw the picture above to realise
   how it's possible that scatter/gather are inverse and data-loss
   (i.e. not inverse) at the same time. Can you explain it with a
   wording like this: "For bits selected by 'pos' bitmap, gathering a
   'val' bitmap with the following scattering restores the original map.
   All other bits values are lost and replaced with zeros." Similarly
   for gathering. And please add a test case.

6. Regarding performance. I think it's wrong to say that that your
   code is better optimized then some other, and then ask your
   reviewers to figure out how to measure the difference. If you make
   such statement, you should already have some test or real-life
   measurement.

   However, if you ask me, I can suggest you to pull this patch:
   https://lore.kernel.org/lkml/20230828184353.5145-4-yury.norov@gmail.com/

   and modify/extend it in a way that both bitmap_remap and
   bitmap_scatter/gather take the same 'val' and 'pos' bitmaps,
   produce the same output, and then see which code is faster.

   Worth to mention that since all current users of your API are working
   on 64-bit maps, performance is doubty an issue. So you can drop the
   'optimization' part of your wording, and don't add performance test.

Thanks,
Yury
  
Andy Shevchenko Oct. 2, 2023, 8:23 a.m. UTC | #6
On Sun, Oct 01, 2023 at 09:06:24PM -0700, Yury Norov wrote:
> On Wed, Sep 27, 2023 at 03:02:34PM +0300, Andy Shevchenko wrote:

[...]

> > > It looks like those are designed complement to each other. Is that
> > > true? If so, can you make your example showing that
> > >         scatter -> gather -> scatter
> > > would restore the original bitmap?
> > 
> > It looks like you stopped reading documentation somewhere on the middle.
> 
> What a wonderful week of strong statements... Whatever...
> 
> > The two APIs are documented with the same example which makes it clear
> > that they are data-loss transformations.

Oh, should be read as data-lossleess.

> > Do you need something like this to be added (in both documentations):
> > 
> >   The bitmap_scatter(), when executed over the @dst bitmap, will
> >   restore the @src one if the @mask is kept the same, see the example
> >   in the function description.
> > 
> > ?
> > 
> > > If I'm wrong, can you please underline that they are not complement,
> > > and why?
> > 
> > No, you are not.
> 
> I should be confused even more. You're saying that I'm not wrong here, and few
> lines above you're saying it's a data loss...

data-lossless, sorry, I was missing that.

> I don't mind this new 3-liners, but I'd like you to have a well better
> wording and testing around them because those bitmap_scatter/gather are
> all about performance, readability and usability.
> 
> To begin with, the whole name of the series: "get rid of bitmap_remap() and
> bitmap_biremap() uses" is wrong because the functions are still here, and are
> still used.
> 
> Even worse, instead of getting rid of some useless code, you're
> bloating the kernel with something that duplicates existing
> functionality.

Wasn't Rasmus clear about the intention:
1) get rid of active users of bitmap_remap() outside of NUMA;
2) move that APIs to be private for NUMA users exclusively?

Towards that direction is my series.

> But I understand your motivation, and as I already said, I like this
> series in general. So let's please figure out a better wording before
> moving forward?
> 
> Below are some my of thought.
> 
> 1. Stop saying that you're getting rid of something when you're not.
>    I'd suggest something like: "add simple and verbose alternatives to
>    bitmap_remap(), and use them where appropriate".

I'm getting rid of the users of the bitmap_remap() outside of NUMA.
Why should I stop telling that? I think I have to elaborate what
that means.

> 2. Don't blame people considering a parameter named 'mask' as a mask.
>    I mean this sentence: 
> 
>    > You should get the mask meaning. It's not the bit provider, it's a bit
>    > positions provider.
> 
>    If it's a 'bit position provider', please give it a proper name,
>    for example 'pos'. I'd suggest something like:
>         unsigned long bitmap_scatter(unsigned long *dst, unsigned long *pos,

It's not pos in the sense of what pos means. pos means a position of a single
bit, mask is about many bits. And mask meaning is still the same despite how
we call it, it's bit positions provider, i.e. bits that are set in the mask
will be considered to be worked on. It's mask, why should I rename it?

> 3. If you're saying that new API is a simplified replacement for
>    something, I'd like to see the full contract, i.e. explicit list of all
>    simplifications and limitations implied:
>    - val == dst is not handled;
>    - when 'pos' is empty, val is not copied to dst;
>    - new API doesn't wrap around 0, like bitmap_remap() does;
>    - set bits in 'val' are not copied to 'dst' when not in 'pos' (?)'
>    - anything else else?

I see, I should have not used word "simplification" at all.
I will reword to not mention bitmap_remap() _at all_.

> 4. Similarly to previous item, I'd like to have explicit understanding
>    and examples where and how bitmap_remap may be replaced. You're
>    only saying that it is possible when either 'new' or 'old' are
>    dense. Is that the only case? Can you add a test that explicitly
>    checks that bitmap_remap and bitmap_scatter/gather produce the same
>    output. Something like this:
>         bitmap_remap(dst1, val, dense_map, pos, nbits);
>         bitmap_scatter(dst2, val, pos, nbits);
>         check_eq_bitmap(dst1, dst2, nbits);

No, I won't do that. This is out of the scope as see comment on 3. above.

> 5. Can you add a picture like this to explain the algorithm even more:
> 
>         mask:    ...v..vv..v...vv
>         bits:    0000001100000010
>         1.          ^  ^^  ^    0
>         2.          |  ||  |   10
>         3.          |  ||  +> 010
>         4.          |  |+--> 1010
>         5.          |  +--> 11010
>         6.          +----> 011010
>         gather:  ..........011010

Why? Is it so complicated?

> 5. Regarding my confusion, I had to draw the picture above to realise
>    how it's possible that scatter/gather are inverse and data-loss
>    (i.e. not inverse) at the same time. Can you explain it with a
>    wording like this: "For bits selected by 'pos' bitmap, gathering a
>    'val' bitmap with the following scattering restores the original map.
>    All other bits values are lost and replaced with zeros." Similarly
>    for gathering. And please add a test case.

Let's not make a big deal out of a small typo. I admit that it confused people,
but that was neither in the documentation, nor in the code, just in our
discussion.

> 6. Regarding performance. I think it's wrong to say that that your
>    code is better optimized then some other, and then ask your
>    reviewers to figure out how to measure the difference. If you make
>    such statement, you should already have some test or real-life
>    measurement.

Same as for 4. above. I drop the comparison and mentioning of bitmap_remap().
It was a big mistake by me to even mentioned that.

>    However, if you ask me, I can suggest you to pull this patch:
>    https://lore.kernel.org/lkml/20230828184353.5145-4-yury.norov@gmail.com/
> 
>    and modify/extend it in a way that both bitmap_remap and
>    bitmap_scatter/gather take the same 'val' and 'pos' bitmaps,
>    produce the same output, and then see which code is faster.
> 
>    Worth to mention that since all current users of your API are working
>    on 64-bit maps, performance is doubty an issue. So you can drop the
>    'optimization' part of your wording, and don't add performance test.
  

Patch

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 1516ff979315..87013b9a7dd8 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -60,6 +60,8 @@  struct device;
  *  bitmap_shift_left(dst, src, n, nbits)       *dst = *src << n
  *  bitmap_cut(dst, src, first, n, nbits)       Cut n bits from first, copy rest
  *  bitmap_replace(dst, old, new, mask, nbits)  *dst = (*old & ~(*mask)) | (*new & *mask)
+ *  bitmap_scatter(dst, src, mask, nbits)	*dst = map(dense, sparse)(src)
+ *  bitmap_gather(dst, src, mask, nbits)	*dst = map(sparse, dense)(src)
  *  bitmap_remap(dst, src, old, new, nbits)     *dst = map(old, new)(src)
  *  bitmap_bitremap(oldbit, old, new, nbits)    newbit = map(old, new)(oldbit)
  *  bitmap_onto(dst, orig, relmap, nbits)       *dst = orig relative to relmap
@@ -208,6 +210,12 @@  int bitmap_parselist(const char *buf, unsigned long *maskp,
 			int nmaskbits);
 int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
 			unsigned long *dst, int nbits);
+
+unsigned int bitmap_scatter(unsigned long *dst, const unsigned long *src,
+		const unsigned long *mask, unsigned int nbits);
+unsigned int bitmap_gather(unsigned long *dst, const unsigned long *src,
+		const unsigned long *mask, unsigned int nbits);
+
 void bitmap_remap(unsigned long *dst, const unsigned long *src,
 		const unsigned long *old, const unsigned long *new, unsigned int nbits);
 int bitmap_bitremap(int oldbit,
@@ -216,6 +224,7 @@  void bitmap_onto(unsigned long *dst, const unsigned long *orig,
 		const unsigned long *relmap, unsigned int bits);
 void bitmap_fold(unsigned long *dst, const unsigned long *orig,
 		unsigned int sz, unsigned int nbits);
+
 int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
 void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
 int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 935e0f96e785..31cfc7846aae 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -942,6 +942,76 @@  int bitmap_parse(const char *start, unsigned int buflen,
 }
 EXPORT_SYMBOL(bitmap_parse);
 
+/**
+ * bitmap_scatter - Scatter a bitmap according to the given mask
+ * @dst: scattered bitmap
+ * @src: gathered bitmap
+ * @mask: bits to assign to in the scattered bitmap
+ * @nbits: number of bits in each of these bitmaps
+ *
+ * Scatters bitmap with sequential bits according to the given @mask.
+ *
+ * Example:
+ * If @src bitmap = 0x005a, with @mask = 0x1313, @dst will be 0x0302.
+ *
+ * Or in binary form
+ * @src			@mask			@dst
+ * 0000000001011010	0001001100010011	0000001100000010
+ *
+ * (Bits 0, 1, 2, 3, 4, 5 are copied to the bits 0, 1, 4, 8, 9, 12)
+ *
+ * Returns: the weight of the @mask.
+ */
+unsigned int bitmap_scatter(unsigned long *dst, const unsigned long *src,
+			    const unsigned long *mask, unsigned int nbits)
+{
+	unsigned int bit;
+	int n = 0;
+
+	bitmap_zero(dst, nbits);
+
+	for_each_set_bit(bit, mask, nbits)
+		__assign_bit(bit, dst, test_bit(n++, src));
+
+	return n;
+}
+EXPORT_SYMBOL(bitmap_scatter);
+
+/**
+ * bitmap_gather - Gather a bitmap according to given mask
+ * @dst: gathered bitmap
+ * @src: scattered bitmap
+ * @mask: bits to extract from in the scattered bitmap
+ * @nbits: number of bits in each of these bitmaps
+ *
+ * Gathers bitmap with sparse bits according to the given @mask.
+ *
+ * Example:
+ * If @src bitmap = 0x0302, with @mask = 0x1313, @dst will be 0x001a.
+ *
+ * Or in binary form
+ * @src			@mask			@dst
+ * 0000001100000010	0001001100010011	0000000000011010
+ *
+ * (Bits 0, 1, 4, 8, 9, 12 are copied to the bits 0, 1, 2, 3, 4, 5)
+ *
+ * Returns: the weight of the @mask.
+ */
+unsigned int bitmap_gather(unsigned long *dst, const unsigned long *src,
+			   const unsigned long *mask, unsigned int nbits)
+{
+	unsigned int bit;
+	int n = 0;
+
+	bitmap_zero(dst, nbits);
+
+	for_each_set_bit(bit, mask, nbits)
+		__assign_bit(n++, dst, test_bit(bit, src));
+
+	return n;
+}
+EXPORT_SYMBOL(bitmap_gather);
+
 /**
  * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap
  *	@buf: pointer to a bitmap
diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 1f2dc7fef17f..f43a07679998 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -50,6 +50,9 @@  static const unsigned long exp2[] __initconst = {
 static const unsigned long exp2_to_exp3_mask[] __initconst = {
 	BITMAP_FROM_U64(0x008000020020212eULL),
 };
+static const unsigned long exp2_to_exp3_maskg[] __initconst = {
+	BITMAP_FROM_U64(0x00000000000001ffULL),
+};
 /* exp3_0_1 = (exp2[0] & ~exp2_to_exp3_mask) | (exp2[1] & exp2_to_exp3_mask) */
 static const unsigned long exp3_0_1[] __initconst = {
 	BITMAP_FROM_U64(0x33b3333311313137ULL),
@@ -357,6 +360,25 @@  static void __init test_replace(void)
 	expect_eq_bitmap(bmap, exp3_1_0, nbits);
 }
 
+static void __init test_bitmap_sg(void)
+{
+	unsigned int nbits = 64;
+	DECLARE_BITMAP(bmap, 1024);
+	unsigned int w;
+
+	bitmap_zero(bmap, 1024);
+	w = bitmap_gather(bmap, exp2_to_exp3_mask, exp2_to_exp3_mask, nbits);
+	expect_eq_uint(bitmap_weight(exp2_to_exp3_mask, nbits), w);
+	expect_eq_uint(bitmap_weight(bmap, 1024), w);
+	expect_eq_bitmap(bmap, exp2_to_exp3_maskg, nbits);
+
+	bitmap_zero(bmap, 1024);
+	w = bitmap_scatter(bmap, exp2_to_exp3_maskg, exp2_to_exp3_mask, nbits);
+	expect_eq_uint(bitmap_weight(exp2_to_exp3_maskg, nbits), w);
+	expect_eq_uint(bitmap_weight(bmap, 1024), w);
+	expect_eq_bitmap(bmap, exp2_to_exp3_mask, nbits);
+}
+
 #define PARSE_TIME	0x1
 #define NO_LEN		0x2
 
@@ -1228,6 +1250,7 @@  static void __init selftest(void)
 	test_fill_set();
 	test_copy();
 	test_replace();
+	test_bitmap_sg();
 	test_bitmap_arr32();
 	test_bitmap_arr64();
 	test_bitmap_parse();