net: page_pool: fix refcounting issues with fragmented allocation

Message ID 20230124124300.94886-1-nbd@nbd.name
State New
Headers
Series net: page_pool: fix refcounting issues with fragmented allocation |

Commit Message

Felix Fietkau Jan. 24, 2023, 12:43 p.m. UTC
  While testing fragmented page_pool allocation in the mt76 driver, I was able
to reliably trigger page refcount underflow issues, which did not occur with
full-page page_pool allocation.
It appears to me, that handling refcounting in two separate counters
(page->pp_frag_count and page refcount) is racy when page refcount gets
incremented by code dealing with skb fragments directly, and
page_pool_return_skb_page is called multiple times for the same fragment.

Dropping page->pp_frag_count and relying entirely on the page refcount makes
these underflow issues and crashes go away.

Cc: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
 include/linux/mm_types.h | 17 +++++------------
 include/net/page_pool.h  | 19 ++++---------------
 net/core/page_pool.c     | 12 ++++--------
 3 files changed, 13 insertions(+), 35 deletions(-)
  

Comments

Ilias Apalodimas Jan. 24, 2023, 2:11 p.m. UTC | #1
Hi Felix,

++cc Alexander and Yunsheng.

Thanks for the report

On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>
> While testing fragmented page_pool allocation in the mt76 driver, I was able
> to reliably trigger page refcount underflow issues, which did not occur with
> full-page page_pool allocation.
> It appears to me, that handling refcounting in two separate counters
> (page->pp_frag_count and page refcount) is racy when page refcount gets
> incremented by code dealing with skb fragments directly, and
> page_pool_return_skb_page is called multiple times for the same fragment.
>
> Dropping page->pp_frag_count and relying entirely on the page refcount makes
> these underflow issues and crashes go away.
>

This has been discussed here [1].  TL;DR changing this to page
refcount might blow up in other colorful ways.  Can we look closer and
figure out why the underflow happens?

[1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/

Thanks
/Ilias

> Cc: Lorenzo Bianconi <lorenzo@kernel.org>
> Signed-off-by: Felix Fietkau <nbd@nbd.name>
> ---
>  include/linux/mm_types.h | 17 +++++------------
>  include/net/page_pool.h  | 19 ++++---------------
>  net/core/page_pool.c     | 12 ++++--------
>  3 files changed, 13 insertions(+), 35 deletions(-)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 9757067c3053..96ec3b19a86d 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -125,18 +125,11 @@ struct page {
>                         struct page_pool *pp;
>                         unsigned long _pp_mapping_pad;
>                         unsigned long dma_addr;
> -                       union {
> -                               /**
> -                                * dma_addr_upper: might require a 64-bit
> -                                * value on 32-bit architectures.
> -                                */
> -                               unsigned long dma_addr_upper;
> -                               /**
> -                                * For frag page support, not supported in
> -                                * 32-bit architectures with 64-bit DMA.
> -                                */
> -                               atomic_long_t pp_frag_count;
> -                       };
> +                       /**
> +                        * dma_addr_upper: might require a 64-bit
> +                        * value on 32-bit architectures.
> +                        */
> +                       unsigned long dma_addr_upper;
>                 };
>                 struct {        /* Tail pages of compound page */
>                         unsigned long compound_head;    /* Bit zero is set */
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index 813c93499f20..28e1fdbdcd53 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -279,14 +279,14 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
>
>  static inline void page_pool_fragment_page(struct page *page, long nr)
>  {
> -       atomic_long_set(&page->pp_frag_count, nr);
> +       page_ref_add(page, nr);
>  }
>
>  static inline long page_pool_defrag_page(struct page *page, long nr)
>  {
>         long ret;
>
> -       /* If nr == pp_frag_count then we have cleared all remaining
> +       /* If nr == page_ref_count then we have cleared all remaining
>          * references to the page. No need to actually overwrite it, instead
>          * we can leave this to be overwritten by the calling function.
>          *
> @@ -295,22 +295,14 @@ static inline long page_pool_defrag_page(struct page *page, long nr)
>          * especially when dealing with a page that may be partitioned
>          * into only 2 or 3 pieces.
>          */
> -       if (atomic_long_read(&page->pp_frag_count) == nr)
> +       if (page_ref_count(page) == nr)
>                 return 0;
>
> -       ret = atomic_long_sub_return(nr, &page->pp_frag_count);
> +       ret = page_ref_sub_return(page, nr);
>         WARN_ON(ret < 0);
>         return ret;
>  }
>
> -static inline bool page_pool_is_last_frag(struct page_pool *pool,
> -                                         struct page *page)
> -{
> -       /* If fragments aren't enabled or count is 0 we were the last user */
> -       return !(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
> -              (page_pool_defrag_page(page, 1) == 0);
> -}
> -
>  static inline void page_pool_put_page(struct page_pool *pool,
>                                       struct page *page,
>                                       unsigned int dma_sync_size,
> @@ -320,9 +312,6 @@ static inline void page_pool_put_page(struct page_pool *pool,
>          * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
>          */
>  #ifdef CONFIG_PAGE_POOL
> -       if (!page_pool_is_last_frag(pool, page))
> -               return;
> -
>         page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct);
>  #endif
>  }
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 9b203d8660e4..0defcadae225 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -25,7 +25,7 @@
>  #define DEFER_TIME (msecs_to_jiffies(1000))
>  #define DEFER_WARN_INTERVAL (60 * HZ)
>
> -#define BIAS_MAX       LONG_MAX
> +#define BIAS_MAX(pool) (PAGE_SIZE << ((pool)->p.order))
>
>  #ifdef CONFIG_PAGE_POOL_STATS
>  /* alloc_stat_inc is intended to be used in softirq context */
> @@ -619,10 +619,6 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
>         for (i = 0; i < count; i++) {
>                 struct page *page = virt_to_head_page(data[i]);
>
> -               /* It is not the last user for the page frag case */
> -               if (!page_pool_is_last_frag(pool, page))
> -                       continue;
> -
>                 page = __page_pool_put_page(pool, page, -1, false);
>                 /* Approved for bulk recycling in ptr_ring cache */
>                 if (page)
> @@ -659,7 +655,7 @@ EXPORT_SYMBOL(page_pool_put_page_bulk);
>  static struct page *page_pool_drain_frag(struct page_pool *pool,
>                                          struct page *page)
>  {
> -       long drain_count = BIAS_MAX - pool->frag_users;
> +       long drain_count = BIAS_MAX(pool) - pool->frag_users;
>
>         /* Some user is still using the page frag */
>         if (likely(page_pool_defrag_page(page, drain_count)))
> @@ -678,7 +674,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
>
>  static void page_pool_free_frag(struct page_pool *pool)
>  {
> -       long drain_count = BIAS_MAX - pool->frag_users;
> +       long drain_count = BIAS_MAX(pool) - pool->frag_users;
>         struct page *page = pool->frag_page;
>
>         pool->frag_page = NULL;
> @@ -724,7 +720,7 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
>                 pool->frag_users = 1;
>                 *offset = 0;
>                 pool->frag_offset = size;
> -               page_pool_fragment_page(page, BIAS_MAX);
> +               page_pool_fragment_page(page, BIAS_MAX(pool));
>                 return page;
>         }
>
> --
> 2.39.0
>
  
Alexander Duyck Jan. 24, 2023, 3:57 p.m. UTC | #2
On Tue, 2023-01-24 at 16:11 +0200, Ilias Apalodimas wrote:
> Hi Felix,
> 
> ++cc Alexander and Yunsheng.
> 
> Thanks for the report
> 
> On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > 
> > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > to reliably trigger page refcount underflow issues, which did not occur with
> > full-page page_pool allocation.
> > It appears to me, that handling refcounting in two separate counters
> > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > incremented by code dealing with skb fragments directly, and
> > page_pool_return_skb_page is called multiple times for the same fragment.
> > 
> > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > these underflow issues and crashes go away.
> > 
> 
> This has been discussed here [1].  TL;DR changing this to page
> refcount might blow up in other colorful ways.  Can we look closer and
> figure out why the underflow happens?
> 
> [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/
> 
> Thanks
> /Ilias
> 
> 

The logic should be safe in terms of the page pool itself as it should
be holding one reference to the page while the pp_frag_count is non-
zero. That one reference is what keeps the two halfs in sync as the
page shouldn't be able to be freed until we exhaust the pp_frag_count.

To have an underflow there are two possible scenarios. One is that
either put_page or free_page is being called somewhere that the
page_pool freeing functions should be used. The other possibility is
that a pp_frag_count reference was taken somewhere a page reference
should have.

Do we have a backtrace for the spots that are showing this underrun? If
nothing else we may want to look at tracking down the spots that are
freeing the page pool pages via put_page or free_page to determine what
paths these pages are taking.
  
Felix Fietkau Jan. 24, 2023, 4:59 p.m. UTC | #3
On 24.01.23 16:57, Alexander H Duyck wrote:
> On Tue, 2023-01-24 at 16:11 +0200, Ilias Apalodimas wrote:
>> Hi Felix,
>> 
>> ++cc Alexander and Yunsheng.
>> 
>> Thanks for the report
>> 
>> On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>> > 
>> > While testing fragmented page_pool allocation in the mt76 driver, I was able
>> > to reliably trigger page refcount underflow issues, which did not occur with
>> > full-page page_pool allocation.
>> > It appears to me, that handling refcounting in two separate counters
>> > (page->pp_frag_count and page refcount) is racy when page refcount gets
>> > incremented by code dealing with skb fragments directly, and
>> > page_pool_return_skb_page is called multiple times for the same fragment.
>> > 
>> > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> > these underflow issues and crashes go away.
>> > 
>> 
>> This has been discussed here [1].  TL;DR changing this to page
>> refcount might blow up in other colorful ways.  Can we look closer and
>> figure out why the underflow happens?
>> 
>> [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/
>> 
>> Thanks
>> /Ilias
>> 
>> 
> 
> The logic should be safe in terms of the page pool itself as it should
> be holding one reference to the page while the pp_frag_count is non-
> zero. That one reference is what keeps the two halfs in sync as the
> page shouldn't be able to be freed until we exhaust the pp_frag_count.
> 
> To have an underflow there are two possible scenarios. One is that
> either put_page or free_page is being called somewhere that the
> page_pool freeing functions should be used. The other possibility is
> that a pp_frag_count reference was taken somewhere a page reference
> should have.
> 
> Do we have a backtrace for the spots that are showing this underrun? If
> nothing else we may want to look at tracking down the spots that are
> freeing the page pool pages via put_page or free_page to determine what
> paths these pages are taking.
Here's an example of the kind of traces that I was seeing with v6.1:
https://nbd.name/p/61a6617e
On v5.15 I also occasionally got traces like this:
https://nbd.name/p/0b9e4f0d

 From what I can tell, it also triggered the warning that shows up when 
page->pp_frag_count underflows. Unfortunately these traces don't 
directly point to the place where things go wrong.
I do wonder if the pp_frag_count is maybe racy when we have a mix of 
get_page + page_pool_put_page calls.

In case you're wondering what I was doing to trigger the crash: I simply 
create 4 wireless client mode interfaces on the same card and pushed TCP 
traffic from an AP to all 4 simultaneously. I can trigger it pretty much 
immediately after TCP traffic ramps up.

- Felix
  
Felix Fietkau Jan. 24, 2023, 5:22 p.m. UTC | #4
On 24.01.23 15:11, Ilias Apalodimas wrote:
> Hi Felix,
> 
> ++cc Alexander and Yunsheng.
> 
> Thanks for the report
> 
> On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>>
>> While testing fragmented page_pool allocation in the mt76 driver, I was able
>> to reliably trigger page refcount underflow issues, which did not occur with
>> full-page page_pool allocation.
>> It appears to me, that handling refcounting in two separate counters
>> (page->pp_frag_count and page refcount) is racy when page refcount gets
>> incremented by code dealing with skb fragments directly, and
>> page_pool_return_skb_page is called multiple times for the same fragment.
>>
>> Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> these underflow issues and crashes go away.
>>
> 
> This has been discussed here [1].  TL;DR changing this to page
> refcount might blow up in other colorful ways.  Can we look closer and
> figure out why the underflow happens?
I don't see how the approch taken in my patch would blow up. From what I 
can tell, it should be fairly close to how refcount is handled in 
page_frag_alloc. The main improvement it adds is to prevent it from 
blowing up if pool-allocated fragments get shared across multiple skbs 
with corresponding get_page and page_pool_return_skb_page calls.

- Felix
  
Alexander Duyck Jan. 24, 2023, 9:10 p.m. UTC | #5
On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
> On 24.01.23 15:11, Ilias Apalodimas wrote:
> > Hi Felix,
> > 
> > ++cc Alexander and Yunsheng.
> > 
> > Thanks for the report
> > 
> > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > > 
> > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > to reliably trigger page refcount underflow issues, which did not occur with
> > > full-page page_pool allocation.
> > > It appears to me, that handling refcounting in two separate counters
> > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > incremented by code dealing with skb fragments directly, and
> > > page_pool_return_skb_page is called multiple times for the same fragment.
> > > 
> > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > these underflow issues and crashes go away.
> > > 
> > 
> > This has been discussed here [1].  TL;DR changing this to page
> > refcount might blow up in other colorful ways.  Can we look closer and
> > figure out why the underflow happens?
> I don't see how the approch taken in my patch would blow up. From what I 
> can tell, it should be fairly close to how refcount is handled in 
> page_frag_alloc. The main improvement it adds is to prevent it from 
> blowing up if pool-allocated fragments get shared across multiple skbs 
> with corresponding get_page and page_pool_return_skb_page calls.
> 
> - Felix
> 

Do you have the patch available to review as an RFC? From what I am
seeing it looks like you are underrunning on the pp_frag_count itself.
I would suspect the issue to be something like starting with a bad
count in terms of the total number of references, or deducing the wrong
amount when you finally free the page assuming you are tracking your
frag count using a non-atomic value in the driver.

Thanks,

- Alex
  
Felix Fietkau Jan. 24, 2023, 9:30 p.m. UTC | #6
On 24.01.23 22:10, Alexander H Duyck wrote:
> On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>> On 24.01.23 15:11, Ilias Apalodimas wrote:
>> > Hi Felix,
>> > 
>> > ++cc Alexander and Yunsheng.
>> > 
>> > Thanks for the report
>> > 
>> > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>> > > 
>> > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>> > > to reliably trigger page refcount underflow issues, which did not occur with
>> > > full-page page_pool allocation.
>> > > It appears to me, that handling refcounting in two separate counters
>> > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>> > > incremented by code dealing with skb fragments directly, and
>> > > page_pool_return_skb_page is called multiple times for the same fragment.
>> > > 
>> > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> > > these underflow issues and crashes go away.
>> > > 
>> > 
>> > This has been discussed here [1].  TL;DR changing this to page
>> > refcount might blow up in other colorful ways.  Can we look closer and
>> > figure out why the underflow happens?
>> I don't see how the approch taken in my patch would blow up. From what I 
>> can tell, it should be fairly close to how refcount is handled in 
>> page_frag_alloc. The main improvement it adds is to prevent it from 
>> blowing up if pool-allocated fragments get shared across multiple skbs 
>> with corresponding get_page and page_pool_return_skb_page calls.
>> 
>> - Felix
>> 
> 
> Do you have the patch available to review as an RFC? From what I am
> seeing it looks like you are underrunning on the pp_frag_count itself.
> I would suspect the issue to be something like starting with a bad
> count in terms of the total number of references, or deducing the wrong
> amount when you finally free the page assuming you are tracking your
> frag count using a non-atomic value in the driver.
The driver patches for page pool are here:
https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/

They are also applied in my mt76 tree at:
https://github.com/nbd168/wireless

- Felix
  
Alexander Duyck Jan. 25, 2023, 5:11 p.m. UTC | #7
On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
> On 24.01.23 22:10, Alexander H Duyck wrote:
> > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
> > > On 24.01.23 15:11, Ilias Apalodimas wrote:
> > > > Hi Felix,
> > > > 
> > > > ++cc Alexander and Yunsheng.
> > > > 
> > > > Thanks for the report
> > > > 
> > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > > > > 
> > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > > > to reliably trigger page refcount underflow issues, which did not occur with
> > > > > full-page page_pool allocation.
> > > > > It appears to me, that handling refcounting in two separate counters
> > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > > > incremented by code dealing with skb fragments directly, and
> > > > > page_pool_return_skb_page is called multiple times for the same fragment.
> > > > > 
> > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > > > these underflow issues and crashes go away.
> > > > > 
> > > > 
> > > > This has been discussed here [1].  TL;DR changing this to page
> > > > refcount might blow up in other colorful ways.  Can we look closer and
> > > > figure out why the underflow happens?
> > > I don't see how the approch taken in my patch would blow up. From what I 
> > > can tell, it should be fairly close to how refcount is handled in 
> > > page_frag_alloc. The main improvement it adds is to prevent it from 
> > > blowing up if pool-allocated fragments get shared across multiple skbs 
> > > with corresponding get_page and page_pool_return_skb_page calls.
> > > 
> > > - Felix
> > > 
> > 
> > Do you have the patch available to review as an RFC? From what I am
> > seeing it looks like you are underrunning on the pp_frag_count itself.
> > I would suspect the issue to be something like starting with a bad
> > count in terms of the total number of references, or deducing the wrong
> > amount when you finally free the page assuming you are tracking your
> > frag count using a non-atomic value in the driver.
> The driver patches for page pool are here:
> https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
> https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
> 
> They are also applied in my mt76 tree at:
> https://github.com/nbd168/wireless
> 
> - Felix

So one thing I am thinking is that we may be seeing an issue where we
are somehow getting a mix of frag and non-frag based page pool pages.
That is the only case I can think of where we might be underflowing
negative. If you could add some additional debug info on the underflow
WARN_ON case in page_pool_defrag_page that might be useful.
Specifically I would be curious what the actual return value is. I'm
assuming we are only hitting negative 1, but I would want to verify we
aren't seeing something else.

Also just to confirm this is building on 64b kernel correct? Just want
to make sure we don't have this running on a 32b setup where the frag
count and the upper 32b of the DMA address are overlapped.

As far as the patch set I only really see a few minor issues which I am
going to post a few snippets below.


> diff --git a/drivers/net/wireless/mediatek/mt76/dma.c
> b/drivers/net/wireless/mediatek/mt76/dma.c
> index 611769e445fa..7fd9aa9c3d9e 100644

...

> @@ -593,25 +593,28 @@  mt76_dma_rx_fill(struct mt76_dev *dev, struct
> mt76_queue *q)
>  
>  	while (q->queued < q->ndesc - 1) {
>  		struct mt76_queue_buf qbuf;
> -		void *buf = NULL;
> +		dma_addr_t addr;
> +		int offset;
> +		void *buf;
>  
> -		buf = page_frag_alloc(&q->rx_page, q->buf_size,
> GFP_ATOMIC);
> +		buf = mt76_get_page_pool_buf(q, &offset, q-
> >buf_size);
>  		if (!buf)
>  			break;
>  
> -		addr = dma_map_single(dev->dma_dev, buf, len,
> DMA_FROM_DEVICE);
> +		addr = dma_map_single(dev->dma_dev, buf + offset,
> len,
> +				      DMA_FROM_DEVICE);

Offset was already added to buf in mt76_get_page_pool_buf so the DMA
mapping offset doesn't look right to me.

>  		if (unlikely(dma_mapping_error(dev->dma_dev, addr)))
> {
> -			skb_free_frag(buf);
> +			mt76_put_page_pool_buf(buf, allow_direct);
>  			break;
>  		}
>  

I'm not a fan of the defensive programming in mt76_put_page_pool_buf.
If you are in an area that is using page pool you should be using the
page pool version of the freeing operations instead of adding
additional overhead that can mess things up by having it have to also
check for if the page is a page pool page or not.

> -		qbuf.addr = addr + offset;
> -		qbuf.len = len - offset;
> +		qbuf.addr = addr + q->buf_offset;
> +		qbuf.len = len - q->buf_offset;
>  		qbuf.skip_unmap = false;
>  		if (mt76_dma_add_rx_buf(dev, q, &qbuf, buf) < 0) {
>  			dma_unmap_single(dev->dma_dev, addr, len,
>  					 DMA_FROM_DEVICE);
> -			skb_free_frag(buf);
> +			mt76_put_page_pool_buf(buf, allow_direct);
>  			break;
>  		}
>  		frames++;

...

> @@ -848,6 +847,8 @@  mt76_dma_rx_process(struct mt76_dev *dev, struct
> mt76_queue *q, int budget)
>  			goto free_frag;
>  
>  		skb_reserve(skb, q->buf_offset);
> +		if (mt76_is_page_from_pp(data))
> +			skb_mark_for_recycle(skb);
>  
>  		*(u32 *)skb->cb = info;
>  

More defensive programming here. Is there a path that allows for a
mixed setup?

The only spot where I can see there being anything like that is in 
/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c. But it doesn't make
any sense to me as to why it was included in the patch. It might be
easier to sort out the issue if we were to get rid of some of the
defensive programming.
  
Felix Fietkau Jan. 25, 2023, 5:32 p.m. UTC | #8
On 25.01.23 18:11, Alexander H Duyck wrote:
> On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>> On 24.01.23 22:10, Alexander H Duyck wrote:
>> > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>> > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>> > > > Hi Felix,
>> > > > 
>> > > > ++cc Alexander and Yunsheng.
>> > > > 
>> > > > Thanks for the report
>> > > > 
>> > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>> > > > > 
>> > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>> > > > > to reliably trigger page refcount underflow issues, which did not occur with
>> > > > > full-page page_pool allocation.
>> > > > > It appears to me, that handling refcounting in two separate counters
>> > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>> > > > > incremented by code dealing with skb fragments directly, and
>> > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>> > > > > 
>> > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> > > > > these underflow issues and crashes go away.
>> > > > > 
>> > > > 
>> > > > This has been discussed here [1].  TL;DR changing this to page
>> > > > refcount might blow up in other colorful ways.  Can we look closer and
>> > > > figure out why the underflow happens?
>> > > I don't see how the approch taken in my patch would blow up. From what I 
>> > > can tell, it should be fairly close to how refcount is handled in 
>> > > page_frag_alloc. The main improvement it adds is to prevent it from 
>> > > blowing up if pool-allocated fragments get shared across multiple skbs 
>> > > with corresponding get_page and page_pool_return_skb_page calls.
>> > > 
>> > > - Felix
>> > > 
>> > 
>> > Do you have the patch available to review as an RFC? From what I am
>> > seeing it looks like you are underrunning on the pp_frag_count itself.
>> > I would suspect the issue to be something like starting with a bad
>> > count in terms of the total number of references, or deducing the wrong
>> > amount when you finally free the page assuming you are tracking your
>> > frag count using a non-atomic value in the driver.
>> The driver patches for page pool are here:
>> https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>> https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>> 
>> They are also applied in my mt76 tree at:
>> https://github.com/nbd168/wireless
>> 
>> - Felix
> 
> So one thing I am thinking is that we may be seeing an issue where we
> are somehow getting a mix of frag and non-frag based page pool pages.
> That is the only case I can think of where we might be underflowing
> negative. If you could add some additional debug info on the underflow
> WARN_ON case in page_pool_defrag_page that might be useful.
> Specifically I would be curious what the actual return value is. I'm
> assuming we are only hitting negative 1, but I would want to verify we
> aren't seeing something else.
I'll try to run some more tests soon. However, I think I found the piece 
of code that is incompatible with using pp_frag_count.
When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
packet), and it is not split by the hardware, a cfg80211 function 
extracts the individual MSDUs into separate skbs. In that case, a 
fragment can be shared across multiple skbs, and get_page is used to 
increase the refcount.
You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
its helper functions).
This code also has a bug where it doesn't set pp_recycle on the newly 
allocated skb if the previous one has it, but that's a separate matter 
and fixing it doesn't make the crash go away.
Is there any way I can make that part of the code work with the current 
page pool frag implementation?

> Also just to confirm this is building on 64b kernel correct? Just want
> to make sure we don't have this running on a 32b setup where the frag
> count and the upper 32b of the DMA address are overlapped.
Yes, I'm using a 64b kernel.

> As far as the patch set I only really see a few minor issues which I am
> going to post a few snippets below.
> 
> 
>> diff --git a/drivers/net/wireless/mediatek/mt76/dma.c
>> b/drivers/net/wireless/mediatek/mt76/dma.c
>> index 611769e445fa..7fd9aa9c3d9e 100644
> 
> ...
> 
>> @@ -593,25 +593,28 @@  mt76_dma_rx_fill(struct mt76_dev *dev, struct
>> mt76_queue *q)
>>  
>>  	while (q->queued < q->ndesc - 1) {
>>  		struct mt76_queue_buf qbuf;
>> -		void *buf = NULL;
>> +		dma_addr_t addr;
>> +		int offset;
>> +		void *buf;
>>  
>> -		buf = page_frag_alloc(&q->rx_page, q->buf_size,
>> GFP_ATOMIC);
>> +		buf = mt76_get_page_pool_buf(q, &offset, q-
>> >buf_size);
>>  		if (!buf)
>>  			break;
>>  
>> -		addr = dma_map_single(dev->dma_dev, buf, len,
>> DMA_FROM_DEVICE);
>> +		addr = dma_map_single(dev->dma_dev, buf + offset,
>> len,
>> +				      DMA_FROM_DEVICE);
> 
> Offset was already added to buf in mt76_get_page_pool_buf so the DMA
> mapping offset doesn't look right to me.
Right. This is resolved by the follow-up patch which keeps pages DMA 
mapped. I plan on squashing both patches into one and adding some fixes 
on top when the underlying page pool issue is resolved.

>>  		if (unlikely(dma_mapping_error(dev->dma_dev, addr)))
>> {
>> -			skb_free_frag(buf);
>> +			mt76_put_page_pool_buf(buf, allow_direct);
>>  			break;
>>  		}
>>  
> 
> I'm not a fan of the defensive programming in mt76_put_page_pool_buf.
> If you are in an area that is using page pool you should be using the
> page pool version of the freeing operations instead of adding
> additional overhead that can mess things up by having it have to also
> check for if the page is a page pool page or not.
See below.

>> -		qbuf.addr = addr + offset;
>> -		qbuf.len = len - offset;
>> +		qbuf.addr = addr + q->buf_offset;
>> +		qbuf.len = len - q->buf_offset;
>>  		qbuf.skip_unmap = false;
>>  		if (mt76_dma_add_rx_buf(dev, q, &qbuf, buf) < 0) {
>>  			dma_unmap_single(dev->dma_dev, addr, len,
>>  					 DMA_FROM_DEVICE);
>> -			skb_free_frag(buf);
>> +			mt76_put_page_pool_buf(buf, allow_direct);
>>  			break;
>>  		}
>>  		frames++;
> 
> ...
> 
>> @@ -848,6 +847,8 @@  mt76_dma_rx_process(struct mt76_dev *dev, struct
>> mt76_queue *q, int budget)
>>  			goto free_frag;
>>  
>>  		skb_reserve(skb, q->buf_offset);
>> +		if (mt76_is_page_from_pp(data))
>> +			skb_mark_for_recycle(skb);
>>  
>>  		*(u32 *)skb->cb = info;
>>  
> 
> More defensive programming here. Is there a path that allows for a
> mixed setup?
> 
> The only spot where I can see there being anything like that is in
> /drivers/net/wireless/mediatek/mt76/mt7915/mmio.c. But it doesn't make
> any sense to me as to why it was included in the patch. It might be
> easier to sort out the issue if we were to get rid of some of the
> defensive programming.
This is not defensive programming. In its current state, there is a 
scenario where we can have a mix of pp and non-pp pages (if hardware 
offload support is enabled).
However in my tests, offload support was disabled and all pages are PP 
ones.
I also have some unpublished pending changes to always allocate from the 
pool (even for the initial buffers allocated for offloading).
This did not make a difference in my tests though.

- Felix
  
Alexander Duyck Jan. 25, 2023, 6:26 p.m. UTC | #9
On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
> On 25.01.23 18:11, Alexander H Duyck wrote:
> > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
> > > On 24.01.23 22:10, Alexander H Duyck wrote:
> > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
> > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
> > > > > > Hi Felix,
> > > > > > 
> > > > > > ++cc Alexander and Yunsheng.
> > > > > > 
> > > > > > Thanks for the report
> > > > > > 
> > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > > > > > > 
> > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
> > > > > > > full-page page_pool allocation.
> > > > > > > It appears to me, that handling refcounting in two separate counters
> > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > > > > > incremented by code dealing with skb fragments directly, and
> > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
> > > > > > > 
> > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > > > > > these underflow issues and crashes go away.
> > > > > > > 
> > > > > > 
> > > > > > This has been discussed here [1].  TL;DR changing this to page
> > > > > > refcount might blow up in other colorful ways.  Can we look closer and
> > > > > > figure out why the underflow happens?
> > > > > I don't see how the approch taken in my patch would blow up. From what I 
> > > > > can tell, it should be fairly close to how refcount is handled in 
> > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
> > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
> > > > > with corresponding get_page and page_pool_return_skb_page calls.
> > > > > 
> > > > > - Felix
> > > > > 
> > > > 
> > > > Do you have the patch available to review as an RFC? From what I am
> > > > seeing it looks like you are underrunning on the pp_frag_count itself.
> > > > I would suspect the issue to be something like starting with a bad
> > > > count in terms of the total number of references, or deducing the wrong
> > > > amount when you finally free the page assuming you are tracking your
> > > > frag count using a non-atomic value in the driver.
> > > The driver patches for page pool are here:
> > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
> > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
> > > 
> > > They are also applied in my mt76 tree at:
> > > https://github.com/nbd168/wireless
> > > 
> > > - Felix
> > 
> > So one thing I am thinking is that we may be seeing an issue where we
> > are somehow getting a mix of frag and non-frag based page pool pages.
> > That is the only case I can think of where we might be underflowing
> > negative. If you could add some additional debug info on the underflow
> > WARN_ON case in page_pool_defrag_page that might be useful.
> > Specifically I would be curious what the actual return value is. I'm
> > assuming we are only hitting negative 1, but I would want to verify we
> > aren't seeing something else.
> I'll try to run some more tests soon. However, I think I found the piece 
> of code that is incompatible with using pp_frag_count.
> When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
> packet), and it is not split by the hardware, a cfg80211 function 
> extracts the individual MSDUs into separate skbs. In that case, a 
> fragment can be shared across multiple skbs, and get_page is used to 
> increase the refcount.
> You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
> its helper functions).

I'm not sure if it is problematic or not. Basically it is trading off
by copying over the frags, calling get_page on each frag, and then
using dev_kfree_skb to disassemble and release the pp_frag references.
There should be other paths in the kernel that are doing something
similar.

> This code also has a bug where it doesn't set pp_recycle on the newly 
> allocated skb if the previous one has it, but that's a separate matter 
> and fixing it doesn't make the crash go away.

Adding the recycle would cause this bug. So one thing we might be
seeing is something like that triggering this error. Specifically if
the page is taken via get_page when assembling the new skb then we
cannot set the recycle flag in the new skb otherwise it will result in
the reference undercount we are seeing. What we are doing is shifting
the references away from the pp_frag_count to the page reference count
in this case. If we set the pp_recycle flag then it would cause us to
decrement pp_frag_count instead of the page reference count resulting
in the underrun. 

> Is there any way I can make that part of the code work with the current 
> page pool frag implementation?

The current code should work. Basically as long as the references are
taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
into this issue because the pp_frag_count will be dropped when the
original skb is freed and the page reference count will be decremented
when the new one is freed.

For page pool page fragments the main thing to keep in mind is that if
pp_recycle is set it will update the pp_frag_count and if it is not
then it will just decrement the page reference count.
  
Felix Fietkau Jan. 25, 2023, 6:42 p.m. UTC | #10
On 25.01.23 19:26, Alexander H Duyck wrote:
> On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
>> On 25.01.23 18:11, Alexander H Duyck wrote:
>> > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>> > > On 24.01.23 22:10, Alexander H Duyck wrote:
>> > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>> > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>> > > > > > Hi Felix,
>> > > > > > 
>> > > > > > ++cc Alexander and Yunsheng.
>> > > > > > 
>> > > > > > Thanks for the report
>> > > > > > 
>> > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>> > > > > > > 
>> > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>> > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
>> > > > > > > full-page page_pool allocation.
>> > > > > > > It appears to me, that handling refcounting in two separate counters
>> > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>> > > > > > > incremented by code dealing with skb fragments directly, and
>> > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>> > > > > > > 
>> > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> > > > > > > these underflow issues and crashes go away.
>> > > > > > > 
>> > > > > > 
>> > > > > > This has been discussed here [1].  TL;DR changing this to page
>> > > > > > refcount might blow up in other colorful ways.  Can we look closer and
>> > > > > > figure out why the underflow happens?
>> > > > > I don't see how the approch taken in my patch would blow up. From what I 
>> > > > > can tell, it should be fairly close to how refcount is handled in 
>> > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
>> > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
>> > > > > with corresponding get_page and page_pool_return_skb_page calls.
>> > > > > 
>> > > > > - Felix
>> > > > > 
>> > > > 
>> > > > Do you have the patch available to review as an RFC? From what I am
>> > > > seeing it looks like you are underrunning on the pp_frag_count itself.
>> > > > I would suspect the issue to be something like starting with a bad
>> > > > count in terms of the total number of references, or deducing the wrong
>> > > > amount when you finally free the page assuming you are tracking your
>> > > > frag count using a non-atomic value in the driver.
>> > > The driver patches for page pool are here:
>> > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>> > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>> > > 
>> > > They are also applied in my mt76 tree at:
>> > > https://github.com/nbd168/wireless
>> > > 
>> > > - Felix
>> > 
>> > So one thing I am thinking is that we may be seeing an issue where we
>> > are somehow getting a mix of frag and non-frag based page pool pages.
>> > That is the only case I can think of where we might be underflowing
>> > negative. If you could add some additional debug info on the underflow
>> > WARN_ON case in page_pool_defrag_page that might be useful.
>> > Specifically I would be curious what the actual return value is. I'm
>> > assuming we are only hitting negative 1, but I would want to verify we
>> > aren't seeing something else.
>> I'll try to run some more tests soon. However, I think I found the piece 
>> of code that is incompatible with using pp_frag_count.
>> When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
>> packet), and it is not split by the hardware, a cfg80211 function 
>> extracts the individual MSDUs into separate skbs. In that case, a 
>> fragment can be shared across multiple skbs, and get_page is used to 
>> increase the refcount.
>> You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
>> its helper functions).
> 
> I'm not sure if it is problematic or not. Basically it is trading off
> by copying over the frags, calling get_page on each frag, and then
> using dev_kfree_skb to disassemble and release the pp_frag references.
> There should be other paths in the kernel that are doing something
> similar.
> 
>> This code also has a bug where it doesn't set pp_recycle on the newly 
>> allocated skb if the previous one has it, but that's a separate matter 
>> and fixing it doesn't make the crash go away.
> 
> Adding the recycle would cause this bug. So one thing we might be
> seeing is something like that triggering this error. Specifically if
> the page is taken via get_page when assembling the new skb then we
> cannot set the recycle flag in the new skb otherwise it will result in
> the reference undercount we are seeing. What we are doing is shifting
> the references away from the pp_frag_count to the page reference count
> in this case. If we set the pp_recycle flag then it would cause us to
> decrement pp_frag_count instead of the page reference count resulting
> in the underrun.
Couldn't leaving out the pp_recycle flag potentially lead to a case 
where the last user of the page drops it via page_frag_free instead of 
page_pool_return_skb_page? Is that valid?

>> Is there any way I can make that part of the code work with the current 
>> page pool frag implementation?
> 
> The current code should work. Basically as long as the references are
> taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
> into this issue because the pp_frag_count will be dropped when the
> original skb is freed and the page reference count will be decremented
> when the new one is freed.
> 
> For page pool page fragments the main thing to keep in mind is that if
> pp_recycle is set it will update the pp_frag_count and if it is not
> then it will just decrement the page reference count.
What takes care of DMA unmap and other cleanup if the last reference to 
the page is dropped via page_frag_free?

- Felix
  
Alexander Duyck Jan. 25, 2023, 7:02 p.m. UTC | #11
On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
> On 25.01.23 19:26, Alexander H Duyck wrote:
> > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
> > > On 25.01.23 18:11, Alexander H Duyck wrote:
> > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
> > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
> > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
> > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
> > > > > > > > Hi Felix,
> > > > > > > > 
> > > > > > > > ++cc Alexander and Yunsheng.
> > > > > > > > 
> > > > > > > > Thanks for the report
> > > > > > > > 
> > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > > > > > > > > 
> > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
> > > > > > > > > full-page page_pool allocation.
> > > > > > > > > It appears to me, that handling refcounting in two separate counters
> > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > > > > > > > incremented by code dealing with skb fragments directly, and
> > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
> > > > > > > > > 
> > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > > > > > > > these underflow issues and crashes go away.
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > This has been discussed here [1].  TL;DR changing this to page
> > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
> > > > > > > > figure out why the underflow happens?
> > > > > > > I don't see how the approch taken in my patch would blow up. From what I 
> > > > > > > can tell, it should be fairly close to how refcount is handled in 
> > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
> > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
> > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
> > > > > > > 
> > > > > > > - Felix
> > > > > > > 
> > > > > > 
> > > > > > Do you have the patch available to review as an RFC? From what I am
> > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
> > > > > > I would suspect the issue to be something like starting with a bad
> > > > > > count in terms of the total number of references, or deducing the wrong
> > > > > > amount when you finally free the page assuming you are tracking your
> > > > > > frag count using a non-atomic value in the driver.
> > > > > The driver patches for page pool are here:
> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
> > > > > 
> > > > > They are also applied in my mt76 tree at:
> > > > > https://github.com/nbd168/wireless
> > > > > 
> > > > > - Felix
> > > > 
> > > > So one thing I am thinking is that we may be seeing an issue where we
> > > > are somehow getting a mix of frag and non-frag based page pool pages.
> > > > That is the only case I can think of where we might be underflowing
> > > > negative. If you could add some additional debug info on the underflow
> > > > WARN_ON case in page_pool_defrag_page that might be useful.
> > > > Specifically I would be curious what the actual return value is. I'm
> > > > assuming we are only hitting negative 1, but I would want to verify we
> > > > aren't seeing something else.
> > > I'll try to run some more tests soon. However, I think I found the piece 
> > > of code that is incompatible with using pp_frag_count.
> > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
> > > packet), and it is not split by the hardware, a cfg80211 function 
> > > extracts the individual MSDUs into separate skbs. In that case, a 
> > > fragment can be shared across multiple skbs, and get_page is used to 
> > > increase the refcount.
> > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
> > > its helper functions).
> > 
> > I'm not sure if it is problematic or not. Basically it is trading off
> > by copying over the frags, calling get_page on each frag, and then
> > using dev_kfree_skb to disassemble and release the pp_frag references.
> > There should be other paths in the kernel that are doing something
> > similar.
> > 
> > > This code also has a bug where it doesn't set pp_recycle on the newly 
> > > allocated skb if the previous one has it, but that's a separate matter 
> > > and fixing it doesn't make the crash go away.
> > 
> > Adding the recycle would cause this bug. So one thing we might be
> > seeing is something like that triggering this error. Specifically if
> > the page is taken via get_page when assembling the new skb then we
> > cannot set the recycle flag in the new skb otherwise it will result in
> > the reference undercount we are seeing. What we are doing is shifting
> > the references away from the pp_frag_count to the page reference count
> > in this case. If we set the pp_recycle flag then it would cause us to
> > decrement pp_frag_count instead of the page reference count resulting
> > in the underrun.
> Couldn't leaving out the pp_recycle flag potentially lead to a case 
> where the last user of the page drops it via page_frag_free instead of 
> page_pool_return_skb_page? Is that valid?

No. What will happen is that when the pp_frag_count is exhausted the
page will be unmapped and evicted from the page pool. When the page is
then finally freed it will end up going back to the page allocator
instead of page pool.

Basically the idea is that until pp_frag_count reaches 0 there will be
at least 1 page reference held.

> > > Is there any way I can make that part of the code work with the current 
> > > page pool frag implementation?
> > 
> > The current code should work. Basically as long as the references are
> > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
> > into this issue because the pp_frag_count will be dropped when the
> > original skb is freed and the page reference count will be decremented
> > when the new one is freed.
> > 
> > For page pool page fragments the main thing to keep in mind is that if
> > pp_recycle is set it will update the pp_frag_count and if it is not
> > then it will just decrement the page reference count.
> What takes care of DMA unmap and other cleanup if the last reference to 
> the page is dropped via page_frag_free?
> 
> - Felix

When the page is freed on the skb w/ pp_recycle set it will unmap the
page and evict it from the page pool. Basically in these cases the page
goes from the page pool back to the page allocator.

The general idea with this is that if we are using fragments that there
will be enough of them floating around that if one or two frags have a
temporeary detour through a non-recycling path that hopefully by the
time the last fragment is freed the other instances holding the
additional page reference will have let them go. If not then the page
will go back to the page allocator and it will have to be replaced in
the page pool.
  
Felix Fietkau Jan. 25, 2023, 7:10 p.m. UTC | #12
On 25.01.23 20:02, Alexander H Duyck wrote:
> On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
>> On 25.01.23 19:26, Alexander H Duyck wrote:
>> > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
>> > > On 25.01.23 18:11, Alexander H Duyck wrote:
>> > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>> > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
>> > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>> > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>> > > > > > > > Hi Felix,
>> > > > > > > > 
>> > > > > > > > ++cc Alexander and Yunsheng.
>> > > > > > > > 
>> > > > > > > > Thanks for the report
>> > > > > > > > 
>> > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>> > > > > > > > > 
>> > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>> > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
>> > > > > > > > > full-page page_pool allocation.
>> > > > > > > > > It appears to me, that handling refcounting in two separate counters
>> > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>> > > > > > > > > incremented by code dealing with skb fragments directly, and
>> > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>> > > > > > > > > 
>> > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> > > > > > > > > these underflow issues and crashes go away.
>> > > > > > > > > 
>> > > > > > > > 
>> > > > > > > > This has been discussed here [1].  TL;DR changing this to page
>> > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
>> > > > > > > > figure out why the underflow happens?
>> > > > > > > I don't see how the approch taken in my patch would blow up. From what I 
>> > > > > > > can tell, it should be fairly close to how refcount is handled in 
>> > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
>> > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
>> > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
>> > > > > > > 
>> > > > > > > - Felix
>> > > > > > > 
>> > > > > > 
>> > > > > > Do you have the patch available to review as an RFC? From what I am
>> > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
>> > > > > > I would suspect the issue to be something like starting with a bad
>> > > > > > count in terms of the total number of references, or deducing the wrong
>> > > > > > amount when you finally free the page assuming you are tracking your
>> > > > > > frag count using a non-atomic value in the driver.
>> > > > > The driver patches for page pool are here:
>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>> > > > > 
>> > > > > They are also applied in my mt76 tree at:
>> > > > > https://github.com/nbd168/wireless
>> > > > > 
>> > > > > - Felix
>> > > > 
>> > > > So one thing I am thinking is that we may be seeing an issue where we
>> > > > are somehow getting a mix of frag and non-frag based page pool pages.
>> > > > That is the only case I can think of where we might be underflowing
>> > > > negative. If you could add some additional debug info on the underflow
>> > > > WARN_ON case in page_pool_defrag_page that might be useful.
>> > > > Specifically I would be curious what the actual return value is. I'm
>> > > > assuming we are only hitting negative 1, but I would want to verify we
>> > > > aren't seeing something else.
>> > > I'll try to run some more tests soon. However, I think I found the piece 
>> > > of code that is incompatible with using pp_frag_count.
>> > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
>> > > packet), and it is not split by the hardware, a cfg80211 function 
>> > > extracts the individual MSDUs into separate skbs. In that case, a 
>> > > fragment can be shared across multiple skbs, and get_page is used to 
>> > > increase the refcount.
>> > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
>> > > its helper functions).
>> > 
>> > I'm not sure if it is problematic or not. Basically it is trading off
>> > by copying over the frags, calling get_page on each frag, and then
>> > using dev_kfree_skb to disassemble and release the pp_frag references.
>> > There should be other paths in the kernel that are doing something
>> > similar.
>> > 
>> > > This code also has a bug where it doesn't set pp_recycle on the newly 
>> > > allocated skb if the previous one has it, but that's a separate matter 
>> > > and fixing it doesn't make the crash go away.
>> > 
>> > Adding the recycle would cause this bug. So one thing we might be
>> > seeing is something like that triggering this error. Specifically if
>> > the page is taken via get_page when assembling the new skb then we
>> > cannot set the recycle flag in the new skb otherwise it will result in
>> > the reference undercount we are seeing. What we are doing is shifting
>> > the references away from the pp_frag_count to the page reference count
>> > in this case. If we set the pp_recycle flag then it would cause us to
>> > decrement pp_frag_count instead of the page reference count resulting
>> > in the underrun.
>> Couldn't leaving out the pp_recycle flag potentially lead to a case 
>> where the last user of the page drops it via page_frag_free instead of 
>> page_pool_return_skb_page? Is that valid?
> 
> No. What will happen is that when the pp_frag_count is exhausted the
> page will be unmapped and evicted from the page pool. When the page is
> then finally freed it will end up going back to the page allocator
> instead of page pool.
> 
> Basically the idea is that until pp_frag_count reaches 0 there will be
> at least 1 page reference held.
> 
>> > > Is there any way I can make that part of the code work with the current 
>> > > page pool frag implementation?
>> > 
>> > The current code should work. Basically as long as the references are
>> > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
>> > into this issue because the pp_frag_count will be dropped when the
>> > original skb is freed and the page reference count will be decremented
>> > when the new one is freed.
>> > 
>> > For page pool page fragments the main thing to keep in mind is that if
>> > pp_recycle is set it will update the pp_frag_count and if it is not
>> > then it will just decrement the page reference count.
>> What takes care of DMA unmap and other cleanup if the last reference to 
>> the page is dropped via page_frag_free?
>> 
>> - Felix
> 
> When the page is freed on the skb w/ pp_recycle set it will unmap the
> page and evict it from the page pool. Basically in these cases the page
> goes from the page pool back to the page allocator.
> 
> The general idea with this is that if we are using fragments that there
> will be enough of them floating around that if one or two frags have a
> temporeary detour through a non-recycling path that hopefully by the
> time the last fragment is freed the other instances holding the
> additional page reference will have let them go. If not then the page
> will go back to the page allocator and it will have to be replaced in
> the page pool.
Thanks for the explanation, it makes sense to me now. Unfortunately it 
also means that I have no idea what could cause this issue. I will 
finish my mt76 patch rework which gets rid of the pp vs non-pp 
allocation mix and re-run my tests to provide updated traces.

- Felix
  
Felix Fietkau Jan. 25, 2023, 7:40 p.m. UTC | #13
On 25.01.23 20:10, Felix Fietkau wrote:
> On 25.01.23 20:02, Alexander H Duyck wrote:
>> On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
>>> On 25.01.23 19:26, Alexander H Duyck wrote:
>>> > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
>>> > > On 25.01.23 18:11, Alexander H Duyck wrote:
>>> > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>>> > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
>>> > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>>> > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>>> > > > > > > > Hi Felix,
>>> > > > > > > > 
>>> > > > > > > > ++cc Alexander and Yunsheng.
>>> > > > > > > > 
>>> > > > > > > > Thanks for the report
>>> > > > > > > > 
>>> > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>>> > > > > > > > > 
>>> > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>>> > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
>>> > > > > > > > > full-page page_pool allocation.
>>> > > > > > > > > It appears to me, that handling refcounting in two separate counters
>>> > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>>> > > > > > > > > incremented by code dealing with skb fragments directly, and
>>> > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>>> > > > > > > > > 
>>> > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>>> > > > > > > > > these underflow issues and crashes go away.
>>> > > > > > > > > 
>>> > > > > > > > 
>>> > > > > > > > This has been discussed here [1].  TL;DR changing this to page
>>> > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
>>> > > > > > > > figure out why the underflow happens?
>>> > > > > > > I don't see how the approch taken in my patch would blow up. From what I 
>>> > > > > > > can tell, it should be fairly close to how refcount is handled in 
>>> > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
>>> > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
>>> > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
>>> > > > > > > 
>>> > > > > > > - Felix
>>> > > > > > > 
>>> > > > > > 
>>> > > > > > Do you have the patch available to review as an RFC? From what I am
>>> > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
>>> > > > > > I would suspect the issue to be something like starting with a bad
>>> > > > > > count in terms of the total number of references, or deducing the wrong
>>> > > > > > amount when you finally free the page assuming you are tracking your
>>> > > > > > frag count using a non-atomic value in the driver.
>>> > > > > The driver patches for page pool are here:
>>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>>> > > > > 
>>> > > > > They are also applied in my mt76 tree at:
>>> > > > > https://github.com/nbd168/wireless
>>> > > > > 
>>> > > > > - Felix
>>> > > > 
>>> > > > So one thing I am thinking is that we may be seeing an issue where we
>>> > > > are somehow getting a mix of frag and non-frag based page pool pages.
>>> > > > That is the only case I can think of where we might be underflowing
>>> > > > negative. If you could add some additional debug info on the underflow
>>> > > > WARN_ON case in page_pool_defrag_page that might be useful.
>>> > > > Specifically I would be curious what the actual return value is. I'm
>>> > > > assuming we are only hitting negative 1, but I would want to verify we
>>> > > > aren't seeing something else.
>>> > > I'll try to run some more tests soon. However, I think I found the piece 
>>> > > of code that is incompatible with using pp_frag_count.
>>> > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
>>> > > packet), and it is not split by the hardware, a cfg80211 function 
>>> > > extracts the individual MSDUs into separate skbs. In that case, a 
>>> > > fragment can be shared across multiple skbs, and get_page is used to 
>>> > > increase the refcount.
>>> > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
>>> > > its helper functions).
>>> > 
>>> > I'm not sure if it is problematic or not. Basically it is trading off
>>> > by copying over the frags, calling get_page on each frag, and then
>>> > using dev_kfree_skb to disassemble and release the pp_frag references.
>>> > There should be other paths in the kernel that are doing something
>>> > similar.
>>> > 
>>> > > This code also has a bug where it doesn't set pp_recycle on the newly 
>>> > > allocated skb if the previous one has it, but that's a separate matter 
>>> > > and fixing it doesn't make the crash go away.
>>> > 
>>> > Adding the recycle would cause this bug. So one thing we might be
>>> > seeing is something like that triggering this error. Specifically if
>>> > the page is taken via get_page when assembling the new skb then we
>>> > cannot set the recycle flag in the new skb otherwise it will result in
>>> > the reference undercount we are seeing. What we are doing is shifting
>>> > the references away from the pp_frag_count to the page reference count
>>> > in this case. If we set the pp_recycle flag then it would cause us to
>>> > decrement pp_frag_count instead of the page reference count resulting
>>> > in the underrun.
>>> Couldn't leaving out the pp_recycle flag potentially lead to a case 
>>> where the last user of the page drops it via page_frag_free instead of 
>>> page_pool_return_skb_page? Is that valid?
>> 
>> No. What will happen is that when the pp_frag_count is exhausted the
>> page will be unmapped and evicted from the page pool. When the page is
>> then finally freed it will end up going back to the page allocator
>> instead of page pool.
>> 
>> Basically the idea is that until pp_frag_count reaches 0 there will be
>> at least 1 page reference held.
>> 
>>> > > Is there any way I can make that part of the code work with the current 
>>> > > page pool frag implementation?
>>> > 
>>> > The current code should work. Basically as long as the references are
>>> > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
>>> > into this issue because the pp_frag_count will be dropped when the
>>> > original skb is freed and the page reference count will be decremented
>>> > when the new one is freed.
>>> > 
>>> > For page pool page fragments the main thing to keep in mind is that if
>>> > pp_recycle is set it will update the pp_frag_count and if it is not
>>> > then it will just decrement the page reference count.
>>> What takes care of DMA unmap and other cleanup if the last reference to 
>>> the page is dropped via page_frag_free?
>>> 
>>> - Felix
>> 
>> When the page is freed on the skb w/ pp_recycle set it will unmap the
>> page and evict it from the page pool. Basically in these cases the page
>> goes from the page pool back to the page allocator.
>> 
>> The general idea with this is that if we are using fragments that there
>> will be enough of them floating around that if one or two frags have a
>> temporeary detour through a non-recycling path that hopefully by the
>> time the last fragment is freed the other instances holding the
>> additional page reference will have let them go. If not then the page
>> will go back to the page allocator and it will have to be replaced in
>> the page pool.
> Thanks for the explanation, it makes sense to me now. Unfortunately it
> also means that I have no idea what could cause this issue. I will
> finish my mt76 patch rework which gets rid of the pp vs non-pp
> allocation mix and re-run my tests to provide updated traces.
Here's the updated mt76 page pool support commit:
https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808

And here is the trace that I'm getting with 6.1:
https://nbd.name/p/a16957f2

If you have any debug patch you'd like me to test, please let me know.

- Felix
  
Felix Fietkau Jan. 25, 2023, 8:02 p.m. UTC | #14
On 25.01.23 20:40, Felix Fietkau wrote:
> On 25.01.23 20:10, Felix Fietkau wrote:
>> On 25.01.23 20:02, Alexander H Duyck wrote:
>>> On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
>>>> On 25.01.23 19:26, Alexander H Duyck wrote:
>>>> > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
>>>> > > On 25.01.23 18:11, Alexander H Duyck wrote:
>>>> > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>>>> > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
>>>> > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>>>> > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>>>> > > > > > > > Hi Felix,
>>>> > > > > > > > 
>>>> > > > > > > > ++cc Alexander and Yunsheng.
>>>> > > > > > > > 
>>>> > > > > > > > Thanks for the report
>>>> > > > > > > > 
>>>> > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>>>> > > > > > > > > 
>>>> > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>>>> > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
>>>> > > > > > > > > full-page page_pool allocation.
>>>> > > > > > > > > It appears to me, that handling refcounting in two separate counters
>>>> > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>>>> > > > > > > > > incremented by code dealing with skb fragments directly, and
>>>> > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>>>> > > > > > > > > 
>>>> > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>>>> > > > > > > > > these underflow issues and crashes go away.
>>>> > > > > > > > > 
>>>> > > > > > > > 
>>>> > > > > > > > This has been discussed here [1].  TL;DR changing this to page
>>>> > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
>>>> > > > > > > > figure out why the underflow happens?
>>>> > > > > > > I don't see how the approch taken in my patch would blow up. From what I 
>>>> > > > > > > can tell, it should be fairly close to how refcount is handled in 
>>>> > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
>>>> > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
>>>> > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
>>>> > > > > > > 
>>>> > > > > > > - Felix
>>>> > > > > > > 
>>>> > > > > > 
>>>> > > > > > Do you have the patch available to review as an RFC? From what I am
>>>> > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
>>>> > > > > > I would suspect the issue to be something like starting with a bad
>>>> > > > > > count in terms of the total number of references, or deducing the wrong
>>>> > > > > > amount when you finally free the page assuming you are tracking your
>>>> > > > > > frag count using a non-atomic value in the driver.
>>>> > > > > The driver patches for page pool are here:
>>>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>>>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>>>> > > > > 
>>>> > > > > They are also applied in my mt76 tree at:
>>>> > > > > https://github.com/nbd168/wireless
>>>> > > > > 
>>>> > > > > - Felix
>>>> > > > 
>>>> > > > So one thing I am thinking is that we may be seeing an issue where we
>>>> > > > are somehow getting a mix of frag and non-frag based page pool pages.
>>>> > > > That is the only case I can think of where we might be underflowing
>>>> > > > negative. If you could add some additional debug info on the underflow
>>>> > > > WARN_ON case in page_pool_defrag_page that might be useful.
>>>> > > > Specifically I would be curious what the actual return value is. I'm
>>>> > > > assuming we are only hitting negative 1, but I would want to verify we
>>>> > > > aren't seeing something else.
>>>> > > I'll try to run some more tests soon. However, I think I found the piece 
>>>> > > of code that is incompatible with using pp_frag_count.
>>>> > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
>>>> > > packet), and it is not split by the hardware, a cfg80211 function 
>>>> > > extracts the individual MSDUs into separate skbs. In that case, a 
>>>> > > fragment can be shared across multiple skbs, and get_page is used to 
>>>> > > increase the refcount.
>>>> > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
>>>> > > its helper functions).
>>>> > 
>>>> > I'm not sure if it is problematic or not. Basically it is trading off
>>>> > by copying over the frags, calling get_page on each frag, and then
>>>> > using dev_kfree_skb to disassemble and release the pp_frag references.
>>>> > There should be other paths in the kernel that are doing something
>>>> > similar.
>>>> > 
>>>> > > This code also has a bug where it doesn't set pp_recycle on the newly 
>>>> > > allocated skb if the previous one has it, but that's a separate matter 
>>>> > > and fixing it doesn't make the crash go away.
>>>> > 
>>>> > Adding the recycle would cause this bug. So one thing we might be
>>>> > seeing is something like that triggering this error. Specifically if
>>>> > the page is taken via get_page when assembling the new skb then we
>>>> > cannot set the recycle flag in the new skb otherwise it will result in
>>>> > the reference undercount we are seeing. What we are doing is shifting
>>>> > the references away from the pp_frag_count to the page reference count
>>>> > in this case. If we set the pp_recycle flag then it would cause us to
>>>> > decrement pp_frag_count instead of the page reference count resulting
>>>> > in the underrun.
>>>> Couldn't leaving out the pp_recycle flag potentially lead to a case 
>>>> where the last user of the page drops it via page_frag_free instead of 
>>>> page_pool_return_skb_page? Is that valid?
>>> 
>>> No. What will happen is that when the pp_frag_count is exhausted the
>>> page will be unmapped and evicted from the page pool. When the page is
>>> then finally freed it will end up going back to the page allocator
>>> instead of page pool.
>>> 
>>> Basically the idea is that until pp_frag_count reaches 0 there will be
>>> at least 1 page reference held.
>>> 
>>>> > > Is there any way I can make that part of the code work with the current 
>>>> > > page pool frag implementation?
>>>> > 
>>>> > The current code should work. Basically as long as the references are
>>>> > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
>>>> > into this issue because the pp_frag_count will be dropped when the
>>>> > original skb is freed and the page reference count will be decremented
>>>> > when the new one is freed.
>>>> > 
>>>> > For page pool page fragments the main thing to keep in mind is that if
>>>> > pp_recycle is set it will update the pp_frag_count and if it is not
>>>> > then it will just decrement the page reference count.
>>>> What takes care of DMA unmap and other cleanup if the last reference to 
>>>> the page is dropped via page_frag_free?
>>>> 
>>>> - Felix
>>> 
>>> When the page is freed on the skb w/ pp_recycle set it will unmap the
>>> page and evict it from the page pool. Basically in these cases the page
>>> goes from the page pool back to the page allocator.
>>> 
>>> The general idea with this is that if we are using fragments that there
>>> will be enough of them floating around that if one or two frags have a
>>> temporeary detour through a non-recycling path that hopefully by the
>>> time the last fragment is freed the other instances holding the
>>> additional page reference will have let them go. If not then the page
>>> will go back to the page allocator and it will have to be replaced in
>>> the page pool.
>> Thanks for the explanation, it makes sense to me now. Unfortunately it
>> also means that I have no idea what could cause this issue. I will
>> finish my mt76 patch rework which gets rid of the pp vs non-pp
>> allocation mix and re-run my tests to provide updated traces.
> Here's the updated mt76 page pool support commit:
> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808
> 
> And here is the trace that I'm getting with 6.1:
> https://nbd.name/p/a16957f2
> 
> If you have any debug patch you'd like me to test, please let me know.
To answer your earlier question: When pp_frag_count goes below zero, 
it's at -1 as suspected.

Here are some more completely different traces that I got during other 
test runs. I hope they provide some kind of clue:
https://nbd.name/p/8e46b6eb

- Felix
  
Alexander Duyck Jan. 25, 2023, 10:14 p.m. UTC | #15
On Wed, 2023-01-25 at 20:40 +0100, Felix Fietkau wrote:
> On 25.01.23 20:10, Felix Fietkau wrote:
> > On 25.01.23 20:02, Alexander H Duyck wrote:
> > > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
> > > > On 25.01.23 19:26, Alexander H Duyck wrote:
> > > > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
> > > > > > On 25.01.23 18:11, Alexander H Duyck wrote:
> > > > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
> > > > > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
> > > > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
> > > > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
> > > > > > > > > > > Hi Felix,
> > > > > > > > > > > 
> > > > > > > > > > > ++cc Alexander and Yunsheng.
> > > > > > > > > > > 
> > > > > > > > > > > Thanks for the report
> > > > > > > > > > > 
> > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > > > > > > > > > > > 
> > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
> > > > > > > > > > > > full-page page_pool allocation.
> > > > > > > > > > > > It appears to me, that handling refcounting in two separate counters
> > > > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > > > > > > > > > > incremented by code dealing with skb fragments directly, and
> > > > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
> > > > > > > > > > > > 
> > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > > > > > > > > > > these underflow issues and crashes go away.
> > > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > This has been discussed here [1].  TL;DR changing this to page
> > > > > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
> > > > > > > > > > > figure out why the underflow happens?
> > > > > > > > > > I don't see how the approch taken in my patch would blow up. From what I 
> > > > > > > > > > can tell, it should be fairly close to how refcount is handled in 
> > > > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
> > > > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
> > > > > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
> > > > > > > > > > 
> > > > > > > > > > - Felix
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Do you have the patch available to review as an RFC? From what I am
> > > > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
> > > > > > > > > I would suspect the issue to be something like starting with a bad
> > > > > > > > > count in terms of the total number of references, or deducing the wrong
> > > > > > > > > amount when you finally free the page assuming you are tracking your
> > > > > > > > > frag count using a non-atomic value in the driver.
> > > > > > > > The driver patches for page pool are here:
> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
> > > > > > > > 
> > > > > > > > They are also applied in my mt76 tree at:
> > > > > > > > https://github.com/nbd168/wireless
> > > > > > > > 
> > > > > > > > - Felix
> > > > > > > 
> > > > > > > So one thing I am thinking is that we may be seeing an issue where we
> > > > > > > are somehow getting a mix of frag and non-frag based page pool pages.
> > > > > > > That is the only case I can think of where we might be underflowing
> > > > > > > negative. If you could add some additional debug info on the underflow
> > > > > > > WARN_ON case in page_pool_defrag_page that might be useful.
> > > > > > > Specifically I would be curious what the actual return value is. I'm
> > > > > > > assuming we are only hitting negative 1, but I would want to verify we
> > > > > > > aren't seeing something else.
> > > > > > I'll try to run some more tests soon. However, I think I found the piece 
> > > > > > of code that is incompatible with using pp_frag_count.
> > > > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
> > > > > > packet), and it is not split by the hardware, a cfg80211 function 
> > > > > > extracts the individual MSDUs into separate skbs. In that case, a 
> > > > > > fragment can be shared across multiple skbs, and get_page is used to 
> > > > > > increase the refcount.
> > > > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
> > > > > > its helper functions).
> > > > > 
> > > > > I'm not sure if it is problematic or not. Basically it is trading off
> > > > > by copying over the frags, calling get_page on each frag, and then
> > > > > using dev_kfree_skb to disassemble and release the pp_frag references.
> > > > > There should be other paths in the kernel that are doing something
> > > > > similar.
> > > > > 
> > > > > > This code also has a bug where it doesn't set pp_recycle on the newly 
> > > > > > allocated skb if the previous one has it, but that's a separate matter 
> > > > > > and fixing it doesn't make the crash go away.
> > > > > 
> > > > > Adding the recycle would cause this bug. So one thing we might be
> > > > > seeing is something like that triggering this error. Specifically if
> > > > > the page is taken via get_page when assembling the new skb then we
> > > > > cannot set the recycle flag in the new skb otherwise it will result in
> > > > > the reference undercount we are seeing. What we are doing is shifting
> > > > > the references away from the pp_frag_count to the page reference count
> > > > > in this case. If we set the pp_recycle flag then it would cause us to
> > > > > decrement pp_frag_count instead of the page reference count resulting
> > > > > in the underrun.
> > > > Couldn't leaving out the pp_recycle flag potentially lead to a case 
> > > > where the last user of the page drops it via page_frag_free instead of 
> > > > page_pool_return_skb_page? Is that valid?
> > > 
> > > No. What will happen is that when the pp_frag_count is exhausted the
> > > page will be unmapped and evicted from the page pool. When the page is
> > > then finally freed it will end up going back to the page allocator
> > > instead of page pool.
> > > 
> > > Basically the idea is that until pp_frag_count reaches 0 there will be
> > > at least 1 page reference held.
> > > 
> > > > > > Is there any way I can make that part of the code work with the current 
> > > > > > page pool frag implementation?
> > > > > 
> > > > > The current code should work. Basically as long as the references are
> > > > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
> > > > > into this issue because the pp_frag_count will be dropped when the
> > > > > original skb is freed and the page reference count will be decremented
> > > > > when the new one is freed.
> > > > > 
> > > > > For page pool page fragments the main thing to keep in mind is that if
> > > > > pp_recycle is set it will update the pp_frag_count and if it is not
> > > > > then it will just decrement the page reference count.
> > > > What takes care of DMA unmap and other cleanup if the last reference to 
> > > > the page is dropped via page_frag_free?
> > > > 
> > > > - Felix
> > > 
> > > When the page is freed on the skb w/ pp_recycle set it will unmap the
> > > page and evict it from the page pool. Basically in these cases the page
> > > goes from the page pool back to the page allocator.
> > > 
> > > The general idea with this is that if we are using fragments that there
> > > will be enough of them floating around that if one or two frags have a
> > > temporeary detour through a non-recycling path that hopefully by the
> > > time the last fragment is freed the other instances holding the
> > > additional page reference will have let them go. If not then the page
> > > will go back to the page allocator and it will have to be replaced in
> > > the page pool.
> > Thanks for the explanation, it makes sense to me now. Unfortunately it
> > also means that I have no idea what could cause this issue. I will
> > finish my mt76 patch rework which gets rid of the pp vs non-pp
> > allocation mix and re-run my tests to provide updated traces.
> Here's the updated mt76 page pool support commit:
> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808

Yeah, so I don't see anything wrong with the patch in terms of page
pool.

> And here is the trace that I'm getting with 6.1:
> https://nbd.name/p/a16957f2
> 
> If you have any debug patch you'd like me to test, please let me know.
> 
> - Felix

So looking at the traces I am assuming what we are seeing is the
deferred freeing from the TCP Rx path since I don't see a driver
anywhere between net_rx_action and napi_consume skb. So it seems like
the packets are likely making it all the way up the network stack.

Is this the first wireless driver to add support for page pool? I'm
thinking we must be seeing something in the wireless path that is
causing an issue such as the function you called out earlier but I
can't see anything obvious.

One thing we need to be on the lookout for is cloned skbs. When an skb
is cloned the pp_recycle gets copied over. In that case the reference
is moved over to the skb dataref count. What comes to mind is something
like commit 1effe8ca4e34c ("skbuff: fix coalescing for page_pool
fragment recycling").
  
Felix Fietkau Jan. 26, 2023, 6:12 a.m. UTC | #16
On 25.01.23 23:14, Alexander H Duyck wrote:
> On Wed, 2023-01-25 at 20:40 +0100, Felix Fietkau wrote:
>> On 25.01.23 20:10, Felix Fietkau wrote:
>> > On 25.01.23 20:02, Alexander H Duyck wrote:
>> > > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
>> > > > On 25.01.23 19:26, Alexander H Duyck wrote:
>> > > > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
>> > > > > > On 25.01.23 18:11, Alexander H Duyck wrote:
>> > > > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>> > > > > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
>> > > > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>> > > > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>> > > > > > > > > > > Hi Felix,
>> > > > > > > > > > > 
>> > > > > > > > > > > ++cc Alexander and Yunsheng.
>> > > > > > > > > > > 
>> > > > > > > > > > > Thanks for the report
>> > > > > > > > > > > 
>> > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>> > > > > > > > > > > > 
>> > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>> > > > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
>> > > > > > > > > > > > full-page page_pool allocation.
>> > > > > > > > > > > > It appears to me, that handling refcounting in two separate counters
>> > > > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>> > > > > > > > > > > > incremented by code dealing with skb fragments directly, and
>> > > > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>> > > > > > > > > > > > 
>> > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> > > > > > > > > > > > these underflow issues and crashes go away.
>> > > > > > > > > > > > 
>> > > > > > > > > > > 
>> > > > > > > > > > > This has been discussed here [1].  TL;DR changing this to page
>> > > > > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
>> > > > > > > > > > > figure out why the underflow happens?
>> > > > > > > > > > I don't see how the approch taken in my patch would blow up. From what I 
>> > > > > > > > > > can tell, it should be fairly close to how refcount is handled in 
>> > > > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
>> > > > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
>> > > > > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
>> > > > > > > > > > 
>> > > > > > > > > > - Felix
>> > > > > > > > > > 
>> > > > > > > > > 
>> > > > > > > > > Do you have the patch available to review as an RFC? From what I am
>> > > > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
>> > > > > > > > > I would suspect the issue to be something like starting with a bad
>> > > > > > > > > count in terms of the total number of references, or deducing the wrong
>> > > > > > > > > amount when you finally free the page assuming you are tracking your
>> > > > > > > > > frag count using a non-atomic value in the driver.
>> > > > > > > > The driver patches for page pool are here:
>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>> > > > > > > > 
>> > > > > > > > They are also applied in my mt76 tree at:
>> > > > > > > > https://github.com/nbd168/wireless
>> > > > > > > > 
>> > > > > > > > - Felix
>> > > > > > > 
>> > > > > > > So one thing I am thinking is that we may be seeing an issue where we
>> > > > > > > are somehow getting a mix of frag and non-frag based page pool pages.
>> > > > > > > That is the only case I can think of where we might be underflowing
>> > > > > > > negative. If you could add some additional debug info on the underflow
>> > > > > > > WARN_ON case in page_pool_defrag_page that might be useful.
>> > > > > > > Specifically I would be curious what the actual return value is. I'm
>> > > > > > > assuming we are only hitting negative 1, but I would want to verify we
>> > > > > > > aren't seeing something else.
>> > > > > > I'll try to run some more tests soon. However, I think I found the piece 
>> > > > > > of code that is incompatible with using pp_frag_count.
>> > > > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
>> > > > > > packet), and it is not split by the hardware, a cfg80211 function 
>> > > > > > extracts the individual MSDUs into separate skbs. In that case, a 
>> > > > > > fragment can be shared across multiple skbs, and get_page is used to 
>> > > > > > increase the refcount.
>> > > > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
>> > > > > > its helper functions).
>> > > > > 
>> > > > > I'm not sure if it is problematic or not. Basically it is trading off
>> > > > > by copying over the frags, calling get_page on each frag, and then
>> > > > > using dev_kfree_skb to disassemble and release the pp_frag references.
>> > > > > There should be other paths in the kernel that are doing something
>> > > > > similar.
>> > > > > 
>> > > > > > This code also has a bug where it doesn't set pp_recycle on the newly 
>> > > > > > allocated skb if the previous one has it, but that's a separate matter 
>> > > > > > and fixing it doesn't make the crash go away.
>> > > > > 
>> > > > > Adding the recycle would cause this bug. So one thing we might be
>> > > > > seeing is something like that triggering this error. Specifically if
>> > > > > the page is taken via get_page when assembling the new skb then we
>> > > > > cannot set the recycle flag in the new skb otherwise it will result in
>> > > > > the reference undercount we are seeing. What we are doing is shifting
>> > > > > the references away from the pp_frag_count to the page reference count
>> > > > > in this case. If we set the pp_recycle flag then it would cause us to
>> > > > > decrement pp_frag_count instead of the page reference count resulting
>> > > > > in the underrun.
>> > > > Couldn't leaving out the pp_recycle flag potentially lead to a case 
>> > > > where the last user of the page drops it via page_frag_free instead of 
>> > > > page_pool_return_skb_page? Is that valid?
>> > > 
>> > > No. What will happen is that when the pp_frag_count is exhausted the
>> > > page will be unmapped and evicted from the page pool. When the page is
>> > > then finally freed it will end up going back to the page allocator
>> > > instead of page pool.
>> > > 
>> > > Basically the idea is that until pp_frag_count reaches 0 there will be
>> > > at least 1 page reference held.
>> > > 
>> > > > > > Is there any way I can make that part of the code work with the current 
>> > > > > > page pool frag implementation?
>> > > > > 
>> > > > > The current code should work. Basically as long as the references are
>> > > > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
>> > > > > into this issue because the pp_frag_count will be dropped when the
>> > > > > original skb is freed and the page reference count will be decremented
>> > > > > when the new one is freed.
>> > > > > 
>> > > > > For page pool page fragments the main thing to keep in mind is that if
>> > > > > pp_recycle is set it will update the pp_frag_count and if it is not
>> > > > > then it will just decrement the page reference count.
>> > > > What takes care of DMA unmap and other cleanup if the last reference to 
>> > > > the page is dropped via page_frag_free?
>> > > > 
>> > > > - Felix
>> > > 
>> > > When the page is freed on the skb w/ pp_recycle set it will unmap the
>> > > page and evict it from the page pool. Basically in these cases the page
>> > > goes from the page pool back to the page allocator.
>> > > 
>> > > The general idea with this is that if we are using fragments that there
>> > > will be enough of them floating around that if one or two frags have a
>> > > temporeary detour through a non-recycling path that hopefully by the
>> > > time the last fragment is freed the other instances holding the
>> > > additional page reference will have let them go. If not then the page
>> > > will go back to the page allocator and it will have to be replaced in
>> > > the page pool.
>> > Thanks for the explanation, it makes sense to me now. Unfortunately it
>> > also means that I have no idea what could cause this issue. I will
>> > finish my mt76 patch rework which gets rid of the pp vs non-pp
>> > allocation mix and re-run my tests to provide updated traces.
>> Here's the updated mt76 page pool support commit:
>> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808
> 
> Yeah, so I don't see anything wrong with the patch in terms of page
> pool.
> 
>> And here is the trace that I'm getting with 6.1:
>> https://nbd.name/p/a16957f2
>> 
>> If you have any debug patch you'd like me to test, please let me know.
>> 
>> - Felix
> 
> So looking at the traces I am assuming what we are seeing is the
> deferred freeing from the TCP Rx path since I don't see a driver
> anywhere between net_rx_action and napi_consume skb. So it seems like
> the packets are likely making it all the way up the network stack.
> 
> Is this the first wireless driver to add support for page pool? I'm
> thinking we must be seeing something in the wireless path that is
> causing an issue such as the function you called out earlier but I
> can't see anything obvious.
Yes, it's the first driver with page pool support.

> One thing we need to be on the lookout for is cloned skbs. When an skb
> is cloned the pp_recycle gets copied over. In that case the reference
> is moved over to the skb dataref count. What comes to mind is something
> like commit 1effe8ca4e34c ("skbuff: fix coalescing for page_pool
> fragment recycling").
I suspect that the crash might be related to a bad interaction between 
the page reuse in A-MSDU rx + skb coalescing on TCP rx.
If I change the A-MSDU code to copy data instead of reusing fragments, 
it doesn't crash anymore.
I believe the issue must be specific to that codepath, since most 
received and processed packets are either not A-MSDU or A-MSDU decap has 
already been performed by the hardware.
If I change my test to use 3 client mode interfaces instead of 4, the 
hardware is able to offload all A-MSDU rx processing and I don't see any 
crashes anymore.

Could you please take another look at ieee80211_amsdu_to_8023s to see if 
there's anything in there that could cause these issues?

- Felix
  
Felix Fietkau Jan. 26, 2023, 9:14 a.m. UTC | #17
On 26.01.23 07:12, Felix Fietkau wrote:
> On 25.01.23 23:14, Alexander H Duyck wrote:
>> On Wed, 2023-01-25 at 20:40 +0100, Felix Fietkau wrote:
>>> On 25.01.23 20:10, Felix Fietkau wrote:
>>> > On 25.01.23 20:02, Alexander H Duyck wrote:
>>> > > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
>>> > > > On 25.01.23 19:26, Alexander H Duyck wrote:
>>> > > > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
>>> > > > > > On 25.01.23 18:11, Alexander H Duyck wrote:
>>> > > > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>>> > > > > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
>>> > > > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>>> > > > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>>> > > > > > > > > > > Hi Felix,
>>> > > > > > > > > > > 
>>> > > > > > > > > > > ++cc Alexander and Yunsheng.
>>> > > > > > > > > > > 
>>> > > > > > > > > > > Thanks for the report
>>> > > > > > > > > > > 
>>> > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>>> > > > > > > > > > > > 
>>> > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>>> > > > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
>>> > > > > > > > > > > > full-page page_pool allocation.
>>> > > > > > > > > > > > It appears to me, that handling refcounting in two separate counters
>>> > > > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>>> > > > > > > > > > > > incremented by code dealing with skb fragments directly, and
>>> > > > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>>> > > > > > > > > > > > 
>>> > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>>> > > > > > > > > > > > these underflow issues and crashes go away.
>>> > > > > > > > > > > > 
>>> > > > > > > > > > > 
>>> > > > > > > > > > > This has been discussed here [1].  TL;DR changing this to page
>>> > > > > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
>>> > > > > > > > > > > figure out why the underflow happens?
>>> > > > > > > > > > I don't see how the approch taken in my patch would blow up. From what I 
>>> > > > > > > > > > can tell, it should be fairly close to how refcount is handled in 
>>> > > > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
>>> > > > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
>>> > > > > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
>>> > > > > > > > > > 
>>> > > > > > > > > > - Felix
>>> > > > > > > > > > 
>>> > > > > > > > > 
>>> > > > > > > > > Do you have the patch available to review as an RFC? From what I am
>>> > > > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
>>> > > > > > > > > I would suspect the issue to be something like starting with a bad
>>> > > > > > > > > count in terms of the total number of references, or deducing the wrong
>>> > > > > > > > > amount when you finally free the page assuming you are tracking your
>>> > > > > > > > > frag count using a non-atomic value in the driver.
>>> > > > > > > > The driver patches for page pool are here:
>>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>>> > > > > > > > 
>>> > > > > > > > They are also applied in my mt76 tree at:
>>> > > > > > > > https://github.com/nbd168/wireless
>>> > > > > > > > 
>>> > > > > > > > - Felix
>>> > > > > > > 
>>> > > > > > > So one thing I am thinking is that we may be seeing an issue where we
>>> > > > > > > are somehow getting a mix of frag and non-frag based page pool pages.
>>> > > > > > > That is the only case I can think of where we might be underflowing
>>> > > > > > > negative. If you could add some additional debug info on the underflow
>>> > > > > > > WARN_ON case in page_pool_defrag_page that might be useful.
>>> > > > > > > Specifically I would be curious what the actual return value is. I'm
>>> > > > > > > assuming we are only hitting negative 1, but I would want to verify we
>>> > > > > > > aren't seeing something else.
>>> > > > > > I'll try to run some more tests soon. However, I think I found the piece 
>>> > > > > > of code that is incompatible with using pp_frag_count.
>>> > > > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
>>> > > > > > packet), and it is not split by the hardware, a cfg80211 function 
>>> > > > > > extracts the individual MSDUs into separate skbs. In that case, a 
>>> > > > > > fragment can be shared across multiple skbs, and get_page is used to 
>>> > > > > > increase the refcount.
>>> > > > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
>>> > > > > > its helper functions).
>>> > > > > 
>>> > > > > I'm not sure if it is problematic or not. Basically it is trading off
>>> > > > > by copying over the frags, calling get_page on each frag, and then
>>> > > > > using dev_kfree_skb to disassemble and release the pp_frag references.
>>> > > > > There should be other paths in the kernel that are doing something
>>> > > > > similar.
>>> > > > > 
>>> > > > > > This code also has a bug where it doesn't set pp_recycle on the newly 
>>> > > > > > allocated skb if the previous one has it, but that's a separate matter 
>>> > > > > > and fixing it doesn't make the crash go away.
>>> > > > > 
>>> > > > > Adding the recycle would cause this bug. So one thing we might be
>>> > > > > seeing is something like that triggering this error. Specifically if
>>> > > > > the page is taken via get_page when assembling the new skb then we
>>> > > > > cannot set the recycle flag in the new skb otherwise it will result in
>>> > > > > the reference undercount we are seeing. What we are doing is shifting
>>> > > > > the references away from the pp_frag_count to the page reference count
>>> > > > > in this case. If we set the pp_recycle flag then it would cause us to
>>> > > > > decrement pp_frag_count instead of the page reference count resulting
>>> > > > > in the underrun.
>>> > > > Couldn't leaving out the pp_recycle flag potentially lead to a case 
>>> > > > where the last user of the page drops it via page_frag_free instead of 
>>> > > > page_pool_return_skb_page? Is that valid?
>>> > > 
>>> > > No. What will happen is that when the pp_frag_count is exhausted the
>>> > > page will be unmapped and evicted from the page pool. When the page is
>>> > > then finally freed it will end up going back to the page allocator
>>> > > instead of page pool.
>>> > > 
>>> > > Basically the idea is that until pp_frag_count reaches 0 there will be
>>> > > at least 1 page reference held.
>>> > > 
>>> > > > > > Is there any way I can make that part of the code work with the current 
>>> > > > > > page pool frag implementation?
>>> > > > > 
>>> > > > > The current code should work. Basically as long as the references are
>>> > > > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
>>> > > > > into this issue because the pp_frag_count will be dropped when the
>>> > > > > original skb is freed and the page reference count will be decremented
>>> > > > > when the new one is freed.
>>> > > > > 
>>> > > > > For page pool page fragments the main thing to keep in mind is that if
>>> > > > > pp_recycle is set it will update the pp_frag_count and if it is not
>>> > > > > then it will just decrement the page reference count.
>>> > > > What takes care of DMA unmap and other cleanup if the last reference to 
>>> > > > the page is dropped via page_frag_free?
>>> > > > 
>>> > > > - Felix
>>> > > 
>>> > > When the page is freed on the skb w/ pp_recycle set it will unmap the
>>> > > page and evict it from the page pool. Basically in these cases the page
>>> > > goes from the page pool back to the page allocator.
>>> > > 
>>> > > The general idea with this is that if we are using fragments that there
>>> > > will be enough of them floating around that if one or two frags have a
>>> > > temporeary detour through a non-recycling path that hopefully by the
>>> > > time the last fragment is freed the other instances holding the
>>> > > additional page reference will have let them go. If not then the page
>>> > > will go back to the page allocator and it will have to be replaced in
>>> > > the page pool.
>>> > Thanks for the explanation, it makes sense to me now. Unfortunately it
>>> > also means that I have no idea what could cause this issue. I will
>>> > finish my mt76 patch rework which gets rid of the pp vs non-pp
>>> > allocation mix and re-run my tests to provide updated traces.
>>> Here's the updated mt76 page pool support commit:
>>> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808
>> 
>> Yeah, so I don't see anything wrong with the patch in terms of page
>> pool.
>> 
>>> And here is the trace that I'm getting with 6.1:
>>> https://nbd.name/p/a16957f2
>>> 
>>> If you have any debug patch you'd like me to test, please let me know.
>>> 
>>> - Felix
>> 
>> So looking at the traces I am assuming what we are seeing is the
>> deferred freeing from the TCP Rx path since I don't see a driver
>> anywhere between net_rx_action and napi_consume skb. So it seems like
>> the packets are likely making it all the way up the network stack.
>> 
>> Is this the first wireless driver to add support for page pool? I'm
>> thinking we must be seeing something in the wireless path that is
>> causing an issue such as the function you called out earlier but I
>> can't see anything obvious.
> Yes, it's the first driver with page pool support.
> 
>> One thing we need to be on the lookout for is cloned skbs. When an skb
>> is cloned the pp_recycle gets copied over. In that case the reference
>> is moved over to the skb dataref count. What comes to mind is something
>> like commit 1effe8ca4e34c ("skbuff: fix coalescing for page_pool
>> fragment recycling").
> I suspect that the crash might be related to a bad interaction between
> the page reuse in A-MSDU rx + skb coalescing on TCP rx.
> If I change the A-MSDU code to copy data instead of reusing fragments,
> it doesn't crash anymore.
> I believe the issue must be specific to that codepath, since most
> received and processed packets are either not A-MSDU or A-MSDU decap has
> already been performed by the hardware.
> If I change my test to use 3 client mode interfaces instead of 4, the
> hardware is able to offload all A-MSDU rx processing and I don't see any
> crashes anymore.
> 
> Could you please take another look at ieee80211_amsdu_to_8023s to see if
> there's anything in there that could cause these issues?
Here are clues from a few more tests that I ran:
- preventing the reuse of the last skb in ieee80211_amsdu_to_8023s does 
not prevent the crashes, so the issue is indeed related to taking page 
references and putting the pages in skb fragments.
- if I return false in skb_try_coalesce, it still crashes:
https://nbd.name/p/18cac078

- Felix
  
Ilias Apalodimas Jan. 26, 2023, 10:31 a.m. UTC | #18
Hi Alexander,

Sorry for being late to the party,  was overloaded...

On Tue, Jan 24, 2023 at 07:57:35AM -0800, Alexander H Duyck wrote:
> On Tue, 2023-01-24 at 16:11 +0200, Ilias Apalodimas wrote:
> > Hi Felix,
> >
> > ++cc Alexander and Yunsheng.
> >
> > Thanks for the report
> >
> > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > >
> > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > to reliably trigger page refcount underflow issues, which did not occur with
> > > full-page page_pool allocation.
> > > It appears to me, that handling refcounting in two separate counters
> > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > incremented by code dealing with skb fragments directly, and
> > > page_pool_return_skb_page is called multiple times for the same fragment.
> > >
> > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > these underflow issues and crashes go away.
> > >
> >
> > This has been discussed here [1].  TL;DR changing this to page
> > refcount might blow up in other colorful ways.  Can we look closer and
> > figure out why the underflow happens?
> >
> > [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/
> >
> > Thanks
> > /Ilias
> >
> >
>
> The logic should be safe in terms of the page pool itself as it should
> be holding one reference to the page while the pp_frag_count is non-
> zero. That one reference is what keeps the two halfs in sync as the
> page shouldn't be able to be freed until we exhaust the pp_frag_count.

Do you remember why we decided to go with the fragment counter instead of
page references?

>
> To have an underflow there are two possible scenarios. One is that
> either put_page or free_page is being called somewhere that the
> page_pool freeing functions should be used.

Wouldn't that affect the non fragmented path as well? IOW the driver that
works with a full page would crash as well.

> The other possibility is
> that a pp_frag_count reference was taken somewhere a page reference
> should have.
>
> Do we have a backtrace for the spots that are showing this underrun? If
> nothing else we may want to look at tracking down the spots that are
> freeing the page pool pages via put_page or free_page to determine what
> paths these pages are taking.

Thanks
/Ilias
  
Ilias Apalodimas Jan. 26, 2023, 10:32 a.m. UTC | #19
On Tue, Jan 24, 2023 at 06:22:54PM +0100, Felix Fietkau wrote:
> On 24.01.23 15:11, Ilias Apalodimas wrote:
> > Hi Felix,
> >
> > ++cc Alexander and Yunsheng.
> >
> > Thanks for the report
> >
> > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > >
> > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > to reliably trigger page refcount underflow issues, which did not occur with
> > > full-page page_pool allocation.
> > > It appears to me, that handling refcounting in two separate counters
> > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > incremented by code dealing with skb fragments directly, and
> > > page_pool_return_skb_page is called multiple times for the same fragment.
> > >
> > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > these underflow issues and crashes go away.
> > >
> >
> > This has been discussed here [1].  TL;DR changing this to page
> > refcount might blow up in other colorful ways.  Can we look closer and
> > figure out why the underflow happens?
> I don't see how the approch taken in my patch would blow up. From what I can
> tell, it should be fairly close to how refcount is handled in
> page_frag_alloc. The main improvement it adds is to prevent it from blowing
> up if pool-allocated fragments get shared across multiple skbs with
> corresponding get_page and page_pool_return_skb_page calls.
>
> - Felix
>

Yes sorry for the noise, that patch I referred to was doing a completely
different thing, elevating the page refcnt to BIAS_MAX from the start

Thanks
/Ilias
  
Alexander Duyck Jan. 26, 2023, 3:41 p.m. UTC | #20
On Thu, Jan 26, 2023 at 2:32 AM Ilias Apalodimas
<ilias.apalodimas@linaro.org> wrote:
>
> Hi Alexander,
>
> Sorry for being late to the party,  was overloaded...
>
> On Tue, Jan 24, 2023 at 07:57:35AM -0800, Alexander H Duyck wrote:
> > On Tue, 2023-01-24 at 16:11 +0200, Ilias Apalodimas wrote:
> > > Hi Felix,
> > >
> > > ++cc Alexander and Yunsheng.
> > >
> > > Thanks for the report
> > >
> > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > > >
> > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > > to reliably trigger page refcount underflow issues, which did not occur with
> > > > full-page page_pool allocation.
> > > > It appears to me, that handling refcounting in two separate counters
> > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > > incremented by code dealing with skb fragments directly, and
> > > > page_pool_return_skb_page is called multiple times for the same fragment.
> > > >
> > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > > these underflow issues and crashes go away.
> > > >
> > >
> > > This has been discussed here [1].  TL;DR changing this to page
> > > refcount might blow up in other colorful ways.  Can we look closer and
> > > figure out why the underflow happens?
> > >
> > > [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/
> > >
> > > Thanks
> > > /Ilias
> > >
> > >
> >
> > The logic should be safe in terms of the page pool itself as it should
> > be holding one reference to the page while the pp_frag_count is non-
> > zero. That one reference is what keeps the two halfs in sync as the
> > page shouldn't be able to be freed until we exhaust the pp_frag_count.
>
> Do you remember why we decided to go with the fragment counter instead of
> page references?

The issue has to do with when to destroy the mappings. Basically with
the fragment counter we destroy the mappings and remove the page from
the pool when the count hits 0. The reference count is really used for
the page allocator to do its tracking. If we end up trying to merge
the two the problem becomes one of lifetimes as we wouldn't know when
to destroy the DMA mappings as they would have to live the full life
of the page.

> >
> > To have an underflow there are two possible scenarios. One is that
> > either put_page or free_page is being called somewhere that the
> > page_pool freeing functions should be used.
>
> Wouldn't that affect the non fragmented path as well? IOW the driver that
> works with a full page would crash as well.

The problem is the non-fragmented path doesn't get as noisy. Also
there aren't currently any wireless drivers making use of the page
pool, or at least that is my understanding. I'm suspecting something
like the issue we saw in 1effe8ca4e34c ("skbuff: fix coalescing for
page_pool fragment recycling"). We likely have some corner case where
we should be taking a page reference and clearing a pp_recycle flag.
  
Ilias Apalodimas Jan. 26, 2023, 4:05 p.m. UTC | #21
On Thu, Jan 26, 2023 at 07:41:15AM -0800, Alexander Duyck wrote:
> On Thu, Jan 26, 2023 at 2:32 AM Ilias Apalodimas
> <ilias.apalodimas@linaro.org> wrote:
> >
> > Hi Alexander,
> >
> > Sorry for being late to the party,  was overloaded...
> >
> > On Tue, Jan 24, 2023 at 07:57:35AM -0800, Alexander H Duyck wrote:
> > > On Tue, 2023-01-24 at 16:11 +0200, Ilias Apalodimas wrote:
> > > > Hi Felix,
> > > >
> > > > ++cc Alexander and Yunsheng.
> > > >
> > > > Thanks for the report
> > > >
> > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > > > >
> > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > > > to reliably trigger page refcount underflow issues, which did not occur with
> > > > > full-page page_pool allocation.
> > > > > It appears to me, that handling refcounting in two separate counters
> > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > > > incremented by code dealing with skb fragments directly, and
> > > > > page_pool_return_skb_page is called multiple times for the same fragment.
> > > > >
> > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > > > these underflow issues and crashes go away.
> > > > >
> > > >
> > > > This has been discussed here [1].  TL;DR changing this to page
> > > > refcount might blow up in other colorful ways.  Can we look closer and
> > > > figure out why the underflow happens?
> > > >
> > > > [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/
> > > >
> > > > Thanks
> > > > /Ilias
> > > >
> > > >
> > >
> > > The logic should be safe in terms of the page pool itself as it should
> > > be holding one reference to the page while the pp_frag_count is non-
> > > zero. That one reference is what keeps the two halfs in sync as the
> > > page shouldn't be able to be freed until we exhaust the pp_frag_count.
> >
> > Do you remember why we decided to go with the fragment counter instead of
> > page references?
>
> The issue has to do with when to destroy the mappings. Basically with
> the fragment counter we destroy the mappings and remove the page from
> the pool when the count hits 0. The reference count is really used for
> the page allocator to do its tracking. If we end up trying to merge
> the two the problem becomes one of lifetimes as we wouldn't know when
> to destroy the DMA mappings as they would have to live the full life
> of the page.

Ah yes thanks! We need that on a comment somewhere,  I keep forgetting...
Basically the pp_frag_count is our number of outstanding dma mappings.

>
> > >
> > > To have an underflow there are two possible scenarios. One is that
> > > either put_page or free_page is being called somewhere that the
> > > page_pool freeing functions should be used.
> >
> > Wouldn't that affect the non fragmented path as well? IOW the driver that
> > works with a full page would crash as well.
>
> The problem is the non-fragmented path doesn't get as noisy. Also
> there aren't currently any wireless drivers making use of the page
> pool, or at least that is my understanding. I'm suspecting something
> like the issue we saw in 1effe8ca4e34c ("skbuff: fix coalescing for
> page_pool fragment recycling"). We likely have some corner case where
> we should be taking a page reference and clearing a pp_recycle flag.

Yea, same thinking here. I'll have another closer look tomorrow, but
looking at the wireless internals what happens is
1. They alloc a fragment
2. They create a new SKB, without the recycle bit and refer to the existing
fragments.  Since the recyle bit is off *that* skb will never try to
decrease the frag counter.  Instead it bumps the page refcnt which should be
properly decremented one that SKB is freed. I guess somehow an SKB ends up with
the recycle bit set, when it shouldn't.

Regards
/Ilias
  
Alexander Duyck Jan. 26, 2023, 4:08 p.m. UTC | #22
On Thu, Jan 26, 2023 at 1:15 AM Felix Fietkau <nbd@nbd.name> wrote:
>
> On 26.01.23 07:12, Felix Fietkau wrote:
> > On 25.01.23 23:14, Alexander H Duyck wrote:
> >> On Wed, 2023-01-25 at 20:40 +0100, Felix Fietkau wrote:
> >>> On 25.01.23 20:10, Felix Fietkau wrote:
> >>> > On 25.01.23 20:02, Alexander H Duyck wrote:
> >>> > > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
> >>> > > > On 25.01.23 19:26, Alexander H Duyck wrote:
> >>> > > > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
> >>> > > > > > On 25.01.23 18:11, Alexander H Duyck wrote:
> >>> > > > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
> >>> > > > > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
> >>> > > > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
> >>> > > > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
> >>> > > > > > > > > > > Hi Felix,
> >>> > > > > > > > > > >
> >>> > > > > > > > > > > ++cc Alexander and Yunsheng.
> >>> > > > > > > > > > >
> >>> > > > > > > > > > > Thanks for the report
> >>> > > > > > > > > > >
> >>> > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> >>> > > > > > > > > > > >
> >>> > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> >>> > > > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
> >>> > > > > > > > > > > > full-page page_pool allocation.
> >>> > > > > > > > > > > > It appears to me, that handling refcounting in two separate counters
> >>> > > > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> >>> > > > > > > > > > > > incremented by code dealing with skb fragments directly, and
> >>> > > > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
> >>> > > > > > > > > > > >
> >>> > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> >>> > > > > > > > > > > > these underflow issues and crashes go away.
> >>> > > > > > > > > > > >
> >>> > > > > > > > > > >
> >>> > > > > > > > > > > This has been discussed here [1].  TL;DR changing this to page
> >>> > > > > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
> >>> > > > > > > > > > > figure out why the underflow happens?
> >>> > > > > > > > > > I don't see how the approch taken in my patch would blow up. From what I
> >>> > > > > > > > > > can tell, it should be fairly close to how refcount is handled in
> >>> > > > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from
> >>> > > > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs
> >>> > > > > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
> >>> > > > > > > > > >
> >>> > > > > > > > > > - Felix
> >>> > > > > > > > > >
> >>> > > > > > > > >
> >>> > > > > > > > > Do you have the patch available to review as an RFC? From what I am
> >>> > > > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
> >>> > > > > > > > > I would suspect the issue to be something like starting with a bad
> >>> > > > > > > > > count in terms of the total number of references, or deducing the wrong
> >>> > > > > > > > > amount when you finally free the page assuming you are tracking your
> >>> > > > > > > > > frag count using a non-atomic value in the driver.
> >>> > > > > > > > The driver patches for page pool are here:
> >>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
> >>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
> >>> > > > > > > >
> >>> > > > > > > > They are also applied in my mt76 tree at:
> >>> > > > > > > > https://github.com/nbd168/wireless
> >>> > > > > > > >
> >>> > > > > > > > - Felix
> >>> > > > > > >
> >>> > > > > > > So one thing I am thinking is that we may be seeing an issue where we
> >>> > > > > > > are somehow getting a mix of frag and non-frag based page pool pages.
> >>> > > > > > > That is the only case I can think of where we might be underflowing
> >>> > > > > > > negative. If you could add some additional debug info on the underflow
> >>> > > > > > > WARN_ON case in page_pool_defrag_page that might be useful.
> >>> > > > > > > Specifically I would be curious what the actual return value is. I'm
> >>> > > > > > > assuming we are only hitting negative 1, but I would want to verify we
> >>> > > > > > > aren't seeing something else.
> >>> > > > > > I'll try to run some more tests soon. However, I think I found the piece
> >>> > > > > > of code that is incompatible with using pp_frag_count.
> >>> > > > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11
> >>> > > > > > packet), and it is not split by the hardware, a cfg80211 function
> >>> > > > > > extracts the individual MSDUs into separate skbs. In that case, a
> >>> > > > > > fragment can be shared across multiple skbs, and get_page is used to
> >>> > > > > > increase the refcount.
> >>> > > > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and
> >>> > > > > > its helper functions).
> >>> > > > >
> >>> > > > > I'm not sure if it is problematic or not. Basically it is trading off
> >>> > > > > by copying over the frags, calling get_page on each frag, and then
> >>> > > > > using dev_kfree_skb to disassemble and release the pp_frag references.
> >>> > > > > There should be other paths in the kernel that are doing something
> >>> > > > > similar.
> >>> > > > >
> >>> > > > > > This code also has a bug where it doesn't set pp_recycle on the newly
> >>> > > > > > allocated skb if the previous one has it, but that's a separate matter
> >>> > > > > > and fixing it doesn't make the crash go away.
> >>> > > > >
> >>> > > > > Adding the recycle would cause this bug. So one thing we might be
> >>> > > > > seeing is something like that triggering this error. Specifically if
> >>> > > > > the page is taken via get_page when assembling the new skb then we
> >>> > > > > cannot set the recycle flag in the new skb otherwise it will result in
> >>> > > > > the reference undercount we are seeing. What we are doing is shifting
> >>> > > > > the references away from the pp_frag_count to the page reference count
> >>> > > > > in this case. If we set the pp_recycle flag then it would cause us to
> >>> > > > > decrement pp_frag_count instead of the page reference count resulting
> >>> > > > > in the underrun.
> >>> > > > Couldn't leaving out the pp_recycle flag potentially lead to a case
> >>> > > > where the last user of the page drops it via page_frag_free instead of
> >>> > > > page_pool_return_skb_page? Is that valid?
> >>> > >
> >>> > > No. What will happen is that when the pp_frag_count is exhausted the
> >>> > > page will be unmapped and evicted from the page pool. When the page is
> >>> > > then finally freed it will end up going back to the page allocator
> >>> > > instead of page pool.
> >>> > >
> >>> > > Basically the idea is that until pp_frag_count reaches 0 there will be
> >>> > > at least 1 page reference held.
> >>> > >
> >>> > > > > > Is there any way I can make that part of the code work with the current
> >>> > > > > > page pool frag implementation?
> >>> > > > >
> >>> > > > > The current code should work. Basically as long as the references are
> >>> > > > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
> >>> > > > > into this issue because the pp_frag_count will be dropped when the
> >>> > > > > original skb is freed and the page reference count will be decremented
> >>> > > > > when the new one is freed.
> >>> > > > >
> >>> > > > > For page pool page fragments the main thing to keep in mind is that if
> >>> > > > > pp_recycle is set it will update the pp_frag_count and if it is not
> >>> > > > > then it will just decrement the page reference count.
> >>> > > > What takes care of DMA unmap and other cleanup if the last reference to
> >>> > > > the page is dropped via page_frag_free?
> >>> > > >
> >>> > > > - Felix
> >>> > >
> >>> > > When the page is freed on the skb w/ pp_recycle set it will unmap the
> >>> > > page and evict it from the page pool. Basically in these cases the page
> >>> > > goes from the page pool back to the page allocator.
> >>> > >
> >>> > > The general idea with this is that if we are using fragments that there
> >>> > > will be enough of them floating around that if one or two frags have a
> >>> > > temporeary detour through a non-recycling path that hopefully by the
> >>> > > time the last fragment is freed the other instances holding the
> >>> > > additional page reference will have let them go. If not then the page
> >>> > > will go back to the page allocator and it will have to be replaced in
> >>> > > the page pool.
> >>> > Thanks for the explanation, it makes sense to me now. Unfortunately it
> >>> > also means that I have no idea what could cause this issue. I will
> >>> > finish my mt76 patch rework which gets rid of the pp vs non-pp
> >>> > allocation mix and re-run my tests to provide updated traces.
> >>> Here's the updated mt76 page pool support commit:
> >>> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808
> >>
> >> Yeah, so I don't see anything wrong with the patch in terms of page
> >> pool.
> >>
> >>> And here is the trace that I'm getting with 6.1:
> >>> https://nbd.name/p/a16957f2
> >>>
> >>> If you have any debug patch you'd like me to test, please let me know.
> >>>
> >>> - Felix
> >>
> >> So looking at the traces I am assuming what we are seeing is the
> >> deferred freeing from the TCP Rx path since I don't see a driver
> >> anywhere between net_rx_action and napi_consume skb. So it seems like
> >> the packets are likely making it all the way up the network stack.
> >>
> >> Is this the first wireless driver to add support for page pool? I'm
> >> thinking we must be seeing something in the wireless path that is
> >> causing an issue such as the function you called out earlier but I
> >> can't see anything obvious.
> > Yes, it's the first driver with page pool support.
> >
> >> One thing we need to be on the lookout for is cloned skbs. When an skb
> >> is cloned the pp_recycle gets copied over. In that case the reference
> >> is moved over to the skb dataref count. What comes to mind is something
> >> like commit 1effe8ca4e34c ("skbuff: fix coalescing for page_pool
> >> fragment recycling").
> > I suspect that the crash might be related to a bad interaction between
> > the page reuse in A-MSDU rx + skb coalescing on TCP rx.
> > If I change the A-MSDU code to copy data instead of reusing fragments,
> > it doesn't crash anymore.

Which piece did you change? My main interest would be trying to narrow
down the section of this function that is causing this. Did you modify
__ieee80211_amsdu_copy or some other function within the setup?

> > I believe the issue must be specific to that codepath, since most
> > received and processed packets are either not A-MSDU or A-MSDU decap has
> > already been performed by the hardware.
> > If I change my test to use 3 client mode interfaces instead of 4, the
> > hardware is able to offload all A-MSDU rx processing and I don't see any
> > crashes anymore.
> >
> > Could you please take another look at ieee80211_amsdu_to_8023s to see if
> > there's anything in there that could cause these issues?

The thing is I am not sure it is the only cause for this. I am
suspecting we are seeing this triggering an issue when combined with
something else.

If we could add some tracing to dump the skb and list buffers that
might be helpful. We would want to verify the pp_recycle value, clone
flag, and for the frags we would want to frag count and page reference
counts. The expectation would be that the original skb should have the
pp_recycle flag set and the frag counts consistent through the
process, and the list skbs should all have taken page references w/ no
pp_recycle on the skbs in the list.

> Here are clues from a few more tests that I ran:
> - preventing the reuse of the last skb in ieee80211_amsdu_to_8023s does
> not prevent the crashes, so the issue is indeed related to taking page
> references and putting the pages in skb fragments.

You said in the first email it stops it and in the second "does not".
I am assuming that is some sort of typo since you seem to be implying
it does resolve it for you. Is that correct?

> - if I return false in skb_try_coalesce, it still crashes:
> https://nbd.name/p/18cac078

Yeah, I wasn't suspecting skb_try_coalesce since we have exercised the
code. My thought was something like the function you mentioned above
plus cloning or something else.
  
Alexander Duyck Jan. 26, 2023, 4:40 p.m. UTC | #23
?

On Thu, Jan 26, 2023 at 8:08 AM Alexander Duyck
<alexander.duyck@gmail.com> wrote:
>
> On Thu, Jan 26, 2023 at 1:15 AM Felix Fietkau <nbd@nbd.name> wrote:
> >

...

> > - if I return false in skb_try_coalesce, it still crashes:
> > https://nbd.name/p/18cac078
>
> Yeah, I wasn't suspecting skb_try_coalesce since we have exercised the
> code. My thought was something like the function you mentioned above
> plus cloning or something else.

One question I would have. Is GRO happening after the call to
ieee80211_amsdu_to_8023s? If so we might want to try switching that
off to see if it might be aggregating page pool frames and non-page
pool frames together.
  
Felix Fietkau Jan. 26, 2023, 5:44 p.m. UTC | #24
On 26.01.23 17:08, Alexander Duyck wrote:
> On Thu, Jan 26, 2023 at 1:15 AM Felix Fietkau <nbd@nbd.name> wrote:
>>
>> On 26.01.23 07:12, Felix Fietkau wrote:
>> > On 25.01.23 23:14, Alexander H Duyck wrote:
>> >> On Wed, 2023-01-25 at 20:40 +0100, Felix Fietkau wrote:
>> >>> On 25.01.23 20:10, Felix Fietkau wrote:
>> >>> > On 25.01.23 20:02, Alexander H Duyck wrote:
>> >>> > > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
>> >>> > > > On 25.01.23 19:26, Alexander H Duyck wrote:
>> >>> > > > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
>> >>> > > > > > On 25.01.23 18:11, Alexander H Duyck wrote:
>> >>> > > > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>> >>> > > > > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
>> >>> > > > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>> >>> > > > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>> >>> > > > > > > > > > > Hi Felix,
>> >>> > > > > > > > > > >
>> >>> > > > > > > > > > > ++cc Alexander and Yunsheng.
>> >>> > > > > > > > > > >
>> >>> > > > > > > > > > > Thanks for the report
>> >>> > > > > > > > > > >
>> >>> > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>> >>> > > > > > > > > > > >
>> >>> > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>> >>> > > > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
>> >>> > > > > > > > > > > > full-page page_pool allocation.
>> >>> > > > > > > > > > > > It appears to me, that handling refcounting in two separate counters
>> >>> > > > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>> >>> > > > > > > > > > > > incremented by code dealing with skb fragments directly, and
>> >>> > > > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>> >>> > > > > > > > > > > >
>> >>> > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> >>> > > > > > > > > > > > these underflow issues and crashes go away.
>> >>> > > > > > > > > > > >
>> >>> > > > > > > > > > >
>> >>> > > > > > > > > > > This has been discussed here [1].  TL;DR changing this to page
>> >>> > > > > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
>> >>> > > > > > > > > > > figure out why the underflow happens?
>> >>> > > > > > > > > > I don't see how the approch taken in my patch would blow up. From what I
>> >>> > > > > > > > > > can tell, it should be fairly close to how refcount is handled in
>> >>> > > > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from
>> >>> > > > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs
>> >>> > > > > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
>> >>> > > > > > > > > >
>> >>> > > > > > > > > > - Felix
>> >>> > > > > > > > > >
>> >>> > > > > > > > >
>> >>> > > > > > > > > Do you have the patch available to review as an RFC? From what I am
>> >>> > > > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
>> >>> > > > > > > > > I would suspect the issue to be something like starting with a bad
>> >>> > > > > > > > > count in terms of the total number of references, or deducing the wrong
>> >>> > > > > > > > > amount when you finally free the page assuming you are tracking your
>> >>> > > > > > > > > frag count using a non-atomic value in the driver.
>> >>> > > > > > > > The driver patches for page pool are here:
>> >>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>> >>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>> >>> > > > > > > >
>> >>> > > > > > > > They are also applied in my mt76 tree at:
>> >>> > > > > > > > https://github.com/nbd168/wireless
>> >>> > > > > > > >
>> >>> > > > > > > > - Felix
>> >>> > > > > > >
>> >>> > > > > > > So one thing I am thinking is that we may be seeing an issue where we
>> >>> > > > > > > are somehow getting a mix of frag and non-frag based page pool pages.
>> >>> > > > > > > That is the only case I can think of where we might be underflowing
>> >>> > > > > > > negative. If you could add some additional debug info on the underflow
>> >>> > > > > > > WARN_ON case in page_pool_defrag_page that might be useful.
>> >>> > > > > > > Specifically I would be curious what the actual return value is. I'm
>> >>> > > > > > > assuming we are only hitting negative 1, but I would want to verify we
>> >>> > > > > > > aren't seeing something else.
>> >>> > > > > > I'll try to run some more tests soon. However, I think I found the piece
>> >>> > > > > > of code that is incompatible with using pp_frag_count.
>> >>> > > > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11
>> >>> > > > > > packet), and it is not split by the hardware, a cfg80211 function
>> >>> > > > > > extracts the individual MSDUs into separate skbs. In that case, a
>> >>> > > > > > fragment can be shared across multiple skbs, and get_page is used to
>> >>> > > > > > increase the refcount.
>> >>> > > > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and
>> >>> > > > > > its helper functions).
>> >>> > > > >
>> >>> > > > > I'm not sure if it is problematic or not. Basically it is trading off
>> >>> > > > > by copying over the frags, calling get_page on each frag, and then
>> >>> > > > > using dev_kfree_skb to disassemble and release the pp_frag references.
>> >>> > > > > There should be other paths in the kernel that are doing something
>> >>> > > > > similar.
>> >>> > > > >
>> >>> > > > > > This code also has a bug where it doesn't set pp_recycle on the newly
>> >>> > > > > > allocated skb if the previous one has it, but that's a separate matter
>> >>> > > > > > and fixing it doesn't make the crash go away.
>> >>> > > > >
>> >>> > > > > Adding the recycle would cause this bug. So one thing we might be
>> >>> > > > > seeing is something like that triggering this error. Specifically if
>> >>> > > > > the page is taken via get_page when assembling the new skb then we
>> >>> > > > > cannot set the recycle flag in the new skb otherwise it will result in
>> >>> > > > > the reference undercount we are seeing. What we are doing is shifting
>> >>> > > > > the references away from the pp_frag_count to the page reference count
>> >>> > > > > in this case. If we set the pp_recycle flag then it would cause us to
>> >>> > > > > decrement pp_frag_count instead of the page reference count resulting
>> >>> > > > > in the underrun.
>> >>> > > > Couldn't leaving out the pp_recycle flag potentially lead to a case
>> >>> > > > where the last user of the page drops it via page_frag_free instead of
>> >>> > > > page_pool_return_skb_page? Is that valid?
>> >>> > >
>> >>> > > No. What will happen is that when the pp_frag_count is exhausted the
>> >>> > > page will be unmapped and evicted from the page pool. When the page is
>> >>> > > then finally freed it will end up going back to the page allocator
>> >>> > > instead of page pool.
>> >>> > >
>> >>> > > Basically the idea is that until pp_frag_count reaches 0 there will be
>> >>> > > at least 1 page reference held.
>> >>> > >
>> >>> > > > > > Is there any way I can make that part of the code work with the current
>> >>> > > > > > page pool frag implementation?
>> >>> > > > >
>> >>> > > > > The current code should work. Basically as long as the references are
>> >>> > > > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
>> >>> > > > > into this issue because the pp_frag_count will be dropped when the
>> >>> > > > > original skb is freed and the page reference count will be decremented
>> >>> > > > > when the new one is freed.
>> >>> > > > >
>> >>> > > > > For page pool page fragments the main thing to keep in mind is that if
>> >>> > > > > pp_recycle is set it will update the pp_frag_count and if it is not
>> >>> > > > > then it will just decrement the page reference count.
>> >>> > > > What takes care of DMA unmap and other cleanup if the last reference to
>> >>> > > > the page is dropped via page_frag_free?
>> >>> > > >
>> >>> > > > - Felix
>> >>> > >
>> >>> > > When the page is freed on the skb w/ pp_recycle set it will unmap the
>> >>> > > page and evict it from the page pool. Basically in these cases the page
>> >>> > > goes from the page pool back to the page allocator.
>> >>> > >
>> >>> > > The general idea with this is that if we are using fragments that there
>> >>> > > will be enough of them floating around that if one or two frags have a
>> >>> > > temporeary detour through a non-recycling path that hopefully by the
>> >>> > > time the last fragment is freed the other instances holding the
>> >>> > > additional page reference will have let them go. If not then the page
>> >>> > > will go back to the page allocator and it will have to be replaced in
>> >>> > > the page pool.
>> >>> > Thanks for the explanation, it makes sense to me now. Unfortunately it
>> >>> > also means that I have no idea what could cause this issue. I will
>> >>> > finish my mt76 patch rework which gets rid of the pp vs non-pp
>> >>> > allocation mix and re-run my tests to provide updated traces.
>> >>> Here's the updated mt76 page pool support commit:
>> >>> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808
>> >>
>> >> Yeah, so I don't see anything wrong with the patch in terms of page
>> >> pool.
>> >>
>> >>> And here is the trace that I'm getting with 6.1:
>> >>> https://nbd.name/p/a16957f2
>> >>>
>> >>> If you have any debug patch you'd like me to test, please let me know.
>> >>>
>> >>> - Felix
>> >>
>> >> So looking at the traces I am assuming what we are seeing is the
>> >> deferred freeing from the TCP Rx path since I don't see a driver
>> >> anywhere between net_rx_action and napi_consume skb. So it seems like
>> >> the packets are likely making it all the way up the network stack.
>> >>
>> >> Is this the first wireless driver to add support for page pool? I'm
>> >> thinking we must be seeing something in the wireless path that is
>> >> causing an issue such as the function you called out earlier but I
>> >> can't see anything obvious.
>> > Yes, it's the first driver with page pool support.
>> >
>> >> One thing we need to be on the lookout for is cloned skbs. When an skb
>> >> is cloned the pp_recycle gets copied over. In that case the reference
>> >> is moved over to the skb dataref count. What comes to mind is something
>> >> like commit 1effe8ca4e34c ("skbuff: fix coalescing for page_pool
>> >> fragment recycling").
>> > I suspect that the crash might be related to a bad interaction between
>> > the page reuse in A-MSDU rx + skb coalescing on TCP rx.
>> > If I change the A-MSDU code to copy data instead of reusing fragments,
>> > it doesn't crash anymore.
> 
> Which piece did you change? My main interest would be trying to narrow
> down the section of this function that is causing this. Did you modify
> __ieee80211_amsdu_copy or some other function within the setup?
I replaced this line:
   bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb);
with:
   bool reuse_frag = false;

>> > I believe the issue must be specific to that codepath, since most
>> > received and processed packets are either not A-MSDU or A-MSDU decap has
>> > already been performed by the hardware.
>> > If I change my test to use 3 client mode interfaces instead of 4, the
>> > hardware is able to offload all A-MSDU rx processing and I don't see any
>> > crashes anymore.
>> >
>> > Could you please take another look at ieee80211_amsdu_to_8023s to see if
>> > there's anything in there that could cause these issues?
> 
> The thing is I am not sure it is the only cause for this. I am
> suspecting we are seeing this triggering an issue when combined with
> something else.
> 
> If we could add some tracing to dump the skb and list buffers that
> might be helpful. We would want to verify the pp_recycle value, clone
> flag, and for the frags we would want to frag count and page reference
> counts. The expectation would be that the original skb should have the
> pp_recycle flag set and the frag counts consistent through the
> process, and the list skbs should all have taken page references w/ no
> pp_recycle on the skbs in the list.
> 
>> Here are clues from a few more tests that I ran:
>> - preventing the reuse of the last skb in ieee80211_amsdu_to_8023s does
>> not prevent the crashes, so the issue is indeed related to taking page
>> references and putting the pages in skb fragments.
> 
> You said in the first email it stops it and in the second "does not".
> I am assuming that is some sort of typo since you seem to be implying
> it does resolve it for you. Is that correct?
For everything except for the last subframe, the function pulls 
fragments out of the original skb and puts them into a new skb allocated 
with dev_alloc_skb. Additionally, the last skb is reused for the final 
subframe. What I meant was: In order to figure out if the skb-reuse is 
problematic, I prevented it from happening, ensuring that all subframes 
are allocated and get the fragments from the skb.
All I meant to say is that since that led to the same crash, the 
skb-reuse is not the problem.

Regarding the question from your other mail: without GRO there is no 
crash and it looks stable.

- Felix
  
Alexander Duyck Jan. 26, 2023, 6:38 p.m. UTC | #25
> 
> > Which piece did you change? My main interest would be trying to narrow
> > down the section of this function that is causing this. Did you modify
> > __ieee80211_amsdu_copy or some other function within the setup?
> I replaced this line:
>    bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb);
> with:
>    bool reuse_frag = false;

I see. So essentially everything is copied into the new buffers.

> > > > I believe the issue must be specific to that codepath, since most
> > > > received and processed packets are either not A-MSDU or A-MSDU decap has
> > > > already been performed by the hardware.
> > > > If I change my test to use 3 client mode interfaces instead of 4, the
> > > > hardware is able to offload all A-MSDU rx processing and I don't see any
> > > > crashes anymore.
> > > > 
> > > > Could you please take another look at ieee80211_amsdu_to_8023s to see if
> > > > there's anything in there that could cause these issues?
> > 
> > The thing is I am not sure it is the only cause for this. I am
> > suspecting we are seeing this triggering an issue when combined with
> > something else.
> > 
> > If we could add some tracing to dump the skb and list buffers that
> > might be helpful. We would want to verify the pp_recycle value, clone
> > flag, and for the frags we would want to frag count and page reference
> > counts. The expectation would be that the original skb should have the
> > pp_recycle flag set and the frag counts consistent through the
> > process, and the list skbs should all have taken page references w/ no
> > pp_recycle on the skbs in the list.
> > 
> > > Here are clues from a few more tests that I ran:
> > > - preventing the reuse of the last skb in ieee80211_amsdu_to_8023s does
> > > not prevent the crashes, so the issue is indeed related to taking page
> > > references and putting the pages in skb fragments.
> > 
> > You said in the first email it stops it and in the second "does not".
> > I am assuming that is some sort of typo since you seem to be implying
> > it does resolve it for you. Is that correct?
> For everything except for the last subframe, the function pulls 
> fragments out of the original skb and puts them into a new skb allocated 
> with dev_alloc_skb. Additionally, the last skb is reused for the final 
> subframe. What I meant was: In order to figure out if the skb-reuse is 
> problematic, I prevented it from happening, ensuring that all subframes 
> are allocated and get the fragments from the skb.
> All I meant to say is that since that led to the same crash, the 
> skb-reuse is not the problem.
> 
> Regarding the question from your other mail: without GRO there is no 
> crash and it looks stable.
> 
> - Felix
> 

Okay, I think that tells me exactly what is going on. Can you give the
change below a try and see if it solves the problem for you.

I think what is happening is that after you are reassigning the frags
they are getting merged into GRO frames where the head may have
pp_recycle set. As a result I think the pages are getting recycled when
they should be just freed via put_page.

I'm suspecting this wasn't an issue up until now as I don't believe
there are any that are running in a mixed mode where they have both
pp_recycle and non-pp_recycle skbs coming from the same device.

diff --git a/net/core/gro.c b/net/core/gro.c
index 506f83d715f8..4bac7ea6e025 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -162,6 +162,15 @@ int skb_gro_receive(struct sk_buff *p, struct
sk_buff *skb)
 	struct sk_buff *lp;
 	int segs;
 
+	/* Do not splice page pool based packets w/ non-page pool
+	 * packets. This can result in reference count issues as page
+	 * pool pages will not decrement the reference count and will
+	 * instead be immediately returned to the pool or have frag
+	 * count decremented.
+	 */
+	if (p->pp_recycle != skb->pp_recycle)
+		return -ETOOMANYREFS;
+
 	/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
 	gro_max_size = READ_ONCE(p->dev->gro_max_size);
  
Felix Fietkau Jan. 26, 2023, 6:43 p.m. UTC | #26
On 26.01.23 19:38, Alexander H Duyck wrote:
> Okay, I think that tells me exactly what is going on. Can you give the
> change below a try and see if it solves the problem for you.
> 
> I think what is happening is that after you are reassigning the frags
> they are getting merged into GRO frames where the head may have
> pp_recycle set. As a result I think the pages are getting recycled when
> they should be just freed via put_page.
> 
> I'm suspecting this wasn't an issue up until now as I don't believe
> there are any that are running in a mixed mode where they have both
> pp_recycle and non-pp_recycle skbs coming from the same device.
> 
> diff --git a/net/core/gro.c b/net/core/gro.c
> index 506f83d715f8..4bac7ea6e025 100644
> --- a/net/core/gro.c
> +++ b/net/core/gro.c
> @@ -162,6 +162,15 @@ int skb_gro_receive(struct sk_buff *p, struct
> sk_buff *skb)
>   	struct sk_buff *lp;
>   	int segs;
>   
> +	/* Do not splice page pool based packets w/ non-page pool
> +	 * packets. This can result in reference count issues as page
> +	 * pool pages will not decrement the reference count and will
> +	 * instead be immediately returned to the pool or have frag
> +	 * count decremented.
> +	 */
> +	if (p->pp_recycle != skb->pp_recycle)
> +		return -ETOOMANYREFS;
> +
>   	/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
>   	gro_max_size = READ_ONCE(p->dev->gro_max_size);
>   
That works, thanks!

- Felix
  

Patch

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9757067c3053..96ec3b19a86d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -125,18 +125,11 @@  struct page {
 			struct page_pool *pp;
 			unsigned long _pp_mapping_pad;
 			unsigned long dma_addr;
-			union {
-				/**
-				 * dma_addr_upper: might require a 64-bit
-				 * value on 32-bit architectures.
-				 */
-				unsigned long dma_addr_upper;
-				/**
-				 * For frag page support, not supported in
-				 * 32-bit architectures with 64-bit DMA.
-				 */
-				atomic_long_t pp_frag_count;
-			};
+			/**
+			 * dma_addr_upper: might require a 64-bit
+			 * value on 32-bit architectures.
+			 */
+			unsigned long dma_addr_upper;
 		};
 		struct {	/* Tail pages of compound page */
 			unsigned long compound_head;	/* Bit zero is set */
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 813c93499f20..28e1fdbdcd53 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -279,14 +279,14 @@  void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
 
 static inline void page_pool_fragment_page(struct page *page, long nr)
 {
-	atomic_long_set(&page->pp_frag_count, nr);
+	page_ref_add(page, nr);
 }
 
 static inline long page_pool_defrag_page(struct page *page, long nr)
 {
 	long ret;
 
-	/* If nr == pp_frag_count then we have cleared all remaining
+	/* If nr == page_ref_count then we have cleared all remaining
 	 * references to the page. No need to actually overwrite it, instead
 	 * we can leave this to be overwritten by the calling function.
 	 *
@@ -295,22 +295,14 @@  static inline long page_pool_defrag_page(struct page *page, long nr)
 	 * especially when dealing with a page that may be partitioned
 	 * into only 2 or 3 pieces.
 	 */
-	if (atomic_long_read(&page->pp_frag_count) == nr)
+	if (page_ref_count(page) == nr)
 		return 0;
 
-	ret = atomic_long_sub_return(nr, &page->pp_frag_count);
+	ret = page_ref_sub_return(page, nr);
 	WARN_ON(ret < 0);
 	return ret;
 }
 
-static inline bool page_pool_is_last_frag(struct page_pool *pool,
-					  struct page *page)
-{
-	/* If fragments aren't enabled or count is 0 we were the last user */
-	return !(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
-	       (page_pool_defrag_page(page, 1) == 0);
-}
-
 static inline void page_pool_put_page(struct page_pool *pool,
 				      struct page *page,
 				      unsigned int dma_sync_size,
@@ -320,9 +312,6 @@  static inline void page_pool_put_page(struct page_pool *pool,
 	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
 	 */
 #ifdef CONFIG_PAGE_POOL
-	if (!page_pool_is_last_frag(pool, page))
-		return;
-
 	page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct);
 #endif
 }
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 9b203d8660e4..0defcadae225 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -25,7 +25,7 @@ 
 #define DEFER_TIME (msecs_to_jiffies(1000))
 #define DEFER_WARN_INTERVAL (60 * HZ)
 
-#define BIAS_MAX	LONG_MAX
+#define BIAS_MAX(pool)	(PAGE_SIZE << ((pool)->p.order))
 
 #ifdef CONFIG_PAGE_POOL_STATS
 /* alloc_stat_inc is intended to be used in softirq context */
@@ -619,10 +619,6 @@  void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 	for (i = 0; i < count; i++) {
 		struct page *page = virt_to_head_page(data[i]);
 
-		/* It is not the last user for the page frag case */
-		if (!page_pool_is_last_frag(pool, page))
-			continue;
-
 		page = __page_pool_put_page(pool, page, -1, false);
 		/* Approved for bulk recycling in ptr_ring cache */
 		if (page)
@@ -659,7 +655,7 @@  EXPORT_SYMBOL(page_pool_put_page_bulk);
 static struct page *page_pool_drain_frag(struct page_pool *pool,
 					 struct page *page)
 {
-	long drain_count = BIAS_MAX - pool->frag_users;
+	long drain_count = BIAS_MAX(pool) - pool->frag_users;
 
 	/* Some user is still using the page frag */
 	if (likely(page_pool_defrag_page(page, drain_count)))
@@ -678,7 +674,7 @@  static struct page *page_pool_drain_frag(struct page_pool *pool,
 
 static void page_pool_free_frag(struct page_pool *pool)
 {
-	long drain_count = BIAS_MAX - pool->frag_users;
+	long drain_count = BIAS_MAX(pool) - pool->frag_users;
 	struct page *page = pool->frag_page;
 
 	pool->frag_page = NULL;
@@ -724,7 +720,7 @@  struct page *page_pool_alloc_frag(struct page_pool *pool,
 		pool->frag_users = 1;
 		*offset = 0;
 		pool->frag_offset = size;
-		page_pool_fragment_page(page, BIAS_MAX);
+		page_pool_fragment_page(page, BIAS_MAX(pool));
 		return page;
 	}