[10/11] io_uring/rsrc: cache struct io_rsrc_node

Message ID 7f5eb1b89e8dcf93739607c79bbf7aec1784cbbe.1680187408.git.asml.silence@gmail.com
State New
Series optimise registered buffer/file updates

Commit Message

Pavel Begunkov March 30, 2023, 2:53 p.m. UTC
  Add an allocation cache for struct io_rsrc_node; it is always allocated
and put under ->uring_lock, so it doesn't need any extra synchronisation
around caches.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/linux/io_uring_types.h |  1 +
 io_uring/io_uring.c            | 11 +++++++++--
 io_uring/rsrc.c                | 23 +++++++++++++++--------
 io_uring/rsrc.h                |  5 ++++-
 4 files changed, 29 insertions(+), 11 deletions(-)
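
For context, the io_alloc_cache_*() helpers used above amount to a plain
free list that is only ever touched while ->uring_lock is held, which is
why no extra synchronisation is needed. A minimal sketch of that pattern,
with assumed names that do not match io_uring/alloc_cache.h, looks like
this:

/*
 * Minimal sketch of a lock-protected object cache (illustrative only;
 * names and layout are assumptions, not io_uring/alloc_cache.h).
 * Every call happens under a single mutex (->uring_lock in this patch),
 * so a plain singly linked free list needs no atomics or extra locking.
 */
struct cache_entry {
	struct cache_entry *next;
};

struct alloc_cache {
	struct cache_entry *head;	/* free list of recycled objects */
	unsigned int nr_cached;		/* number of entries on the list */
};

/* Recycle an object instead of freeing it; called under the lock. */
static void cache_put(struct alloc_cache *cache, struct cache_entry *entry)
{
	entry->next = cache->head;
	cache->head = entry;
	cache->nr_cached++;
}

/* Pop a recycled object, or return NULL so the caller falls back to a
 * regular allocation (kzalloc() in io_rsrc_node_alloc() below). */
static struct cache_entry *cache_get(struct alloc_cache *cache)
{
	struct cache_entry *entry = cache->head;

	if (entry) {
		cache->head = entry->next;
		cache->nr_cached--;
	}
	return entry;
}

In the patch itself, struct io_rsrc_node embeds a struct io_cache_entry as
its first member, so container_of() on the get path recovers the node from
the cached entry.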
  

Comments

Gabriel Krisman Bertazi March 31, 2023, 2:09 p.m. UTC | #1
Pavel Begunkov <asml.silence@gmail.com> writes:

> Add allocation cache for struct io_rsrc_node, it's always allocated and
> put under ->uring_lock, so it doesn't need any extra synchronisation
> around caches.

Hi Pavel,

I'm curious whether you considered using kmem_cache instead of the custom
cache for this case.  I'm wondering whether it makes a visible difference
in performance in your benchmark.

> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  include/linux/io_uring_types.h |  1 +
>  io_uring/io_uring.c            | 11 +++++++++--
>  io_uring/rsrc.c                | 23 +++++++++++++++--------
>  io_uring/rsrc.h                |  5 ++++-
>  4 files changed, 29 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 47496059e13a..5d772e36e7fc 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -332,6 +332,7 @@ struct io_ring_ctx {
>  
>  	/* protected by ->uring_lock */
>  	struct list_head		rsrc_ref_list;
> +	struct io_alloc_cache		rsrc_node_cache;
>  
>  	struct list_head		io_buffers_pages;
>  
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 8c3886a4ca96..beedaf403284 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -310,6 +310,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
>  	INIT_LIST_HEAD(&ctx->sqd_list);
>  	INIT_LIST_HEAD(&ctx->cq_overflow_list);
>  	INIT_LIST_HEAD(&ctx->io_buffers_cache);
> +	io_alloc_cache_init(&ctx->rsrc_node_cache, sizeof(struct io_rsrc_node));
>  	io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll));
>  	io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr));
>  	init_completion(&ctx->ref_comp);
> @@ -2791,6 +2792,11 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
>  	mutex_unlock(&ctx->uring_lock);
>  }
>  
> +void io_rsrc_node_cache_free(struct io_cache_entry *entry)
> +{
> +	kfree(container_of(entry, struct io_rsrc_node, cache));
> +}
> +
>  static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
>  {
>  	io_sq_thread_finish(ctx);
> @@ -2816,9 +2822,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
>  
>  	/* there are no registered resources left, nobody uses it */
>  	if (ctx->rsrc_node)
> -		io_rsrc_node_destroy(ctx->rsrc_node);
> +		io_rsrc_node_destroy(ctx, ctx->rsrc_node);
>  	if (ctx->rsrc_backup_node)
> -		io_rsrc_node_destroy(ctx->rsrc_backup_node);
> +		io_rsrc_node_destroy(ctx, ctx->rsrc_backup_node);
>  
>  	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
>  
> @@ -2830,6 +2836,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
>  #endif
>  	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
>  
> +	io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
>  	if (ctx->mm_account) {
>  		mmdrop(ctx->mm_account);
>  		ctx->mm_account = NULL;
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 0f4e245dee1b..345631091d80 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -164,7 +164,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
>  		kfree(prsrc);
>  	}
>  
> -	io_rsrc_node_destroy(ref_node);
> +	io_rsrc_node_destroy(rsrc_data->ctx, ref_node);
>  	if (atomic_dec_and_test(&rsrc_data->refs))
>  		complete(&rsrc_data->done);
>  }
> @@ -175,9 +175,10 @@ void io_wait_rsrc_data(struct io_rsrc_data *data)
>  		wait_for_completion(&data->done);
>  }
>  
> -void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
> +void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>  {
> -	kfree(ref_node);
> +	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
> +		kfree(node);
>  }
>  
>  void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
> @@ -198,13 +199,19 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
>  	}
>  }
>  
> -static struct io_rsrc_node *io_rsrc_node_alloc(void)
> +static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
>  {
>  	struct io_rsrc_node *ref_node;
> +	struct io_cache_entry *entry;
>  
> -	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
> -	if (!ref_node)
> -		return NULL;
> +	entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
> +	if (entry) {
> +		ref_node = container_of(entry, struct io_rsrc_node, cache);
> +	} else {
> +		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
> +		if (!ref_node)
> +			return NULL;
> +	}
>  
>  	ref_node->refs = 1;
>  	INIT_LIST_HEAD(&ref_node->node);
> @@ -243,7 +250,7 @@ int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
>  {
>  	if (ctx->rsrc_backup_node)
>  		return 0;
> -	ctx->rsrc_backup_node = io_rsrc_node_alloc();
> +	ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
>  	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
>  }
>  
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index 17293ab90f64..d1555eaae81a 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -4,6 +4,8 @@
>  
>  #include <net/af_unix.h>
>  
> +#include "alloc_cache.h"
> +
>  #define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
>  #define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
>  #define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)
> @@ -37,6 +39,7 @@ struct io_rsrc_data {
>  };
>  
>  struct io_rsrc_node {
> +	struct io_cache_entry		cache;
>  	int refs;
>  	struct list_head		node;
>  	struct io_rsrc_data		*rsrc_data;
> @@ -65,7 +68,7 @@ void io_rsrc_put_tw(struct callback_head *cb);
>  void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
>  void io_rsrc_put_work(struct work_struct *work);
>  void io_wait_rsrc_data(struct io_rsrc_data *data);
> -void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
> +void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
>  int io_rsrc_node_switch_start(struct io_ring_ctx *ctx);
>  int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
>  			  struct io_rsrc_node *node, void *rsrc);
  
Pavel Begunkov March 31, 2023, 4:27 p.m. UTC | #2
On 3/31/23 15:09, Gabriel Krisman Bertazi wrote:
> Pavel Begunkov <asml.silence@gmail.com> writes:
> 
>> Add allocation cache for struct io_rsrc_node, it's always allocated and
>> put under ->uring_lock, so it doesn't need any extra synchronisation
>> around caches.
> 
> Hi Pavel,
> 
> I'm curious if you considered using kmem_cache instead of the custom
> cache for this case?  I'm wondering if this provokes visible difference in
> performance in your benchmark.

I didn't try it, but kmem_cache vs kmalloc, IIRC, doesn't bring us much;
it definitely doesn't spare us from locking, and the overhead wasn't
satisfactory for requests before.

A quick note: I want to further limit the cache size, as
IO_ALLOC_CACHE_MAX=512 for nodes doesn't sound great.
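
For illustration, such a cap would sit on the put side, with the caller
falling back to kfree() once the cache is full; a hypothetical sketch,
with an assumed limit and invented names:

/* Hypothetical sketch of a bounded put; RSRC_NODE_CACHE_MAX and the
 * structure names are invented for illustration, not code from this
 * series. */
#include <stdbool.h>

#define RSRC_NODE_CACHE_MAX	32u	/* far below the generic 512 */

struct node_entry {
	struct node_entry *next;
};

struct node_cache {
	struct node_entry *head;
	unsigned int nr_cached;
};

/* Called under ->uring_lock; returns false once the cap is reached, so
 * the caller simply kfree()s the node instead of caching it. */
static bool node_cache_put(struct node_cache *cache, struct node_entry *entry)
{
	if (cache->nr_cached >= RSRC_NODE_CACHE_MAX)
		return false;
	entry->next = cache->head;
	cache->head = entry;
	cache->nr_cached++;
	return true;
}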
  
Gabriel Krisman Bertazi April 1, 2023, 12:04 a.m. UTC | #3
Pavel Begunkov <asml.silence@gmail.com> writes:

> On 3/31/23 15:09, Gabriel Krisman Bertazi wrote:
>> Pavel Begunkov <asml.silence@gmail.com> writes:
>> 
>>> Add allocation cache for struct io_rsrc_node, it's always allocated and
>>> put under ->uring_lock, so it doesn't need any extra synchronisation
>>> around caches.
>> Hi Pavel,
>> I'm curious if you considered using kmem_cache instead of the custom
>> cache for this case?  I'm wondering if this provokes visible difference in
>> performance in your benchmark.
>
> I didn't try it, but kmem_cache vs kmalloc, IIRC, doesn't bring us
> much, definitely doesn't spare from locking, and the overhead
> definitely wasn't satisfactory for requests before.

There are no locks in the fast path of slub, as far as I know.  It has a
per-cpu cache that is refilled once empty, quite similar to the fast path
of this cache.  I imagine the performance hit in slub comes from the
barrier and atomic operations?

kmem_cache works fine for most hot paths of the kernel.  I think this
custom cache makes sense for the request cache, where objects are
allocated at an incredibly high rate.  But is this level of update
frequency a valid use case here?

If it is indeed a significant performance improvement, I guess it is
fine to have another user of the cache. But I'd be curious to know how
much of the performance improvement you mentioned in the cover letter is
due to this patch!
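
For reference, the kmem_cache alternative in question would look roughly
like the sketch below; the call sites are illustrative, though
kmem_cache_create(), kmem_cache_zalloc() and kmem_cache_free() are the
standard slab interfaces:

/* Rough sketch of the kmem_cache alternative being suggested.  The call
 * sites are illustrative only. */
#include <linux/errno.h>
#include <linux/slab.h>

#include "rsrc.h"	/* struct io_rsrc_node */

static struct kmem_cache *rsrc_node_cachep;

static int rsrc_node_cache_setup(void)
{
	rsrc_node_cachep = kmem_cache_create("io_rsrc_node",
					     sizeof(struct io_rsrc_node),
					     0, SLAB_HWCACHE_ALIGN, NULL);
	return rsrc_node_cachep ? 0 : -ENOMEM;
}

static struct io_rsrc_node *rsrc_node_alloc(void)
{
	/* Served from slub's per-cpu cache in the common case. */
	return kmem_cache_zalloc(rsrc_node_cachep, GFP_KERNEL);
}

static void rsrc_node_free(struct io_rsrc_node *node)
{
	kmem_cache_free(rsrc_node_cachep, node);
}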
  
Pavel Begunkov April 4, 2023, 1:21 p.m. UTC | #4
On 4/1/23 01:04, Gabriel Krisman Bertazi wrote:
> Pavel Begunkov <asml.silence@gmail.com> writes:
> 
>> On 3/31/23 15:09, Gabriel Krisman Bertazi wrote:
>>> Pavel Begunkov <asml.silence@gmail.com> writes:
>>>
>>>> Add allocation cache for struct io_rsrc_node, it's always allocated and
>>>> put under ->uring_lock, so it doesn't need any extra synchronisation
>>>> around caches.
>>> Hi Pavel,
>>> I'm curious if you considered using kmem_cache instead of the custom
>>> cache for this case?  I'm wondering if this provokes visible difference in
>>> performance in your benchmark.
>>
>> I didn't try it, but kmem_cache vs kmalloc, IIRC, doesn't bring us
>> much, definitely doesn't spare from locking, and the overhead
>> definitely wasn't satisfactory for requests before.
> 
> There is no locks in the fast path of slub, as far as I know.  it has a
> per-cpu cache that is refilled once empty, quite similar to the fastpath
> of this cache.  I imagine the performance hit in slub comes from the
> barrier and atomic operations?

Yeah, I mean all kinds of synchronisation. And I don't think
that's the main offender here; the test is single threaded without
contention and the system was mostly idle.

> kmem_cache works fine for most hot paths of the kernel.  I think this

It doesn't for io_uring. There are caches for the net side and now
in the block layer as well. I wouldn't say it necessarily halves
performance but definitely takes a share of CPU.

> custom cache makes sense for the request cache, where objects are
> allocated at an incredibly high rate.  but is this level of update
> frequency a valid use case here?

I can think of some. For example, it was of interest before to
install a file for just 2-3 IO operations while fully bypassing
the normal file table. I don't really see why we wouldn't use it.

> If it is indeed a significant performance improvement, I guess it is
> fine to have another user of the cache. But I'd be curious to know how
> much of the performance improvement you mentioned in the cover letter is
> due to this patch!

It was definitely sticking out in profiles, 5-10% of cycles, maybe more
  
Gabriel Krisman Bertazi April 4, 2023, 3:48 p.m. UTC | #5
Pavel Begunkov <asml.silence@gmail.com> writes:

> On 4/1/23 01:04, Gabriel Krisman Bertazi wrote:
>> Pavel Begunkov <asml.silence@gmail.com> writes:

>>> I didn't try it, but kmem_cache vs kmalloc, IIRC, doesn't bring us
>>> much, definitely doesn't spare from locking, and the overhead
>>> definitely wasn't satisfactory for requests before.
>> There is no locks in the fast path of slub, as far as I know.  it has
>> a
>> per-cpu cache that is refilled once empty, quite similar to the fastpath
>> of this cache.  I imagine the performance hit in slub comes from the
>> barrier and atomic operations?
>
> Yeah, I mean all kinds of synchronisation. And I don't think
> that's the main offender here, the test is single threaded without
> contention and the system was mostly idle.
>
>> kmem_cache works fine for most hot paths of the kernel.  I think this
>
> It doesn't for io_uring. There are caches for the net side and now
> in the block layer as well. I wouldn't say it necessarily halves
> performance but definitely takes a share of CPU.

Right.  My point is that all these caches (block, io_uring) duplicate
what the slab cache is meant to do.  Since slab became a bottleneck, I'm
looking at how to improve the situation on their side, to see if we can
drop the caching here and in block/.

>> If it is indeed a significant performance improvement, I guess it is
>> fine to have another user of the cache. But I'd be curious to know how
>> much of the performance improvement you mentioned in the cover letter is
>> due to this patch!
>
> It was definitely sticking out in profiles, 5-10% of cycles, maybe
> more

That's surprisingly high.  Hopefully we can avoid this caching
soon.  For now, feel free to add to this patch:

Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
  
Jens Axboe April 4, 2023, 3:52 p.m. UTC | #6
On 4/4/23 9:48 AM, Gabriel Krisman Bertazi wrote:
> Pavel Begunkov <asml.silence@gmail.com> writes:
> 
>> On 4/1/23 01:04, Gabriel Krisman Bertazi wrote:
>>> Pavel Begunkov <asml.silence@gmail.com> writes:
> 
>>>> I didn't try it, but kmem_cache vs kmalloc, IIRC, doesn't bring us
>>>> much, definitely doesn't spare from locking, and the overhead
>>>> definitely wasn't satisfactory for requests before.
>>> There is no locks in the fast path of slub, as far as I know.  it has
>>> a
>>> per-cpu cache that is refilled once empty, quite similar to the fastpath
>>> of this cache.  I imagine the performance hit in slub comes from the
>>> barrier and atomic operations?
>>
>> Yeah, I mean all kinds of synchronisation. And I don't think
>> that's the main offender here, the test is single threaded without
>> contention and the system was mostly idle.
>>
>>> kmem_cache works fine for most hot paths of the kernel.  I think this
>>
>> It doesn't for io_uring. There are caches for the net side and now
>> in the block layer as well. I wouldn't say it necessarily halves
>> performance but definitely takes a share of CPU.
> 
> Right.  My point is that all these caches (block, io_uring) duplicate
> what the slab cache is meant to do.  Since slab became a bottleneck, I'm
> looking at how to improve the situation on their side, to see if we can
> drop the caching here and in block/.

That would certainly be a worthy goal, and I do agree that these caches
are (largely) working around deficiencies. One important point that you
may miss is that most of this caching gets its performance both from
avoiding atomics in slub and from the fact that we can guarantee that
both alloc and free happen from process context. The block IRQ bits are a
bit different, but apart from that, it's true elsewhere. Caching that
needs to even disable IRQs locally generally doesn't beat out slub by
much; the big wins are the cases where we know free+alloc is done in
process context.
  
Gabriel Krisman Bertazi April 4, 2023, 4:53 p.m. UTC | #7
Jens Axboe <axboe@kernel.dk> writes:

> On 4/4/23 9:48 AM, Gabriel Krisman Bertazi wrote:
>> Pavel Begunkov <asml.silence@gmail.com> writes:
>> 
>>> On 4/1/23 01:04, Gabriel Krisman Bertazi wrote:
>>>> Pavel Begunkov <asml.silence@gmail.com> writes:
>> 
>>>>> I didn't try it, but kmem_cache vs kmalloc, IIRC, doesn't bring us
>>>>> much, definitely doesn't spare from locking, and the overhead
>>>>> definitely wasn't satisfactory for requests before.
>>>> There is no locks in the fast path of slub, as far as I know.  it has
>>>> a
>>>> per-cpu cache that is refilled once empty, quite similar to the fastpath
>>>> of this cache.  I imagine the performance hit in slub comes from the
>>>> barrier and atomic operations?
>>>
>>> Yeah, I mean all kinds of synchronisation. And I don't think
>>> that's the main offender here, the test is single threaded without
>>> contention and the system was mostly idle.
>>>
>>>> kmem_cache works fine for most hot paths of the kernel.  I think this
>>>
>>> It doesn't for io_uring. There are caches for the net side and now
>>> in the block layer as well. I wouldn't say it necessarily halves
>>> performance but definitely takes a share of CPU.
>> 
>> Right.  My point is that all these caches (block, io_uring) duplicate
>> what the slab cache is meant to do.  Since slab became a bottleneck, I'm
>> looking at how to improve the situation on their side, to see if we can
>> drop the caching here and in block/.
>
> That would certainly be a worthy goal, and I do agree that these caches
> are (largely) working around deficiencies. One important point that you
> may miss is that most of this caching gets its performance from both
> avoiding atomics in slub, but also because we can guarantee that both
> alloc and free happen from process context. The block IRQ bits are a bit
> different, but apart from that, it's true elsewhere. Caching that needs
> to even disable IRQs locally generally doesn't beat out slub by much,
> the big wins are the cases where we know free+alloc is done in process
> context.

Yes, I noticed that.  I was thinking of exposing a flag at kmem_cache
creation-time to tell slab the user promises not to use it in IRQ
context, so it doesn't need to worry about nested invocation in the
allocation/free path.  Then, for those caches, have a
kmem_cache_alloc_locked variant, where the synchronization is maintained
by the caller (i.e. by ->uring_lock here), so it can manipulate the
cache without atomics.
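
A purely hypothetical sketch of that interface, where neither the flag
nor kmem_cache_alloc_locked() exists today and every name below is an
assumption about the idea being described:

/* Hypothetical only: SLAB_NO_IRQ_CTX and kmem_cache_alloc_locked() do
 * not exist; this merely sketches the shape of the proposed interface. */
#include <linux/slab.h>

#include "rsrc.h"	/* struct io_rsrc_node */

static struct kmem_cache *rsrc_cachep;

static void rsrc_cache_setup(void)
{
	/* Creation-time promise: never used from IRQ context. */
	rsrc_cachep = kmem_cache_create("io_rsrc_node",
					sizeof(struct io_rsrc_node), 0,
					SLAB_NO_IRQ_CTX, NULL);
}

/* Caller already holds ->uring_lock, so the per-cpu slab structures
 * could be manipulated without atomics (again, hypothetical). */
static struct io_rsrc_node *rsrc_node_alloc_locked(void)
{
	return kmem_cache_alloc_locked(rsrc_cachep, GFP_KERNEL);
}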

I was looking at your implementation of the block cache for inspiration
and saw how you kept a second list for IRQ.  I'm thinking about how to fit
a similar change inside slub.  But for now, I want to get the simpler
case, which is all io_uring needs.

I'll try to get a prototype before LSF/MM and see if I get the MM folks'
input there.
  
Pavel Begunkov April 4, 2023, 6:26 p.m. UTC | #8
On 4/4/23 17:53, Gabriel Krisman Bertazi wrote:
> Jens Axboe <axboe@kernel.dk> writes:
> 
>> On 4/4/23 9:48 AM, Gabriel Krisman Bertazi wrote:
>>> Pavel Begunkov <asml.silence@gmail.com> writes:
>>>
>>>> On 4/1/23 01:04, Gabriel Krisman Bertazi wrote:
>>>>> Pavel Begunkov <asml.silence@gmail.com> writes:
>>>
>>>>>> I didn't try it, but kmem_cache vs kmalloc, IIRC, doesn't bring us
>>>>>> much, definitely doesn't spare from locking, and the overhead
>>>>>> definitely wasn't satisfactory for requests before.
>>>>> There is no locks in the fast path of slub, as far as I know.  it has
>>>>> a
>>>>> per-cpu cache that is refilled once empty, quite similar to the fastpath
>>>>> of this cache.  I imagine the performance hit in slub comes from the
>>>>> barrier and atomic operations?
>>>>
>>>> Yeah, I mean all kinds of synchronisation. And I don't think
>>>> that's the main offender here, the test is single threaded without
>>>> contention and the system was mostly idle.
>>>>
>>>>> kmem_cache works fine for most hot paths of the kernel.  I think this
>>>>
>>>> It doesn't for io_uring. There are caches for the net side and now
>>>> in the block layer as well. I wouldn't say it necessarily halves
>>>> performance but definitely takes a share of CPU.
>>>
>>> Right.  My point is that all these caches (block, io_uring) duplicate
>>> what the slab cache is meant to do.  Since slab became a bottleneck, I'm
>>> looking at how to improve the situation on their side, to see if we can
>>> drop the caching here and in block/.
>>
>> That would certainly be a worthy goal, and I do agree that these caches
>> are (largely) working around deficiencies. One important point that you
>> may miss is that most of this caching gets its performance from both
>> avoiding atomics in slub, but also because we can guarantee that both
>> alloc and free happen from process context. The block IRQ bits are a bit
>> different, but apart from that, it's true elsewhere. Caching that needs
>> to even disable IRQs locally generally doesn't beat out slub by much,
>> the big wins are the cases where we know free+alloc is done in process
>> context.
> 
> Yes, I noticed that.  I was thinking of exposing a flag at kmem_cache
> creation-time to tell slab the user promises not to use it in IRQ
> context, so it doesn't need to worry about nested invocation in the
> allocation/free path.  Then, for those caches, have a
> kmem_cache_alloc_locked variant, where the synchronization is maintained
> by the caller (i.e. by ->uring_lock here), so it can manipulate the
> cache without atomics.
> 
> I was looking at your implementation of the block cache for inspiration
> and saw how you kept a second list for IRQ.  I'm thinking how to fit a

It works fine, but one problem I had while implementing it is that
local_irq_save() in bio_put_percpu_cache() is not so cheap even when
interrupts are already disabled. Apparently, raw_local_save_flags() is
not as free as I wished it to be, and we can't easily pass the current
context. It would be nice to do something about it at some point.


> similar change inside slub.  But for now, I want to get the simpler
> case, which is all io_uring need.
> 
> I'll try to get a prototype before lsfmm and see if I get the MM folks
> input there.
>
  

Patch

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 47496059e13a..5d772e36e7fc 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -332,6 +332,7 @@  struct io_ring_ctx {
 
 	/* protected by ->uring_lock */
 	struct list_head		rsrc_ref_list;
+	struct io_alloc_cache		rsrc_node_cache;
 
 	struct list_head		io_buffers_pages;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 8c3886a4ca96..beedaf403284 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -310,6 +310,7 @@  static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
+	io_alloc_cache_init(&ctx->rsrc_node_cache, sizeof(struct io_rsrc_node));
 	io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll));
 	io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr));
 	init_completion(&ctx->ref_comp);
@@ -2791,6 +2792,11 @@  static void io_req_caches_free(struct io_ring_ctx *ctx)
 	mutex_unlock(&ctx->uring_lock);
 }
 
+void io_rsrc_node_cache_free(struct io_cache_entry *entry)
+{
+	kfree(container_of(entry, struct io_rsrc_node, cache));
+}
+
 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	io_sq_thread_finish(ctx);
@@ -2816,9 +2822,9 @@  static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 
 	/* there are no registered resources left, nobody uses it */
 	if (ctx->rsrc_node)
-		io_rsrc_node_destroy(ctx->rsrc_node);
+		io_rsrc_node_destroy(ctx, ctx->rsrc_node);
 	if (ctx->rsrc_backup_node)
-		io_rsrc_node_destroy(ctx->rsrc_backup_node);
+		io_rsrc_node_destroy(ctx, ctx->rsrc_backup_node);
 
 	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
 
@@ -2830,6 +2836,7 @@  static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 #endif
 	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
 
+	io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
 	if (ctx->mm_account) {
 		mmdrop(ctx->mm_account);
 		ctx->mm_account = NULL;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 0f4e245dee1b..345631091d80 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -164,7 +164,7 @@  static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
 		kfree(prsrc);
 	}
 
-	io_rsrc_node_destroy(ref_node);
+	io_rsrc_node_destroy(rsrc_data->ctx, ref_node);
 	if (atomic_dec_and_test(&rsrc_data->refs))
 		complete(&rsrc_data->done);
 }
@@ -175,9 +175,10 @@  void io_wait_rsrc_data(struct io_rsrc_data *data)
 		wait_for_completion(&data->done);
 }
 
-void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
+void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 {
-	kfree(ref_node);
+	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
+		kfree(node);
 }
 
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
@@ -198,13 +199,19 @@  void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
 	}
 }
 
-static struct io_rsrc_node *io_rsrc_node_alloc(void)
+static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
 {
 	struct io_rsrc_node *ref_node;
+	struct io_cache_entry *entry;
 
-	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
-	if (!ref_node)
-		return NULL;
+	entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
+	if (entry) {
+		ref_node = container_of(entry, struct io_rsrc_node, cache);
+	} else {
+		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
+		if (!ref_node)
+			return NULL;
+	}
 
 	ref_node->refs = 1;
 	INIT_LIST_HEAD(&ref_node->node);
@@ -243,7 +250,7 @@  int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
 {
 	if (ctx->rsrc_backup_node)
 		return 0;
-	ctx->rsrc_backup_node = io_rsrc_node_alloc();
+	ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
 	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
 }
 
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 17293ab90f64..d1555eaae81a 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -4,6 +4,8 @@ 
 
 #include <net/af_unix.h>
 
+#include "alloc_cache.h"
+
 #define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
 #define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
 #define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)
@@ -37,6 +39,7 @@  struct io_rsrc_data {
 };
 
 struct io_rsrc_node {
+	struct io_cache_entry		cache;
 	int refs;
 	struct list_head		node;
 	struct io_rsrc_data		*rsrc_data;
@@ -65,7 +68,7 @@  void io_rsrc_put_tw(struct callback_head *cb);
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
 void io_rsrc_put_work(struct work_struct *work);
 void io_wait_rsrc_data(struct io_rsrc_data *data);
-void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
+void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
 int io_rsrc_node_switch_start(struct io_ring_ctx *ctx);
 int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 			  struct io_rsrc_node *node, void *rsrc);