fs/buffer.c: remove per-CPU buffer_head lookup cache
Commit Message
For certain types of applications (for example PLC software or
RAN processing), once an event occurs, a certain task must be
completed within a maximum amount of time (the deadline).
One way to express this requirement is with a pair of numbers,
deadline time and execution time, where:
* deadline time: length of time between the event and the deadline.
* execution time: length of time it takes to process the event on a
particular hardware platform (uninterrupted).
The particular values depend on the use-case. For the case
where the realtime application executes in a virtualized guest,
an IPI which must be serviced in the host causes the following
sequence of events:
1) VM-exit
2) execution of IPI (and function call)
3) VM-entry
This adds in excess of 50us of latency, as observed by cyclictest
(which violates the latency requirement of a vRAN application with
a 1ms TTI, for example).
invalidate_bh_lrus() sends an IPI to each CPU that has a non-empty
per-CPU buffer_head cache:

        on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
Upon closer investigation, it was found that in the current codebase
lookup_bh_lru() is slower than __find_get_block_slow():
114 ns per __find_get_block
68 ns per __find_get_block_slow
So remove the per-CPU buffer_head caching.
Test program:

#define NRLOOPS 200000
static int __init example_init(void)
{
        ktime_t s, e;
        s64 delta;
        int i, suc;

        bdev = blkdev_get_by_path("/dev/loop0", FMODE_READ, NULL);
        if (IS_ERR(bdev)) {
                printk(KERN_ERR "failed to load /dev/loop0\n");
                return -ENODEV;
        }

        suc = 0;
        delta = 0;
        for (i = 0; i < NRLOOPS; i++) {
                struct buffer_head *bh;

                s = ktime_get();
                bh = __find_get_block(bdev, 1, 512);
                e = ktime_get();
                if (bh) {
                        suc++;
                        __brelse(bh);
                }
                delta = delta + ktime_to_ns(ktime_sub(e, s));
        }
        printk(KERN_ERR "%lld ns per __find_get_block (suc=%d)\n", delta/NRLOOPS, suc);

        suc = 0;
        delta = 0;
        for (i = 0; i < NRLOOPS; i++) {
                struct buffer_head *bh;

                s = ktime_get();
                bh = __find_get_block_slow(bdev, 1);
                e = ktime_get();
                if (bh) {
                        suc++;
                        __brelse(bh);
                }
                delta = delta + ktime_to_ns(ktime_sub(e, s));
        }
        printk(KERN_ERR "%lld ns per __find_get_block_slow (suc=%d)\n", delta/NRLOOPS, suc);

        return 0;
}
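For reference, the test program uses a global bdev and is presumably wrapped
in standard module boilerplate roughly like the sketch below (the bdev
declaration, the exit path and the module metadata are assumptions, as is the
pre-6.5 two-argument blkdev_put() signature; this is not part of the posted
patch):

/* Before example_init(): headers and the bdev handle used by the loops. */
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/ktime.h>

static struct block_device *bdev;

/* After example_init(): release the device and register the module. */
static void __exit example_exit(void)
{
        /* Drop the reference taken by blkdev_get_by_path(). */
        if (!IS_ERR_OR_NULL(bdev))
                blkdev_put(bdev, FMODE_READ);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");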
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
block/bdev.c | 2 -
fs/buffer.c | 209 +++++-------------------------------------------------------------------------------------------------------------------------------------------------------
fs/jbd2/revoke.c | 9 ++----
fs/mpage.c | 3 --
fs/ocfs2/journal.c | 3 --
fs/reiserfs/reiserfs.h | 2 -
include/linux/buffer_head.h | 11 ++------
mm/migrate.c | 12 +-------
mm/swap.c | 4 --
9 files changed, 21 insertions(+), 234 deletions(-)
Comments
On Mon, Jun 26, 2023 at 03:04:53PM -0300, Marcelo Tosatti wrote:
> Upon closer investigation, it was found that in current codebase, lookup_bh_lru
> is slower than __find_get_block_slow:
>
> 114 ns per __find_get_block
> 68 ns per __find_get_block_slow
>
> So remove the per-CPU buffer_head caching.
LOL. That's amazing. I can't even see why it's so expensive. The
local_irq_disable(), perhaps? Your test case is the best possible
one for lookup_bh_lru() where you're not even doing the copy.
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
On Mon, Jun 26, 2023 at 03:04:53PM -0300, Marcelo Tosatti wrote:
> Upon closer investigation, it was found that in current codebase, lookup_bh_lru
> is slower than __find_get_block_slow:
>
> 114 ns per __find_get_block
> 68 ns per __find_get_block_slow
>
> So remove the per-CPU buffer_head caching.
>
> Test program:
>
> [... test program snipped; quoted in full in the patch description above ...]
It occurs to me that this is close to being the best-case scenario for
page-cache lookup as well as for lru lookup. Can you re-run it with
block 4UL * 1024 * 1024 * 1024 instead of block 1?
On Mon, Jun 26, 2023 at 07:47:42PM +0100, Matthew Wilcox wrote:
> On Mon, Jun 26, 2023 at 03:04:53PM -0300, Marcelo Tosatti wrote:
> > Upon closer investigation, it was found that in current codebase, lookup_bh_lru
> > is slower than __find_get_block_slow:
> >
> > 114 ns per __find_get_block
> > 68 ns per __find_get_block_slow
> >
> > So remove the per-CPU buffer_head caching.
>
> LOL. That's amazing. I can't even see why it's so expensive. The
> local_irq_disable(), perhaps? Your test case is the best possible
> one for lookup_bh_lru() where you're not even doing the copy.
I think it's even simpler than that.
i.e. the lookaside cache is being missed, so it's a pure cost and
the code is always having to call __find_get_block_slow() anyway.
Peeking at 16 buffers and not finding a match is just as expensive as
walking 3-4 tree levels in an XArray to find the buffer in the first
place....
IMO, this is an example of how lookaside caches are only a benefit
if the working set of items largely fits in the lookaside cache and
the cache lookup itself is much, much slower than a lookaside cache
miss.
Cheers,
Dave.
On Tue, Jun 27, 2023 at 09:30:09AM +1000, Dave Chinner wrote:
> On Mon, Jun 26, 2023 at 07:47:42PM +0100, Matthew Wilcox wrote:
> > On Mon, Jun 26, 2023 at 03:04:53PM -0300, Marcelo Tosatti wrote:
> > > Upon closer investigation, it was found that in current codebase, lookup_bh_lru
> > > is slower than __find_get_block_slow:
> > >
> > > 114 ns per __find_get_block
> > > 68 ns per __find_get_block_slow
> > >
> > > So remove the per-CPU buffer_head caching.
> >
> > LOL. That's amazing. I can't even see why it's so expensive. The
> > local_irq_disable(), perhaps? Your test case is the best possible
> > one for lookup_bh_lru() where you're not even doing the copy.
>
> I think it's even simpler than that.
>
> i.e. the lookaside cache is being missed, so it's a pure cost and
> the code is always having to call __find_get_block_slow() anyway.
How does that happen?
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *bh = lookup_bh_lru(bdev, block, size);

        if (bh == NULL) {
                /* __find_get_block_slow will mark the page accessed */
                bh = __find_get_block_slow(bdev, block);
                if (bh)
                        bh_lru_install(bh);
The second (and all subsequent) calls to __find_get_block() should find
the BH in the LRU.
> IMO, this is an example of how lookaside caches are only a benefit
> if the working set of items largely fits in the lookaside cache and
> the cache lookup itself is much, much slower than a lookaside cache
> miss.
But the test code he posted always asks for the same buffer each time.
So it should find it in the lookaside cache?
On Tue, Jun 27, 2023 at 01:13:25AM +0100, Matthew Wilcox wrote:
> On Tue, Jun 27, 2023 at 09:30:09AM +1000, Dave Chinner wrote:
> > On Mon, Jun 26, 2023 at 07:47:42PM +0100, Matthew Wilcox wrote:
> > > On Mon, Jun 26, 2023 at 03:04:53PM -0300, Marcelo Tosatti wrote:
> > > > Upon closer investigation, it was found that in current codebase, lookup_bh_lru
> > > > is slower than __find_get_block_slow:
> > > >
> > > > 114 ns per __find_get_block
> > > > 68 ns per __find_get_block_slow
> > > >
> > > > So remove the per-CPU buffer_head caching.
> > >
> > > LOL. That's amazing. I can't even see why it's so expensive. The
> > > local_irq_disable(), perhaps? Your test case is the best possible
> > > one for lookup_bh_lru() where you're not even doing the copy.
> >
> > I think it's even simpler than that.
> >
> > i.e. the lookaside cache is being missed, so it's a pure cost and
> > the code is always having to call __find_get_block_slow() anyway.
>
> How does that happen?
>
> __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
> {
>         struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
>
>         if (bh == NULL) {
>                 /* __find_get_block_slow will mark the page accessed */
>                 bh = __find_get_block_slow(bdev, block);
>                 if (bh)
>                         bh_lru_install(bh);
>
> The second (and all subsequent) calls to __find_get_block() should find
> the BH in the LRU.
>
> > IMO, this is an example of how lookaside caches are only a benefit
> > if the working set of items largely fits in the lookaside cache and
> > the cache lookup itself is much, much slower than a lookaside cache
> > miss.
>
> But the test code he posted always asks for the same buffer each time.
> So it should find it in the lookaside cache?
Oh.
        for (i = 0; ....) {
                bh = __find_get_block(bdev, 1, 512);
That's a '1' being passed to __find_get_block, not 'i'.
/me goes and gets more coffee.
Maybe it's CONFIG_PREEMPT_RT=y doing something to the locks that
isn't obvious here...
-Dave.
Nice one!
Reviewed-by: Christoph Hellwig <hch@lst.de>
On Mon, Jun 26, 2023 at 07:47:42PM +0100, Matthew Wilcox wrote:
> On Mon, Jun 26, 2023 at 03:04:53PM -0300, Marcelo Tosatti wrote:
> > Upon closer investigation, it was found that in current codebase, lookup_bh_lru
> > is slower than __find_get_block_slow:
> >
> > 114 ns per __find_get_block
> > 68 ns per __find_get_block_slow
> >
> > So remove the per-CPU buffer_head caching.
>
> LOL. That's amazing. I can't even see why it's so expensive. The
> local_irq_disable(), perhaps? Your test case is the best possible
> one for lookup_bh_lru() where you're not even doing the copy.
Oops, that was due to an incorrect buffer size being looked up versus
the installed size.
About 15ns is due to IRQ disablement, and 6ns is due to checking that
preemption is disabled (from __this_cpu_read).
So the actual numbers for the single block lookup are
(searching for the same block number repeatedly):
42 ns per __find_get_block
68 ns per __find_get_block_slow
And it increases linearly as the test increases the number of blocks
which are searched for; for example, mod 3 means the loop cycles
through:
__find_get_block(blocknr=1)
__find_get_block(blocknr=2)
__find_get_block(blocknr=3)
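In loop form that presumably corresponds to something like the sketch below
(the mod variable and the exact loop shape are assumptions based on the
description above):

        /* Cycle through 'mod' distinct block numbers instead of always block 1. */
        for (i = 0; i < NRLOOPS; i++) {
                struct buffer_head *bh;

                s = ktime_get();
                bh = __find_get_block(bdev, (i % mod) + 1, 512);
                e = ktime_get();
                if (bh) {
                        suc++;
                        __brelse(bh);
                }
                delta += ktime_to_ns(ktime_sub(e, s));
        }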
41 ns per __find_get_block mod=1
41 ns per __find_get_block mod=2
42 ns per __find_get_block mod=3
43 ns per __find_get_block mod=4
45 ns per __find_get_block mod=5
48 ns per __find_get_block mod=6
48 ns per __find_get_block mod=7
49 ns per __find_get_block mod=8
51 ns per __find_get_block mod=9
52 ns per __find_get_block mod=10
54 ns per __find_get_block mod=11
56 ns per __find_get_block mod=12
58 ns per __find_get_block mod=13
60 ns per __find_get_block mod=14
61 ns per __find_get_block mod=15
63 ns per __find_get_block mod=16
138 ns per __find_get_block mod=17
138 ns per __find_get_block mod=18
138 ns per __find_get_block mod=19 <-- results from first patch, when
lookup_bh_lru is a miss
70 ns per __find_get_block_slow mod=1
71 ns per __find_get_block_slow mod=2
71 ns per __find_get_block_slow mod=3
71 ns per __find_get_block_slow mod=4
71 ns per __find_get_block_slow mod=5
72 ns per __find_get_block_slow mod=6
71 ns per __find_get_block_slow mod=7
72 ns per __find_get_block_slow mod=8
71 ns per __find_get_block_slow mod=9
71 ns per __find_get_block_slow mod=10
71 ns per __find_get_block_slow mod=11
71 ns per __find_get_block_slow mod=12
71 ns per __find_get_block_slow mod=13
71 ns per __find_get_block_slow mod=14
71 ns per __find_get_block_slow mod=15
71 ns per __find_get_block_slow mod=16
71 ns per __find_get_block_slow mod=17
72 ns per __find_get_block_slow mod=18
72 ns per __find_get_block_slow mod=19
ls on home directory:
hits: 2 misses: 91
find on a linux-2.6 git tree:
hits: 25453 misses: 51084
make clean on a linux-2.6 git tree:
hits: 247615 misses: 32855
make on a linux-2.6 git tree:
hits: 1410414 misses: 166896
In more detail, each bucket below indicates at which index in the
per-CPU array lookup_bh_lru() found the buffer:
hits idx1 idx2 ... idx16
hits 139506 24299 21597 7462 15790 19108 6477 1349 1237 938 845 636 637 523 431 454 misses: 65773
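These hit/miss counts were presumably collected with ad-hoc instrumentation
in lookup_bh_lru(), along the lines of the sketch below (the counter names
are assumptions, not the actual code used; the per-CPU pattern mirrors the
existing bh_accounting counters in fs/buffer.c):

/* Per-CPU counters: one hit counter per LRU slot, plus a miss counter. */
struct bh_lru_stats {
        unsigned long hit[BH_LRU_SIZE];
        unsigned long miss;
};
static DEFINE_PER_CPU(struct bh_lru_stats, bh_lru_stats);

/* In lookup_bh_lru(), after the search loop: */
        if (ret)
                __this_cpu_inc(bh_lru_stats.hit[i]);    /* i == index where bh was found */
        else
                __this_cpu_inc(bh_lru_stats.miss);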
So I think it makes more sense to just disable the cache for isolated
CPUs.
> Reviewed-by: Matthew Wilcox (oracle) <willy@infradead.org>
Thanks.
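A minimal sketch of that isolated-CPU alternative, expressed against the
existing bh_lru_install() (it assumes the cpu_is_isolated() helper from
<linux/sched/isolation.h>, and is only an illustration of the idea, not the
patch whose hunks follow below):

--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ ... @@ static void bh_lru_install(struct buffer_head *bh)
-        if (lru_cache_disabled()) {
+        /* Isolated CPUs never cache bhs, so they never need the IPI. */
+        if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) {
                 bh_lru_unlock();
                 return;
         }

The idea is that isolated CPUs never add entries to their bh_lru, so
has_bh_in_lru() stays false for them and invalidate_bh_lrus() has no reason
to IPI them, while housekeeping CPUs keep the lookaside cache.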
@@ -72,7 +72,6 @@ static void kill_bdev(struct block_device *bdev)
         if (mapping_empty(mapping))
                 return;
-        invalidate_bh_lrus();
         truncate_inode_pages(mapping, 0);
 }
@@ -82,7 +81,6 @@ void invalidate_bdev(struct block_device *bdev)
         struct address_space *mapping = bdev->bd_inode->i_mapping;
         if (mapping->nrpages) {
-                invalidate_bh_lrus();
                 lru_add_drain_all(); /* make sure all lru add caches are flushed */
                 invalidate_mapping_pages(mapping, 0, -1);
         }
@@ -186,8 +186,8 @@ EXPORT_SYMBOL(end_buffer_write_sync);
  * may be quite high. This code could TryLock the page, and if that
  * succeeds, there is no need to take private_lock.
  */
-static struct buffer_head *
-__find_get_block_slow(struct block_device *bdev, sector_t block)
+struct buffer_head *
+__find_get_block(struct block_device *bdev, sector_t block)
 {
         struct inode *bd_inode = bdev->bd_inode;
         struct address_space *bd_mapping = bd_inode->i_mapping;
@@ -227,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
          */
         ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
         if (all_mapped && __ratelimit(&last_warned)) {
-                printk("__find_get_block_slow() failed. block=%llu, "
+                printk("__find_get_block() failed. block=%llu, "
                        "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
                        "device %pg blocksize: %d\n",
                        (unsigned long long)block,
@@ -241,6 +241,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 out:
         return ret;
 }
+EXPORT_SYMBOL(__find_get_block);
 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 {
@@ -598,10 +599,9 @@ EXPORT_SYMBOL(sync_mapping_buffers);
  * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
  * dirty, schedule it for IO. So that indirects merge nicely with their data.
  */
-void write_boundary_block(struct block_device *bdev,
-                        sector_t bblock, unsigned blocksize)
+void write_boundary_block(struct block_device *bdev, sector_t bblock)
 {
-        struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
+        struct buffer_head *bh = __find_get_block(bdev, bblock + 1);
         if (bh) {
                 if (buffer_dirty(bh))
                         write_dirty_buffer(bh, 0);
@@ -1080,7 +1080,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
         struct buffer_head *bh;
         int ret;
-        bh = __find_get_block(bdev, block, size);
+        bh = __find_get_block(bdev, block);
         if (bh)
                 return bh;
@@ -1232,137 +1232,6 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh)
         return NULL;
 }
-/*
- * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
- * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
- * refcount elevated by one when they're in an LRU. A buffer can only appear
- * once in a particular CPU's LRU. A single buffer can be present in multiple
- * CPU's LRUs at the same time.
- *
- * This is a transparent caching front-end to sb_bread(), sb_getblk() and
- * sb_find_get_block().
- *
- * The LRUs themselves only need locking against invalidate_bh_lrus. We use
- * a local interrupt disable for that.
- */
-
-#define BH_LRU_SIZE 16
-
-struct bh_lru {
-        struct buffer_head *bhs[BH_LRU_SIZE];
-};
-
-static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
-
-#ifdef CONFIG_SMP
-#define bh_lru_lock() local_irq_disable()
-#define bh_lru_unlock() local_irq_enable()
-#else
-#define bh_lru_lock() preempt_disable()
-#define bh_lru_unlock() preempt_enable()
-#endif
-
-static inline void check_irqs_on(void)
-{
-#ifdef irqs_disabled
-        BUG_ON(irqs_disabled());
-#endif
-}
-
-/*
- * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
- * inserted at the front, and the buffer_head at the back if any is evicted.
- * Or, if already in the LRU it is moved to the front.
- */
-static void bh_lru_install(struct buffer_head *bh)
-{
-        struct buffer_head *evictee = bh;
-        struct bh_lru *b;
-        int i;
-
-        check_irqs_on();
-        bh_lru_lock();
-
-        /*
-         * the refcount of buffer_head in bh_lru prevents dropping the
-         * attached page(i.e., try_to_free_buffers) so it could cause
-         * failing page migration.
-         * Skip putting upcoming bh into bh_lru until migration is done.
-         */
-        if (lru_cache_disabled()) {
-                bh_lru_unlock();
-                return;
-        }
-
-        b = this_cpu_ptr(&bh_lrus);
-        for (i = 0; i < BH_LRU_SIZE; i++) {
-                swap(evictee, b->bhs[i]);
-                if (evictee == bh) {
-                        bh_lru_unlock();
-                        return;
-                }
-        }
-
-        get_bh(bh);
-        bh_lru_unlock();
-        brelse(evictee);
-}
-
-/*
- * Look up the bh in this cpu's LRU. If it's there, move it to the head.
- */
-static struct buffer_head *
-lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
-{
-        struct buffer_head *ret = NULL;
-        unsigned int i;
-
-        check_irqs_on();
-        bh_lru_lock();
-        for (i = 0; i < BH_LRU_SIZE; i++) {
-                struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
-
-                if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
-                    bh->b_size == size) {
-                        if (i) {
-                                while (i) {
-                                        __this_cpu_write(bh_lrus.bhs[i],
-                                                __this_cpu_read(bh_lrus.bhs[i - 1]));
-                                        i--;
-                                }
-                                __this_cpu_write(bh_lrus.bhs[0], bh);
-                        }
-                        get_bh(bh);
-                        ret = bh;
-                        break;
-                }
-        }
-        bh_lru_unlock();
-        return ret;
-}
-
-/*
- * Perform a pagecache lookup for the matching buffer. If it's there, refresh
- * it in the LRU and mark it as accessed. If it is not present then return
- * NULL
- */
-struct buffer_head *
-__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
-{
-        struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
-
-        if (bh == NULL) {
-                /* __find_get_block_slow will mark the page accessed */
-                bh = __find_get_block_slow(bdev, block);
-                if (bh)
-                        bh_lru_install(bh);
-        } else
-                touch_buffer(bh);
-
-        return bh;
-}
-EXPORT_SYMBOL(__find_get_block);
-
 /*
  * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
  * which corresponds to the passed block_device, block and size. The
@@ -1375,7 +1244,7 @@ struct buffer_head *
 __getblk_gfp(struct block_device *bdev, sector_t block,
              unsigned size, gfp_t gfp)
 {
-        struct buffer_head *bh = __find_get_block(bdev, block, size);
+        struct buffer_head *bh = __find_get_block(bdev, block);
         might_sleep();
         if (bh == NULL)
@@ -1421,61 +1290,6 @@ __bread_gfp(struct block_device *bdev, sector_t block,
 }
 EXPORT_SYMBOL(__bread_gfp);
-static void __invalidate_bh_lrus(struct bh_lru *b)
-{
-        int i;
-
-        for (i = 0; i < BH_LRU_SIZE; i++) {
-                brelse(b->bhs[i]);
-                b->bhs[i] = NULL;
-        }
-}
-/*
- * invalidate_bh_lrus() is called rarely - but not only at unmount.
- * This doesn't race because it runs in each cpu either in irq
- * or with preempt disabled.
- */
-static void invalidate_bh_lru(void *arg)
-{
-        struct bh_lru *b = &get_cpu_var(bh_lrus);
-
-        __invalidate_bh_lrus(b);
-        put_cpu_var(bh_lrus);
-}
-
-bool has_bh_in_lru(int cpu, void *dummy)
-{
-        struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
-        int i;
-
-        for (i = 0; i < BH_LRU_SIZE; i++) {
-                if (b->bhs[i])
-                        return true;
-        }
-
-        return false;
-}
-
-void invalidate_bh_lrus(void)
-{
-        on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
-}
-EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
-
-/*
- * It's called from workqueue context so we need a bh_lru_lock to close
- * the race with preemption/irq.
- */
-void invalidate_bh_lrus_cpu(void)
-{
-        struct bh_lru *b;
-
-        bh_lru_lock();
-        b = this_cpu_ptr(&bh_lrus);
-        __invalidate_bh_lrus(b);
-        bh_lru_unlock();
-}
-
 void set_bh_page(struct buffer_head *bh,
                 struct page *page, unsigned long offset)
 {
@@ -2997,13 +2811,6 @@ EXPORT_SYMBOL(free_buffer_head);
 static int buffer_exit_cpu_dead(unsigned int cpu)
 {
-        int i;
-        struct bh_lru *b = &per_cpu(bh_lrus, cpu);
-
-        for (i = 0; i < BH_LRU_SIZE; i++) {
-                brelse(b->bhs[i]);
-                b->bhs[i] = NULL;
-        }
         this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
         per_cpu(bh_accounting, cpu).nr = 0;
         return 0;
@@ -345,7 +345,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
         bh = bh_in;
         if (!bh) {
-                bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
+                bh = __find_get_block(bdev, blocknr);
                 if (bh)
                         BUFFER_TRACE(bh, "found on hash");
         }
@@ -355,7 +355,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
                 /* If there is a different buffer_head lying around in
                  * memory anywhere... */
-                bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
+                bh2 = __find_get_block(bdev, blocknr);
                 if (bh2) {
                         /* ... and it has RevokeValid status... */
                         if (bh2 != bh && buffer_revokevalid(bh2))
@@ -466,7 +466,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
          * state machine will get very upset later on. */
         if (need_cancel) {
                 struct buffer_head *bh2;
-                bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
+                bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr);
                 if (bh2) {
                         if (bh2 != bh)
                                 clear_buffer_revoked(bh2);
@@ -496,8 +496,7 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal)
                         struct buffer_head *bh;
                         record = (struct jbd2_revoke_record_s *)list_entry;
                         bh = __find_get_block(journal->j_fs_dev,
-                                              record->blocknr,
-                                              journal->j_blocksize);
+                                              record->blocknr);
                         if (bh) {
                                 clear_buffer_revoked(bh);
                                 __brelse(bh);
@@ -634,8 +634,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
         if (boundary || (first_unmapped != blocks_per_page)) {
                 bio = mpage_bio_submit_write(bio);
                 if (boundary_block) {
-                        write_boundary_block(boundary_bdev,
-                                        boundary_block, 1 << blkbits);
+                        write_boundary_block(boundary_bdev, boundary_block);
                 }
         } else {
                 mpd->last_block_in_bio = blocks[blocks_per_page - 1];
@@ -1212,8 +1212,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
                 }
                 for (i = 0; i < p_blocks; i++, p_blkno++) {
-                        bh = __find_get_block(osb->sb->s_bdev, p_blkno,
-                                              osb->sb->s_blocksize);
+                        bh = __find_get_block(osb->sb->s_bdev, p_blkno);
                         /* block not cached. */
                         if (!bh)
                                 continue;
@@ -2810,7 +2810,7 @@ struct reiserfs_journal_header {
 #define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
 /* We need these to make journal.c code more readable */
-#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block)
 #define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 #define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
@@ -227,8 +227,7 @@ static inline void clean_bdev_bh_alias(struct buffer_head *bh)
 void mark_buffer_async_write(struct buffer_head *bh);
 void __wait_on_buffer(struct buffer_head *);
 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
-struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
-                                     unsigned size);
+struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block);
 struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
                                  unsigned size, gfp_t gfp);
 void __brelse(struct buffer_head *);
@@ -236,9 +235,6 @@ void __bforget(struct buffer_head *);
 void __breadahead(struct block_device *, sector_t block, unsigned int size);
 struct buffer_head *__bread_gfp(struct block_device *,
                                 sector_t block, unsigned size, gfp_t gfp);
-void invalidate_bh_lrus(void);
-void invalidate_bh_lrus_cpu(void);
-bool has_bh_in_lru(int cpu, void *dummy);
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
 void free_buffer_head(struct buffer_head * bh);
 void unlock_buffer(struct buffer_head *bh);
@@ -247,8 +243,7 @@ int sync_dirty_buffer(struct buffer_head *bh);
 int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
 void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
 void submit_bh(blk_opf_t, struct buffer_head *);
-void write_boundary_block(struct block_device *bdev,
-                        sector_t bblock, unsigned blocksize);
+void write_boundary_block(struct block_device *bdev, sector_t bblock);
 int bh_uptodate_or_lock(struct buffer_head *bh);
 int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
 void __bh_read_batch(int nr, struct buffer_head *bhs[],
@@ -375,7 +370,7 @@ sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
 static inline struct buffer_head *
 sb_find_get_block(struct super_block *sb, sector_t block)
 {
-        return __find_get_block(sb->s_bdev, block, sb->s_blocksize);
+        return __find_get_block(sb->s_bdev, block);
 }
 static inline void
@@ -747,9 +747,7 @@ static int __buffer_migrate_folio(struct address_space *mapping,
         if (check_refs) {
                 bool busy;
-                bool invalidated = false;
-recheck_buffers:
                 busy = false;
                 spin_lock(&mapping->private_lock);
                 bh = head;
@@ -761,14 +759,8 @@ static int __buffer_migrate_folio(struct address_space *mapping,
                         bh = bh->b_this_page;
                 } while (bh != head);
                 if (busy) {
-                        if (invalidated) {
-                                rc = -EAGAIN;
-                                goto unlock_buffers;
-                        }
-                        spin_unlock(&mapping->private_lock);
-                        invalidate_bh_lrus();
-                        invalidated = true;
-                        goto recheck_buffers;
+                        rc = -EAGAIN;
+                        goto unlock_buffers;
                 }
         }
@@ -765,7 +765,6 @@ static void lru_add_and_bh_lrus_drain(void)
         local_lock(&cpu_fbatches.lock);
         lru_add_drain_cpu(smp_processor_id());
         local_unlock(&cpu_fbatches.lock);
-        invalidate_bh_lrus_cpu();
         mlock_drain_local();
 }
@@ -798,8 +797,7 @@ static bool cpu_needs_drain(unsigned int cpu)
                 folio_batch_count(&fbatches->lru_deactivate) ||
                 folio_batch_count(&fbatches->lru_lazyfree) ||
                 folio_batch_count(&fbatches->activate) ||
-                need_mlock_drain(cpu) ||
-                has_bh_in_lru(cpu, NULL);
+                need_mlock_drain(cpu);
 }
 /*