umount causes invalidate_bh_lrus() to send an IPI to every CPU that has
a non-empty per-CPU buffer_head cache:

  on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);

This interrupts CPUs that might be executing code sensitive to
interference.
To avoid the IPI, free the per-CPU caches remotely via RCU instead.
Two bh_lrus structures are allocated for each CPU: one is in use (the
per-CPU bh_lru pointer is assigned to it), and the other is being
freed (or is idle).
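
For illustration, here is the same flip-and-free pattern as a minimal,
self-contained userspace sketch built on liburcu; the names (cache,
copies, live, lookup, invalidate) and the userspace RCU API are only
stand-ins for the per-CPU bh_lru machinery changed below, not the
kernel code itself:

  /* Sketch only: userspace liburcu, not fs/buffer.c. */
  #include <stdio.h>
  #include <urcu.h>		/* default liburcu flavour */

  struct cache { int slots[4]; };

  static struct cache copies[2];		/* one live copy, one idle copy */
  static struct cache *live = &copies[0];	/* RCU-protected pointer */

  static int lookup(int i)
  {
  	int val;

  	rcu_read_lock();
  	/* Plain load of the current copy: no lock, no IPI. */
  	val = rcu_dereference(live)->slots[i];
  	rcu_read_unlock();
  	return val;
  }

  static void invalidate(void)
  {
  	/* Single updater in this sketch, so a plain load of 'live' is fine. */
  	struct cache *old = live;
  	struct cache *idle = (old == &copies[0]) ? &copies[1] : &copies[0];
  	int i;

  	rcu_assign_pointer(live, idle);	/* new lookups use the idle copy */
  	synchronize_rcu();		/* wait out readers of the old copy */
  	for (i = 0; i < 4; i++)		/* now safe to clear the old copy */
  		old->slots[i] = 0;
  }

  int main(void)
  {
  	rcu_register_thread();		/* liburcu readers must register */
  	copies[0].slots[0] = 42;
  	printf("%d\n", lookup(0));
  	invalidate();
  	printf("%d\n", lookup(0));
  	rcu_unregister_thread();
  	return 0;
  }

(Typically built with "cc sketch.c -lurcu".)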
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
@@ -1203,7 +1203,21 @@ struct bh_lru {
struct buffer_head *bhs[BH_LRU_SIZE];
};
-static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
+
+/*
+ * Allocate two bh_lrus structures for each CPU. bh_lru points to the
+ * one that is currently in use; the update path flips the pointer and
+ * then frees the old entries (consider cpu->bh_lru == bh_lrus[0]):
+ *
+ *	cpu->bh_lru = bh_lrus[1]
+ *	synchronize_rcu()
+ *	free bh's in bh_lrus[0]
+ */
+unsigned int bh_lru_idx;
+static DEFINE_PER_CPU(struct bh_lru, bh_lrus[2]) = {{{ NULL }}, {{ NULL }}};
+static DEFINE_PER_CPU(struct bh_lru, *bh_lru);
+
+static DEFINE_MUTEX(bh_lru_invalidate_mutex);
#ifdef CONFIG_SMP
#define bh_lru_lock() local_irq_disable()
@@ -1245,16 +1259,19 @@ static void bh_lru_install(struct buffer_head *bh)
return;
}
- b = this_cpu_ptr(&bh_lrus);
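+	/* invalidate_bh_lrus() may flip the per-CPU pointer; pin the current copy. */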
+ rcu_read_lock();
+ b = rcu_dereference(per_cpu(bh_lru, smp_processor_id()));
for (i = 0; i < BH_LRU_SIZE; i++) {
swap(evictee, b->bhs[i]);
if (evictee == bh) {
+ rcu_read_unlock();
bh_lru_unlock();
return;
}
}
get_bh(bh);
+ rcu_read_unlock();
bh_lru_unlock();
brelse(evictee);
}
@@ -1266,28 +1283,32 @@ static struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
{
struct buffer_head *ret = NULL;
+ struct bh_lru *lru;
unsigned int i;
check_irqs_on();
bh_lru_lock();
+ rcu_read_lock();
+
+ lru = rcu_dereference(per_cpu(bh_lru, smp_processor_id()));
for (i = 0; i < BH_LRU_SIZE; i++) {
- struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
+ struct buffer_head *bh = lru->bhs[i];
if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
bh->b_size == size) {
if (i) {
while (i) {
- __this_cpu_write(bh_lrus.bhs[i],
- __this_cpu_read(bh_lrus.bhs[i - 1]));
+ lru->bhs[i] = lru->bhs[i - 1];
i--;
}
- __this_cpu_write(bh_lrus.bhs[0], bh);
+ lru->bhs[0] = bh;
}
get_bh(bh);
ret = bh;
break;
}
}
+ rcu_read_unlock();
bh_lru_unlock();
return ret;
}
@@ -1381,35 +1402,56 @@ static void __invalidate_bh_lrus(struct bh_lru *b)
b->bhs[i] = NULL;
}
}
-/*
- * invalidate_bh_lrus() is called rarely - but not only at unmount.
- * This doesn't race because it runs in each cpu either in irq
- * or with preempt disabled.
- */
-static void invalidate_bh_lru(void *arg)
-{
- struct bh_lru *b = &get_cpu_var(bh_lrus);
-
- __invalidate_bh_lrus(b);
- put_cpu_var(bh_lrus);
-}
bool has_bh_in_lru(int cpu, void *dummy)
{
- struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
+ struct bh_lru *b;
int i;
-
+
+ rcu_read_lock();
+ b = rcu_dereference(per_cpu(bh_lru, cpu));
for (i = 0; i < BH_LRU_SIZE; i++) {
- if (b->bhs[i])
+ if (b->bhs[i]) {
+ rcu_read_unlock();
return true;
+ }
}
+ rcu_read_unlock();
return false;
}
+/*
+ * invalidate_bh_lrus() is called rarely - but not only at unmount.
+ */
void invalidate_bh_lrus(void)
{
- on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
+ int cpu, oidx, nidx;
+
+ mutex_lock(&bh_lru_invalidate_mutex);
+ oidx = bh_lru_idx;
+ bh_lru_idx++;
+ if (bh_lru_idx >= 2)
+ bh_lru_idx = 0;
+
+ nidx = bh_lru_idx;
+	/* Flip every online CPU's bh_lru pointer to the idle copy. */
+ cpus_read_lock();
+ for_each_online_cpu(cpu)
+ rcu_assign_pointer(per_cpu(bh_lru, cpu), per_cpu_ptr(&bh_lrus[nidx], cpu));
+ cpus_read_unlock();
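+	/* Wait for a grace period so no CPU still references its old copy. */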
+ synchronize_rcu_expedited();
+
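+	/* Readers can no longer see the old copies; drop their buffer heads. */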
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct bh_lru *b = per_cpu_ptr(&bh_lrus[oidx], cpu);
+
+ bh_lru_lock();
+ __invalidate_bh_lrus(b);
+ bh_lru_unlock();
+ }
+ cpus_read_unlock();
+ mutex_unlock(&bh_lru_invalidate_mutex);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
@@ -1422,8 +1464,10 @@ void invalidate_bh_lrus_cpu(void)
struct bh_lru *b;
bh_lru_lock();
- b = this_cpu_ptr(&bh_lrus);
+ rcu_read_lock();
+ b = rcu_dereference(per_cpu(bh_lru, smp_processor_id()));
__invalidate_bh_lrus(b);
+ rcu_read_unlock();
bh_lru_unlock();
}
@@ -2923,12 +2967,15 @@ EXPORT_SYMBOL(free_buffer_head);
static int buffer_exit_cpu_dead(unsigned int cpu)
{
int i;
- struct bh_lru *b = &per_cpu(bh_lrus, cpu);
+ struct bh_lru *b;
+ rcu_read_lock();
+ b = rcu_dereference(per_cpu(bh_lru, cpu));
for (i = 0; i < BH_LRU_SIZE; i++) {
brelse(b->bhs[i]);
b->bhs[i] = NULL;
}
+ rcu_read_unlock();
this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
per_cpu(bh_accounting, cpu).nr = 0;
return 0;
@@ -3021,7 +3068,7 @@ EXPORT_SYMBOL(__bh_read_batch);
void __init buffer_init(void)
{
unsigned long nrpages;
- int ret;
+ int ret, cpu;
bh_cachep = kmem_cache_create("buffer_head",
sizeof(struct buffer_head), 0,
@@ -3029,6 +3076,11 @@ void __init buffer_init(void)
SLAB_MEM_SPREAD),
NULL);
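+	/*
+	 * Start every CPU on its first bh_lrus copy; invalidate_bh_lrus()
+	 * flips between the two.
+	 */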
+ cpus_read_lock();
+ for_each_online_cpu(cpu)
+ per_cpu(bh_lru, cpu) = per_cpu_ptr(&bh_lrus[0], cpu);
+ cpus_read_unlock();
+
/*
* Limit the bh occupancy to 10% of ZONE_NORMAL
*/