@@ -34,6 +34,7 @@ union futex_key {
u64 i_seq;
unsigned long pgoff;
unsigned int offset;
+ /* unsigned int node; */
} shared;
struct {
union {
@@ -42,11 +43,13 @@ union futex_key {
};
unsigned long address;
unsigned int offset;
+ /* unsigned int node; */
} private;
struct {
u64 ptr;
unsigned long word;
unsigned int offset;
+ unsigned int node; /* NOT hashed! */
} both;
};
@@ -47,12 +47,14 @@
* reside in the same cacheline.
*/
static struct {
- struct futex_hash_bucket *queues;
+ /*
+ * Number of buckets in each per-node table. futex_init() makes this a
+ * power of two, so futex_hash() can use (hashsize - 1) as a mask.
+ */
unsigned long hashsize;
+ /*
+ * ilog2(hashsize), set in futex_init(). futex_hash() shifts the hash
+ * right by this amount to get at the bits not consumed by the mask.
+ */
+ unsigned int hashshift;
+ /*
+ * One bucket table per node id; futex_init() fills in the entries
+ * visited by for_each_node().
+ *
+ * NOTE(review): this array makes the struct larger than the
+ * 2*sizeof(long) alignment below -- confirm the cacheline remark in
+ * the comment above still holds.
+ */
+ struct futex_hash_bucket *queues[MAX_NUMNODES];
} __futex_data __read_mostly __aligned(2*sizeof(long));
-#define futex_queues (__futex_data.queues)
-#define futex_hashsize (__futex_data.hashsize)
+/* Shorthand accessors for the __futex_data fields. */
+#define futex_hashsize (__futex_data.hashsize)
+#define futex_hashshift (__futex_data.hashshift)
+#define futex_queues (__futex_data.queues)
/*
* Fault injections for futexes.
@@ -105,6 +107,26 @@ late_initcall(fail_futex_debugfs);
#endif /* CONFIG_FAIL_FUTEX */
+/*
+ * futex_get_value - load one futex-sized value from user memory
+ * @val: where the loaded value is stored
+ * @from: user address to read
+ * @flags: futex flags; futex_size(@flags) selects a 1, 2 or 4 byte access
+ *
+ * The load is done with __get_user(); get_futex_key() runs access_ok() on
+ * the range before calling this.
+ *
+ * NOTE(review): any other size, including 8, ends in BUG(); presumably such
+ * flags are rejected before this point -- confirm.
+ *
+ * Return: the __get_user() result; the caller treats non-zero as a fault.
+ */
+static int futex_get_value(u32 *val, u32 __user *from, unsigned int flags)
+{
+ int ret;
+
+ switch (futex_size(flags)) {
+ case 1:
+ ret = __get_user(*val, (u8 __user *)from);
+ break;
+ case 2:
+ ret = __get_user(*val, (u16 __user *)from);
+ break;
+ case 4:
+ ret = __get_user(*val, (u32 __user *)from);
+ break;
+ default:
+ BUG();
+ }
+
+ return ret;
+}
+
+/*
+ * futex_put_value - store one futex-sized value to user memory
+ * @val: value to store
+ * @to: user address to write
+ * @flags: futex flags; futex_size(@flags) selects a 1, 2 or 4 byte access
+ *
+ * The store is done with __put_user(); get_futex_key() runs access_ok() on
+ * the range before calling this. Any other size ends in BUG().
+ *
+ * Return: the __put_user() result; the caller treats non-zero as a fault.
+ */
+static int futex_put_value(u32 val, u32 __user *to, unsigned int flags)
+{
+ int ret;
+
+ switch (futex_size(flags)) {
+ case 1:
+ ret = __put_user(val, (u8 __user *)to);
+ break;
+ case 2:
+ ret = __put_user(val, (u16 __user *)to);
+ break;
+ case 4:
+ ret = __put_user(val, (u32 __user *)to);
+ break;
+ default:
+ BUG();
+ }
+
+ return ret;
+}
+
/**
* futex_hash - Return the hash bucket in the global hash
* @key: Pointer to the futex key for which the hash is calculated
@@ -114,10 +136,20 @@ late_initcall(fail_futex_debugfs);
*/
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
- u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
+ /*
+ * Hash the words in front of both.offset and pass the offset as the
+ * initial value. both.node sits behind the offset and is left out of
+ * the hash on purpose.
+ */
+ u32 hash = jhash2((u32 *)key,
+ offsetof(typeof(*key), both.offset) / sizeof(u32),
key->both.offset);
+ /* -1 means get_futex_key() did not pin the key to a node. */
+ int node = key->both.node;
+
+ if (node == -1) {
+ /*
+ * In case of !FLAGS_NUMA, use some unused hash bits to pick a
+ * node.
+ *
+ * Node ids may be sparse, so reduce modulo nr_node_ids (the id
+ * bound) rather than num_possible_nodes() (a count), and step
+ * to the next possible node when the pick lands in a hole:
+ * futex_init() only allocates tables for possible nodes. The
+ * spread is not perfectly uniform on sparse maps, but every
+ * result has a table behind it.
+ */
+ node = (hash >> futex_hashshift) % nr_node_ids;
+ if (!node_possible(node))
+ node = next_node_in(node, node_possible_map);
+ }
- return &futex_queues[hash & (futex_hashsize - 1)];
+ return &futex_queues[node][hash & (futex_hashsize - 1)];
}
@@ -217,32 +249,64 @@ static u64 get_inode_sequence_number(str
*
* lock_page() might sleep, the caller should not hold a spinlock.
*/
-int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
+int get_futex_key(void __user *uaddr, unsigned int flags, union futex_key *key,
enum futex_access rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
struct page *page, *tail;
struct address_space *mapping;
- int err, ro = 0;
+ int node, err, size, ro = 0;
bool fshared;
fshared = flags & FLAGS_SHARED;
+ size = futex_size(flags);
/*
* The futex address must be "naturally" aligned.
*/
key->both.offset = address % PAGE_SIZE;
- if (unlikely((address % sizeof(u32)) != 0))
+ if (unlikely((address % size) != 0))
return -EINVAL;
address -= key->both.offset;
- if (unlikely(!access_ok(uaddr, sizeof(u32))))
+ if (flags & FLAGS_NUMA)
+ size *= 2;
+
+ if (unlikely(!access_ok(uaddr, size)))
return -EFAULT;
if (unlikely(should_fail_futex(fshared)))
return -EFAULT;
+ key->both.node = -1;
+ if (flags & FLAGS_NUMA) {
+ void __user *naddr = uaddr + size/2;
+
+ if (futex_get_value(&node, naddr, flags))
+ return -EFAULT;
+
+ if (node == -1) {
+ node = numa_node_id();
+ if (futex_put_value(node, naddr, flags))
+ return -EFAULT;
+ }
+
+ if (node >= num_possible_nodes())
+ return -EINVAL;
+
+ key->both.node = node;
+ }
+
+ /*
+ * Encode the futex size in the offset. This makes cross-size
+ * wake-wait fail -- see futex_match().
+ *
+ * NOTE that cross-size wake-wait is fundamentally broken wrt
+ * FLAGS_NUMA but could possibly work for !NUMA.
+ */
+ key->both.offset |= FUT_OFF_SIZE * (flags & FLAGS_SIZE_MASK);
+
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
@@ -1125,27 +1189,42 @@ void futex_exit_release(struct task_stru
+/*
+ * Boot-time setup of the futex hash: size the per-node bucket tables,
+ * allocate one table on every node visited by for_each_node() and
+ * initialise each bucket. Allocation failure is fatal (BUG_ON).
+ */
static int __init futex_init(void)
{
- unsigned int futex_shift;
- unsigned long i;
+ unsigned int order, n;
+ unsigned long size, i;
#if CONFIG_BASE_SMALL
futex_hashsize = 16;
#else
- futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+ /*
+ * Keep the previous total of 256 buckets per possible CPU, but split
+ * it evenly over the per-node tables.
+ */
+ futex_hashsize = 256 * num_possible_cpus();
+ futex_hashsize /= num_possible_nodes();
+ /*
+ * With far more possible nodes than CPUs the division can reach 0,
+ * for which roundup_pow_of_two() and ilog2() are undefined. Keep a
+ * small minimum per node.
+ */
+ if (futex_hashsize < 4)
+ futex_hashsize = 4;
+ futex_hashsize = roundup_pow_of_two(futex_hashsize);
#endif
+ /* futex_hash() uses the bits above hashshift to choose a node. */
+ futex_hashshift = ilog2(futex_hashsize);
+ size = sizeof(struct futex_hash_bucket) * futex_hashsize;
+ order = get_order(size);
+
+ for_each_node(n) {
+ struct futex_hash_bucket *table;
+
+ /* Both allocators are asked for memory on node n. */
+ if (order > MAX_ORDER)
+ table = vmalloc_huge_node(size, GFP_KERNEL, n);
+ else
+ table = alloc_pages_exact_nid(n, size, GFP_KERNEL);
+
+ BUG_ON(!table);
+
+ for (i = 0; i < futex_hashsize; i++) {
+ atomic_set(&table[i].waiters, 0);
+ spin_lock_init(&table[i].lock);
+ plist_head_init(&table[i].chain);
+ }
- futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
- futex_hashsize, 0,
- futex_hashsize < 256 ? HASH_SMALL : 0,
- &futex_shift, NULL,
- futex_hashsize, futex_hashsize);
- futex_hashsize = 1UL << futex_shift;
-
- for (i = 0; i < futex_hashsize; i++) {
- atomic_set(&futex_queues[i].waiters, 0);
- plist_head_init(&futex_queues[i].chain);
- spin_lock_init(&futex_queues[i].lock);
+ futex_queues[n] = table;
}
+ /* Entries and bytes are per node table; both are unsigned long. */
+ pr_info("futex hash table, %d nodes, %lu entries (order: %u, %lu bytes)\n",
+ num_possible_nodes(),
+ futex_hashsize, order,
+ size);
return 0;
}
@@ -158,7 +158,7 @@ enum futex_access {
FUTEX_WRITE
};
-extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
+/*
+ * Fill in @key for the futex at user address @uaddr. @uaddr is an untyped
+ * user pointer; get_futex_key() checks its alignment against
+ * futex_size(@flags).
+ */
+extern int get_futex_key(void __user *uaddr, unsigned int flags, union futex_key *key,
enum futex_access rw);
extern struct hrtimer_sleeper *
@@ -180,7 +180,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uad
return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
-#define FUTEX2_MASK (FUTEX2_64 | FUTEX2_PRIVATE)
+/*
+ * FUTEX2_* flag bits combined into one mask, now including FUTEX2_NUMA.
+ * NOTE(review): the users of this mask are outside this excerpt; presumably
+ * it is the set of futex2 flags accepted from user space -- confirm.
+ */
+#define FUTEX2_MASK (FUTEX2_64 | FUTEX2_NUMA | FUTEX2_PRIVATE)
/**
* futex_parse_waitv - Parse a waitv array from userspace