[1/3] kernfs: Introduce separate rwsem to protect inode attributes.
Commit Message
Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple
kernfs operations. On a large system with few hundred CPUs and few
hundred applications simultaneoulsy trying to access sysfs, this
results in multiple sys_open(s) contending on kernfs_rwsem via
kernfs_iop_permission and kernfs_dop_revalidate.
For example on a system with 384 cores, if I run 200 instances of an
application which is mostly executing the following loop:
for (int loop = 0; loop <100 ; loop++)
{
for (int port_num = 1; port_num < 2; port_num++)
{
for (int gid_index = 0; gid_index < 254; gid_index++ )
{
char ret_buf[64], ret_buf_lo[64];
char gid_file_path[1024];
int ret_len;
int ret_fd;
ssize_t ret_rd;
ub4 i, saved_errno;
memset(ret_buf, 0, sizeof(ret_buf));
memset(gid_file_path, 0, sizeof(gid_file_path));
ret_len = snprintf(gid_file_path, sizeof(gid_file_path),
"/sys/class/infiniband/%s/ports/%d/gids/%d",
dev_name,
port_num,
gid_index);
ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC);
if (ret_fd < 0)
{
printf("Failed to open %s\n", gid_file_path);
continue;
}
/* Read the GID */
ret_rd = read(ret_fd, ret_buf, 40);
if (ret_rd == -1)
{
printf("Failed to read from file %s, errno: %u\n",
gid_file_path, saved_errno);
continue;
}
close(ret_fd);
}
}
I see contention around kernfs_rwsem as follows:
path_openat
|
|----link_path_walk.part.0.constprop.0
| |
| |--49.92%--inode_permission
| | |
| | --48.69%--kernfs_iop_permission
| | |
| | |--18.16%--down_read
| | |
| | |--15.38%--up_read
| | |
| | --14.58%--_raw_spin_lock
| | |
| | -----
| |
| |--29.08%--walk_component
| | |
| | --29.02%--lookup_fast
| | |
| | |--24.26%--kernfs_dop_revalidate
| | | |
| | | |--14.97%--down_read
| | | |
| | | --9.01%--up_read
| | |
| | --4.74%--__d_lookup
| | |
| | --4.64%--_raw_spin_lock
| | |
| | ----
Having a separate per-fs rwsem to protect kernfs inode attributes,
will avoid the above mentioned contention and result in better
performance as can bee seen below:
path_openat
|
|----link_path_walk.part.0.constprop.0
| |
| |
| |--27.06%--inode_permission
| | |
| | --25.84%--kernfs_iop_permission
| | |
| | |--9.29%--up_read
| | |
| | |--8.19%--down_read
| | |
| | --7.89%--_raw_spin_lock
| | |
| | ----
| |
| |--22.42%--walk_component
| | |
| | --22.36%--lookup_fast
| | |
| | |--16.07%--__d_lookup
| | | |
| | | --16.01%--_raw_spin_lock
| | | |
| | | ----
| | |
| | --6.28%--kernfs_dop_revalidate
| | |
| | |--3.76%--down_read
| | |
| | --2.26%--up_read
As can be seen from the above data the overhead due to both
kerfs_iop_permission and kernfs_dop_revalidate have gone down and
this also reduces overall run time of the earlier mentioned loop.
Signed-off-by: Imran Khan <imran.f.khan@oracle.com>
---
fs/kernfs/dir.c | 7 +++++++
fs/kernfs/inode.c | 16 ++++++++--------
fs/kernfs/kernfs-internal.h | 1 +
3 files changed, 16 insertions(+), 8 deletions(-)
Comments
On Thu, Mar 02, 2023 at 03:32:01PM +1100, Imran Khan wrote:
> +++ b/fs/kernfs/kernfs-internal.h
> @@ -47,6 +47,7 @@ struct kernfs_root {
>
> wait_queue_head_t deactivate_waitq;
> struct rw_semaphore kernfs_rwsem;
> + struct rw_semaphore kernfs_iattr_rwsem;
> };
>
> /* +1 to avoid triggering overflow warning when negating it */
Can you explain why we want one iattr rwsem per kernfs_root instead of
one rwsem per kernfs_node?
Hello Matthew,
Thanks for reviewing this,
On 3/3/2023 3:14 am, Matthew Wilcox wrote:
> On Thu, Mar 02, 2023 at 03:32:01PM +1100, Imran Khan wrote:
>> +++ b/fs/kernfs/kernfs-internal.h
>> @@ -47,6 +47,7 @@ struct kernfs_root {
>>
>> wait_queue_head_t deactivate_waitq;
>> struct rw_semaphore kernfs_rwsem;
>> + struct rw_semaphore kernfs_iattr_rwsem;
>> };
>>
>> /* +1 to avoid triggering overflow warning when negating it */
>
> Can you explain why we want one iattr rwsem per kernfs_root instead of
> one rwsem per kernfs_node?
Having an iattr rwsem per kernfs_node would increase memory usage due to
kobjects. I had tried per kernfs_node approach for a couple of other global
locks earlier [1], but that approach was not encouraged because it would impact
ability to use sysfs on low memory (especially 32 bit) systems. This was the
reason for keeping iattr rwsem in kernfs_root.
Thanks,
Imran
[1]: https://lore.kernel.org/all/YdLH6qQNxa11YmRO@kroah.com/
@@ -770,12 +770,15 @@ int kernfs_add_one(struct kernfs_node *kn)
goto out_unlock;
/* Update timestamps on the parent */
+ down_write(&root->kernfs_iattr_rwsem);
+
ps_iattr = parent->iattr;
if (ps_iattr) {
ktime_get_real_ts64(&ps_iattr->ia_ctime);
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
+ up_write(&root->kernfs_iattr_rwsem);
up_write(&root->kernfs_rwsem);
/*
@@ -940,6 +943,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
idr_init(&root->ino_idr);
init_rwsem(&root->kernfs_rwsem);
+ init_rwsem(&root->kernfs_iattr_rwsem);
INIT_LIST_HEAD(&root->supers);
/*
@@ -1462,11 +1466,14 @@ static void __kernfs_remove(struct kernfs_node *kn)
pos->parent ? pos->parent->iattr : NULL;
/* update timestamps on the parent */
+ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
+
if (ps_iattr) {
ktime_get_real_ts64(&ps_iattr->ia_ctime);
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
+ up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
kernfs_put(pos);
}
@@ -101,9 +101,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
int ret;
struct kernfs_root *root = kernfs_root(kn);
- down_write(&root->kernfs_rwsem);
+ down_write(&root->kernfs_iattr_rwsem);
ret = __kernfs_setattr(kn, iattr);
- up_write(&root->kernfs_rwsem);
+ up_write(&root->kernfs_iattr_rwsem);
return ret;
}
@@ -119,7 +119,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
return -EINVAL;
root = kernfs_root(kn);
- down_write(&root->kernfs_rwsem);
+ down_write(&root->kernfs_iattr_rwsem);
error = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
if (error)
goto out;
@@ -132,7 +132,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
setattr_copy(&nop_mnt_idmap, inode, iattr);
out:
- up_write(&root->kernfs_rwsem);
+ up_write(&root->kernfs_iattr_rwsem);
return error;
}
@@ -189,10 +189,10 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap,
struct kernfs_node *kn = inode->i_private;
struct kernfs_root *root = kernfs_root(kn);
- down_read(&root->kernfs_rwsem);
+ down_read(&root->kernfs_iattr_rwsem);
kernfs_refresh_inode(kn, inode);
generic_fillattr(&nop_mnt_idmap, inode, stat);
- up_read(&root->kernfs_rwsem);
+ up_read(&root->kernfs_iattr_rwsem);
return 0;
}
@@ -285,10 +285,10 @@ int kernfs_iop_permission(struct mnt_idmap *idmap,
kn = inode->i_private;
root = kernfs_root(kn);
- down_read(&root->kernfs_rwsem);
+ down_read(&root->kernfs_iattr_rwsem);
kernfs_refresh_inode(kn, inode);
ret = generic_permission(&nop_mnt_idmap, inode, mask);
- up_read(&root->kernfs_rwsem);
+ up_read(&root->kernfs_iattr_rwsem);
return ret;
}
@@ -47,6 +47,7 @@ struct kernfs_root {
wait_queue_head_t deactivate_waitq;
struct rw_semaphore kernfs_rwsem;
+ struct rw_semaphore kernfs_iattr_rwsem;
};
/* +1 to avoid triggering overflow warning when negating it */