[v3] proc: report open files as size in stat() for /proc/pid/fd
Commit Message
Many monitoring tools include open file count as a metric. Currently
the only way to get this number is to enumerate the files in /proc/pid/fd.
The problem with the current approach is that it does many things people
generally don't care about when they need one number for a metric.
In our tests for cadvisor, which reports open file counts per cgroup,
we observed that reading the number of open files is slow. Out of 35.23%
of CPU time spent in `proc_readfd_common`, we see 29.43% spent in
`proc_fill_cache`, which is responsible for filling dentry info.
Some of this extra time is spinlock contention, but it's a contention
for the lock we don't want to take to begin with.
We considered putting the number of open files in /proc/pid/status.
Unfortunately, counting the number of fds involves iterating the open_files
bitmap, which has a linear complexity in proportion with the number
of open files (bitmap slots really, but it's close). We don't want
to make /proc/pid/status any slower, so instead we put this info
in /proc/pid/fd as a size member of the stat syscall result.
Previously the reported number was zero, so there's very little
risk of breaking anything, while still providing a somewhat logical
way to count the open files with a fallback if it's zero.
RFC for this patch included iterating open fds under RCU. Thanks
to Frank Hofmann for the suggestion to use the bitmap instead.
Previously:
```
$ sudo stat /proc/1/fd | head -n2
File: /proc/1/fd
Size: 0 Blocks: 0 IO Block: 1024 directory
```
With this patch:
```
$ sudo stat /proc/1/fd | head -n2
File: /proc/1/fd
Size: 65 Blocks: 0 IO Block: 1024 directory
```
Correctness check:
```
$ sudo ls /proc/1/fd | wc -l
65
```
I added the docs for /proc/<pid>/fd while I'm at it.
Signed-off-by: Ivan Babrou <ivan@cloudflare.com>
---
v3: Made use of bitmap_weight() to count the bits.
v2: Added missing rcu_read_lock() / rcu_read_unlock(),
task_lock() / task_unlock() and put_task_struct().
---
Documentation/filesystems/proc.rst | 17 +++++++++++++
fs/proc/fd.c | 41 ++++++++++++++++++++++++++++++
2 files changed, 58 insertions(+)
Comments
On Mon, Oct 17, 2022 at 09:58:44PM -0700, Ivan Babrou wrote:
> Many monitoring tools include open file count as a metric. Currently
> the only way to get this number is to enumerate the files in /proc/pid/fd.
>
> The problem with the current approach is that it does many things people
> generally don't care about when they need one number for a metric.
> In our tests for cadvisor, which reports open file counts per cgroup,
> we observed that reading the number of open files is slow. Out of 35.23%
> of CPU time spent in `proc_readfd_common`, we see 29.43% spent in
> `proc_fill_cache`, which is responsible for filling dentry info.
> Some of this extra time is spinlock contention, but it's a contention
> for the lock we don't want to take to begin with.
>
> We considered putting the number of open files in /proc/pid/status.
> Unfortunately, counting the number of fds involves iterating the open_files
> bitmap, which has a linear complexity in proportion with the number
> of open files (bitmap slots really, but it's close). We don't want
> to make /proc/pid/status any slower, so instead we put this info
> in /proc/pid/fd as a size member of the stat syscall result.
> Previously the reported number was zero, so there's very little
> risk of breaking anything, while still providing a somewhat logical
> way to count the open files with a fallback if it's zero.
>
> RFC for this patch included iterating open fds under RCU. Thanks
> to Frank Hofmann for the suggestion to use the bitmap instead.
>
> Previously:
>
> ```
> $ sudo stat /proc/1/fd | head -n2
> File: /proc/1/fd
> Size: 0 Blocks: 0 IO Block: 1024 directory
> ```
>
> With this patch:
>
> ```
> $ sudo stat /proc/1/fd | head -n2
> File: /proc/1/fd
> Size: 65 Blocks: 0 IO Block: 1024 directory
> ```
>
> Correctness check:
>
> ```
> $ sudo ls /proc/1/fd | wc -l
> 65
> ```
>
> I added the docs for /proc/<pid>/fd while I'm at it.
>
> Signed-off-by: Ivan Babrou <ivan@cloudflare.com>
>
> ---
> v3: Made use of bitmap_weight() to count the bits.
> v2: Added missing rcu_read_lock() / rcu_read_unlock(),
> task_lock() / task_unlock() and put_task_struct().
> ---
> Documentation/filesystems/proc.rst | 17 +++++++++++++
> fs/proc/fd.c | 41 ++++++++++++++++++++++++++++++
> 2 files changed, 58 insertions(+)
>
...
> diff --git a/fs/proc/fd.c b/fs/proc/fd.c
> index 913bef0d2a36..439a62c59381 100644
> --- a/fs/proc/fd.c
> +++ b/fs/proc/fd.c
> @@ -279,6 +279,31 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
> return 0;
> }
>
> +static int proc_readfd_count(struct inode *inode)
> +{
> + struct task_struct *p = get_proc_task(inode);
> + struct fdtable *fdt;
> + unsigned int open_fds = 0;
> +
> + if (!p)
> + return -ENOENT;
Maybe this shouldn't happen, but do you mean to assign the error code to
stat->size in the caller? Otherwise this seems reasonable to me.
Brian
> +
> + task_lock(p);
> + if (p->files) {
> + rcu_read_lock();
> +
> + fdt = files_fdtable(p->files);
> + open_fds = bitmap_weight(fdt->open_fds, fdt->max_fds);
> +
> + rcu_read_unlock();
> + }
> + task_unlock(p);
> +
> + put_task_struct(p);
> +
> + return open_fds;
> +}
> +
> static int proc_readfd(struct file *file, struct dir_context *ctx)
> {
> return proc_readfd_common(file, ctx, proc_fd_instantiate);
> @@ -319,9 +344,25 @@ int proc_fd_permission(struct user_namespace *mnt_userns,
> return rv;
> }
>
> +static int proc_fd_getattr(struct user_namespace *mnt_userns,
> + const struct path *path, struct kstat *stat,
> + u32 request_mask, unsigned int query_flags)
> +{
> + struct inode *inode = d_inode(path->dentry);
> +
> + generic_fillattr(&init_user_ns, inode, stat);
> +
> + /* If it's a directory, put the number of open fds there */
> + if (S_ISDIR(inode->i_mode))
> + stat->size = proc_readfd_count(inode);
> +
> + return 0;
> +}
> +
> const struct inode_operations proc_fd_inode_operations = {
> .lookup = proc_lookupfd,
> .permission = proc_fd_permission,
> + .getattr = proc_fd_getattr,
> .setattr = proc_setattr,
> };
>
> --
> 2.37.3
>
On Tue, Oct 18, 2022 at 11:16 AM Brian Foster <bfoster@redhat.com> wrote:
> > +static int proc_readfd_count(struct inode *inode)
> > +{
> > + struct task_struct *p = get_proc_task(inode);
> > + struct fdtable *fdt;
> > + unsigned int open_fds = 0;
> > +
> > + if (!p)
> > + return -ENOENT;
>
> Maybe this shouldn't happen, but do you mean to assign the error code to
> stat->size in the caller? Otherwise this seems reasonable to me.
You are right. As unlikely as it is to happen, we shouldn't return
negative size.
What's the idiomatic way to make this work? My two options are:
1. Pass &stat->size into proc_readfd_count:
if (S_ISDIR(inode->i_mode)) {
rv = proc_readfd_count(inode, &stat->size);
if (rv < 0)
goto out;
}
out:
return rv;
OR without a goto:
if (S_ISDIR(inode->i_mode)) {
rv = proc_readfd_count(inode, &stat->size));
if (rv < 0)
return rv;
}
return rv;
2. Return negative count as error (as we don't expect negative amount
of files open):
if (S_ISDIR(inode->i_mode)) {
size = proc_readfd_count(inode);
if (size < 0)
return size;
stat->size = size;
}
On Mon, 17 Oct 2022 21:58:44 -0700 Ivan Babrou <ivan@cloudflare.com> wrote:
> v3: Made use of bitmap_weight() to count the bits.
Thanks, I queued the below delta:
--- a/fs/proc/fd.c~proc-report-open-files-as-size-in-stat-for-proc-pid-fd-v3
+++ a/fs/proc/fd.c
@@ -283,7 +283,7 @@ static int proc_readfd_count(struct inod
{
struct task_struct *p = get_proc_task(inode);
struct fdtable *fdt;
- unsigned int i, size, open_fds = 0;
+ unsigned int open_fds = 0;
if (!p)
return -ENOENT;
@@ -293,10 +293,7 @@ static int proc_readfd_count(struct inod
rcu_read_lock();
fdt = files_fdtable(p->files);
- size = fdt->max_fds;
-
- for (i = size / BITS_PER_LONG; i > 0;)
- open_fds += hweight64(fdt->open_fds[--i]);
+ open_fds = bitmap_weight(fdt->open_fds, fdt->max_fds);
rcu_read_unlock();
}
On Tue, Oct 18, 2022 at 11:51:02AM -0700, Ivan Babrou wrote:
> On Tue, Oct 18, 2022 at 11:16 AM Brian Foster <bfoster@redhat.com> wrote:
> > > +static int proc_readfd_count(struct inode *inode)
> > > +{
> > > + struct task_struct *p = get_proc_task(inode);
> > > + struct fdtable *fdt;
> > > + unsigned int open_fds = 0;
> > > +
> > > + if (!p)
> > > + return -ENOENT;
> >
> > Maybe this shouldn't happen, but do you mean to assign the error code to
> > stat->size in the caller? Otherwise this seems reasonable to me.
>
> You are right. As unlikely as it is to happen, we shouldn't return
> negative size.
>
> What's the idiomatic way to make this work? My two options are:
>
> 1. Pass &stat->size into proc_readfd_count:
>
> if (S_ISDIR(inode->i_mode)) {
> rv = proc_readfd_count(inode, &stat->size);
> if (rv < 0)
> goto out;
> }
>
> out:
> return rv;
>
> OR without a goto:
>
> if (S_ISDIR(inode->i_mode)) {
> rv = proc_readfd_count(inode, &stat->size));
> if (rv < 0)
> return rv;
> }
>
> return rv;
>
> 2. Return negative count as error (as we don't expect negative amount
> of files open):
>
> if (S_ISDIR(inode->i_mode)) {
> size = proc_readfd_count(inode);
> if (size < 0)
> return size;
> stat->size = size;
> }
>
I suppose the latter is less of a change to the original patch..? Either
way seems reasonable to me. I have no strong preference FWIW.
Brian
On Wed, 19 Oct 2022 07:28:45 -0400 Brian Foster <bfoster@redhat.com> wrote:
> On Tue, Oct 18, 2022 at 11:51:02AM -0700, Ivan Babrou wrote:
> > On Tue, Oct 18, 2022 at 11:16 AM Brian Foster <bfoster@redhat.com> wrote:
> > > > +static int proc_readfd_count(struct inode *inode)
> > > > +{
> > > > + struct task_struct *p = get_proc_task(inode);
> > > > + struct fdtable *fdt;
> > > > + unsigned int open_fds = 0;
> > > > +
> > > > + if (!p)
> > > > + return -ENOENT;
> > >
> > > Maybe this shouldn't happen, but do you mean to assign the error code to
> > > stat->size in the caller? Otherwise this seems reasonable to me.
> >
> > You are right. As unlikely as it is to happen, we shouldn't return
> > negative size.
> >
> > What's the idiomatic way to make this work? My two options are:
> >
> > 1. Pass &stat->size into proc_readfd_count:
> >
> > if (S_ISDIR(inode->i_mode)) {
> > rv = proc_readfd_count(inode, &stat->size);
> > if (rv < 0)
> > goto out;
> > }
> >
> > out:
> > return rv;
> >
> > OR without a goto:
> >
> > if (S_ISDIR(inode->i_mode)) {
> > rv = proc_readfd_count(inode, &stat->size));
> > if (rv < 0)
> > return rv;
> > }
> >
> > return rv;
> >
> > 2. Return negative count as error (as we don't expect negative amount
> > of files open):
> >
> > if (S_ISDIR(inode->i_mode)) {
> > size = proc_readfd_count(inode);
> > if (size < 0)
> > return size;
> > stat->size = size;
> > }
> >
>
> I suppose the latter is less of a change to the original patch..? Either
> way seems reasonable to me. I have no strong preference FWIW.
If get_proc_task() failed then something has gone horridly wrong,
hasn't it? Wouldn't it make sense in this situation to make the
.getattr() itself return an errno, in which case the data at *stat
dosen't matter - it's invalid anyway.
This seems to be the general approach in procfs when get_proc_task()
fails - return -ESRCH (or, seemingly randomly, -ENOENT) to the caller.
@@ -47,6 +47,7 @@ fixes/update part 1.1 Stefani Seibold <stefani@seibold.net> June 9 2009
3.10 /proc/<pid>/timerslack_ns - Task timerslack value
3.11 /proc/<pid>/patch_state - Livepatch patch operation state
3.12 /proc/<pid>/arch_status - Task architecture specific information
+ 3.13 /proc/<pid>/fd - List of symlinks to open files
4 Configuring procfs
4.1 Mount options
@@ -2149,6 +2150,22 @@ AVX512_elapsed_ms
the task is unlikely an AVX512 user, but depends on the workload and the
scheduling scenario, it also could be a false negative mentioned above.
+3.13 /proc/<pid>/fd - List of symlinks to open files
+-------------------------------------------------------
+This directory contains symbolic links which represent open files
+the process is maintaining. Example output::
+
+ lr-x------ 1 root root 64 Sep 20 17:53 0 -> /dev/null
+ l-wx------ 1 root root 64 Sep 20 17:53 1 -> /dev/null
+ lrwx------ 1 root root 64 Sep 20 17:53 10 -> 'socket:[12539]'
+ lrwx------ 1 root root 64 Sep 20 17:53 11 -> 'socket:[12540]'
+ lrwx------ 1 root root 64 Sep 20 17:53 12 -> 'socket:[12542]'
+
+The number of open files for the process is stored in 'size' member
+of stat() output for /proc/<pid>/fd for fast access.
+-------------------------------------------------------
+
+
Chapter 4: Configuring procfs
=============================
@@ -279,6 +279,31 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
return 0;
}
+static int proc_readfd_count(struct inode *inode)
+{
+ struct task_struct *p = get_proc_task(inode);
+ struct fdtable *fdt;
+ unsigned int open_fds = 0;
+
+ if (!p)
+ return -ENOENT;
+
+ task_lock(p);
+ if (p->files) {
+ rcu_read_lock();
+
+ fdt = files_fdtable(p->files);
+ open_fds = bitmap_weight(fdt->open_fds, fdt->max_fds);
+
+ rcu_read_unlock();
+ }
+ task_unlock(p);
+
+ put_task_struct(p);
+
+ return open_fds;
+}
+
static int proc_readfd(struct file *file, struct dir_context *ctx)
{
return proc_readfd_common(file, ctx, proc_fd_instantiate);
@@ -319,9 +344,25 @@ int proc_fd_permission(struct user_namespace *mnt_userns,
return rv;
}
+static int proc_fd_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+
+ generic_fillattr(&init_user_ns, inode, stat);
+
+ /* If it's a directory, put the number of open fds there */
+ if (S_ISDIR(inode->i_mode))
+ stat->size = proc_readfd_count(inode);
+
+ return 0;
+}
+
const struct inode_operations proc_fd_inode_operations = {
.lookup = proc_lookupfd,
.permission = proc_fd_permission,
+ .getattr = proc_fd_getattr,
.setattr = proc_setattr,
};