[3/4] cachestat: implement cachestat syscall

Message ID 20221115182901.2755368-4-nphamcs@gmail.com
State New
Headers
Series cachestat: a new syscall for page cache state of files |

Commit Message

Nhat Pham Nov. 15, 2022, 6:29 p.m. UTC
  Implement a new syscall that queries cache state of a file and
summarizes the number of cached pages, number of dirty pages, number of
pages marked for writeback, number of (recently) evicted pages, etc. in
a given range.

NAME
    cachestat - query the page cache status of a file.

SYNOPSIS
    #include <sys/mman.h>

    struct cachestat {
        unsigned long nr_cache;
        unsigned long nr_dirty;
        unsigned long nr_writeback;
        unsigned long nr_evicted;
        unsigned long nr_recently_evicted;
    };

    int cachestat(unsigned int fd, off_t off, size_t len,
        struct cachestat *cstat);

DESCRIPTION
    cachestat() queries the number of cached pages, number of dirty
    pages, number of pages marked for writeback, number of (recently)
    evicted pages, in the bytes range given by `off` and `len`.

    These values are returned in a cachestat struct, whose address is
    given by the `cstat` argument.

    The `off` argument must be a non-negative integers, If `off` + `len`
    >= `off`, the queried range is [`off`, `off` + `len`]. Otherwise, we
    will query in the range from `off` to the end of the file.

RETURN VALUE
    On success, cachestat returns 0. On error, -1 is returned, and errno
    is set to indicate the error.

ERRORS
    EFAULT cstat points to an invalid address.

    EINVAL `off` is negative.

    EBADF  invalid file descriptor.

Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
 MAINTAINERS                            |   7 ++
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 include/linux/syscalls.h               |   2 +
 include/uapi/asm-generic/unistd.h      |   5 +-
 include/uapi/linux/mman.h              |   8 ++
 kernel/sys_ni.c                        |   1 +
 mm/Makefile                            |   2 +-
 mm/cachestat.c                         | 109 +++++++++++++++++++++++++
 9 files changed, 134 insertions(+), 2 deletions(-)
 create mode 100644 mm/cachestat.c
  

Comments

kernel test robot Nov. 16, 2022, 5:51 a.m. UTC | #1
Hi Nhat,

I love your patch! Yet something to improve:

[auto build test ERROR on shuah-kselftest/next]
[also build test ERROR on shuah-kselftest/fixes tip/x86/asm arnd-asm-generic/master linus/master v6.1-rc5 next-20221115]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Nhat-Pham/cachestat-a-new-syscall-for-page-cache-state-of-files/20221116-023042
base:   https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
patch link:    https://lore.kernel.org/r/20221115182901.2755368-4-nphamcs%40gmail.com
patch subject: [PATCH 3/4] cachestat: implement cachestat syscall
config: um-i386_defconfig
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/e0d55eece256958cfaec0dd2842bf0edced26150
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Nhat-Pham/cachestat-a-new-syscall-for-page-cache-state-of-files/20221116-023042
        git checkout e0d55eece256958cfaec0dd2842bf0edced26150
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=um SUBARCH=i386 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from mm/cachestat.c:8:
   include/linux/syscalls.h:1060:16: warning: 'struct cachestat' declared inside parameter list will not be visible outside of this definition or declaration
    1060 |         struct cachestat __user *ret_cstat);
         |                ^~~~~~~~~
   In file included from mm/cachestat.c:8:
>> include/linux/syscalls.h:242:25: error: conflicting types for 'sys_cachestat'; have 'long int(unsigned int,  off_t,  size_t,  struct cachestat *)' {aka 'long int(unsigned int,  long int,  unsigned int,  struct cachestat *)'}
     242 |         asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))       \
         |                         ^~~
   include/linux/syscalls.h:228:9: note: in expansion of macro '__SYSCALL_DEFINEx'
     228 |         __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
         |         ^~~~~~~~~~~~~~~~~
   include/linux/syscalls.h:220:36: note: in expansion of macro 'SYSCALL_DEFINEx'
     220 | #define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
         |                                    ^~~~~~~~~~~~~~~
   mm/cachestat.c:37:1: note: in expansion of macro 'SYSCALL_DEFINE4'
      37 | SYSCALL_DEFINE4(cachestat, unsigned int, fd, off_t, off, size_t, len,
         | ^~~~~~~~~~~~~~~
   In file included from mm/cachestat.c:8:
   include/linux/syscalls.h:1059:17: note: previous declaration of 'sys_cachestat' with type 'long int(unsigned int,  off_t,  size_t,  struct cachestat *)' {aka 'long int(unsigned int,  long int,  unsigned int,  struct cachestat *)'}
    1059 | asmlinkage long sys_cachestat(unsigned int fd, off_t off, size_t len,
         |                 ^~~~~~~~~~~~~


vim +242 include/linux/syscalls.h

1bd21c6c21e8489 Dominik Brodowski   2018-04-05  231  
e145242ea0df6b7 Dominik Brodowski   2018-04-09  232  /*
e145242ea0df6b7 Dominik Brodowski   2018-04-09  233   * The asmlinkage stub is aliased to a function named __se_sys_*() which
e145242ea0df6b7 Dominik Brodowski   2018-04-09  234   * sign-extends 32-bit ints to longs whenever needed. The actual work is
e145242ea0df6b7 Dominik Brodowski   2018-04-09  235   * done within __do_sys_*().
e145242ea0df6b7 Dominik Brodowski   2018-04-09  236   */
1bd21c6c21e8489 Dominik Brodowski   2018-04-05  237  #ifndef __SYSCALL_DEFINEx
bed1ffca022cc87 Frederic Weisbecker 2009-03-13  238  #define __SYSCALL_DEFINEx(x, name, ...)					\
bee20031772af3d Arnd Bergmann       2018-06-19  239  	__diag_push();							\
bee20031772af3d Arnd Bergmann       2018-06-19  240  	__diag_ignore(GCC, 8, "-Wattribute-alias",			\
bee20031772af3d Arnd Bergmann       2018-06-19  241  		      "Type aliasing is used to sanitize syscall arguments");\
83460ec8dcac141 Andi Kleen          2013-11-12 @242  	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
e145242ea0df6b7 Dominik Brodowski   2018-04-09  243  		__attribute__((alias(__stringify(__se_sys##name))));	\
c9a211951c7c79c Howard McLauchlan   2018-03-21  244  	ALLOW_ERROR_INJECTION(sys##name, ERRNO);			\
e145242ea0df6b7 Dominik Brodowski   2018-04-09  245  	static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
e145242ea0df6b7 Dominik Brodowski   2018-04-09  246  	asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
e145242ea0df6b7 Dominik Brodowski   2018-04-09  247  	asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
1a94bc34768e463 Heiko Carstens      2009-01-14  248  	{								\
e145242ea0df6b7 Dominik Brodowski   2018-04-09  249  		long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\
07fe6e00f6cca6f Al Viro             2013-01-21  250  		__MAP(x,__SC_TEST,__VA_ARGS__);				\
2cf0966683430b6 Al Viro             2013-01-21  251  		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
2cf0966683430b6 Al Viro             2013-01-21  252  		return ret;						\
1a94bc34768e463 Heiko Carstens      2009-01-14  253  	}								\
bee20031772af3d Arnd Bergmann       2018-06-19  254  	__diag_pop();							\
e145242ea0df6b7 Dominik Brodowski   2018-04-09  255  	static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
1bd21c6c21e8489 Dominik Brodowski   2018-04-05  256  #endif /* __SYSCALL_DEFINEx */
1a94bc34768e463 Heiko Carstens      2009-01-14  257
  
kernel test robot Nov. 16, 2022, 7:02 a.m. UTC | #2
Hi Nhat,

I love your patch! Yet something to improve:

[auto build test ERROR on shuah-kselftest/next]
[also build test ERROR on shuah-kselftest/fixes tip/x86/asm arnd-asm-generic/master linus/master v6.1-rc5 next-20221115]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Nhat-Pham/cachestat-a-new-syscall-for-page-cache-state-of-files/20221116-023042
base:   https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
patch link:    https://lore.kernel.org/r/20221115182901.2755368-4-nphamcs%40gmail.com
patch subject: [PATCH 3/4] cachestat: implement cachestat syscall
config: arm-buildonly-randconfig-r001-20221115
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 463da45892e2d2a262277b91b96f5f8c05dc25d0)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install arm cross compiling tool for clang build
        # apt-get install binutils-arm-linux-gnueabi
        # https://github.com/intel-lab-lkp/linux/commit/e0d55eece256958cfaec0dd2842bf0edced26150
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Nhat-Pham/cachestat-a-new-syscall-for-page-cache-state-of-files/20221116-023042
        git checkout e0d55eece256958cfaec0dd2842bf0edced26150
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash kernel/time/ mm/ prepare

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All error/warnings (new ones prefixed by >>):

>> <stdin>:1565:2: warning: syscall cachestat not implemented [-W#warnings]
   #warning syscall cachestat not implemented
    ^
   1 warning generated.
--
   In file included from kernel/time/time.c:33:
>> include/linux/syscalls.h:1060:9: warning: declaration of 'struct cachestat' will not be visible outside of this function [-Wvisibility]
           struct cachestat __user *ret_cstat);
                  ^
   1 warning generated.
--
   In file included from kernel/time/timer.c:35:
>> include/linux/syscalls.h:1060:9: warning: declaration of 'struct cachestat' will not be visible outside of this function [-Wvisibility]
           struct cachestat __user *ret_cstat);
                  ^
   kernel/time/timer.c:1365:20: warning: unused function 'del_timer_wait_running' [-Wunused-function]
   static inline void del_timer_wait_running(struct timer_list *timer) { }
                      ^
   2 warnings generated.
--
   In file included from kernel/time/hrtimer.c:30:
>> include/linux/syscalls.h:1060:9: warning: declaration of 'struct cachestat' will not be visible outside of this function [-Wvisibility]
           struct cachestat __user *ret_cstat);
                  ^
   kernel/time/hrtimer.c:120:21: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
           [CLOCK_REALTIME]        = HRTIMER_BASE_REALTIME,
                                     ^~~~~~~~~~~~~~~~~~~~~
   kernel/time/hrtimer.c:118:27: note: previous initialization is here
           [0 ... MAX_CLOCKS - 1]  = HRTIMER_MAX_CLOCK_BASES,
                                     ^~~~~~~~~~~~~~~~~~~~~~~
   kernel/time/hrtimer.c:121:22: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
           [CLOCK_MONOTONIC]       = HRTIMER_BASE_MONOTONIC,
                                     ^~~~~~~~~~~~~~~~~~~~~~
   kernel/time/hrtimer.c:118:27: note: previous initialization is here
           [0 ... MAX_CLOCKS - 1]  = HRTIMER_MAX_CLOCK_BASES,
                                     ^~~~~~~~~~~~~~~~~~~~~~~
   kernel/time/hrtimer.c:122:21: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
           [CLOCK_BOOTTIME]        = HRTIMER_BASE_BOOTTIME,
                                     ^~~~~~~~~~~~~~~~~~~~~
   kernel/time/hrtimer.c:118:27: note: previous initialization is here
           [0 ... MAX_CLOCKS - 1]  = HRTIMER_MAX_CLOCK_BASES,
                                     ^~~~~~~~~~~~~~~~~~~~~~~
   kernel/time/hrtimer.c:123:17: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
           [CLOCK_TAI]             = HRTIMER_BASE_TAI,
                                     ^~~~~~~~~~~~~~~~
   kernel/time/hrtimer.c:118:27: note: previous initialization is here
           [0 ... MAX_CLOCKS - 1]  = HRTIMER_MAX_CLOCK_BASES,
                                     ^~~~~~~~~~~~~~~~~~~~~~~
   kernel/time/hrtimer.c:1648:7: warning: variable 'expires_in_hardirq' set but not used [-Wunused-but-set-variable]
           bool expires_in_hardirq;
                ^
   kernel/time/hrtimer.c:276:20: warning: unused function 'is_migration_base' [-Wunused-function]
   static inline bool is_migration_base(struct hrtimer_clock_base *base)
                      ^
   kernel/time/hrtimer.c:650:19: warning: unused function 'hrtimer_hres_active' [-Wunused-function]
   static inline int hrtimer_hres_active(void)
                     ^
   kernel/time/hrtimer.c:1887:20: warning: unused function '__hrtimer_peek_ahead_timers' [-Wunused-function]
   static inline void __hrtimer_peek_ahead_timers(void) { }
                      ^
   9 warnings generated.
--
   In file included from mm/cachestat.c:8:
>> include/linux/syscalls.h:1060:9: warning: declaration of 'struct cachestat' will not be visible outside of this function [-Wvisibility]
           struct cachestat __user *ret_cstat);
                  ^
>> mm/cachestat.c:37:1: error: conflicting types for 'sys_cachestat'
   SYSCALL_DEFINE4(cachestat, unsigned int, fd, off_t, off, size_t, len,
   ^
   include/linux/syscalls.h:220:36: note: expanded from macro 'SYSCALL_DEFINE4'
   #define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
                                      ^
   include/linux/syscalls.h:228:2: note: expanded from macro 'SYSCALL_DEFINEx'
           __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
           ^
   include/linux/syscalls.h:242:18: note: expanded from macro '__SYSCALL_DEFINEx'
           asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))       \
                           ^
   <scratch space>:109:1: note: expanded from here
   sys_cachestat
   ^
   include/linux/syscalls.h:1059:17: note: previous declaration is here
   asmlinkage long sys_cachestat(unsigned int fd, off_t off, size_t len,
                   ^
>> mm/cachestat.c:76:24: error: call to undeclared function 'radix_to_swp_entry'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
                                           swp_entry_t swp = radix_to_swp_entry(folio);
                                                             ^
>> mm/cachestat.c:76:18: error: initializing 'swp_entry_t' with an expression of incompatible type 'int'
                                           swp_entry_t swp = radix_to_swp_entry(folio);
                                                       ^     ~~~~~~~~~~~~~~~~~~~~~~~~~
   1 warning and 3 errors generated.
--
>> <stdin>:1565:2: warning: syscall cachestat not implemented [-W#warnings]
   #warning syscall cachestat not implemented
    ^
   1 warning generated.


vim +/sys_cachestat +37 mm/cachestat.c

    15	
    16	/*
    17	 * The cachestat(3) system call.
    18	 *
    19	 * cachestat() returns the page cache status of a file in the
    20	 * bytes specified by `off` and `len`: number of cached pages,
    21	 * number of dirty pages, number of pages marked for writeback,
    22	 * number of (recently) evicted pages.
    23	 *
    24	 * If `off` + `len` >= `off`, the queried range is [`off`, `off` + `len`].
    25	 * Otherwise, we will query in the range from `off` to the end of the file.
    26	 *
    27	 * Because the status of a page can change after cachestat() checks it
    28	 * but before it returns to the application, the returned values may
    29	 * contain stale information.
    30	 *
    31	 * return values:
    32	 *  zero    - success
    33	 *  -EFAULT - cstat points to an illegal address
    34	 *  -EINVAL - invalid arguments
    35	 *  -EBADF	- invalid file descriptor
    36	 */
  > 37	SYSCALL_DEFINE4(cachestat, unsigned int, fd, off_t, off, size_t, len,
    38		struct cachestat __user *, cstat)
    39	{
    40		struct fd f;
    41		struct cachestat cs;
    42	
    43		memset(&cs, 0, sizeof(struct cachestat));
    44	
    45		if (off < 0)
    46			return -EINVAL;
    47	
    48		if (!access_ok(cstat, sizeof(struct cachestat)))
    49			return -EFAULT;
    50	
    51		f = fdget(fd);
    52		if (f.file) {
    53			struct address_space *mapping = f.file->f_mapping;
    54			pgoff_t first_index = off >> PAGE_SHIFT;
    55			XA_STATE(xas, &mapping->i_pages, first_index);
    56			struct folio *folio;
    57			pgoff_t last_index = (off + len - 1) >> PAGE_SHIFT;
    58	
    59			if (last_index < first_index)
    60				last_index = ULONG_MAX;
    61	
    62			rcu_read_lock();
    63			xas_for_each(&xas, folio, last_index) {
    64				if (xas_retry(&xas, folio) || !folio)
    65					continue;
    66	
    67				if (xa_is_value(folio)) {
    68					/* page is evicted */
    69					void *shadow;
    70					bool workingset; /* not used */
    71	
    72					cs.nr_evicted += 1;
    73	
    74					if (shmem_mapping(mapping)) {
    75						/* shmem file - in swap cache */
  > 76						swp_entry_t swp = radix_to_swp_entry(folio);
  
kernel test robot Nov. 16, 2022, 11:55 a.m. UTC | #3
Hi Nhat,

I love your patch! Perhaps something to improve:

[auto build test WARNING on shuah-kselftest/next]
[also build test WARNING on shuah-kselftest/fixes tip/x86/asm arnd-asm-generic/master linus/master v6.1-rc5 next-20221115]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Nhat-Pham/cachestat-a-new-syscall-for-page-cache-state-of-files/20221116-023042
base:   https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
patch link:    https://lore.kernel.org/r/20221115182901.2755368-4-nphamcs%40gmail.com
patch subject: [PATCH 3/4] cachestat: implement cachestat syscall
config: m68k-allyesconfig
compiler: m68k-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/e0d55eece256958cfaec0dd2842bf0edced26150
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Nhat-Pham/cachestat-a-new-syscall-for-page-cache-state-of-files/20221116-023042
        git checkout e0d55eece256958cfaec0dd2842bf0edced26150
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash drivers/macintosh/ prepare

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> <stdin>:1565:2: warning: #warning syscall cachestat not implemented [-Wcpp]
--
   In file included from drivers/macintosh/via-pmu.c:46:
>> include/linux/syscalls.h:1060:16: warning: 'struct cachestat' declared inside parameter list will not be visible outside of this definition or declaration
    1060 |         struct cachestat __user *ret_cstat);
         |                ^~~~~~~~~
--
>> <stdin>:1565:2: warning: #warning syscall cachestat not implemented [-Wcpp]
--
>> <stdin>:1565:2: warning: #warning syscall cachestat not implemented [-Wcpp]


vim +1060 include/linux/syscalls.h

   903	
   904	/* mm/, CONFIG_MMU only */
   905	asmlinkage long sys_swapon(const char __user *specialfile, int swap_flags);
   906	asmlinkage long sys_swapoff(const char __user *specialfile);
   907	asmlinkage long sys_mprotect(unsigned long start, size_t len,
   908					unsigned long prot);
   909	asmlinkage long sys_msync(unsigned long start, size_t len, int flags);
   910	asmlinkage long sys_mlock(unsigned long start, size_t len);
   911	asmlinkage long sys_munlock(unsigned long start, size_t len);
   912	asmlinkage long sys_mlockall(int flags);
   913	asmlinkage long sys_munlockall(void);
   914	asmlinkage long sys_mincore(unsigned long start, size_t len,
   915					unsigned char __user * vec);
   916	asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
   917	asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec,
   918				size_t vlen, int behavior, unsigned int flags);
   919	asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags);
   920	asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
   921				unsigned long prot, unsigned long pgoff,
   922				unsigned long flags);
   923	asmlinkage long sys_mbind(unsigned long start, unsigned long len,
   924					unsigned long mode,
   925					const unsigned long __user *nmask,
   926					unsigned long maxnode,
   927					unsigned flags);
   928	asmlinkage long sys_get_mempolicy(int __user *policy,
   929					unsigned long __user *nmask,
   930					unsigned long maxnode,
   931					unsigned long addr, unsigned long flags);
   932	asmlinkage long sys_set_mempolicy(int mode, const unsigned long __user *nmask,
   933					unsigned long maxnode);
   934	asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
   935					const unsigned long __user *from,
   936					const unsigned long __user *to);
   937	asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
   938					const void __user * __user *pages,
   939					const int __user *nodes,
   940					int __user *status,
   941					int flags);
   942	
   943	asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t  pid, int sig,
   944			siginfo_t __user *uinfo);
   945	asmlinkage long sys_perf_event_open(
   946			struct perf_event_attr __user *attr_uptr,
   947			pid_t pid, int cpu, int group_fd, unsigned long flags);
   948	asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int);
   949	asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg,
   950				     unsigned int vlen, unsigned flags,
   951				     struct __kernel_timespec __user *timeout);
   952	asmlinkage long sys_recvmmsg_time32(int fd, struct mmsghdr __user *msg,
   953				     unsigned int vlen, unsigned flags,
   954				     struct old_timespec32 __user *timeout);
   955	
   956	asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
   957					int options, struct rusage __user *ru);
   958	asmlinkage long sys_prlimit64(pid_t pid, unsigned int resource,
   959					const struct rlimit64 __user *new_rlim,
   960					struct rlimit64 __user *old_rlim);
   961	asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags);
   962	asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
   963					  u64 mask, int fd,
   964					  const char  __user *pathname);
   965	asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
   966					      struct file_handle __user *handle,
   967					      int __user *mnt_id, int flag);
   968	asmlinkage long sys_open_by_handle_at(int mountdirfd,
   969					      struct file_handle __user *handle,
   970					      int flags);
   971	asmlinkage long sys_clock_adjtime(clockid_t which_clock,
   972					struct __kernel_timex __user *tx);
   973	asmlinkage long sys_clock_adjtime32(clockid_t which_clock,
   974					struct old_timex32 __user *tx);
   975	asmlinkage long sys_syncfs(int fd);
   976	asmlinkage long sys_setns(int fd, int nstype);
   977	asmlinkage long sys_pidfd_open(pid_t pid, unsigned int flags);
   978	asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
   979				     unsigned int vlen, unsigned flags);
   980	asmlinkage long sys_process_vm_readv(pid_t pid,
   981					     const struct iovec __user *lvec,
   982					     unsigned long liovcnt,
   983					     const struct iovec __user *rvec,
   984					     unsigned long riovcnt,
   985					     unsigned long flags);
   986	asmlinkage long sys_process_vm_writev(pid_t pid,
   987					      const struct iovec __user *lvec,
   988					      unsigned long liovcnt,
   989					      const struct iovec __user *rvec,
   990					      unsigned long riovcnt,
   991					      unsigned long flags);
   992	asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
   993				 unsigned long idx1, unsigned long idx2);
   994	asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
   995	asmlinkage long sys_sched_setattr(pid_t pid,
   996						struct sched_attr __user *attr,
   997						unsigned int flags);
   998	asmlinkage long sys_sched_getattr(pid_t pid,
   999						struct sched_attr __user *attr,
  1000						unsigned int size,
  1001						unsigned int flags);
  1002	asmlinkage long sys_renameat2(int olddfd, const char __user *oldname,
  1003				      int newdfd, const char __user *newname,
  1004				      unsigned int flags);
  1005	asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
  1006				    void __user *uargs);
  1007	asmlinkage long sys_getrandom(char __user *buf, size_t count,
  1008				      unsigned int flags);
  1009	asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
  1010	asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
  1011	asmlinkage long sys_execveat(int dfd, const char __user *filename,
  1012				const char __user *const __user *argv,
  1013				const char __user *const __user *envp, int flags);
  1014	asmlinkage long sys_userfaultfd(int flags);
  1015	asmlinkage long sys_membarrier(int cmd, unsigned int flags, int cpu_id);
  1016	asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
  1017	asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
  1018					    int fd_out, loff_t __user *off_out,
  1019					    size_t len, unsigned int flags);
  1020	asmlinkage long sys_preadv2(unsigned long fd, const struct iovec __user *vec,
  1021				    unsigned long vlen, unsigned long pos_l, unsigned long pos_h,
  1022				    rwf_t flags);
  1023	asmlinkage long sys_pwritev2(unsigned long fd, const struct iovec __user *vec,
  1024				    unsigned long vlen, unsigned long pos_l, unsigned long pos_h,
  1025				    rwf_t flags);
  1026	asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
  1027					  unsigned long prot, int pkey);
  1028	asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
  1029	asmlinkage long sys_pkey_free(int pkey);
  1030	asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
  1031				  unsigned mask, struct statx __user *buffer);
  1032	asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
  1033				 int flags, uint32_t sig);
  1034	asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
  1035	asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path,
  1036				       int to_dfd, const char __user *to_path,
  1037				       unsigned int ms_flags);
  1038	asmlinkage long sys_mount_setattr(int dfd, const char __user *path,
  1039					  unsigned int flags,
  1040					  struct mount_attr __user *uattr, size_t usize);
  1041	asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags);
  1042	asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key,
  1043				     const void __user *value, int aux);
  1044	asmlinkage long sys_fsmount(int fs_fd, unsigned int flags, unsigned int ms_flags);
  1045	asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags);
  1046	asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
  1047					       siginfo_t __user *info,
  1048					       unsigned int flags);
  1049	asmlinkage long sys_pidfd_getfd(int pidfd, int fd, unsigned int flags);
  1050	asmlinkage long sys_landlock_create_ruleset(const struct landlock_ruleset_attr __user *attr,
  1051			size_t size, __u32 flags);
  1052	asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type rule_type,
  1053			const void __user *rule_attr, __u32 flags);
  1054	asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags);
  1055	asmlinkage long sys_memfd_secret(unsigned int flags);
  1056	asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
  1057						    unsigned long home_node,
  1058						    unsigned long flags);
  1059	asmlinkage long sys_cachestat(unsigned int fd, off_t off, size_t len,
> 1060		struct cachestat __user *ret_cstat);
  1061
  
Brian Foster Nov. 21, 2022, 2:45 p.m. UTC | #4
On Tue, Nov 15, 2022 at 10:29:00AM -0800, Nhat Pham wrote:
> Implement a new syscall that queries cache state of a file and
> summarizes the number of cached pages, number of dirty pages, number of
> pages marked for writeback, number of (recently) evicted pages, etc. in
> a given range.
> 
> NAME
>     cachestat - query the page cache status of a file.
> 
> SYNOPSIS
>     #include <sys/mman.h>
> 
>     struct cachestat {
>         unsigned long nr_cache;
>         unsigned long nr_dirty;
>         unsigned long nr_writeback;
>         unsigned long nr_evicted;
>         unsigned long nr_recently_evicted;
>     };
> 
>     int cachestat(unsigned int fd, off_t off, size_t len,
>         struct cachestat *cstat);
> 

Do you have a strong use case for a user specified range vs. just
checking the entire file? If not, have you considered whether it might
be worth expanding statx() to include this data? That call is already
designed to include "extended" file status and avoids the need for a new
syscall. For example, the fields could be added individually with
multiple flags, or the entire struct tied to a new STATX_CACHE flag or
some such.

That said, I'm not sure how sensitive folks might be to increasing the
size of struct statx or populating it with cache data. Just a thought
that it might be worth bringing up on linux-fsdevel if you haven't
considered it already. A few random nits below..

> DESCRIPTION
>     cachestat() queries the number of cached pages, number of dirty
>     pages, number of pages marked for writeback, number of (recently)
>     evicted pages, in the bytes range given by `off` and `len`.
> 
>     These values are returned in a cachestat struct, whose address is
>     given by the `cstat` argument.
> 
>     The `off` argument must be a non-negative integers, If `off` + `len`
>     >= `off`, the queried range is [`off`, `off` + `len`]. Otherwise, we
>     will query in the range from `off` to the end of the file.
> 

(off + len < off) is an error condition on some (most?) other syscalls.
At least some calls (i.e. fadvise(), sync_file_range()) use len == 0 to
explicitly specify "to EOF."

> RETURN VALUE
>     On success, cachestat returns 0. On error, -1 is returned, and errno
>     is set to indicate the error.
> 
> ERRORS
>     EFAULT cstat points to an invalid address.
> 
>     EINVAL `off` is negative.
> 
>     EBADF  invalid file descriptor.
> 
> Signed-off-by: Nhat Pham <nphamcs@gmail.com>
> ---
>  MAINTAINERS                            |   7 ++
>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>  include/linux/syscalls.h               |   2 +
>  include/uapi/asm-generic/unistd.h      |   5 +-
>  include/uapi/linux/mman.h              |   8 ++
>  kernel/sys_ni.c                        |   1 +
>  mm/Makefile                            |   2 +-
>  mm/cachestat.c                         | 109 +++++++++++++++++++++++++
>  9 files changed, 134 insertions(+), 2 deletions(-)
>  create mode 100644 mm/cachestat.c
> 
...
> diff --git a/mm/cachestat.c b/mm/cachestat.c
> new file mode 100644
> index 000000000000..193151cb0767
> --- /dev/null
> +++ b/mm/cachestat.c
> @@ -0,0 +1,109 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +/*
> + * The cachestat() system call.
> + */
> +#include <linux/file.h>
> +#include <linux/fs.h>
> +#include <linux/syscalls.h>
> +#include <linux/shmem_fs.h>
> +#include <linux/swap.h>
> +#include <linux/swapops.h>
> +#include <uapi/linux/mman.h>
> +
> +#include "swap.h"
> +
> +/*
> + * The cachestat(3) system call.
> + *
> + * cachestat() returns the page cache status of a file in the
> + * bytes specified by `off` and `len`: number of cached pages,
> + * number of dirty pages, number of pages marked for writeback,
> + * number of (recently) evicted pages.
> + *
> + * If `off` + `len` >= `off`, the queried range is [`off`, `off` + `len`].
> + * Otherwise, we will query in the range from `off` to the end of the file.
> + *
> + * Because the status of a page can change after cachestat() checks it
> + * but before it returns to the application, the returned values may
> + * contain stale information.
> + *
> + * return values:
> + *  zero    - success
> + *  -EFAULT - cstat points to an illegal address
> + *  -EINVAL - invalid arguments
> + *  -EBADF	- invalid file descriptor
> + */
> +SYSCALL_DEFINE4(cachestat, unsigned int, fd, off_t, off, size_t, len,
> +	struct cachestat __user *, cstat)
> +{
> +	struct fd f;
> +	struct cachestat cs;
> +
> +	memset(&cs, 0, sizeof(struct cachestat));

Maybe move this after the next couple error checks to avoid unnecessary
work?

> +
> +	if (off < 0)
> +		return -EINVAL;
> +
> +	if (!access_ok(cstat, sizeof(struct cachestat)))
> +		return -EFAULT;
> +
> +	f = fdget(fd);

Doing this:

	if (!f.file)
		return -EBADF;

... removes a level of indentation for the rest of the function, making
it a bit easier to read (and removing the locals defined in the if
branch, which I'm not sure is widely accepted style for the kernel).

Brian

> +	if (f.file) {
> +		struct address_space *mapping = f.file->f_mapping;
> +		pgoff_t first_index = off >> PAGE_SHIFT;
> +		XA_STATE(xas, &mapping->i_pages, first_index);
> +		struct folio *folio;
> +		pgoff_t last_index = (off + len - 1) >> PAGE_SHIFT;
> +
> +		if (last_index < first_index)
> +			last_index = ULONG_MAX;
> +
> +		rcu_read_lock();
> +		xas_for_each(&xas, folio, last_index) {
> +			if (xas_retry(&xas, folio) || !folio)
> +				continue;
> +
> +			if (xa_is_value(folio)) {
> +				/* page is evicted */
> +				void *shadow;
> +				bool workingset; /* not used */
> +
> +				cs.nr_evicted += 1;
> +
> +				if (shmem_mapping(mapping)) {
> +					/* shmem file - in swap cache */
> +					swp_entry_t swp = radix_to_swp_entry(folio);
> +
> +					shadow = get_shadow_from_swap_cache(swp);
> +				} else {
> +					/* in page cache */
> +					shadow = (void *) folio;
> +				}
> +
> +				if (workingset_test_recent(shadow, true, &workingset))
> +					cs.nr_recently_evicted += 1;
> +
> +				continue;
> +			}
> +
> +			/* page is in cache */
> +			cs.nr_cache += 1;
> +
> +			if (folio_test_dirty(folio))
> +				cs.nr_dirty += 1;
> +
> +			if (folio_test_writeback(folio))
> +				cs.nr_writeback += 1;
> +		}
> +		rcu_read_unlock();
> +		fdput(f);
> +
> +		if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
> +			return -EFAULT;
> +
> +		return 0;
> +	}
> +
> +	return -EBADF;
> +}
> -- 
> 2.30.2
> 
>
  
Johannes Weiner Nov. 21, 2022, 3:55 p.m. UTC | #5
On Mon, Nov 21, 2022 at 09:45:49AM -0500, Brian Foster wrote:
> On Tue, Nov 15, 2022 at 10:29:00AM -0800, Nhat Pham wrote:
> > Implement a new syscall that queries cache state of a file and
> > summarizes the number of cached pages, number of dirty pages, number of
> > pages marked for writeback, number of (recently) evicted pages, etc. in
> > a given range.
> > 
> > NAME
> >     cachestat - query the page cache status of a file.
> > 
> > SYNOPSIS
> >     #include <sys/mman.h>
> > 
> >     struct cachestat {
> >         unsigned long nr_cache;
> >         unsigned long nr_dirty;
> >         unsigned long nr_writeback;
> >         unsigned long nr_evicted;
> >         unsigned long nr_recently_evicted;
> >     };
> > 
> >     int cachestat(unsigned int fd, off_t off, size_t len,
> >         struct cachestat *cstat);
> > 
> 
> Do you have a strong use case for a user specified range vs. just
> checking the entire file? If not, have you considered whether it might
> be worth expanding statx() to include this data? That call is already
> designed to include "extended" file status and avoids the need for a new
> syscall. For example, the fields could be added individually with
> multiple flags, or the entire struct tied to a new STATX_CACHE flag or
> some such.

Whole-file stats are only useful for data that is structured in
directory trees. It doesn't work for structured files. For example,
understanding (and subsequently advising/influencing) the readahead
and dirty flushing in certain sections of a larger database file.

Fadvise/madvise/sync_file_range etc. give the user the ability to
influence cache behavior in sub-ranges, so it makes sense to also
allow querying at that granularity.

> > DESCRIPTION
> >     cachestat() queries the number of cached pages, number of dirty
> >     pages, number of pages marked for writeback, number of (recently)
> >     evicted pages, in the bytes range given by `off` and `len`.
> > 
> >     These values are returned in a cachestat struct, whose address is
> >     given by the `cstat` argument.
> > 
> >     The `off` argument must be a non-negative integers, If `off` + `len`
> >     >= `off`, the queried range is [`off`, `off` + `len`]. Otherwise, we
> >     will query in the range from `off` to the end of the file.
> > 
> 
> (off + len < off) is an error condition on some (most?) other syscalls.
> At least some calls (i.e. fadvise(), sync_file_range()) use len == 0 to
> explicitly specify "to EOF."

Good point, it would make sense to stick to that precedent.
  

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index a198da986146..baa081a1fe52 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4552,6 +4552,13 @@  S:	Supported
 F:	Documentation/filesystems/caching/cachefiles.rst
 F:	fs/cachefiles/
 
+CACHESTAT: PAGE CACHE STATS FOR A FILE
+M:	Nhat Pham <nphamcs@gmail.com>
+M:	Johannes Weiner <hannes@cmpxchg.org>
+L:	linux-mm@kvack.org
+S:	Maintained
+F:	mm/cachestat.c
+
 CADENCE MIPI-CSI2 BRIDGES
 M:	Maxime Ripard <mripard@kernel.org>
 L:	linux-media@vger.kernel.org
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 320480a8db4f..bc0a3c941b35 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -455,3 +455,4 @@ 
 448	i386	process_mrelease	sys_process_mrelease
 449	i386	futex_waitv		sys_futex_waitv
 450	i386	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	i386	cachestat		sys_cachestat
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index c84d12608cd2..8eed4cdc7965 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -372,6 +372,7 @@ 
 448	common	process_mrelease	sys_process_mrelease
 449	common	futex_waitv		sys_futex_waitv
 450	common	set_mempolicy_home_node	sys_set_mempolicy_home_node
+451	common	cachestat	sys_cachestat
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a34b0f9a9972..59e4d56225ae 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1056,6 +1056,8 @@  asmlinkage long sys_memfd_secret(unsigned int flags);
 asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
 					    unsigned long home_node,
 					    unsigned long flags);
+asmlinkage long sys_cachestat(unsigned int fd, off_t off, size_t len,
+	struct cachestat __user *ret_cstat);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 45fa180cc56a..cd639fae9086 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -886,8 +886,11 @@  __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
 #define __NR_set_mempolicy_home_node 450
 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
 
+#define __NR_cachestat 451
+__SYSCALL(__NR_cachestat, sys_cachestat)
+
 #undef __NR_syscalls
-#define __NR_syscalls 451
+#define __NR_syscalls 452
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index f55bc680b5b0..8690c82acd02 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -41,4 +41,12 @@ 
 #define MAP_HUGE_2GB	HUGETLB_FLAG_ENCODE_2GB
 #define MAP_HUGE_16GB	HUGETLB_FLAG_ENCODE_16GB
 
+struct cachestat {
+	unsigned long nr_cache;
+	unsigned long nr_dirty;
+	unsigned long nr_writeback;
+	unsigned long nr_evicted;
+	unsigned long nr_recently_evicted;
+};
+
 #endif /* _UAPI_LINUX_MMAN_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 860b2dcf3ac4..04bfb1e4d377 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -299,6 +299,7 @@  COND_SYSCALL(set_mempolicy);
 COND_SYSCALL(migrate_pages);
 COND_SYSCALL(move_pages);
 COND_SYSCALL(set_mempolicy_home_node);
+COND_SYSCALL(cachestat);
 
 COND_SYSCALL(perf_event_open);
 COND_SYSCALL(accept4);
diff --git a/mm/Makefile b/mm/Makefile
index 8e105e5b3e29..e71b15743ce6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -54,7 +54,7 @@  obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   mm_init.o percpu.o slab_common.o \
 			   compaction.o \
 			   interval_tree.o list_lru.o workingset.o \
-			   debug.o gup.o mmap_lock.o $(mmu-y)
+			   debug.o gup.o mmap_lock.o cachestat.o $(mmu-y)
 
 # Give 'page_alloc' its own module-parameter namespace
 page-alloc-y := page_alloc.o
diff --git a/mm/cachestat.c b/mm/cachestat.c
new file mode 100644
index 000000000000..193151cb0767
--- /dev/null
+++ b/mm/cachestat.c
@@ -0,0 +1,109 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/*
+ * The cachestat() system call.
+ */
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/syscalls.h>
+#include <linux/shmem_fs.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <uapi/linux/mman.h>
+
+#include "swap.h"
+
+/*
+ * The cachestat(3) system call.
+ *
+ * cachestat() returns the page cache status of a file in the
+ * bytes specified by `off` and `len`: number of cached pages,
+ * number of dirty pages, number of pages marked for writeback,
+ * number of (recently) evicted pages.
+ *
+ * If `off` + `len` >= `off`, the queried range is [`off`, `off` + `len`].
+ * Otherwise, we will query in the range from `off` to the end of the file.
+ *
+ * Because the status of a page can change after cachestat() checks it
+ * but before it returns to the application, the returned values may
+ * contain stale information.
+ *
+ * return values:
+ *  zero    - success
+ *  -EFAULT - cstat points to an illegal address
+ *  -EINVAL - invalid arguments
+ *  -EBADF	- invalid file descriptor
+ */
+SYSCALL_DEFINE4(cachestat, unsigned int, fd, off_t, off, size_t, len,
+	struct cachestat __user *, cstat)
+{
+	struct fd f;
+	struct cachestat cs;
+
+	memset(&cs, 0, sizeof(struct cachestat));
+
+	if (off < 0)
+		return -EINVAL;
+
+	if (!access_ok(cstat, sizeof(struct cachestat)))
+		return -EFAULT;
+
+	f = fdget(fd);
+	if (f.file) {
+		struct address_space *mapping = f.file->f_mapping;
+		pgoff_t first_index = off >> PAGE_SHIFT;
+		XA_STATE(xas, &mapping->i_pages, first_index);
+		struct folio *folio;
+		pgoff_t last_index = (off + len - 1) >> PAGE_SHIFT;
+
+		if (last_index < first_index)
+			last_index = ULONG_MAX;
+
+		rcu_read_lock();
+		xas_for_each(&xas, folio, last_index) {
+			if (xas_retry(&xas, folio) || !folio)
+				continue;
+
+			if (xa_is_value(folio)) {
+				/* page is evicted */
+				void *shadow;
+				bool workingset; /* not used */
+
+				cs.nr_evicted += 1;
+
+				if (shmem_mapping(mapping)) {
+					/* shmem file - in swap cache */
+					swp_entry_t swp = radix_to_swp_entry(folio);
+
+					shadow = get_shadow_from_swap_cache(swp);
+				} else {
+					/* in page cache */
+					shadow = (void *) folio;
+				}
+
+				if (workingset_test_recent(shadow, true, &workingset))
+					cs.nr_recently_evicted += 1;
+
+				continue;
+			}
+
+			/* page is in cache */
+			cs.nr_cache += 1;
+
+			if (folio_test_dirty(folio))
+				cs.nr_dirty += 1;
+
+			if (folio_test_writeback(folio))
+				cs.nr_writeback += 1;
+		}
+		rcu_read_unlock();
+		fdput(f);
+
+		if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	return -EBADF;
+}