[2/2,v2] ceph: use a xarray to record all the opened files for each inode

Message ID 20221114051901.15371-3-xiubli@redhat.com
State New
Headers
Series ceph: fix the use-after-free bug for file_lock |

Commit Message

Xiubo Li Nov. 14, 2022, 5:19 a.m. UTC
  From: Xiubo Li <xiubli@redhat.com>

When releasing the file locks the fl->fl_file memory could be
already released by another thread in filp_close(), so we couldn't
depend on fl->fl_file to get the inode. Just use a xarray to record
the opened files for each inode.

Cc: stable@vger.kernel.org
URL: https://tracker.ceph.com/issues/57986
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/file.c  |  9 +++++++++
 fs/ceph/inode.c |  4 ++++
 fs/ceph/locks.c | 17 ++++++++++++++++-
 fs/ceph/super.h |  4 ++++
 4 files changed, 33 insertions(+), 1 deletion(-)
  

Comments

kernel test robot Nov. 14, 2022, 8:54 a.m. UTC | #1
Hi,

I love your patch! Perhaps something to improve:

[auto build test WARNING on ceph-client/testing]
[also build test WARNING on ceph-client/for-linus linus/master v6.1-rc5 next-20221111]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/xiubli-redhat-com/ceph-fix-the-use-after-free-bug-for-file_lock/20221114-132233
base:   https://github.com/ceph/ceph-client.git testing
patch link:    https://lore.kernel.org/r/20221114051901.15371-3-xiubli%40redhat.com
patch subject: [PATCH 2/2 v2] ceph: use a xarray to record all the opened files for each inode
config: hexagon-randconfig-r041-20221114
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 463da45892e2d2a262277b91b96f5f8c05dc25d0)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/232cc8f1dbeddb308946202a7c67ee4d20451ae7
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review xiubli-redhat-com/ceph-fix-the-use-after-free-bug-for-file_lock/20221114-132233
        git checkout 232cc8f1dbeddb308946202a7c67ee4d20451ae7
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash fs/ceph/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from fs/ceph/locks.c:8:
   In file included from fs/ceph/super.h:8:
   In file included from include/linux/backing-dev.h:16:
   In file included from include/linux/writeback.h:13:
   In file included from include/linux/blk_types.h:10:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:12:
   In file included from include/linux/hardirq.h:11:
   In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/hexagon/include/asm/io.h:334:
   include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __raw_readb(PCI_IOBASE + addr);
                             ~~~~~~~~~~ ^
   include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
                                                           ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/little_endian.h:37:51: note: expanded from macro '__le16_to_cpu'
   #define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
                                                     ^
   In file included from fs/ceph/locks.c:8:
   In file included from fs/ceph/super.h:8:
   In file included from include/linux/backing-dev.h:16:
   In file included from include/linux/writeback.h:13:
   In file included from include/linux/blk_types.h:10:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:12:
   In file included from include/linux/hardirq.h:11:
   In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/hexagon/include/asm/io.h:334:
   include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
                                                           ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/little_endian.h:35:51: note: expanded from macro '__le32_to_cpu'
   #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
                                                     ^
   In file included from fs/ceph/locks.c:8:
   In file included from fs/ceph/super.h:8:
   In file included from include/linux/backing-dev.h:16:
   In file included from include/linux/writeback.h:13:
   In file included from include/linux/blk_types.h:10:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:12:
   In file included from include/linux/hardirq.h:11:
   In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/hexagon/include/asm/io.h:334:
   include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writeb(value, PCI_IOBASE + addr);
                               ~~~~~~~~~~ ^
   include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
                                                         ~~~~~~~~~~ ^
   include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
                                                         ~~~~~~~~~~ ^
>> fs/ceph/locks.c:66:6: warning: variable 'fi' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized]
           if (val == CEPH_FILP_AVAILABLE) {
               ^~~~~~~~~~~~~~~~~~~~~~~~~~
   fs/ceph/locks.c:79:14: note: uninitialized use occurs here
           atomic_dec(&fi->num_locks);
                       ^~
   fs/ceph/locks.c:66:2: note: remove the 'if' if its condition is always true
           if (val == CEPH_FILP_AVAILABLE) {
           ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   fs/ceph/locks.c:47:27: note: initialize the variable 'fi' to silence this warning
           struct ceph_file_info *fi;
                                    ^
                                     = NULL
   7 warnings generated.


vim +66 fs/ceph/locks.c

    42	
    43	static void ceph_fl_release_lock(struct file_lock *fl)
    44	{
    45		struct inode *inode = fl->fl_u.ceph_fl.fl_inode;
    46		struct ceph_inode_info *ci;
    47		struct ceph_file_info *fi;
    48		void *val;
    49	
    50		/*
    51		 * If inode is NULL it should be a request file_lock,
    52		 * nothing we can do.
    53		 */
    54		if (!inode)
    55			return;
    56	
    57		ci = ceph_inode(inode);
    58	
    59		/*
    60		 * For Posix-style locks, it may race between filp_close()s,
    61		 * and it's possible that the 'file' memory pointed by
    62		 * 'fl->fl_file' has been released. If so just skip it.
    63		 */
    64		rcu_read_lock();
    65		val = xa_load(&ci->i_opened_files, (unsigned long)fl->fl_file);
  > 66		if (val == CEPH_FILP_AVAILABLE) {
    67			fi = fl->fl_file->private_data;
    68			atomic_dec(&fi->num_locks);
    69		}
    70		rcu_read_unlock();
    71	
    72		if (atomic_dec_and_test(&ci->i_filelock_ref)) {
    73			/* clear error when all locks are released */
    74			spin_lock(&ci->i_ceph_lock);
    75			ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
    76			spin_unlock(&ci->i_ceph_lock);
    77		}
    78		iput(inode);
    79		atomic_dec(&fi->num_locks);
    80	}
    81
  
Xiubo Li Nov. 14, 2022, 1:08 p.m. UTC | #2
Hi

Thanks for reporting this.

I will fix it in the next version.

- Xiubo

On 14/11/2022 16:54, kernel test robot wrote:
> Hi,
>
> I love your patch! Perhaps something to improve:
>
> [auto build test WARNING on ceph-client/testing]
> [also build test WARNING on ceph-client/for-linus linus/master v6.1-rc5 next-20221111]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url:    https://github.com/intel-lab-lkp/linux/commits/xiubli-redhat-com/ceph-fix-the-use-after-free-bug-for-file_lock/20221114-132233
> base:   https://github.com/ceph/ceph-client.git testing
> patch link:    https://lore.kernel.org/r/20221114051901.15371-3-xiubli%40redhat.com
> patch subject: [PATCH 2/2 v2] ceph: use a xarray to record all the opened files for each inode
> config: hexagon-randconfig-r041-20221114
> compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 463da45892e2d2a262277b91b96f5f8c05dc25d0)
> reproduce (this is a W=1 build):
>          wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>          chmod +x ~/bin/make.cross
>          # https://github.com/intel-lab-lkp/linux/commit/232cc8f1dbeddb308946202a7c67ee4d20451ae7
>          git remote add linux-review https://github.com/intel-lab-lkp/linux
>          git fetch --no-tags linux-review xiubli-redhat-com/ceph-fix-the-use-after-free-bug-for-file_lock/20221114-132233
>          git checkout 232cc8f1dbeddb308946202a7c67ee4d20451ae7
>          # save the config file
>          mkdir build_dir && cp config build_dir/.config
>          COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash fs/ceph/
>
> If you fix the issue, kindly add following tag where applicable
> | Reported-by: kernel test robot <lkp@intel.com>
>
> All warnings (new ones prefixed by >>):
>
>     In file included from fs/ceph/locks.c:8:
>     In file included from fs/ceph/super.h:8:
>     In file included from include/linux/backing-dev.h:16:
>     In file included from include/linux/writeback.h:13:
>     In file included from include/linux/blk_types.h:10:
>     In file included from include/linux/bvec.h:10:
>     In file included from include/linux/highmem.h:12:
>     In file included from include/linux/hardirq.h:11:
>     In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
>     In file included from include/asm-generic/hardirq.h:17:
>     In file included from include/linux/irq.h:20:
>     In file included from include/linux/io.h:13:
>     In file included from arch/hexagon/include/asm/io.h:334:
>     include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
>             val = __raw_readb(PCI_IOBASE + addr);
>                               ~~~~~~~~~~ ^
>     include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
>             val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
>                                                             ~~~~~~~~~~ ^
>     include/uapi/linux/byteorder/little_endian.h:37:51: note: expanded from macro '__le16_to_cpu'
>     #define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
>                                                       ^
>     In file included from fs/ceph/locks.c:8:
>     In file included from fs/ceph/super.h:8:
>     In file included from include/linux/backing-dev.h:16:
>     In file included from include/linux/writeback.h:13:
>     In file included from include/linux/blk_types.h:10:
>     In file included from include/linux/bvec.h:10:
>     In file included from include/linux/highmem.h:12:
>     In file included from include/linux/hardirq.h:11:
>     In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
>     In file included from include/asm-generic/hardirq.h:17:
>     In file included from include/linux/irq.h:20:
>     In file included from include/linux/io.h:13:
>     In file included from arch/hexagon/include/asm/io.h:334:
>     include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
>             val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
>                                                             ~~~~~~~~~~ ^
>     include/uapi/linux/byteorder/little_endian.h:35:51: note: expanded from macro '__le32_to_cpu'
>     #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
>                                                       ^
>     In file included from fs/ceph/locks.c:8:
>     In file included from fs/ceph/super.h:8:
>     In file included from include/linux/backing-dev.h:16:
>     In file included from include/linux/writeback.h:13:
>     In file included from include/linux/blk_types.h:10:
>     In file included from include/linux/bvec.h:10:
>     In file included from include/linux/highmem.h:12:
>     In file included from include/linux/hardirq.h:11:
>     In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
>     In file included from include/asm-generic/hardirq.h:17:
>     In file included from include/linux/irq.h:20:
>     In file included from include/linux/io.h:13:
>     In file included from arch/hexagon/include/asm/io.h:334:
>     include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
>             __raw_writeb(value, PCI_IOBASE + addr);
>                                 ~~~~~~~~~~ ^
>     include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
>             __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
>                                                           ~~~~~~~~~~ ^
>     include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
>             __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
>                                                           ~~~~~~~~~~ ^
>>> fs/ceph/locks.c:66:6: warning: variable 'fi' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized]
>             if (val == CEPH_FILP_AVAILABLE) {
>                 ^~~~~~~~~~~~~~~~~~~~~~~~~~
>     fs/ceph/locks.c:79:14: note: uninitialized use occurs here
>             atomic_dec(&fi->num_locks);
>                         ^~
>     fs/ceph/locks.c:66:2: note: remove the 'if' if its condition is always true
>             if (val == CEPH_FILP_AVAILABLE) {
>             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>     fs/ceph/locks.c:47:27: note: initialize the variable 'fi' to silence this warning
>             struct ceph_file_info *fi;
>                                      ^
>                                       = NULL
>     7 warnings generated.
>
>
> vim +66 fs/ceph/locks.c
>
>      42	
>      43	static void ceph_fl_release_lock(struct file_lock *fl)
>      44	{
>      45		struct inode *inode = fl->fl_u.ceph_fl.fl_inode;
>      46		struct ceph_inode_info *ci;
>      47		struct ceph_file_info *fi;
>      48		void *val;
>      49	
>      50		/*
>      51		 * If inode is NULL it should be a request file_lock,
>      52		 * nothing we can do.
>      53		 */
>      54		if (!inode)
>      55			return;
>      56	
>      57		ci = ceph_inode(inode);
>      58	
>      59		/*
>      60		 * For Posix-style locks, it may race between filp_close()s,
>      61		 * and it's possible that the 'file' memory pointed by
>      62		 * 'fl->fl_file' has been released. If so just skip it.
>      63		 */
>      64		rcu_read_lock();
>      65		val = xa_load(&ci->i_opened_files, (unsigned long)fl->fl_file);
>    > 66		if (val == CEPH_FILP_AVAILABLE) {
>      67			fi = fl->fl_file->private_data;
>      68			atomic_dec(&fi->num_locks);
>      69		}
>      70		rcu_read_unlock();
>      71	
>      72		if (atomic_dec_and_test(&ci->i_filelock_ref)) {
>      73			/* clear error when all locks are released */
>      74			spin_lock(&ci->i_ceph_lock);
>      75			ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
>      76			spin_unlock(&ci->i_ceph_lock);
>      77		}
>      78		iput(inode);
>      79		atomic_dec(&fi->num_locks);
>      80	}
>      81	
>
  
Jeff Layton Nov. 14, 2022, 1:39 p.m. UTC | #3
On Mon, 2022-11-14 at 13:19 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> When releasing the file locks the fl->fl_file memory could be
> already released by another thread in filp_close(), so we couldn't
> depend on fl->fl_file to get the inode. Just use a xarray to record
> the opened files for each inode.
> 
> Cc: stable@vger.kernel.org
> URL: https://tracker.ceph.com/issues/57986
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/file.c  |  9 +++++++++
>  fs/ceph/inode.c |  4 ++++
>  fs/ceph/locks.c | 17 ++++++++++++++++-
>  fs/ceph/super.h |  4 ++++
>  4 files changed, 33 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 85afcbbb5648..cb4a9c52df27 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -231,6 +231,13 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>  			fi->flags |= CEPH_F_SYNC;
>  
>  		file->private_data = fi;
> +
> +		ret = xa_insert(&ci->i_opened_files, (unsigned long)file,
> +				CEPH_FILP_AVAILABLE, GFP_KERNEL);
> +		if (ret) {
> +			kmem_cache_free(ceph_file_cachep, fi);
> +			return ret;
> +		}
>  	}
>  
>  	ceph_get_fmode(ci, fmode, 1);
> @@ -932,6 +939,8 @@ int ceph_release(struct inode *inode, struct file *file)
>  		dout("release inode %p regular file %p\n", inode, file);
>  		WARN_ON(!list_empty(&fi->rw_contexts));
>  
> +		xa_erase(&ci->i_opened_files, (unsigned long)file);
> +
>  		ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
>  		ceph_put_fmode(ci, fi->fmode, 1);
>  
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 77b0cd9af370..554450838e44 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -619,6 +619,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>  	INIT_LIST_HEAD(&ci->i_unsafe_iops);
>  	spin_lock_init(&ci->i_unsafe_lock);
>  
> +	xa_init(&ci->i_opened_files);
> +
>  	ci->i_snap_realm = NULL;
>  	INIT_LIST_HEAD(&ci->i_snap_realm_item);
>  	INIT_LIST_HEAD(&ci->i_snap_flush_item);
> @@ -637,6 +639,8 @@ void ceph_free_inode(struct inode *inode)
>  {
>  	struct ceph_inode_info *ci = ceph_inode(inode);
>  
> +	xa_destroy(&ci->i_opened_files);
> +
>  	kfree(ci->i_symlink);
>  #ifdef CONFIG_FS_ENCRYPTION
>  	kfree(ci->fscrypt_auth);
> diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
> index d8385dd0076e..a176a30badd0 100644
> --- a/fs/ceph/locks.c
> +++ b/fs/ceph/locks.c
> @@ -42,9 +42,10 @@ static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
>  
>  static void ceph_fl_release_lock(struct file_lock *fl)
>  {
> -	struct ceph_file_info *fi = fl->fl_file->private_data;
>  	struct inode *inode = fl->fl_u.ceph_fl.fl_inode;
>  	struct ceph_inode_info *ci;
> +	struct ceph_file_info *fi;
> +	void *val;
>  
>  	/*
>  	 * If inode is NULL it should be a request file_lock,
> @@ -54,6 +55,20 @@ static void ceph_fl_release_lock(struct file_lock *fl)
>  		return;
>  
>  	ci = ceph_inode(inode);
> +
> +	/*
> +	 * For Posix-style locks, it may race between filp_close()s,
> +	 * and it's possible that the 'file' memory pointed by
> +	 * 'fl->fl_file' has been released. If so just skip it.
> +	 */
> +	rcu_read_lock();
> +	val = xa_load(&ci->i_opened_files, (unsigned long)fl->fl_file);
> +	if (val == CEPH_FILP_AVAILABLE) {
> +		fi = fl->fl_file->private_data;
> +		atomic_dec(&fi->num_locks);

Don't you need to remove the old atomic_dec from this function if you
move it here?

> +	}
> +	rcu_read_unlock();
> +
>  	if (atomic_dec_and_test(&ci->i_filelock_ref)) {
>  		/* clear error when all locks are released */
>  		spin_lock(&ci->i_ceph_lock);
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 7b75a84ba48d..b3e89192cbec 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -329,6 +329,8 @@ struct ceph_inode_xattrs_info {
>  	u64 version, index_version;
>  };
>  
> +#define CEPH_FILP_AVAILABLE         xa_mk_value(1)
> +
>  /*
>   * Ceph inode.
>   */
> @@ -434,6 +436,8 @@ struct ceph_inode_info {
>  	struct list_head i_unsafe_iops;   /* uncommitted mds inode ops */
>  	spinlock_t i_unsafe_lock;
>  
> +	struct xarray		i_opened_files;
> +
>  	union {
>  		struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
>  		struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */

This looks like it'll work, but it's a lot of extra work, having to
track this extra xarray just on the off chance that one of these fd's
might have file locks. The num_locks field is only checked in one place
in ceph_get_caps.

Here's what I'd recommend instead:

Have ceph_get_caps look at the lists in inode->i_flctx and see whether
any of its locks have an fl_file that matches the @filp argument in that
function. Most inodes never get any file locks, so in most cases  this
will turn out to just be a NULL pointer check for i_flctx anyway.

Then you can just remove the num_locks field and call the new helper
from ceph_get_caps instead. I'll send along a proposed patch for the
helper in a bit.
  
Xiubo Li Nov. 14, 2022, 1:46 p.m. UTC | #4
On 14/11/2022 21:39, Jeff Layton wrote:
> On Mon, 2022-11-14 at 13:19 +0800, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> When releasing the file locks the fl->fl_file memory could be
>> already released by another thread in filp_close(), so we couldn't
>> depend on fl->fl_file to get the inode. Just use a xarray to record
>> the opened files for each inode.
>>
>> Cc: stable@vger.kernel.org
>> URL: https://tracker.ceph.com/issues/57986
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/file.c  |  9 +++++++++
>>   fs/ceph/inode.c |  4 ++++
>>   fs/ceph/locks.c | 17 ++++++++++++++++-
>>   fs/ceph/super.h |  4 ++++
>>   4 files changed, 33 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
>> index 85afcbbb5648..cb4a9c52df27 100644
>> --- a/fs/ceph/file.c
>> +++ b/fs/ceph/file.c
>> @@ -231,6 +231,13 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>>   			fi->flags |= CEPH_F_SYNC;
>>   
>>   		file->private_data = fi;
>> +
>> +		ret = xa_insert(&ci->i_opened_files, (unsigned long)file,
>> +				CEPH_FILP_AVAILABLE, GFP_KERNEL);
>> +		if (ret) {
>> +			kmem_cache_free(ceph_file_cachep, fi);
>> +			return ret;
>> +		}
>>   	}
>>   
>>   	ceph_get_fmode(ci, fmode, 1);
>> @@ -932,6 +939,8 @@ int ceph_release(struct inode *inode, struct file *file)
>>   		dout("release inode %p regular file %p\n", inode, file);
>>   		WARN_ON(!list_empty(&fi->rw_contexts));
>>   
>> +		xa_erase(&ci->i_opened_files, (unsigned long)file);
>> +
>>   		ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
>>   		ceph_put_fmode(ci, fi->fmode, 1);
>>   
>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
>> index 77b0cd9af370..554450838e44 100644
>> --- a/fs/ceph/inode.c
>> +++ b/fs/ceph/inode.c
>> @@ -619,6 +619,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>>   	INIT_LIST_HEAD(&ci->i_unsafe_iops);
>>   	spin_lock_init(&ci->i_unsafe_lock);
>>   
>> +	xa_init(&ci->i_opened_files);
>> +
>>   	ci->i_snap_realm = NULL;
>>   	INIT_LIST_HEAD(&ci->i_snap_realm_item);
>>   	INIT_LIST_HEAD(&ci->i_snap_flush_item);
>> @@ -637,6 +639,8 @@ void ceph_free_inode(struct inode *inode)
>>   {
>>   	struct ceph_inode_info *ci = ceph_inode(inode);
>>   
>> +	xa_destroy(&ci->i_opened_files);
>> +
>>   	kfree(ci->i_symlink);
>>   #ifdef CONFIG_FS_ENCRYPTION
>>   	kfree(ci->fscrypt_auth);
>> diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
>> index d8385dd0076e..a176a30badd0 100644
>> --- a/fs/ceph/locks.c
>> +++ b/fs/ceph/locks.c
>> @@ -42,9 +42,10 @@ static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
>>   
>>   static void ceph_fl_release_lock(struct file_lock *fl)
>>   {
>> -	struct ceph_file_info *fi = fl->fl_file->private_data;
>>   	struct inode *inode = fl->fl_u.ceph_fl.fl_inode;
>>   	struct ceph_inode_info *ci;
>> +	struct ceph_file_info *fi;
>> +	void *val;
>>   
>>   	/*
>>   	 * If inode is NULL it should be a request file_lock,
>> @@ -54,6 +55,20 @@ static void ceph_fl_release_lock(struct file_lock *fl)
>>   		return;
>>   
>>   	ci = ceph_inode(inode);
>> +
>> +	/*
>> +	 * For Posix-style locks, it may race between filp_close()s,
>> +	 * and it's possible that the 'file' memory pointed by
>> +	 * 'fl->fl_file' has been released. If so just skip it.
>> +	 */
>> +	rcu_read_lock();
>> +	val = xa_load(&ci->i_opened_files, (unsigned long)fl->fl_file);
>> +	if (val == CEPH_FILP_AVAILABLE) {
>> +		fi = fl->fl_file->private_data;
>> +		atomic_dec(&fi->num_locks);
> Don't you need to remove the old atomic_dec from this function if you
> move it here?

Yeah, I thought I have removed that. Not sure why I added it back.

>
>> +	}
>> +	rcu_read_unlock();
>> +
>>   	if (atomic_dec_and_test(&ci->i_filelock_ref)) {
>>   		/* clear error when all locks are released */
>>   		spin_lock(&ci->i_ceph_lock);
>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> index 7b75a84ba48d..b3e89192cbec 100644
>> --- a/fs/ceph/super.h
>> +++ b/fs/ceph/super.h
>> @@ -329,6 +329,8 @@ struct ceph_inode_xattrs_info {
>>   	u64 version, index_version;
>>   };
>>   
>> +#define CEPH_FILP_AVAILABLE         xa_mk_value(1)
>> +
>>   /*
>>    * Ceph inode.
>>    */
>> @@ -434,6 +436,8 @@ struct ceph_inode_info {
>>   	struct list_head i_unsafe_iops;   /* uncommitted mds inode ops */
>>   	spinlock_t i_unsafe_lock;
>>   
>> +	struct xarray		i_opened_files;
>> +
>>   	union {
>>   		struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
>>   		struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
> This looks like it'll work, but it's a lot of extra work, having to
> track this extra xarray just on the off chance that one of these fd's
> might have file locks. The num_locks field is only checked in one place
> in ceph_get_caps.
>
> Here's what I'd recommend instead:
>
> Have ceph_get_caps look at the lists in inode->i_flctx and see whether
> any of its locks have an fl_file that matches the @filp argument in that
> function. Most inodes never get any file locks, so in most cases  this
> will turn out to just be a NULL pointer check for i_flctx anyway.
>
> Then you can just remove the num_locks field and call the new helper
> from ceph_get_caps instead. I'll send along a proposed patch for the
> helper in a bit.

Sure, Thanks Jeff.

- Xiubo
  

Patch

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 85afcbbb5648..cb4a9c52df27 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -231,6 +231,13 @@  static int ceph_init_file_info(struct inode *inode, struct file *file,
 			fi->flags |= CEPH_F_SYNC;
 
 		file->private_data = fi;
+
+		ret = xa_insert(&ci->i_opened_files, (unsigned long)file,
+				CEPH_FILP_AVAILABLE, GFP_KERNEL);
+		if (ret) {
+			kmem_cache_free(ceph_file_cachep, fi);
+			return ret;
+		}
 	}
 
 	ceph_get_fmode(ci, fmode, 1);
@@ -932,6 +939,8 @@  int ceph_release(struct inode *inode, struct file *file)
 		dout("release inode %p regular file %p\n", inode, file);
 		WARN_ON(!list_empty(&fi->rw_contexts));
 
+		xa_erase(&ci->i_opened_files, (unsigned long)file);
+
 		ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
 		ceph_put_fmode(ci, fi->fmode, 1);
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 77b0cd9af370..554450838e44 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -619,6 +619,8 @@  struct inode *ceph_alloc_inode(struct super_block *sb)
 	INIT_LIST_HEAD(&ci->i_unsafe_iops);
 	spin_lock_init(&ci->i_unsafe_lock);
 
+	xa_init(&ci->i_opened_files);
+
 	ci->i_snap_realm = NULL;
 	INIT_LIST_HEAD(&ci->i_snap_realm_item);
 	INIT_LIST_HEAD(&ci->i_snap_flush_item);
@@ -637,6 +639,8 @@  void ceph_free_inode(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 
+	xa_destroy(&ci->i_opened_files);
+
 	kfree(ci->i_symlink);
 #ifdef CONFIG_FS_ENCRYPTION
 	kfree(ci->fscrypt_auth);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index d8385dd0076e..a176a30badd0 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -42,9 +42,10 @@  static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
 
 static void ceph_fl_release_lock(struct file_lock *fl)
 {
-	struct ceph_file_info *fi = fl->fl_file->private_data;
 	struct inode *inode = fl->fl_u.ceph_fl.fl_inode;
 	struct ceph_inode_info *ci;
+	struct ceph_file_info *fi;
+	void *val;
 
 	/*
 	 * If inode is NULL it should be a request file_lock,
@@ -54,6 +55,20 @@  static void ceph_fl_release_lock(struct file_lock *fl)
 		return;
 
 	ci = ceph_inode(inode);
+
+	/*
+	 * For Posix-style locks, it may race between filp_close()s,
+	 * and it's possible that the 'file' memory pointed by
+	 * 'fl->fl_file' has been released. If so just skip it.
+	 */
+	rcu_read_lock();
+	val = xa_load(&ci->i_opened_files, (unsigned long)fl->fl_file);
+	if (val == CEPH_FILP_AVAILABLE) {
+		fi = fl->fl_file->private_data;
+		atomic_dec(&fi->num_locks);
+	}
+	rcu_read_unlock();
+
 	if (atomic_dec_and_test(&ci->i_filelock_ref)) {
 		/* clear error when all locks are released */
 		spin_lock(&ci->i_ceph_lock);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7b75a84ba48d..b3e89192cbec 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -329,6 +329,8 @@  struct ceph_inode_xattrs_info {
 	u64 version, index_version;
 };
 
+#define CEPH_FILP_AVAILABLE         xa_mk_value(1)
+
 /*
  * Ceph inode.
  */
@@ -434,6 +436,8 @@  struct ceph_inode_info {
 	struct list_head i_unsafe_iops;   /* uncommitted mds inode ops */
 	spinlock_t i_unsafe_lock;
 
+	struct xarray		i_opened_files;
+
 	union {
 		struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
 		struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */