[RFC] HACK: overlayfs: Optimize overlay/restore creds

Message ID 20231214220222.348101-1-vinicius.gomes@intel.com
State New
Headers
Series [RFC] HACK: overlayfs: Optimize overlay/restore creds |

Commit Message

Vinicius Costa Gomes Dec. 14, 2023, 10:02 p.m. UTC
  Permission checks in overlayfs also check against the credentials
associated with the superblock, which are assigned at mount() time, so
pretty long lived. So, we can omit the reference counting for this
case.

This (very early) proof of concept does two things:

Add a flag "immutable" (TODO: find a better name) to struct cred to
indicate that it is long lived, and that refcount can be omitted.

Add "guard" helpers, so we can use automatic cleanup to be sure
override/restore are always paired. (I dodn't like that I have
'ovl_cred' to be a container for the credentials, but couldn't think
of other solutions)

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
Hi Amir,

Just to know if I am more or less on right track.

This is a different attempt, instead of the local copy idea, I am
using the fact that the credentials associated with the mount() will
be alive for a long time. I think the result is almost the same. But I
could be missing something.

TODO:
 - Add asserts.
 - Replace ovl_override_creds()/revert_Creds() by
   ovl_creator_cred()/guard() everywhere.
 - Probably more.  


 fs/overlayfs/inode.c     |  7 ++++---
 fs/overlayfs/overlayfs.h | 18 ++++++++++++++++++
 fs/overlayfs/params.c    |  4 +++-
 fs/overlayfs/super.c     | 10 +++++++---
 fs/overlayfs/util.c      | 10 ++++++++++
 include/linux/cred.h     | 12 ++++++++++--
 6 files changed, 52 insertions(+), 9 deletions(-)
  

Comments

Amir Goldstein Dec. 15, 2023, 10:30 a.m. UTC | #1
+fsdevel because this may be relevant to any subsystem that
keeps a long live cred copy (e.g. nfsd, ksmbd, cachefiles).

+linus who wrote
d7852fbd0f04 access: avoid the RCU grace period for the temporary
subjective credentials


On Fri, Dec 15, 2023 at 12:02 AM Vinicius Costa Gomes
<vinicius.gomes@intel.com> wrote:
>
> Permission checks in overlayfs also check against the credentials
> associated with the superblock, which are assigned at mount() time, so
> pretty long lived. So, we can omit the reference counting for this
> case.

You forgot to mention WHY you are proposing this and to link to the
original report with the first optimization attempt:

https://lore.kernel.org/linux-unionfs/20231018074553.41333-1-hu1.chen@intel.com/

>
> This (very early) proof of concept does two things:
>
> Add a flag "immutable" (TODO: find a better name) to struct cred to
> indicate that it is long lived, and that refcount can be omitted.
>

This reminds me of the many discussions about Rust abstractions
that are going on right now.
I think an abstraction like this one is called a "borrowed reference".

> Add "guard" helpers, so we can use automatic cleanup to be sure
> override/restore are always paired. (I didn't like that I have
> 'ovl_cred' to be a container for the credentials, but couldn't think
> of other solutions)
>

I like the guard but see comments below...

> Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> ---
> Hi Amir,
>
> Just to know if I am more or less on right track.
>
> This is a different attempt, instead of the local copy idea, I am
> using the fact that the credentials associated with the mount() will
> be alive for a long time. I think the result is almost the same. But I
> could be missing something.
>
> TODO:
>  - Add asserts.
>  - Replace ovl_override_creds()/revert_Creds() by
>    ovl_creator_cred()/guard() everywhere.
>  - Probably more.
>
>
>  fs/overlayfs/inode.c     |  7 ++++---
>  fs/overlayfs/overlayfs.h | 18 ++++++++++++++++++
>  fs/overlayfs/params.c    |  4 +++-
>  fs/overlayfs/super.c     | 10 +++++++---
>  fs/overlayfs/util.c      | 10 ++++++++++
>  include/linux/cred.h     | 12 ++++++++++--
>  6 files changed, 52 insertions(+), 9 deletions(-)
>
> diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
> index c63b31a460be..2c016a3bbe2d 100644
> --- a/fs/overlayfs/inode.c
> +++ b/fs/overlayfs/inode.c
> @@ -290,9 +290,9 @@ int ovl_permission(struct mnt_idmap *idmap,
>                    struct inode *inode, int mask)
>  {
>         struct inode *upperinode = ovl_inode_upper(inode);
> +       struct ovl_cred ovl_cred;
>         struct inode *realinode;
>         struct path realpath;
> -       const struct cred *old_cred;

Nit: please don't reorder the variable definitions.

>         int err;
>
>         /* Careful in RCU walk mode */
> @@ -310,7 +310,9 @@ int ovl_permission(struct mnt_idmap *idmap,
>         if (err)
>                 return err;
>
> -       old_cred = ovl_override_creds(inode->i_sb);
> +       ovl_cred = ovl_creator_cred(inode->i_sb);
> +       guard(ovl_creds)(&ovl_cred);
> +
>         if (!upperinode &&
>             !special_file(realinode->i_mode) && mask & MAY_WRITE) {
>                 mask &= ~(MAY_WRITE | MAY_APPEND);
> @@ -318,7 +320,6 @@ int ovl_permission(struct mnt_idmap *idmap,
>                 mask |= MAY_READ;
>         }
>         err = inode_permission(mnt_idmap(realpath.mnt), realinode, mask);
> -       revert_creds(old_cred);
>
>         return err;
>  }
> diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
> index 05c3dd597fa8..22ea3066376e 100644
> --- a/fs/overlayfs/overlayfs.h
> +++ b/fs/overlayfs/overlayfs.h
> @@ -416,6 +416,24 @@ static inline int ovl_do_getattr(const struct path *path, struct kstat *stat,
>         return vfs_getattr(path, stat, request_mask, flags);
>  }
>
> +struct ovl_cred {
> +       const struct cred *cred;
> +};
> +
> +static inline struct ovl_cred ovl_creator_cred(struct super_block *sb)
> +{
> +       struct ovl_fs *ofs = OVL_FS(sb);
> +
> +       return (struct ovl_cred) { .cred = ofs->creator_cred };
> +}
> +
> +void ovl_override_creds_new(struct ovl_cred *creator_cred);
> +void ovl_revert_creds_new(struct ovl_cred *creator_cred);
> +
> +DEFINE_GUARD(ovl_creds, struct ovl_cred *,
> +            ovl_override_creds_new(_T),
> +            ovl_revert_creds_new(_T));
> +

This pattern is not unique to overlayfs.
It is probably better to define a common container type struct override_cred
in cred.h/cred.c that other code could also use.

>  /* util.c */
>  int ovl_get_write_access(struct dentry *dentry);
>  void ovl_put_write_access(struct dentry *dentry);
> diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
> index 3fe2dde1598f..008377b9241a 100644
> --- a/fs/overlayfs/params.c
> +++ b/fs/overlayfs/params.c
> @@ -770,8 +770,10 @@ void ovl_free_fs(struct ovl_fs *ofs)
>         kfree(ofs->config.lowerdirs);
>         kfree(ofs->config.upperdir);
>         kfree(ofs->config.workdir);
> -       if (ofs->creator_cred)
> +       if (ofs->creator_cred) {
> +               cred_set_immutable(ofs->creator_cred, false);
>                 put_cred(ofs->creator_cred);

Not happy about this API.

Two solutions I can think of:
1. (my preference) keep two copies of creator_cred, one refcounted copy
    and one non-refcounted that is used for override_creds()
2. put_cred_ref() which explicitly opts-in to dropping refcount on
    a borrowed reference, same as you do above but hidden behind
    a properly documented helper

> +       }
>         kfree(ofs);
>  }
>
> diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
> index a0967bb25003..1ffb4f0f8186 100644
> --- a/fs/overlayfs/super.c
> +++ b/fs/overlayfs/super.c
> @@ -1304,6 +1304,13 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
>         if (!cred)
>                 goto out_err;
>
> +       /* Never override disk quota limits or use reserved space */
> +       cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
> +       /* The cred that is going to be associated with the super
> +        * block will not change.
> +        */
> +       cred_set_immutable(cred, true);
> +

Likewise, either:
1. Create a non-refcounted copy of creator_cred
or
2. Use a documented helper prepare_creds_ref() to hide
    this implementation detail

>         err = ovl_fs_params_verify(ctx, &ofs->config);
>         if (err)
>                 goto out_err;
> @@ -1438,9 +1445,6 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
>         else if (!ofs->nofh)
>                 sb->s_export_op = &ovl_export_fid_operations;
>
> -       /* Never override disk quota limits or use reserved space */
> -       cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
> -
>         sb->s_magic = OVERLAYFS_SUPER_MAGIC;
>         sb->s_xattr = ovl_xattr_handlers(ofs);
>         sb->s_fs_info = ofs;
> diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
> index c3f020ca13a8..9ae9a35a6a7a 100644
> --- a/fs/overlayfs/util.c
> +++ b/fs/overlayfs/util.c
> @@ -68,6 +68,16 @@ const struct cred *ovl_override_creds(struct super_block *sb)
>         return override_creds(ofs->creator_cred);
>  }
>
> +void ovl_override_creds_new(struct ovl_cred *creator_cred)
> +{
> +       creator_cred->cred = override_creds(creator_cred->cred);
> +}
> +
> +void ovl_revert_creds_new(struct ovl_cred *creator_cred)
> +{
> +       revert_creds(creator_cred->cred);
> +}

Would look nicer in this generic form, no?

void override_cred_save(struct override_cred *override)
{
       override->cred = override_creds(override->cred);
}

void override_cred_restore(struct override_cred *old)
{
       revert_creds(old->cred);
}

Which reminds me that memalloc_*_{save,restore} are good
candidates for defining a guard.

> +
>  /*
>   * Check if underlying fs supports file handles and try to determine encoding
>   * type, in order to deduce maximum inode number used by fs.
> diff --git a/include/linux/cred.h b/include/linux/cred.h
> index af8d353a4b86..06eaedfe48ea 100644
> --- a/include/linux/cred.h
> +++ b/include/linux/cred.h
> @@ -151,6 +151,7 @@ struct cred {
>                 int non_rcu;                    /* Can we skip RCU deletion? */
>                 struct rcu_head rcu;            /* RCU deletion hook */
>         };
> +       bool    immutable;
>  } __randomize_layout;
>

If we choose the design that the immutable/non-refcount property
is a const property and we need to create a copy of struct cred
whenever we want to use a non-refcounted copy, then we could
store this in the union because RCU deletion is also not needed for
non-refcounted copy:

        struct {
            int non_refcount:1;              /* A borrowed reference? */
            int non_rcu:1;                      /* Can we skip RCU deletion? */
        };
        struct rcu_head rcu;            /* RCU deletion hook */
    };

>  extern void __put_cred(struct cred *);
> @@ -229,7 +230,8 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
>   */
>  static inline struct cred *get_new_cred_many(struct cred *cred, int nr)
>  {
> -       atomic_add(nr, &cred->usage);
> +       if (!cred->immutable)
> +               atomic_add(nr, &cred->usage);
>         return cred;
>  }
>
> @@ -245,6 +247,12 @@ static inline struct cred *get_new_cred(struct cred *cred)
>         return get_new_cred_many(cred, 1);
>  }
>
> +static inline void cred_set_immutable(const struct cred *cred, bool imm)
> +{
> +       struct cred *nonconst_cred = (struct cred *) cred;
> +       nonconst_cred->immutable = imm;
> +}
> +
>  /**
>   * get_cred_many - Get references on a set of credentials
>   * @cred: The credentials to reference
> @@ -313,7 +321,7 @@ static inline void put_cred_many(const struct cred *_cred, int nr)
>
>         if (cred) {
>                 validate_creds(cred);
> -               if (atomic_sub_and_test(nr, &cred->usage))
> +               if (!cred->immutable && atomic_sub_and_test(nr, &cred->usage))
>                         __put_cred(cred);
>         }
>  }
> --
> 2.43.0
>

Thanks,
Amir.
  
Vinicius Costa Gomes Dec. 15, 2023, 8 p.m. UTC | #2
Hi Amir,

Amir Goldstein <amir73il@gmail.com> writes:

> +fsdevel because this may be relevant to any subsystem that
> keeps a long live cred copy (e.g. nfsd, ksmbd, cachefiles).
>
> +linus who wrote
> d7852fbd0f04 access: avoid the RCU grace period for the temporary
> subjective credentials
>
>
> On Fri, Dec 15, 2023 at 12:02 AM Vinicius Costa Gomes
> <vinicius.gomes@intel.com> wrote:
>>
>> Permission checks in overlayfs also check against the credentials
>> associated with the superblock, which are assigned at mount() time, so
>> pretty long lived. So, we can omit the reference counting for this
>> case.
>
> You forgot to mention WHY you are proposing this and to link to the
> original report with the first optimization attempt:
>
> https://lore.kernel.org/linux-unionfs/20231018074553.41333-1-hu1.chen@intel.com/
>

I thought that the "in-reply-to" would do that, I should have been more
explicit on the context. Sorry.

>>
>> This (very early) proof of concept does two things:
>>
>> Add a flag "immutable" (TODO: find a better name) to struct cred to
>> indicate that it is long lived, and that refcount can be omitted.
>>
>
> This reminds me of the many discussions about Rust abstractions
> that are going on right now.
> I think an abstraction like this one is called a "borrowed reference".
>

Yeah, very similar to a borrow in rust.

>> Add "guard" helpers, so we can use automatic cleanup to be sure
>> override/restore are always paired. (I didn't like that I have
>> 'ovl_cred' to be a container for the credentials, but couldn't think
>> of other solutions)
>>
>
> I like the guard but see comments below...
>
>> Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
>> ---
>> Hi Amir,
>>
>> Just to know if I am more or less on right track.
>>
>> This is a different attempt, instead of the local copy idea, I am
>> using the fact that the credentials associated with the mount() will
>> be alive for a long time. I think the result is almost the same. But I
>> could be missing something.
>>
>> TODO:
>>  - Add asserts.
>>  - Replace ovl_override_creds()/revert_Creds() by
>>    ovl_creator_cred()/guard() everywhere.
>>  - Probably more.
>>
>>
>>  fs/overlayfs/inode.c     |  7 ++++---
>>  fs/overlayfs/overlayfs.h | 18 ++++++++++++++++++
>>  fs/overlayfs/params.c    |  4 +++-
>>  fs/overlayfs/super.c     | 10 +++++++---
>>  fs/overlayfs/util.c      | 10 ++++++++++
>>  include/linux/cred.h     | 12 ++++++++++--
>>  6 files changed, 52 insertions(+), 9 deletions(-)
>>
>> diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
>> index c63b31a460be..2c016a3bbe2d 100644
>> --- a/fs/overlayfs/inode.c
>> +++ b/fs/overlayfs/inode.c
>> @@ -290,9 +290,9 @@ int ovl_permission(struct mnt_idmap *idmap,
>>                    struct inode *inode, int mask)
>>  {
>>         struct inode *upperinode = ovl_inode_upper(inode);
>> +       struct ovl_cred ovl_cred;
>>         struct inode *realinode;
>>         struct path realpath;
>> -       const struct cred *old_cred;
>
> Nit: please don't reorder the variable definitions.
>

Sorry about that. "bad" habits from the networking side :-)

>>         int err;
>>
>>         /* Careful in RCU walk mode */
>> @@ -310,7 +310,9 @@ int ovl_permission(struct mnt_idmap *idmap,
>>         if (err)
>>                 return err;
>>
>> -       old_cred = ovl_override_creds(inode->i_sb);
>> +       ovl_cred = ovl_creator_cred(inode->i_sb);
>> +       guard(ovl_creds)(&ovl_cred);
>> +
>>         if (!upperinode &&
>>             !special_file(realinode->i_mode) && mask & MAY_WRITE) {
>>                 mask &= ~(MAY_WRITE | MAY_APPEND);
>> @@ -318,7 +320,6 @@ int ovl_permission(struct mnt_idmap *idmap,
>>                 mask |= MAY_READ;
>>         }
>>         err = inode_permission(mnt_idmap(realpath.mnt), realinode, mask);
>> -       revert_creds(old_cred);
>>
>>         return err;
>>  }
>> diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
>> index 05c3dd597fa8..22ea3066376e 100644
>> --- a/fs/overlayfs/overlayfs.h
>> +++ b/fs/overlayfs/overlayfs.h
>> @@ -416,6 +416,24 @@ static inline int ovl_do_getattr(const struct path *path, struct kstat *stat,
>>         return vfs_getattr(path, stat, request_mask, flags);
>>  }
>>
>> +struct ovl_cred {
>> +       const struct cred *cred;
>> +};
>> +
>> +static inline struct ovl_cred ovl_creator_cred(struct super_block *sb)
>> +{
>> +       struct ovl_fs *ofs = OVL_FS(sb);
>> +
>> +       return (struct ovl_cred) { .cred = ofs->creator_cred };
>> +}
>> +
>> +void ovl_override_creds_new(struct ovl_cred *creator_cred);
>> +void ovl_revert_creds_new(struct ovl_cred *creator_cred);
>> +
>> +DEFINE_GUARD(ovl_creds, struct ovl_cred *,
>> +            ovl_override_creds_new(_T),
>> +            ovl_revert_creds_new(_T));
>> +
>
> This pattern is not unique to overlayfs.
> It is probably better to define a common container type struct override_cred
> in cred.h/cred.c that other code could also use.
>

Good idea.

>>  /* util.c */
>>  int ovl_get_write_access(struct dentry *dentry);
>>  void ovl_put_write_access(struct dentry *dentry);
>> diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
>> index 3fe2dde1598f..008377b9241a 100644
>> --- a/fs/overlayfs/params.c
>> +++ b/fs/overlayfs/params.c
>> @@ -770,8 +770,10 @@ void ovl_free_fs(struct ovl_fs *ofs)
>>         kfree(ofs->config.lowerdirs);
>>         kfree(ofs->config.upperdir);
>>         kfree(ofs->config.workdir);
>> -       if (ofs->creator_cred)
>> +       if (ofs->creator_cred) {
>> +               cred_set_immutable(ofs->creator_cred, false);
>>                 put_cred(ofs->creator_cred);
>
> Not happy about this API.
>
> Two solutions I can think of:
> 1. (my preference) keep two copies of creator_cred, one refcounted copy
>     and one non-refcounted that is used for override_creds()
> 2. put_cred_ref() which explicitly opts-in to dropping refcount on
>     a borrowed reference, same as you do above but hidden behind
>     a properly documented helper
>

Probably because I already have option (2) more or less understood, but
I think that having a single creator_cred marked as
"non-refcounted/long-lived" is simpler than having two copies, even the
the extra copy only exists for the duration of the override.

But it could be that I still can't imagine what you have in mind about
(1).

>> +       }
>>         kfree(ofs);
>>  }
>>
>> diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
>> index a0967bb25003..1ffb4f0f8186 100644
>> --- a/fs/overlayfs/super.c
>> +++ b/fs/overlayfs/super.c
>> @@ -1304,6 +1304,13 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
>>         if (!cred)
>>                 goto out_err;
>>
>> +       /* Never override disk quota limits or use reserved space */
>> +       cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
>> +       /* The cred that is going to be associated with the super
>> +        * block will not change.
>> +        */
>> +       cred_set_immutable(cred, true);
>> +
>
> Likewise, either:
> 1. Create a non-refcounted copy of creator_cred

Ah! I think now I see what you meant. Not sure I like, I think it's a
bit too error prone, it's hard to enforce that the copies would be kept
in sync in general. (even if in practice the only thing that would need
to be kept in sync is the destruction of both, at least for now).

> or
> 2. Use a documented helper prepare_creds_ref() to hide
>     this implementation detail

This I like more, having the properties documented in the constructor.
And much better than my _set_immutable().

>
>>         err = ovl_fs_params_verify(ctx, &ofs->config);
>>         if (err)
>>                 goto out_err;
>> @@ -1438,9 +1445,6 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
>>         else if (!ofs->nofh)
>>                 sb->s_export_op = &ovl_export_fid_operations;
>>
>> -       /* Never override disk quota limits or use reserved space */
>> -       cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
>> -
>>         sb->s_magic = OVERLAYFS_SUPER_MAGIC;
>>         sb->s_xattr = ovl_xattr_handlers(ofs);
>>         sb->s_fs_info = ofs;
>> diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
>> index c3f020ca13a8..9ae9a35a6a7a 100644
>> --- a/fs/overlayfs/util.c
>> +++ b/fs/overlayfs/util.c
>> @@ -68,6 +68,16 @@ const struct cred *ovl_override_creds(struct super_block *sb)
>>         return override_creds(ofs->creator_cred);
>>  }
>>
>> +void ovl_override_creds_new(struct ovl_cred *creator_cred)
>> +{
>> +       creator_cred->cred = override_creds(creator_cred->cred);
>> +}
>> +
>> +void ovl_revert_creds_new(struct ovl_cred *creator_cred)
>> +{
>> +       revert_creds(creator_cred->cred);
>> +}
>
> Would look nicer in this generic form, no?
>
> void override_cred_save(struct override_cred *override)
> {
>        override->cred = override_creds(override->cred);
> }
>
> void override_cred_restore(struct override_cred *old)
> {
>        revert_creds(old->cred);
> }
>

Much nicer :-)

> Which reminds me that memalloc_*_{save,restore} are good
> candidates for defining a guard.
>
>> +
>>  /*
>>   * Check if underlying fs supports file handles and try to determine encoding
>>   * type, in order to deduce maximum inode number used by fs.
>> diff --git a/include/linux/cred.h b/include/linux/cred.h
>> index af8d353a4b86..06eaedfe48ea 100644
>> --- a/include/linux/cred.h
>> +++ b/include/linux/cred.h
>> @@ -151,6 +151,7 @@ struct cred {
>>                 int non_rcu;                    /* Can we skip RCU deletion? */
>>                 struct rcu_head rcu;            /* RCU deletion hook */
>>         };
>> +       bool    immutable;
>>  } __randomize_layout;
>>
>
> If we choose the design that the immutable/non-refcount property
> is a const property and we need to create a copy of struct cred
> whenever we want to use a non-refcounted copy, then we could
> store this in the union because RCU deletion is also not needed for
> non-refcounted copy:
>
>         struct {
>             int non_refcount:1;              /* A borrowed reference? */
>             int non_rcu:1;                      /* Can we skip RCU deletion? */
>         };
>         struct rcu_head rcu;            /* RCU deletion hook */
>     };
>

Ah! Now it makes sense. Thank you.

If you, or any others of course, don't have objections against option
(2), I think I am going to play with it a bit and see how it goes.


Cheers,
  
Amir Goldstein Dec. 16, 2023, 10:16 a.m. UTC | #3
On Fri, Dec 15, 2023 at 10:00 PM Vinicius Costa Gomes
<vinicius.gomes@intel.com> wrote:
>
> Hi Amir,
>
> Amir Goldstein <amir73il@gmail.com> writes:
>
> > +fsdevel because this may be relevant to any subsystem that
> > keeps a long live cred copy (e.g. nfsd, ksmbd, cachefiles).
> >
> > +linus who wrote
> > d7852fbd0f04 access: avoid the RCU grace period for the temporary
> > subjective credentials
> >
> >
> > On Fri, Dec 15, 2023 at 12:02 AM Vinicius Costa Gomes
> > <vinicius.gomes@intel.com> wrote:
> >>
> >> Permission checks in overlayfs also check against the credentials
> >> associated with the superblock, which are assigned at mount() time, so
> >> pretty long lived. So, we can omit the reference counting for this
> >> case.
> >
> > You forgot to mention WHY you are proposing this and to link to the
> > original report with the first optimization attempt:
> >
> > https://lore.kernel.org/linux-unionfs/20231018074553.41333-1-hu1.chen@intel.com/
> >
>
> I thought that the "in-reply-to" would do that, I should have been more
> explicit on the context. Sorry.
>
> >>
> >> This (very early) proof of concept does two things:
> >>
> >> Add a flag "immutable" (TODO: find a better name) to struct cred to
> >> indicate that it is long lived, and that refcount can be omitted.
> >>
> >
> > This reminds me of the many discussions about Rust abstractions
> > that are going on right now.
> > I think an abstraction like this one is called a "borrowed reference".
> >
>
> Yeah, very similar to a borrow in rust.
>
> >> Add "guard" helpers, so we can use automatic cleanup to be sure
> >> override/restore are always paired. (I didn't like that I have
> >> 'ovl_cred' to be a container for the credentials, but couldn't think
> >> of other solutions)
> >>
> >
> > I like the guard but see comments below...
> >
> >> Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> >> ---
> >> Hi Amir,
> >>
> >> Just to know if I am more or less on right track.
> >>
> >> This is a different attempt, instead of the local copy idea, I am
> >> using the fact that the credentials associated with the mount() will
> >> be alive for a long time. I think the result is almost the same. But I
> >> could be missing something.
> >>
> >> TODO:
> >>  - Add asserts.
> >>  - Replace ovl_override_creds()/revert_Creds() by
> >>    ovl_creator_cred()/guard() everywhere.
> >>  - Probably more.
> >>
> >>
> >>  fs/overlayfs/inode.c     |  7 ++++---
> >>  fs/overlayfs/overlayfs.h | 18 ++++++++++++++++++
> >>  fs/overlayfs/params.c    |  4 +++-
> >>  fs/overlayfs/super.c     | 10 +++++++---
> >>  fs/overlayfs/util.c      | 10 ++++++++++
> >>  include/linux/cred.h     | 12 ++++++++++--
> >>  6 files changed, 52 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
> >> index c63b31a460be..2c016a3bbe2d 100644
> >> --- a/fs/overlayfs/inode.c
> >> +++ b/fs/overlayfs/inode.c
> >> @@ -290,9 +290,9 @@ int ovl_permission(struct mnt_idmap *idmap,
> >>                    struct inode *inode, int mask)
> >>  {
> >>         struct inode *upperinode = ovl_inode_upper(inode);
> >> +       struct ovl_cred ovl_cred;
> >>         struct inode *realinode;
> >>         struct path realpath;
> >> -       const struct cred *old_cred;
> >
> > Nit: please don't reorder the variable definitions.
> >
>
> Sorry about that. "bad" habits from the networking side :-)
>

I fully support all forms and shapes of OCD ;-)

> >>         int err;
> >>
> >>         /* Careful in RCU walk mode */
> >> @@ -310,7 +310,9 @@ int ovl_permission(struct mnt_idmap *idmap,
> >>         if (err)
> >>                 return err;
> >>
> >> -       old_cred = ovl_override_creds(inode->i_sb);
> >> +       ovl_cred = ovl_creator_cred(inode->i_sb);
> >> +       guard(ovl_creds)(&ovl_cred);
> >> +
> >>         if (!upperinode &&
> >>             !special_file(realinode->i_mode) && mask & MAY_WRITE) {
> >>                 mask &= ~(MAY_WRITE | MAY_APPEND);
> >> @@ -318,7 +320,6 @@ int ovl_permission(struct mnt_idmap *idmap,
> >>                 mask |= MAY_READ;
> >>         }
> >>         err = inode_permission(mnt_idmap(realpath.mnt), realinode, mask);
> >> -       revert_creds(old_cred);
> >>
> >>         return err;
> >>  }
> >> diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
> >> index 05c3dd597fa8..22ea3066376e 100644
> >> --- a/fs/overlayfs/overlayfs.h
> >> +++ b/fs/overlayfs/overlayfs.h
> >> @@ -416,6 +416,24 @@ static inline int ovl_do_getattr(const struct path *path, struct kstat *stat,
> >>         return vfs_getattr(path, stat, request_mask, flags);
> >>  }
> >>
> >> +struct ovl_cred {
> >> +       const struct cred *cred;
> >> +};
> >> +
> >> +static inline struct ovl_cred ovl_creator_cred(struct super_block *sb)
> >> +{
> >> +       struct ovl_fs *ofs = OVL_FS(sb);
> >> +
> >> +       return (struct ovl_cred) { .cred = ofs->creator_cred };
> >> +}
> >> +
> >> +void ovl_override_creds_new(struct ovl_cred *creator_cred);
> >> +void ovl_revert_creds_new(struct ovl_cred *creator_cred);
> >> +
> >> +DEFINE_GUARD(ovl_creds, struct ovl_cred *,
> >> +            ovl_override_creds_new(_T),
> >> +            ovl_revert_creds_new(_T));
> >> +
> >
> > This pattern is not unique to overlayfs.
> > It is probably better to define a common container type struct override_cred
> > in cred.h/cred.c that other code could also use.
> >
>
> Good idea.
>
> >>  /* util.c */
> >>  int ovl_get_write_access(struct dentry *dentry);
> >>  void ovl_put_write_access(struct dentry *dentry);
> >> diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
> >> index 3fe2dde1598f..008377b9241a 100644
> >> --- a/fs/overlayfs/params.c
> >> +++ b/fs/overlayfs/params.c
> >> @@ -770,8 +770,10 @@ void ovl_free_fs(struct ovl_fs *ofs)
> >>         kfree(ofs->config.lowerdirs);
> >>         kfree(ofs->config.upperdir);
> >>         kfree(ofs->config.workdir);
> >> -       if (ofs->creator_cred)
> >> +       if (ofs->creator_cred) {
> >> +               cred_set_immutable(ofs->creator_cred, false);
> >>                 put_cred(ofs->creator_cred);
> >
> > Not happy about this API.
> >
> > Two solutions I can think of:
> > 1. (my preference) keep two copies of creator_cred, one refcounted copy
> >     and one non-refcounted that is used for override_creds()
> > 2. put_cred_ref() which explicitly opts-in to dropping refcount on
> >     a borrowed reference, same as you do above but hidden behind
> >     a properly documented helper
> >
>
> Probably because I already have option (2) more or less understood, but
> I think that having a single creator_cred marked as
> "non-refcounted/long-lived" is simpler than having two copies, even the
> the extra copy only exists for the duration of the override.
>
> But it could be that I still can't imagine what you have in mind about
> (1).
>

(1) is actually quite similar to your first proposal of copying creator_cred
to a local stack variable.
My only concern about this proposal was that the core code was not aware
of the fact that it is working with an object that must not be freed.
And this is where non_refcount comes in to help.

With (2) prepare_creds_ref()/put_cred_ref() can be used to manage the
lifetime of a long-lived cred object from the cred_jar memory pool.

With (1) the caller is responsible to manage the lifetime of the non-refcounted
copy, for example, if the object lives on the stack.

For overlayfs this could mean, either create a non-refcounted copy on the
stack in every operator using ovl_creator_cred() helper as your first proposal
did, or keep another non-refcounted copy in ofs->override_cred, which gets
allocated/freed by overlayfs (i.e. kmalloc/kfree).

> >> +       }
> >>         kfree(ofs);
> >>  }
> >>
> >> diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
> >> index a0967bb25003..1ffb4f0f8186 100644
> >> --- a/fs/overlayfs/super.c
> >> +++ b/fs/overlayfs/super.c
> >> @@ -1304,6 +1304,13 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
> >>         if (!cred)
> >>                 goto out_err;
> >>
> >> +       /* Never override disk quota limits or use reserved space */
> >> +       cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
> >> +       /* The cred that is going to be associated with the super
> >> +        * block will not change.
> >> +        */
> >> +       cred_set_immutable(cred, true);
> >> +
> >
> > Likewise, either:
> > 1. Create a non-refcounted copy of creator_cred
>
> Ah! I think now I see what you meant. Not sure I like, I think it's a
> bit too error prone, it's hard to enforce that the copies would be kept
> in sync in general. (even if in practice the only thing that would need
> to be kept in sync is the destruction of both, at least for now).
>

I agree that it makes more sense to create the non-refcounted copy
on the stack.

As a matter of fact, maybe it makes sense to embed a non-refcounted
copy in the struct used for the guard:

struct override_cred {
       struct cred copy;
       const struct cred *cred;
};

Then override_cred_save() can create the non-refcounted copy:

void override_cred_save(struct override_cred *override)
{
       override->copy = *override;
       override->copy.non_refcount = 1;
       override->cred = override_creds(&override->copy);
       override->cred = override_creds(override->cred);
}

> > or
> > 2. Use a documented helper prepare_creds_ref() to hide
> >     this implementation detail
>
> This I like more, having the properties documented in the constructor.
> And much better than my _set_immutable().
>

Yes, the important thing is that an object cannot change
its non_refcount property during its lifetime - allowing that
would be too risky, race prone and much harder to verify
using static code checkers or compilers.

> >
> >>         err = ovl_fs_params_verify(ctx, &ofs->config);
> >>         if (err)
> >>                 goto out_err;
> >> @@ -1438,9 +1445,6 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
> >>         else if (!ofs->nofh)
> >>                 sb->s_export_op = &ovl_export_fid_operations;
> >>
> >> -       /* Never override disk quota limits or use reserved space */
> >> -       cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
> >> -
> >>         sb->s_magic = OVERLAYFS_SUPER_MAGIC;
> >>         sb->s_xattr = ovl_xattr_handlers(ofs);
> >>         sb->s_fs_info = ofs;
> >> diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
> >> index c3f020ca13a8..9ae9a35a6a7a 100644
> >> --- a/fs/overlayfs/util.c
> >> +++ b/fs/overlayfs/util.c
> >> @@ -68,6 +68,16 @@ const struct cred *ovl_override_creds(struct super_block *sb)
> >>         return override_creds(ofs->creator_cred);
> >>  }
> >>
> >> +void ovl_override_creds_new(struct ovl_cred *creator_cred)
> >> +{
> >> +       creator_cred->cred = override_creds(creator_cred->cred);
> >> +}
> >> +
> >> +void ovl_revert_creds_new(struct ovl_cred *creator_cred)
> >> +{
> >> +       revert_creds(creator_cred->cred);
> >> +}
> >
> > Would look nicer in this generic form, no?
> >
> > void override_cred_save(struct override_cred *override)
> > {
> >        override->cred = override_creds(override->cred);
> > }
> >
> > void override_cred_restore(struct override_cred *old)
> > {
> >        revert_creds(old->cred);
> > }
> >
>
> Much nicer :-)
>
> > Which reminds me that memalloc_*_{save,restore} are good
> > candidates for defining a guard.
> >
> >> +
> >>  /*
> >>   * Check if underlying fs supports file handles and try to determine encoding
> >>   * type, in order to deduce maximum inode number used by fs.
> >> diff --git a/include/linux/cred.h b/include/linux/cred.h
> >> index af8d353a4b86..06eaedfe48ea 100644
> >> --- a/include/linux/cred.h
> >> +++ b/include/linux/cred.h
> >> @@ -151,6 +151,7 @@ struct cred {
> >>                 int non_rcu;                    /* Can we skip RCU deletion? */
> >>                 struct rcu_head rcu;            /* RCU deletion hook */
> >>         };
> >> +       bool    immutable;
> >>  } __randomize_layout;
> >>
> >
> > If we choose the design that the immutable/non-refcount property
> > is a const property and we need to create a copy of struct cred
> > whenever we want to use a non-refcounted copy, then we could
> > store this in the union because RCU deletion is also not needed for
> > non-refcounted copy:
> >
> >         struct {
> >             int non_refcount:1;              /* A borrowed reference? */
> >             int non_rcu:1;                      /* Can we skip RCU deletion? */
> >         };
> >         struct rcu_head rcu;            /* RCU deletion hook */
> >     };
> >
>
> Ah! Now it makes sense. Thank you.
>
> If you, or any others of course, don't have objections against option
> (2), I think I am going to play with it a bit and see how it goes.
>

No objections here.

Thanks,
Amir.
  
Amir Goldstein Dec. 16, 2023, 11:38 a.m. UTC | #4
On Sat, Dec 16, 2023 at 12:16 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Fri, Dec 15, 2023 at 10:00 PM Vinicius Costa Gomes
> <vinicius.gomes@intel.com> wrote:
> >
> > Hi Amir,
> >
> > Amir Goldstein <amir73il@gmail.com> writes:
> >
> > > +fsdevel because this may be relevant to any subsystem that
> > > keeps a long live cred copy (e.g. nfsd, ksmbd, cachefiles).
> > >
> > > +linus who wrote
> > > d7852fbd0f04 access: avoid the RCU grace period for the temporary
> > > subjective credentials
> > >
> > >
> > > On Fri, Dec 15, 2023 at 12:02 AM Vinicius Costa Gomes
> > > <vinicius.gomes@intel.com> wrote:
> > >>
> > >> Permission checks in overlayfs also check against the credentials
> > >> associated with the superblock, which are assigned at mount() time, so
> > >> pretty long lived. So, we can omit the reference counting for this
> > >> case.
> > >
> > > You forgot to mention WHY you are proposing this and to link to the
> > > original report with the first optimization attempt:
> > >
> > > https://lore.kernel.org/linux-unionfs/20231018074553.41333-1-hu1.chen@intel.com/
> > >
> >
> > I thought that the "in-reply-to" would do that, I should have been more
> > explicit on the context. Sorry.
> >
> > >>
> > >> This (very early) proof of concept does two things:
> > >>
> > >> Add a flag "immutable" (TODO: find a better name) to struct cred to
> > >> indicate that it is long lived, and that refcount can be omitted.
> > >>
> > >
> > > This reminds me of the many discussions about Rust abstractions
> > > that are going on right now.
> > > I think an abstraction like this one is called a "borrowed reference".
> > >
> >
> > Yeah, very similar to a borrow in rust.
> >
> > >> Add "guard" helpers, so we can use automatic cleanup to be sure
> > >> override/restore are always paired. (I didn't like that I have
> > >> 'ovl_cred' to be a container for the credentials, but couldn't think
> > >> of other solutions)
> > >>
> > >
> > > I like the guard but see comments below...
> > >
> > >> Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> > >> ---
> > >> Hi Amir,
> > >>
> > >> Just to know if I am more or less on right track.
> > >>
> > >> This is a different attempt, instead of the local copy idea, I am
> > >> using the fact that the credentials associated with the mount() will
> > >> be alive for a long time. I think the result is almost the same. But I
> > >> could be missing something.
> > >>
> > >> TODO:
> > >>  - Add asserts.
> > >>  - Replace ovl_override_creds()/revert_Creds() by
> > >>    ovl_creator_cred()/guard() everywhere.
> > >>  - Probably more.
> > >>
> > >>
> > >>  fs/overlayfs/inode.c     |  7 ++++---
> > >>  fs/overlayfs/overlayfs.h | 18 ++++++++++++++++++
> > >>  fs/overlayfs/params.c    |  4 +++-
> > >>  fs/overlayfs/super.c     | 10 +++++++---
> > >>  fs/overlayfs/util.c      | 10 ++++++++++
> > >>  include/linux/cred.h     | 12 ++++++++++--
> > >>  6 files changed, 52 insertions(+), 9 deletions(-)
> > >>
> > >> diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
> > >> index c63b31a460be..2c016a3bbe2d 100644
> > >> --- a/fs/overlayfs/inode.c
> > >> +++ b/fs/overlayfs/inode.c
> > >> @@ -290,9 +290,9 @@ int ovl_permission(struct mnt_idmap *idmap,
> > >>                    struct inode *inode, int mask)
> > >>  {
> > >>         struct inode *upperinode = ovl_inode_upper(inode);
> > >> +       struct ovl_cred ovl_cred;
> > >>         struct inode *realinode;
> > >>         struct path realpath;
> > >> -       const struct cred *old_cred;
> > >
> > > Nit: please don't reorder the variable definitions.
> > >
> >
> > Sorry about that. "bad" habits from the networking side :-)
> >
>
> I fully support all forms and shapes of OCD ;-)
>
> > >>         int err;
> > >>
> > >>         /* Careful in RCU walk mode */
> > >> @@ -310,7 +310,9 @@ int ovl_permission(struct mnt_idmap *idmap,
> > >>         if (err)
> > >>                 return err;
> > >>
> > >> -       old_cred = ovl_override_creds(inode->i_sb);
> > >> +       ovl_cred = ovl_creator_cred(inode->i_sb);
> > >> +       guard(ovl_creds)(&ovl_cred);
> > >> +
> > >>         if (!upperinode &&
> > >>             !special_file(realinode->i_mode) && mask & MAY_WRITE) {
> > >>                 mask &= ~(MAY_WRITE | MAY_APPEND);
> > >> @@ -318,7 +320,6 @@ int ovl_permission(struct mnt_idmap *idmap,
> > >>                 mask |= MAY_READ;
> > >>         }
> > >>         err = inode_permission(mnt_idmap(realpath.mnt), realinode, mask);
> > >> -       revert_creds(old_cred);
> > >>
> > >>         return err;
> > >>  }
> > >> diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
> > >> index 05c3dd597fa8..22ea3066376e 100644
> > >> --- a/fs/overlayfs/overlayfs.h
> > >> +++ b/fs/overlayfs/overlayfs.h
> > >> @@ -416,6 +416,24 @@ static inline int ovl_do_getattr(const struct path *path, struct kstat *stat,
> > >>         return vfs_getattr(path, stat, request_mask, flags);
> > >>  }
> > >>
> > >> +struct ovl_cred {
> > >> +       const struct cred *cred;
> > >> +};
> > >> +
> > >> +static inline struct ovl_cred ovl_creator_cred(struct super_block *sb)
> > >> +{
> > >> +       struct ovl_fs *ofs = OVL_FS(sb);
> > >> +
> > >> +       return (struct ovl_cred) { .cred = ofs->creator_cred };
> > >> +}
> > >> +
> > >> +void ovl_override_creds_new(struct ovl_cred *creator_cred);
> > >> +void ovl_revert_creds_new(struct ovl_cred *creator_cred);
> > >> +
> > >> +DEFINE_GUARD(ovl_creds, struct ovl_cred *,
> > >> +            ovl_override_creds_new(_T),
> > >> +            ovl_revert_creds_new(_T));
> > >> +
> > >
> > > This pattern is not unique to overlayfs.
> > > It is probably better to define a common container type struct override_cred
> > > in cred.h/cred.c that other code could also use.
> > >
> >
> > Good idea.
> >
> > >>  /* util.c */
> > >>  int ovl_get_write_access(struct dentry *dentry);
> > >>  void ovl_put_write_access(struct dentry *dentry);
> > >> diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
> > >> index 3fe2dde1598f..008377b9241a 100644
> > >> --- a/fs/overlayfs/params.c
> > >> +++ b/fs/overlayfs/params.c
> > >> @@ -770,8 +770,10 @@ void ovl_free_fs(struct ovl_fs *ofs)
> > >>         kfree(ofs->config.lowerdirs);
> > >>         kfree(ofs->config.upperdir);
> > >>         kfree(ofs->config.workdir);
> > >> -       if (ofs->creator_cred)
> > >> +       if (ofs->creator_cred) {
> > >> +               cred_set_immutable(ofs->creator_cred, false);
> > >>                 put_cred(ofs->creator_cred);
> > >
> > > Not happy about this API.
> > >
> > > Two solutions I can think of:
> > > 1. (my preference) keep two copies of creator_cred, one refcounted copy
> > >     and one non-refcounted that is used for override_creds()
> > > 2. put_cred_ref() which explicitly opts-in to dropping refcount on
> > >     a borrowed reference, same as you do above but hidden behind
> > >     a properly documented helper
> > >
> >
> > Probably because I already have option (2) more or less understood, but
> > I think that having a single creator_cred marked as
> > "non-refcounted/long-lived" is simpler than having two copies, even the
> > the extra copy only exists for the duration of the override.
> >
> > But it could be that I still can't imagine what you have in mind about
> > (1).
> >
>
> (1) is actually quite similar to your first proposal of copying creator_cred
> to a local stack variable.
> My only concern about this proposal was that the core code was not aware
> of the fact that it is working with an object that must not be freed.
> And this is where non_refcount comes in to help.
>
> With (2) prepare_creds_ref()/put_cred_ref() can be used to manage the
> lifetime of a long-lived cred object from the cred_jar memory pool.
>
> With (1) the caller is responsible to manage the lifetime of the non-refcounted
> copy, for example, if the object lives on the stack.
>
> For overlayfs this could mean, either create a non-refcounted copy on the
> stack in every operator using ovl_creator_cred() helper as your first proposal
> did, or keep another non-refcounted copy in ofs->override_cred, which gets
> allocated/freed by overlayfs (i.e. kmalloc/kfree).
>
> > >> +       }
> > >>         kfree(ofs);
> > >>  }
> > >>
> > >> diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
> > >> index a0967bb25003..1ffb4f0f8186 100644
> > >> --- a/fs/overlayfs/super.c
> > >> +++ b/fs/overlayfs/super.c
> > >> @@ -1304,6 +1304,13 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
> > >>         if (!cred)
> > >>                 goto out_err;
> > >>
> > >> +       /* Never override disk quota limits or use reserved space */
> > >> +       cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
> > >> +       /* The cred that is going to be associated with the super
> > >> +        * block will not change.
> > >> +        */
> > >> +       cred_set_immutable(cred, true);
> > >> +
> > >
> > > Likewise, either:
> > > 1. Create a non-refcounted copy of creator_cred
> >
> > Ah! I think now I see what you meant. Not sure I like, I think it's a
> > bit too error prone, it's hard to enforce that the copies would be kept
> > in sync in general. (even if in practice the only thing that would need
> > to be kept in sync is the destruction of both, at least for now).
> >
>
> I agree that it makes more sense to create the non-refcounted copy
> on the stack.
>
> As a matter of fact, maybe it makes sense to embed a non-refcounted
> copy in the struct used for the guard:
>
> struct override_cred {
>        struct cred copy;
>        const struct cred *cred;
> };
>
> Then override_cred_save() can create the non-refcounted copy:
>
> void override_cred_save(struct override_cred *override)
> {
>        override->copy = *override;
>        override->copy.non_refcount = 1;
>        override->cred = override_creds(&override->copy);
>        override->cred = override_creds(override->cred);

This second line is leftover of course...

> }
>
> > > or
> > > 2. Use a documented helper prepare_creds_ref() to hide
> > >     this implementation detail
> >
> > This I like more, having the properties documented in the constructor.
> > And much better than my _set_immutable().
> >
>
> Yes, the important thing is that an object cannot change
> its non_refcount property during its lifetime -

... which means that put_creds_ref() should assert that
there is only a single refcount - the one handed out by
prepare_creds_ref() before removing non_refcount or
directly freeing the cred object.

I must say that the semantics of making a non-refcounted copy
to an object whose lifetime is managed by the caller sounds a lot
less confusing to me.

Thanks,
Amir.
  
Linus Torvalds Dec. 16, 2023, 6:26 p.m. UTC | #5
On Sat, 16 Dec 2023 at 02:16, Amir Goldstein <amir73il@gmail.com> wrote:
>
> As a matter of fact, maybe it makes sense to embed a non-refcounted
> copy in the struct used for the guard:

No, please don't. A couple of reasons:

 - that 'struct cred' is not an insignificant size, so stack usage is noticeable

 - we really should strive to avoid passing pointers to random stack
elements around

Don't get me wrong - we pass structures around on the stack all the
time, but it _has_ been a problem with stack usage. Those things tend
to grow..

So in general, the primary use of "pointers to stack objects" is for
when it's either trivially tiny, or when it's a struct that is
explicitly designed for that purpose as a kind of an "extended set of
arguments" (think things like the "tlb_state" for the TLB flushing, or
the various iterator structures we use etc).

When we have a real mainline kernel struct like 'struct cred' that
commonly gets passed around as a pointer argument that *isn't* on the
stack, I get nervous when people then pass it around on the stack too.
It's just too easy to mistakenly pass it off with the wrong lifetime,
and stack corruption is *so* nasty to debug that it's just horrendous.

Yes, lifetime problems are nasty to debug even when it's not some
mis-use of a stack object, but at least for slab allocations etc we
have various generic debug tools that help find them.

For the "you accessed things under the stack, possibly from the wrong
thread", I don't think any of our normal debug coverage will help at
all.

So yes, stack allocations are efficient and fast, and we do use them,
but please don't use them for something like 'struct cred' that has a
proper allocator function normally.

I just removed the CONFIG_DEBUG_CREDENTIALS code, because the fix for
a potential overflow made it have bad padding, and rather than fix the
padding I thought it was better to just remove the long-unused debug
code that just made that thing even more unwieldly than it is.

But I thought that largely because our 'struct cred' use has been
quite stable for a long time (and the original impetus for all that
debug code was the long-ago switch to using the copy-on-write
behavior).

Let's not break that stability with suddenly having a "sometimes it's
allocated on the stack" model.

             Linus
  
Christian Brauner Dec. 18, 2023, 3:17 p.m. UTC | #6
> I just removed the CONFIG_DEBUG_CREDENTIALS code, because the fix for

I noticed this yesterday. Thanks for getting rid of this stuff. I never
understood who was supposed to use this for what.
  
Christian Brauner Dec. 18, 2023, 4:30 p.m. UTC | #7
> > Yes, the important thing is that an object cannot change
> > its non_refcount property during its lifetime -
> 
> ... which means that put_creds_ref() should assert that
> there is only a single refcount - the one handed out by
> prepare_creds_ref() before removing non_refcount or
> directly freeing the cred object.
> 
> I must say that the semantics of making a non-refcounted copy
> to an object whose lifetime is managed by the caller sounds a lot
> less confusing to me.

So can't we do an override_creds() variant that is effectively just:

/* caller guarantees lifetime of @new */
const struct cred *foo_override_cred(const struct cred *new)
{
	const struct cred *old = current->cred;
	rcu_assign_pointer(current->cred, new);
	return old;
}

/* caller guarantees lifetime of @old */
void foo_revert_creds(const struct cred *old)
{
	const struct cred *override = current->cred;
	rcu_assign_pointer(current->cred, old);
}

Maybe I really fail to understand this problem or the proposed solution:
the single reference that overlayfs keeps in ovl->creator_cred is tied
to the lifetime of the overlayfs superblock, no? And anyone who needs a
long term cred reference e.g, file->f_cred will take it's own reference
anyway. So it should be safe to just keep that reference alive until
overlayfs is unmounted, no? I'm sure it's something quite obvious why
that doesn't work but I'm just not seeing it currently.
  
Vinicius Costa Gomes Dec. 18, 2023, 9:57 p.m. UTC | #8
Christian Brauner <brauner@kernel.org> writes:

>> > Yes, the important thing is that an object cannot change
>> > its non_refcount property during its lifetime -
>> 
>> ... which means that put_creds_ref() should assert that
>> there is only a single refcount - the one handed out by
>> prepare_creds_ref() before removing non_refcount or
>> directly freeing the cred object.
>> 
>> I must say that the semantics of making a non-refcounted copy
>> to an object whose lifetime is managed by the caller sounds a lot
>> less confusing to me.
>
> So can't we do an override_creds() variant that is effectively just:
>
> /* caller guarantees lifetime of @new */
> const struct cred *foo_override_cred(const struct cred *new)
> {
> 	const struct cred *old = current->cred;
> 	rcu_assign_pointer(current->cred, new);
> 	return old;
> }
>
> /* caller guarantees lifetime of @old */
> void foo_revert_creds(const struct cred *old)
> {
> 	const struct cred *override = current->cred;
> 	rcu_assign_pointer(current->cred, old);
> }
>
> Maybe I really fail to understand this problem or the proposed solution:
> the single reference that overlayfs keeps in ovl->creator_cred is tied
> to the lifetime of the overlayfs superblock, no? And anyone who needs a
> long term cred reference e.g, file->f_cred will take it's own reference
> anyway. So it should be safe to just keep that reference alive until
> overlayfs is unmounted, no? I'm sure it's something quite obvious why
> that doesn't work but I'm just not seeing it currently.

My read of the code says that what you are proposing should work. (what
I am seeing is that in the "optimized" cases, the only practical effect
of override/revert is the rcu_assign_pointer() dance)

I guess that the question becomes: Do we want this property (that the
'cred' associated with a subperblock/similar is long lived and the
"inner" refcount can be omitted) to be encoded in the constructor? Or do
we want it to be "encoded" in a call by call basis?

I can see both working.


Cheers,
  
Amir Goldstein Dec. 19, 2023, 7:15 a.m. UTC | #9
On Mon, Dec 18, 2023 at 11:57 PM Vinicius Costa Gomes
<vinicius.gomes@intel.com> wrote:
>
> Christian Brauner <brauner@kernel.org> writes:
>
> >> > Yes, the important thing is that an object cannot change
> >> > its non_refcount property during its lifetime -
> >>
> >> ... which means that put_creds_ref() should assert that
> >> there is only a single refcount - the one handed out by
> >> prepare_creds_ref() before removing non_refcount or
> >> directly freeing the cred object.
> >>
> >> I must say that the semantics of making a non-refcounted copy
> >> to an object whose lifetime is managed by the caller sounds a lot
> >> less confusing to me.
> >
> > So can't we do an override_creds() variant that is effectively just:

Yes, I think that we can....

> >
> > /* caller guarantees lifetime of @new */
> > const struct cred *foo_override_cred(const struct cred *new)
> > {
> >       const struct cred *old = current->cred;
> >       rcu_assign_pointer(current->cred, new);
> >       return old;
> > }
> >
> > /* caller guarantees lifetime of @old */
> > void foo_revert_creds(const struct cred *old)
> > {
> >       const struct cred *override = current->cred;
> >       rcu_assign_pointer(current->cred, old);
> > }
> >

Even better(?), we can do this in the actual guard helpers to
discourage use without a guard:

struct override_cred {
        struct cred *cred;
};

DEFINE_GUARD(override_cred, struct override_cred *,
            override_cred_save(_T),
            override_cred_restore(_T));

...

void override_cred_save(struct override_cred *new)
{
        new->cred = rcu_replace_pointer(current->cred, new->cred, true);
}

void override_cred_restore(struct override_cred *old)
{
        rcu_assign_pointer(current->cred, old->cred);
}

> > Maybe I really fail to understand this problem or the proposed solution:
> > the single reference that overlayfs keeps in ovl->creator_cred is tied
> > to the lifetime of the overlayfs superblock, no? And anyone who needs a
> > long term cred reference e.g, file->f_cred will take it's own reference
> > anyway. So it should be safe to just keep that reference alive until
> > overlayfs is unmounted, no? I'm sure it's something quite obvious why
> > that doesn't work but I'm just not seeing it currently.
>
> My read of the code says that what you are proposing should work. (what
> I am seeing is that in the "optimized" cases, the only practical effect
> of override/revert is the rcu_assign_pointer() dance)
>
> I guess that the question becomes: Do we want this property (that the
> 'cred' associated with a subperblock/similar is long lived and the
> "inner" refcount can be omitted) to be encoded in the constructor? Or do
> we want it to be "encoded" in a call by call basis?
>

Neither.

Christian's proposal does not involve marking the cred object as
long lived, which looks a much better idea to me.

The performance issues you observed are (probably) due to get/put
of cred refcount in the helpers {override,revert}_creds().

Christian suggested lightweight variants of {override,revert}_creds()
that do not change refcount. Combining those with a guard and
I don't see what can go wrong (TM).

If you try this out and post a patch, please be sure to include the
motivation for the patch along with performance numbers in the
commit message, even if only posting an RFC patch.

Thanks,
Amir.
  
Christian Brauner Dec. 19, 2023, 1:35 p.m. UTC | #10
On Tue, Dec 19, 2023 at 09:15:52AM +0200, Amir Goldstein wrote:
> On Mon, Dec 18, 2023 at 11:57 PM Vinicius Costa Gomes
> <vinicius.gomes@intel.com> wrote:
> >
> > Christian Brauner <brauner@kernel.org> writes:
> >
> > >> > Yes, the important thing is that an object cannot change
> > >> > its non_refcount property during its lifetime -
> > >>
> > >> ... which means that put_creds_ref() should assert that
> > >> there is only a single refcount - the one handed out by
> > >> prepare_creds_ref() before removing non_refcount or
> > >> directly freeing the cred object.
> > >>
> > >> I must say that the semantics of making a non-refcounted copy
> > >> to an object whose lifetime is managed by the caller sounds a lot
> > >> less confusing to me.
> > >
> > > So can't we do an override_creds() variant that is effectively just:
> 
> Yes, I think that we can....
> 
> > >
> > > /* caller guarantees lifetime of @new */
> > > const struct cred *foo_override_cred(const struct cred *new)
> > > {
> > >       const struct cred *old = current->cred;
> > >       rcu_assign_pointer(current->cred, new);
> > >       return old;
> > > }
> > >
> > > /* caller guarantees lifetime of @old */
> > > void foo_revert_creds(const struct cred *old)
> > > {
> > >       const struct cred *override = current->cred;
> > >       rcu_assign_pointer(current->cred, old);
> > > }
> > >
> 
> Even better(?), we can do this in the actual guard helpers to
> discourage use without a guard:
> 
> struct override_cred {
>         struct cred *cred;
> };
> 
> DEFINE_GUARD(override_cred, struct override_cred *,
>             override_cred_save(_T),
>             override_cred_restore(_T));
> 
> ...
> 
> void override_cred_save(struct override_cred *new)
> {
>         new->cred = rcu_replace_pointer(current->cred, new->cred, true);
> }
> 
> void override_cred_restore(struct override_cred *old)
> {
>         rcu_assign_pointer(current->cred, old->cred);
> }

The main thing we want is that it's somewhat clear that it's special
purpose interface (Sometimes I jokingly feel we should have
include/linux/quirky_overlayfs_helpers.h or actually working module
specific exports so we can export a helper to only a single module.
Whatever happened to that?).

If you do the cred guard thing then maybe name it:

{override,revert}_cred_light()

and then use them to implement the replace portion for:

{override,revert}_cred().

Yes, the {override,revert}_cred() naming isn't optimal but unless we
rename them as well to *_{save,restore} I don't see the point in making
the new helpers deviate from that pattern. They basically do the same
thing.

So my point is to just let them mirror the naming in __fget_light().
To a regular VFS developer the *_light() will give away that it probably
doesn't take a reference.

But I'm not married to that.

So I'd probably just do something like the following COMPLETELY UNTESTED
AND UNCOMPILED thing:

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 2976f534a7a3..c975eb47e691 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -165,6 +165,24 @@ extern int cred_fscmp(const struct cred *, const struct cred *);
 extern void __init cred_init(void);
 extern int set_cred_ucounts(struct cred *);

+/*
+ * Override creds without bumping reference count. Caller must ensure
+ * reference remains valid or has taken reference. Almost always not the
+ * interface you want. Use override_creds()/revert_creds() instead.
+ */
+#define override_creds_light(override_cred)                       \
+       ({                                                        \
+               const struct cred *__old_cred = current->cred;    \
+               rcu_assign_pointer(current->cred, override_cred); \
+               __old_cred;                                       \
+       })
+
+#define revert_creds_light(revert_cred) \
+       rcu_assign_pointer(current->cred, revert_cred);
+
+DEFINE_GUARD(cred, struct cred *, override_creds_light(_T),
+            revert_creds_light(_T));
+
 static inline bool cap_ambient_invariant_ok(const struct cred *cred)
 {
        return cap_issubset(cred->cap_ambient,
diff --git a/kernel/cred.c b/kernel/cred.c
index c033a201c808..d6713edaee37 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -485,7 +485,7 @@ EXPORT_SYMBOL(abort_creds);
  */
 const struct cred *override_creds(const struct cred *new)
 {
-       const struct cred *old = current->cred;
+       const struct cred *old;

        kdebug("override_creds(%p{%ld})", new,
               atomic_long_read(&new->usage));
@@ -499,8 +499,7 @@ const struct cred *override_creds(const struct cred *new)
         * visible to other threads under RCU.
         */
        get_new_cred((struct cred *)new);
-       rcu_assign_pointer(current->cred, new);
-
+       old = override_creds_light(new);
        kdebug("override_creds() = %p{%ld}", old,
               atomic_long_read(&old->usage));
        return old;
@@ -521,7 +520,7 @@ void revert_creds(const struct cred *old)
        kdebug("revert_creds(%p{%ld})", old,
               atomic_long_read(&old->usage));

-       rcu_assign_pointer(current->cred, old);
+       revert_creds_light(old);
        put_cred(override);
 }
 EXPORT_SYMBOL(revert_creds);

> 
> > > Maybe I really fail to understand this problem or the proposed solution:
> > > the single reference that overlayfs keeps in ovl->creator_cred is tied
> > > to the lifetime of the overlayfs superblock, no? And anyone who needs a
> > > long term cred reference e.g, file->f_cred will take it's own reference
> > > anyway. So it should be safe to just keep that reference alive until
> > > overlayfs is unmounted, no? I'm sure it's something quite obvious why
> > > that doesn't work but I'm just not seeing it currently.
> >
> > My read of the code says that what you are proposing should work. (what
> > I am seeing is that in the "optimized" cases, the only practical effect
> > of override/revert is the rcu_assign_pointer() dance)
> >
> > I guess that the question becomes: Do we want this property (that the
> > 'cred' associated with a subperblock/similar is long lived and the
> > "inner" refcount can be omitted) to be encoded in the constructor? Or do
> > we want it to be "encoded" in a call by call basis?
> >
> 
> Neither.
> 
> Christian's proposal does not involve marking the cred object as
> long lived, which looks a much better idea to me.
> 
> The performance issues you observed are (probably) due to get/put
> of cred refcount in the helpers {override,revert}_creds().

Most likely they are. I don't see what else would be expensive. But I
may lack details.

> 
> Christian suggested lightweight variants of {override,revert}_creds()
> that do not change refcount. Combining those with a guard and
> I don't see what can go wrong (TM).

Place a nice comment explaining lifetime expectations in the commit
message. Then someone can always tell us why we're wrong.
  
Vinicius Costa Gomes Dec. 19, 2023, 2:33 p.m. UTC | #11
Amir Goldstein <amir73il@gmail.com> writes:

> On Mon, Dec 18, 2023 at 11:57 PM Vinicius Costa Gomes
> <vinicius.gomes@intel.com> wrote:
>>
>> Christian Brauner <brauner@kernel.org> writes:
>>
>> >> > Yes, the important thing is that an object cannot change
>> >> > its non_refcount property during its lifetime -
>> >>
>> >> ... which means that put_creds_ref() should assert that
>> >> there is only a single refcount - the one handed out by
>> >> prepare_creds_ref() before removing non_refcount or
>> >> directly freeing the cred object.
>> >>
>> >> I must say that the semantics of making a non-refcounted copy
>> >> to an object whose lifetime is managed by the caller sounds a lot
>> >> less confusing to me.
>> >
>> > So can't we do an override_creds() variant that is effectively just:
>
> Yes, I think that we can....
>
>> >
>> > /* caller guarantees lifetime of @new */
>> > const struct cred *foo_override_cred(const struct cred *new)
>> > {
>> >       const struct cred *old = current->cred;
>> >       rcu_assign_pointer(current->cred, new);
>> >       return old;
>> > }
>> >
>> > /* caller guarantees lifetime of @old */
>> > void foo_revert_creds(const struct cred *old)
>> > {
>> >       const struct cred *override = current->cred;
>> >       rcu_assign_pointer(current->cred, old);
>> > }
>> >
>
> Even better(?), we can do this in the actual guard helpers to
> discourage use without a guard:
>
> struct override_cred {
>         struct cred *cred;
> };
>
> DEFINE_GUARD(override_cred, struct override_cred *,
>             override_cred_save(_T),
>             override_cred_restore(_T));
>
> ...
>
> void override_cred_save(struct override_cred *new)
> {
>         new->cred = rcu_replace_pointer(current->cred, new->cred, true);
> }
>
> void override_cred_restore(struct override_cred *old)
> {
>         rcu_assign_pointer(current->cred, old->cred);
> }
>
>> > Maybe I really fail to understand this problem or the proposed solution:
>> > the single reference that overlayfs keeps in ovl->creator_cred is tied
>> > to the lifetime of the overlayfs superblock, no? And anyone who needs a
>> > long term cred reference e.g, file->f_cred will take it's own reference
>> > anyway. So it should be safe to just keep that reference alive until
>> > overlayfs is unmounted, no? I'm sure it's something quite obvious why
>> > that doesn't work but I'm just not seeing it currently.
>>
>> My read of the code says that what you are proposing should work. (what
>> I am seeing is that in the "optimized" cases, the only practical effect
>> of override/revert is the rcu_assign_pointer() dance)
>>
>> I guess that the question becomes: Do we want this property (that the
>> 'cred' associated with a subperblock/similar is long lived and the
>> "inner" refcount can be omitted) to be encoded in the constructor? Or do
>> we want it to be "encoded" in a call by call basis?
>>
>
> Neither.
>
> Christian's proposal does not involve marking the cred object as
> long lived, which looks a much better idea to me.
>

In my mind, I am reading his suggestion as the flag "long lived
cred/lives long enough" is "in our brains" vs. what I proposed that the
flag was "in the object". The effect of the "flag" is the same: when to
use a lighter version (no refcount) of override/revert.

What I was thinking was more more under the covers, implicit. And I can
see the advantages of having them more explicit.

> The performance issues you observed are (probably) due to get/put
> of cred refcount in the helpers {override,revert}_creds().
>

Yes, they are. Sorry that it was lost in the context. The original
report is here:

https://lore.kernel.org/all/20231018074553.41333-1-hu1.chen@intel.com/

> Christian suggested lightweight variants of {override,revert}_creds()
> that do not change refcount. Combining those with a guard and
> I don't see what can go wrong (TM).
>
> If you try this out and post a patch, please be sure to include the
> motivation for the patch along with performance numbers in the
> commit message, even if only posting an RFC patch.
>

Of course.

And to be sure, I will go with Christian's suggestion, it looks neat,
and having a lighter version of references is a more common idiom.

Thank you all.


Cheers,
  
Christian Brauner Jan. 23, 2024, 3:39 p.m. UTC | #12
On Tue, Dec 19, 2023 at 06:33:59AM -0800, Vinicius Costa Gomes wrote:
> Amir Goldstein <amir73il@gmail.com> writes:
> 
> > On Mon, Dec 18, 2023 at 11:57 PM Vinicius Costa Gomes
> > <vinicius.gomes@intel.com> wrote:
> >>
> >> Christian Brauner <brauner@kernel.org> writes:
> >>
> >> >> > Yes, the important thing is that an object cannot change
> >> >> > its non_refcount property during its lifetime -
> >> >>
> >> >> ... which means that put_creds_ref() should assert that
> >> >> there is only a single refcount - the one handed out by
> >> >> prepare_creds_ref() before removing non_refcount or
> >> >> directly freeing the cred object.
> >> >>
> >> >> I must say that the semantics of making a non-refcounted copy
> >> >> to an object whose lifetime is managed by the caller sounds a lot
> >> >> less confusing to me.
> >> >
> >> > So can't we do an override_creds() variant that is effectively just:
> >
> > Yes, I think that we can....
> >
> >> >
> >> > /* caller guarantees lifetime of @new */
> >> > const struct cred *foo_override_cred(const struct cred *new)
> >> > {
> >> >       const struct cred *old = current->cred;
> >> >       rcu_assign_pointer(current->cred, new);
> >> >       return old;
> >> > }
> >> >
> >> > /* caller guarantees lifetime of @old */
> >> > void foo_revert_creds(const struct cred *old)
> >> > {
> >> >       const struct cred *override = current->cred;
> >> >       rcu_assign_pointer(current->cred, old);
> >> > }
> >> >
> >
> > Even better(?), we can do this in the actual guard helpers to
> > discourage use without a guard:
> >
> > struct override_cred {
> >         struct cred *cred;
> > };
> >
> > DEFINE_GUARD(override_cred, struct override_cred *,
> >             override_cred_save(_T),
> >             override_cred_restore(_T));
> >
> > ...
> >
> > void override_cred_save(struct override_cred *new)
> > {
> >         new->cred = rcu_replace_pointer(current->cred, new->cred, true);
> > }
> >
> > void override_cred_restore(struct override_cred *old)
> > {
> >         rcu_assign_pointer(current->cred, old->cred);
> > }
> >
> >> > Maybe I really fail to understand this problem or the proposed solution:
> >> > the single reference that overlayfs keeps in ovl->creator_cred is tied
> >> > to the lifetime of the overlayfs superblock, no? And anyone who needs a
> >> > long term cred reference e.g, file->f_cred will take it's own reference
> >> > anyway. So it should be safe to just keep that reference alive until
> >> > overlayfs is unmounted, no? I'm sure it's something quite obvious why
> >> > that doesn't work but I'm just not seeing it currently.
> >>
> >> My read of the code says that what you are proposing should work. (what
> >> I am seeing is that in the "optimized" cases, the only practical effect
> >> of override/revert is the rcu_assign_pointer() dance)
> >>
> >> I guess that the question becomes: Do we want this property (that the
> >> 'cred' associated with a subperblock/similar is long lived and the
> >> "inner" refcount can be omitted) to be encoded in the constructor? Or do
> >> we want it to be "encoded" in a call by call basis?
> >>
> >
> > Neither.
> >
> > Christian's proposal does not involve marking the cred object as
> > long lived, which looks a much better idea to me.
> >
> 
> In my mind, I am reading his suggestion as the flag "long lived
> cred/lives long enough" is "in our brains" vs. what I proposed that the
> flag was "in the object". The effect of the "flag" is the same: when to
> use a lighter version (no refcount) of override/revert.
> 
> What I was thinking was more more under the covers, implicit. And I can
> see the advantages of having them more explicit.
> 
> > The performance issues you observed are (probably) due to get/put
> > of cred refcount in the helpers {override,revert}_creds().
> >
> 
> Yes, they are. Sorry that it was lost in the context. The original
> report is here:
> 
> https://lore.kernel.org/all/20231018074553.41333-1-hu1.chen@intel.com/
> 
> > Christian suggested lightweight variants of {override,revert}_creds()
> > that do not change refcount. Combining those with a guard and
> > I don't see what can go wrong (TM).
> >
> > If you try this out and post a patch, please be sure to include the
> > motivation for the patch along with performance numbers in the
> > commit message, even if only posting an RFC patch.
> >
> 
> Of course.
> 
> And to be sure, I will go with Christian's suggestion, it looks neat,
> and having a lighter version of references is a more common idiom.

Did this ever go anywhere?
  
Vinicius Costa Gomes Jan. 23, 2024, 4:37 p.m. UTC | #13
Christian Brauner <brauner@kernel.org> writes:

> On Tue, Dec 19, 2023 at 06:33:59AM -0800, Vinicius Costa Gomes wrote:
>> Amir Goldstein <amir73il@gmail.com> writes:
>> 
>> > On Mon, Dec 18, 2023 at 11:57 PM Vinicius Costa Gomes
>> > <vinicius.gomes@intel.com> wrote:
>> >>
>> >> Christian Brauner <brauner@kernel.org> writes:
>> >>
>> >> >> > Yes, the important thing is that an object cannot change
>> >> >> > its non_refcount property during its lifetime -
>> >> >>
>> >> >> ... which means that put_creds_ref() should assert that
>> >> >> there is only a single refcount - the one handed out by
>> >> >> prepare_creds_ref() before removing non_refcount or
>> >> >> directly freeing the cred object.
>> >> >>
>> >> >> I must say that the semantics of making a non-refcounted copy
>> >> >> to an object whose lifetime is managed by the caller sounds a lot
>> >> >> less confusing to me.
>> >> >
>> >> > So can't we do an override_creds() variant that is effectively just:
>> >
>> > Yes, I think that we can....
>> >
>> >> >
>> >> > /* caller guarantees lifetime of @new */
>> >> > const struct cred *foo_override_cred(const struct cred *new)
>> >> > {
>> >> >       const struct cred *old = current->cred;
>> >> >       rcu_assign_pointer(current->cred, new);
>> >> >       return old;
>> >> > }
>> >> >
>> >> > /* caller guarantees lifetime of @old */
>> >> > void foo_revert_creds(const struct cred *old)
>> >> > {
>> >> >       const struct cred *override = current->cred;
>> >> >       rcu_assign_pointer(current->cred, old);
>> >> > }
>> >> >
>> >
>> > Even better(?), we can do this in the actual guard helpers to
>> > discourage use without a guard:
>> >
>> > struct override_cred {
>> >         struct cred *cred;
>> > };
>> >
>> > DEFINE_GUARD(override_cred, struct override_cred *,
>> >             override_cred_save(_T),
>> >             override_cred_restore(_T));
>> >
>> > ...
>> >
>> > void override_cred_save(struct override_cred *new)
>> > {
>> >         new->cred = rcu_replace_pointer(current->cred, new->cred, true);
>> > }
>> >
>> > void override_cred_restore(struct override_cred *old)
>> > {
>> >         rcu_assign_pointer(current->cred, old->cred);
>> > }
>> >
>> >> > Maybe I really fail to understand this problem or the proposed solution:
>> >> > the single reference that overlayfs keeps in ovl->creator_cred is tied
>> >> > to the lifetime of the overlayfs superblock, no? And anyone who needs a
>> >> > long term cred reference e.g, file->f_cred will take it's own reference
>> >> > anyway. So it should be safe to just keep that reference alive until
>> >> > overlayfs is unmounted, no? I'm sure it's something quite obvious why
>> >> > that doesn't work but I'm just not seeing it currently.
>> >>
>> >> My read of the code says that what you are proposing should work. (what
>> >> I am seeing is that in the "optimized" cases, the only practical effect
>> >> of override/revert is the rcu_assign_pointer() dance)
>> >>
>> >> I guess that the question becomes: Do we want this property (that the
>> >> 'cred' associated with a subperblock/similar is long lived and the
>> >> "inner" refcount can be omitted) to be encoded in the constructor? Or do
>> >> we want it to be "encoded" in a call by call basis?
>> >>
>> >
>> > Neither.
>> >
>> > Christian's proposal does not involve marking the cred object as
>> > long lived, which looks a much better idea to me.
>> >
>> 
>> In my mind, I am reading his suggestion as the flag "long lived
>> cred/lives long enough" is "in our brains" vs. what I proposed that the
>> flag was "in the object". The effect of the "flag" is the same: when to
>> use a lighter version (no refcount) of override/revert.
>> 
>> What I was thinking was more more under the covers, implicit. And I can
>> see the advantages of having them more explicit.
>> 
>> > The performance issues you observed are (probably) due to get/put
>> > of cred refcount in the helpers {override,revert}_creds().
>> >
>> 
>> Yes, they are. Sorry that it was lost in the context. The original
>> report is here:
>> 
>> https://lore.kernel.org/all/20231018074553.41333-1-hu1.chen@intel.com/
>> 
>> > Christian suggested lightweight variants of {override,revert}_creds()
>> > that do not change refcount. Combining those with a guard and
>> > I don't see what can go wrong (TM).
>> >
>> > If you try this out and post a patch, please be sure to include the
>> > motivation for the patch along with performance numbers in the
>> > commit message, even if only posting an RFC patch.
>> >
>> 
>> Of course.
>> 
>> And to be sure, I will go with Christian's suggestion, it looks neat,
>> and having a lighter version of references is a more common idiom.
>
> Did this ever go anywhere?

Oh, yes! Had to do a few tweaks to what you suggested, but it's working
fine.

Just collecting some fresh numbers for the cover letter. Will propose
the v2 of the RFC soon.


Cheers,
  

Patch

diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index c63b31a460be..2c016a3bbe2d 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -290,9 +290,9 @@  int ovl_permission(struct mnt_idmap *idmap,
 		   struct inode *inode, int mask)
 {
 	struct inode *upperinode = ovl_inode_upper(inode);
+	struct ovl_cred ovl_cred;
 	struct inode *realinode;
 	struct path realpath;
-	const struct cred *old_cred;
 	int err;
 
 	/* Careful in RCU walk mode */
@@ -310,7 +310,9 @@  int ovl_permission(struct mnt_idmap *idmap,
 	if (err)
 		return err;
 
-	old_cred = ovl_override_creds(inode->i_sb);
+	ovl_cred = ovl_creator_cred(inode->i_sb);
+	guard(ovl_creds)(&ovl_cred);
+
 	if (!upperinode &&
 	    !special_file(realinode->i_mode) && mask & MAY_WRITE) {
 		mask &= ~(MAY_WRITE | MAY_APPEND);
@@ -318,7 +320,6 @@  int ovl_permission(struct mnt_idmap *idmap,
 		mask |= MAY_READ;
 	}
 	err = inode_permission(mnt_idmap(realpath.mnt), realinode, mask);
-	revert_creds(old_cred);
 
 	return err;
 }
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 05c3dd597fa8..22ea3066376e 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -416,6 +416,24 @@  static inline int ovl_do_getattr(const struct path *path, struct kstat *stat,
 	return vfs_getattr(path, stat, request_mask, flags);
 }
 
+struct ovl_cred {
+	const struct cred *cred;
+};
+
+static inline struct ovl_cred ovl_creator_cred(struct super_block *sb)
+{
+	struct ovl_fs *ofs = OVL_FS(sb);
+
+	return (struct ovl_cred) { .cred = ofs->creator_cred };
+}
+
+void ovl_override_creds_new(struct ovl_cred *creator_cred);
+void ovl_revert_creds_new(struct ovl_cred *creator_cred);
+
+DEFINE_GUARD(ovl_creds, struct ovl_cred *,
+	     ovl_override_creds_new(_T),
+	     ovl_revert_creds_new(_T));
+
 /* util.c */
 int ovl_get_write_access(struct dentry *dentry);
 void ovl_put_write_access(struct dentry *dentry);
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index 3fe2dde1598f..008377b9241a 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -770,8 +770,10 @@  void ovl_free_fs(struct ovl_fs *ofs)
 	kfree(ofs->config.lowerdirs);
 	kfree(ofs->config.upperdir);
 	kfree(ofs->config.workdir);
-	if (ofs->creator_cred)
+	if (ofs->creator_cred) {
+		cred_set_immutable(ofs->creator_cred, false);
 		put_cred(ofs->creator_cred);
+	}
 	kfree(ofs);
 }
 
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index a0967bb25003..1ffb4f0f8186 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1304,6 +1304,13 @@  int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
 	if (!cred)
 		goto out_err;
 
+	/* Never override disk quota limits or use reserved space */
+	cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
+	/* The cred that is going to be associated with the super
+	 * block will not change.
+	 */
+	cred_set_immutable(cred, true);
+
 	err = ovl_fs_params_verify(ctx, &ofs->config);
 	if (err)
 		goto out_err;
@@ -1438,9 +1445,6 @@  int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
 	else if (!ofs->nofh)
 		sb->s_export_op = &ovl_export_fid_operations;
 
-	/* Never override disk quota limits or use reserved space */
-	cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
-
 	sb->s_magic = OVERLAYFS_SUPER_MAGIC;
 	sb->s_xattr = ovl_xattr_handlers(ofs);
 	sb->s_fs_info = ofs;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index c3f020ca13a8..9ae9a35a6a7a 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -68,6 +68,16 @@  const struct cred *ovl_override_creds(struct super_block *sb)
 	return override_creds(ofs->creator_cred);
 }
 
+void ovl_override_creds_new(struct ovl_cred *creator_cred)
+{
+	creator_cred->cred = override_creds(creator_cred->cred);
+}
+
+void ovl_revert_creds_new(struct ovl_cred *creator_cred)
+{
+	revert_creds(creator_cred->cred);
+}
+
 /*
  * Check if underlying fs supports file handles and try to determine encoding
  * type, in order to deduce maximum inode number used by fs.
diff --git a/include/linux/cred.h b/include/linux/cred.h
index af8d353a4b86..06eaedfe48ea 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -151,6 +151,7 @@  struct cred {
 		int non_rcu;			/* Can we skip RCU deletion? */
 		struct rcu_head	rcu;		/* RCU deletion hook */
 	};
+	bool	immutable;
 } __randomize_layout;
 
 extern void __put_cred(struct cred *);
@@ -229,7 +230,8 @@  static inline bool cap_ambient_invariant_ok(const struct cred *cred)
  */
 static inline struct cred *get_new_cred_many(struct cred *cred, int nr)
 {
-	atomic_add(nr, &cred->usage);
+	if (!cred->immutable)
+		atomic_add(nr, &cred->usage);
 	return cred;
 }
 
@@ -245,6 +247,12 @@  static inline struct cred *get_new_cred(struct cred *cred)
 	return get_new_cred_many(cred, 1);
 }
 
+static inline void cred_set_immutable(const struct cred *cred, bool imm)
+{
+	struct cred *nonconst_cred = (struct cred *) cred;
+	nonconst_cred->immutable = imm;
+}
+
 /**
  * get_cred_many - Get references on a set of credentials
  * @cred: The credentials to reference
@@ -313,7 +321,7 @@  static inline void put_cred_many(const struct cred *_cred, int nr)
 
 	if (cred) {
 		validate_creds(cred);
-		if (atomic_sub_and_test(nr, &cred->usage))
+		if (!cred->immutable && atomic_sub_and_test(nr, &cred->usage))
 			__put_cred(cred);
 	}
 }