[RFC] nfsd: return NFS4ERR_DELAY on contention for v4.0 replay_owner rp_mutex
Commit Message
move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
This can lead to a deadlock as move_to_close_lru() waits for sc_count to
drop to 2, and some threads holding a reference might be waiting for either
mutex. These references will never be dropped so sc_count will never
reach 2.
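Schematically, the cycle looks like this (simplified illustration, not
the literal source):

/* Thread A (e.g. handling CLOSE) holds ->st_mutex and .rp_mutex, and
 * calls move_to_close_lru(), which blocks until only two references
 * remain:
 */
wait_event(close_wq, refcount_read(&stp->st_stid.sc_count) == 2);

/* Thread B took an sc_count reference during stateid lookup and now
 * blocks behind thread A:
 */
mutex_lock(&so->so_replay.rp_mutex);

/* B can't drop its reference until it gets the mutex, so A's
 * wait_event() never completes: deadlock.
 */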
There have been a couple of attempted fixes (see [1] and [2]), but both
were problematic for different reasons.
This patch attempts to break the impasse by simply not waiting for the
rp_mutex. If it's contended then we just have it return NFS4ERR_DELAY.
This will likely cause parallel opens by the same openowner to be even
slower on NFSv4.0, but it should break the deadlock.
One way to address the performance impact might be to allow the wait for
the mutex to time out after a period of time (30ms would be the same as
NFSD_DELEGRETURN_TIMEOUT). We'd need to add a mutex_lock_timeout
function in order for that to work.
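For illustration, a naive helper could just poll with mutex_trylock()
(hypothetical sketch -- no such function exists in mainline, and a real
version would want to hook into the mutex slowpath rather than poll):

#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/mutex.h>

/*
 * Try to acquire @lock, giving up after @timeout jiffies.
 * Returns 0 on success, -ETIMEDOUT otherwise.
 */
static int mutex_lock_timeout(struct mutex *lock, unsigned long timeout)
{
	unsigned long deadline = jiffies + timeout;

	for (;;) {
		if (mutex_trylock(lock))
			return 0;
		if (time_after(jiffies, deadline))
			return -ETIMEDOUT;
		msleep(1);	/* crude backoff */
	}
}

nfsd4_cstate_assign_replay() could then wait up to ~30ms before giving
up and returning nfserr_jukebox.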
Chuck also suggested that we may be able to utilize the svc_defer
mechanism instead of returning NFS4ERR_DELAY in this situation, but I'm
not quite sure how feasible that is.
[1]: https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
[2]: https://lore.kernel.org/linux-nfs/170546328406.23031.11217818844350800811@noble.neil.brown.name/
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
fs/nfsd/nfs4state.c | 30 +++++++++++++++++-------------
1 file changed, 17 insertions(+), 13 deletions(-)
---
base-commit: 2eb3d14898b97bdc0596d184cbf829b5a81cd639
change-id: 20240229-rp_mutex-d20e3319e315
Best regards,
-- 
Jeff Layton <jlayton@kernel.org>
Comments
On Fri, 01 Mar 2024, Jeff Layton wrote:
> move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
> This can lead to a deadlock as move_to_close_lru() waits for sc_count to
> drop to 2, and some threads holding a reference might be waiting for either
> mutex. These references will never be dropped so sc_count will never
> reach 2.
>
> There have been a couple of attempted fixes (see [1] and [2]), but both
> were problematic for different reasons.
>
> This patch attempts to break the impasse by simply not waiting for the
> rp_mutex. If it's contended then we just have it return NFS4ERR_DELAY.
> This will likely cause parallel opens by the same openowner to be even
> slower on NFSv4.0, but it should break the deadlock.
>
> One way to address the performance impact might be to allow the wait for
> the mutex to time out after a period of time (30ms would be the same as
> NFSD_DELEGRETURN_TIMEOUT). We'd need to add a mutex_lock_timeout
> function in order for that to work.
>
> Chuck also suggested that we may be able to utilize the svc_defer
> mechanism instead of returning NFS4ERR_DELAY in this situation, but I'm
> not quite sure how feasible that is.
>
> [1]: https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
> [2]: https://lore.kernel.org/linux-nfs/170546328406.23031.11217818844350800811@noble.neil.brown.name/
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
> fs/nfsd/nfs4state.c | 30 +++++++++++++++++-------------
> 1 file changed, 17 insertions(+), 13 deletions(-)
>
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index aee12adf0598..4b667eeb06c8 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -4658,13 +4658,16 @@ static void init_nfs4_replay(struct nfs4_replay *rp)
> mutex_init(&rp->rp_mutex);
> }
>
> -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> +static __be32 nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> struct nfs4_stateowner *so)
> {
> if (!nfsd4_has_session(cstate)) {
> - mutex_lock(&so->so_replay.rp_mutex);
> + WARN_ON_ONCE(cstate->replay_owner);
> + if (!mutex_trylock(&so->so_replay.rp_mutex))
> + return nfserr_jukebox;
> cstate->replay_owner = nfs4_get_stateowner(so);
> }
> + return nfs_ok;
> }
>
> void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
> @@ -5332,15 +5335,17 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
> strhashval = ownerstr_hashval(&open->op_owner);
> oo = find_openstateowner_str(strhashval, open, clp);
> open->op_openowner = oo;
> - if (!oo) {
> + if (!oo)
> goto new_owner;
> - }
> if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
> /* Replace unconfirmed owners without checking for replay. */
> release_openowner(oo);
> open->op_openowner = NULL;
> goto new_owner;
> }
> + status = nfsd4_cstate_assign_replay(cstate, &oo->oo_owner);
> + if (status)
> + return status;
> status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
> if (status)
> return status;
> @@ -5350,6 +5355,9 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
> if (oo == NULL)
> return nfserr_jukebox;
> open->op_openowner = oo;
> + status = nfsd4_cstate_assign_replay(cstate, &oo->oo_owner);
> + if (status)
> + return status;
> alloc_stateid:
> open->op_stp = nfs4_alloc_open_stateid(clp);
> if (!open->op_stp)
> @@ -6121,12 +6129,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
> void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
> struct nfsd4_open *open)
> {
> - if (open->op_openowner) {
> - struct nfs4_stateowner *so = &open->op_openowner->oo_owner;
> -
> - nfsd4_cstate_assign_replay(cstate, so);
> - nfs4_put_stateowner(so);
> - }
> + if (open->op_openowner)
> + nfs4_put_stateowner(&open->op_openowner->oo_owner);
> if (open->op_file)
> kmem_cache_free(file_slab, open->op_file);
> if (open->op_stp)
> @@ -7193,9 +7197,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
> if (status)
> return status;
> stp = openlockstateid(s);
> - nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
> -
> - status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
> + status = nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
> + if (!status)
> + status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
> if (!status)
> *stpp = stp;
> else
>
> ---
> base-commit: 2eb3d14898b97bdc0596d184cbf829b5a81cd639
> change-id: 20240229-rp_mutex-d20e3319e315
>
> Best regards,
> --
> Jeff Layton <jlayton@kernel.org>
>
>
I haven't reviewed this thoroughly but I think it is heading in the
right direction.
I think moving the nfsd4_cstate_assign_replay() call out of
nfsd4_cleanup_open_state() and into nfsd4_process_open1() is a really
good idea and possibly should be a separate patch.
I wonder if returning NFS4ERR_DELAY is really best.
I imagine replacing rp_mutex with an atomic_t rp_locked.
This is normally 0, becomes 1 when nfsd4_cstate_assign_replay()
succeeds, and is set to 2 in move_to_close_lru().
In nfsd4_cstate_assign_replay() we wait while the value is 1.
If it becomes 0, we can get the lock and continue.
If it becomes 2, we return an error.
If this happens, the state has been unhashed. So instead of returning
NFS4ERR_DELAY we can loop back and repeat the nfsd4_lookup_stateid() or
find_openstateowner_str(), which we can be certain won't find the same
owner.
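Something like this, perhaps (untested sketch; rp_locked would replace
rp_mutex in struct nfs4_replay, and the helper names are invented):

#define RP_UNLOCKED	0	/* free */
#define RP_LOCKED	1	/* held by nfsd4_cstate_assign_replay() */
#define RP_UNHASHED	2	/* set by move_to_close_lru() */

static bool replay_owner_trylock(struct nfs4_replay *rp)
{
	/* Wait while the value is RP_LOCKED; claim it when RP_UNLOCKED. */
	wait_var_event(&rp->rp_locked,
		       atomic_cmpxchg(&rp->rp_locked,
				      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
	/* RP_UNHASHED: the owner is going away, redo the lookup. */
	return atomic_read(&rp->rp_locked) != RP_UNHASHED;
}

static void replay_owner_release(struct nfs4_replay *rp, int state)
{
	/* @state is RP_UNLOCKED for a normal unlock, or RP_UNHASHED from
	 * move_to_close_lru() to evict any waiters for good. */
	atomic_set(&rp->rp_locked, state);
	smp_mb__after_atomic();
	wake_up_var(&rp->rp_locked);
}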
I started writing this, but stumbled over the
nfsd4_cstate_assign_replay() in nfsd4_cleanup_open_state(). Now that
you've shown me how to fix that, I'll have another go.
Thanks,
NeilBrown