[3/6] fs: distinguish between user initiated freeze and kernel initiated freeze

Message ID 20230508011717.4034511-4-mcgrof@kernel.org
State New
Headers
Series vfs: provide automatic kernel freeze / resume |

Commit Message

Luis Chamberlain May 8, 2023, 1:17 a.m. UTC
  Userspace can initiate a freeze call using ioctls. If the kernel decides
to freeze a filesystem later it must be able to distinguish if userspace
had initiated the freeze, so that it does not unfreeze it later
automatically on resume.

Likewise if the kernel is initiating a freeze on its own it should *not*
fail to freeze a filesystem if a user had already frozen it on our behalf.
This same concept applies to thawing, even if its not possible for
userspace to beat the kernel in thawing a filesystem. This logic however
has never applied to userspace freezing and thawing, two consecutive
userspace freeze calls will results in only the first one succeeding, so
we must retain the same behaviour in userspace.

This doesn't implement yet kernel initiated filesystem freeze calls,
this will be done in subsequent calls. This change should introduce
no functional changes, it just extends the definitions of a frozen
filesystem to account for future kernel initiated filesystem freeze
and let's us keep record of when userpace initiated it so the kernel
can respect a userspace initiated freeze upon kernel initiated freeze
and its respective thaw cycle.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 block/bdev.c       |  4 ++--
 fs/f2fs/gc.c       |  4 ++--
 fs/gfs2/glops.c    |  2 +-
 fs/gfs2/super.c    |  2 +-
 fs/gfs2/sys.c      |  4 ++--
 fs/gfs2/util.c     |  2 +-
 fs/ioctl.c         |  4 ++--
 fs/super.c         | 29 +++++++++++++++++++++++++----
 include/linux/fs.h | 16 ++++++++++++++--
 9 files changed, 50 insertions(+), 17 deletions(-)
  

Comments

Darrick J. Wong May 16, 2023, 3:23 p.m. UTC | #1
On Sun, May 07, 2023 at 06:17:14PM -0700, Luis Chamberlain wrote:
> Userspace can initiate a freeze call using ioctls. If the kernel decides
> to freeze a filesystem later it must be able to distinguish if userspace
> had initiated the freeze, so that it does not unfreeze it later
> automatically on resume.
> 
> Likewise if the kernel is initiating a freeze on its own it should *not*
> fail to freeze a filesystem if a user had already frozen it on our behalf.
> This same concept applies to thawing, even if its not possible for
> userspace to beat the kernel in thawing a filesystem. This logic however
> has never applied to userspace freezing and thawing, two consecutive
> userspace freeze calls will results in only the first one succeeding, so
> we must retain the same behaviour in userspace.
> 
> This doesn't implement yet kernel initiated filesystem freeze calls,
> this will be done in subsequent calls. This change should introduce
> no functional changes, it just extends the definitions of a frozen
> filesystem to account for future kernel initiated filesystem freeze
> and let's us keep record of when userpace initiated it so the kernel
> can respect a userspace initiated freeze upon kernel initiated freeze
> and its respective thaw cycle.
> 
> Reviewed-by: Jan Kara <jack@suse.cz>
> Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
> ---
>  block/bdev.c       |  4 ++--
>  fs/f2fs/gc.c       |  4 ++--
>  fs/gfs2/glops.c    |  2 +-
>  fs/gfs2/super.c    |  2 +-
>  fs/gfs2/sys.c      |  4 ++--
>  fs/gfs2/util.c     |  2 +-
>  fs/ioctl.c         |  4 ++--
>  fs/super.c         | 29 +++++++++++++++++++++++++----
>  include/linux/fs.h | 16 ++++++++++++++--
>  9 files changed, 50 insertions(+), 17 deletions(-)
> 
> diff --git a/block/bdev.c b/block/bdev.c
> index dc54a2a1c46e..04f7b2c99845 100644
> --- a/block/bdev.c
> +++ b/block/bdev.c
> @@ -250,7 +250,7 @@ int freeze_bdev(struct block_device *bdev)
>  	if (sb->s_op->freeze_super)
>  		error = sb->s_op->freeze_super(sb);
>  	else
> -		error = freeze_super(sb);
> +		error = freeze_super(sb, true);
>  	deactivate_locked_super(sb);
>  
>  	if (error) {
> @@ -295,7 +295,7 @@ int thaw_bdev(struct block_device *bdev)
>  	if (sb->s_op->thaw_super)
>  		error = sb->s_op->thaw_super(sb);
>  	else
> -		error = thaw_super(sb);
> +		error = thaw_super(sb, true);
>  	if (error)
>  		bdev->bd_fsfreeze_count++;
>  	else
> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
> index e31d6791d3e3..a5891055d85d 100644
> --- a/fs/f2fs/gc.c
> +++ b/fs/f2fs/gc.c
> @@ -2168,7 +2168,7 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
>  
>  	if (!get_active_super(sbi->sb->s_bdev))
>  		return -ENOTTY;
> -	freeze_super(sbi->sb);
> +	freeze_super(sbi->sb, true);
>  
>  	f2fs_down_write(&sbi->gc_lock);
>  	f2fs_down_write(&sbi->cp_global_sem);
> @@ -2221,7 +2221,7 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
>  	f2fs_up_write(&sbi->cp_global_sem);
>  	f2fs_up_write(&sbi->gc_lock);
>  	/* We use the same active reference from freeze */
> -	thaw_super(sbi->sb);
> +	thaw_super(sbi->sb, true);
>  	deactivate_locked_super(sbi->sb);
>  	return err;
>  }
> diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
> index 01d433ed6ce7..8fd37508f9a0 100644
> --- a/fs/gfs2/glops.c
> +++ b/fs/gfs2/glops.c
> @@ -584,7 +584,7 @@ static int freeze_go_sync(struct gfs2_glock *gl)
>  	if (gl->gl_state == LM_ST_SHARED && !gfs2_withdrawn(sdp) &&
>  	    !test_bit(SDF_NORECOVERY, &sdp->sd_flags)) {
>  		atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE);
> -		error = freeze_super(sdp->sd_vfs);
> +		error = freeze_super(sdp->sd_vfs, true);
>  		if (error) {
>  			fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n",
>  				error);
> diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
> index e57cb593e2f3..f2641891de43 100644
> --- a/fs/gfs2/super.c
> +++ b/fs/gfs2/super.c
> @@ -687,7 +687,7 @@ void gfs2_freeze_func(struct work_struct *work)
>  		gfs2_assert_withdraw(sdp, 0);
>  	} else {
>  		atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
> -		error = thaw_super(sb);
> +		error = thaw_super(sb, true);
>  		if (error) {
>  			fs_info(sdp, "GFS2: couldn't thaw filesystem: %d\n",
>  				error);
> diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
> index e80c827acd09..9e0398f99674 100644
> --- a/fs/gfs2/sys.c
> +++ b/fs/gfs2/sys.c
> @@ -169,10 +169,10 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
>  
>  	switch (n) {
>  	case 0:
> -		error = thaw_super(sdp->sd_vfs);
> +		error = thaw_super(sdp->sd_vfs, true);
>  		break;
>  	case 1:
> -		error = freeze_super(sdp->sd_vfs);
> +		error = freeze_super(sdp->sd_vfs, true);
>  		break;
>  	default:
>  		deactivate_locked_super(sb);
> diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
> index 3a0cd5e9ad84..be9705d618ec 100644
> --- a/fs/gfs2/util.c
> +++ b/fs/gfs2/util.c
> @@ -191,7 +191,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
>  		/* Make sure gfs2_unfreeze works if partially-frozen */
>  		flush_work(&sdp->sd_freeze_work);
>  		atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
> -		thaw_super(sdp->sd_vfs);
> +		thaw_super(sdp->sd_vfs, true);
>  	} else {
>  		wait_on_bit(&i_gl->gl_flags, GLF_DEMOTE,
>  			    TASK_UNINTERRUPTIBLE);
> diff --git a/fs/ioctl.c b/fs/ioctl.c
> index 1d20af762e0d..3cc79b82a5dc 100644
> --- a/fs/ioctl.c
> +++ b/fs/ioctl.c
> @@ -401,7 +401,7 @@ static int ioctl_fsfreeze(struct file *filp)
>  	/* Freeze */
>  	if (sb->s_op->freeze_super)
>  		ret = sb->s_op->freeze_super(sb);
> -	ret = freeze_super(sb);
> +	ret = freeze_super(sb, true);
>  
>  	deactivate_locked_super(sb);
>  
> @@ -418,7 +418,7 @@ static int ioctl_fsthaw(struct file *filp)
>  	/* Thaw */
>  	if (sb->s_op->thaw_super)
>  		return sb->s_op->thaw_super(sb);
> -	return thaw_super(sb);
> +	return thaw_super(sb, true);
>  }
>  
>  static int ioctl_file_dedupe_range(struct file *file,
> diff --git a/fs/super.c b/fs/super.c
> index 46c6475fc765..16ccbb9dd230 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -1026,7 +1026,7 @@ static void do_thaw_all_callback(struct super_block *sb)
>  		return;
>  	if (sb->s_root && sb->s_flags & SB_BORN) {
>  		emergency_thaw_bdev(sb);
> -		thaw_super(sb);
> +		thaw_super(sb, true);
>  	}
>  	deactivate_locked_super(sb);
>  }
> @@ -1636,6 +1636,8 @@ static void sb_freeze_unlock(struct super_block *sb, int level)
>  /**
>   * freeze_super - force a filesystem backed by a block device into a consistent state
>   * @sb: the super to lock
> + * @usercall: whether or not userspace initiated this via an ioctl or if it
> + * 	was a kernel freeze
>   *
>   * Used by filesystems and the kernel to freeze a fileystem backed by a block
>   * device into a consistent state. Callers must use get_active_super(bdev) to
> @@ -1669,10 +1671,13 @@ static void sb_freeze_unlock(struct super_block *sb, int level)
>   *
>   * sb->s_writers.frozen is protected by sb->s_umount.
>   */
> -int freeze_super(struct super_block *sb)
> +int freeze_super(struct super_block *sb, bool usercall)
>  {
>  	int ret;
>  
> +	if (!usercall && sb_is_frozen(sb))
> +		return 0;

If we try a kernel freeze but the fs was already frozen by userspace, we
return ... zero?

What if userspace thaws the fs immediately afterwards?  The kernel
caller is still running, and now it erroneously thinks the fs is frozen.
Won't that break the suspend freezer?

TBH I was more thinking about fscounters scrub, which will report false
corruptions if the fs gets unfrozen while it is running.

--D

> +
>  	if (!sb_is_unfrozen(sb))
>  		return -EBUSY;
>  
> @@ -1682,6 +1687,7 @@ int freeze_super(struct super_block *sb)
>  	if (sb_rdonly(sb)) {
>  		/* Nothing to do really... */
>  		sb->s_writers.frozen = SB_FREEZE_COMPLETE;
> +		sb->s_writers.frozen_by_user = usercall;
>  		return 0;
>  	}
>  
> @@ -1699,6 +1705,7 @@ int freeze_super(struct super_block *sb)
>  	ret = sync_filesystem(sb);
>  	if (ret) {
>  		sb->s_writers.frozen = SB_UNFROZEN;
> +		sb->s_writers.frozen_by_user = false;
>  		sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT);
>  		wake_up(&sb->s_writers.wait_unfrozen);
>  		return ret;
> @@ -1724,6 +1731,7 @@ int freeze_super(struct super_block *sb)
>  	 * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
>  	 */
>  	sb->s_writers.frozen = SB_FREEZE_COMPLETE;
> +	sb->s_writers.frozen_by_user = usercall;
>  	lockdep_sb_freeze_release(sb);
>  	return 0;
>  }
> @@ -1732,21 +1740,33 @@ EXPORT_SYMBOL(freeze_super);
>  /**
>   * thaw_super -- unlock a filesystem backed by a block device
>   * @sb: the super to thaw
> + * @usercall: whether or not userspace initiated this thaw or if it was the
> + * 	kernel which initiated it
>   *
>   * Used by filesystems and the kernel to thaw a fileystem backed by a block
>   * device. Callers must use get_active_super(bdev) to lock the @sb and when
>   * done must unlock it with deactivate_locked_super(). Once done, this marks
>   * the filesystem as writeable.
>   */
> -int thaw_super(struct super_block *sb)
> +int thaw_super(struct super_block *sb, bool usercall)
>  {
>  	int error;
>  
> -	if (sb->s_writers.frozen != SB_FREEZE_COMPLETE)
> +	if (!usercall) {
> +		/*
> +		 * If userspace initiated the freeze don't let the kernel
> +		 * thaw it on return from a kernel initiated freeze.
> +		 */
> +		if (sb_is_unfrozen(sb) || sb_is_frozen_by_user(sb))
> +			return 0;
> +	}
> +
> +	if (!sb_is_frozen(sb))
>  		return -EINVAL;
>  
>  	if (sb_rdonly(sb)) {
>  		sb->s_writers.frozen = SB_UNFROZEN;
> +		sb->s_writers.frozen_by_user = false;
>  		goto out;
>  	}
>  
> @@ -1763,6 +1783,7 @@ int thaw_super(struct super_block *sb)
>  	}
>  
>  	sb->s_writers.frozen = SB_UNFROZEN;
> +	sb->s_writers.frozen_by_user = false;
>  	sb_freeze_unlock(sb, SB_FREEZE_FS);
>  out:
>  	wake_up(&sb->s_writers.wait_unfrozen);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 90b5bdc4071a..d9b46c858103 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1146,6 +1146,7 @@ enum {
>  
>  struct sb_writers {
>  	int				frozen;		/* Is sb frozen? */
> +	bool				frozen_by_user;	/* User freeze? */
>  	wait_queue_head_t		wait_unfrozen;	/* wait for thaw */
>  	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
>  };
> @@ -1632,6 +1633,17 @@ static inline bool sb_is_frozen(struct super_block *sb)
>  	return sb->s_writers.frozen == SB_FREEZE_COMPLETE;
>  }
>  
> +/**
> + * sb_is_frozen_by_user - was the superblock frozen by userspace?
> + * @sb: the super to check
> + *
> + * Returns true if the super is frozen by userspace, such as an ioctl.
> + */
> +static inline bool sb_is_frozen_by_user(struct super_block *sb)
> +{
> +	return sb_is_frozen(sb) && sb->s_writers.frozen_by_user;
> +}
> +
>  /**
>   * sb_is_unfrozen - is superblock unfrozen
>   * @sb: the super to check
> @@ -2308,8 +2320,8 @@ extern int unregister_filesystem(struct file_system_type *);
>  extern int vfs_statfs(const struct path *, struct kstatfs *);
>  extern int user_statfs(const char __user *, struct kstatfs *);
>  extern int fd_statfs(int, struct kstatfs *);
> -extern int freeze_super(struct super_block *super);
> -extern int thaw_super(struct super_block *super);
> +extern int freeze_super(struct super_block *super, bool usercall);
> +extern int thaw_super(struct super_block *super, bool usercall);
>  extern __printf(2, 3)
>  int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
>  extern int super_setup_bdi(struct super_block *sb);
> -- 
> 2.39.2
>
  
Darrick J. Wong May 22, 2023, 11:42 p.m. UTC | #2
How about this as an alternative patch?  Kernel and userspace freeze
state are stored in s_writers; each type cannot block the other (though
you still can't have nested kernel or userspace freezes); and the freeze
is maintained until /both/ freeze types are dropped.

AFAICT this should work for the two other usecases (quiescing pagefaults
for fsdax pmem pre-removal; and freezing fses during suspend) besides
online fsck for xfs.

--D

From: Darrick J. Wong <djwong@kernel.org>
Subject: fs: distinguish between user initiated freeze and kernel initiated freeze

Userspace can freeze a filesystem using the FIFREEZE ioctl or by
suspending the block device; this state persists until userspace thaws
the filesystem with the FITHAW ioctl or resuming the block device.
Since commit 18e9e5104fcd ("Introduce freeze_super and thaw_super for
the fsfreeze ioctl") we only allow the first freeze command to succeed.

The kernel may decide that it is necessary to freeze a filesystem for
its own internal purposes, such as suspends in progress, filesystem fsck
activities, or quiescing a device prior to removal.  Userspace thaw
commands must never break a kernel freeze, and kernel thaw commands
shouldn't undo userspace's freeze command.

Introduce a couple of freeze holder flags and wire it into the
sb_writers state.  One kernel and one userspace freeze are allowed to
coexist at the same time; the filesystem will not thaw until both are
lifted.

Inspired-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/super.c         |  172 +++++++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/fs.h |    8 ++
 2 files changed, 170 insertions(+), 10 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 34afe411cf2b..7496d51affb9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -39,7 +39,7 @@
 #include <uapi/linux/mount.h>
 #include "internal.h"
 
-static int thaw_super_locked(struct super_block *sb);
+static int thaw_super_locked(struct super_block *sb, unsigned short who);
 
 static LIST_HEAD(super_blocks);
 static DEFINE_SPINLOCK(sb_lock);
@@ -1027,7 +1027,7 @@ static void do_thaw_all_callback(struct super_block *sb)
 	down_write(&sb->s_umount);
 	if (sb->s_root && sb->s_flags & SB_BORN) {
 		emergency_thaw_bdev(sb);
-		thaw_super_locked(sb);
+		thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE);
 	} else {
 		up_write(&sb->s_umount);
 	}
@@ -1636,13 +1636,21 @@ static void sb_freeze_unlock(struct super_block *sb, int level)
 }
 
 /**
- * freeze_super - lock the filesystem and force it into a consistent state
+ * __freeze_super - lock the filesystem and force it into a consistent state
  * @sb: the super to lock
+ * @who: FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs;
+ * FREEZE_HOLDER_KERNEL if the kernel wants to freeze it
  *
  * Syncs the super to make sure the filesystem is consistent and calls the fs's
- * freeze_fs.  Subsequent calls to this without first thawing the fs will return
+ * freeze_fs.  Subsequent calls to this without first thawing the fs may return
  * -EBUSY.
  *
+ * The @who argument distinguishes between the kernel and userspace trying to
+ * freeze the filesystem.  Although there cannot be multiple kernel freezes or
+ * multiple userspace freezes in effect at any given time, the kernel and
+ * userspace can both hold a filesystem frozen.  The filesystem remains frozen
+ * until there are no kernel or userspace freezes in effect.
+ *
  * During this function, sb->s_writers.frozen goes through these values:
  *
  * SB_UNFROZEN: File system is normal, all writes progress as usual.
@@ -1668,12 +1676,61 @@ static void sb_freeze_unlock(struct super_block *sb, int level)
  *
  * sb->s_writers.frozen is protected by sb->s_umount.
  */
-int freeze_super(struct super_block *sb)
+static int __freeze_super(struct super_block *sb, unsigned short who)
 {
+	struct sb_writers *sbw = &sb->s_writers;
 	int ret;
 
 	atomic_inc(&sb->s_active);
 	down_write(&sb->s_umount);
+
+	if (sbw->frozen == SB_FREEZE_COMPLETE) {
+		switch (who) {
+		case FREEZE_HOLDER_KERNEL:
+			if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
+				/*
+				 * Kernel freeze already in effect; caller can
+				 * try again.
+				 */
+				deactivate_locked_super(sb);
+				return -EBUSY;
+			}
+			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
+				/*
+				 * Share the freeze state with the userspace
+				 * freeze already in effect.
+				 */
+				sbw->freeze_holders |= who;
+				deactivate_locked_super(sb);
+				return 0;
+			}
+			break;
+		case FREEZE_HOLDER_USERSPACE:
+			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
+				/*
+				 * Userspace freeze already in effect; tell
+				 * the caller we're busy.
+				 */
+				deactivate_locked_super(sb);
+				return -EBUSY;
+			}
+			if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
+				/*
+				 * Share the freeze state with the kernel
+				 * freeze already in effect.
+				 */
+				sbw->freeze_holders |= who;
+				deactivate_locked_super(sb);
+				return 0;
+			}
+			break;
+		default:
+			BUG();
+			deactivate_locked_super(sb);
+			return -EINVAL;
+		}
+	}
+
 	if (sb->s_writers.frozen != SB_UNFROZEN) {
 		deactivate_locked_super(sb);
 		return -EBUSY;
@@ -1686,6 +1743,7 @@ int freeze_super(struct super_block *sb)
 
 	if (sb_rdonly(sb)) {
 		/* Nothing to do really... */
+		sb->s_writers.freeze_holders |= who;
 		sb->s_writers.frozen = SB_FREEZE_COMPLETE;
 		up_write(&sb->s_umount);
 		return 0;
@@ -1731,23 +1789,103 @@ int freeze_super(struct super_block *sb)
 	 * For debugging purposes so that fs can warn if it sees write activity
 	 * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
 	 */
+	sb->s_writers.freeze_holders |= who;
 	sb->s_writers.frozen = SB_FREEZE_COMPLETE;
 	lockdep_sb_freeze_release(sb);
 	up_write(&sb->s_umount);
 	return 0;
 }
+
+/*
+ * freeze_super - lock the filesystem and force it into a consistent state
+ * @sb: the super to lock
+ *
+ * Syncs the super to make sure the filesystem is consistent and calls the fs's
+ * freeze_fs.  Subsequent calls to this without first calling thaw_super will
+ * return -EBUSY.  See the comment for __freeze_super for more information.
+ */
+int freeze_super(struct super_block *sb)
+{
+	return __freeze_super(sb, FREEZE_HOLDER_USERSPACE);
+}
 EXPORT_SYMBOL(freeze_super);
 
-static int thaw_super_locked(struct super_block *sb)
+/**
+ * freeze_super_kernel - lock the filesystem for an internal kernel operation
+ * and force it into a consistent state.
+ * @sb: the super to lock
+ *
+ * Syncs the super to make sure the filesystem is consistent and calls the fs's
+ * freeze_fs.  Subsequent calls to this without first calling thaw_super_excl
+ * will return -EBUSY.
+ */
+int freeze_super_kernel(struct super_block *sb)
 {
+	return __freeze_super(sb, FREEZE_HOLDER_KERNEL);
+}
+EXPORT_SYMBOL_GPL(freeze_super_kernel);
+
+/*
+ * Undoes the effect of a freeze_super_locked call.  If the filesystem is
+ * frozen both by userspace and the kernel, a thaw call from either source
+ * removes that state without releasing the other state or unlocking the
+ * filesystem.
+ */
+static int thaw_super_locked(struct super_block *sb, unsigned short who)
+{
+	struct sb_writers *sbw = &sb->s_writers;
 	int error;
 
+	if (sbw->frozen == SB_FREEZE_COMPLETE) {
+		switch (who) {
+		case FREEZE_HOLDER_KERNEL:
+			if (!(sbw->freeze_holders & FREEZE_HOLDER_KERNEL)) {
+				/* Caller doesn't hold a kernel freeze. */
+				up_write(&sb->s_umount);
+				return -EINVAL;
+			}
+			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
+				/*
+				 * We were sharing the freeze with userspace,
+				 * so drop the userspace freeze but exit
+				 * without unfreezing.
+				 */
+				sbw->freeze_holders &= ~who;
+				up_write(&sb->s_umount);
+				return 0;
+			}
+			break;
+		case FREEZE_HOLDER_USERSPACE:
+			if (!(sbw->freeze_holders & FREEZE_HOLDER_USERSPACE)) {
+				/* Caller doesn't hold a userspace freeze. */
+				up_write(&sb->s_umount);
+				return -EINVAL;
+			}
+			if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
+				/*
+				 * We were sharing the freeze with the kernel,
+				 * so drop the kernel freeze but exit without
+				 * unfreezing.
+				 */
+				sbw->freeze_holders &= ~who;
+				up_write(&sb->s_umount);
+				return 0;
+			}
+			break;
+		default:
+			BUG();
+			up_write(&sb->s_umount);
+			return -EINVAL;
+		}
+	}
+
 	if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) {
 		up_write(&sb->s_umount);
 		return -EINVAL;
 	}
 
 	if (sb_rdonly(sb)) {
+		sb->s_writers.freeze_holders &= ~who;
 		sb->s_writers.frozen = SB_UNFROZEN;
 		goto out;
 	}
@@ -1765,6 +1903,7 @@ static int thaw_super_locked(struct super_block *sb)
 		}
 	}
 
+	sb->s_writers.freeze_holders &= ~who;
 	sb->s_writers.frozen = SB_UNFROZEN;
 	sb_freeze_unlock(sb, SB_FREEZE_FS);
 out:
@@ -1774,18 +1913,33 @@ static int thaw_super_locked(struct super_block *sb)
 }
 
 /**
- * thaw_super -- unlock filesystem
+ * thaw_super -- unlock filesystem frozen with freeze_super
  * @sb: the super to thaw
  *
- * Unlocks the filesystem and marks it writeable again after freeze_super().
+ * Unlocks the filesystem after freeze_super, and make it writeable again if
+ * there is not a freeze_super_kernel still in effect.
  */
 int thaw_super(struct super_block *sb)
 {
 	down_write(&sb->s_umount);
-	return thaw_super_locked(sb);
+	return thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE);
 }
 EXPORT_SYMBOL(thaw_super);
 
+/**
+ * thaw_super_kernel -- unlock filesystem frozen with freeze_super_kernel
+ * @sb: the super to thaw
+ *
+ * Unlocks the filesystem after freeze_super_kernel, and make it writeable
+ * again if there is not a freeze_super still in effect.
+ */
+int thaw_super_kernel(struct super_block *sb)
+{
+	down_write(&sb->s_umount);
+	return thaw_super_locked(sb, FREEZE_HOLDER_KERNEL);
+}
+EXPORT_SYMBOL_GPL(thaw_super_kernel);
+
 /*
  * Create workqueue for deferred direct IO completions. We allocate the
  * workqueue when it's first needed. This avoids creating workqueue for
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 21a981680856..147644b5d648 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1145,11 +1145,15 @@ enum {
 #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
 
 struct sb_writers {
-	int				frozen;		/* Is sb frozen? */
+	unsigned short			frozen;		/* Is sb frozen? */
+	unsigned short			freeze_holders;	/* Who froze fs? */
 	wait_queue_head_t		wait_unfrozen;	/* wait for thaw */
 	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
 };
 
+#define FREEZE_HOLDER_USERSPACE	(1U << 1)	/* userspace froze fs */
+#define FREEZE_HOLDER_KERNEL	(1U << 2)	/* kernel froze fs */
+
 struct super_block {
 	struct list_head	s_list;		/* Keep this first */
 	dev_t			s_dev;		/* search index; _not_ kdev_t */
@@ -2288,6 +2292,8 @@ extern int user_statfs(const char __user *, struct kstatfs *);
 extern int fd_statfs(int, struct kstatfs *);
 extern int freeze_super(struct super_block *super);
 extern int thaw_super(struct super_block *super);
+extern int freeze_super_kernel(struct super_block *super);
+extern int thaw_super_kernel(struct super_block *super);
 extern __printf(2, 3)
 int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
 extern int super_setup_bdi(struct super_block *sb);
  
Jan Kara May 25, 2023, 2:14 p.m. UTC | #3
On Mon 22-05-23 16:42:00, Darrick J. Wong wrote:
> How about this as an alternative patch?  Kernel and userspace freeze
> state are stored in s_writers; each type cannot block the other (though
> you still can't have nested kernel or userspace freezes); and the freeze
> is maintained until /both/ freeze types are dropped.
> 
> AFAICT this should work for the two other usecases (quiescing pagefaults
> for fsdax pmem pre-removal; and freezing fses during suspend) besides
> online fsck for xfs.
> 
> --D
> 
> From: Darrick J. Wong <djwong@kernel.org>
> Subject: fs: distinguish between user initiated freeze and kernel initiated freeze
> 
> Userspace can freeze a filesystem using the FIFREEZE ioctl or by
> suspending the block device; this state persists until userspace thaws
> the filesystem with the FITHAW ioctl or resuming the block device.
> Since commit 18e9e5104fcd ("Introduce freeze_super and thaw_super for
> the fsfreeze ioctl") we only allow the first freeze command to succeed.
> 
> The kernel may decide that it is necessary to freeze a filesystem for
> its own internal purposes, such as suspends in progress, filesystem fsck
> activities, or quiescing a device prior to removal.  Userspace thaw
> commands must never break a kernel freeze, and kernel thaw commands
> shouldn't undo userspace's freeze command.
> 
> Introduce a couple of freeze holder flags and wire it into the
> sb_writers state.  One kernel and one userspace freeze are allowed to
> coexist at the same time; the filesystem will not thaw until both are
> lifted.
> 
> Inspired-by: Luis Chamberlain <mcgrof@kernel.org>
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>

Yes, this is exactly how I'd imagine it. Thanks for writing the patch!

I'd just note that this would need rebasing on top of Luis' patches 1 and
2. Also:

> +	if (sbw->frozen == SB_FREEZE_COMPLETE) {
> +		switch (who) {
> +		case FREEZE_HOLDER_KERNEL:
> +			if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
> +				/*
> +				 * Kernel freeze already in effect; caller can
> +				 * try again.
> +				 */
> +				deactivate_locked_super(sb);
> +				return -EBUSY;
> +			}
> +			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
> +				/*
> +				 * Share the freeze state with the userspace
> +				 * freeze already in effect.
> +				 */
> +				sbw->freeze_holders |= who;
> +				deactivate_locked_super(sb);
> +				return 0;
> +			}
> +			break;
> +		case FREEZE_HOLDER_USERSPACE:
> +			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
> +				/*
> +				 * Userspace freeze already in effect; tell
> +				 * the caller we're busy.
> +				 */
> +				deactivate_locked_super(sb);
> +				return -EBUSY;
> +			}
> +			if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
> +				/*
> +				 * Share the freeze state with the kernel
> +				 * freeze already in effect.
> +				 */
> +				sbw->freeze_holders |= who;
> +				deactivate_locked_super(sb);
> +				return 0;
> +			}
> +			break;
> +		default:
> +			BUG();
> +			deactivate_locked_super(sb);
> +			return -EINVAL;
> +		}
> +	}

Can't this be simplified to:

	BUG_ON(who & ~(FREEZE_HOLDER_USERSPACE | FREEZE_HOLDER_KERNEL));
	BUG_ON(!(!(who & FREEZE_HOLDER_USERSPACE) ^
	       !(who & FREEZE_HOLDER_KERNEL)));
retry:
	if (sb->s_writers.freeze_holders & who)
		return -EBUSY;
	/* Already frozen by someone else? */
	if (sb->s_writers.freeze_holders & ~who) {
		sb->s_writers.freeze_holders |= who;
		return 0;
	}

Now the only remaining issue with the code is that the two different
holders can be attempting to freeze the filesystem at once and in that case
one of them has to wait for the other one instead of returning -EBUSY as
would happen currently. This can happen because we temporarily drop
s_umount in freeze_super() due to lock ordering issues. I think we could
do something like:

	if (!sb_unfrozen(sb)) {
		up_write(&sb->s_umount);
		wait_var_event(&sb->s_writers.frozen,
			       sb_unfrozen(sb) || sb_frozen(sb));
		down_write(&sb->s_umount);
		goto retry;
	}

and then sprinkle wake_up_var(&sb->s_writers.frozen) at appropriate places
in freeze_super().

BTW, when reading this code, I've spotted attached cleanup opportunity but
I'll queue that separately so that is JFYI.

> +#define FREEZE_HOLDER_USERSPACE	(1U << 1)	/* userspace froze fs */
> +#define FREEZE_HOLDER_KERNEL	(1U << 2)	/* kernel froze fs */

Why not start from 1U << 0? And bonus points for using BIT() macro :).

								Honza
  
Darrick J. Wong June 6, 2023, 5:19 p.m. UTC | #4
On Thu, May 25, 2023 at 04:14:30PM +0200, Jan Kara wrote:
> On Mon 22-05-23 16:42:00, Darrick J. Wong wrote:
> > How about this as an alternative patch?  Kernel and userspace freeze
> > state are stored in s_writers; each type cannot block the other (though
> > you still can't have nested kernel or userspace freezes); and the freeze
> > is maintained until /both/ freeze types are dropped.
> > 
> > AFAICT this should work for the two other usecases (quiescing pagefaults
> > for fsdax pmem pre-removal; and freezing fses during suspend) besides
> > online fsck for xfs.
> > 
> > --D
> > 
> > From: Darrick J. Wong <djwong@kernel.org>
> > Subject: fs: distinguish between user initiated freeze and kernel initiated freeze
> > 
> > Userspace can freeze a filesystem using the FIFREEZE ioctl or by
> > suspending the block device; this state persists until userspace thaws
> > the filesystem with the FITHAW ioctl or resuming the block device.
> > Since commit 18e9e5104fcd ("Introduce freeze_super and thaw_super for
> > the fsfreeze ioctl") we only allow the first freeze command to succeed.
> > 
> > The kernel may decide that it is necessary to freeze a filesystem for
> > its own internal purposes, such as suspends in progress, filesystem fsck
> > activities, or quiescing a device prior to removal.  Userspace thaw
> > commands must never break a kernel freeze, and kernel thaw commands
> > shouldn't undo userspace's freeze command.
> > 
> > Introduce a couple of freeze holder flags and wire it into the
> > sb_writers state.  One kernel and one userspace freeze are allowed to
> > coexist at the same time; the filesystem will not thaw until both are
> > lifted.
> > 
> > Inspired-by: Luis Chamberlain <mcgrof@kernel.org>
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> 
> Yes, this is exactly how I'd imagine it. Thanks for writing the patch!
> 
> I'd just note that this would need rebasing on top of Luis' patches 1 and
> 2. Also:

I started doing that, but I noticed that after patch 1, freeze_super no
longer leaves s_active elevated if the freeze is successful.  The
callers drop the s_active ref that they themselves obtained, which
means that we've now changed that behavior, right?  ioctl_fsfreeze now
does:

	if (!get_active_super(sb->s_bdev))
		return -ENOTTY;

(Increase ref)

        /* Freeze */
        if (sb->s_op->freeze_super)
		ret = sb->s_op->freeze_super(sb);
	ret = freeze_super(sb);

(Not sure why we can do both here?)

	deactivate_locked_super(sb);

(Decrease ref; net change to s_active is zero)

	return ret;

Luis hasn't responded to my question, so I stopped.

> > +	if (sbw->frozen == SB_FREEZE_COMPLETE) {
> > +		switch (who) {
> > +		case FREEZE_HOLDER_KERNEL:
> > +			if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
> > +				/*
> > +				 * Kernel freeze already in effect; caller can
> > +				 * try again.
> > +				 */
> > +				deactivate_locked_super(sb);
> > +				return -EBUSY;
> > +			}
> > +			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
> > +				/*
> > +				 * Share the freeze state with the userspace
> > +				 * freeze already in effect.
> > +				 */
> > +				sbw->freeze_holders |= who;
> > +				deactivate_locked_super(sb);
> > +				return 0;
> > +			}
> > +			break;
> > +		case FREEZE_HOLDER_USERSPACE:
> > +			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
> > +				/*
> > +				 * Userspace freeze already in effect; tell
> > +				 * the caller we're busy.
> > +				 */
> > +				deactivate_locked_super(sb);
> > +				return -EBUSY;
> > +			}
> > +			if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
> > +				/*
> > +				 * Share the freeze state with the kernel
> > +				 * freeze already in effect.
> > +				 */
> > +				sbw->freeze_holders |= who;
> > +				deactivate_locked_super(sb);
> > +				return 0;
> > +			}
> > +			break;
> > +		default:
> > +			BUG();
> > +			deactivate_locked_super(sb);
> > +			return -EINVAL;
> > +		}
> > +	}
> 
> Can't this be simplified to:
> 
> 	BUG_ON(who & ~(FREEZE_HOLDER_USERSPACE | FREEZE_HOLDER_KERNEL));
> 	BUG_ON(!(!(who & FREEZE_HOLDER_USERSPACE) ^
> 	       !(who & FREEZE_HOLDER_KERNEL)));
> retry:
> 	if (sb->s_writers.freeze_holders & who)
> 		return -EBUSY;
> 	/* Already frozen by someone else? */
> 	if (sb->s_writers.freeze_holders & ~who) {
> 		sb->s_writers.freeze_holders |= who;
> 		return 0;
> 	}

Yes, it can.

> Now the only remaining issue with the code is that the two different
> holders can be attempting to freeze the filesystem at once and in that case
> one of them has to wait for the other one instead of returning -EBUSY as
> would happen currently. This can happen because we temporarily drop
> s_umount in freeze_super() due to lock ordering issues. I think we could
> do something like:
> 
> 	if (!sb_unfrozen(sb)) {
> 		up_write(&sb->s_umount);
> 		wait_var_event(&sb->s_writers.frozen,
> 			       sb_unfrozen(sb) || sb_frozen(sb));
> 		down_write(&sb->s_umount);
> 		goto retry;
> 	}
> 
> and then sprinkle wake_up_var(&sb->s_writers.frozen) at appropriate places
> in freeze_super().

I think that'd work.  Let me try that.

> BTW, when reading this code, I've spotted attached cleanup opportunity but
> I'll queue that separately so that is JFYI.
> 
> > +#define FREEZE_HOLDER_USERSPACE	(1U << 1)	/* userspace froze fs */
> > +#define FREEZE_HOLDER_KERNEL	(1U << 2)	/* kernel froze fs */
> 
> Why not start from 1U << 0? And bonus points for using BIT() macro :).

I didn't think filesystem code was supposed to be using stuff from
vdso.h...

--D

> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR

> From 9fce35f21f9a62470e764463c84373fb013108fd Mon Sep 17 00:00:00 2001
> From: Jan Kara <jack@suse.cz>
> Date: Thu, 25 May 2023 15:56:19 +0200
> Subject: [PATCH] fs: Drop wait_unfrozen wait queue
> 
> wait_unfrozen waitqueue is used only in quota code to wait for
> filesystem to become unfrozen. In that place we can just use
> sb_start_write() - sb_end_write() pair to achieve the same. So just
> remove the waitqueue.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/quota/quota.c   | 5 +++--
>  fs/super.c         | 4 ----
>  include/linux/fs.h | 1 -
>  3 files changed, 3 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/quota/quota.c b/fs/quota/quota.c
> index 052f143e2e0e..0e41fb84060f 100644
> --- a/fs/quota/quota.c
> +++ b/fs/quota/quota.c
> @@ -895,8 +895,9 @@ static struct super_block *quotactl_block(const char __user *special, int cmd)
>  			up_write(&sb->s_umount);
>  		else
>  			up_read(&sb->s_umount);
> -		wait_event(sb->s_writers.wait_unfrozen,
> -			   sb->s_writers.frozen == SB_UNFROZEN);
> +		/* Wait for sb to unfreeze */
> +		sb_start_write(sb);
> +		sb_end_write(sb);
>  		put_super(sb);
>  		goto retry;
>  	}
> diff --git a/fs/super.c b/fs/super.c
> index 34afe411cf2b..6283cea67280 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -236,7 +236,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
>  					&type->s_writers_key[i]))
>  			goto fail;
>  	}
> -	init_waitqueue_head(&s->s_writers.wait_unfrozen);
>  	s->s_bdi = &noop_backing_dev_info;
>  	s->s_flags = flags;
>  	if (s->s_user_ns != &init_user_ns)
> @@ -1706,7 +1705,6 @@ int freeze_super(struct super_block *sb)
>  	if (ret) {
>  		sb->s_writers.frozen = SB_UNFROZEN;
>  		sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT);
> -		wake_up(&sb->s_writers.wait_unfrozen);
>  		deactivate_locked_super(sb);
>  		return ret;
>  	}
> @@ -1722,7 +1720,6 @@ int freeze_super(struct super_block *sb)
>  				"VFS:Filesystem freeze failed\n");
>  			sb->s_writers.frozen = SB_UNFROZEN;
>  			sb_freeze_unlock(sb, SB_FREEZE_FS);
> -			wake_up(&sb->s_writers.wait_unfrozen);
>  			deactivate_locked_super(sb);
>  			return ret;
>  		}
> @@ -1768,7 +1765,6 @@ static int thaw_super_locked(struct super_block *sb)
>  	sb->s_writers.frozen = SB_UNFROZEN;
>  	sb_freeze_unlock(sb, SB_FREEZE_FS);
>  out:
> -	wake_up(&sb->s_writers.wait_unfrozen);
>  	deactivate_locked_super(sb);
>  	return 0;
>  }
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 21a981680856..3b65a6194485 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1146,7 +1146,6 @@ enum {
>  
>  struct sb_writers {
>  	int				frozen;		/* Is sb frozen? */
> -	wait_queue_head_t		wait_unfrozen;	/* wait for thaw */
>  	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
>  };
>  
> -- 
> 2.35.3
>
  
Jan Kara June 7, 2023, 9:22 a.m. UTC | #5
On Tue 06-06-23 10:19:56, Darrick J. Wong wrote:
> On Thu, May 25, 2023 at 04:14:30PM +0200, Jan Kara wrote:
> > On Mon 22-05-23 16:42:00, Darrick J. Wong wrote:
> > > How about this as an alternative patch?  Kernel and userspace freeze
> > > state are stored in s_writers; each type cannot block the other (though
> > > you still can't have nested kernel or userspace freezes); and the freeze
> > > is maintained until /both/ freeze types are dropped.
> > > 
> > > AFAICT this should work for the two other usecases (quiescing pagefaults
> > > for fsdax pmem pre-removal; and freezing fses during suspend) besides
> > > online fsck for xfs.
> > > 
> > > --D
> > > 
> > > From: Darrick J. Wong <djwong@kernel.org>
> > > Subject: fs: distinguish between user initiated freeze and kernel initiated freeze
> > > 
> > > Userspace can freeze a filesystem using the FIFREEZE ioctl or by
> > > suspending the block device; this state persists until userspace thaws
> > > the filesystem with the FITHAW ioctl or resuming the block device.
> > > Since commit 18e9e5104fcd ("Introduce freeze_super and thaw_super for
> > > the fsfreeze ioctl") we only allow the first freeze command to succeed.
> > > 
> > > The kernel may decide that it is necessary to freeze a filesystem for
> > > its own internal purposes, such as suspends in progress, filesystem fsck
> > > activities, or quiescing a device prior to removal.  Userspace thaw
> > > commands must never break a kernel freeze, and kernel thaw commands
> > > shouldn't undo userspace's freeze command.
> > > 
> > > Introduce a couple of freeze holder flags and wire it into the
> > > sb_writers state.  One kernel and one userspace freeze are allowed to
> > > coexist at the same time; the filesystem will not thaw until both are
> > > lifted.
> > > 
> > > Inspired-by: Luis Chamberlain <mcgrof@kernel.org>
> > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > 
> > Yes, this is exactly how I'd imagine it. Thanks for writing the patch!
> > 
> > I'd just note that this would need rebasing on top of Luis' patches 1 and
> > 2. Also:
> 
> I started doing that, but I noticed that after patch 1, freeze_super no
> longer leaves s_active elevated if the freeze is successful.  The
> callers drop the s_active ref that they themselves obtained, which
> means that we've now changed that behavior, right?  ioctl_fsfreeze now
> does:
> 
> 	if (!get_active_super(sb->s_bdev))
> 		return -ENOTTY;
> 
> (Increase ref)
> 
>         /* Freeze */
>         if (sb->s_op->freeze_super)
> 		ret = sb->s_op->freeze_super(sb);
> 	ret = freeze_super(sb);
> 
> (Not sure why we can do both here?)
> 
> 	deactivate_locked_super(sb);
> 
> (Decrease ref; net change to s_active is zero)
> 
> 	return ret;
> 
> Luis hasn't responded to my question, so I stopped.

Right. I kind of like how he's moved the locking out of freeze_super() /
thaw_super() but I agree this semantic change is problematic and needs much
more thought - e.g. with Luis' version the fs could be unmounted while
frozen which is going to spectacularly deadlock. So yeah let's just ignore
patch 1 for now.

Longer term I believe if the sb is frozen by userspace, we should refuse to
unmount it but that's a separate discussion for some other time.

> > BTW, when reading this code, I've spotted attached cleanup opportunity but
> > I'll queue that separately so that is JFYI.
> > 
> > > +#define FREEZE_HOLDER_USERSPACE	(1U << 1)	/* userspace froze fs */
> > > +#define FREEZE_HOLDER_KERNEL	(1U << 2)	/* kernel froze fs */
> > 
> > Why not start from 1U << 0? And bonus points for using BIT() macro :).
> 
> I didn't think filesystem code was supposed to be using stuff from
> vdso.h...

Hum, so BIT() macro is quite widely used in include/linux/ (from generic
headers e.g. in trace.h). include/linux/bits.h seems to be the right
include to use but I'm pretty sure include/linux/fs.h already gets this
header through something.

								Honza
  
Darrick J. Wong June 7, 2023, 2:50 p.m. UTC | #6
On Wed, Jun 07, 2023 at 11:22:43AM +0200, Jan Kara wrote:
> On Tue 06-06-23 10:19:56, Darrick J. Wong wrote:
> > On Thu, May 25, 2023 at 04:14:30PM +0200, Jan Kara wrote:
> > > On Mon 22-05-23 16:42:00, Darrick J. Wong wrote:
> > > > How about this as an alternative patch?  Kernel and userspace freeze
> > > > state are stored in s_writers; each type cannot block the other (though
> > > > you still can't have nested kernel or userspace freezes); and the freeze
> > > > is maintained until /both/ freeze types are dropped.
> > > > 
> > > > AFAICT this should work for the two other usecases (quiescing pagefaults
> > > > for fsdax pmem pre-removal; and freezing fses during suspend) besides
> > > > online fsck for xfs.
> > > > 
> > > > --D
> > > > 
> > > > From: Darrick J. Wong <djwong@kernel.org>
> > > > Subject: fs: distinguish between user initiated freeze and kernel initiated freeze
> > > > 
> > > > Userspace can freeze a filesystem using the FIFREEZE ioctl or by
> > > > suspending the block device; this state persists until userspace thaws
> > > > the filesystem with the FITHAW ioctl or resuming the block device.
> > > > Since commit 18e9e5104fcd ("Introduce freeze_super and thaw_super for
> > > > the fsfreeze ioctl") we only allow the first freeze command to succeed.
> > > > 
> > > > The kernel may decide that it is necessary to freeze a filesystem for
> > > > its own internal purposes, such as suspends in progress, filesystem fsck
> > > > activities, or quiescing a device prior to removal.  Userspace thaw
> > > > commands must never break a kernel freeze, and kernel thaw commands
> > > > shouldn't undo userspace's freeze command.
> > > > 
> > > > Introduce a couple of freeze holder flags and wire it into the
> > > > sb_writers state.  One kernel and one userspace freeze are allowed to
> > > > coexist at the same time; the filesystem will not thaw until both are
> > > > lifted.
> > > > 
> > > > Inspired-by: Luis Chamberlain <mcgrof@kernel.org>
> > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > 
> > > Yes, this is exactly how I'd imagine it. Thanks for writing the patch!
> > > 
> > > I'd just note that this would need rebasing on top of Luis' patches 1 and
> > > 2. Also:
> > 
> > I started doing that, but I noticed that after patch 1, freeze_super no
> > longer leaves s_active elevated if the freeze is successful.  The
> > callers drop the s_active ref that they themselves obtained, which
> > means that we've now changed that behavior, right?  ioctl_fsfreeze now
> > does:
> > 
> > 	if (!get_active_super(sb->s_bdev))
> > 		return -ENOTTY;
> > 
> > (Increase ref)
> > 
> >         /* Freeze */
> >         if (sb->s_op->freeze_super)
> > 		ret = sb->s_op->freeze_super(sb);
> > 	ret = freeze_super(sb);
> > 
> > (Not sure why we can do both here?)
> > 
> > 	deactivate_locked_super(sb);
> > 
> > (Decrease ref; net change to s_active is zero)
> > 
> > 	return ret;
> > 
> > Luis hasn't responded to my question, so I stopped.
> 
> Right. I kind of like how he's moved the locking out of freeze_super() /
> thaw_super() but I agree this semantic change is problematic and needs much
> more thought - e.g. with Luis' version the fs could be unmounted while
> frozen which is going to spectacularly deadlock. So yeah let's just ignore
> patch 1 for now.

Agreed, I like moving the locking out of freeze_super too.

I'm less enthused about patch 2's helpers since there are those
intermediate freezer states whose existence are only hinted at if you
get to the point of asking yourself "Why would there be both _is_frozen
and _is_unfrozen predicates?".

> Longer term I believe if the sb is frozen by userspace, we should refuse to
> unmount it but that's a separate discussion for some other time.
> 
> > > BTW, when reading this code, I've spotted attached cleanup opportunity but
> > > I'll queue that separately so that is JFYI.
> > > 
> > > > +#define FREEZE_HOLDER_USERSPACE	(1U << 1)	/* userspace froze fs */
> > > > +#define FREEZE_HOLDER_KERNEL	(1U << 2)	/* kernel froze fs */
> > > 
> > > Why not start from 1U << 0? And bonus points for using BIT() macro :).
> > 
> > I didn't think filesystem code was supposed to be using stuff from
> > vdso.h...
> 
> Hum, so BIT() macro is quite widely used in include/linux/ (from generic
> headers e.g. in trace.h). include/linux/bits.h seems to be the right
> include to use but I'm pretty sure include/linux/fs.h already gets this
> header through something.

Ok, will do.

--D

> 
> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
  
Darrick J. Wong June 7, 2023, 4:31 p.m. UTC | #7
On Thu, May 25, 2023 at 04:14:30PM +0200, Jan Kara wrote:
> On Mon 22-05-23 16:42:00, Darrick J. Wong wrote:
> > How about this as an alternative patch?  Kernel and userspace freeze
> > state are stored in s_writers; each type cannot block the other (though
> > you still can't have nested kernel or userspace freezes); and the freeze
> > is maintained until /both/ freeze types are dropped.
> > 
> > AFAICT this should work for the two other usecases (quiescing pagefaults
> > for fsdax pmem pre-removal; and freezing fses during suspend) besides
> > online fsck for xfs.
> > 
> > --D
> > 
> > From: Darrick J. Wong <djwong@kernel.org>
> > Subject: fs: distinguish between user initiated freeze and kernel initiated freeze
> > 
> > Userspace can freeze a filesystem using the FIFREEZE ioctl or by
> > suspending the block device; this state persists until userspace thaws
> > the filesystem with the FITHAW ioctl or resuming the block device.
> > Since commit 18e9e5104fcd ("Introduce freeze_super and thaw_super for
> > the fsfreeze ioctl") we only allow the first freeze command to succeed.
> > 
> > The kernel may decide that it is necessary to freeze a filesystem for
> > its own internal purposes, such as suspends in progress, filesystem fsck
> > activities, or quiescing a device prior to removal.  Userspace thaw
> > commands must never break a kernel freeze, and kernel thaw commands
> > shouldn't undo userspace's freeze command.
> > 
> > Introduce a couple of freeze holder flags and wire it into the
> > sb_writers state.  One kernel and one userspace freeze are allowed to
> > coexist at the same time; the filesystem will not thaw until both are
> > lifted.
> > 
> > Inspired-by: Luis Chamberlain <mcgrof@kernel.org>
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> 
> Yes, this is exactly how I'd imagine it. Thanks for writing the patch!
> 
> I'd just note that this would need rebasing on top of Luis' patches 1 and
> 2. Also:
> 
> > +	if (sbw->frozen == SB_FREEZE_COMPLETE) {
> > +		switch (who) {
> > +		case FREEZE_HOLDER_KERNEL:
> > +			if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
> > +				/*
> > +				 * Kernel freeze already in effect; caller can
> > +				 * try again.
> > +				 */
> > +				deactivate_locked_super(sb);
> > +				return -EBUSY;
> > +			}
> > +			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
> > +				/*
> > +				 * Share the freeze state with the userspace
> > +				 * freeze already in effect.
> > +				 */
> > +				sbw->freeze_holders |= who;
> > +				deactivate_locked_super(sb);
> > +				return 0;
> > +			}
> > +			break;
> > +		case FREEZE_HOLDER_USERSPACE:
> > +			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
> > +				/*
> > +				 * Userspace freeze already in effect; tell
> > +				 * the caller we're busy.
> > +				 */
> > +				deactivate_locked_super(sb);
> > +				return -EBUSY;
> > +			}
> > +			if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
> > +				/*
> > +				 * Share the freeze state with the kernel
> > +				 * freeze already in effect.
> > +				 */
> > +				sbw->freeze_holders |= who;
> > +				deactivate_locked_super(sb);
> > +				return 0;
> > +			}
> > +			break;
> > +		default:
> > +			BUG();
> > +			deactivate_locked_super(sb);
> > +			return -EINVAL;
> > +		}
> > +	}
> 
> Can't this be simplified to:
> 
> 	BUG_ON(who & ~(FREEZE_HOLDER_USERSPACE | FREEZE_HOLDER_KERNEL));
> 	BUG_ON(!(!(who & FREEZE_HOLDER_USERSPACE) ^
> 	       !(who & FREEZE_HOLDER_KERNEL)));
> retry:
> 	if (sb->s_writers.freeze_holders & who)
> 		return -EBUSY;
> 	/* Already frozen by someone else? */
> 	if (sb->s_writers.freeze_holders & ~who) {
> 		sb->s_writers.freeze_holders |= who;
> 		return 0;
> 	}
> 
> Now the only remaining issue with the code is that the two different
> holders can be attempting to freeze the filesystem at once and in that case
> one of them has to wait for the other one instead of returning -EBUSY as
> would happen currently. This can happen because we temporarily drop
> s_umount in freeze_super() due to lock ordering issues. I think we could
> do something like:
> 
> 	if (!sb_unfrozen(sb)) {
> 		up_write(&sb->s_umount);
> 		wait_var_event(&sb->s_writers.frozen,
> 			       sb_unfrozen(sb) || sb_frozen(sb));
> 		down_write(&sb->s_umount);
> 		goto retry;
> 	}
> 
> and then sprinkle wake_up_var(&sb->s_writers.frozen) at appropriate places
> in freeze_super().

If we implemented this behavior change, it ought to be a separate patch.

For the case where the kernel is freezing the fs and userspace wants to
start freezing the fs, we could make userspace wait and then share the
kernel freeze.

For any case where the fs is !unfrozen and the kernel wants to start
freezing the fs, I think I'd rather return EBUSY immediately and let the
caller decide to wait and/or call back.

For the case where one userspace thread is freezing the fs and another
userspace thread wants to start freezing the fs, I think the current
behavior of returning EBUSY immediately is ok.

--D

> BTW, when reading this code, I've spotted attached cleanup opportunity but
> I'll queue that separately so that is JFYI.
> 
> > +#define FREEZE_HOLDER_USERSPACE	(1U << 1)	/* userspace froze fs */
> > +#define FREEZE_HOLDER_KERNEL	(1U << 2)	/* kernel froze fs */
> 
> Why not start from 1U << 0? And bonus points for using BIT() macro :).
> 
> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR

> From 9fce35f21f9a62470e764463c84373fb013108fd Mon Sep 17 00:00:00 2001
> From: Jan Kara <jack@suse.cz>
> Date: Thu, 25 May 2023 15:56:19 +0200
> Subject: [PATCH] fs: Drop wait_unfrozen wait queue
> 
> wait_unfrozen waitqueue is used only in quota code to wait for
> filesystem to become unfrozen. In that place we can just use
> sb_start_write() - sb_end_write() pair to achieve the same. So just
> remove the waitqueue.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/quota/quota.c   | 5 +++--
>  fs/super.c         | 4 ----
>  include/linux/fs.h | 1 -
>  3 files changed, 3 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/quota/quota.c b/fs/quota/quota.c
> index 052f143e2e0e..0e41fb84060f 100644
> --- a/fs/quota/quota.c
> +++ b/fs/quota/quota.c
> @@ -895,8 +895,9 @@ static struct super_block *quotactl_block(const char __user *special, int cmd)
>  			up_write(&sb->s_umount);
>  		else
>  			up_read(&sb->s_umount);
> -		wait_event(sb->s_writers.wait_unfrozen,
> -			   sb->s_writers.frozen == SB_UNFROZEN);
> +		/* Wait for sb to unfreeze */
> +		sb_start_write(sb);
> +		sb_end_write(sb);
>  		put_super(sb);
>  		goto retry;
>  	}
> diff --git a/fs/super.c b/fs/super.c
> index 34afe411cf2b..6283cea67280 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -236,7 +236,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
>  					&type->s_writers_key[i]))
>  			goto fail;
>  	}
> -	init_waitqueue_head(&s->s_writers.wait_unfrozen);
>  	s->s_bdi = &noop_backing_dev_info;
>  	s->s_flags = flags;
>  	if (s->s_user_ns != &init_user_ns)
> @@ -1706,7 +1705,6 @@ int freeze_super(struct super_block *sb)
>  	if (ret) {
>  		sb->s_writers.frozen = SB_UNFROZEN;
>  		sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT);
> -		wake_up(&sb->s_writers.wait_unfrozen);
>  		deactivate_locked_super(sb);
>  		return ret;
>  	}
> @@ -1722,7 +1720,6 @@ int freeze_super(struct super_block *sb)
>  				"VFS:Filesystem freeze failed\n");
>  			sb->s_writers.frozen = SB_UNFROZEN;
>  			sb_freeze_unlock(sb, SB_FREEZE_FS);
> -			wake_up(&sb->s_writers.wait_unfrozen);
>  			deactivate_locked_super(sb);
>  			return ret;
>  		}
> @@ -1768,7 +1765,6 @@ static int thaw_super_locked(struct super_block *sb)
>  	sb->s_writers.frozen = SB_UNFROZEN;
>  	sb_freeze_unlock(sb, SB_FREEZE_FS);
>  out:
> -	wake_up(&sb->s_writers.wait_unfrozen);
>  	deactivate_locked_super(sb);
>  	return 0;
>  }
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 21a981680856..3b65a6194485 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1146,7 +1146,6 @@ enum {
>  
>  struct sb_writers {
>  	int				frozen;		/* Is sb frozen? */
> -	wait_queue_head_t		wait_unfrozen;	/* wait for thaw */
>  	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
>  };
>  
> -- 
> 2.35.3
>
  
Jan Kara June 7, 2023, 8:46 p.m. UTC | #8
On Wed 07-06-23 09:31:10, Darrick J. Wong wrote:
> On Thu, May 25, 2023 at 04:14:30PM +0200, Jan Kara wrote:
> > On Mon 22-05-23 16:42:00, Darrick J. Wong wrote:
> > > How about this as an alternative patch?  Kernel and userspace freeze
> > > state are stored in s_writers; each type cannot block the other (though
> > > you still can't have nested kernel or userspace freezes); and the freeze
> > > is maintained until /both/ freeze types are dropped.
> > > 
> > > AFAICT this should work for the two other usecases (quiescing pagefaults
> > > for fsdax pmem pre-removal; and freezing fses during suspend) besides
> > > online fsck for xfs.
> > > 
> > > --D
> > > 
> > > From: Darrick J. Wong <djwong@kernel.org>
> > > Subject: fs: distinguish between user initiated freeze and kernel initiated freeze
> > > 
> > > Userspace can freeze a filesystem using the FIFREEZE ioctl or by
> > > suspending the block device; this state persists until userspace thaws
> > > the filesystem with the FITHAW ioctl or resuming the block device.
> > > Since commit 18e9e5104fcd ("Introduce freeze_super and thaw_super for
> > > the fsfreeze ioctl") we only allow the first freeze command to succeed.
> > > 
> > > The kernel may decide that it is necessary to freeze a filesystem for
> > > its own internal purposes, such as suspends in progress, filesystem fsck
> > > activities, or quiescing a device prior to removal.  Userspace thaw
> > > commands must never break a kernel freeze, and kernel thaw commands
> > > shouldn't undo userspace's freeze command.
> > > 
> > > Introduce a couple of freeze holder flags and wire it into the
> > > sb_writers state.  One kernel and one userspace freeze are allowed to
> > > coexist at the same time; the filesystem will not thaw until both are
> > > lifted.
> > > 
> > > Inspired-by: Luis Chamberlain <mcgrof@kernel.org>
> > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > 
> > Yes, this is exactly how I'd imagine it. Thanks for writing the patch!
> > 
> > I'd just note that this would need rebasing on top of Luis' patches 1 and
> > 2. Also:
> > 
> > > +	if (sbw->frozen == SB_FREEZE_COMPLETE) {
> > > +		switch (who) {
> > > +		case FREEZE_HOLDER_KERNEL:
> > > +			if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
> > > +				/*
> > > +				 * Kernel freeze already in effect; caller can
> > > +				 * try again.
> > > +				 */
> > > +				deactivate_locked_super(sb);
> > > +				return -EBUSY;
> > > +			}
> > > +			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
> > > +				/*
> > > +				 * Share the freeze state with the userspace
> > > +				 * freeze already in effect.
> > > +				 */
> > > +				sbw->freeze_holders |= who;
> > > +				deactivate_locked_super(sb);
> > > +				return 0;
> > > +			}
> > > +			break;
> > > +		case FREEZE_HOLDER_USERSPACE:
> > > +			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
> > > +				/*
> > > +				 * Userspace freeze already in effect; tell
> > > +				 * the caller we're busy.
> > > +				 */
> > > +				deactivate_locked_super(sb);
> > > +				return -EBUSY;
> > > +			}
> > > +			if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
> > > +				/*
> > > +				 * Share the freeze state with the kernel
> > > +				 * freeze already in effect.
> > > +				 */
> > > +				sbw->freeze_holders |= who;
> > > +				deactivate_locked_super(sb);
> > > +				return 0;
> > > +			}
> > > +			break;
> > > +		default:
> > > +			BUG();
> > > +			deactivate_locked_super(sb);
> > > +			return -EINVAL;
> > > +		}
> > > +	}
> > 
> > Can't this be simplified to:
> > 
> > 	BUG_ON(who & ~(FREEZE_HOLDER_USERSPACE | FREEZE_HOLDER_KERNEL));
> > 	BUG_ON(!(!(who & FREEZE_HOLDER_USERSPACE) ^
> > 	       !(who & FREEZE_HOLDER_KERNEL)));
> > retry:
> > 	if (sb->s_writers.freeze_holders & who)
> > 		return -EBUSY;
> > 	/* Already frozen by someone else? */
> > 	if (sb->s_writers.freeze_holders & ~who) {
> > 		sb->s_writers.freeze_holders |= who;
> > 		return 0;
> > 	}
> > 
> > Now the only remaining issue with the code is that the two different
> > holders can be attempting to freeze the filesystem at once and in that case
> > one of them has to wait for the other one instead of returning -EBUSY as
> > would happen currently. This can happen because we temporarily drop
> > s_umount in freeze_super() due to lock ordering issues. I think we could
> > do something like:
> > 
> > 	if (!sb_unfrozen(sb)) {
> > 		up_write(&sb->s_umount);
> > 		wait_var_event(&sb->s_writers.frozen,
> > 			       sb_unfrozen(sb) || sb_frozen(sb));
> > 		down_write(&sb->s_umount);
> > 		goto retry;
> > 	}
> > 
> > and then sprinkle wake_up_var(&sb->s_writers.frozen) at appropriate places
> > in freeze_super().
> 
> If we implemented this behavior change, it ought to be a separate patch.
> 
> For the case where the kernel is freezing the fs and userspace wants to
> start freezing the fs, we could make userspace wait and then share the
> kernel freeze.

Yes.

> For any case where the fs is !unfrozen and the kernel wants to start
> freezing the fs, I think I'd rather return EBUSY immediately and let the
> caller decide to wait and/or call back.

Possibly, although I thought that if userspace has frozen the fs and kernel
wants to freeze, we want to return success? At least that was what I think
your patches were doing. And then I don't see the point why we should be
returning EBUSY if userspace is in the middle of the freeze. So what's the
intended semantics?
 
> For the case where one userspace thread is freezing the fs and another
> userspace thread wants to start freezing the fs, I think the current
> behavior of returning EBUSY immediately is ok.

Yes.

								Honza
  
Christoph Hellwig June 8, 2023, 5:24 a.m. UTC | #9
On Mon, May 22, 2023 at 04:42:00PM -0700, Darrick J. Wong wrote:
> How about this as an alternative patch?  Kernel and userspace freeze
> state are stored in s_writers; each type cannot block the other (though
> you still can't have nested kernel or userspace freezes); and the freeze
> is maintained until /both/ freeze types are dropped.
> 
> AFAICT this should work for the two other usecases (quiescing pagefaults
> for fsdax pmem pre-removal; and freezing fses during suspend) besides
> online fsck for xfs.

I think this is fundamentally the right thing.  Can you send this as
a standalone thread in a separate thread to make it sure it gets
expedited?

> -static int thaw_super_locked(struct super_block *sb);
> +static int thaw_super_locked(struct super_block *sb, unsigned short who);

Is the unsigned short really worth it?  Even if it's just two values
I'd also go for a __bitwise type to get the typechecking as that tends
to help a lot goind down the road.

>  /**
> - * freeze_super - lock the filesystem and force it into a consistent state
> + * __freeze_super - lock the filesystem and force it into a consistent state
>   * @sb: the super to lock
> + * @who: FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs;
> + * FREEZE_HOLDER_KERNEL if the kernel wants to freeze it
>   *
>   * Syncs the super to make sure the filesystem is consistent and calls the fs's
> - * freeze_fs.  Subsequent calls to this without first thawing the fs will return
> + * freeze_fs.  Subsequent calls to this without first thawing the fs may return
>   * -EBUSY.
>   *
> + * The @who argument distinguishes between the kernel and userspace trying to
> + * freeze the filesystem.  Although there cannot be multiple kernel freezes or
> + * multiple userspace freezes in effect at any given time, the kernel and
> + * userspace can both hold a filesystem frozen.  The filesystem remains frozen
> + * until there are no kernel or userspace freezes in effect.
> + *
>   * During this function, sb->s_writers.frozen goes through these values:
>   *
>   * SB_UNFROZEN: File system is normal, all writes progress as usual.
> @@ -1668,12 +1676,61 @@ static void sb_freeze_unlock(struct super_block *sb, int level)
>   *
>   * sb->s_writers.frozen is protected by sb->s_umount.
>   */

There's really no point in having a kerneldoc for a static function.
Either this moves to the actual exported functions, or it should
become a normal non-kerneldoc comment.  But I'm not even sre this split
makes much sense to start with.  I'd just add a the who argument
to freeze_super given that we have only very few callers anyway,
and it is way easier to follow than thse rappers hardoding the argument.

> +static int __freeze_super(struct super_block *sb, unsigned short who)
>  {
> +	struct sb_writers *sbw = &sb->s_writers;
>  	int ret;
>  
>  	atomic_inc(&sb->s_active);
>  	down_write(&sb->s_umount);
> +
> +	if (sbw->frozen == SB_FREEZE_COMPLETE) {
> +		switch (who) {

Nit, but maybe split evetything inside this branch into a
freeze_frozen_super helper?

> +static int thaw_super_locked(struct super_block *sb, unsigned short who)
> +{
> +	struct sb_writers *sbw = &sb->s_writers;
>  	int error;
>  
> +	if (sbw->frozen == SB_FREEZE_COMPLETE) {
> +		switch (who) {
> +		case FREEZE_HOLDER_KERNEL:
> +			if (!(sbw->freeze_holders & FREEZE_HOLDER_KERNEL)) {
> +				/* Caller doesn't hold a kernel freeze. */
> +				up_write(&sb->s_umount);
> +				return -EINVAL;
> +			}
> +			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
> +				/*
> +				 * We were sharing the freeze with userspace,
> +				 * so drop the userspace freeze but exit
> +				 * without unfreezing.
> +				 */
> +				sbw->freeze_holders &= ~who;
> +				up_write(&sb->s_umount);
> +				return 0;
> +			}
> +			break;
> +		case FREEZE_HOLDER_USERSPACE:
> +			if (!(sbw->freeze_holders & FREEZE_HOLDER_USERSPACE)) {
> +				/* Caller doesn't hold a userspace freeze. */
> +				up_write(&sb->s_umount);
> +				return -EINVAL;
> +			}
> +			if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
> +				/*
> +				 * We were sharing the freeze with the kernel,
> +				 * so drop the kernel freeze but exit without
> +				 * unfreezing.
> +				 */
> +				sbw->freeze_holders &= ~who;
> +				up_write(&sb->s_umount);
> +				return 0;
> +			}
> +			break;
> +		default:
> +			BUG();
> +			up_write(&sb->s_umount);
> +			return -EINVAL;
> +		}

To me this screams for another 'is_partial_thaw' or so helper, whith
which we could doe something like:

	if (sbw->frozen != SB_FREEZE_COMPLETE) {
		ret = -EINVAL;
		goto out_unlock;
	}
	ret = is_partial_thaw(sb, who);
	if (ret) {
		if (ret == 1) {
			sbw->freeze_holders &= ~who;
			ret = 0
		}
		goto out_unlock;
	}

Btw, same comment about the wrappers as on the freeze side.
  
Christoph Hellwig June 8, 2023, 5:29 a.m. UTC | #10
On Thu, May 25, 2023 at 04:14:30PM +0200, Jan Kara wrote:
> Yes, this is exactly how I'd imagine it. Thanks for writing the patch!
> 
> I'd just note that this would need rebasing on top of Luis' patches 1 and
> 2. Also:

I'd not do that for now.  1 needs a lot more work, and 2 seems rather
questionable.

> Now the only remaining issue with the code is that the two different
> holders can be attempting to freeze the filesystem at once and in that case
> one of them has to wait for the other one instead of returning -EBUSY as
> would happen currently. This can happen because we temporarily drop
> s_umount in freeze_super() due to lock ordering issues. I think we could
> do something like:
> 
> 	if (!sb_unfrozen(sb)) {
> 		up_write(&sb->s_umount);
> 		wait_var_event(&sb->s_writers.frozen,
> 			       sb_unfrozen(sb) || sb_frozen(sb));
> 		down_write(&sb->s_umount);
> 		goto retry;
> 	}
> 
> and then sprinkle wake_up_var(&sb->s_writers.frozen) at appropriate places
> in freeze_super().

Let's do that separately as a follow on..

> 
> BTW, when reading this code, I've spotted attached cleanup opportunity but
> I'll queue that separately so that is JFYI.
> 
> > +#define FREEZE_HOLDER_USERSPACE	(1U << 1)	/* userspace froze fs */
> > +#define FREEZE_HOLDER_KERNEL	(1U << 2)	/* kernel froze fs */
> 
> Why not start from 1U << 0? And bonus points for using BIT() macro :).

BIT() is a nasty thing and actually makes code harder to read. And it
doesn't interact very well with the __bitwise annotation that might
actually be useful here.
  
Jan Kara June 8, 2023, 9:11 a.m. UTC | #11
On Wed 07-06-23 22:29:04, Christoph Hellwig wrote:
> On Thu, May 25, 2023 at 04:14:30PM +0200, Jan Kara wrote:
> > Yes, this is exactly how I'd imagine it. Thanks for writing the patch!
> > 
> > I'd just note that this would need rebasing on top of Luis' patches 1 and
> > 2. Also:
> 
> I'd not do that for now.  1 needs a lot more work, and 2 seems rather
> questionable.

OK, I agree the wrappers could be confusing (they didn't confuse me but
when you spelled it out, I agree).

> > Now the only remaining issue with the code is that the two different
> > holders can be attempting to freeze the filesystem at once and in that case
> > one of them has to wait for the other one instead of returning -EBUSY as
> > would happen currently. This can happen because we temporarily drop
> > s_umount in freeze_super() due to lock ordering issues. I think we could
> > do something like:
> > 
> > 	if (!sb_unfrozen(sb)) {
> > 		up_write(&sb->s_umount);
> > 		wait_var_event(&sb->s_writers.frozen,
> > 			       sb_unfrozen(sb) || sb_frozen(sb));
> > 		down_write(&sb->s_umount);
> > 		goto retry;
> > 	}
> > 
> > and then sprinkle wake_up_var(&sb->s_writers.frozen) at appropriate places
> > in freeze_super().
> 
> Let's do that separately as a follow on..

Well, we need to somehow settle on how to deal with a race when both kernel
& userspace race to freeze the filesystem and make the result consistent
with the situation when the fs is already frozen by someone.

> > BTW, when reading this code, I've spotted attached cleanup opportunity but
> > I'll queue that separately so that is JFYI.
> > 
> > > +#define FREEZE_HOLDER_USERSPACE	(1U << 1)	/* userspace froze fs */
> > > +#define FREEZE_HOLDER_KERNEL	(1U << 2)	/* kernel froze fs */
> > 
> > Why not start from 1U << 0? And bonus points for using BIT() macro :).
> 
> BIT() is a nasty thing and actually makes code harder to read. And it
> doesn't interact very well with the __bitwise annotation that might
> actually be useful here.

OK. I'm not too hung up on BIT() macro.

								Honza
  
Darrick J. Wong June 8, 2023, 6:15 p.m. UTC | #12
On Wed, Jun 07, 2023 at 10:24:32PM -0700, Christoph Hellwig wrote:
> On Mon, May 22, 2023 at 04:42:00PM -0700, Darrick J. Wong wrote:
> > How about this as an alternative patch?  Kernel and userspace freeze
> > state are stored in s_writers; each type cannot block the other (though
> > you still can't have nested kernel or userspace freezes); and the freeze
> > is maintained until /both/ freeze types are dropped.
> > 
> > AFAICT this should work for the two other usecases (quiescing pagefaults
> > for fsdax pmem pre-removal; and freezing fses during suspend) besides
> > online fsck for xfs.
> 
> I think this is fundamentally the right thing.  Can you send this as
> a standalone thread in a separate thread to make it sure it gets
> expedited?

Yeah, I'll do that.

> > -static int thaw_super_locked(struct super_block *sb);
> > +static int thaw_super_locked(struct super_block *sb, unsigned short who);
> 
> Is the unsigned short really worth it?  Even if it's just two values
> I'd also go for a __bitwise type to get the typechecking as that tends
> to help a lot goind down the road.

Instead of __bitwise, I'll make freeze_super() take an enum, since
callers can only initiate one at a time, and the compiler can (in
theory) catch people passing garbage inputs.

> >  /**
> > - * freeze_super - lock the filesystem and force it into a consistent state
> > + * __freeze_super - lock the filesystem and force it into a consistent state
> >   * @sb: the super to lock
> > + * @who: FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs;
> > + * FREEZE_HOLDER_KERNEL if the kernel wants to freeze it
> >   *
> >   * Syncs the super to make sure the filesystem is consistent and calls the fs's
> > - * freeze_fs.  Subsequent calls to this without first thawing the fs will return
> > + * freeze_fs.  Subsequent calls to this without first thawing the fs may return
> >   * -EBUSY.
> >   *
> > + * The @who argument distinguishes between the kernel and userspace trying to
> > + * freeze the filesystem.  Although there cannot be multiple kernel freezes or
> > + * multiple userspace freezes in effect at any given time, the kernel and
> > + * userspace can both hold a filesystem frozen.  The filesystem remains frozen
> > + * until there are no kernel or userspace freezes in effect.
> > + *
> >   * During this function, sb->s_writers.frozen goes through these values:
> >   *
> >   * SB_UNFROZEN: File system is normal, all writes progress as usual.
> > @@ -1668,12 +1676,61 @@ static void sb_freeze_unlock(struct super_block *sb, int level)
> >   *
> >   * sb->s_writers.frozen is protected by sb->s_umount.
> >   */
> 
> There's really no point in having a kerneldoc for a static function.
> Either this moves to the actual exported functions, or it should
> become a normal non-kerneldoc comment.  But I'm not even sre this split
> makes much sense to start with.  I'd just add a the who argument
> to freeze_super given that we have only very few callers anyway,
> and it is way easier to follow than thse rappers hardoding the argument.

Agreed.

> > +static int __freeze_super(struct super_block *sb, unsigned short who)
> >  {
> > +	struct sb_writers *sbw = &sb->s_writers;
> >  	int ret;
> >  
> >  	atomic_inc(&sb->s_active);
> >  	down_write(&sb->s_umount);
> > +
> > +	if (sbw->frozen == SB_FREEZE_COMPLETE) {
> > +		switch (who) {
> 
> Nit, but maybe split evetything inside this branch into a
> freeze_frozen_super helper?

Yes, will do.  I think Jan's simplification will condense this too.

> > +static int thaw_super_locked(struct super_block *sb, unsigned short who)
> > +{
> > +	struct sb_writers *sbw = &sb->s_writers;
> >  	int error;
> >  
> > +	if (sbw->frozen == SB_FREEZE_COMPLETE) {
> > +		switch (who) {
> > +		case FREEZE_HOLDER_KERNEL:
> > +			if (!(sbw->freeze_holders & FREEZE_HOLDER_KERNEL)) {
> > +				/* Caller doesn't hold a kernel freeze. */
> > +				up_write(&sb->s_umount);
> > +				return -EINVAL;
> > +			}
> > +			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
> > +				/*
> > +				 * We were sharing the freeze with userspace,
> > +				 * so drop the userspace freeze but exit
> > +				 * without unfreezing.
> > +				 */
> > +				sbw->freeze_holders &= ~who;
> > +				up_write(&sb->s_umount);
> > +				return 0;
> > +			}
> > +			break;
> > +		case FREEZE_HOLDER_USERSPACE:
> > +			if (!(sbw->freeze_holders & FREEZE_HOLDER_USERSPACE)) {
> > +				/* Caller doesn't hold a userspace freeze. */
> > +				up_write(&sb->s_umount);
> > +				return -EINVAL;
> > +			}
> > +			if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
> > +				/*
> > +				 * We were sharing the freeze with the kernel,
> > +				 * so drop the kernel freeze but exit without
> > +				 * unfreezing.
> > +				 */
> > +				sbw->freeze_holders &= ~who;
> > +				up_write(&sb->s_umount);
> > +				return 0;
> > +			}
> > +			break;
> > +		default:
> > +			BUG();
> > +			up_write(&sb->s_umount);
> > +			return -EINVAL;
> > +		}
> 
> To me this screams for another 'is_partial_thaw' or so helper, whith
> which we could doe something like:
> 
> 	if (sbw->frozen != SB_FREEZE_COMPLETE) {
> 		ret = -EINVAL;
> 		goto out_unlock;
> 	}
> 	ret = is_partial_thaw(sb, who);
> 	if (ret) {
> 		if (ret == 1) {
> 			sbw->freeze_holders &= ~who;
> 			ret = 0
> 		}
> 		goto out_unlock;
> 	}

<nod>

> Btw, same comment about the wrappers as on the freeze side.

<nod>

--D
  
Darrick J. Wong June 8, 2023, 6:16 p.m. UTC | #13
On Thu, Jun 08, 2023 at 11:11:30AM +0200, Jan Kara wrote:
> On Wed 07-06-23 22:29:04, Christoph Hellwig wrote:
> > On Thu, May 25, 2023 at 04:14:30PM +0200, Jan Kara wrote:
> > > Yes, this is exactly how I'd imagine it. Thanks for writing the patch!
> > > 
> > > I'd just note that this would need rebasing on top of Luis' patches 1 and
> > > 2. Also:
> > 
> > I'd not do that for now.  1 needs a lot more work, and 2 seems rather
> > questionable.
> 
> OK, I agree the wrappers could be confusing (they didn't confuse me but
> when you spelled it out, I agree).
> 
> > > Now the only remaining issue with the code is that the two different
> > > holders can be attempting to freeze the filesystem at once and in that case
> > > one of them has to wait for the other one instead of returning -EBUSY as
> > > would happen currently. This can happen because we temporarily drop
> > > s_umount in freeze_super() due to lock ordering issues. I think we could
> > > do something like:
> > > 
> > > 	if (!sb_unfrozen(sb)) {
> > > 		up_write(&sb->s_umount);
> > > 		wait_var_event(&sb->s_writers.frozen,
> > > 			       sb_unfrozen(sb) || sb_frozen(sb));
> > > 		down_write(&sb->s_umount);
> > > 		goto retry;
> > > 	}
> > > 
> > > and then sprinkle wake_up_var(&sb->s_writers.frozen) at appropriate places
> > > in freeze_super().
> > 
> > Let's do that separately as a follow on..
> 
> Well, we need to somehow settle on how to deal with a race when both kernel
> & userspace race to freeze the filesystem and make the result consistent
> with the situation when the fs is already frozen by someone.

<nod> I'll see what I can do about that.

> > > BTW, when reading this code, I've spotted attached cleanup opportunity but
> > > I'll queue that separately so that is JFYI.
> > > 
> > > > +#define FREEZE_HOLDER_USERSPACE	(1U << 1)	/* userspace froze fs */
> > > > +#define FREEZE_HOLDER_KERNEL	(1U << 2)	/* kernel froze fs */
> > > 
> > > Why not start from 1U << 0? And bonus points for using BIT() macro :).
> > 
> > BIT() is a nasty thing and actually makes code harder to read. And it
> > doesn't interact very well with the __bitwise annotation that might
> > actually be useful here.
> 
> OK. I'm not too hung up on BIT() macro.
> 
> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
  
Darrick J. Wong June 8, 2023, 6:58 p.m. UTC | #14
On Wed, Jun 07, 2023 at 10:46:10PM +0200, Jan Kara wrote:
> On Wed 07-06-23 09:31:10, Darrick J. Wong wrote:
> > On Thu, May 25, 2023 at 04:14:30PM +0200, Jan Kara wrote:
> > > On Mon 22-05-23 16:42:00, Darrick J. Wong wrote:
> > > > How about this as an alternative patch?  Kernel and userspace freeze
> > > > state are stored in s_writers; each type cannot block the other (though
> > > > you still can't have nested kernel or userspace freezes); and the freeze
> > > > is maintained until /both/ freeze types are dropped.
> > > > 
> > > > AFAICT this should work for the two other usecases (quiescing pagefaults
> > > > for fsdax pmem pre-removal; and freezing fses during suspend) besides
> > > > online fsck for xfs.
> > > > 
> > > > --D
> > > > 
> > > > From: Darrick J. Wong <djwong@kernel.org>
> > > > Subject: fs: distinguish between user initiated freeze and kernel initiated freeze
> > > > 
> > > > Userspace can freeze a filesystem using the FIFREEZE ioctl or by
> > > > suspending the block device; this state persists until userspace thaws
> > > > the filesystem with the FITHAW ioctl or resuming the block device.
> > > > Since commit 18e9e5104fcd ("Introduce freeze_super and thaw_super for
> > > > the fsfreeze ioctl") we only allow the first freeze command to succeed.
> > > > 
> > > > The kernel may decide that it is necessary to freeze a filesystem for
> > > > its own internal purposes, such as suspends in progress, filesystem fsck
> > > > activities, or quiescing a device prior to removal.  Userspace thaw
> > > > commands must never break a kernel freeze, and kernel thaw commands
> > > > shouldn't undo userspace's freeze command.
> > > > 
> > > > Introduce a couple of freeze holder flags and wire it into the
> > > > sb_writers state.  One kernel and one userspace freeze are allowed to
> > > > coexist at the same time; the filesystem will not thaw until both are
> > > > lifted.
> > > > 
> > > > Inspired-by: Luis Chamberlain <mcgrof@kernel.org>
> > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > 
> > > Yes, this is exactly how I'd imagine it. Thanks for writing the patch!
> > > 
> > > I'd just note that this would need rebasing on top of Luis' patches 1 and
> > > 2. Also:
> > > 
> > > > +	if (sbw->frozen == SB_FREEZE_COMPLETE) {
> > > > +		switch (who) {
> > > > +		case FREEZE_HOLDER_KERNEL:
> > > > +			if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
> > > > +				/*
> > > > +				 * Kernel freeze already in effect; caller can
> > > > +				 * try again.
> > > > +				 */
> > > > +				deactivate_locked_super(sb);
> > > > +				return -EBUSY;
> > > > +			}
> > > > +			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
> > > > +				/*
> > > > +				 * Share the freeze state with the userspace
> > > > +				 * freeze already in effect.
> > > > +				 */
> > > > +				sbw->freeze_holders |= who;
> > > > +				deactivate_locked_super(sb);
> > > > +				return 0;
> > > > +			}
> > > > +			break;
> > > > +		case FREEZE_HOLDER_USERSPACE:
> > > > +			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
> > > > +				/*
> > > > +				 * Userspace freeze already in effect; tell
> > > > +				 * the caller we're busy.
> > > > +				 */
> > > > +				deactivate_locked_super(sb);
> > > > +				return -EBUSY;
> > > > +			}
> > > > +			if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
> > > > +				/*
> > > > +				 * Share the freeze state with the kernel
> > > > +				 * freeze already in effect.
> > > > +				 */
> > > > +				sbw->freeze_holders |= who;
> > > > +				deactivate_locked_super(sb);
> > > > +				return 0;
> > > > +			}
> > > > +			break;
> > > > +		default:
> > > > +			BUG();
> > > > +			deactivate_locked_super(sb);
> > > > +			return -EINVAL;
> > > > +		}
> > > > +	}
> > > 
> > > Can't this be simplified to:
> > > 
> > > 	BUG_ON(who & ~(FREEZE_HOLDER_USERSPACE | FREEZE_HOLDER_KERNEL));
> > > 	BUG_ON(!(!(who & FREEZE_HOLDER_USERSPACE) ^
> > > 	       !(who & FREEZE_HOLDER_KERNEL)));
> > > retry:
> > > 	if (sb->s_writers.freeze_holders & who)
> > > 		return -EBUSY;
> > > 	/* Already frozen by someone else? */
> > > 	if (sb->s_writers.freeze_holders & ~who) {
> > > 		sb->s_writers.freeze_holders |= who;
> > > 		return 0;
> > > 	}
> > > 
> > > Now the only remaining issue with the code is that the two different
> > > holders can be attempting to freeze the filesystem at once and in that case
> > > one of them has to wait for the other one instead of returning -EBUSY as
> > > would happen currently. This can happen because we temporarily drop
> > > s_umount in freeze_super() due to lock ordering issues. I think we could
> > > do something like:
> > > 
> > > 	if (!sb_unfrozen(sb)) {
> > > 		up_write(&sb->s_umount);
> > > 		wait_var_event(&sb->s_writers.frozen,
> > > 			       sb_unfrozen(sb) || sb_frozen(sb));
> > > 		down_write(&sb->s_umount);
> > > 		goto retry;
> > > 	}
> > > 
> > > and then sprinkle wake_up_var(&sb->s_writers.frozen) at appropriate places
> > > in freeze_super().
> > 
> > If we implemented this behavior change, it ought to be a separate patch.
> > 
> > For the case where the kernel is freezing the fs and userspace wants to
> > start freezing the fs, we could make userspace wait and then share the
> > kernel freeze.
> 
> Yes.
> 
> > For any case where the fs is !unfrozen and the kernel wants to start
> > freezing the fs, I think I'd rather return EBUSY immediately and let the
> > caller decide to wait and/or call back.
> 
> Possibly, although I thought that if userspace has frozen the fs and kernel
> wants to freeze, we want to return success?

Yes.  Apologies, I tripped over the four-states-of-gray thing and forgot
that frozen != !unfrozen.

> At least that was what I think
> your patches were doing. And then I don't see the point why we should be
> returning EBUSY if userspace is in the middle of the freeze. So what's the
> intended semantics?

Let me try again:

"For the case where one kernel thread is freezing the fs and another
kernel thread wants to start freezing the fs, return -EBUSY immediately.

"For the case where userspace is freezing the fs and kernel wants to
start freezing the fs, return -EBUSY immediately.  Callers decide if
they want to sleep and/or retry the operation."

--D

> > For the case where one userspace thread is freezing the fs and another
> > userspace thread wants to start freezing the fs, I think the current
> > behavior of returning EBUSY immediately is ok.
> 
> Yes.
> 
> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
  
Luis Chamberlain June 8, 2023, 8:26 p.m. UTC | #15
On Mon, May 22, 2023 at 04:42:00PM -0700, Darrick J. Wong wrote:
> How about this as an alternative patch?

I'm all for it, this is low hanging fruit and I try to get back to it
as no one else does, so I'm glad someone else is looking and trying too!

Hopefully dropping patch 1 and 2 would help with this.

Comments below.

> From: Darrick J. Wong <djwong@kernel.org>
> Subject: fs: distinguish between user initiated freeze and kernel initiated freeze
> 
> Userspace can freeze a filesystem using the FIFREEZE ioctl or by
> suspending the block device; this state persists until userspace thaws
> the filesystem with the FITHAW ioctl or resuming the block device.
> Since commit 18e9e5104fcd ("Introduce freeze_super and thaw_super for
> the fsfreeze ioctl") we only allow the first freeze command to succeed.
> 
> The kernel may decide that it is necessary to freeze a filesystem for
> its own internal purposes, such as suspends in progress, filesystem fsck
> activities, or quiescing a device prior to removal.  Userspace thaw
> commands must never break a kernel freeze, and kernel thaw commands
> shouldn't undo userspace's freeze command.
> 
> Introduce a couple of freeze holder flags and wire it into the
> sb_writers state.  One kernel and one userspace freeze are allowed to
> coexist at the same time; the filesystem will not thaw until both are
> lifted.

This mix-match stuff is also important to document so we can get
userspace to understand what is allowed and we get a sense of direction
written / documented. Without this trying to navigate around this is
all implied. We may need to adjust things with time for thing we may
not have considered.

> -int freeze_super(struct super_block *sb)
> +static int __freeze_super(struct super_block *sb, unsigned short who)
>  {
> +	struct sb_writers *sbw = &sb->s_writers;
>  	int ret;
>  
>  	atomic_inc(&sb->s_active);
>  	down_write(&sb->s_umount);
> +
> +	if (sbw->frozen == SB_FREEZE_COMPLETE) {
> +		switch (who) {

<-- snip -->

> +		case FREEZE_HOLDER_USERSPACE:
> +			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
> +				/*
> +				 * Userspace freeze already in effect; tell
> +				 * the caller we're busy.
> +				 */
> +				deactivate_locked_super(sb);
> +				return -EBUSY;

I'm thinking some userspace might find this OK so thought maybe
something like -EALREADY would be better, to then allow userspace
to decide, however, since userspace would not control the thaw it
seems like risky business to support that.

Anyway, I'm all for any alternative!

  Luis
  
Luis Chamberlain June 8, 2023, 8:30 p.m. UTC | #16
On Tue, Jun 06, 2023 at 10:19:56AM -0700, Darrick J. Wong wrote:
> Luis hasn't responded to my question, so I stopped.

Sorry for the delay, by all means alternatives are hugely
welcomed. I just worked on it as it was back logged work and
not a high priority thing for me, so I try to get to it when I
can. Having someone with more immediate needs give it a stab
is hugely appreciated!

  Luis
  
Darrick J. Wong June 8, 2023, 9:10 p.m. UTC | #17
On Thu, Jun 08, 2023 at 01:26:19PM -0700, Luis Chamberlain wrote:
> On Mon, May 22, 2023 at 04:42:00PM -0700, Darrick J. Wong wrote:
> > How about this as an alternative patch?
> 
> I'm all for it, this is low hanging fruit and I try to get back to it
> as no one else does, so I'm glad someone else is looking and trying too!
> 
> Hopefully dropping patch 1 and 2 would help with this.
> 
> Comments below.
> 
> > From: Darrick J. Wong <djwong@kernel.org>
> > Subject: fs: distinguish between user initiated freeze and kernel initiated freeze
> > 
> > Userspace can freeze a filesystem using the FIFREEZE ioctl or by
> > suspending the block device; this state persists until userspace thaws
> > the filesystem with the FITHAW ioctl or resuming the block device.
> > Since commit 18e9e5104fcd ("Introduce freeze_super and thaw_super for
> > the fsfreeze ioctl") we only allow the first freeze command to succeed.
> > 
> > The kernel may decide that it is necessary to freeze a filesystem for
> > its own internal purposes, such as suspends in progress, filesystem fsck
> > activities, or quiescing a device prior to removal.  Userspace thaw
> > commands must never break a kernel freeze, and kernel thaw commands
> > shouldn't undo userspace's freeze command.
> > 
> > Introduce a couple of freeze holder flags and wire it into the
> > sb_writers state.  One kernel and one userspace freeze are allowed to
> > coexist at the same time; the filesystem will not thaw until both are
> > lifted.
> 
> This mix-match stuff is also important to document so we can get
> userspace to understand what is allowed and we get a sense of direction
> written / documented. Without this trying to navigate around this is
> all implied. We may need to adjust things with time for thing we may
> not have considered.

That's captured in the kernledoc for freeze_super, which is no longer
getting cut up into __freeze_super here.

> > -int freeze_super(struct super_block *sb)
> > +static int __freeze_super(struct super_block *sb, unsigned short who)
> >  {
> > +	struct sb_writers *sbw = &sb->s_writers;
> >  	int ret;
> >  
> >  	atomic_inc(&sb->s_active);
> >  	down_write(&sb->s_umount);
> > +
> > +	if (sbw->frozen == SB_FREEZE_COMPLETE) {
> > +		switch (who) {
> 
> <-- snip -->
> 
> > +		case FREEZE_HOLDER_USERSPACE:
> > +			if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
> > +				/*
> > +				 * Userspace freeze already in effect; tell
> > +				 * the caller we're busy.
> > +				 */
> > +				deactivate_locked_super(sb);
> > +				return -EBUSY;
> 
> I'm thinking some userspace might find this OK so thought maybe
> something like -EALREADY would be better, to then allow userspace
> to decide, however, since userspace would not control the thaw it
> seems like risky business to support that.

It already has to, since we've been returning EBUSY for "fs already
frozen or being frozen" for years.

--D

> Anyway, I'm all for any alternative!
> 
>   Luis
  

Patch

diff --git a/block/bdev.c b/block/bdev.c
index dc54a2a1c46e..04f7b2c99845 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -250,7 +250,7 @@  int freeze_bdev(struct block_device *bdev)
 	if (sb->s_op->freeze_super)
 		error = sb->s_op->freeze_super(sb);
 	else
-		error = freeze_super(sb);
+		error = freeze_super(sb, true);
 	deactivate_locked_super(sb);
 
 	if (error) {
@@ -295,7 +295,7 @@  int thaw_bdev(struct block_device *bdev)
 	if (sb->s_op->thaw_super)
 		error = sb->s_op->thaw_super(sb);
 	else
-		error = thaw_super(sb);
+		error = thaw_super(sb, true);
 	if (error)
 		bdev->bd_fsfreeze_count++;
 	else
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index e31d6791d3e3..a5891055d85d 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -2168,7 +2168,7 @@  int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
 
 	if (!get_active_super(sbi->sb->s_bdev))
 		return -ENOTTY;
-	freeze_super(sbi->sb);
+	freeze_super(sbi->sb, true);
 
 	f2fs_down_write(&sbi->gc_lock);
 	f2fs_down_write(&sbi->cp_global_sem);
@@ -2221,7 +2221,7 @@  int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
 	f2fs_up_write(&sbi->cp_global_sem);
 	f2fs_up_write(&sbi->gc_lock);
 	/* We use the same active reference from freeze */
-	thaw_super(sbi->sb);
+	thaw_super(sbi->sb, true);
 	deactivate_locked_super(sbi->sb);
 	return err;
 }
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 01d433ed6ce7..8fd37508f9a0 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -584,7 +584,7 @@  static int freeze_go_sync(struct gfs2_glock *gl)
 	if (gl->gl_state == LM_ST_SHARED && !gfs2_withdrawn(sdp) &&
 	    !test_bit(SDF_NORECOVERY, &sdp->sd_flags)) {
 		atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE);
-		error = freeze_super(sdp->sd_vfs);
+		error = freeze_super(sdp->sd_vfs, true);
 		if (error) {
 			fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n",
 				error);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index e57cb593e2f3..f2641891de43 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -687,7 +687,7 @@  void gfs2_freeze_func(struct work_struct *work)
 		gfs2_assert_withdraw(sdp, 0);
 	} else {
 		atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
-		error = thaw_super(sb);
+		error = thaw_super(sb, true);
 		if (error) {
 			fs_info(sdp, "GFS2: couldn't thaw filesystem: %d\n",
 				error);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index e80c827acd09..9e0398f99674 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -169,10 +169,10 @@  static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 
 	switch (n) {
 	case 0:
-		error = thaw_super(sdp->sd_vfs);
+		error = thaw_super(sdp->sd_vfs, true);
 		break;
 	case 1:
-		error = freeze_super(sdp->sd_vfs);
+		error = freeze_super(sdp->sd_vfs, true);
 		break;
 	default:
 		deactivate_locked_super(sb);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 3a0cd5e9ad84..be9705d618ec 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -191,7 +191,7 @@  static void signal_our_withdraw(struct gfs2_sbd *sdp)
 		/* Make sure gfs2_unfreeze works if partially-frozen */
 		flush_work(&sdp->sd_freeze_work);
 		atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
-		thaw_super(sdp->sd_vfs);
+		thaw_super(sdp->sd_vfs, true);
 	} else {
 		wait_on_bit(&i_gl->gl_flags, GLF_DEMOTE,
 			    TASK_UNINTERRUPTIBLE);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 1d20af762e0d..3cc79b82a5dc 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -401,7 +401,7 @@  static int ioctl_fsfreeze(struct file *filp)
 	/* Freeze */
 	if (sb->s_op->freeze_super)
 		ret = sb->s_op->freeze_super(sb);
-	ret = freeze_super(sb);
+	ret = freeze_super(sb, true);
 
 	deactivate_locked_super(sb);
 
@@ -418,7 +418,7 @@  static int ioctl_fsthaw(struct file *filp)
 	/* Thaw */
 	if (sb->s_op->thaw_super)
 		return sb->s_op->thaw_super(sb);
-	return thaw_super(sb);
+	return thaw_super(sb, true);
 }
 
 static int ioctl_file_dedupe_range(struct file *file,
diff --git a/fs/super.c b/fs/super.c
index 46c6475fc765..16ccbb9dd230 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1026,7 +1026,7 @@  static void do_thaw_all_callback(struct super_block *sb)
 		return;
 	if (sb->s_root && sb->s_flags & SB_BORN) {
 		emergency_thaw_bdev(sb);
-		thaw_super(sb);
+		thaw_super(sb, true);
 	}
 	deactivate_locked_super(sb);
 }
@@ -1636,6 +1636,8 @@  static void sb_freeze_unlock(struct super_block *sb, int level)
 /**
  * freeze_super - force a filesystem backed by a block device into a consistent state
  * @sb: the super to lock
+ * @usercall: whether or not userspace initiated this via an ioctl or if it
+ * 	was a kernel freeze
  *
  * Used by filesystems and the kernel to freeze a fileystem backed by a block
  * device into a consistent state. Callers must use get_active_super(bdev) to
@@ -1669,10 +1671,13 @@  static void sb_freeze_unlock(struct super_block *sb, int level)
  *
  * sb->s_writers.frozen is protected by sb->s_umount.
  */
-int freeze_super(struct super_block *sb)
+int freeze_super(struct super_block *sb, bool usercall)
 {
 	int ret;
 
+	if (!usercall && sb_is_frozen(sb))
+		return 0;
+
 	if (!sb_is_unfrozen(sb))
 		return -EBUSY;
 
@@ -1682,6 +1687,7 @@  int freeze_super(struct super_block *sb)
 	if (sb_rdonly(sb)) {
 		/* Nothing to do really... */
 		sb->s_writers.frozen = SB_FREEZE_COMPLETE;
+		sb->s_writers.frozen_by_user = usercall;
 		return 0;
 	}
 
@@ -1699,6 +1705,7 @@  int freeze_super(struct super_block *sb)
 	ret = sync_filesystem(sb);
 	if (ret) {
 		sb->s_writers.frozen = SB_UNFROZEN;
+		sb->s_writers.frozen_by_user = false;
 		sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT);
 		wake_up(&sb->s_writers.wait_unfrozen);
 		return ret;
@@ -1724,6 +1731,7 @@  int freeze_super(struct super_block *sb)
 	 * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
 	 */
 	sb->s_writers.frozen = SB_FREEZE_COMPLETE;
+	sb->s_writers.frozen_by_user = usercall;
 	lockdep_sb_freeze_release(sb);
 	return 0;
 }
@@ -1732,21 +1740,33 @@  EXPORT_SYMBOL(freeze_super);
 /**
  * thaw_super -- unlock a filesystem backed by a block device
  * @sb: the super to thaw
+ * @usercall: whether or not userspace initiated this thaw or if it was the
+ * 	kernel which initiated it
  *
  * Used by filesystems and the kernel to thaw a fileystem backed by a block
  * device. Callers must use get_active_super(bdev) to lock the @sb and when
  * done must unlock it with deactivate_locked_super(). Once done, this marks
  * the filesystem as writeable.
  */
-int thaw_super(struct super_block *sb)
+int thaw_super(struct super_block *sb, bool usercall)
 {
 	int error;
 
-	if (sb->s_writers.frozen != SB_FREEZE_COMPLETE)
+	if (!usercall) {
+		/*
+		 * If userspace initiated the freeze don't let the kernel
+		 * thaw it on return from a kernel initiated freeze.
+		 */
+		if (sb_is_unfrozen(sb) || sb_is_frozen_by_user(sb))
+			return 0;
+	}
+
+	if (!sb_is_frozen(sb))
 		return -EINVAL;
 
 	if (sb_rdonly(sb)) {
 		sb->s_writers.frozen = SB_UNFROZEN;
+		sb->s_writers.frozen_by_user = false;
 		goto out;
 	}
 
@@ -1763,6 +1783,7 @@  int thaw_super(struct super_block *sb)
 	}
 
 	sb->s_writers.frozen = SB_UNFROZEN;
+	sb->s_writers.frozen_by_user = false;
 	sb_freeze_unlock(sb, SB_FREEZE_FS);
 out:
 	wake_up(&sb->s_writers.wait_unfrozen);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 90b5bdc4071a..d9b46c858103 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1146,6 +1146,7 @@  enum {
 
 struct sb_writers {
 	int				frozen;		/* Is sb frozen? */
+	bool				frozen_by_user;	/* User freeze? */
 	wait_queue_head_t		wait_unfrozen;	/* wait for thaw */
 	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
 };
@@ -1632,6 +1633,17 @@  static inline bool sb_is_frozen(struct super_block *sb)
 	return sb->s_writers.frozen == SB_FREEZE_COMPLETE;
 }
 
+/**
+ * sb_is_frozen_by_user - was the superblock frozen by userspace?
+ * @sb: the super to check
+ *
+ * Returns true if the super is frozen by userspace, such as an ioctl.
+ */
+static inline bool sb_is_frozen_by_user(struct super_block *sb)
+{
+	return sb_is_frozen(sb) && sb->s_writers.frozen_by_user;
+}
+
 /**
  * sb_is_unfrozen - is superblock unfrozen
  * @sb: the super to check
@@ -2308,8 +2320,8 @@  extern int unregister_filesystem(struct file_system_type *);
 extern int vfs_statfs(const struct path *, struct kstatfs *);
 extern int user_statfs(const char __user *, struct kstatfs *);
 extern int fd_statfs(int, struct kstatfs *);
-extern int freeze_super(struct super_block *super);
-extern int thaw_super(struct super_block *super);
+extern int freeze_super(struct super_block *super, bool usercall);
+extern int thaw_super(struct super_block *super, bool usercall);
 extern __printf(2, 3)
 int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
 extern int super_setup_bdi(struct super_block *sb);