[-v4,5/9] migrate_pages: batch _unmap and _move

Message ID 20230206063313.635011-6-ying.huang@intel.com
State New
Headers
Series migrate_pages(): batch TLB flushing |

Commit Message

Huang, Ying Feb. 6, 2023, 6:33 a.m. UTC
  In this patch the _unmap and _move stage of the folio migration is
batched.  That for, previously, it is,

  for each folio
    _unmap()
    _move()

Now, it is,

  for each folio
    _unmap()
  for each folio
    _move()

Based on this, we can batch the TLB flushing and use some hardware
accelerator to copy folios between batched _unmap and batched _move
stages.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Bharata B Rao <bharata@amd.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: haoxin <xhao@linux.alibaba.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
---
 mm/migrate.c | 208 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 184 insertions(+), 24 deletions(-)
  

Comments

Zi Yan Feb. 6, 2023, 4:10 p.m. UTC | #1
On 6 Feb 2023, at 1:33, Huang Ying wrote:

> In this patch the _unmap and _move stage of the folio migration is
> batched.  That for, previously, it is,
>
>   for each folio
>     _unmap()
>     _move()
>
> Now, it is,
>
>   for each folio
>     _unmap()
>   for each folio
>     _move()
>
> Based on this, we can batch the TLB flushing and use some hardware
> accelerator to copy folios between batched _unmap and batched _move
> stages.
>
> Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
> Cc: Zi Yan <ziy@nvidia.com>
> Cc: Yang Shi <shy828301@gmail.com>
> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Bharata B Rao <bharata@amd.com>
> Cc: Alistair Popple <apopple@nvidia.com>
> Cc: haoxin <xhao@linux.alibaba.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  mm/migrate.c | 208 +++++++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 184 insertions(+), 24 deletions(-)
>
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 0428449149f4..fa7212330cb6 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1033,6 +1033,33 @@ static void __migrate_folio_extract(struct folio *dst,
>  	dst->private = NULL;
>  }
>
> +/* Restore the source folio to the original state upon failure */
> +static void migrate_folio_undo_src(struct folio *src,
> +				   int page_was_mapped,
> +				   struct anon_vma *anon_vma,
> +				   struct list_head *ret)
> +{
> +	if (page_was_mapped)
> +		remove_migration_ptes(src, src, false);
> +	/* Drop an anon_vma reference if we took one */
> +	if (anon_vma)
> +		put_anon_vma(anon_vma);
> +	folio_unlock(src);
> +	list_move_tail(&src->lru, ret);
> +}
> +
> +/* Restore the destination folio to the original state upon failure */
> +static void migrate_folio_undo_dst(struct folio *dst,
> +				   free_page_t put_new_page,
> +				   unsigned long private)
> +{
> +	folio_unlock(dst);
> +	if (put_new_page)
> +		put_new_page(&dst->page, private);
> +	else
> +		folio_put(dst);
> +}
> +
>  /* Cleanup src folio upon migration success */
>  static void migrate_folio_done(struct folio *src,
>  			       enum migrate_reason reason)
> @@ -1052,7 +1079,7 @@ static void migrate_folio_done(struct folio *src,
>  }
>
>  static int __migrate_folio_unmap(struct folio *src, struct folio *dst,
> -				int force, enum migrate_mode mode)
> +				 int force, bool force_lock, enum migrate_mode mode)
>  {
>  	int rc = -EAGAIN;
>  	int page_was_mapped = 0;
> @@ -1079,6 +1106,17 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst,
>  		if (current->flags & PF_MEMALLOC)
>  			goto out;
>
> +		/*
> +		 * We have locked some folios, to avoid deadlock, we cannot
> +		 * lock the folio synchronously.  Go out to process (and
> +		 * unlock) all the locked folios.  Then we can lock the folio
> +		 * synchronously.
> +		 */
The comment alone is quite confusing and the variable might be better
renamed to avoid_force_lock, since there is a force variable to force
lock folio already. And the variable intends to discourage force lock
on a folio to avoid potential deadlock.

How about? Since "lock synchronously" might not be as straightforward
as wait to lock.

/*
 * We have locked some folios and are going to wait to lock this folio.
 * To avoid a potential deadlock, let's bail out and not do that. The
 * locked folios will be moved and unlocked, then we can wait to lock
 * this folio
 */

> +		if (!force_lock) {
> +			rc = -EDEADLOCK;
> +			goto out;
> +		}
> +
>  		folio_lock(src);
>  	}
>
> @@ -1187,10 +1225,20 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst,
>  	int page_was_mapped = 0;
>  	struct anon_vma *anon_vma = NULL;
>  	bool is_lru = !__PageMovable(&src->page);
> +	struct list_head *prev;
>
>  	__migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
> +	prev = dst->lru.prev;
> +	list_del(&dst->lru);
>
>  	rc = move_to_new_folio(dst, src, mode);
> +
> +	if (rc == -EAGAIN) {
> +		list_add(&dst->lru, prev);
> +		__migrate_folio_record(dst, page_was_mapped, anon_vma);
> +		return rc;
> +	}
> +
>  	if (unlikely(!is_lru))
>  		goto out_unlock_both;
>
> @@ -1233,7 +1281,7 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst,
>  /* Obtain the lock on page, remove all ptes. */
>  static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page,
>  			       unsigned long private, struct folio *src,
> -			       struct folio **dstp, int force,
> +			       struct folio **dstp, int force, bool force_lock,
>  			       enum migrate_mode mode, enum migrate_reason reason,
>  			       struct list_head *ret)
>  {
> @@ -1261,7 +1309,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
>  	*dstp = dst;
>
>  	dst->private = NULL;
> -	rc = __migrate_folio_unmap(src, dst, force, mode);
> +	rc = __migrate_folio_unmap(src, dst, force, force_lock, mode);
>  	if (rc == MIGRATEPAGE_UNMAP)
>  		return rc;
>
> @@ -1270,7 +1318,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
>  	 * references and be restored.
>  	 */
>  	/* restore the folio to right list. */
> -	if (rc != -EAGAIN)
> +	if (rc != -EAGAIN && rc != -EDEADLOCK)
>  		list_move_tail(&src->lru, ret);
>
>  	if (put_new_page)
> @@ -1309,9 +1357,8 @@ static int migrate_folio_move(free_page_t put_new_page, unsigned long private,
>  	 */
>  	if (rc == MIGRATEPAGE_SUCCESS) {
>  		migrate_folio_done(src, reason);
> -	} else {
> -		if (rc != -EAGAIN)
> -			list_add_tail(&src->lru, ret);
> +	} else if (rc != -EAGAIN) {
> +		list_add_tail(&src->lru, ret);
>
>  		if (put_new_page)
>  			put_new_page(&dst->page, private);
> @@ -1591,7 +1638,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>  		enum migrate_mode mode, int reason, struct list_head *ret_folios,
>  		struct migrate_pages_stats *stats)

Like I said in my last comment to this patch, migrate_pages_batch() function
deserves a detailed comment about its working flow including the error handling.
Now you only put some in the git log, which is hard to access after several code
changes later.

How about?

/*
 * migrate_pages_batch() first unmaps pages in the from as many as possible,
 * then migrates the unmapped pages. During unmap process, different situations
 * are handled differently:
 * 1. ENOSYS, unsupported large folio migration: move to ret_folios list
 * 2. ENOMEM, lower memory at the destination: migrate existing unmapped folios
 *    and stop, since existing unmapped folios have new pages allocated and can
 *    be migrated
 * 3. EDEADLOCK, to be unmapped page is locked by someone else, to avoid deadlock,
 *    we migrate existing unmapped pages and try to lock again
 * 4. MIGRATEPAGE_SUCCESS, the folios was freed under us: no action
 * 5. MIGRATEPAGE_UNMAP, unmap succeeded: set avoid_force_lock to true to avoid
 *    wait to lock a folio in the future to avoid deadlock.
 *
 * For folios unmapped but cannot be migrated, we will restore their original
 * states during cleanup stage at the end.
 */

>  {
> -	int retry = 1;
> +	int retry;
>  	int large_retry = 1;
>  	int thp_retry = 1;
>  	int nr_failed = 0;
> @@ -1600,13 +1647,19 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>  	int pass = 0;
>  	bool is_large = false;
>  	bool is_thp = false;
> -	struct folio *folio, *folio2, *dst = NULL;
> -	int rc, nr_pages;
> +	struct folio *folio, *folio2, *dst = NULL, *dst2;
> +	int rc, rc_saved, nr_pages;
>  	LIST_HEAD(split_folios);
> +	LIST_HEAD(unmap_folios);
> +	LIST_HEAD(dst_folios);
>  	bool nosplit = (reason == MR_NUMA_MISPLACED);
>  	bool no_split_folio_counting = false;
> +	bool force_lock;
>
> -split_folio_migration:
> +retry:
> +	rc_saved = 0;
> +	force_lock = true;
> +	retry = 1;
>  	for (pass = 0;
>  	     pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
>  	     pass++) {
> @@ -1628,16 +1681,15 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>  			cond_resched();
>
>  			rc = migrate_folio_unmap(get_new_page, put_new_page, private,
> -						 folio, &dst, pass > 2, mode,
> -						 reason, ret_folios);
> -			if (rc == MIGRATEPAGE_UNMAP)
> -				rc = migrate_folio_move(put_new_page, private,
> -							folio, dst, mode,
> -							reason, ret_folios);
> +						 folio, &dst, pass > 2, force_lock,
> +						 mode, reason, ret_folios);
>  			/*
>  			 * The rules are:
>  			 *	Success: folio will be freed
> +			 *	Unmap: folio will be put on unmap_folios list,
> +			 *	       dst folio put on dst_folios list
>  			 *	-EAGAIN: stay on the from list
> +			 *	-EDEADLOCK: stay on the from list
>  			 *	-ENOMEM: stay on the from list
>  			 *	-ENOSYS: stay on the from list
>  			 *	Other errno: put on ret_folios list
> @@ -1672,7 +1724,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>  			case -ENOMEM:
>  				/*
>  				 * When memory is low, don't bother to try to migrate
> -				 * other folios, just exit.
> +				 * other folios, move unmapped folios, then exit.
>  				 */
>  				if (is_large) {
>  					nr_large_failed++;
> @@ -1711,7 +1763,19 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>  				/* nr_failed isn't updated for not used */
>  				nr_large_failed += large_retry;
>  				stats->nr_thp_failed += thp_retry;
> -				goto out;
> +				rc_saved = rc;
> +				if (list_empty(&unmap_folios))
> +					goto out;
> +				else
> +					goto move;
> +			case -EDEADLOCK:
> +				/*
> +				 * The folio cannot be locked for potential deadlock.
> +				 * Go move (and unlock) all locked folios.  Then we can
> +				 * try again.
> +				 */
> +				rc_saved = rc;
> +				goto move;
>  			case -EAGAIN:
>  				if (is_large) {
>  					large_retry++;
> @@ -1725,6 +1789,15 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>  				stats->nr_succeeded += nr_pages;
>  				stats->nr_thp_succeeded += is_thp;
>  				break;
> +			case MIGRATEPAGE_UNMAP:
> +				/*
> +				 * We have locked some folios, don't force lock
> +				 * to avoid deadlock.
> +				 */
> +				force_lock = false;
> +				list_move_tail(&folio->lru, &unmap_folios);
> +				list_add_tail(&dst->lru, &dst_folios);
> +				break;
>  			default:
>  				/*
>  				 * Permanent failure (-EBUSY, etc.):
> @@ -1748,12 +1821,95 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>  	nr_large_failed += large_retry;
>  	stats->nr_thp_failed += thp_retry;
>  	stats->nr_failed_pages += nr_retry_pages;
> +move:
> +	retry = 1;
> +	for (pass = 0;
> +	     pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
> +	     pass++) {
> +		retry = 0;
> +		large_retry = 0;
> +		thp_retry = 0;
> +		nr_retry_pages = 0;
> +
> +		dst = list_first_entry(&dst_folios, struct folio, lru);
> +		dst2 = list_next_entry(dst, lru);
> +		list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
> +			is_large = folio_test_large(folio);
> +			is_thp = is_large && folio_test_pmd_mappable(folio);
> +			nr_pages = folio_nr_pages(folio);
> +
> +			cond_resched();
> +
> +			rc = migrate_folio_move(put_new_page, private,
> +						folio, dst, mode,
> +						reason, ret_folios);
> +			/*
> +			 * The rules are:
> +			 *	Success: folio will be freed
> +			 *	-EAGAIN: stay on the unmap_folios list
> +			 *	Other errno: put on ret_folios list
> +			 */
> +			switch(rc) {
> +			case -EAGAIN:
> +				if (is_large) {
> +					large_retry++;
> +					thp_retry += is_thp;
> +				} else if (!no_split_folio_counting) {
> +					retry++;
> +				}
> +				nr_retry_pages += nr_pages;
> +				break;
> +			case MIGRATEPAGE_SUCCESS:
> +				stats->nr_succeeded += nr_pages;
> +				stats->nr_thp_succeeded += is_thp;
> +				break;
> +			default:
> +				if (is_large) {
> +					nr_large_failed++;
> +					stats->nr_thp_failed += is_thp;
> +				} else if (!no_split_folio_counting) {
> +					nr_failed++;
> +				}
> +
> +				stats->nr_failed_pages += nr_pages;
> +				break;
> +			}
> +			dst = dst2;
> +			dst2 = list_next_entry(dst, lru);
> +		}
> +	}
> +	nr_failed += retry;
> +	nr_large_failed += large_retry;
> +	stats->nr_thp_failed += thp_retry;
> +	stats->nr_failed_pages += nr_retry_pages;
> +
> +	if (rc_saved)
> +		rc = rc_saved;
> +	else
> +		rc = nr_failed + nr_large_failed;
> +out:
> +	/* Cleanup remaining folios */
> +	dst = list_first_entry(&dst_folios, struct folio, lru);
> +	dst2 = list_next_entry(dst, lru);
> +	list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
> +		int page_was_mapped = 0;
> +		struct anon_vma *anon_vma = NULL;
> +
> +		__migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
> +		migrate_folio_undo_src(folio, page_was_mapped, anon_vma,
> +				       ret_folios);
> +		list_del(&dst->lru);
> +		migrate_folio_undo_dst(dst, put_new_page, private);
> +		dst = dst2;
> +		dst2 = list_next_entry(dst, lru);
> +	}
> +
>  	/*
>  	 * Try to migrate split folios of fail-to-migrate large folios, no
>  	 * nr_failed counting in this round, since all split folios of a
>  	 * large folio is counted as 1 failure in the first round.
>  	 */
> -	if (!list_empty(&split_folios)) {
> +	if (rc >= 0 && !list_empty(&split_folios)) {
>  		/*
>  		 * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY
>  		 * retries) to ret_folios to avoid migrating them again.
> @@ -1761,12 +1917,16 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>  		list_splice_init(from, ret_folios);
>  		list_splice_init(&split_folios, from);
>  		no_split_folio_counting = true;
> -		retry = 1;
> -		goto split_folio_migration;
> +		goto retry;
>  	}
>
> -	rc = nr_failed + nr_large_failed;
> -out:
> +	/*
> +	 * We have unlocked all locked folios, so we can force lock now, let's
> +	 * try again.
> +	 */
> +	if (rc == -EDEADLOCK)
> +		goto retry;
> +
>  	return rc;
>  }
>
> -- 
> 2.35.1

After rename the variable (or give it a better name) and add the comments,
you can add Reviewed-by: Zi Yan <ziy@nvidia.com>

Thanks.

--
Best Regards,
Yan, Zi
  
Huang, Ying Feb. 7, 2023, 5:58 a.m. UTC | #2
Zi Yan <ziy@nvidia.com> writes:

> On 6 Feb 2023, at 1:33, Huang Ying wrote:
>
>> In this patch the _unmap and _move stage of the folio migration is
>> batched.  That for, previously, it is,
>>
>>   for each folio
>>     _unmap()
>>     _move()
>>
>> Now, it is,
>>
>>   for each folio
>>     _unmap()
>>   for each folio
>>     _move()
>>
>> Based on this, we can batch the TLB flushing and use some hardware
>> accelerator to copy folios between batched _unmap and batched _move
>> stages.
>>
>> Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
>> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
>> Cc: Zi Yan <ziy@nvidia.com>
>> Cc: Yang Shi <shy828301@gmail.com>
>> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
>> Cc: Oscar Salvador <osalvador@suse.de>
>> Cc: Matthew Wilcox <willy@infradead.org>
>> Cc: Bharata B Rao <bharata@amd.com>
>> Cc: Alistair Popple <apopple@nvidia.com>
>> Cc: haoxin <xhao@linux.alibaba.com>
>> Cc: Minchan Kim <minchan@kernel.org>
>> Cc: Mike Kravetz <mike.kravetz@oracle.com>
>> ---
>>  mm/migrate.c | 208 +++++++++++++++++++++++++++++++++++++++++++++------
>>  1 file changed, 184 insertions(+), 24 deletions(-)
>>
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 0428449149f4..fa7212330cb6 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -1033,6 +1033,33 @@ static void __migrate_folio_extract(struct folio *dst,
>>  	dst->private = NULL;
>>  }
>>
>> +/* Restore the source folio to the original state upon failure */
>> +static void migrate_folio_undo_src(struct folio *src,
>> +				   int page_was_mapped,
>> +				   struct anon_vma *anon_vma,
>> +				   struct list_head *ret)
>> +{
>> +	if (page_was_mapped)
>> +		remove_migration_ptes(src, src, false);
>> +	/* Drop an anon_vma reference if we took one */
>> +	if (anon_vma)
>> +		put_anon_vma(anon_vma);
>> +	folio_unlock(src);
>> +	list_move_tail(&src->lru, ret);
>> +}
>> +
>> +/* Restore the destination folio to the original state upon failure */
>> +static void migrate_folio_undo_dst(struct folio *dst,
>> +				   free_page_t put_new_page,
>> +				   unsigned long private)
>> +{
>> +	folio_unlock(dst);
>> +	if (put_new_page)
>> +		put_new_page(&dst->page, private);
>> +	else
>> +		folio_put(dst);
>> +}
>> +
>>  /* Cleanup src folio upon migration success */
>>  static void migrate_folio_done(struct folio *src,
>>  			       enum migrate_reason reason)
>> @@ -1052,7 +1079,7 @@ static void migrate_folio_done(struct folio *src,
>>  }
>>
>>  static int __migrate_folio_unmap(struct folio *src, struct folio *dst,
>> -				int force, enum migrate_mode mode)
>> +				 int force, bool force_lock, enum migrate_mode mode)
>>  {
>>  	int rc = -EAGAIN;
>>  	int page_was_mapped = 0;
>> @@ -1079,6 +1106,17 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst,
>>  		if (current->flags & PF_MEMALLOC)
>>  			goto out;
>>
>> +		/*
>> +		 * We have locked some folios, to avoid deadlock, we cannot
>> +		 * lock the folio synchronously.  Go out to process (and
>> +		 * unlock) all the locked folios.  Then we can lock the folio
>> +		 * synchronously.
>> +		 */
> The comment alone is quite confusing and the variable might be better
> renamed to avoid_force_lock, since there is a force variable to force
> lock folio already. And the variable intends to discourage force lock
> on a folio to avoid potential deadlock.

OK.  Will rename "force_lock" to "avoid_force_lock" in the next version.

> How about? Since "lock synchronously" might not be as straightforward
> as wait to lock.
>
> /*
>  * We have locked some folios and are going to wait to lock this folio.
>  * To avoid a potential deadlock, let's bail out and not do that. The
>  * locked folios will be moved and unlocked, then we can wait to lock
>  * this folio
>  */

Thanks!  It looks better.  Will use it in the next version.

Best Regards,
Huang, Ying

>> +		if (!force_lock) {
>> +			rc = -EDEADLOCK;
>> +			goto out;
>> +		}
>> +
>>  		folio_lock(src);
>>  	}
>>
>> @@ -1187,10 +1225,20 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst,
>>  	int page_was_mapped = 0;
>>  	struct anon_vma *anon_vma = NULL;
>>  	bool is_lru = !__PageMovable(&src->page);
>> +	struct list_head *prev;
>>
>>  	__migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
>> +	prev = dst->lru.prev;
>> +	list_del(&dst->lru);
>>
>>  	rc = move_to_new_folio(dst, src, mode);
>> +
>> +	if (rc == -EAGAIN) {
>> +		list_add(&dst->lru, prev);
>> +		__migrate_folio_record(dst, page_was_mapped, anon_vma);
>> +		return rc;
>> +	}
>> +
>>  	if (unlikely(!is_lru))
>>  		goto out_unlock_both;
>>
>> @@ -1233,7 +1281,7 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst,
>>  /* Obtain the lock on page, remove all ptes. */
>>  static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page,
>>  			       unsigned long private, struct folio *src,
>> -			       struct folio **dstp, int force,
>> +			       struct folio **dstp, int force, bool force_lock,
>>  			       enum migrate_mode mode, enum migrate_reason reason,
>>  			       struct list_head *ret)
>>  {
>> @@ -1261,7 +1309,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
>>  	*dstp = dst;
>>
>>  	dst->private = NULL;
>> -	rc = __migrate_folio_unmap(src, dst, force, mode);
>> +	rc = __migrate_folio_unmap(src, dst, force, force_lock, mode);
>>  	if (rc == MIGRATEPAGE_UNMAP)
>>  		return rc;
>>
>> @@ -1270,7 +1318,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
>>  	 * references and be restored.
>>  	 */
>>  	/* restore the folio to right list. */
>> -	if (rc != -EAGAIN)
>> +	if (rc != -EAGAIN && rc != -EDEADLOCK)
>>  		list_move_tail(&src->lru, ret);
>>
>>  	if (put_new_page)
>> @@ -1309,9 +1357,8 @@ static int migrate_folio_move(free_page_t put_new_page, unsigned long private,
>>  	 */
>>  	if (rc == MIGRATEPAGE_SUCCESS) {
>>  		migrate_folio_done(src, reason);
>> -	} else {
>> -		if (rc != -EAGAIN)
>> -			list_add_tail(&src->lru, ret);
>> +	} else if (rc != -EAGAIN) {
>> +		list_add_tail(&src->lru, ret);
>>
>>  		if (put_new_page)
>>  			put_new_page(&dst->page, private);
>> @@ -1591,7 +1638,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  		enum migrate_mode mode, int reason, struct list_head *ret_folios,
>>  		struct migrate_pages_stats *stats)
>
> Like I said in my last comment to this patch, migrate_pages_batch() function
> deserves a detailed comment about its working flow including the error handling.
> Now you only put some in the git log, which is hard to access after several code
> changes later.
>
> How about?
>
> /*
>  * migrate_pages_batch() first unmaps pages in the from as many as possible,
>  * then migrates the unmapped pages. During unmap process, different situations
>  * are handled differently:
>  * 1. ENOSYS, unsupported large folio migration: move to ret_folios list
>  * 2. ENOMEM, lower memory at the destination: migrate existing unmapped folios
>  *    and stop, since existing unmapped folios have new pages allocated and can
>  *    be migrated
>  * 3. EDEADLOCK, to be unmapped page is locked by someone else, to avoid deadlock,
>  *    we migrate existing unmapped pages and try to lock again
>  * 4. MIGRATEPAGE_SUCCESS, the folios was freed under us: no action
>  * 5. MIGRATEPAGE_UNMAP, unmap succeeded: set avoid_force_lock to true to avoid
>  *    wait to lock a folio in the future to avoid deadlock.
>  *
>  * For folios unmapped but cannot be migrated, we will restore their original
>  * states during cleanup stage at the end.
>  */
>
>>  {
>> -	int retry = 1;
>> +	int retry;
>>  	int large_retry = 1;
>>  	int thp_retry = 1;
>>  	int nr_failed = 0;
>> @@ -1600,13 +1647,19 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  	int pass = 0;
>>  	bool is_large = false;
>>  	bool is_thp = false;
>> -	struct folio *folio, *folio2, *dst = NULL;
>> -	int rc, nr_pages;
>> +	struct folio *folio, *folio2, *dst = NULL, *dst2;
>> +	int rc, rc_saved, nr_pages;
>>  	LIST_HEAD(split_folios);
>> +	LIST_HEAD(unmap_folios);
>> +	LIST_HEAD(dst_folios);
>>  	bool nosplit = (reason == MR_NUMA_MISPLACED);
>>  	bool no_split_folio_counting = false;
>> +	bool force_lock;
>>
>> -split_folio_migration:
>> +retry:
>> +	rc_saved = 0;
>> +	force_lock = true;
>> +	retry = 1;
>>  	for (pass = 0;
>>  	     pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
>>  	     pass++) {
>> @@ -1628,16 +1681,15 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  			cond_resched();
>>
>>  			rc = migrate_folio_unmap(get_new_page, put_new_page, private,
>> -						 folio, &dst, pass > 2, mode,
>> -						 reason, ret_folios);
>> -			if (rc == MIGRATEPAGE_UNMAP)
>> -				rc = migrate_folio_move(put_new_page, private,
>> -							folio, dst, mode,
>> -							reason, ret_folios);
>> +						 folio, &dst, pass > 2, force_lock,
>> +						 mode, reason, ret_folios);
>>  			/*
>>  			 * The rules are:
>>  			 *	Success: folio will be freed
>> +			 *	Unmap: folio will be put on unmap_folios list,
>> +			 *	       dst folio put on dst_folios list
>>  			 *	-EAGAIN: stay on the from list
>> +			 *	-EDEADLOCK: stay on the from list
>>  			 *	-ENOMEM: stay on the from list
>>  			 *	-ENOSYS: stay on the from list
>>  			 *	Other errno: put on ret_folios list
>> @@ -1672,7 +1724,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  			case -ENOMEM:
>>  				/*
>>  				 * When memory is low, don't bother to try to migrate
>> -				 * other folios, just exit.
>> +				 * other folios, move unmapped folios, then exit.
>>  				 */
>>  				if (is_large) {
>>  					nr_large_failed++;
>> @@ -1711,7 +1763,19 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  				/* nr_failed isn't updated for not used */
>>  				nr_large_failed += large_retry;
>>  				stats->nr_thp_failed += thp_retry;
>> -				goto out;
>> +				rc_saved = rc;
>> +				if (list_empty(&unmap_folios))
>> +					goto out;
>> +				else
>> +					goto move;
>> +			case -EDEADLOCK:
>> +				/*
>> +				 * The folio cannot be locked for potential deadlock.
>> +				 * Go move (and unlock) all locked folios.  Then we can
>> +				 * try again.
>> +				 */
>> +				rc_saved = rc;
>> +				goto move;
>>  			case -EAGAIN:
>>  				if (is_large) {
>>  					large_retry++;
>> @@ -1725,6 +1789,15 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  				stats->nr_succeeded += nr_pages;
>>  				stats->nr_thp_succeeded += is_thp;
>>  				break;
>> +			case MIGRATEPAGE_UNMAP:
>> +				/*
>> +				 * We have locked some folios, don't force lock
>> +				 * to avoid deadlock.
>> +				 */
>> +				force_lock = false;
>> +				list_move_tail(&folio->lru, &unmap_folios);
>> +				list_add_tail(&dst->lru, &dst_folios);
>> +				break;
>>  			default:
>>  				/*
>>  				 * Permanent failure (-EBUSY, etc.):
>> @@ -1748,12 +1821,95 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  	nr_large_failed += large_retry;
>>  	stats->nr_thp_failed += thp_retry;
>>  	stats->nr_failed_pages += nr_retry_pages;
>> +move:
>> +	retry = 1;
>> +	for (pass = 0;
>> +	     pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
>> +	     pass++) {
>> +		retry = 0;
>> +		large_retry = 0;
>> +		thp_retry = 0;
>> +		nr_retry_pages = 0;
>> +
>> +		dst = list_first_entry(&dst_folios, struct folio, lru);
>> +		dst2 = list_next_entry(dst, lru);
>> +		list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
>> +			is_large = folio_test_large(folio);
>> +			is_thp = is_large && folio_test_pmd_mappable(folio);
>> +			nr_pages = folio_nr_pages(folio);
>> +
>> +			cond_resched();
>> +
>> +			rc = migrate_folio_move(put_new_page, private,
>> +						folio, dst, mode,
>> +						reason, ret_folios);
>> +			/*
>> +			 * The rules are:
>> +			 *	Success: folio will be freed
>> +			 *	-EAGAIN: stay on the unmap_folios list
>> +			 *	Other errno: put on ret_folios list
>> +			 */
>> +			switch(rc) {
>> +			case -EAGAIN:
>> +				if (is_large) {
>> +					large_retry++;
>> +					thp_retry += is_thp;
>> +				} else if (!no_split_folio_counting) {
>> +					retry++;
>> +				}
>> +				nr_retry_pages += nr_pages;
>> +				break;
>> +			case MIGRATEPAGE_SUCCESS:
>> +				stats->nr_succeeded += nr_pages;
>> +				stats->nr_thp_succeeded += is_thp;
>> +				break;
>> +			default:
>> +				if (is_large) {
>> +					nr_large_failed++;
>> +					stats->nr_thp_failed += is_thp;
>> +				} else if (!no_split_folio_counting) {
>> +					nr_failed++;
>> +				}
>> +
>> +				stats->nr_failed_pages += nr_pages;
>> +				break;
>> +			}
>> +			dst = dst2;
>> +			dst2 = list_next_entry(dst, lru);
>> +		}
>> +	}
>> +	nr_failed += retry;
>> +	nr_large_failed += large_retry;
>> +	stats->nr_thp_failed += thp_retry;
>> +	stats->nr_failed_pages += nr_retry_pages;
>> +
>> +	if (rc_saved)
>> +		rc = rc_saved;
>> +	else
>> +		rc = nr_failed + nr_large_failed;
>> +out:
>> +	/* Cleanup remaining folios */
>> +	dst = list_first_entry(&dst_folios, struct folio, lru);
>> +	dst2 = list_next_entry(dst, lru);
>> +	list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
>> +		int page_was_mapped = 0;
>> +		struct anon_vma *anon_vma = NULL;
>> +
>> +		__migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
>> +		migrate_folio_undo_src(folio, page_was_mapped, anon_vma,
>> +				       ret_folios);
>> +		list_del(&dst->lru);
>> +		migrate_folio_undo_dst(dst, put_new_page, private);
>> +		dst = dst2;
>> +		dst2 = list_next_entry(dst, lru);
>> +	}
>> +
>>  	/*
>>  	 * Try to migrate split folios of fail-to-migrate large folios, no
>>  	 * nr_failed counting in this round, since all split folios of a
>>  	 * large folio is counted as 1 failure in the first round.
>>  	 */
>> -	if (!list_empty(&split_folios)) {
>> +	if (rc >= 0 && !list_empty(&split_folios)) {
>>  		/*
>>  		 * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY
>>  		 * retries) to ret_folios to avoid migrating them again.
>> @@ -1761,12 +1917,16 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  		list_splice_init(from, ret_folios);
>>  		list_splice_init(&split_folios, from);
>>  		no_split_folio_counting = true;
>> -		retry = 1;
>> -		goto split_folio_migration;
>> +		goto retry;
>>  	}
>>
>> -	rc = nr_failed + nr_large_failed;
>> -out:
>> +	/*
>> +	 * We have unlocked all locked folios, so we can force lock now, let's
>> +	 * try again.
>> +	 */
>> +	if (rc == -EDEADLOCK)
>> +		goto retry;
>> +
>>  	return rc;
>>  }
>>
>> -- 
>> 2.35.1
>
> After rename the variable (or give it a better name) and add the comments,
> you can add Reviewed-by: Zi Yan <ziy@nvidia.com>
>
> Thanks.
>
> --
> Best Regards,
> Yan, Zi
  
haoxin Feb. 7, 2023, 5:33 p.m. UTC | #3
I will do some tests on my arm64 server, but there is a problem with my 
machine environment, I will provide the test results later.

Of course, i should combine with this patch 
https://lore.kernel.org/lkml/20221117082648.47526-1-yangyicong@huawei.com/

在 2023/2/6 下午2:33, Huang Ying 写道:
> In this patch the _unmap and _move stage of the folio migration is
> batched.  That for, previously, it is,
>
>    for each folio
>      _unmap()
>      _move()
>
> Now, it is,
>
>    for each folio
>      _unmap()
>    for each folio
>      _move()
>
> Based on this, we can batch the TLB flushing and use some hardware
> accelerator to copy folios between batched _unmap and batched _move
> stages.
>
> Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
> Cc: Zi Yan <ziy@nvidia.com>
> Cc: Yang Shi <shy828301@gmail.com>
> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Bharata B Rao <bharata@amd.com>
> Cc: Alistair Popple <apopple@nvidia.com>
> Cc: haoxin <xhao@linux.alibaba.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>   mm/migrate.c | 208 +++++++++++++++++++++++++++++++++++++++++++++------
>   1 file changed, 184 insertions(+), 24 deletions(-)
>
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 0428449149f4..fa7212330cb6 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1033,6 +1033,33 @@ static void __migrate_folio_extract(struct folio *dst,
>   	dst->private = NULL;
>   }
>   
> +/* Restore the source folio to the original state upon failure */
> +static void migrate_folio_undo_src(struct folio *src,
> +				   int page_was_mapped,
> +				   struct anon_vma *anon_vma,
> +				   struct list_head *ret)
> +{
> +	if (page_was_mapped)
> +		remove_migration_ptes(src, src, false);
> +	/* Drop an anon_vma reference if we took one */
> +	if (anon_vma)
> +		put_anon_vma(anon_vma);
> +	folio_unlock(src);
> +	list_move_tail(&src->lru, ret);
> +}
> +
> +/* Restore the destination folio to the original state upon failure */
> +static void migrate_folio_undo_dst(struct folio *dst,
> +				   free_page_t put_new_page,
> +				   unsigned long private)
> +{
> +	folio_unlock(dst);
> +	if (put_new_page)
> +		put_new_page(&dst->page, private);
> +	else
> +		folio_put(dst);
> +}
> +
>   /* Cleanup src folio upon migration success */
>   static void migrate_folio_done(struct folio *src,
>   			       enum migrate_reason reason)
> @@ -1052,7 +1079,7 @@ static void migrate_folio_done(struct folio *src,
>   }
>   
>   static int __migrate_folio_unmap(struct folio *src, struct folio *dst,
> -				int force, enum migrate_mode mode)
> +				 int force, bool force_lock, enum migrate_mode mode)
>   {
>   	int rc = -EAGAIN;
>   	int page_was_mapped = 0;
> @@ -1079,6 +1106,17 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst,
>   		if (current->flags & PF_MEMALLOC)
>   			goto out;
>   
> +		/*
> +		 * We have locked some folios, to avoid deadlock, we cannot
> +		 * lock the folio synchronously.  Go out to process (and
> +		 * unlock) all the locked folios.  Then we can lock the folio
> +		 * synchronously.
> +		 */
> +		if (!force_lock) {
> +			rc = -EDEADLOCK;
> +			goto out;
> +		}
> +
>   		folio_lock(src);
>   	}
>   
> @@ -1187,10 +1225,20 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst,
>   	int page_was_mapped = 0;
>   	struct anon_vma *anon_vma = NULL;
>   	bool is_lru = !__PageMovable(&src->page);
> +	struct list_head *prev;
>   
>   	__migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
> +	prev = dst->lru.prev;
> +	list_del(&dst->lru);
>   
>   	rc = move_to_new_folio(dst, src, mode);
> +
> +	if (rc == -EAGAIN) {
> +		list_add(&dst->lru, prev);
> +		__migrate_folio_record(dst, page_was_mapped, anon_vma);
> +		return rc;
> +	}
> +
>   	if (unlikely(!is_lru))
>   		goto out_unlock_both;
>   
> @@ -1233,7 +1281,7 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst,
>   /* Obtain the lock on page, remove all ptes. */
>   static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page,
>   			       unsigned long private, struct folio *src,
> -			       struct folio **dstp, int force,
> +			       struct folio **dstp, int force, bool force_lock,
>   			       enum migrate_mode mode, enum migrate_reason reason,
>   			       struct list_head *ret)
>   {
> @@ -1261,7 +1309,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
>   	*dstp = dst;
>   
>   	dst->private = NULL;
> -	rc = __migrate_folio_unmap(src, dst, force, mode);
> +	rc = __migrate_folio_unmap(src, dst, force, force_lock, mode);
>   	if (rc == MIGRATEPAGE_UNMAP)
>   		return rc;
>   
> @@ -1270,7 +1318,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
>   	 * references and be restored.
>   	 */
>   	/* restore the folio to right list. */
> -	if (rc != -EAGAIN)
> +	if (rc != -EAGAIN && rc != -EDEADLOCK)
>   		list_move_tail(&src->lru, ret);
>   
>   	if (put_new_page)
> @@ -1309,9 +1357,8 @@ static int migrate_folio_move(free_page_t put_new_page, unsigned long private,
>   	 */
>   	if (rc == MIGRATEPAGE_SUCCESS) {
>   		migrate_folio_done(src, reason);
> -	} else {
> -		if (rc != -EAGAIN)
> -			list_add_tail(&src->lru, ret);
> +	} else if (rc != -EAGAIN) {
> +		list_add_tail(&src->lru, ret);
>   
>   		if (put_new_page)
>   			put_new_page(&dst->page, private);
> @@ -1591,7 +1638,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>   		enum migrate_mode mode, int reason, struct list_head *ret_folios,
>   		struct migrate_pages_stats *stats)
>   {
> -	int retry = 1;
> +	int retry;
>   	int large_retry = 1;
>   	int thp_retry = 1;
>   	int nr_failed = 0;
> @@ -1600,13 +1647,19 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>   	int pass = 0;
>   	bool is_large = false;
>   	bool is_thp = false;
> -	struct folio *folio, *folio2, *dst = NULL;
> -	int rc, nr_pages;
> +	struct folio *folio, *folio2, *dst = NULL, *dst2;
> +	int rc, rc_saved, nr_pages;
>   	LIST_HEAD(split_folios);
> +	LIST_HEAD(unmap_folios);
> +	LIST_HEAD(dst_folios);
>   	bool nosplit = (reason == MR_NUMA_MISPLACED);
>   	bool no_split_folio_counting = false;
> +	bool force_lock;
>   
> -split_folio_migration:
> +retry:
> +	rc_saved = 0;
> +	force_lock = true;
> +	retry = 1;
>   	for (pass = 0;
>   	     pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
>   	     pass++) {
> @@ -1628,16 +1681,15 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>   			cond_resched();
>   
>   			rc = migrate_folio_unmap(get_new_page, put_new_page, private,
> -						 folio, &dst, pass > 2, mode,
> -						 reason, ret_folios);
> -			if (rc == MIGRATEPAGE_UNMAP)
> -				rc = migrate_folio_move(put_new_page, private,
> -							folio, dst, mode,
> -							reason, ret_folios);
> +						 folio, &dst, pass > 2, force_lock,
> +						 mode, reason, ret_folios);
>   			/*
>   			 * The rules are:
>   			 *	Success: folio will be freed
> +			 *	Unmap: folio will be put on unmap_folios list,
> +			 *	       dst folio put on dst_folios list
>   			 *	-EAGAIN: stay on the from list
> +			 *	-EDEADLOCK: stay on the from list
>   			 *	-ENOMEM: stay on the from list
>   			 *	-ENOSYS: stay on the from list
>   			 *	Other errno: put on ret_folios list
> @@ -1672,7 +1724,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>   			case -ENOMEM:
>   				/*
>   				 * When memory is low, don't bother to try to migrate
> -				 * other folios, just exit.
> +				 * other folios, move unmapped folios, then exit.
>   				 */
>   				if (is_large) {
>   					nr_large_failed++;
> @@ -1711,7 +1763,19 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>   				/* nr_failed isn't updated for not used */
>   				nr_large_failed += large_retry;
>   				stats->nr_thp_failed += thp_retry;
> -				goto out;
> +				rc_saved = rc;
> +				if (list_empty(&unmap_folios))
> +					goto out;
> +				else
> +					goto move;
> +			case -EDEADLOCK:
> +				/*
> +				 * The folio cannot be locked for potential deadlock.
> +				 * Go move (and unlock) all locked folios.  Then we can
> +				 * try again.
> +				 */
> +				rc_saved = rc;
> +				goto move;
>   			case -EAGAIN:
>   				if (is_large) {
>   					large_retry++;
> @@ -1725,6 +1789,15 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>   				stats->nr_succeeded += nr_pages;
>   				stats->nr_thp_succeeded += is_thp;
>   				break;
> +			case MIGRATEPAGE_UNMAP:
> +				/*
> +				 * We have locked some folios, don't force lock
> +				 * to avoid deadlock.
> +				 */
> +				force_lock = false;
> +				list_move_tail(&folio->lru, &unmap_folios);
> +				list_add_tail(&dst->lru, &dst_folios);
> +				break;
>   			default:
>   				/*
>   				 * Permanent failure (-EBUSY, etc.):
> @@ -1748,12 +1821,95 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>   	nr_large_failed += large_retry;
>   	stats->nr_thp_failed += thp_retry;
>   	stats->nr_failed_pages += nr_retry_pages;
> +move:
> +	retry = 1;
> +	for (pass = 0;
> +	     pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
> +	     pass++) {
> +		retry = 0;
> +		large_retry = 0;
> +		thp_retry = 0;
> +		nr_retry_pages = 0;
> +
> +		dst = list_first_entry(&dst_folios, struct folio, lru);
> +		dst2 = list_next_entry(dst, lru);
> +		list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
> +			is_large = folio_test_large(folio);
> +			is_thp = is_large && folio_test_pmd_mappable(folio);
> +			nr_pages = folio_nr_pages(folio);
> +
> +			cond_resched();
> +
> +			rc = migrate_folio_move(put_new_page, private,
> +						folio, dst, mode,
> +						reason, ret_folios);
> +			/*
> +			 * The rules are:
> +			 *	Success: folio will be freed
> +			 *	-EAGAIN: stay on the unmap_folios list
> +			 *	Other errno: put on ret_folios list
> +			 */
> +			switch(rc) {
> +			case -EAGAIN:
> +				if (is_large) {
> +					large_retry++;
> +					thp_retry += is_thp;
> +				} else if (!no_split_folio_counting) {
> +					retry++;
> +				}
> +				nr_retry_pages += nr_pages;
> +				break;
> +			case MIGRATEPAGE_SUCCESS:
> +				stats->nr_succeeded += nr_pages;
> +				stats->nr_thp_succeeded += is_thp;
> +				break;
> +			default:
> +				if (is_large) {
> +					nr_large_failed++;
> +					stats->nr_thp_failed += is_thp;
> +				} else if (!no_split_folio_counting) {
> +					nr_failed++;
> +				}
> +
> +				stats->nr_failed_pages += nr_pages;
> +				break;
> +			}
> +			dst = dst2;
> +			dst2 = list_next_entry(dst, lru);
> +		}
> +	}
> +	nr_failed += retry;
> +	nr_large_failed += large_retry;
> +	stats->nr_thp_failed += thp_retry;
> +	stats->nr_failed_pages += nr_retry_pages;
> +
> +	if (rc_saved)
> +		rc = rc_saved;
> +	else
> +		rc = nr_failed + nr_large_failed;
> +out:
> +	/* Cleanup remaining folios */
> +	dst = list_first_entry(&dst_folios, struct folio, lru);
> +	dst2 = list_next_entry(dst, lru);
> +	list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
> +		int page_was_mapped = 0;
> +		struct anon_vma *anon_vma = NULL;
> +
> +		__migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
> +		migrate_folio_undo_src(folio, page_was_mapped, anon_vma,
> +				       ret_folios);
> +		list_del(&dst->lru);
> +		migrate_folio_undo_dst(dst, put_new_page, private);
> +		dst = dst2;
> +		dst2 = list_next_entry(dst, lru);
> +	}
> +
>   	/*
>   	 * Try to migrate split folios of fail-to-migrate large folios, no
>   	 * nr_failed counting in this round, since all split folios of a
>   	 * large folio is counted as 1 failure in the first round.
>   	 */
> -	if (!list_empty(&split_folios)) {
> +	if (rc >= 0 && !list_empty(&split_folios)) {
>   		/*
>   		 * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY
>   		 * retries) to ret_folios to avoid migrating them again.
> @@ -1761,12 +1917,16 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>   		list_splice_init(from, ret_folios);
>   		list_splice_init(&split_folios, from);
>   		no_split_folio_counting = true;
> -		retry = 1;
> -		goto split_folio_migration;
> +		goto retry;
>   	}
>   
> -	rc = nr_failed + nr_large_failed;
> -out:
> +	/*
> +	 * We have unlocked all locked folios, so we can force lock now, let's
> +	 * try again.
> +	 */
> +	if (rc == -EDEADLOCK)
> +		goto retry;
> +
>   	return rc;
>   }
>
  
Huang, Ying Feb. 13, 2023, 6:55 a.m. UTC | #4
Zi Yan <ziy@nvidia.com> writes:

> On 6 Feb 2023, at 1:33, Huang Ying wrote:
>
>> In this patch the _unmap and _move stage of the folio migration is
>> batched.  That for, previously, it is,
>>
>>   for each folio
>>     _unmap()
>>     _move()
>>
>> Now, it is,
>>
>>   for each folio
>>     _unmap()
>>   for each folio
>>     _move()
>>
>> Based on this, we can batch the TLB flushing and use some hardware
>> accelerator to copy folios between batched _unmap and batched _move
>> stages.
>>
>> Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
>> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
>> Cc: Zi Yan <ziy@nvidia.com>
>> Cc: Yang Shi <shy828301@gmail.com>
>> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
>> Cc: Oscar Salvador <osalvador@suse.de>
>> Cc: Matthew Wilcox <willy@infradead.org>
>> Cc: Bharata B Rao <bharata@amd.com>
>> Cc: Alistair Popple <apopple@nvidia.com>
>> Cc: haoxin <xhao@linux.alibaba.com>
>> Cc: Minchan Kim <minchan@kernel.org>
>> Cc: Mike Kravetz <mike.kravetz@oracle.com>
>> ---
>>  mm/migrate.c | 208 +++++++++++++++++++++++++++++++++++++++++++++------
>>  1 file changed, 184 insertions(+), 24 deletions(-)
>>
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 0428449149f4..fa7212330cb6 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -1033,6 +1033,33 @@ static void __migrate_folio_extract(struct folio *dst,
>>  	dst->private = NULL;
>>  }
>>
>> +/* Restore the source folio to the original state upon failure */
>> +static void migrate_folio_undo_src(struct folio *src,
>> +				   int page_was_mapped,
>> +				   struct anon_vma *anon_vma,
>> +				   struct list_head *ret)
>> +{
>> +	if (page_was_mapped)
>> +		remove_migration_ptes(src, src, false);
>> +	/* Drop an anon_vma reference if we took one */
>> +	if (anon_vma)
>> +		put_anon_vma(anon_vma);
>> +	folio_unlock(src);
>> +	list_move_tail(&src->lru, ret);
>> +}
>> +
>> +/* Restore the destination folio to the original state upon failure */
>> +static void migrate_folio_undo_dst(struct folio *dst,
>> +				   free_page_t put_new_page,
>> +				   unsigned long private)
>> +{
>> +	folio_unlock(dst);
>> +	if (put_new_page)
>> +		put_new_page(&dst->page, private);
>> +	else
>> +		folio_put(dst);
>> +}
>> +
>>  /* Cleanup src folio upon migration success */
>>  static void migrate_folio_done(struct folio *src,
>>  			       enum migrate_reason reason)
>> @@ -1052,7 +1079,7 @@ static void migrate_folio_done(struct folio *src,
>>  }
>>
>>  static int __migrate_folio_unmap(struct folio *src, struct folio *dst,
>> -				int force, enum migrate_mode mode)
>> +				 int force, bool force_lock, enum migrate_mode mode)
>>  {
>>  	int rc = -EAGAIN;
>>  	int page_was_mapped = 0;
>> @@ -1079,6 +1106,17 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst,
>>  		if (current->flags & PF_MEMALLOC)
>>  			goto out;
>>
>> +		/*
>> +		 * We have locked some folios, to avoid deadlock, we cannot
>> +		 * lock the folio synchronously.  Go out to process (and
>> +		 * unlock) all the locked folios.  Then we can lock the folio
>> +		 * synchronously.
>> +		 */
> The comment alone is quite confusing and the variable might be better
> renamed to avoid_force_lock, since there is a force variable to force
> lock folio already. And the variable intends to discourage force lock
> on a folio to avoid potential deadlock.
>
> How about? Since "lock synchronously" might not be as straightforward
> as wait to lock.
>
> /*
>  * We have locked some folios and are going to wait to lock this folio.
>  * To avoid a potential deadlock, let's bail out and not do that. The
>  * locked folios will be moved and unlocked, then we can wait to lock
>  * this folio
>  */
>
>> +		if (!force_lock) {
>> +			rc = -EDEADLOCK;
>> +			goto out;
>> +		}
>> +
>>  		folio_lock(src);
>>  	}
>>
>> @@ -1187,10 +1225,20 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst,
>>  	int page_was_mapped = 0;
>>  	struct anon_vma *anon_vma = NULL;
>>  	bool is_lru = !__PageMovable(&src->page);
>> +	struct list_head *prev;
>>
>>  	__migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
>> +	prev = dst->lru.prev;
>> +	list_del(&dst->lru);
>>
>>  	rc = move_to_new_folio(dst, src, mode);
>> +
>> +	if (rc == -EAGAIN) {
>> +		list_add(&dst->lru, prev);
>> +		__migrate_folio_record(dst, page_was_mapped, anon_vma);
>> +		return rc;
>> +	}
>> +
>>  	if (unlikely(!is_lru))
>>  		goto out_unlock_both;
>>
>> @@ -1233,7 +1281,7 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst,
>>  /* Obtain the lock on page, remove all ptes. */
>>  static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page,
>>  			       unsigned long private, struct folio *src,
>> -			       struct folio **dstp, int force,
>> +			       struct folio **dstp, int force, bool force_lock,
>>  			       enum migrate_mode mode, enum migrate_reason reason,
>>  			       struct list_head *ret)
>>  {
>> @@ -1261,7 +1309,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
>>  	*dstp = dst;
>>
>>  	dst->private = NULL;
>> -	rc = __migrate_folio_unmap(src, dst, force, mode);
>> +	rc = __migrate_folio_unmap(src, dst, force, force_lock, mode);
>>  	if (rc == MIGRATEPAGE_UNMAP)
>>  		return rc;
>>
>> @@ -1270,7 +1318,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
>>  	 * references and be restored.
>>  	 */
>>  	/* restore the folio to right list. */
>> -	if (rc != -EAGAIN)
>> +	if (rc != -EAGAIN && rc != -EDEADLOCK)
>>  		list_move_tail(&src->lru, ret);
>>
>>  	if (put_new_page)
>> @@ -1309,9 +1357,8 @@ static int migrate_folio_move(free_page_t put_new_page, unsigned long private,
>>  	 */
>>  	if (rc == MIGRATEPAGE_SUCCESS) {
>>  		migrate_folio_done(src, reason);
>> -	} else {
>> -		if (rc != -EAGAIN)
>> -			list_add_tail(&src->lru, ret);
>> +	} else if (rc != -EAGAIN) {
>> +		list_add_tail(&src->lru, ret);
>>
>>  		if (put_new_page)
>>  			put_new_page(&dst->page, private);
>> @@ -1591,7 +1638,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  		enum migrate_mode mode, int reason, struct list_head *ret_folios,
>>  		struct migrate_pages_stats *stats)
>
> Like I said in my last comment to this patch, migrate_pages_batch() function
> deserves a detailed comment about its working flow including the error handling.
> Now you only put some in the git log, which is hard to access after several code
> changes later.
>
> How about?
>
> /*
>  * migrate_pages_batch() first unmaps pages in the from as many as possible,
>  * then migrates the unmapped pages. During unmap process, different situations
>  * are handled differently:
>  * 1. ENOSYS, unsupported large folio migration: move to ret_folios list
>  * 2. ENOMEM, lower memory at the destination: migrate existing unmapped folios
>  *    and stop, since existing unmapped folios have new pages allocated and can
>  *    be migrated
>  * 3. EDEADLOCK, to be unmapped page is locked by someone else, to avoid deadlock,
>  *    we migrate existing unmapped pages and try to lock again
>  * 4. MIGRATEPAGE_SUCCESS, the folios was freed under us: no action
>  * 5. MIGRATEPAGE_UNMAP, unmap succeeded: set avoid_force_lock to true to avoid
>  *    wait to lock a folio in the future to avoid deadlock.
>  *
>  * For folios unmapped but cannot be migrated, we will restore their original
>  * states during cleanup stage at the end.
>  */

Sorry, I didn't notice the above comments in the previous reply.

The comments appear to too detailed for me.  I think that it's better
for people to get the details from the code itself.  So, I want to use
the much simplified version as below.

/*
 * migrate_pages_batch() first unmaps folios in the from list as many as
 * possible, then move the unmapped folios.
 */

Best Regards,
Huang, Ying

>>  {
>> -	int retry = 1;
>> +	int retry;
>>  	int large_retry = 1;
>>  	int thp_retry = 1;
>>  	int nr_failed = 0;
>> @@ -1600,13 +1647,19 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  	int pass = 0;
>>  	bool is_large = false;
>>  	bool is_thp = false;
>> -	struct folio *folio, *folio2, *dst = NULL;
>> -	int rc, nr_pages;
>> +	struct folio *folio, *folio2, *dst = NULL, *dst2;
>> +	int rc, rc_saved, nr_pages;
>>  	LIST_HEAD(split_folios);
>> +	LIST_HEAD(unmap_folios);
>> +	LIST_HEAD(dst_folios);
>>  	bool nosplit = (reason == MR_NUMA_MISPLACED);
>>  	bool no_split_folio_counting = false;
>> +	bool force_lock;
>>
>> -split_folio_migration:
>> +retry:
>> +	rc_saved = 0;
>> +	force_lock = true;
>> +	retry = 1;
>>  	for (pass = 0;
>>  	     pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
>>  	     pass++) {
>> @@ -1628,16 +1681,15 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  			cond_resched();
>>
>>  			rc = migrate_folio_unmap(get_new_page, put_new_page, private,
>> -						 folio, &dst, pass > 2, mode,
>> -						 reason, ret_folios);
>> -			if (rc == MIGRATEPAGE_UNMAP)
>> -				rc = migrate_folio_move(put_new_page, private,
>> -							folio, dst, mode,
>> -							reason, ret_folios);
>> +						 folio, &dst, pass > 2, force_lock,
>> +						 mode, reason, ret_folios);
>>  			/*
>>  			 * The rules are:
>>  			 *	Success: folio will be freed
>> +			 *	Unmap: folio will be put on unmap_folios list,
>> +			 *	       dst folio put on dst_folios list
>>  			 *	-EAGAIN: stay on the from list
>> +			 *	-EDEADLOCK: stay on the from list
>>  			 *	-ENOMEM: stay on the from list
>>  			 *	-ENOSYS: stay on the from list
>>  			 *	Other errno: put on ret_folios list
>> @@ -1672,7 +1724,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  			case -ENOMEM:
>>  				/*
>>  				 * When memory is low, don't bother to try to migrate
>> -				 * other folios, just exit.
>> +				 * other folios, move unmapped folios, then exit.
>>  				 */
>>  				if (is_large) {
>>  					nr_large_failed++;
>> @@ -1711,7 +1763,19 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  				/* nr_failed isn't updated for not used */
>>  				nr_large_failed += large_retry;
>>  				stats->nr_thp_failed += thp_retry;
>> -				goto out;
>> +				rc_saved = rc;
>> +				if (list_empty(&unmap_folios))
>> +					goto out;
>> +				else
>> +					goto move;
>> +			case -EDEADLOCK:
>> +				/*
>> +				 * The folio cannot be locked for potential deadlock.
>> +				 * Go move (and unlock) all locked folios.  Then we can
>> +				 * try again.
>> +				 */
>> +				rc_saved = rc;
>> +				goto move;
>>  			case -EAGAIN:
>>  				if (is_large) {
>>  					large_retry++;
>> @@ -1725,6 +1789,15 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  				stats->nr_succeeded += nr_pages;
>>  				stats->nr_thp_succeeded += is_thp;
>>  				break;
>> +			case MIGRATEPAGE_UNMAP:
>> +				/*
>> +				 * We have locked some folios, don't force lock
>> +				 * to avoid deadlock.
>> +				 */
>> +				force_lock = false;
>> +				list_move_tail(&folio->lru, &unmap_folios);
>> +				list_add_tail(&dst->lru, &dst_folios);
>> +				break;
>>  			default:
>>  				/*
>>  				 * Permanent failure (-EBUSY, etc.):
>> @@ -1748,12 +1821,95 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  	nr_large_failed += large_retry;
>>  	stats->nr_thp_failed += thp_retry;
>>  	stats->nr_failed_pages += nr_retry_pages;
>> +move:
>> +	retry = 1;
>> +	for (pass = 0;
>> +	     pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
>> +	     pass++) {
>> +		retry = 0;
>> +		large_retry = 0;
>> +		thp_retry = 0;
>> +		nr_retry_pages = 0;
>> +
>> +		dst = list_first_entry(&dst_folios, struct folio, lru);
>> +		dst2 = list_next_entry(dst, lru);
>> +		list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
>> +			is_large = folio_test_large(folio);
>> +			is_thp = is_large && folio_test_pmd_mappable(folio);
>> +			nr_pages = folio_nr_pages(folio);
>> +
>> +			cond_resched();
>> +
>> +			rc = migrate_folio_move(put_new_page, private,
>> +						folio, dst, mode,
>> +						reason, ret_folios);
>> +			/*
>> +			 * The rules are:
>> +			 *	Success: folio will be freed
>> +			 *	-EAGAIN: stay on the unmap_folios list
>> +			 *	Other errno: put on ret_folios list
>> +			 */
>> +			switch(rc) {
>> +			case -EAGAIN:
>> +				if (is_large) {
>> +					large_retry++;
>> +					thp_retry += is_thp;
>> +				} else if (!no_split_folio_counting) {
>> +					retry++;
>> +				}
>> +				nr_retry_pages += nr_pages;
>> +				break;
>> +			case MIGRATEPAGE_SUCCESS:
>> +				stats->nr_succeeded += nr_pages;
>> +				stats->nr_thp_succeeded += is_thp;
>> +				break;
>> +			default:
>> +				if (is_large) {
>> +					nr_large_failed++;
>> +					stats->nr_thp_failed += is_thp;
>> +				} else if (!no_split_folio_counting) {
>> +					nr_failed++;
>> +				}
>> +
>> +				stats->nr_failed_pages += nr_pages;
>> +				break;
>> +			}
>> +			dst = dst2;
>> +			dst2 = list_next_entry(dst, lru);
>> +		}
>> +	}
>> +	nr_failed += retry;
>> +	nr_large_failed += large_retry;
>> +	stats->nr_thp_failed += thp_retry;
>> +	stats->nr_failed_pages += nr_retry_pages;
>> +
>> +	if (rc_saved)
>> +		rc = rc_saved;
>> +	else
>> +		rc = nr_failed + nr_large_failed;
>> +out:
>> +	/* Cleanup remaining folios */
>> +	dst = list_first_entry(&dst_folios, struct folio, lru);
>> +	dst2 = list_next_entry(dst, lru);
>> +	list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
>> +		int page_was_mapped = 0;
>> +		struct anon_vma *anon_vma = NULL;
>> +
>> +		__migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
>> +		migrate_folio_undo_src(folio, page_was_mapped, anon_vma,
>> +				       ret_folios);
>> +		list_del(&dst->lru);
>> +		migrate_folio_undo_dst(dst, put_new_page, private);
>> +		dst = dst2;
>> +		dst2 = list_next_entry(dst, lru);
>> +	}
>> +
>>  	/*
>>  	 * Try to migrate split folios of fail-to-migrate large folios, no
>>  	 * nr_failed counting in this round, since all split folios of a
>>  	 * large folio is counted as 1 failure in the first round.
>>  	 */
>> -	if (!list_empty(&split_folios)) {
>> +	if (rc >= 0 && !list_empty(&split_folios)) {
>>  		/*
>>  		 * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY
>>  		 * retries) to ret_folios to avoid migrating them again.
>> @@ -1761,12 +1917,16 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  		list_splice_init(from, ret_folios);
>>  		list_splice_init(&split_folios, from);
>>  		no_split_folio_counting = true;
>> -		retry = 1;
>> -		goto split_folio_migration;
>> +		goto retry;
>>  	}
>>
>> -	rc = nr_failed + nr_large_failed;
>> -out:
>> +	/*
>> +	 * We have unlocked all locked folios, so we can force lock now, let's
>> +	 * try again.
>> +	 */
>> +	if (rc == -EDEADLOCK)
>> +		goto retry;
>> +
>>  	return rc;
>>  }
>>
>> -- 
>> 2.35.1
>
> After rename the variable (or give it a better name) and add the comments,
> you can add Reviewed-by: Zi Yan <ziy@nvidia.com>
>
> Thanks.
>
> --
> Best Regards,
> Yan, Zi
  

Patch

diff --git a/mm/migrate.c b/mm/migrate.c
index 0428449149f4..fa7212330cb6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1033,6 +1033,33 @@  static void __migrate_folio_extract(struct folio *dst,
 	dst->private = NULL;
 }
 
+/* Restore the source folio to the original state upon failure */
+static void migrate_folio_undo_src(struct folio *src,
+				   int page_was_mapped,
+				   struct anon_vma *anon_vma,
+				   struct list_head *ret)
+{
+	if (page_was_mapped)
+		remove_migration_ptes(src, src, false);
+	/* Drop an anon_vma reference if we took one */
+	if (anon_vma)
+		put_anon_vma(anon_vma);
+	folio_unlock(src);
+	list_move_tail(&src->lru, ret);
+}
+
+/* Restore the destination folio to the original state upon failure */
+static void migrate_folio_undo_dst(struct folio *dst,
+				   free_page_t put_new_page,
+				   unsigned long private)
+{
+	folio_unlock(dst);
+	if (put_new_page)
+		put_new_page(&dst->page, private);
+	else
+		folio_put(dst);
+}
+
 /* Cleanup src folio upon migration success */
 static void migrate_folio_done(struct folio *src,
 			       enum migrate_reason reason)
@@ -1052,7 +1079,7 @@  static void migrate_folio_done(struct folio *src,
 }
 
 static int __migrate_folio_unmap(struct folio *src, struct folio *dst,
-				int force, enum migrate_mode mode)
+				 int force, bool force_lock, enum migrate_mode mode)
 {
 	int rc = -EAGAIN;
 	int page_was_mapped = 0;
@@ -1079,6 +1106,17 @@  static int __migrate_folio_unmap(struct folio *src, struct folio *dst,
 		if (current->flags & PF_MEMALLOC)
 			goto out;
 
+		/*
+		 * We have locked some folios, to avoid deadlock, we cannot
+		 * lock the folio synchronously.  Go out to process (and
+		 * unlock) all the locked folios.  Then we can lock the folio
+		 * synchronously.
+		 */
+		if (!force_lock) {
+			rc = -EDEADLOCK;
+			goto out;
+		}
+
 		folio_lock(src);
 	}
 
@@ -1187,10 +1225,20 @@  static int __migrate_folio_move(struct folio *src, struct folio *dst,
 	int page_was_mapped = 0;
 	struct anon_vma *anon_vma = NULL;
 	bool is_lru = !__PageMovable(&src->page);
+	struct list_head *prev;
 
 	__migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
+	prev = dst->lru.prev;
+	list_del(&dst->lru);
 
 	rc = move_to_new_folio(dst, src, mode);
+
+	if (rc == -EAGAIN) {
+		list_add(&dst->lru, prev);
+		__migrate_folio_record(dst, page_was_mapped, anon_vma);
+		return rc;
+	}
+
 	if (unlikely(!is_lru))
 		goto out_unlock_both;
 
@@ -1233,7 +1281,7 @@  static int __migrate_folio_move(struct folio *src, struct folio *dst,
 /* Obtain the lock on page, remove all ptes. */
 static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page,
 			       unsigned long private, struct folio *src,
-			       struct folio **dstp, int force,
+			       struct folio **dstp, int force, bool force_lock,
 			       enum migrate_mode mode, enum migrate_reason reason,
 			       struct list_head *ret)
 {
@@ -1261,7 +1309,7 @@  static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
 	*dstp = dst;
 
 	dst->private = NULL;
-	rc = __migrate_folio_unmap(src, dst, force, mode);
+	rc = __migrate_folio_unmap(src, dst, force, force_lock, mode);
 	if (rc == MIGRATEPAGE_UNMAP)
 		return rc;
 
@@ -1270,7 +1318,7 @@  static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
 	 * references and be restored.
 	 */
 	/* restore the folio to right list. */
-	if (rc != -EAGAIN)
+	if (rc != -EAGAIN && rc != -EDEADLOCK)
 		list_move_tail(&src->lru, ret);
 
 	if (put_new_page)
@@ -1309,9 +1357,8 @@  static int migrate_folio_move(free_page_t put_new_page, unsigned long private,
 	 */
 	if (rc == MIGRATEPAGE_SUCCESS) {
 		migrate_folio_done(src, reason);
-	} else {
-		if (rc != -EAGAIN)
-			list_add_tail(&src->lru, ret);
+	} else if (rc != -EAGAIN) {
+		list_add_tail(&src->lru, ret);
 
 		if (put_new_page)
 			put_new_page(&dst->page, private);
@@ -1591,7 +1638,7 @@  static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
 		enum migrate_mode mode, int reason, struct list_head *ret_folios,
 		struct migrate_pages_stats *stats)
 {
-	int retry = 1;
+	int retry;
 	int large_retry = 1;
 	int thp_retry = 1;
 	int nr_failed = 0;
@@ -1600,13 +1647,19 @@  static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
 	int pass = 0;
 	bool is_large = false;
 	bool is_thp = false;
-	struct folio *folio, *folio2, *dst = NULL;
-	int rc, nr_pages;
+	struct folio *folio, *folio2, *dst = NULL, *dst2;
+	int rc, rc_saved, nr_pages;
 	LIST_HEAD(split_folios);
+	LIST_HEAD(unmap_folios);
+	LIST_HEAD(dst_folios);
 	bool nosplit = (reason == MR_NUMA_MISPLACED);
 	bool no_split_folio_counting = false;
+	bool force_lock;
 
-split_folio_migration:
+retry:
+	rc_saved = 0;
+	force_lock = true;
+	retry = 1;
 	for (pass = 0;
 	     pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
 	     pass++) {
@@ -1628,16 +1681,15 @@  static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
 			cond_resched();
 
 			rc = migrate_folio_unmap(get_new_page, put_new_page, private,
-						 folio, &dst, pass > 2, mode,
-						 reason, ret_folios);
-			if (rc == MIGRATEPAGE_UNMAP)
-				rc = migrate_folio_move(put_new_page, private,
-							folio, dst, mode,
-							reason, ret_folios);
+						 folio, &dst, pass > 2, force_lock,
+						 mode, reason, ret_folios);
 			/*
 			 * The rules are:
 			 *	Success: folio will be freed
+			 *	Unmap: folio will be put on unmap_folios list,
+			 *	       dst folio put on dst_folios list
 			 *	-EAGAIN: stay on the from list
+			 *	-EDEADLOCK: stay on the from list
 			 *	-ENOMEM: stay on the from list
 			 *	-ENOSYS: stay on the from list
 			 *	Other errno: put on ret_folios list
@@ -1672,7 +1724,7 @@  static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
 			case -ENOMEM:
 				/*
 				 * When memory is low, don't bother to try to migrate
-				 * other folios, just exit.
+				 * other folios, move unmapped folios, then exit.
 				 */
 				if (is_large) {
 					nr_large_failed++;
@@ -1711,7 +1763,19 @@  static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
 				/* nr_failed isn't updated for not used */
 				nr_large_failed += large_retry;
 				stats->nr_thp_failed += thp_retry;
-				goto out;
+				rc_saved = rc;
+				if (list_empty(&unmap_folios))
+					goto out;
+				else
+					goto move;
+			case -EDEADLOCK:
+				/*
+				 * The folio cannot be locked for potential deadlock.
+				 * Go move (and unlock) all locked folios.  Then we can
+				 * try again.
+				 */
+				rc_saved = rc;
+				goto move;
 			case -EAGAIN:
 				if (is_large) {
 					large_retry++;
@@ -1725,6 +1789,15 @@  static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
 				stats->nr_succeeded += nr_pages;
 				stats->nr_thp_succeeded += is_thp;
 				break;
+			case MIGRATEPAGE_UNMAP:
+				/*
+				 * We have locked some folios, don't force lock
+				 * to avoid deadlock.
+				 */
+				force_lock = false;
+				list_move_tail(&folio->lru, &unmap_folios);
+				list_add_tail(&dst->lru, &dst_folios);
+				break;
 			default:
 				/*
 				 * Permanent failure (-EBUSY, etc.):
@@ -1748,12 +1821,95 @@  static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
 	nr_large_failed += large_retry;
 	stats->nr_thp_failed += thp_retry;
 	stats->nr_failed_pages += nr_retry_pages;
+move:
+	retry = 1;
+	for (pass = 0;
+	     pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
+	     pass++) {
+		retry = 0;
+		large_retry = 0;
+		thp_retry = 0;
+		nr_retry_pages = 0;
+
+		dst = list_first_entry(&dst_folios, struct folio, lru);
+		dst2 = list_next_entry(dst, lru);
+		list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
+			is_large = folio_test_large(folio);
+			is_thp = is_large && folio_test_pmd_mappable(folio);
+			nr_pages = folio_nr_pages(folio);
+
+			cond_resched();
+
+			rc = migrate_folio_move(put_new_page, private,
+						folio, dst, mode,
+						reason, ret_folios);
+			/*
+			 * The rules are:
+			 *	Success: folio will be freed
+			 *	-EAGAIN: stay on the unmap_folios list
+			 *	Other errno: put on ret_folios list
+			 */
+			switch(rc) {
+			case -EAGAIN:
+				if (is_large) {
+					large_retry++;
+					thp_retry += is_thp;
+				} else if (!no_split_folio_counting) {
+					retry++;
+				}
+				nr_retry_pages += nr_pages;
+				break;
+			case MIGRATEPAGE_SUCCESS:
+				stats->nr_succeeded += nr_pages;
+				stats->nr_thp_succeeded += is_thp;
+				break;
+			default:
+				if (is_large) {
+					nr_large_failed++;
+					stats->nr_thp_failed += is_thp;
+				} else if (!no_split_folio_counting) {
+					nr_failed++;
+				}
+
+				stats->nr_failed_pages += nr_pages;
+				break;
+			}
+			dst = dst2;
+			dst2 = list_next_entry(dst, lru);
+		}
+	}
+	nr_failed += retry;
+	nr_large_failed += large_retry;
+	stats->nr_thp_failed += thp_retry;
+	stats->nr_failed_pages += nr_retry_pages;
+
+	if (rc_saved)
+		rc = rc_saved;
+	else
+		rc = nr_failed + nr_large_failed;
+out:
+	/* Cleanup remaining folios */
+	dst = list_first_entry(&dst_folios, struct folio, lru);
+	dst2 = list_next_entry(dst, lru);
+	list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
+		int page_was_mapped = 0;
+		struct anon_vma *anon_vma = NULL;
+
+		__migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
+		migrate_folio_undo_src(folio, page_was_mapped, anon_vma,
+				       ret_folios);
+		list_del(&dst->lru);
+		migrate_folio_undo_dst(dst, put_new_page, private);
+		dst = dst2;
+		dst2 = list_next_entry(dst, lru);
+	}
+
 	/*
 	 * Try to migrate split folios of fail-to-migrate large folios, no
 	 * nr_failed counting in this round, since all split folios of a
 	 * large folio is counted as 1 failure in the first round.
 	 */
-	if (!list_empty(&split_folios)) {
+	if (rc >= 0 && !list_empty(&split_folios)) {
 		/*
 		 * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY
 		 * retries) to ret_folios to avoid migrating them again.
@@ -1761,12 +1917,16 @@  static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
 		list_splice_init(from, ret_folios);
 		list_splice_init(&split_folios, from);
 		no_split_folio_counting = true;
-		retry = 1;
-		goto split_folio_migration;
+		goto retry;
 	}
 
-	rc = nr_failed + nr_large_failed;
-out:
+	/*
+	 * We have unlocked all locked folios, so we can force lock now, let's
+	 * try again.
+	 */
+	if (rc == -EDEADLOCK)
+		goto retry;
+
 	return rc;
 }