@@ -10,7 +10,6 @@ enum compact_priority {
COMPACT_PRIO_SYNC_FULL,
MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_FULL,
COMPACT_PRIO_SYNC_LIGHT,
- MIN_COMPACT_COSTLY_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
COMPACT_PRIO_ASYNC,
INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC
@@ -1784,15 +1784,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
if (cc->order <= PAGE_ALLOC_COSTLY_ORDER)
return pfn;
- /*
- * Only allow kcompactd and direct requests for movable pages to
- * quickly clear out a MOVABLE pageblock for allocation. This
- * reduces the risk that a large movable pageblock is freed for
- * an unmovable/reclaimable small allocation.
- */
- if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE)
- return pfn;
-
/*
* When starting the migration scanner, pick any pageblock within the
* first half of the search space. Otherwise try and pick a pageblock
@@ -2065,8 +2056,7 @@ static bool should_proactive_compact_node(pg_data_t *pgdat)
static enum compact_result __compact_finished(struct compact_control *cc)
{
- unsigned int order;
- const int migratetype = cc->migratetype;
+ unsigned long mark;
int ret;
/* Compaction run completes if the migrate and free scanner meet */
@@ -2120,39 +2110,16 @@ static enum compact_result __compact_finished(struct compact_control *cc)
if (!pageblock_aligned(cc->migrate_pfn))
return COMPACT_CONTINUE;
- /* Direct compactor: Is a suitable page free? */
+ /* Done when watermarks are restored */
ret = COMPACT_NO_SUITABLE_PAGE;
- for (order = cc->order; order < MAX_ORDER; order++) {
- struct free_area *area = &cc->zone->free_area[order];
- bool can_steal;
-
- /* Job done if page is free of the right migratetype */
- if (!free_area_empty(area, migratetype))
- return COMPACT_SUCCESS;
-
-#ifdef CONFIG_CMA
- /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
- if (migratetype == MIGRATE_MOVABLE &&
- !free_area_empty(area, MIGRATE_CMA))
- return COMPACT_SUCCESS;
-#endif
- /*
- * Job done if allocation would steal freepages from
- * other migratetype buddy lists.
- */
- if (find_suitable_fallback(area, order, migratetype,
- true, &can_steal) != -1)
- /*
- * Movable pages are OK in any pageblock. If we are
- * stealing for a non-movable allocation, make sure
- * we finish compacting the current pageblock first
- * (which is assured by the above migrate_pfn align
- * check) so it is as free as possible and we won't
- * have to steal another one soon.
- */
- return COMPACT_SUCCESS;
- }
-
+ if (cc->direct_compaction)
+ mark = wmark_pages(cc->zone,
+ cc->alloc_flags & ALLOC_WMARK_MASK);
+ else
+ mark = high_wmark_pages(cc->zone);
+ if (zone_watermark_ok(cc->zone, cc->order, mark,
+ cc->highest_zoneidx, cc->alloc_flags))
+ return COMPACT_SUCCESS;
out:
if (cc->contended || fatal_signal_pending(current))
ret = COMPACT_CONTENDED;
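
The exit condition is now a plain watermark comparison: direct compactors stop at the allocation's own watermark, background compaction runs until the high mark. A standalone sketch of that decision, with simplified stand-ins for wmark_pages()/zone_watermark_ok() (the real helpers also fold in lowmem_reserve and allocation flags):

/* Sketch, not kernel code: the new __compact_finished() exit
 * condition. All types/helpers here are simplified stand-ins. */
#include <stdbool.h>
#include <stdio.h>

enum wmark { WMARK_MIN, WMARK_LOW, WMARK_HIGH };

struct zone_model {
        unsigned long watermark[3];     /* _watermark[] stand-in */
        long free_pages;                /* MIGRATE_FREE pages */
};

static bool watermark_ok(struct zone_model *z, unsigned int order,
                         unsigned long mark)
{
        /* same order correction as __zone_watermark_ok() */
        long free = z->free_pages - ((1L << order) - 1);

        return free > (long)mark;
}

/* Direct compaction stops at the allocation's own watermark;
 * kcompactd/proactive compaction works until the high mark. */
static bool compact_finished(struct zone_model *z, unsigned int order,
                             bool direct, enum wmark alloc_wmark)
{
        unsigned long mark = direct ? z->watermark[alloc_wmark] :
                                      z->watermark[WMARK_HIGH];

        return watermark_ok(z, order, mark);
}

int main(void)
{
        struct zone_model z = {
                .watermark = { 128, 256, 384 },
                .free_pages = 300,
        };

        /* direct request vs the low mark: done (300 - 3 > 256) */
        printf("direct:    %d\n", compact_finished(&z, 2, true, WMARK_LOW));
        /* kcompactd keeps going until the high mark is restored */
        printf("kcompactd: %d\n", compact_finished(&z, 2, false, WMARK_LOW));
        return 0;
}
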
@@ -2310,8 +2277,12 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
unsigned long watermark;
/* Allocation can already succeed, nothing to do */
- watermark = wmark_pages(cc->zone,
- cc->alloc_flags & ALLOC_WMARK_MASK);
+ if (cc->direct_compaction)
+ watermark = wmark_pages(cc->zone,
+ cc->alloc_flags &
+ ALLOC_WMARK_MASK);
+ else
+ watermark = high_wmark_pages(cc->zone);
if (zone_watermark_ok(cc->zone, cc->order, watermark,
cc->highest_zoneidx, cc->alloc_flags))
return COMPACT_SUCCESS;
@@ -2800,7 +2771,7 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
/* Allocation can succeed in any zone, done */
if (zone_watermark_ok(zone, pgdat->kcompactd_max_order,
- min_wmark_pages(zone),
+ high_wmark_pages(zone),
highest_zoneidx, 0))
return true;
@@ -2845,7 +2816,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
/* Allocation can already succeed, nothing to do */
if (zone_watermark_ok(zone, cc.order,
- min_wmark_pages(zone), zoneid, 0))
+ high_wmark_pages(zone), zoneid, 0))
continue;
if (compaction_suitable(zone, cc.order,
@@ -432,6 +432,7 @@ struct compact_control {
*/
struct capture_control {
struct compact_control *cc;
+ int order;
int migratetype;
struct page *page;
};
@@ -1075,7 +1075,7 @@ static inline bool
compaction_capture(struct zone *zone, struct page *page, int order,
int migratetype, struct capture_control *capc)
{
- if (!capc || order < capc->cc->order)
+ if (!capc || order < capc->order)
return false;
/* Do not accidentally pollute CMA or isolated regions */
@@ -1097,8 +1097,8 @@ compaction_capture(struct zone *zone, struct page *page, int order,
return false;
}
- if (order > capc->cc->order)
- expand(zone, page, capc->cc->order, order, migratetype);
+ if (order > capc->order)
+ expand(zone, page, capc->order, order, migratetype);
capc->page = page;
return true;
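
With capc->order decoupled from cc->order, capture can keep a smaller page than the order compaction worked towards; expand() hands the surplus buddy halves back to the freelists. A toy model of that split, with free_one() standing in for the kernel's freelist bookkeeping:

/* Sketch: how expand() splits a captured order-`high` page down to
 * the requested order-`low`, returning each upper buddy half. */
#include <stdio.h>

static void free_one(unsigned long pfn, unsigned int order)
{
        printf("freed pfn %lu at order %u\n", pfn, order);
}

static void expand(unsigned long pfn, unsigned int low, unsigned int high)
{
        unsigned long size = 1UL << high;

        while (high > low) {
                high--;
                size >>= 1;
                /* free the upper buddy, keep splitting the lower half */
                free_one(pfn + size, high);
        }
}

int main(void)
{
        /* Captured an order-4 page but only order-2 was requested:
         * the order-3 and order-2 tails go back to the freelist. */
        expand(0, 2, 4);
        return 0;
}
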
@@ -3649,15 +3649,15 @@ int __isolate_free_page(struct page *page, unsigned int order)
int mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt)) {
+ long free_pages = zone_page_state(zone, NR_FREE_PAGES);
unsigned long watermark;
/*
- * Obey watermarks as if the page was being allocated. We can
- * emulate a high-order watermark check with a raised order-0
- * watermark, because we already know our high-order page
- * exists.
+ * Keep a lid on concurrent compaction. MIGRATE_FREE
+ * watermarks alone cannot be checked here, because
+ * that's what the caller is trying to produce.
*/
watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+ if (!__zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA, free_pages))
return 0;
}
@@ -3976,27 +3976,59 @@ noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
}
ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
-static inline long __zone_watermark_unusable_free(struct zone *z,
- unsigned int order, unsigned int alloc_flags)
+static long page_state(struct zone *zone, enum zone_stat_item item, bool safe)
{
- const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
- long unusable_free = (1 << order) - 1;
+ if (safe)
+ return zone_page_state_snapshot(zone, item);
+ else
+ return zone_page_state(zone, item);
+}
+
+static long __zone_free_pages(struct zone *zone, int alloc_flags, bool safe)
+{
+ long free_pages;
/*
- * If the caller does not have rights to ALLOC_HARDER then subtract
- * the high-atomic reserves. This will over-estimate the size of the
- * atomic reserve but it avoids a search.
+ * Enforce watermarks against MIGRATE_FREE pages. This ensures
+ * that there is always a reserve of higher-order pages
+ * maintained for all migratetypes and allocation contexts.
+ *
+ * Allocations will still use up any compatible free pages
+ * that may exist inside claimed blocks first. But the reserve
+ * prevents smaller allocations from starving out higher-order
+ * requests (which may not be able to sleep, e.g. highatomic).
+ *
+ * The additional memory requirements of this are minimal. If
+ * internal free pages already exceed the compact_gap(), only
+ * compaction is necessary to restore the watermarks.
*/
- if (likely(!alloc_harder))
- unusable_free += z->nr_reserved_highatomic;
+ free_pages = page_state(zone, NR_FREE_FREE, safe);
+ if (alloc_flags & (ALLOC_HARDER | ALLOC_OOM))
+ free_pages += page_state(zone, NR_FREE_HIGHATOMIC, safe);
+ if (IS_ENABLED(CONFIG_CMA) && (alloc_flags & ALLOC_CMA))
+ free_pages += page_state(zone, NR_FREE_CMA_PAGES, safe);
-#ifdef CONFIG_CMA
- /* If allocation can't use CMA areas don't use free CMA pages */
- if (!(alloc_flags & ALLOC_CMA))
- unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
+ if (!IS_ENABLED(CONFIG_COMPACTION)) {
+ /*
+ * We can't reasonably defragment without compaction.
+ * Consider everything and do best-effort grouping.
+ */
+ free_pages += page_state(zone, NR_FREE_UNMOVABLE, safe);
+ free_pages += page_state(zone, NR_FREE_MOVABLE, safe);
+ free_pages += page_state(zone, NR_FREE_RECLAIMABLE, safe);
+ }
- return unusable_free;
+ return free_pages;
+}
+
+static long zone_free_pages(struct zone *zone, int alloc_flags)
+{
+ return __zone_free_pages(zone, alloc_flags, false);
+}
+
+static long zone_free_pages_safe(struct zone *zone, int alloc_flags)
+{
+ return __zone_free_pages(zone, alloc_flags, true);
}
/*
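
What the new accounting adds up, modeled in userspace: watermarks are enforced against NR_FREE_FREE (whole neutral blocks), optionally widened by the highatomic and CMA reserves, falling back to counting all migratetypes when compaction is compiled out. The NR_FREE_* items mirror the counters this series introduces; the vmstat plumbing (and the `safe` snapshot variant) is mocked as a plain array lookup:

/* Sketch: the zone_free_pages() accounting in userspace. */
#include <stdbool.h>
#include <stdio.h>

enum zone_stat_item {
        NR_FREE_FREE,           /* neutral, whole-block free pages */
        NR_FREE_HIGHATOMIC,
        NR_FREE_CMA_PAGES,
        NR_FREE_UNMOVABLE,
        NR_FREE_MOVABLE,
        NR_FREE_RECLAIMABLE,
        NR_STAT_ITEMS
};

#define ALLOC_HARDER    0x1     /* covers ALLOC_HARDER|ALLOC_OOM here */
#define ALLOC_CMA       0x2

static const bool config_compaction = true;     /* CONFIG_COMPACTION */

static long zone_free_pages(const long state[NR_STAT_ITEMS], int alloc_flags)
{
        /* Watermarks are enforced against MIGRATE_FREE blocks... */
        long free_pages = state[NR_FREE_FREE];

        if (alloc_flags & ALLOC_HARDER)
                free_pages += state[NR_FREE_HIGHATOMIC];
        if (alloc_flags & ALLOC_CMA)
                free_pages += state[NR_FREE_CMA_PAGES];

        /* ...unless compaction can't run; then count everything. */
        if (!config_compaction) {
                free_pages += state[NR_FREE_UNMOVABLE];
                free_pages += state[NR_FREE_MOVABLE];
                free_pages += state[NR_FREE_RECLAIMABLE];
        }
        return free_pages;
}

int main(void)
{
        const long state[NR_STAT_ITEMS] = { 512, 32, 128, 64, 256, 64 };

        printf("default: %ld\n", zone_free_pages(state, 0));         /* 512 */
        printf("cma:     %ld\n", zone_free_pages(state, ALLOC_CMA)); /* 640 */
        return 0;
}
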
@@ -4014,7 +4046,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
/* free_pages may go negative - that's OK */
- free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
+ free_pages -= (1 << order) - 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
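
With __zone_watermark_unusable_free() gone, the only per-order correction left is subtracting the (1 << order) - 1 pages that can never contribute to a page of the requested order. A reduced model of the remaining arithmetic (the ALLOC_HIGH halving shown; the harder/OOM quarter-steps and lowmem_reserve handling follow the same shape and are elided):

/* Sketch: the slimmed-down watermark math after this change. */
#include <stdbool.h>
#include <stdio.h>

#define ALLOC_HIGH      0x4

static bool watermark_ok(unsigned int order, unsigned long mark,
                         long lowmem_reserve, unsigned int alloc_flags,
                         long free_pages)
{
        long min = mark;

        /* free_pages may go negative - that's OK */
        free_pages -= (1L << order) - 1;

        if (alloc_flags & ALLOC_HIGH)
                min -= min / 2;

        return free_pages > min + lowmem_reserve;
}

int main(void)
{
        /* order-4 request: 15 pages of the count are unusable */
        printf("%d\n", watermark_ok(4, 100, 0, 0, 110));          /* 0 */
        printf("%d\n", watermark_ok(4, 100, 0, ALLOC_HIGH, 110)); /* 1 */
        return 0;
}
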
@@ -4076,33 +4108,22 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
int highest_zoneidx, unsigned int alloc_flags)
{
return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
- zone_page_state(z, NR_FREE_PAGES));
+ zone_free_pages(z, alloc_flags));
}
static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
unsigned long mark, int highest_zoneidx,
unsigned int alloc_flags, gfp_t gfp_mask)
{
- long free_pages;
-
- free_pages = zone_page_state(z, NR_FREE_PAGES);
+ long free_pages = zone_free_pages(z, alloc_flags);
/*
* Fast check for order-0 only. If this fails then the reserves
* need to be calculated.
*/
- if (!order) {
- long usable_free;
- long reserved;
-
- usable_free = free_pages;
- reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
-
- /* reserved may over estimate high-atomic reserves. */
- usable_free -= min(usable_free, reserved);
- if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
- return true;
- }
+ if (!order && (free_pages - ((1 << order) - 1) >
+ mark + z->lowmem_reserve[highest_zoneidx]))
+ return true;
if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
free_pages))
@@ -4126,13 +4147,8 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
unsigned long mark, int highest_zoneidx)
{
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
-
- if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
- free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
-
return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
- free_pages);
+ zone_free_pages_safe(z, 0));
}
#ifdef CONFIG_NUMA
@@ -4524,12 +4540,14 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned long pflags;
unsigned int noreclaim_flag;
struct capture_control capc = {
+ .order = order,
.migratetype = ac->migratetype,
.page = NULL,
};
+ int compact_order;
- if (!order)
- return NULL;
+ /* Use reclaim/compaction to produce neutral blocks */
+ compact_order = max_t(int, order, pageblock_order);
/*
* Make sure the structs are really initialized before we expose the
@@ -4543,8 +4561,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
delayacct_compact_start();
noreclaim_flag = memalloc_noreclaim_save();
- *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
- prio, &capc);
+ *compact_result = try_to_compact_pages(gfp_mask, compact_order,
+ alloc_flags, ac, prio, &capc);
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
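
The max_t() clamp here is the same "neutral blocks" pattern this series applies at every reclaim/compaction entry point (direct compaction, compaction retries, direct reclaim, kswapd wakeup): work is always requested in at least pageblock units, while the caller still gets only its original order back via the capture path. The pattern in isolation, with a PAGEBLOCK_ORDER of 9 assumed (2MB blocks with 4k pages):

/* Sketch: the recurring max_t(order, pageblock_order) clamp. */
#include <stdio.h>

#define PAGEBLOCK_ORDER 9

static int neutral_order(int order)
{
        /* Use reclaim/compaction to produce neutral blocks */
        return order > PAGEBLOCK_ORDER ? order : PAGEBLOCK_ORDER;
}

int main(void)
{
        /* order-0 and order-4 requests both compact whole blocks;
         * an order-10 request keeps its own size */
        printf("%d %d %d\n", neutral_order(0), neutral_order(4),
               neutral_order(10));
        return 0;
}
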
@@ -4608,13 +4626,12 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
enum compact_priority *compact_priority,
int *compaction_retries)
{
- int min_priority;
bool ret = false;
int retries = *compaction_retries;
enum compact_priority priority = *compact_priority;
- if (!order)
- return false;
+ /* Use reclaim/compaction to produce neutral blocks */
+ order = max_t(int, order, pageblock_order);
if (fatal_signal_pending(current))
return false;
@@ -4624,20 +4641,6 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
* failed, presumably due to a race. Retry a few times.
*/
if (compact_result == COMPACT_SUCCESS) {
- int max_retries = MAX_COMPACT_RETRIES;
-
- /*
- * !costly requests are much more important than
- * __GFP_RETRY_MAYFAIL costly ones because they are de
- * facto nofail and invoke OOM killer to move on while
- * costly can fail and users are ready to cope with
- * that. 1/4 retries is rather arbitrary but we would
- * need much more detailed feedback from compaction to
- * make a better decision.
- */
- if (order > PAGE_ALLOC_COSTLY_ORDER)
- max_retries /= 4;
-
ret = ++(*compaction_retries) <= MAX_COMPACT_RETRIES;
goto out;
}
@@ -4654,16 +4657,13 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
/*
* Compaction failed. Retry with increasing priority.
*/
- min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
- MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
-
- if (*compact_priority > min_priority) {
+ if (*compact_priority > MIN_COMPACT_PRIORITY) {
(*compact_priority)--;
*compaction_retries = 0;
ret = true;
}
out:
- trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
+ trace_compact_retry(order, priority, compact_result, retries, MAX_COMPACT_RETRIES, ret);
return ret;
}
#else
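
The retry policy that remains after dropping the costly-order special cases: every failed run escalates one priority level until sync-full, then gives up. A self-contained model using the patched enum (lower values mean higher priority, so the decrement escalates):

/* Sketch: the simplified should_compact_retry() escalation. The
 * enum mirrors the patched compact_priority, COSTLY variant gone. */
#include <stdbool.h>
#include <stdio.h>

enum compact_priority {
        COMPACT_PRIO_SYNC_FULL,
        MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_FULL,
        COMPACT_PRIO_SYNC_LIGHT,
        DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
        COMPACT_PRIO_ASYNC,
        INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC
};

static bool should_compact_retry(enum compact_priority *prio, int *retries)
{
        if (*prio > MIN_COMPACT_PRIORITY) {
                (*prio)--;      /* try harder next time */
                *retries = 0;
                return true;
        }
        return false;
}

int main(void)
{
        enum compact_priority prio = INIT_COMPACT_PRIORITY;
        int retries = 0;

        /* ASYNC -> SYNC_LIGHT -> SYNC_FULL, then stop */
        while (should_compact_retry(&prio, &retries))
                printf("retrying at priority %d\n", prio);
        return 0;
}
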
@@ -4822,9 +4822,16 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
unsigned long pflags;
bool drained = false;
+ int reclaim_order;
+
+ /* Use reclaim/compaction to produce neutral blocks */
+ if (IS_ENABLED(CONFIG_COMPACTION))
+ reclaim_order = max_t(int, order, pageblock_order);
+ else
+ reclaim_order = order;
psi_memstall_enter(&pflags);
- *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
+ *did_some_progress = __perform_reclaim(gfp_mask, reclaim_order, ac);
if (unlikely(!(*did_some_progress)))
goto out;
@@ -4856,6 +4863,10 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
pg_data_t *last_pgdat = NULL;
enum zone_type highest_zoneidx = ac->highest_zoneidx;
+ /* Use reclaim/compaction to produce neutral blocks */
+ if (IS_ENABLED(CONFIG_COMPACTION))
+ order = max_t(unsigned int, order, pageblock_order);
+
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
ac->nodemask) {
if (!managed_zone(zone))
@@ -4970,6 +4981,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
struct zoneref *z;
bool ret = false;
+ /*
+ * In the old world, order-0 pages only need reclaim, and
+ * higher orders might be present but the order-0 watermarks
+ * aren't met yet. These things can be fixed by reclaim alone.
+ *
+ * In the new world, though, watermark checks are against
+ * MIGRATE_FREE blocks. That means if the watermarks aren't
+ * met, reclaim isn't going to be the solution. Neither for
+ * order-0 nor for anything else. Whether it makes sense to
+ * retry depends fully on whether compaction should retry.
+ *
+ * should_compact_retry() already checks for COMPACT_SKIPPED
+ * and compaction_zonelist_suitable() to test whether reclaim
+ * is needed.
+ */
+ if (IS_ENABLED(CONFIG_COMPACTION))
+ goto schedule;
+
/*
* Costly allocations might have made a progress but this doesn't mean
* their order will become available due to high fragmentation so
@@ -5019,6 +5048,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
}
}
+schedule:
/*
* Memory allocation/reclaim might be called from a WQ context and the
* current implementation of the WQ concurrency control doesn't
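
A reduced model of the reshaped control flow, under the assumption spelled out in the comment above: with CONFIG_COMPACTION the function never grants a reclaim retry itself and only keeps its scheduling point (the WQ-aware sleep is stubbed out here, and the legacy branch reduced to a stub):

/* Sketch: should_reclaim_retry() deferring to compaction. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_RECLAIM_RETRIES     16

static const bool config_compaction = true;

static bool should_reclaim_retry(int no_progress_loops)
{
        bool ret = false;

        if (config_compaction)
                goto schedule;

        /* legacy path: order-0 watermark-vs-reclaimable checks
         * would go here and may set ret = true */
        ret = no_progress_loops < MAX_RECLAIM_RETRIES;

schedule:
        /* WQ-aware scheduling point retained in both worlds */
        return ret;
}

int main(void)
{
        /* should_compact_retry() now owns the retry decision */
        printf("retry reclaim: %d\n", should_reclaim_retry(0));
        return 0;
}
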
@@ -8833,6 +8863,13 @@ static void __setup_per_zone_wmarks(void)
mult_frac(zone_managed_pages(zone),
watermark_scale_factor, 10000));
+ /*
+ * Ensure the watermark delta is a multiple of the
+ * neutral block that reclaim/compaction produces.
+ */
+ if (IS_ENABLED(CONFIG_COMPACTION))
+ tmp = ALIGN(tmp, 1 << pageblock_order);
+
zone->watermark_boost = 0;
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
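
The effect of the ALIGN(): the low and high marks end up a whole number of pageblocks above min, so kswapd/kcompactd targets are expressible in the units compaction produces. Worked through with made-up example values (mult_frac() and ALIGN() follow their kernel definitions):

/* Sketch: watermark delta rounded up to whole pageblocks. */
#include <stdio.h>

#define PAGEBLOCK_NR    (1UL << 9)      /* pages per block, assumed */
#define ALIGN(x, a)     (((x) + (a) - 1) & ~((a) - 1))

static unsigned long mult_frac(unsigned long x, unsigned long n,
                               unsigned long d)
{
        return x / d * n + x % d * n / d;
}

int main(void)
{
        unsigned long managed = 1UL << 20;      /* 4GB zone, 4k pages */
        unsigned long min = 11518;              /* example min_wmark */
        unsigned long tmp;

        tmp = mult_frac(managed, 10, 10000);    /* scale factor 10 */
        tmp = ALIGN(tmp, PAGEBLOCK_NR);         /* 1048 -> 1536 */

        printf("low:  %lu\n", min + tmp);       /* min + 1536 */
        printf("high: %lu\n", min + 2 * tmp);   /* low + 1536 */
        return 0;
}
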
@@ -7438,8 +7438,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
/* Hopeless node, leave it to direct reclaim if possible */
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
- (pgdat_balanced(pgdat, order, highest_zoneidx) &&
- !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
+ pgdat_balanced(pgdat, order, highest_zoneidx)) {
/*
* There may be plenty of free memory available, but it's too
* fragmented for high-order allocations. Wake up kcompactd
@@ -7447,8 +7446,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
* needed. If it fails, it will defer subsequent attempts to
* ratelimit its work.
*/
- if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
- wakeup_kcompactd(pgdat, order, highest_zoneidx);
+ wakeup_kcompactd(pgdat, order, highest_zoneidx);
return;
}