[v2,1/2] mm/memfd: refactor memfd_tag_pins() and memfd_wait_for_pins()
Commit Message
Both functions are the remaining users of total_mapcount(). Let's get
rid of the calls by converting the code to folios.
As it turns out, the code is unnecessarily complicated, especially:
1) We can query the number of pagecache references for a folio simply via
folio_nr_pages(). This will correctly handle other folio sizes in the
future (see the worked example below the diffstat).
2) The xas_set(xas, page->index + cache_count) call to increment the
iterator for large folios is not required. Remove it.
Further, simplify the XA_CHECK_SCHED check, counting each entry exactly
once.
Memfd pages can be swapped out when using shmem; leave xa_is_value()
checks in place.
Co-developed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
mm/memfd.c | 47 ++++++++++++++++++-----------------------------
1 file changed, 18 insertions(+), 29 deletions(-)
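A worked example for the new helper (not part of the patch; the numbers
assume a hypothetical order-2 shmem folio that is PTE-mapped by one
process). The page cache holds folio_nr_pages() references on the folio,
and each mapped page contributes one more reference that is also counted
in folio_mapcount(), so anything beyond that (e.g. a get_user_pages()
pin) shows up as an extra reference:

static bool memfd_folio_has_extra_refs(struct folio *folio)
{
	/*
	 * Unpinned, order-2, PTE-mapped once:
	 *	page cache references:	folio_nr_pages() == 4
	 *	mapping references:	folio_mapcount() == 4
	 *	folio_ref_count() == 8, and 8 - 4 == folio_nr_pages()
	 *
	 * A single extra pin raises the refcount to 9, and 9 - 4 != 4,
	 * so the folio is reported as pinned.
	 */
	return folio_ref_count(folio) - folio_mapcount(folio) !=
			folio_nr_pages(folio);
}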
Comments
On Mon, Feb 26, 2024 at 03:13:23PM +0100, David Hildenbrand wrote:
> + xas_for_each(xas, folio, ULONG_MAX) {
> + if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio))
> xas_set_mark(xas, MEMFD_TAG_PINNED);
... we decline to tag value entries here ...
> @@ -95,20 +90,15 @@ static int memfd_wait_for_pins(struct address_space *mapping)
>
> xas_set(&xas, 0);
> xas_lock_irq(&xas);
> - xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
> + xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
> bool clear = true;
>
> - cache_count = 1;
> - if (!xa_is_value(page) &&
> - PageTransHuge(page) && !PageHuge(page))
> - cache_count = HPAGE_PMD_NR;
> -
> - if (!xa_is_value(page) && cache_count !=
> - page_count(page) - total_mapcount(page)) {
> + if (!xa_is_value(folio) &&
> + memfd_folio_has_extra_refs(folio)) {
... so we don't need to test it here because we'll never see any value
entries. No?
On 26.02.24 17:07, Matthew Wilcox wrote:
> On Mon, Feb 26, 2024 at 03:13:23PM +0100, David Hildenbrand wrote:
>> + xas_for_each(xas, folio, ULONG_MAX) {
>> + if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio))
>> xas_set_mark(xas, MEMFD_TAG_PINNED);
>
> ... we decline to tag value entries here ...
>
>> @@ -95,20 +90,15 @@ static int memfd_wait_for_pins(struct address_space *mapping)
>>
>> xas_set(&xas, 0);
>> xas_lock_irq(&xas);
>> - xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
>> + xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
>> bool clear = true;
>>
>> - cache_count = 1;
>> - if (!xa_is_value(page) &&
>> - PageTransHuge(page) && !PageHuge(page))
>> - cache_count = HPAGE_PMD_NR;
>> -
>> - if (!xa_is_value(page) && cache_count !=
>> - page_count(page) - total_mapcount(page)) {
>> + if (!xa_is_value(folio) &&
>> + memfd_folio_has_extra_refs(folio)) {
>
> ... so we don't need to test it here because we'll never see any value
> entries. No?
I was not able to convince myself that swapout code would clear the mark
when replacing the entry.
shmem_writepage()->shmem_delete_from_page_cache()->shmem_replace_entry()
will perform a xas_store() with swp_to_radix_entry(swap) under
xa_lock_irq().
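Roughly, that last step is (a simplified sketch of shmem_replace_entry(),
assuming the caller already holds the xa_lock):

static int shmem_replace_entry(struct address_space *mapping,
		pgoff_t index, void *expected, void *replacement)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *item;

	item = xas_load(&xas);
	if (item != expected)
		return -ENOENT;
	/* Does this clear MEMFD_TAG_PINNED on the slot, or leave it set? */
	xas_store(&xas, replacement);
	return 0;
}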
Reading the doc, and staring at the code for a bit too long, I think
xas_store() would only clear tags when deleting an entry (passing NULL).
But maybe xas_store() will always clear tags?
In memfd code, I think we could see swapout between memfd_tag_pins() and
the check for tags, where we drop the xa_lock. Unless some other lock
(inode lock?) protects us.
Thanks!
On Mon, Feb 26, 2024 at 05:56:09PM +0100, David Hildenbrand wrote:
> > > @@ -95,20 +90,15 @@ static int memfd_wait_for_pins(struct address_space *mapping)
> > > xas_set(&xas, 0);
> > > xas_lock_irq(&xas);
> > > - xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
> > > + xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
> > > bool clear = true;
> > > - cache_count = 1;
> > > - if (!xa_is_value(page) &&
> > > - PageTransHuge(page) && !PageHuge(page))
> > > - cache_count = HPAGE_PMD_NR;
> > > -
> > > - if (!xa_is_value(page) && cache_count !=
> > > - page_count(page) - total_mapcount(page)) {
> > > + if (!xa_is_value(folio) &&
> > > + memfd_folio_has_extra_refs(folio)) {
> >
> > ... so we don't need to test it here because we'll never see any value
> > entries. No?
>
> I was not able to convince myself that swapout code would clear the mark
> when replacing the entry.
>
> shmem_writepage()->shmem_delete_from_page_cache()->shmem_replace_entry()
>
> will perform a xas_store() with swp_to_radix_entry(swap) under
> xa_lock_irq().
>
> Reading the doc, and staring at the code for a bit too long, I think
> xas_store() would only clear tags when deleting an entry (passing NULL).
>
> But maybe xas_store() will always clear tags?
No, xas_store() will leave the tag alone ... this is the right thing to
do for the pagecache because we always clear the tags before removing
a folio from the cache.
> In memfd code, I think we could see swapout between memfd_tag_pins() and the
> check for tags, where we drop the xa_lock. Unless some other lock (inode
> lock?) protects us.
... and if it does happen, we see the value entry tagged and clear the
tag on it. OK.
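Something like this (an untested sketch in the style of lib/test_xarray.c,
reusing its XA_BUG_ON() macro; check_store_keeps_mark() is a made-up name)
demonstrates it:

static DEFINE_XARRAY(xa);

static noinline void check_store_keeps_mark(void)
{
	/* A marked entry, standing in for a tagged folio ... */
	xa_store(&xa, 0, xa_mk_value(1), GFP_KERNEL);
	xa_set_mark(&xa, 0, XA_MARK_0);

	/*
	 * ... replaced in place, as swapout replaces a folio with a
	 * value entry ...
	 */
	xa_store(&xa, 0, xa_mk_value(2), GFP_KERNEL);

	/* ... still carries the mark: xa_store() leaves marks alone. */
	XA_BUG_ON(&xa, !xa_get_mark(&xa, 0, XA_MARK_0));

	xa_erase(&xa, 0);
}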
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
@@ -29,29 +29,25 @@
#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
#define LAST_SCAN 4 /* about 150ms max */
+static bool memfd_folio_has_extra_refs(struct folio *folio)
+{
+ return folio_ref_count(folio) - folio_mapcount(folio) !=
+ folio_nr_pages(folio);
+}
+
static void memfd_tag_pins(struct xa_state *xas)
{
- struct page *page;
+ struct folio *folio;
int latency = 0;
- int cache_count;
lru_add_drain();
xas_lock_irq(xas);
- xas_for_each(xas, page, ULONG_MAX) {
- cache_count = 1;
- if (!xa_is_value(page) &&
- PageTransHuge(page) && !PageHuge(page))
- cache_count = HPAGE_PMD_NR;
-
- if (!xa_is_value(page) &&
- page_count(page) - total_mapcount(page) != cache_count)
+ xas_for_each(xas, folio, ULONG_MAX) {
+ if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio))
xas_set_mark(xas, MEMFD_TAG_PINNED);
- if (cache_count != 1)
- xas_set(xas, page->index + cache_count);
- latency += cache_count;
- if (latency < XA_CHECK_SCHED)
+ if (++latency < XA_CHECK_SCHED)
continue;
latency = 0;
@@ -66,16 +62,16 @@ static void memfd_tag_pins(struct xa_state *xas)
/*
* Setting SEAL_WRITE requires us to verify there's no pending writer. However,
* via get_user_pages(), drivers might have some pending I/O without any active
- * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
+ * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios
* and see whether it has an elevated ref-count. If so, we tag them and wait for
* them to be dropped.
* The caller must guarantee that no new user will acquire writable references
- * to those pages to avoid races.
+ * to those folios to avoid races.
*/
static int memfd_wait_for_pins(struct address_space *mapping)
{
XA_STATE(xas, &mapping->i_pages, 0);
- struct page *page;
+ struct folio *folio;
int error, scan;
memfd_tag_pins(&xas);
@@ -83,7 +79,6 @@ static int memfd_wait_for_pins(struct address_space *mapping)
error = 0;
for (scan = 0; scan <= LAST_SCAN; scan++) {
int latency = 0;
- int cache_count;
if (!xas_marked(&xas, MEMFD_TAG_PINNED))
break;
@@ -95,20 +90,15 @@ static int memfd_wait_for_pins(struct address_space *mapping)
xas_set(&xas, 0);
xas_lock_irq(&xas);
- xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
+ xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
bool clear = true;
- cache_count = 1;
- if (!xa_is_value(page) &&
- PageTransHuge(page) && !PageHuge(page))
- cache_count = HPAGE_PMD_NR;
-
- if (!xa_is_value(page) && cache_count !=
- page_count(page) - total_mapcount(page)) {
+ if (!xa_is_value(folio) &&
+ memfd_folio_has_extra_refs(folio)) {
/*
* On the last scan, we clean up all those tags
* we inserted; but make a note that we still
- * found pages pinned.
+ * found folios pinned.
*/
if (scan == LAST_SCAN)
error = -EBUSY;
@@ -118,8 +108,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
if (clear)
xas_clear_mark(&xas, MEMFD_TAG_PINNED);
- latency += cache_count;
- if (latency < XA_CHECK_SCHED)
+ if (++latency < XA_CHECK_SCHED)
continue;
latency = 0;