[v3] xen: speed up grant-table reclaim

Message ID 20230624205624.1817-1-demi@invisiblethingslab.com
State New
Headers
Series [v3] xen: speed up grant-table reclaim |

Commit Message

Demi Marie Obenour June 24, 2023, 8:56 p.m. UTC
  When a grant entry is still in use by the remote domain, Linux must put
it on a deferred list.  Normally, this list is very short, because
the PV network and block protocols expect the backend to unmap the grant
first.  However, Qubes OS's GUI protocol is subject to the constraints
of the X Window System, and as such winds up with the frontend unmapping
the window first.  As a result, the list can grow very large, resulting
in a massive memory leak and eventual VM freeze.

To partially solve this problem, make the number of entries that the VM
will attempt to free at each iteration tunable.  The default is still
10, but it can be overridden at compile-time (via Kconfig), boot-time
(via a kernel command-line option), or runtime (via sysfs).

This is Cc: stable because (when combined with appropriate userspace
changes) it fixes a severe performance and stability problem for Qubes
OS users.

Cc: stable@vger.kernel.org
Signed-off-by: Demi Marie Obenour <demi@invisiblethingslab.com>
---
 drivers/xen/grant-table.c | 40 ++++++++++++++++++++++++++++-----------
 2 files changed, 41 insertions(+), 11 deletions(-)
  

Comments

Juergen Gross June 26, 2023, 1:05 p.m. UTC | #1
On 24.06.23 22:56, Demi Marie Obenour wrote:
> When a grant entry is still in use by the remote domain, Linux must put
> it on a deferred list.  Normally, this list is very short, because
> the PV network and block protocols expect the backend to unmap the grant
> first.  However, Qubes OS's GUI protocol is subject to the constraints
> of the X Window System, and as such winds up with the frontend unmapping
> the window first.  As a result, the list can grow very large, resulting
> in a massive memory leak and eventual VM freeze.
> 
> To partially solve this problem, make the number of entries that the VM
> will attempt to free at each iteration tunable.  The default is still
> 10, but it can be overridden at compile-time (via Kconfig), boot-time
> (via a kernel command-line option), or runtime (via sysfs).

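The Kconfig option has been dropped from this version of the patch, so the
commit message should no longer mention compile-time configuration.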

> 
> This is Cc: stable because (when combined with appropriate userspace
> changes) it fixes a severe performance and stability problem for Qubes
> OS users.
> 
> Cc: stable@vger.kernel.org
> Signed-off-by: Demi Marie Obenour <demi@invisiblethingslab.com>
> ---
>   drivers/xen/grant-table.c | 40 ++++++++++++++++++++++++++++-----------
>   2 files changed, 41 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
> index e1ec725c2819d4d5dede063eb00d86a6d52944c0..fa666aa6abc3e786dddc94f895641505ec0b23d8 100644
> --- a/drivers/xen/grant-table.c
> +++ b/drivers/xen/grant-table.c
> @@ -498,14 +498,20 @@ static LIST_HEAD(deferred_list);
>   static void gnttab_handle_deferred(struct timer_list *);
>   static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
>   
> +static atomic64_t deferred_count;
> +static atomic64_t leaked_count;
> +static unsigned int free_per_iteration = 10;
> +
>   static void gnttab_handle_deferred(struct timer_list *unused)
>   {
> -	unsigned int nr = 10;
> +	unsigned int nr = READ_ONCE(free_per_iteration);
> +	const bool ignore_limit = nr == 0;
>   	struct deferred_entry *first = NULL;
>   	unsigned long flags;
> +	size_t freed = 0;
>   
>   	spin_lock_irqsave(&gnttab_list_lock, flags);
> -	while (nr--) {
> +	while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
>   		struct deferred_entry *entry
>   			= list_first_entry(&deferred_list,
>   					   struct deferred_entry, list);
> @@ -515,10 +521,13 @@ static void gnttab_handle_deferred(struct timer_list *unused)
>   		list_del(&entry->list);
>   		spin_unlock_irqrestore(&gnttab_list_lock, flags);
>   		if (_gnttab_end_foreign_access_ref(entry->ref)) {
> +			uint64_t ret = atomic64_sub_return(1, &deferred_count);

Use atomic64_dec_return()?

Please add an empty line here, after the variable declaration.

>   			put_free_entry(entry->ref);
> -			pr_debug("freeing g.e. %#x (pfn %#lx)\n",
> -				 entry->ref, page_to_pfn(entry->page));
> +			pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
> +				 entry->ref, page_to_pfn(entry->page),
> +				 (unsigned long long)ret);
>   			put_page(entry->page);
> +			freed++;
>   			kfree(entry);
>   			entry = NULL;
>   		} else {
> @@ -530,21 +539,22 @@ static void gnttab_handle_deferred(struct timer_list *unused)
>   		spin_lock_irqsave(&gnttab_list_lock, flags);
>   		if (entry)
>   			list_add_tail(&entry->list, &deferred_list);
> -		else if (list_empty(&deferred_list))
> -			break;
>   	}
> -	if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
> +	if (list_empty(&deferred_list))
> +		WARN_ON(atomic64_read(&deferred_count));
> +	else if (!timer_pending(&deferred_timer)) {
>   		deferred_timer.expires = jiffies + HZ;
>   		add_timer(&deferred_timer);
>   	}
>   	spin_unlock_irqrestore(&gnttab_list_lock, flags);
> +	pr_debug("Freed %zu references", freed);
>   }
>   
>   static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
>   {
>   	struct deferred_entry *entry;
>   	gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
> -	const char *what = KERN_WARNING "leaking";
> +	uint64_t leaked, deferred;
>   
>   	entry = kmalloc(sizeof(*entry), gfp);
>   	if (!page) {
> @@ -567,12 +577,20 @@ static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
>   			add_timer(&deferred_timer);
>   		}
>   		spin_unlock_irqrestore(&gnttab_list_lock, flags);
> -		what = KERN_DEBUG "deferring";
> +		deferred = atomic64_add_return(1, &deferred_count);

Use atomic64_inc_return() (same below)?

> +		leaked = atomic64_read(&leaked_count);
> +		pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
> +			 ref, page ? page_to_pfn(page) : -1, deferred, leaked);
> +	} else {
> +		deferred = atomic64_read(&deferred_count);
> +		leaked = atomic64_add_return(1, &leaked_count);
> +		pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
> +			ref, page ? page_to_pfn(page) : -1, deferred, leaked);
>   	}
> -	printk("%s g.e. %#x (pfn %#lx)\n",
> -	       what, ref, page ? page_to_pfn(page) : -1);
>   }
>   
> +module_param(free_per_iteration, uint, 0600);
> +

As already mentioned for v2: please move this closer to the definition of the
related variable (free_per_iteration).

>   int gnttab_try_end_foreign_access(grant_ref_t ref)
>   {
>   	int ret = _gnttab_end_foreign_access_ref(ref);


Juergen
  

Patch

diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index e1ec725c2819d4d5dede063eb00d86a6d52944c0..fa666aa6abc3e786dddc94f895641505ec0b23d8 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -498,14 +498,20 @@  static LIST_HEAD(deferred_list);
 static void gnttab_handle_deferred(struct timer_list *);
 static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
 
+static atomic64_t deferred_count;
+static atomic64_t leaked_count;
+static unsigned int free_per_iteration = 10;
+
 static void gnttab_handle_deferred(struct timer_list *unused)
 {
-	unsigned int nr = 10;
+	unsigned int nr = READ_ONCE(free_per_iteration);
+	const bool ignore_limit = nr == 0;
 	struct deferred_entry *first = NULL;
 	unsigned long flags;
+	size_t freed = 0;
 
 	spin_lock_irqsave(&gnttab_list_lock, flags);
-	while (nr--) {
+	while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
 		struct deferred_entry *entry
 			= list_first_entry(&deferred_list,
 					   struct deferred_entry, list);
@@ -515,10 +521,13 @@  static void gnttab_handle_deferred(struct timer_list *unused)
 		list_del(&entry->list);
 		spin_unlock_irqrestore(&gnttab_list_lock, flags);
 		if (_gnttab_end_foreign_access_ref(entry->ref)) {
+			uint64_t ret = atomic64_sub_return(1, &deferred_count);
 			put_free_entry(entry->ref);
-			pr_debug("freeing g.e. %#x (pfn %#lx)\n",
-				 entry->ref, page_to_pfn(entry->page));
+			pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
+				 entry->ref, page_to_pfn(entry->page),
+				 (unsigned long long)ret);
 			put_page(entry->page);
+			freed++;
 			kfree(entry);
 			entry = NULL;
 		} else {
@@ -530,21 +539,22 @@  static void gnttab_handle_deferred(struct timer_list *unused)
 		spin_lock_irqsave(&gnttab_list_lock, flags);
 		if (entry)
 			list_add_tail(&entry->list, &deferred_list);
-		else if (list_empty(&deferred_list))
-			break;
 	}
-	if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
+	if (list_empty(&deferred_list))
+		WARN_ON(atomic64_read(&deferred_count));
+	else if (!timer_pending(&deferred_timer)) {
 		deferred_timer.expires = jiffies + HZ;
 		add_timer(&deferred_timer);
 	}
 	spin_unlock_irqrestore(&gnttab_list_lock, flags);
+	pr_debug("Freed %zu references", freed);
 }
 
 static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
 {
 	struct deferred_entry *entry;
 	gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
-	const char *what = KERN_WARNING "leaking";
+	uint64_t leaked, deferred;
 
 	entry = kmalloc(sizeof(*entry), gfp);
 	if (!page) {
@@ -567,12 +577,20 @@  static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
 			add_timer(&deferred_timer);
 		}
 		spin_unlock_irqrestore(&gnttab_list_lock, flags);
-		what = KERN_DEBUG "deferring";
+		deferred = atomic64_add_return(1, &deferred_count);
+		leaked = atomic64_read(&leaked_count);
+		pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+			 ref, page ? page_to_pfn(page) : -1, deferred, leaked);
+	} else {
+		deferred = atomic64_read(&deferred_count);
+		leaked = atomic64_add_return(1, &leaked_count);
+		pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+			ref, page ? page_to_pfn(page) : -1, deferred, leaked);
 	}
-	printk("%s g.e. %#x (pfn %#lx)\n",
-	       what, ref, page ? page_to_pfn(page) : -1);
 }
 
+module_param(free_per_iteration, uint, 0600);
+
 int gnttab_try_end_foreign_access(grant_ref_t ref)
 {
 	int ret = _gnttab_end_foreign_access_ref(ref);