[v2] nvme_core: scan namespaces asynchronously

Message ID 20240118210303.10484-1-stuart.w.hayes@gmail.com
State New
Headers
Series [v2] nvme_core: scan namespaces asynchronously |

Commit Message

stuart hayes Jan. 18, 2024, 9:03 p.m. UTC
  Use async function calls to make namespace scanning happen in parallel.

Without the patch, NVME namespaces are scanned serially, so it can take a
long time for all of a controller's namespaces to become available,
especially with a slower (TCP) interface with large number of namespaces.

The time it took for all namespaces to show up after connecting (via TCP)
to a controller with 1002 namespaces was measured:

network latency   without patch   with patch
     0                 6s            1s
    50ms             210s           10s
   100ms             417s           18s

Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>

--
V2: remove module param to enable/disable async scanning
    add scan time measurements to commit message
  

Comments

Sagi Grimberg Jan. 22, 2024, 9:13 a.m. UTC | #1
On 1/18/24 23:03, Stuart Hayes wrote:
> Use async function calls to make namespace scanning happen in parallel.
> 
> Without the patch, NVME namespaces are scanned serially, so it can take a
> long time for all of a controller's namespaces to become available,
> especially with a slower (TCP) interface with large number of namespaces.
> 
> The time it took for all namespaces to show up after connecting (via TCP)
> to a controller with 1002 namespaces was measured:
> 
> network latency   without patch   with patch
>       0                 6s            1s
>      50ms             210s           10s
>     100ms             417s           18s
> 

Impressive speedup. Not a very common use-case though...

> Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>
> 
> --
> V2: remove module param to enable/disable async scanning
>      add scan time measurements to commit message
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 0af612387083..069350f85b83 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -4,6 +4,7 @@
>    * Copyright (c) 2011-2014, Intel Corporation.
>    */
>   
> +#include <linux/async.h>
>   #include <linux/blkdev.h>
>   #include <linux/blk-mq.h>
>   #include <linux/blk-integrity.h>
> @@ -3812,12 +3813,38 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
>   		nvme_ns_remove(ns);
>   }
>   
> -static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
> +/*
> + * struct nvme_scan_state - keeps track of controller & NSIDs to scan
> + * @ctrl:	Controller on which namespaces are being scanned
> + * @count:	Next NSID to scan (for sequential scan), or
> + *		Index of next NSID to scan in ns_list (for list scan)
> + * @ns_list:	pointer to list of NSIDs to scan (NULL if sequential scan)
> + */
> +struct nvme_scan_state {
> +	struct nvme_ctrl *ctrl;
> +	atomic_t count;
> +	__le32 *ns_list;
> +};
> +
> +static void nvme_scan_ns(void *data, async_cookie_t cookie)

I think its better to call it nvme_scan_ns_async to indicate what
it is.

>   {
> -	struct nvme_ns_info info = { .nsid = nsid };
> +	struct nvme_ns_info info = {};
> +	struct nvme_scan_state *scan_state;
> +	struct nvme_ctrl *ctrl;
> +	u32 nsid;
>   	struct nvme_ns *ns;
>   	int ret;
>   
> +	scan_state = data;
> +	ctrl = scan_state->ctrl;

I think these assignments can be done on the declaration.

> +	nsid = (u32)atomic_fetch_add(1, &scan_state->count);
> +	/*
> +	 * get NSID from list (if scanning from a list, not sequentially)
> +	 */
> +	if (scan_state->ns_list)
> +		nsid = le32_to_cpu(scan_state->ns_list[nsid]);
> +

This is awkward. ns_list passed in optionally.
How about we limit this change to only operate on nvme_scan_ns_list?
If the controller is old or quirked to support only a sequential scan
it does not benefit from a parallel scan. I doubt that these controllers
are likely to expose a large number of namespaces anyways.

> +	info.nsid = nsid;
>   	if (nvme_identify_ns_descs(ctrl, &info))
>   		return;
>   
> @@ -3881,11 +3908,15 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
>   	__le32 *ns_list;
>   	u32 prev = 0;
>   	int ret = 0, i;
> +	ASYNC_DOMAIN(domain);
> +	struct nvme_scan_state scan_state;
>   
>   	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
>   	if (!ns_list)
>   		return -ENOMEM;
>   
> +	scan_state.ctrl = ctrl;
> +	scan_state.ns_list = ns_list;

Is there a need to have a local ns_list variable here?

>   	for (;;) {
>   		struct nvme_command cmd = {
>   			.identify.opcode	= nvme_admin_identify,
> @@ -3901,19 +3932,25 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
>   			goto free;
>   		}
>   
> +		/*
> +		 * scan list starting at list offset 0
> +		 */
> +		atomic_set(&scan_state.count, 0);
>   		for (i = 0; i < nr_entries; i++) {
>   			u32 nsid = le32_to_cpu(ns_list[i]);
>   
>   			if (!nsid)	/* end of the list? */
>   				goto out;
> -			nvme_scan_ns(ctrl, nsid);
> +			async_schedule_domain(nvme_scan_ns, &scan_state, &domain);
>   			while (++prev < nsid)
>   				nvme_ns_remove_by_nsid(ctrl, prev);
>   		}
> +		async_synchronize_full_domain(&domain);
>   	}
>    out:
>   	nvme_remove_invalid_namespaces(ctrl, prev);

Is it a good idea to remove the invalid namespaces before synchronizing
the async scans?

>    free:
> +	async_synchronize_full_domain(&domain);
>   	kfree(ns_list);
>   	return ret;
>   }
> @@ -3922,14 +3959,23 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
>   {
>   	struct nvme_id_ctrl *id;
>   	u32 nn, i;
> +	ASYNC_DOMAIN(domain);
> +	struct nvme_scan_state scan_state;
>   
>   	if (nvme_identify_ctrl(ctrl, &id))
>   		return;
>   	nn = le32_to_cpu(id->nn);
>   	kfree(id);
>   
> +	scan_state.ctrl = ctrl;
> +	/*
> +	 * scan sequentially starting at NSID 1
> +	 */
> +	atomic_set(&scan_state.count, 1);
> +	scan_state.ns_list = NULL;
>   	for (i = 1; i <= nn; i++)
> -		nvme_scan_ns(ctrl, i);
> +		async_schedule_domain(nvme_scan_ns, &scan_state, &domain);
> +	async_synchronize_full_domain(&domain);
>   
>   	nvme_remove_invalid_namespaces(ctrl, nn);
>   }

I think we need a blktest for this. ns scanning has been notorious when
running simultaneously with controller reset/reconnect/remove
sequences... Ideally a test with a larger number of namespaces to
exercise the code.

Also, make sure that blktest suite does not complain about anything
else.
  
Sagi Grimberg Jan. 22, 2024, 9:21 a.m. UTC | #2
Resending... didn't make it to the list, probably smtp issues....

On 1/22/24 11:13, Sagi Grimberg wrote:
> 
> 
> On 1/18/24 23:03, Stuart Hayes wrote:
>> Use async function calls to make namespace scanning happen in parallel.
>>
>> Without the patch, NVME namespaces are scanned serially, so it can take a
>> long time for all of a controller's namespaces to become available,
>> especially with a slower (TCP) interface with large number of namespaces.
>>
>> The time it took for all namespaces to show up after connecting (via TCP)
>> to a controller with 1002 namespaces was measured:
>>
>> network latency   without patch   with patch
>>       0                 6s            1s
>>      50ms             210s           10s
>>     100ms             417s           18s
>>
> 
> Impressive speedup. Not a very common use-case though...
> 
>> Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>
>>
>> -- 
>> V2: remove module param to enable/disable async scanning
>>      add scan time measurements to commit message
>>
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index 0af612387083..069350f85b83 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -4,6 +4,7 @@
>>    * Copyright (c) 2011-2014, Intel Corporation.
>>    */
>> +#include <linux/async.h>
>>   #include <linux/blkdev.h>
>>   #include <linux/blk-mq.h>
>>   #include <linux/blk-integrity.h>
>> @@ -3812,12 +3813,38 @@ static void nvme_validate_ns(struct nvme_ns 
>> *ns, struct nvme_ns_info *info)
>>           nvme_ns_remove(ns);
>>   }
>> -static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>> +/*
>> + * struct nvme_scan_state - keeps track of controller & NSIDs to scan
>> + * @ctrl:    Controller on which namespaces are being scanned
>> + * @count:    Next NSID to scan (for sequential scan), or
>> + *        Index of next NSID to scan in ns_list (for list scan)
>> + * @ns_list:    pointer to list of NSIDs to scan (NULL if sequential 
>> scan)
>> + */
>> +struct nvme_scan_state {
>> +    struct nvme_ctrl *ctrl;
>> +    atomic_t count;
>> +    __le32 *ns_list;
>> +};
>> +
>> +static void nvme_scan_ns(void *data, async_cookie_t cookie)
> 
> I think its better to call it nvme_scan_ns_async to indicate what
> it is.
> 
>>   {
>> -    struct nvme_ns_info info = { .nsid = nsid };
>> +    struct nvme_ns_info info = {};
>> +    struct nvme_scan_state *scan_state;
>> +    struct nvme_ctrl *ctrl;
>> +    u32 nsid;
>>       struct nvme_ns *ns;
>>       int ret;
>> +    scan_state = data;
>> +    ctrl = scan_state->ctrl;
> 
> I think these assignments can be done on the declaration.
> 
>> +    nsid = (u32)atomic_fetch_add(1, &scan_state->count);
>> +    /*
>> +     * get NSID from list (if scanning from a list, not sequentially)
>> +     */
>> +    if (scan_state->ns_list)
>> +        nsid = le32_to_cpu(scan_state->ns_list[nsid]);
>> +
> 
> This is awkward. ns_list passed in optionally.
> How about we limit this change to only operate on nvme_scan_ns_list?
> If the controller is old or quirked to support only a sequential scan
> it does not benefit from a parallel scan. I doubt that these controllers
> are likely to expose a large number of namespaces anyways.
> 
>> +    info.nsid = nsid;
>>       if (nvme_identify_ns_descs(ctrl, &info))
>>           return;
>> @@ -3881,11 +3908,15 @@ static int nvme_scan_ns_list(struct nvme_ctrl 
>> *ctrl)
>>       __le32 *ns_list;
>>       u32 prev = 0;
>>       int ret = 0, i;
>> +    ASYNC_DOMAIN(domain);
>> +    struct nvme_scan_state scan_state;
>>       ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
>>       if (!ns_list)
>>           return -ENOMEM;
>> +    scan_state.ctrl = ctrl;
>> +    scan_state.ns_list = ns_list;
> 
> Is there a need to have a local ns_list variable here?
> 
>>       for (;;) {
>>           struct nvme_command cmd = {
>>               .identify.opcode    = nvme_admin_identify,
>> @@ -3901,19 +3932,25 @@ static int nvme_scan_ns_list(struct nvme_ctrl 
>> *ctrl)
>>               goto free;
>>           }
>> +        /*
>> +         * scan list starting at list offset 0
>> +         */
>> +        atomic_set(&scan_state.count, 0);
>>           for (i = 0; i < nr_entries; i++) {
>>               u32 nsid = le32_to_cpu(ns_list[i]);
>>               if (!nsid)    /* end of the list? */
>>                   goto out;
>> -            nvme_scan_ns(ctrl, nsid);
>> +            async_schedule_domain(nvme_scan_ns, &scan_state, &domain);
>>               while (++prev < nsid)
>>                   nvme_ns_remove_by_nsid(ctrl, prev);
>>           }
>> +        async_synchronize_full_domain(&domain);
>>       }
>>    out:
>>       nvme_remove_invalid_namespaces(ctrl, prev);
> 
> Is it a good idea to remove the invalid namespaces before synchronizing
> the async scans?
> 
>>    free:
>> +    async_synchronize_full_domain(&domain);
>>       kfree(ns_list);
>>       return ret;
>>   }
>> @@ -3922,14 +3959,23 @@ static void nvme_scan_ns_sequential(struct 
>> nvme_ctrl *ctrl)
>>   {
>>       struct nvme_id_ctrl *id;
>>       u32 nn, i;
>> +    ASYNC_DOMAIN(domain);
>> +    struct nvme_scan_state scan_state;
>>       if (nvme_identify_ctrl(ctrl, &id))
>>           return;
>>       nn = le32_to_cpu(id->nn);
>>       kfree(id);
>> +    scan_state.ctrl = ctrl;
>> +    /*
>> +     * scan sequentially starting at NSID 1
>> +     */
>> +    atomic_set(&scan_state.count, 1);
>> +    scan_state.ns_list = NULL;
>>       for (i = 1; i <= nn; i++)
>> -        nvme_scan_ns(ctrl, i);
>> +        async_schedule_domain(nvme_scan_ns, &scan_state, &domain);
>> +    async_synchronize_full_domain(&domain);
>>       nvme_remove_invalid_namespaces(ctrl, nn);
>>   }
> 
> I think we need a blktest for this. ns scanning has been notorious when
> running simultaneously with controller reset/reconnect/remove
> sequences... Ideally a test with a larger number of namespaces to
> exercise the code.
> 
> Also, make sure that blktest suite does not complain about anything
> else.
  
Keith Busch Jan. 23, 2024, 4:37 p.m. UTC | #3
On Mon, Jan 22, 2024 at 11:13:15AM +0200, Sagi Grimberg wrote:
> On 1/18/24 23:03, Stuart Hayes wrote:
> > @@ -3901,19 +3932,25 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
> >   			goto free;
> >   		}
> > +		/*
> > +		 * scan list starting at list offset 0
> > +		 */
> > +		atomic_set(&scan_state.count, 0);
> >   		for (i = 0; i < nr_entries; i++) {
> >   			u32 nsid = le32_to_cpu(ns_list[i]);
> >   			if (!nsid)	/* end of the list? */
> >   				goto out;
> > -			nvme_scan_ns(ctrl, nsid);
> > +			async_schedule_domain(nvme_scan_ns, &scan_state, &domain);
> >   			while (++prev < nsid)
> >   				nvme_ns_remove_by_nsid(ctrl, prev);
> >   		}
> > +		async_synchronize_full_domain(&domain);

You mentioned async scanning was an improvement if you have 1000
namespaces, but wouldn't this be worse if you have very few namespaces?
IOW, the decision to use the async schedule should be based on
nr_entries, right?
  
Chaitanya Kulkarni Jan. 23, 2024, 8:21 p.m. UTC | #4
On 1/23/2024 8:37 AM, Keith Busch wrote:
> On Mon, Jan 22, 2024 at 11:13:15AM +0200, Sagi Grimberg wrote:
>> On 1/18/24 23:03, Stuart Hayes wrote:
>>> @@ -3901,19 +3932,25 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
>>>    			goto free;
>>>    		}
>>> +		/*
>>> +		 * scan list starting at list offset 0
>>> +		 */
>>> +		atomic_set(&scan_state.count, 0);
>>>    		for (i = 0; i < nr_entries; i++) {
>>>    			u32 nsid = le32_to_cpu(ns_list[i]);
>>>    			if (!nsid)	/* end of the list? */
>>>    				goto out;
>>> -			nvme_scan_ns(ctrl, nsid);
>>> +			async_schedule_domain(nvme_scan_ns, &scan_state, &domain);
>>>    			while (++prev < nsid)
>>>    				nvme_ns_remove_by_nsid(ctrl, prev);
>>>    		}
>>> +		async_synchronize_full_domain(&domain);
> 
> You mentioned async scanning was an improvement if you have 1000
> namespaces, but wouldn't this be worse if you have very few namespaces?
> IOW, the decision to use the async schedule should be based on
> nr_entries, right?
> 

Perhaps it's also helpful to documents the data for small number of
namespaces, we can think of collecting data something like this:-

NR Namespaces        Seq Scan        Async Scan
2
4
8
16
32
64
128
256
512
1024

If we find that difference is not that much then we can go ahead with
this patch, if it the difference is not acceptable to the point that it
will regress for common setups then we can make it configurable ?

-ck
  
stuart hayes Jan. 24, 2024, 7:32 p.m. UTC | #5
On 1/23/2024 2:21 PM, Chaitanya Kulkarni wrote:
> On 1/23/2024 8:37 AM, Keith Busch wrote:
>> On Mon, Jan 22, 2024 at 11:13:15AM +0200, Sagi Grimberg wrote:
>>> On 1/18/24 23:03, Stuart Hayes wrote:
>>>> @@ -3901,19 +3932,25 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
>>>>     			goto free;
>>>>     		}
>>>> +		/*
>>>> +		 * scan list starting at list offset 0
>>>> +		 */
>>>> +		atomic_set(&scan_state.count, 0);
>>>>     		for (i = 0; i < nr_entries; i++) {
>>>>     			u32 nsid = le32_to_cpu(ns_list[i]);
>>>>     			if (!nsid)	/* end of the list? */
>>>>     				goto out;
>>>> -			nvme_scan_ns(ctrl, nsid);
>>>> +			async_schedule_domain(nvme_scan_ns, &scan_state, &domain);
>>>>     			while (++prev < nsid)
>>>>     				nvme_ns_remove_by_nsid(ctrl, prev);
>>>>     		}
>>>> +		async_synchronize_full_domain(&domain);
>>
>> You mentioned async scanning was an improvement if you have 1000
>> namespaces, but wouldn't this be worse if you have very few namespaces?
>> IOW, the decision to use the async schedule should be based on
>> nr_entries, right?
>>
> 
> Perhaps it's also helpful to documents the data for small number of
> namespaces, we can think of collecting data something like this:-
> 
> NR Namespaces        Seq Scan        Async Scan
> 2
> 4
> 8
> 16
> 32
> 64
> 128
> 256
> 512
> 1024
> 
> If we find that difference is not that much then we can go ahead with
> this patch, if it the difference is not acceptable to the point that it
> will regress for common setups then we can make it configurable ?
> 
> -ck
> 
> 
I believe the only reason the async scanning should take any longer than
sync scanning is that nvme_scan_ns() has to wait on the workqueue until it
is scheduled.

Testing on my system (with pcie nvme devices with a single namespace), it
looks like it only takes a fraction of a ms (100us or less typically) for
that to happen.  Then it takes 6-10ms or more for the actual namesapce scan.

So scanning asynchronously, even using a local pcie device with a single
namespace, doesn't take significantly longer.  Of course I guess it might
take a bit longer on a busy system, but I wouldn't think that scanning
namespaces is a critical path where a few milliseconds would make much
difference (?).  It wouldn't be too hard to make it scan synchronously if
there aren't more than, say, a couple namespaces, but my opinion is that
the minimal benefit wouldn't be worth the extra code.
  
stuart hayes Jan. 24, 2024, 7:38 p.m. UTC | #6
On 1/22/2024 3:13 AM, Sagi Grimberg wrote:
> 
> 
> On 1/18/24 23:03, Stuart Hayes wrote:
>> Use async function calls to make namespace scanning happen in parallel.
>>
>> Without the patch, NVME namespaces are scanned serially, so it can take a
>> long time for all of a controller's namespaces to become available,
>> especially with a slower (TCP) interface with large number of namespaces.
>>
>> The time it took for all namespaces to show up after connecting (via TCP)
>> to a controller with 1002 namespaces was measured:
>>
>> network latency   without patch   with patch
>>       0                 6s            1s
>>      50ms             210s           10s
>>     100ms             417s           18s
>>
> 
> Impressive speedup. Not a very common use-case though...
> 
>> Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>
>>
>> -- 
>> V2: remove module param to enable/disable async scanning
>>      add scan time measurements to commit message
>>
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index 0af612387083..069350f85b83 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -4,6 +4,7 @@
>>    * Copyright (c) 2011-2014, Intel Corporation.
>>    */
>> +#include <linux/async.h>
>>   #include <linux/blkdev.h>
>>   #include <linux/blk-mq.h>
>>   #include <linux/blk-integrity.h>
>> @@ -3812,12 +3813,38 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
>>           nvme_ns_remove(ns);
>>   }
>> -static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>> +/*
>> + * struct nvme_scan_state - keeps track of controller & NSIDs to scan
>> + * @ctrl:    Controller on which namespaces are being scanned
>> + * @count:    Next NSID to scan (for sequential scan), or
>> + *        Index of next NSID to scan in ns_list (for list scan)
>> + * @ns_list:    pointer to list of NSIDs to scan (NULL if sequential scan)
>> + */
>> +struct nvme_scan_state {
>> +    struct nvme_ctrl *ctrl;
>> +    atomic_t count;
>> +    __le32 *ns_list;
>> +};
>> +
>> +static void nvme_scan_ns(void *data, async_cookie_t cookie)
> 
> I think its better to call it nvme_scan_ns_async to indicate what
> it is.
> 
>>   {
>> -    struct nvme_ns_info info = { .nsid = nsid };
>> +    struct nvme_ns_info info = {};
>> +    struct nvme_scan_state *scan_state;
>> +    struct nvme_ctrl *ctrl;
>> +    u32 nsid;
>>       struct nvme_ns *ns;
>>       int ret;
>> +    scan_state = data;
>> +    ctrl = scan_state->ctrl;
> 
> I think these assignments can be done on the declaration.
> 
>> +    nsid = (u32)atomic_fetch_add(1, &scan_state->count);
>> +    /*
>> +     * get NSID from list (if scanning from a list, not sequentially)
>> +     */
>> +    if (scan_state->ns_list)
>> +        nsid = le32_to_cpu(scan_state->ns_list[nsid]);
>> +
> 
> This is awkward. ns_list passed in optionally.
> How about we limit this change to only operate on nvme_scan_ns_list?
> If the controller is old or quirked to support only a sequential scan
> it does not benefit from a parallel scan. I doubt that these controllers
> are likely to expose a large number of namespaces anyways.
> 
>> +    info.nsid = nsid;
>>       if (nvme_identify_ns_descs(ctrl, &info))
>>           return;
>> @@ -3881,11 +3908,15 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
>>       __le32 *ns_list;
>>       u32 prev = 0;
>>       int ret = 0, i;
>> +    ASYNC_DOMAIN(domain);
>> +    struct nvme_scan_state scan_state;
>>       ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
>>       if (!ns_list)
>>           return -ENOMEM;
>> +    scan_state.ctrl = ctrl;
>> +    scan_state.ns_list = ns_list;
> 
> Is there a need to have a local ns_list variable here?
> 
>>       for (;;) {
>>           struct nvme_command cmd = {
>>               .identify.opcode    = nvme_admin_identify,
>> @@ -3901,19 +3932,25 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
>>               goto free;
>>           }
>> +        /*
>> +         * scan list starting at list offset 0
>> +         */
>> +        atomic_set(&scan_state.count, 0);
>>           for (i = 0; i < nr_entries; i++) {
>>               u32 nsid = le32_to_cpu(ns_list[i]);
>>               if (!nsid)    /* end of the list? */
>>                   goto out;
>> -            nvme_scan_ns(ctrl, nsid);
>> +            async_schedule_domain(nvme_scan_ns, &scan_state, &domain);
>>               while (++prev < nsid)
>>                   nvme_ns_remove_by_nsid(ctrl, prev);
>>           }
>> +        async_synchronize_full_domain(&domain);
>>       }
>>    out:
>>       nvme_remove_invalid_namespaces(ctrl, prev);
> 
> Is it a good idea to remove the invalid namespaces before synchronizing
> the async scans?
> 
>>    free:
>> +    async_synchronize_full_domain(&domain);
>>       kfree(ns_list);
>>       return ret;
>>   }
>> @@ -3922,14 +3959,23 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
>>   {
>>       struct nvme_id_ctrl *id;
>>       u32 nn, i;
>> +    ASYNC_DOMAIN(domain);
>> +    struct nvme_scan_state scan_state;
>>       if (nvme_identify_ctrl(ctrl, &id))
>>           return;
>>       nn = le32_to_cpu(id->nn);
>>       kfree(id);
>> +    scan_state.ctrl = ctrl;
>> +    /*
>> +     * scan sequentially starting at NSID 1
>> +     */
>> +    atomic_set(&scan_state.count, 1);
>> +    scan_state.ns_list = NULL;
>>       for (i = 1; i <= nn; i++)
>> -        nvme_scan_ns(ctrl, i);
>> +        async_schedule_domain(nvme_scan_ns, &scan_state, &domain);
>> +    async_synchronize_full_domain(&domain);
>>       nvme_remove_invalid_namespaces(ctrl, nn);
>>   }
> 
> I think we need a blktest for this. ns scanning has been notorious when
> running simultaneously with controller reset/reconnect/remove
> sequences... Ideally a test with a larger number of namespaces to
> exercise the code.
> 
> Also, make sure that blktest suite does not complain about anything
> else.

Thank you for the feedback on the patch, I agree with it.

I'm not sure how to implement a blktest suite for this, though.  I can look into it.
  

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0af612387083..069350f85b83 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4,6 +4,7 @@ 
  * Copyright (c) 2011-2014, Intel Corporation.
  */
 
+#include <linux/async.h>
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
 #include <linux/blk-integrity.h>
@@ -3812,12 +3813,38 @@  static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
 		nvme_ns_remove(ns);
 }
 
-static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+/*
+ * struct nvme_scan_state - keeps track of controller & NSIDs to scan
+ * @ctrl:	Controller on which namespaces are being scanned
+ * @count:	Next NSID to scan (for sequential scan), or
+ *		Index of next NSID to scan in ns_list (for list scan)
+ * @ns_list:	pointer to list of NSIDs to scan (NULL if sequential scan)
+ */
+struct nvme_scan_state {
+	struct nvme_ctrl *ctrl;
+	atomic_t count;
+	__le32 *ns_list;
+};
+
+static void nvme_scan_ns(void *data, async_cookie_t cookie)
 {
-	struct nvme_ns_info info = { .nsid = nsid };
+	struct nvme_ns_info info = {};
+	struct nvme_scan_state *scan_state;
+	struct nvme_ctrl *ctrl;
+	u32 nsid;
 	struct nvme_ns *ns;
 	int ret;
 
+	scan_state = data;
+	ctrl = scan_state->ctrl;
+	nsid = (u32)atomic_fetch_add(1, &scan_state->count);
+	/*
+	 * get NSID from list (if scanning from a list, not sequentially)
+	 */
+	if (scan_state->ns_list)
+		nsid = le32_to_cpu(scan_state->ns_list[nsid]);
+
+	info.nsid = nsid;
 	if (nvme_identify_ns_descs(ctrl, &info))
 		return;
 
@@ -3881,11 +3908,15 @@  static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
 	__le32 *ns_list;
 	u32 prev = 0;
 	int ret = 0, i;
+	ASYNC_DOMAIN(domain);
+	struct nvme_scan_state scan_state;
 
 	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
 	if (!ns_list)
 		return -ENOMEM;
 
+	scan_state.ctrl = ctrl;
+	scan_state.ns_list = ns_list;
 	for (;;) {
 		struct nvme_command cmd = {
 			.identify.opcode	= nvme_admin_identify,
@@ -3901,19 +3932,25 @@  static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
 			goto free;
 		}
 
+		/*
+		 * scan list starting at list offset 0
+		 */
+		atomic_set(&scan_state.count, 0);
 		for (i = 0; i < nr_entries; i++) {
 			u32 nsid = le32_to_cpu(ns_list[i]);
 
 			if (!nsid)	/* end of the list? */
 				goto out;
-			nvme_scan_ns(ctrl, nsid);
+			async_schedule_domain(nvme_scan_ns, &scan_state, &domain);
 			while (++prev < nsid)
 				nvme_ns_remove_by_nsid(ctrl, prev);
 		}
+		async_synchronize_full_domain(&domain);
 	}
  out:
 	nvme_remove_invalid_namespaces(ctrl, prev);
  free:
+	async_synchronize_full_domain(&domain);
 	kfree(ns_list);
 	return ret;
 }
@@ -3922,14 +3959,23 @@  static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
 {
 	struct nvme_id_ctrl *id;
 	u32 nn, i;
+	ASYNC_DOMAIN(domain);
+	struct nvme_scan_state scan_state;
 
 	if (nvme_identify_ctrl(ctrl, &id))
 		return;
 	nn = le32_to_cpu(id->nn);
 	kfree(id);
 
+	scan_state.ctrl = ctrl;
+	/*
+	 * scan sequentially starting at NSID 1
+	 */
+	atomic_set(&scan_state.count, 1);
+	scan_state.ns_list = NULL;
 	for (i = 1; i <= nn; i++)
-		nvme_scan_ns(ctrl, i);
+		async_schedule_domain(nvme_scan_ns, &scan_state, &domain);
+	async_synchronize_full_domain(&domain);
 
 	nvme_remove_invalid_namespaces(ctrl, nn);
 }