[v3,06/11] scsi: scsi_debug: Dynamically allocate sdebug_queued_cmd

Message ID 20230327074310.1862889-7-john.g.garry@oracle.com
State New
Series Fix shost command overloading issues

Commit Message

John Garry March 27, 2023, 7:43 a.m. UTC
  Eventually we will drop the sdebug_queue struct as it is not really
required, so start by making struct sdebug_queued_cmd dynamically allocated
for the lifetime of the scsi_cmnd in the driver.

As an interim measure, embed struct sdebug_defer in struct
sdebug_queued_cmd (as member sd_dp), rather than allocating it separately.
Also keep the index of the slot allocated in sdebug_queue.qc_arr in struct
sdebug_queued_cmd (via sd_dp.sqa_idx).

To deal with races in accessing the struct sdebug_queued_cmd allocated for
a scsi cmnd, add a spinlock for the scsi command in its priv area. Races
may occur between scheduling a command for completion, aborting a command,
and the command actually completing and freeing the struct
sdebug_queued_cmd.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 drivers/scsi/scsi_debug.c | 427 ++++++++++++++++++++++----------------
 1 file changed, 253 insertions(+), 174 deletions(-)
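
The heart of the change is the per-command spinlock guarding the now
dynamically allocated struct sdebug_queued_cmd. A simplified sketch of the
pattern, condensed from the diff below (the helper name here is
illustrative; queue bookkeeping and error paths are omitted):

struct sdebug_scsi_cmd {
	spinlock_t lock;	/* lives in the scsi_cmnd priv area */
};

/*
 * Completion and abort both take the per-command lock before looking at
 * scmd->host_scribble, so only one path can detach the queued command
 * and free it.
 */
static void detach_and_free(struct scsi_cmnd *scmd)
{
	struct sdebug_scsi_cmd *sdsc = scsi_cmd_priv(scmd);
	struct sdebug_queued_cmd *sqcp;
	unsigned long flags;

	spin_lock_irqsave(&sdsc->lock, flags);
	sqcp = TO_QEUEUED_CMD(scmd);	/* NULL if the other path got here first */
	ASSIGN_QEUEUED_CMD(scmd, NULL);
	spin_unlock_irqrestore(&sdsc->lock, flags);

	if (sqcp)
		sdebug_free_queued_cmd(sqcp);	/* kmem_cache_free(), freed exactly once */
}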
  

Comments

Douglas Gilbert April 3, 2023, 5:18 a.m. UTC | #1
On 2023-03-27 03:43, John Garry wrote:
> Eventually we will drop the sdebug_queue struct as it is not really
> required, so start by making struct sdebug_queued_cmd dynamically allocated
> for the lifetime of the scsi_cmnd in the driver.
> 
> As an interim measure, embed struct sdebug_defer in struct
> sdebug_queued_cmd (as member sd_dp), rather than allocating it separately.
> Also keep the index of the slot allocated in sdebug_queue.qc_arr in struct
> sdebug_queued_cmd (via sd_dp.sqa_idx).
> 
> To deal with races in accessing the struct sdebug_queued_cmd allocated for
> a scsi cmnd, add a spinlock for the scsi command in its priv area. Races
> may occur between scheduling a command for completion, aborting a command,
> and the command actually completing and freeing the struct
> sdebug_queued_cmd.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
Acked-by: Douglas Gilbert <dgilbert@interlog.com>

Thanks.
  
Liu, Yujie April 7, 2023, 4:18 a.m. UTC | #2
Hello,

kernel test robot noticed "BUG_sdebug_queued_cmd(Tainted:G_S):Objects_remaining_in_sdebug_queued_cmd_on__kmem_cache_shutdown()" on:

commit: f28c8a7d0f7a705395439889a52b09e2b61ea422 ("[PATCH v3 06/11] scsi: scsi_debug: Dynamically allocate sdebug_queued_cmd")
url: https://github.com/intel-lab-lkp/linux/commits/John-Garry/scsi-scsi_debug-Fix-check-for-sdev-queue-full/20230327-154448
base: https://git.kernel.org/cgit/linux/kernel/git/mkp/scsi.git for-next
patch link: https://lore.kernel.org/all/20230327074310.1862889-7-john.g.garry@oracle.com/
patch subject: [PATCH v3 06/11] scsi: scsi_debug: Dynamically allocate sdebug_queued_cmd

in testcase: blktests
version: blktests-x86_64-676d42c-1_20230323
with the following parameters:

	disk: 1HDD
	test: scsi-group-00

compiler: gcc-11
test machine: 16 threads, 1 socket, Intel(R) Xeon(R) CPU D-1541 @ 2.10GHz (Broadwell-DE) with 48G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)


If you fix the issue, kindly add the following tags:
| Reported-by: kernel test robot <yujie.liu@intel.com>
| Link: https://lore.kernel.org/oe-lkp/202304071111.e762fcbd-yujie.liu@intel.com


[  101.910746][ T7924] scsi host6: waking up host to restart
[  101.910751][ T7924] scsi host6: scsi_eh_6: sleeping
[  101.976012][  T203] Buffer I/O error on dev sdc, logical block 2032, async page read
[  102.135530][ T8020] sd 6:0:0:0: [sdc] Synchronizing SCSI cache
[  102.312331][ T8020] =============================================================================
[  102.322321][ T8020] BUG sdebug_queued_cmd (Tainted: G S                ): Objects remaining in sdebug_queued_cmd on __kmem_cache_shutdown()
[  102.336810][ T8020] -----------------------------------------------------------------------------
[  102.336810][ T8020]
[  102.349880][ T8020] Slab 0x0000000013ac9b84 objects=32 used=1 fp=0x00000000a6dc3cb1 flags=0x17ffffc0010200(slab|head|node=0|zone=2|lastcpupid=0x1fffff)
[  102.365549][ T8020] CPU: 4 PID: 8020 Comm: modprobe Tainted: G S                 6.3.0-rc1-00188-gf28c8a7d0f7a #1
[  102.376919][ T8020] Hardware name: Supermicro SYS-5018D-FN4T/X10SDV-8C-TLN4F, BIOS 1.1 03/02/2016
[  102.386904][ T8020] Call Trace:
[  102.391151][ T8020]  <TASK>
[ 102.395042][ T8020] dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1))
[ 102.400503][ T8020] slab_err (mm/slub.c:995)
[ 102.405432][ T8020] ? _raw_spin_lock_bh (kernel/locking/spinlock.c:169)
[ 102.411316][ T8020] ? start_poll_synchronize_srcu (kernel/rcu/srcutree.c:1306)
[ 102.418070][ T8020] __kmem_cache_shutdown (include/linux/spinlock.h:350 mm/slub.c:4555 mm/slub.c:4586 mm/slub.c:4618)
[ 102.424308][ T8020] kmem_cache_destroy (mm/slab_common.c:457 mm/slab_common.c:497 mm/slab_common.c:480)
[ 102.430196][ T8020] scsi_debug_exit (drivers/scsi/scsi_debug.c:7807) scsi_debug
[ 102.436885][ T8020] __do_sys_delete_module+0x2ea/0x530
[ 102.444259][ T8020] ? module_flags (kernel/module/main.c:694)
[ 102.449892][ T8020] ? __fget_light (include/linux/atomic/atomic-arch-fallback.h:227 include/linux/atomic/atomic-instrumented.h:35 fs/file.c:1015)
[ 102.455439][ T8020] ? __blkcg_punt_bio_submit (block/blk-cgroup.c:1840)
[ 102.462034][ T8020] ? _raw_spin_lock (arch/x86/include/asm/atomic.h:202 include/linux/atomic/atomic-instrumented.h:543 include/asm-generic/qspinlock.h:111 include/linux/spinlock.h:186 include/linux/spinlock_api_smp.h:134 kernel/locking/spinlock.c:154)
[ 102.467667][ T8020] ? exit_to_user_mode_loop (include/linux/sched.h:2326 include/linux/resume_user_mode.h:61 kernel/entry/common.c:171)
[ 102.474080][ T8020] ? exit_to_user_mode_prepare (kernel/entry/common.c:203)
[ 102.480660][ T8020] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
[ 102.486014][ T8020] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
[  102.492844][ T8020] RIP: 0033:0x7f4dddaaa417
[ 102.498191][ T8020] Code: 73 01 c3 48 8b 0d 79 1a 0d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 b8 b0 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 49 1a 0d 00 f7 d8 64 89 01 48
All code
========
   0:   73 01                   jae    0x3
   2:   c3                      ret
   3:   48 8b 0d 79 1a 0d 00    mov    0xd1a79(%rip),%rcx        # 0xd1a83
   a:   f7 d8                   neg    %eax
   c:   64 89 01                mov    %eax,%fs:(%rcx)
   f:   48 83 c8 ff             or     $0xffffffffffffffff,%rax
  13:   c3                      ret
  14:   66 2e 0f 1f 84 00 00    cs nopw 0x0(%rax,%rax,1)
  1b:   00 00 00
  1e:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  23:   b8 b0 00 00 00          mov    $0xb0,%eax
  28:   0f 05                   syscall
  2a:*  48 3d 01 f0 ff ff       cmp    $0xfffffffffffff001,%rax         <-- trapping instruction
  30:   73 01                   jae    0x33
  32:   c3                      ret
  33:   48 8b 0d 49 1a 0d 00    mov    0xd1a49(%rip),%rcx        # 0xd1a83
  3a:   f7 d8                   neg    %eax
  3c:   64 89 01                mov    %eax,%fs:(%rcx)
  3f:   48                      rex.W

Code starting with the faulting instruction
===========================================
   0:   48 3d 01 f0 ff ff       cmp    $0xfffffffffffff001,%rax
   6:   73 01                   jae    0x9
   8:   c3                      ret
   9:   48 8b 0d 49 1a 0d 00    mov    0xd1a49(%rip),%rcx        # 0xd1a59
  10:   f7 d8                   neg    %eax
  12:   64 89 01                mov    %eax,%fs:(%rcx)
  15:   48                      rex.W
[  102.519919][ T8020] RSP: 002b:00007ffdf44e1448 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
[  102.529339][ T8020] RAX: ffffffffffffffda RBX: 0000564303e6dd60 RCX: 00007f4dddaaa417
[  102.538325][ T8020] RDX: 0000000000000000 RSI: 0000000000000800 RDI: 0000564303e6ddc8
[  102.547322][ T8020] RBP: 0000564303e6dd60 R08: 0000000000000000 R09: 0000000000000000
[  102.556318][ T8020] R10: 00007f4dddb2aac0 R11: 0000000000000206 R12: 0000564303e6ddc8
[  102.565315][ T8020] R13: 0000000000000000 R14: 0000000000000000 R15: 0000564303e6dee0
[  102.574310][ T8020]  </TASK>
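
The BUG above is SLUB's shutdown check: kmem_cache_destroy() calls
__kmem_cache_shutdown(), which reports any objects still allocated from
the cache. A generic sketch of the mechanism, not scsi_debug code (the
report is only printed with slab debugging enabled):

#include <linux/slab.h>

struct foo {
	int x;
};

static struct kmem_cache *foo_cache;
static void *leaked;

static void leak_then_destroy(void)
{
	foo_cache = KMEM_CACHE(foo, 0);
	if (!foo_cache)
		return;
	/* Allocate an object and never free it... */
	leaked = kmem_cache_zalloc(foo_cache, GFP_KERNEL);
	/*
	 * ...so destroying the cache trips the check and prints
	 * "Objects remaining in foo on __kmem_cache_shutdown()" plus a
	 * slab dump, as in the log above.
	 */
	kmem_cache_destroy(foo_cache);
}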
  
John Garry April 12, 2023, 10 a.m. UTC | #3
On 07/04/2023 05:18, kernel test robot wrote:
> Hello,
> 
> kernel test robot noticed "BUG_sdebug_queued_cmd(Tainted:G_S):Objects_remaining_in_sdebug_queued_cmd_on__kmem_cache_shutdown()" on:
> 
> commit: f28c8a7d0f7a705395439889a52b09e2b61ea422 ("[PATCH v3 06/11] scsi: scsi_debug: Dynamically allocate sdebug_queued_cmd")
> url: https://github.com/intel-lab-lkp/linux/commits/John-Garry/scsi-scsi_debug-Fix-check-for-sdev-queue-full/20230327-154448
> base: https://git.kernel.org/cgit/linux/kernel/git/mkp/scsi.git for-next
> patch link: https://lore.kernel.org/all/20230327074310.1862889-7-john.g.garry@oracle.com/
> patch subject: [PATCH v3 06/11] scsi: scsi_debug: Dynamically allocate sdebug_queued_cmd
> 
> in testcase: blktests
> version: blktests-x86_64-676d42c-1_20230323
> with the following parameters:
> 
> 	disk: 1HDD
> 	test: scsi-group-00
> 
> compiler: gcc-11
> test machine: 16 threads, 1 socket, Intel(R) Xeon(R) CPU D-1541 @ 2.10GHz (Broadwell-DE) with 48G memory
> 
> (please refer to attached dmesg/kmsg for entire log/backtrace)
> 
> 

I don't know how I missed this. Maybe it's because running blktests with 
buildroot initrd is not streamlined.

Anyway, the issue is that we don't properly abort the scsi cmnd in 
scsi_debug_device_reset() after the scsi cmnd times out for the 2nd time.

We got away with this in the previous code because all active IOs were 
terminated in scsi_debug_exit() -> stop_all_queued(), which was not the 
right thing to do.

I suppose scsi_debug_device_reset() should abort all IO for that sdev 
(which it doesn't do) - I'll look at making that change.
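
Roughly something like this, reusing scsi_debug_abort_cmnd() from this 
patch - untested sketch, the helper name is made up, and racing against 
concurrent completion still needs thought:

/* Abort every queued command that belongs to @sdp. */
static void scsi_debug_abort_all_queued_for_sdev(struct scsi_device *sdp)
{
	struct sdebug_queue *sqp;
	int j, k;

	for (j = 0, sqp = sdebug_q_arr; j < submit_queues; ++j, ++sqp) {
		for (k = 0; k < SDEBUG_CANQUEUE; ++k) {
			struct sdebug_queued_cmd *sqcp = sqp->qc_arr[k];

			if (sqcp && sqcp->scmd && sqcp->scmd->device == sdp)
				scsi_debug_abort_cmnd(sqcp->scmd);
		}
	}
}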

Thanks,
John

> If you fix the issue, kindly add the following tags:
> | Reported-by: kernel test robot <yujie.liu@intel.com>
> | Link: https://lore.kernel.org/oe-lkp/202304071111.e762fcbd-yujie.liu@intel.com
> 
> 
> [  101.910746][ T7924] scsi host6: waking up host to restart
> [  101.910751][ T7924] scsi host6: scsi_eh_6: sleeping
> [  101.976012][  T203] Buffer I/O error on dev sdc, logical block 2032, async page read
> [  102.135530][ T8020] sd 6:0:0:0: [sdc] Synchronizing SCSI cache
> [  102.312331][ T8020] =============================================================================
> [  102.322321][ T8020] BUG sdebug_queued_cmd (Tainted: G S                ): Objects remaining in sdebug_queued_cmd on __kmem_cache_shutdown()
> [  102.336810][ T8020] -----------------------------------------------------------------------------
> [  102.336810][ T8020]
> [  102.349880][ T8020] Slab 0x0000000013ac9b84 objects=32 used=1 fp=0x00000000a6dc3cb1 flags=0x17ffffc0010200(slab|head|node=0|zone=2|lastcpupid=0x1fffff)
> [  102.365549][ T8020] CPU: 4 PID: 8020 Comm: modprobe Tainted: G S                 6.3.0-rc1-00188-gf28c8a7d0f7a #1
> [  102.376919][ T8020] Hardware name: Supermicro SYS-5018D-FN4T/X10SDV-8C-TLN4F, BIOS 1.1 03/02/2016
> [  102.386904][ T8020] Call Trace:
> [  102.391151][ T8020]  <TASK>
> [ 102.395042][ T8020] dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1))
> [ 102.400503][ T8020] slab_err (mm/slub.c:995)
> [ 102.405432][ T8020] ? _raw_spin_lock_bh (kernel/locking/spinlock.c:169)
> [ 102.411316][ T8020] ? start_poll_synchronize_srcu (kernel/rcu/srcutree.c:1306)
> [ 102.418070][ T8020] __kmem_cache_shutdown (include/linux/spinlock.h:350 mm/slub.c:4555 mm/slub.c:4586 mm/slub.c:4618)
> [ 102.424308][ T8020] kmem_cache_destroy (mm/slab_common.c:457 mm/slab_common.c:497 mm/slab_common.c:480)
> [ 102.430196][ T8020] scsi_debug_exit (drivers/scsi/scsi_debug.c:7807) scsi_debug
> [ 102.436885][ T8020] __do_sys_delete_module+0x2ea/0x530
> [ 102.444259][ T8020] ? module_flags (kernel/module/main.c:694)
> [ 102.449892][ T8020] ? __fget_light (include/linux/atomic/atomic-arch-fallback.h:227 include/linux/atomic/atomic-instrumented.h:35 fs/file.c:1015)
> [ 102.455439][ T8020] ? __blkcg_punt_bio_submit (block/blk-cgroup.c:1840)
> [ 102.462034][ T8020] ? _raw_spin_lock (arch/x86/include/asm/atomic.h:202 include/linux/atomic/atomic-instrumented.h:543 include/asm-generic/qspinlock.h:111 include/linux/spinlock.h:186 include/linux/spinlock_api_smp.h:134 kernel/locking/spinlock.c:154)
> [ 102.467667][ T8020] ? exit_to_user_mode_loop (include/linux/sched.h:2326 include/linux/resume_user_mode.h:61 kernel/entry/common.c:171)
> [ 102.474080][ T8020] ? exit_to_user_mode_prepare (kernel/entry/common.c:203)
> [ 102.480660][ T8020] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
> [ 102.486014][ T8020] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
> [  102.492844][ T8020] RIP: 0033:0x7f4dddaaa417
> [ 102.498191][ T8020] Code: 73 01 c3 48 8b 0d 79 1a 0d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 b8 b0 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 49 1a 0d 00 f7 d8 64 89 01 48
> All code
  

Patch

diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index f53f3e78aaa1..7dd2dd6cbd6c 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -250,6 +250,11 @@  static const char *sdebug_version_date = "20210520";
 
 #define SDEB_XA_NOT_IN_USE XA_MARK_1
 
+static struct kmem_cache *queued_cmd_cache;
+
+#define TO_QEUEUED_CMD(scmd)  ((void *)(scmd)->host_scribble)
+#define ASSIGN_QEUEUED_CMD(scmnd, qc) { (scmnd)->host_scribble = (void *) qc; }
+
 /* Zone types (zbcr05 table 25) */
 enum sdebug_z_type {
 	ZBC_ZTYPE_CNV	= 0x1,
@@ -337,12 +342,8 @@  struct sdebug_defer {
 	struct execute_work ew;
 	ktime_t cmpl_ts;/* time since boot to complete this cmd */
 	int sqa_idx;	/* index of sdebug_queue array */
-	int qc_idx;	/* index of sdebug_queued_cmd array within sqa_idx */
 	int hc_idx;	/* hostwide tag index */
 	int issuing_cpu;
-	bool init_hrt;
-	bool init_wq;
-	bool init_poll;
 	bool aborted;	/* true when blk_abort_request() already called */
 	enum sdeb_defer_type defer_t;
 };
@@ -351,12 +352,16 @@  struct sdebug_queued_cmd {
 	/* corresponding bit set in in_use_bm[] in owning struct sdebug_queue
 	 * instance indicates this slot is in use.
 	 */
-	struct sdebug_defer *sd_dp;
-	struct scsi_cmnd *a_cmnd;
+	struct sdebug_defer sd_dp;
+	struct scsi_cmnd *scmd;
+};
+
+struct sdebug_scsi_cmd {
+	spinlock_t   lock;
 };
 
 struct sdebug_queue {
-	struct sdebug_queued_cmd qc_arr[SDEBUG_CANQUEUE];
+	struct sdebug_queued_cmd *qc_arr[SDEBUG_CANQUEUE];
 	unsigned long in_use_bm[SDEBUG_CANQUEUE_WORDS];
 	spinlock_t qc_lock;
 };
@@ -508,6 +513,8 @@  static int sdebug_add_store(void);
 static void sdebug_erase_store(int idx, struct sdeb_store_info *sip);
 static void sdebug_erase_all_stores(bool apart_from_first);
 
+static void sdebug_free_queued_cmd(struct sdebug_queued_cmd *sqcp);
+
 /*
  * The following are overflow arrays for cdbs that "hit" the same index in
  * the opcode_info_arr array. The most time sensitive (or commonly used) cdb
@@ -4919,46 +4926,48 @@  static u32 get_tag(struct scsi_cmnd *cmnd)
 /* Queued (deferred) command completions converge here. */
 static void sdebug_q_cmd_complete(struct sdebug_defer *sd_dp)
 {
-	bool aborted = sd_dp->aborted;
+	struct sdebug_queued_cmd *sqcp = container_of(sd_dp, struct sdebug_queued_cmd, sd_dp);
 	int qc_idx;
 	int retiring = 0;
-	unsigned long iflags;
+	unsigned long flags, iflags;
+	struct scsi_cmnd *scp = sqcp->scmd;
+	struct sdebug_scsi_cmd *sdsc;
+	bool aborted;
 	struct sdebug_queue *sqp;
-	struct sdebug_queued_cmd *sqcp;
-	struct scsi_cmnd *scp;
 
-	if (unlikely(aborted))
-		sd_dp->aborted = false;
-	qc_idx = sd_dp->qc_idx;
-	sqp = sdebug_q_arr + sd_dp->sqa_idx;
+	qc_idx = sd_dp->sqa_idx;
 	if (sdebug_statistics) {
 		atomic_inc(&sdebug_completions);
 		if (raw_smp_processor_id() != sd_dp->issuing_cpu)
 			atomic_inc(&sdebug_miss_cpus);
 	}
+	if (!scp) {
+		pr_err("scmd=NULL\n");
+		goto out;
+	}
 	if (unlikely((qc_idx < 0) || (qc_idx >= SDEBUG_CANQUEUE))) {
 		pr_err("wild qc_idx=%d\n", qc_idx);
-		return;
+		goto out;
 	}
+
+	sdsc = scsi_cmd_priv(scp);
+	sqp = get_queue(scp);
 	spin_lock_irqsave(&sqp->qc_lock, iflags);
-	WRITE_ONCE(sd_dp->defer_t, SDEB_DEFER_NONE);
-	sqcp = &sqp->qc_arr[qc_idx];
-	scp = sqcp->a_cmnd;
-	if (unlikely(scp == NULL)) {
-		spin_unlock_irqrestore(&sqp->qc_lock, iflags);
-		pr_err("scp is NULL, sqa_idx=%d, qc_idx=%d, hc_idx=%d\n",
-		       sd_dp->sqa_idx, qc_idx, sd_dp->hc_idx);
-		return;
-	}
+	spin_lock_irqsave(&sdsc->lock, flags);
+	aborted = sd_dp->aborted;
+	if (unlikely(aborted))
+		sd_dp->aborted = false;
+	ASSIGN_QEUEUED_CMD(scp, NULL);
 
 	if (unlikely(atomic_read(&retired_max_queue) > 0))
 		retiring = 1;
 
-	sqcp->a_cmnd = NULL;
+	sqp->qc_arr[qc_idx] = NULL;
 	if (unlikely(!test_and_clear_bit(qc_idx, sqp->in_use_bm))) {
+		spin_unlock_irqrestore(&sdsc->lock, flags);
 		spin_unlock_irqrestore(&sqp->qc_lock, iflags);
-		pr_err("Unexpected completion\n");
-		return;
+		pr_err("Unexpected completion qc_idx=%d\n", qc_idx);
+		goto out;
 	}
 
 	if (unlikely(retiring)) {	/* user has reduced max_queue */
@@ -4966,9 +4975,10 @@  static void sdebug_q_cmd_complete(struct sdebug_defer *sd_dp)
 
 		retval = atomic_read(&retired_max_queue);
 		if (qc_idx >= retval) {
+			spin_unlock_irqrestore(&sdsc->lock, flags);
 			spin_unlock_irqrestore(&sqp->qc_lock, iflags);
 			pr_err("index %d too large\n", retval);
-			return;
+			goto out;
 		}
 		k = find_last_bit(sqp->in_use_bm, retval);
 		if ((k < sdebug_max_queue) || (k == retval))
@@ -4976,14 +4986,19 @@  static void sdebug_q_cmd_complete(struct sdebug_defer *sd_dp)
 		else
 			atomic_set(&retired_max_queue, k + 1);
 	}
+
+	spin_unlock_irqrestore(&sdsc->lock, flags);
 	spin_unlock_irqrestore(&sqp->qc_lock, iflags);
-	if (unlikely(aborted)) {
-		if (sdebug_verbose)
-			pr_info("bypassing scsi_done() due to aborted cmd, kicking-off EH\n");
+
+	if (aborted) {
+		pr_info("bypassing scsi_done() due to aborted cmd, kicking-off EH\n");
 		blk_abort_request(scsi_cmd_to_rq(scp));
-		return;
+		goto out;
 	}
+
 	scsi_done(scp); /* callback to mid level */
+out:
+	sdebug_free_queued_cmd(sqcp);
 }
 
 /* When high resolution timer goes off this function is called. */
@@ -5233,115 +5248,126 @@  static void scsi_debug_slave_destroy(struct scsi_device *sdp)
 	}
 }
 
-static void stop_qc_helper(struct sdebug_defer *sd_dp,
+/* Returns true if we require the queued memory to be freed by the caller. */
+static bool stop_qc_helper(struct sdebug_defer *sd_dp,
 			   enum sdeb_defer_type defer_t)
 {
-	if (!sd_dp)
-		return;
-	if (defer_t == SDEB_DEFER_HRT)
-		hrtimer_cancel(&sd_dp->hrt);
-	else if (defer_t == SDEB_DEFER_WQ)
-		cancel_work_sync(&sd_dp->ew.work);
+	if (defer_t == SDEB_DEFER_HRT) {
+		int res = hrtimer_try_to_cancel(&sd_dp->hrt);
+
+		switch (res) {
+		case 0: /* Not active, it must have already run */
+		case -1: /* -1 It's executing the CB */
+			return false;
+		case 1: /* Was active, we've now cancelled */
+		default:
+			return true;
+		}
+	} else if (defer_t == SDEB_DEFER_WQ) {
+		/* Cancel if pending */
+		if (cancel_work_sync(&sd_dp->ew.work))
+			return true;
+		/* Was not pending, so it must have run */
+		return false;
+	} else if (defer_t == SDEB_DEFER_POLL) {
+		return true;
+	}
+
+	return false;
 }
 
-/* If @cmnd found deletes its timer or work queue and returns true; else
-   returns false */
-static bool stop_queued_cmnd(struct scsi_cmnd *cmnd)
+
+static bool scsi_debug_stop_cmnd(struct scsi_cmnd *cmnd, int *sqa_idx)
 {
-	unsigned long iflags;
-	int j, k, qmax, r_qmax;
 	enum sdeb_defer_type l_defer_t;
-	struct sdebug_queue *sqp;
 	struct sdebug_queued_cmd *sqcp;
 	struct sdebug_defer *sd_dp;
+	struct sdebug_scsi_cmd *sdsc = scsi_cmd_priv(cmnd);
 
-	for (j = 0, sqp = sdebug_q_arr; j < submit_queues; ++j, ++sqp) {
+	lockdep_assert_held(&sdsc->lock);
+
+	sqcp = TO_QEUEUED_CMD(cmnd);
+	if (!sqcp)
+		return false;
+	sd_dp = &sqcp->sd_dp;
+	if (sqa_idx)
+		*sqa_idx = sd_dp->sqa_idx;
+	l_defer_t = READ_ONCE(sd_dp->defer_t);
+	ASSIGN_QEUEUED_CMD(cmnd, NULL);
+
+	if (stop_qc_helper(sd_dp, l_defer_t))
+		sdebug_free_queued_cmd(sqcp);
+
+	return true;
+}
+
+/*
+ * Called from scsi_debug_abort() only, which is for timed-out cmd.
+ */
+static bool scsi_debug_abort_cmnd(struct scsi_cmnd *cmnd)
+{
+	struct sdebug_scsi_cmd *sdsc = scsi_cmd_priv(cmnd);
+	struct sdebug_queue *sqp = get_queue(cmnd);
+	unsigned long flags, iflags;
+	int k = -1;
+	bool res;
+
+	spin_lock_irqsave(&sdsc->lock, flags);
+	res = scsi_debug_stop_cmnd(cmnd, &k);
+	spin_unlock_irqrestore(&sdsc->lock, flags);
+
+	if (k >= 0) {
 		spin_lock_irqsave(&sqp->qc_lock, iflags);
-		qmax = sdebug_max_queue;
-		r_qmax = atomic_read(&retired_max_queue);
-		if (r_qmax > qmax)
-			qmax = r_qmax;
-		for (k = 0; k < qmax; ++k) {
-			if (test_bit(k, sqp->in_use_bm)) {
-				sqcp = &sqp->qc_arr[k];
-				if (cmnd != sqcp->a_cmnd)
-					continue;
-				/* found */
-				sqcp->a_cmnd = NULL;
-				sd_dp = sqcp->sd_dp;
-				if (sd_dp) {
-					l_defer_t = READ_ONCE(sd_dp->defer_t);
-					WRITE_ONCE(sd_dp->defer_t, SDEB_DEFER_NONE);
-				} else
-					l_defer_t = SDEB_DEFER_NONE;
-				spin_unlock_irqrestore(&sqp->qc_lock, iflags);
-				stop_qc_helper(sd_dp, l_defer_t);
-				clear_bit(k, sqp->in_use_bm);
-				return true;
-			}
-		}
+		clear_bit(k, sqp->in_use_bm);
+		sqp->qc_arr[k] = NULL;
 		spin_unlock_irqrestore(&sqp->qc_lock, iflags);
 	}
-	return false;
+
+	return res;
 }
 
 /* Deletes (stops) timers or work queues of all queued commands */
 static void stop_all_queued(void)
 {
-	unsigned long iflags;
+	unsigned long iflags, flags;
 	int j, k;
-	enum sdeb_defer_type l_defer_t;
 	struct sdebug_queue *sqp;
-	struct sdebug_queued_cmd *sqcp;
-	struct sdebug_defer *sd_dp;
 
 	for (j = 0, sqp = sdebug_q_arr; j < submit_queues; ++j, ++sqp) {
 		spin_lock_irqsave(&sqp->qc_lock, iflags);
 		for (k = 0; k < SDEBUG_CANQUEUE; ++k) {
 			if (test_bit(k, sqp->in_use_bm)) {
-				sqcp = &sqp->qc_arr[k];
-				if (sqcp->a_cmnd == NULL)
+				struct sdebug_queued_cmd *sqcp = sqp->qc_arr[k];
+				struct sdebug_scsi_cmd *sdsc;
+				struct scsi_cmnd *scmd;
+
+				if (!sqcp)
 					continue;
-				sqcp->a_cmnd = NULL;
-				sd_dp = sqcp->sd_dp;
-				if (sd_dp) {
-					l_defer_t = READ_ONCE(sd_dp->defer_t);
-					WRITE_ONCE(sd_dp->defer_t, SDEB_DEFER_NONE);
-				} else
-					l_defer_t = SDEB_DEFER_NONE;
-				spin_unlock_irqrestore(&sqp->qc_lock, iflags);
-				stop_qc_helper(sd_dp, l_defer_t);
+				scmd = sqcp->scmd;
+				if (!scmd)
+					continue;
+				sdsc = scsi_cmd_priv(scmd);
+				spin_lock_irqsave(&sdsc->lock, flags);
+				if (TO_QEUEUED_CMD(scmd) != sqcp) {
+					spin_unlock_irqrestore(&sdsc->lock, flags);
+					continue;
+				}
+				scsi_debug_stop_cmnd(scmd, NULL);
+				spin_unlock_irqrestore(&sdsc->lock, flags);
+				sqp->qc_arr[k] = NULL;
 				clear_bit(k, sqp->in_use_bm);
-				spin_lock_irqsave(&sqp->qc_lock, iflags);
 			}
 		}
 		spin_unlock_irqrestore(&sqp->qc_lock, iflags);
 	}
 }
 
-/* Free queued command memory on heap */
-static void free_all_queued(void)
-{
-	int j, k;
-	struct sdebug_queue *sqp;
-	struct sdebug_queued_cmd *sqcp;
-
-	for (j = 0, sqp = sdebug_q_arr; j < submit_queues; ++j, ++sqp) {
-		for (k = 0; k < SDEBUG_CANQUEUE; ++k) {
-			sqcp = &sqp->qc_arr[k];
-			kfree(sqcp->sd_dp);
-			sqcp->sd_dp = NULL;
-		}
-	}
-}
-
 static int scsi_debug_abort(struct scsi_cmnd *SCpnt)
 {
-	bool ok;
+	bool ok = scsi_debug_abort_cmnd(SCpnt);
 
 	++num_aborts;
 
-	ok = stop_queued_cmnd(SCpnt);
 	if (SDEBUG_OPT_ALL_NOISE & sdebug_opts)
 		sdev_printk(KERN_INFO, SCpnt->device,
 			    "%s: command%s found\n", __func__,
@@ -5543,6 +5569,34 @@  static bool inject_on_this_cmd(void)
 
 #define INCLUSIVE_TIMING_MAX_NS 1000000		/* 1 millisecond */
 
+
+void sdebug_free_queued_cmd(struct sdebug_queued_cmd *sqcp)
+{
+	if (sqcp)
+		kmem_cache_free(queued_cmd_cache, sqcp);
+}
+
+static struct sdebug_queued_cmd *sdebug_alloc_queued_cmd(struct scsi_cmnd *scmd)
+{
+	struct sdebug_queued_cmd *sqcp;
+	struct sdebug_defer *sd_dp;
+
+	sqcp = kmem_cache_zalloc(queued_cmd_cache, GFP_ATOMIC);
+	if (!sqcp)
+		return NULL;
+
+	sd_dp = &sqcp->sd_dp;
+
+	hrtimer_init(&sd_dp->hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	sd_dp->hrt.function = sdebug_q_cmd_hrt_complete;
+	INIT_WORK(&sd_dp->ew.work, sdebug_q_cmd_wq_complete);
+
+	sqcp->scmd = scmd;
+	sd_dp->sqa_idx = -1;
+
+	return sqcp;
+}
+
 /* Complete the processing of the thread that queued a SCSI command to this
  * driver. It either completes the command by calling cmnd_done() or
  * schedules a hr timer or work queue then returns 0. Returns
@@ -5554,15 +5608,16 @@  static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
 				    struct sdebug_dev_info *),
 			 int delta_jiff, int ndelay)
 {
-	bool new_sd_dp;
-	bool polled = scsi_cmd_to_rq(cmnd)->cmd_flags & REQ_POLLED;
-	int k;
-	unsigned long iflags;
+	struct request *rq = scsi_cmd_to_rq(cmnd);
+	bool polled = rq->cmd_flags & REQ_POLLED;
+	struct sdebug_scsi_cmd *sdsc = scsi_cmd_priv(cmnd);
+	unsigned long iflags, flags;
 	u64 ns_from_boot = 0;
 	struct sdebug_queue *sqp;
 	struct sdebug_queued_cmd *sqcp;
 	struct scsi_device *sdp;
 	struct sdebug_defer *sd_dp;
+	int k;
 
 	if (unlikely(devip == NULL)) {
 		if (scsi_result == 0)
@@ -5606,22 +5661,17 @@  static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
 		goto respond_in_thread;
 	}
 	set_bit(k, sqp->in_use_bm);
-	sqcp = &sqp->qc_arr[k];
-	sqcp->a_cmnd = cmnd;
-	cmnd->host_scribble = (unsigned char *)sqcp;
-	sd_dp = sqcp->sd_dp;
-	spin_unlock_irqrestore(&sqp->qc_lock, iflags);
 
-	if (!sd_dp) {
-		sd_dp = kzalloc(sizeof(*sd_dp), GFP_ATOMIC);
-		if (!sd_dp) {
-			clear_bit(k, sqp->in_use_bm);
-			return SCSI_MLQUEUE_HOST_BUSY;
-		}
-		new_sd_dp = true;
-	} else {
-		new_sd_dp = false;
+	sqcp = sdebug_alloc_queued_cmd(cmnd);
+	if (!sqcp) {
+		clear_bit(k, sqp->in_use_bm);
+		spin_unlock_irqrestore(&sqp->qc_lock, iflags);
+		return SCSI_MLQUEUE_HOST_BUSY;
 	}
+	sd_dp = &sqcp->sd_dp;
+	sd_dp->sqa_idx = k;
+	sqp->qc_arr[k] = sqcp;
+	spin_unlock_irqrestore(&sqp->qc_lock, iflags);
 
 	/* Set the hostwide tag */
 	if (sdebug_host_max_queue)
@@ -5673,12 +5723,11 @@  static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
 
 				if (kt <= d) {	/* elapsed duration >= kt */
 					spin_lock_irqsave(&sqp->qc_lock, iflags);
-					sqcp->a_cmnd = NULL;
+					sqp->qc_arr[k] = NULL;
 					clear_bit(k, sqp->in_use_bm);
 					spin_unlock_irqrestore(&sqp->qc_lock, iflags);
-					if (new_sd_dp)
-						kfree(sd_dp);
 					/* call scsi_done() from this thread */
+					sdebug_free_queued_cmd(sqcp);
 					scsi_done(cmnd);
 					return 0;
 				}
@@ -5686,33 +5735,28 @@  static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
 				kt -= d;
 			}
 		}
+		if (sdebug_statistics)
+			sd_dp->issuing_cpu = raw_smp_processor_id();
 		if (polled) {
+			spin_lock_irqsave(&sdsc->lock, flags);
 			sd_dp->cmpl_ts = ktime_add(ns_to_ktime(ns_from_boot), kt);
-			spin_lock_irqsave(&sqp->qc_lock, iflags);
-			if (!sd_dp->init_poll) {
-				sd_dp->init_poll = true;
-				sqcp->sd_dp = sd_dp;
-				sd_dp->sqa_idx = sqp - sdebug_q_arr;
-				sd_dp->qc_idx = k;
-			}
+			ASSIGN_QEUEUED_CMD(cmnd, sqcp);
 			WRITE_ONCE(sd_dp->defer_t, SDEB_DEFER_POLL);
-			spin_unlock_irqrestore(&sqp->qc_lock, iflags);
+			spin_unlock_irqrestore(&sdsc->lock, flags);
 		} else {
-			if (!sd_dp->init_hrt) {
-				sd_dp->init_hrt = true;
-				sqcp->sd_dp = sd_dp;
-				hrtimer_init(&sd_dp->hrt, CLOCK_MONOTONIC,
-					     HRTIMER_MODE_REL_PINNED);
-				sd_dp->hrt.function = sdebug_q_cmd_hrt_complete;
-				sd_dp->sqa_idx = sqp - sdebug_q_arr;
-				sd_dp->qc_idx = k;
-			}
-			WRITE_ONCE(sd_dp->defer_t, SDEB_DEFER_HRT);
 			/* schedule the invocation of scsi_done() for a later time */
+			spin_lock_irqsave(&sdsc->lock, flags);
+			ASSIGN_QEUEUED_CMD(cmnd, sqcp);
+			WRITE_ONCE(sd_dp->defer_t, SDEB_DEFER_HRT);
 			hrtimer_start(&sd_dp->hrt, kt, HRTIMER_MODE_REL_PINNED);
+			/*
+			 * The completion handler will try to grab sqcp->lock,
+			 * so there is no chance that the completion handler
+			 * will call scsi_done() until we release the lock
+			 * here (so ok to keep referencing sdsc).
+			 */
+			spin_unlock_irqrestore(&sdsc->lock, flags);
 		}
-		if (sdebug_statistics)
-			sd_dp->issuing_cpu = raw_smp_processor_id();
 	} else {	/* jdelay < 0, use work queue */
 		if (unlikely((sdebug_opts & SDEBUG_OPT_CMD_ABORT) &&
 			     atomic_read(&sdeb_inject_pending))) {
@@ -5722,30 +5766,21 @@  static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
 				    blk_mq_unique_tag_to_tag(get_tag(cmnd)));
 		}
 
+		if (sdebug_statistics)
+			sd_dp->issuing_cpu = raw_smp_processor_id();
 		if (polled) {
+			spin_lock_irqsave(&sdsc->lock, flags);
+			ASSIGN_QEUEUED_CMD(cmnd, sqcp);
 			sd_dp->cmpl_ts = ns_to_ktime(ns_from_boot);
-			spin_lock_irqsave(&sqp->qc_lock, iflags);
-			if (!sd_dp->init_poll) {
-				sd_dp->init_poll = true;
-				sqcp->sd_dp = sd_dp;
-				sd_dp->sqa_idx = sqp - sdebug_q_arr;
-				sd_dp->qc_idx = k;
-			}
 			WRITE_ONCE(sd_dp->defer_t, SDEB_DEFER_POLL);
-			spin_unlock_irqrestore(&sqp->qc_lock, iflags);
+			spin_unlock_irqrestore(&sdsc->lock, flags);
 		} else {
-			if (!sd_dp->init_wq) {
-				sd_dp->init_wq = true;
-				sqcp->sd_dp = sd_dp;
-				sd_dp->sqa_idx = sqp - sdebug_q_arr;
-				sd_dp->qc_idx = k;
-				INIT_WORK(&sd_dp->ew.work, sdebug_q_cmd_wq_complete);
-			}
+			spin_lock_irqsave(&sdsc->lock, flags);
+			ASSIGN_QEUEUED_CMD(cmnd, sqcp);
 			WRITE_ONCE(sd_dp->defer_t, SDEB_DEFER_WQ);
 			schedule_work(&sd_dp->ew.work);
+			spin_unlock_irqrestore(&sdsc->lock, flags);
 		}
-		if (sdebug_statistics)
-			sd_dp->issuing_cpu = raw_smp_processor_id();
 	}
 
 	return 0;
@@ -7066,6 +7101,10 @@  static int __init scsi_debug_init(void)
 	hosts_to_add = sdebug_add_host;
 	sdebug_add_host = 0;
 
+	queued_cmd_cache = KMEM_CACHE(sdebug_queued_cmd, SLAB_HWCACHE_ALIGN);
+	if (!queued_cmd_cache)
+		goto driver_unreg;
+
 	for (k = 0; k < hosts_to_add; k++) {
 		if (want_store && k == 0) {
 			ret = sdebug_add_host_helper(idx);
@@ -7088,6 +7127,8 @@  static int __init scsi_debug_init(void)
 
 	return 0;
 
+driver_unreg:
+	driver_unregister(&sdebug_driverfs_driver);
 bus_unreg:
 	bus_unregister(&pseudo_lld_bus);
 dev_unreg:
@@ -7103,10 +7144,9 @@  static void __exit scsi_debug_exit(void)
 {
 	int k = sdebug_num_hosts;
 
-	stop_all_queued();
 	for (; k; k--)
 		sdebug_do_remove_host(true);
-	free_all_queued();
+	kmem_cache_destroy(queued_cmd_cache);
 	driver_unregister(&sdebug_driverfs_driver);
 	bus_unregister(&pseudo_lld_bus);
 	root_device_unregister(pseudo_primary);
@@ -7493,6 +7533,8 @@  static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
 		goto unlock;
 
 	for (first = true; first || qc_idx + 1 < sdebug_max_queue; )   {
+		unsigned long flags;
+		struct sdebug_scsi_cmd *sdsc;
 		if (first) {
 			first = false;
 			if (!test_bit(qc_idx, sqp->in_use_bm))
@@ -7503,37 +7545,60 @@  static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
 		if (qc_idx >= sdebug_max_queue)
 			break;
 
-		sqcp = &sqp->qc_arr[qc_idx];
-		sd_dp = sqcp->sd_dp;
-		if (unlikely(!sd_dp))
-			continue;
-		scp = sqcp->a_cmnd;
+		sqcp = sqp->qc_arr[qc_idx];
+		if (!sqcp) {
+			pr_err("sqcp is NULL, queue_num=%d, qc_idx=%u from %s\n",
+			       queue_num, qc_idx, __func__);
+			break;
+		}
+		sd_dp = &sqcp->sd_dp;
+
+		scp = sqcp->scmd;
 		if (unlikely(scp == NULL)) {
 			pr_err("scp is NULL, queue_num=%d, qc_idx=%u from %s\n",
 			       queue_num, qc_idx, __func__);
 			break;
 		}
+		sdsc = scsi_cmd_priv(scp);
+		spin_lock_irqsave(&sdsc->lock, flags);
 		if (READ_ONCE(sd_dp->defer_t) == SDEB_DEFER_POLL) {
-			if (kt_from_boot < sd_dp->cmpl_ts)
+			struct sdebug_queued_cmd *_sqcp = TO_QEUEUED_CMD(scp);
+
+			if (_sqcp != sqcp) {
+				pr_err("inconsistent queued cmd tag=%#x\n",
+				       blk_mq_unique_tag(scsi_cmd_to_rq(scp)));
+				spin_unlock_irqrestore(&sdsc->lock, flags);
 				continue;
+			}
+
+			if (kt_from_boot < sd_dp->cmpl_ts) {
+				spin_unlock_irqrestore(&sdsc->lock, flags);
+				continue;
+			}
 
-		} else		/* ignoring non REQ_POLLED requests */
+		} else		/* ignoring non REQ_POLLED requests */ {
+			spin_unlock_irqrestore(&sdsc->lock, flags);
 			continue;
+		}
 		if (unlikely(atomic_read(&retired_max_queue) > 0))
 			retiring = true;
 
-		sqcp->a_cmnd = NULL;
 		if (unlikely(!test_and_clear_bit(qc_idx, sqp->in_use_bm))) {
+			spin_unlock_irqrestore(&sdsc->lock, flags);
 			pr_err("Unexpected completion sqp %p queue_num=%d qc_idx=%u from %s\n",
 				sqp, queue_num, qc_idx, __func__);
+			sdebug_free_queued_cmd(sqcp);
 			break;
 		}
+		sqp->qc_arr[qc_idx] = NULL;
 		if (unlikely(retiring)) {	/* user has reduced max_queue */
 			int k, retval;
 
 			retval = atomic_read(&retired_max_queue);
 			if (qc_idx >= retval) {
 				pr_err("index %d too large\n", retval);
+				spin_unlock_irqrestore(&sdsc->lock, flags);
+				sdebug_free_queued_cmd(sqcp);
 				break;
 			}
 			k = find_last_bit(sqp->in_use_bm, retval);
@@ -7542,7 +7607,7 @@  static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
 			else
 				atomic_set(&retired_max_queue, k + 1);
 		}
-		WRITE_ONCE(sd_dp->defer_t, SDEB_DEFER_NONE);
+		spin_unlock_irqrestore(&sdsc->lock, flags);
 		spin_unlock_irqrestore(&sqp->qc_lock, iflags);
 
 		if (sdebug_statistics) {
@@ -7551,6 +7616,8 @@  static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
 				atomic_inc(&sdebug_miss_cpus);
 		}
 
+		sdebug_free_queued_cmd(sqcp);
+
 		scsi_done(scp); /* callback to mid level */
 		num_entries++;
 		spin_lock_irqsave(&sqp->qc_lock, iflags);
@@ -7733,6 +7800,16 @@  static int scsi_debug_queuecommand(struct Scsi_Host *shost,
 	return schedule_resp(scp, NULL, DID_NO_CONNECT << 16, NULL, 0, 0);
 }
 
+static int sdebug_init_cmd_priv(struct Scsi_Host *shost, struct scsi_cmnd *cmd)
+{
+	struct sdebug_scsi_cmd *sdsc = scsi_cmd_priv(cmd);
+
+	spin_lock_init(&sdsc->lock);
+
+	return 0;
+}
+
+
 static struct scsi_host_template sdebug_driver_template = {
 	.show_info =		scsi_debug_show_info,
 	.write_info =		scsi_debug_write_info,
@@ -7760,6 +7837,8 @@  static struct scsi_host_template sdebug_driver_template = {
 	.max_segment_size =	-1U,
 	.module =		THIS_MODULE,
 	.track_queue_depth =	1,
+	.cmd_size = sizeof(struct sdebug_scsi_cmd),
+	.init_cmd_priv = sdebug_init_cmd_priv,
 };
 
 static int sdebug_driver_probe(struct device *dev)