[net-next,v3,10/18] nvme/host: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage

Message ID 20230620145338.1300897-11-dhowells@redhat.com
State New
Series splice, net: Switch over users of sendpage() and remove it

Commit Message

David Howells June 20, 2023, 2:53 p.m. UTC
  When transmitting data, call down into TCP using a single sendmsg with
MSG_SPLICE_PAGES to indicate that content should be spliced rather than
performing several sendmsg and sendpage calls to transmit header, data
pages and trailer.
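
In outline, each transmission now follows the pattern below (simplified
from the diff; queue, page, len, offset and ret are in scope as in
nvme_tcp_try_send_data()):

	struct bio_vec bvec;
	struct msghdr msg = {
		.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
	};

	/* Describe the payload instead of calling kernel_sendpage(). */
	bvec_set_page(&bvec, page, len, offset);
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);

	/* TCP may splice the pages into the skb rather than copying them. */
	ret = sock_sendmsg(queue->sock, &msg);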

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Sagi Grimberg <sagi@grimberg.me>
Acked-by: Willem de Bruijn <willemb@google.com>
cc: Keith Busch <kbusch@kernel.org>
cc: Jens Axboe <axboe@fb.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Chaitanya Kulkarni <kch@nvidia.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-nvme@lists.infradead.org
cc: netdev@vger.kernel.org
---

Notes:
    ver #2)
     - Wrap lines at 80.
    
    ver #3)
     - Split nvme/host from nvme/target changes.

 drivers/nvme/host/tcp.c | 46 +++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 22 deletions(-)
  

Comments

Sagi Grimberg June 21, 2023, 10:15 a.m. UTC | #1
One comment:

format for title in nvme-tcp is:

"nvme-tcp: ..." for host patches, and
"nvmet-tcp: ..." for target patches.

But this can be fixed up when applying the patch set.

Other than that, for both nvme patches:
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>

What tree will this be going from btw?

On 6/20/23 17:53, David Howells wrote:
> When transmitting data, call down into TCP using a single sendmsg with
> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
> performing several sendmsg and sendpage calls to transmit header, data
> pages and trailer.
> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> Tested-by: Sagi Grimberg <sagi@grimberg.me>
> Acked-by: Willem de Bruijn <willemb@google.com>
> cc: Keith Busch <kbusch@kernel.org>
> cc: Jens Axboe <axboe@fb.com>
> cc: Christoph Hellwig <hch@lst.de>
> cc: Chaitanya Kulkarni <kch@nvidia.com>
> cc: "David S. Miller" <davem@davemloft.net>
> cc: Eric Dumazet <edumazet@google.com>
> cc: Jakub Kicinski <kuba@kernel.org>
> cc: Paolo Abeni <pabeni@redhat.com>
> cc: Jens Axboe <axboe@kernel.dk>
> cc: Matthew Wilcox <willy@infradead.org>
> cc: linux-nvme@lists.infradead.org
> cc: netdev@vger.kernel.org
> ---
> 
> Notes:
>      ver #2)
>       - Wrap lines at 80.
>      
>      ver #3)
>       - Split nvme/host from nvme/target changes.
> 
>   drivers/nvme/host/tcp.c | 46 +++++++++++++++++++++--------------------
>   1 file changed, 24 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index bf0230442d57..6f31cdbb696a 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -997,25 +997,25 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
>   	u32 h2cdata_left = req->h2cdata_left;
>   
>   	while (true) {
> +		struct bio_vec bvec;
> +		struct msghdr msg = {
> +			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
> +		};
>   		struct page *page = nvme_tcp_req_cur_page(req);
>   		size_t offset = nvme_tcp_req_cur_offset(req);
>   		size_t len = nvme_tcp_req_cur_length(req);
>   		bool last = nvme_tcp_pdu_last_send(req, len);
>   		int req_data_sent = req->data_sent;
> -		int ret, flags = MSG_DONTWAIT;
> +		int ret;
>   
>   		if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
> -			flags |= MSG_EOR;
> +			msg.msg_flags |= MSG_EOR;
>   		else
> -			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
> +			msg.msg_flags |= MSG_MORE;
>   
> -		if (sendpage_ok(page)) {
> -			ret = kernel_sendpage(queue->sock, page, offset, len,
> -					flags);
> -		} else {
> -			ret = sock_no_sendpage(queue->sock, page, offset, len,
> -					flags);
> -		}
> +		bvec_set_page(&bvec, page, len, offset);
> +		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
> +		ret = sock_sendmsg(queue->sock, &msg);
>   		if (ret <= 0)
>   			return ret;
>   
> @@ -1054,22 +1054,24 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>   {
>   	struct nvme_tcp_queue *queue = req->queue;
>   	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
> +	struct bio_vec bvec;
> +	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
>   	bool inline_data = nvme_tcp_has_inline_data(req);
>   	u8 hdgst = nvme_tcp_hdgst_len(queue);
>   	int len = sizeof(*pdu) + hdgst - req->offset;
> -	int flags = MSG_DONTWAIT;
>   	int ret;
>   
>   	if (inline_data || nvme_tcp_queue_more(queue))
> -		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
> +		msg.msg_flags |= MSG_MORE;
>   	else
> -		flags |= MSG_EOR;
> +		msg.msg_flags |= MSG_EOR;
>   
>   	if (queue->hdr_digest && !req->offset)
>   		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
>   
> -	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> -			offset_in_page(pdu) + req->offset, len,  flags);
> +	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
> +	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
> +	ret = sock_sendmsg(queue->sock, &msg);
>   	if (unlikely(ret <= 0))
>   		return ret;
>   
> @@ -1093,6 +1095,8 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
>   {
>   	struct nvme_tcp_queue *queue = req->queue;
>   	struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
> +	struct bio_vec bvec;
> +	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, };
>   	u8 hdgst = nvme_tcp_hdgst_len(queue);
>   	int len = sizeof(*pdu) - req->offset + hdgst;
>   	int ret;
> @@ -1101,13 +1105,11 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
>   		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
>   
>   	if (!req->h2cdata_left)
> -		ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> -				offset_in_page(pdu) + req->offset, len,
> -				MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
> -	else
> -		ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
> -				offset_in_page(pdu) + req->offset, len,
> -				MSG_DONTWAIT | MSG_MORE);
> +		msg.msg_flags |= MSG_SPLICE_PAGES;
> +
> +	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
> +	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
> +	ret = sock_sendmsg(queue->sock, &msg);
>   	if (unlikely(ret <= 0))
>   		return ret;
>   
>
  
David Howells June 21, 2023, 12:35 p.m. UTC | #2
Sagi Grimberg <sagi@grimberg.me> wrote:

> What tree will this be going from btw?

It's aimed at net-next, as mentioned in the subject line.

Thanks,
David
  
Sagi Grimberg June 21, 2023, 2:05 p.m. UTC | #3
>> What tree will this be going from btw?
> 
> It's aimed at net-next, as mentioned in the subject line.

ok, thanks.
  
Aurelien Aptel June 29, 2023, 2:45 p.m. UTC | #4
Hi David,

David Howells <dhowells@redhat.com> writes:
> When transmitting data, call down into TCP using a single sendmsg with
> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
> performing several sendmsg and sendpage calls to transmit header, data
> pages and trailer.

This series makes my kernel crash.

From the current net-next main branch:

commit 9ae440b8fdd6772b6c007fa3d3766530a09c9045 (HEAD)
Merge: b545a13ca9b2 b848b26c6672
Author: Jakub Kicinski <kuba@kernel.org>
Date:   Sat Jun 24 15:50:21 2023 -0700

    Merge branch 'splice-net-switch-over-users-of-sendpage-and-remove-it'


Steps to reproduce:

* connect a remote nvme null block device (nvmet) with 1 IO queue to keep
  things simple
* open /dev/nvme0n1 with O_RDWR|O_DIRECT|O_SYNC
* write() an 8k or 4k buffer (a minimal sketch follows below)
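
A minimal userspace sketch of those steps (illustrative only; the device
path matches the setup above, and O_DIRECT needs an aligned buffer):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		void *buf;
		int fd = open("/dev/nvme0n1", O_RDWR | O_DIRECT | O_SYNC);

		if (fd < 0)
			return 1;
		if (posix_memalign(&buf, 4096, 8192))
			return 1;
		memset(buf, 0xab, 8192);
		/* A single 8k (or 4k) direct write is enough to hit the oops. */
		if (write(fd, buf, 8192) < 0)
			return 1;
		close(fd);
		return 0;
	}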

Trace:

[  311.766163] BUG: kernel NULL pointer dereference, address: 0000000000000008
[  311.768136] #PF: supervisor read access in kernel mode
[  311.769327] #PF: error_code(0x0000) - not-present page
[  311.770393] PGD 148988067 P4D 0
[  311.771074] Oops: 0000 [#1] PREEMPT SMP NOPTI
[  311.771978] CPU: 0 PID: 180 Comm: kworker/0:1H Not tainted 6.4.0-rc7+ #27
[  311.773380] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014
[  311.774808] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp]
[  311.775547] RIP: 0010:skb_splice_from_iter+0xf1/0x370
[  311.776176] Code: 8b 45 88 4d 89 fa 4d 89 e7 45 89 ec 44 89 e3 41 83
               c4 01 83 fb 07 0f 87 56 02 00 00 48 8b 5c dd 90 41 bd 00 10 00 00 49 29
               c5 <48> 8b 53 08 4d 39 f5 4d 0f 47 ee f6 c2 01 0f 85 c7 01 00 00 66 90
[  311.778472] RSP: 0018:ff633e24c0747b08 EFLAGS: 00010206
[  311.779115] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001
[  311.780007] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ff633e24c0747d30
[  311.780861] RBP: ff633e24c0747bb0 R08: ff633e24c0747d40 R09: 000000006db29140
[  311.781748] R10: ff3001bd00a22800 R11: 0000000008000000 R12: 0000000000000001
[  311.782631] R13: 0000000000001000 R14: 0000000000001000 R15: 0000000000000000
[  311.783506] FS:  0000000000000000(0000) GS:ff3001be77800000(0000) knlGS:0000000000000000
[  311.784494] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  311.785197] CR2: 0000000000000008 CR3: 0000000107f5c001 CR4: 0000000000771ef0
[  311.786076] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  311.786948] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  311.787822] PKRU: 55555554
[  311.788165] Call Trace:
[  311.788480]  <TASK>
[  311.788756]  ? show_regs+0x6e/0x80
[  311.789189]  ? __die+0x29/0x70
[  311.789577]  ? page_fault_oops+0x154/0x4a0
[  311.790097]  ? ip_output+0x7c/0x110
[  311.790541]  ? __sys_socketpair+0x1b4/0x280
[  311.791065]  ? __pfx_ip_finish_output+0x10/0x10
[  311.791640]  ? do_user_addr_fault+0x360/0x770
[  311.792184]  ? exc_page_fault+0x7d/0x190
[  311.792677]  ? asm_exc_page_fault+0x2b/0x30
[  311.793198]  ? skb_splice_from_iter+0xf1/0x370
[  311.793748]  ? skb_splice_from_iter+0xb7/0x370
[  311.794312]  ? __sk_mem_schedule+0x34/0x50
[  311.794824]  tcp_sendmsg_locked+0x3a6/0xdd0
[  311.795344]  ? tcp_push+0x10c/0x120
[  311.795789]  tcp_sendmsg+0x31/0x50
[  311.796213]  inet_sendmsg+0x47/0x80
[  311.796655]  sock_sendmsg+0x99/0xb0
[  311.797095]  ? inet_sendmsg+0x47/0x80
[  311.797557]  nvme_tcp_try_send_data+0x149/0x490 [nvme_tcp]
[  311.798242]  ? kvm_clock_get_cycles+0xd/0x20
[  311.799181]  nvme_tcp_try_send+0x1b7/0x300 [nvme_tcp]
[  311.800133]  nvme_tcp_io_work+0x40/0xc0 [nvme_tcp]
[  311.801044]  process_one_work+0x21c/0x430
[  311.801847]  worker_thread+0x54/0x3e0
[  311.802611]  ? __pfx_worker_thread+0x10/0x10
[  311.803433]  kthread+0xf8/0x130
[  311.804116]  ? __pfx_kthread+0x10/0x10
[  311.804865]  ret_from_fork+0x29/0x50
[  311.805596]  </TASK>
[  311.806165] Modules linked in: mlx5_ib ib_uverbs ib_core nvme_tcp
 mlx5_core mlxfw psample pci_hyperv_intf rpcsec_gss_krb5 nfsv3
 auth_rpcgss nfs_acl nfsv4 nfs lockd grace fscache netfs nvme_fabrics
 nvme_core nvme_common intel_rapl_msr intel_rapl_common
 intel_uncore_frequency_common nfit kvm_intel kvm rapl input_leds
 serio_raw sunrpc binfmt_misc qemu_fw_cfg sch_fq_codel dm_multipath
 scsi_dh_rdac scsi_dh_emc scsi_dh_alua msr ramoops reed_solomon
 efi_pstore virtio_rng ip_tables x_tables autofs4 btrfs blake2b_generic
 raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor
 async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear
 hid_generic usbhid hid qxl drm_ttm_helper ttm crct10dif_pclmul
 crc32_pclmul ghash_clmulni_intel drm_kms_helper sha512_ssse3
 syscopyarea sysfillrect sysimgblt aesni_intel crypto_simd i2c_i801 ahci
 cryptd psmouse drm virtio_net i2c_smbus libahci lpc_ich net_failover
 xhci_pci virtio_blk failover xhci_pci_renesas [last unloaded: ib_core]
[  311.818698] CR2: 0000000000000008
[  311.819437] ---[ end trace 0000000000000000 ]---

Cheers,
  
Sagi Grimberg June 29, 2023, 2:49 p.m. UTC | #5
> Hi David,
> 
> David Howells <dhowells@redhat.com> writes:
>> When transmitting data, call down into TCP using a single sendmsg with
>> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
>> performing several sendmsg and sendpage calls to transmit header, data
>> pages and trailer.
> 
> This series makes my kernel crash.
> 
>  From the current net-next main branch:
> 
> commit 9ae440b8fdd6772b6c007fa3d3766530a09c9045 (HEAD)
> Merge: b545a13ca9b2 b848b26c6672
> Author: Jakub Kicinski <kuba@kernel.org>
> Date:   Sat Jun 24 15:50:21 2023 -0700
> 
>      Merge branch 'splice-net-switch-over-users-of-sendpage-and-remove-it'
> 
> 
> Steps to reproduce:
> 
> * connect a remote nvme null block device (nvmet) with 1 IO queue to keep
>    things simple
> * open /dev/nvme0n1 with O_RDWR|O_DIRECT|O_SYNC
> * write() a 8k buffer or 4k buffer

Most likely this also reproduces with blktests?
https://github.com/osandov/blktests

simple way to check is to run:
nvme_trtype=tcp ./check nvme

This runs nvme tests over nvme-tcp.

No need for network, disk or anything. It runs
both nvme and nvmet over the lo device.
  
Aurelien Aptel June 29, 2023, 3:02 p.m. UTC | #6
Sagi Grimberg <sagi@grimberg.me> writes:
> Most likely this also reproduces with blktests?
> https://github.com/osandov/blktests
>
> simple way to check is to run:
> nvme_trtype=tcp ./check nvme
>
> This runs nvme tests over nvme-tcp.

Yes, it crashes similarly during test 10:

root@ktest:~/blktests# nvme_trtype=tcp ./check nvme
nvme/002 (create many subsystems and test discovery)         [not run]
    nvme_trtype=tcp is not supported in this test
nvme/003 (test if we're sending keep-alives to a discovery controller) [passed]
    runtime    ...  10.389s
nvme/004 (test nvme and nvmet UUID NS descriptors)           [passed]
    runtime    ...  1.264s
nvme/005 (reset local loopback target)                       [passed]
    runtime    ...  1.403s
nvme/006 (create an NVMeOF target with a block device-backed ns) [passed]
    runtime    ...  0.129s
nvme/007 (create an NVMeOF target with a file-backed ns)     [passed]
    runtime    ...  0.083s
nvme/008 (create an NVMeOF host with a block device-backed ns) [passed]
    runtime    ...  1.239s
nvme/009 (create an NVMeOF host with a file-backed ns)       [passed]
    runtime    ...  1.229s
nvme/010 (run data verification fio job on NVMeOF block device-backed ns)

Same trace, null ptr deref at skb_splice_from_iter+0x10a/0x370

Cheers,
  
David Howells June 29, 2023, 9:23 p.m. UTC | #7
Sagi Grimberg <sagi@grimberg.me> wrote:

> simple way to check is to run:
> nvme_trtype=tcp ./check nvme

It says a lot of:

nvme/002 (create many subsystems and test discovery)         [not run]
    nvme is not available
    nvme_trtype=tcp is not supported in this test
nvme/003 (test if we're sending keep-alives to a discovery controller) [not run]
    nvme is not available
nvme/004 (test nvme and nvmet UUID NS descriptors)           [not run]
    nvme is not available
nvme/005 (reset local loopback target)                       [not run]
    nvme is not available
...

I have the following NVMe config:

# NVME Support
CONFIG_NVME_COMMON=y
CONFIG_NVME_CORE=y
CONFIG_BLK_DEV_NVME=y
CONFIG_NVME_MULTIPATH=y
# CONFIG_NVME_VERBOSE_ERRORS is not set
# CONFIG_NVME_HWMON is not set
CONFIG_NVME_FABRICS=y
# CONFIG_NVME_RDMA is not set
# CONFIG_NVME_FC is not set
CONFIG_NVME_TCP=y
CONFIG_NVME_AUTH=y
CONFIG_NVME_TARGET=y
CONFIG_NVME_TARGET_PASSTHRU=y
CONFIG_NVME_TARGET_LOOP=y
# CONFIG_NVME_TARGET_RDMA is not set
# CONFIG_NVME_TARGET_FC is not set
CONFIG_NVME_TARGET_TCP=y
CONFIG_NVME_TARGET_AUTH=y
# end of NVME Support
CONFIG_RTC_NVMEM=y
CONFIG_NVMEM=y
# CONFIG_NVMEM_SYSFS is not set
# CONFIG_NVMEM_LAYOUT_SL28_VPD is not set
# CONFIG_NVMEM_LAYOUT_ONIE_TLV is not set
# CONFIG_NVMEM_RMEM is not set

Can you tell me what I'm missing?

David
  
Sagi Grimberg June 29, 2023, 9:33 p.m. UTC | #8
> Sagi Grimberg <sagi@grimberg.me> wrote:
> 
>> simple way to check is to run:
>> nvme_trtype=tcp ./check nvme
> 
> It says a lot of:
> 
> nvme/002 (create many subsystems and test discovery)         [not run]
>      nvme is not available
>      nvme_trtype=tcp is not supported in this test
> nvme/003 (test if we're sending keep-alives to a discovery controller) [not run]
>      nvme is not available
> nvme/004 (test nvme and nvmet UUID NS descriptors)           [not run]
>      nvme is not available
> nvme/005 (reset local loopback target)                       [not run]
>      nvme is not available
> ...
> 
> I have the following NVMe config:
> 
> # NVME Support
> CONFIG_NVME_COMMON=y
> CONFIG_NVME_CORE=y
> CONFIG_BLK_DEV_NVME=y
> CONFIG_NVME_MULTIPATH=y
> # CONFIG_NVME_VERBOSE_ERRORS is not set
> # CONFIG_NVME_HWMON is not set
> CONFIG_NVME_FABRICS=y
> # CONFIG_NVME_RDMA is not set
> # CONFIG_NVME_FC is not set
> CONFIG_NVME_TCP=y
> CONFIG_NVME_AUTH=y
> CONFIG_NVME_TARGET=y
> CONFIG_NVME_TARGET_PASSTHRU=y
> CONFIG_NVME_TARGET_LOOP=y
> # CONFIG_NVME_TARGET_RDMA is not set
> # CONFIG_NVME_TARGET_FC is not set
> CONFIG_NVME_TARGET_TCP=y
> CONFIG_NVME_TARGET_AUTH=y
> # end of NVME Support
> CONFIG_RTC_NVMEM=y
> CONFIG_NVMEM=y
> # CONFIG_NVMEM_SYSFS is not set
> # CONFIG_NVMEM_LAYOUT_SL28_VPD is not set
> # CONFIG_NVMEM_LAYOUT_ONIE_TLV is not set
> # CONFIG_NVMEM_RMEM is not set
> 
> Can you tell me what I'm missing?

install nvme-cli.

nvme/010 is enough to reproduce I think:
nvme_trtype=tcp ./check nvme/010
  
David Howells June 29, 2023, 9:34 p.m. UTC | #9
Meh:

                if (!sendpage_ok(page))
-                       msg.msg_flags &= ~MSG_SPLICE_PAGES,
+                       msg.msg_flags &= ~MSG_SPLICE_PAGES;
 
                bvec_set_page(&bvec, page, len, offset);

David
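
(For context: with the trailing comma, the compiler parses the two lines
as a single statement controlled by the if, i.e. as if it were written

	if (!sendpage_ok(page)) {
		msg.msg_flags &= ~MSG_SPLICE_PAGES;
		bvec_set_page(&bvec, page, len, offset);
	}

so on the common sendpage_ok() path the bvec was never initialised before
being handed to iov_iter_bvec(), which would explain the NULL dereference
reported in skb_splice_from_iter().)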
  
Jakub Kicinski June 29, 2023, 11:43 p.m. UTC | #10
On Thu, 29 Jun 2023 22:34:59 +0100 David Howells wrote:
>                 if (!sendpage_ok(page))
> -                       msg.msg_flags &= ~MSG_SPLICE_PAGES,
> +                       msg.msg_flags &= ~MSG_SPLICE_PAGES;

😵️

Let me CC llvm@ in case someone there is willing to make
the compiler warn about this.
  
Nathan Chancellor June 30, 2023, 4:10 p.m. UTC | #11
On Thu, Jun 29, 2023 at 04:43:18PM -0700, Jakub Kicinski wrote:
> On Thu, 29 Jun 2023 22:34:59 +0100 David Howells wrote:
> >                 if (!sendpage_ok(page))
> > -                       msg.msg_flags &= ~MSG_SPLICE_PAGES,
> > +                       msg.msg_flags &= ~MSG_SPLICE_PAGES;
> 
> 😵️
> 
> Let me CC llvm@ in case someone's there is willing to make 
> the compiler warn about this.
> 

Turns out clang already has a warning for this, -Wcomma:

  drivers/nvme/host/tcp.c:1017:38: error: possible misuse of comma operator here [-Werror,-Wcomma]
   1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
        |                                                           ^
  drivers/nvme/host/tcp.c:1017:4: note: cast expression to void to silence warning
   1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
        |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        |                         (void)(                           )
  1 error generated.

Let me do some wider build testing to see if it is viable to turn this
on for the whole kernel because it seems worth it, at least in this
case. There are a lot of cases where a warning won't be emitted (see the
original upstream review for a list: https://reviews.llvm.org/D3976) but
something is better than nothing, right? :)
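
(If anyone else wants to try it, one way to run the same experiment is to
feed the flag in via KCFLAGS with an LLVM toolchain, e.g. something like:

	make LLVM=1 ARCH=arm multi_v7_defconfig
	make LLVM=1 ARCH=arm KCFLAGS=-Wcomma 2>&1 | tee arm-multi_v7_defconfig.log

though the exact invocation will depend on your toolchain and config.)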

Cheers,
Nathan
  
Jakub Kicinski June 30, 2023, 4:14 p.m. UTC | #12
On Fri, 30 Jun 2023 09:10:43 -0700 Nathan Chancellor wrote:
> > Let me CC llvm@ in case someone's there is willing to make 
> > the compiler warn about this.
> 
> Turns out clang already has a warning for this, -Wcomma:
> 
>   drivers/nvme/host/tcp.c:1017:38: error: possible misuse of comma operator here [-Werror,-Wcomma]
>    1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
>         |                                                           ^
>   drivers/nvme/host/tcp.c:1017:4: note: cast expression to void to silence warning
>    1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
>         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>         |                         (void)(                           )
>   1 error generated.
> 
> Let me do some wider build testing to see if it is viable to turn this
> on for the whole kernel because it seems worth it, at least in this
> case. There are a lot of cases where a warning won't be emitted (see the
> original upstream review for a list: https://reviews.llvm.org/D3976) but
> something is better than nothing, right? :)

Ah, neat. Misleading indentation is another possible angle, I reckon,
but not sure if that's enabled/possible to enable for the entire kernel
either :( We test-build with W=1 in networking, FWIW, so W=1 would be
enough for us.
  
Nathan Chancellor June 30, 2023, 7:28 p.m. UTC | #13
On Fri, Jun 30, 2023 at 09:14:42AM -0700, Jakub Kicinski wrote:
> On Fri, 30 Jun 2023 09:10:43 -0700 Nathan Chancellor wrote:
> > > Let me CC llvm@ in case someone's there is willing to make 
> > > the compiler warn about this.
> > 
> > Turns out clang already has a warning for this, -Wcomma:
> > 
> >   drivers/nvme/host/tcp.c:1017:38: error: possible misuse of comma operator here [-Werror,-Wcomma]
> >    1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
> >         |                                                           ^
> >   drivers/nvme/host/tcp.c:1017:4: note: cast expression to void to silence warning
> >    1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
> >         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> >         |                         (void)(                           )
> >   1 error generated.
> > 
> > Let me do some wider build testing to see if it is viable to turn this
> > on for the whole kernel because it seems worth it, at least in this
> > case. There are a lot of cases where a warning won't be emitted (see the
> > original upstream review for a list: https://reviews.llvm.org/D3976) but
> > something is better than nothing, right? :)

Well, that was a pipe dream :/ In ARCH=arm multi_v7_defconfig alone,
there are 289 unique instances of the warning (although a good number
have multiple instances per line, so it is not quite as bad as it seems,
but still bad):

$ rg -- -Wcomma arm-multi_v7_defconfig.log | sort | uniq -c | wc -l
289

https://gist.github.com/nathanchance/907867e0a7adffc877fd39fd08853801

Probably not a good sign of the signal to noise ratio, I looked through
a good handful and all the cases I saw were not interesting... Perhaps
the warning could be tuned further to become useful for the kernel but
in its current form, it is definitely a no-go :/

> Ah, neat. Misleading indentation is another possible angle, I reckon,
> but not sure if that's enabled/possible to enable for the entire kernel

Yeah, I was surprised there was no warning for misleading indentation...
it is a part of -Wall for both clang and GCC, so it is on for the
kernel, it just appears not to trigger in this case.
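
My guess is that is because -Wmisleading-indentation looks for a separate
statement that sits outside an unbraced if but is indented as though it
were inside, along the lines of this toy example (not from the driver):

	static void toy(int cond, int *a, int *b)
	{
		if (cond)
			*a = 1;
			*b = 2;	/* warns: looks guarded, is not */
	}

whereas here the stray comma fuses both lines into one statement that
genuinely is inside the if, so there is no following statement whose
indentation the heuristic could compare.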

> either :( We test-build with W=1 in networking, FWIW, so W=1 would be
> enough for us.

Unfortunately, even in its current form, it is way too noisy for W=1, as
the qualifier for W=1 is "do not occur too often". Probably could be
placed under W=2 but it still has the problem of wading through every
instance and it is basically a no-op because nobody tests with W=2.

Cheers,
Nathan
  
Nick Desaulniers July 7, 2023, 8:45 p.m. UTC | #14
On Fri, Jun 30, 2023 at 12:28 PM Nathan Chancellor <nathan@kernel.org> wrote:
>
> On Fri, Jun 30, 2023 at 09:14:42AM -0700, Jakub Kicinski wrote:
> > On Fri, 30 Jun 2023 09:10:43 -0700 Nathan Chancellor wrote:
> > > > Let me CC llvm@ in case someone's there is willing to make
> > > > the compiler warn about this.
> > >
> > > Turns out clang already has a warning for this, -Wcomma:
> > >
> > >   drivers/nvme/host/tcp.c:1017:38: error: possible misuse of comma operator here [-Werror,-Wcomma]
> > >    1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
> > >         |                                                           ^
> > >   drivers/nvme/host/tcp.c:1017:4: note: cast expression to void to silence warning
> > >    1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
> > >         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > >         |                         (void)(                           )
> > >   1 error generated.
> > >
> > > Let me do some wider build testing to see if it is viable to turn this
> > > on for the whole kernel because it seems worth it, at least in this
> > > case. There are a lot of cases where a warning won't be emitted (see the
> > > original upstream review for a list: https://reviews.llvm.org/D3976) but
> > > something is better than nothing, right? :)
>
> Well, that was a pipe dream :/ In ARCH=arm multi_v7_defconfig alone,
> there are 289 unique instances of the warning (although a good number
> have multiple instances per line, so it is not quite as bad as it seems,
> but still bad):
>
> $ rg -- -Wcomma arm-multi_v7_defconfig.log | sort | uniq -c | wc -l
> 289
>
> https://gist.github.com/nathanchance/907867e0a7adffc877fd39fd08853801

It's definitely interesting to take a look at some of these cases.
Some are pretty funny IMO.

>
> Probably not a good sign of the signal to noise ratio, I looked through
> a good handful and all the cases I saw were not interesting... Perhaps
> the warning could be tuned further to become useful for the kernel but
> in its current form, it is definitely a no-go :/
>
> > Ah, neat. Misleading indentation is another possible angle, I reckon,
> > but not sure if that's enabled/possible to enable for the entire kernel
>
> Yeah, I was surprised there was no warning for misleading indentation...
> it is a part of -Wall for both clang and GCC, so it is on for the
> kernel, it just appears not to trigger in this case.
>
> > either :( We test-build with W=1 in networking, FWIW, so W=1 would be
> > enough for us.
>
> Unfortunately, even in its current form, it is way too noisy for W=1, as
> the qualifier for W=1 is "do not occur too often". Probably could be
> placed under W=2 but it still has the problem of wading through every
> instance and it is basically a no-op because nobody tests with W=2.
>
> Cheers,
> Nathan
>
  

Patch

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index bf0230442d57..6f31cdbb696a 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -997,25 +997,25 @@  static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 	u32 h2cdata_left = req->h2cdata_left;
 
 	while (true) {
+		struct bio_vec bvec;
+		struct msghdr msg = {
+			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
+		};
 		struct page *page = nvme_tcp_req_cur_page(req);
 		size_t offset = nvme_tcp_req_cur_offset(req);
 		size_t len = nvme_tcp_req_cur_length(req);
 		bool last = nvme_tcp_pdu_last_send(req, len);
 		int req_data_sent = req->data_sent;
-		int ret, flags = MSG_DONTWAIT;
+		int ret;
 
 		if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
-			flags |= MSG_EOR;
+			msg.msg_flags |= MSG_EOR;
 		else
-			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+			msg.msg_flags |= MSG_MORE;
 
-		if (sendpage_ok(page)) {
-			ret = kernel_sendpage(queue->sock, page, offset, len,
-					flags);
-		} else {
-			ret = sock_no_sendpage(queue->sock, page, offset, len,
-					flags);
-		}
+		bvec_set_page(&bvec, page, len, offset);
+		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+		ret = sock_sendmsg(queue->sock, &msg);
 		if (ret <= 0)
 			return ret;
 
@@ -1054,22 +1054,24 @@  static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 {
 	struct nvme_tcp_queue *queue = req->queue;
 	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
+	struct bio_vec bvec;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
 	bool inline_data = nvme_tcp_has_inline_data(req);
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 	int len = sizeof(*pdu) + hdgst - req->offset;
-	int flags = MSG_DONTWAIT;
 	int ret;
 
 	if (inline_data || nvme_tcp_queue_more(queue))
-		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+		msg.msg_flags |= MSG_MORE;
 	else
-		flags |= MSG_EOR;
+		msg.msg_flags |= MSG_EOR;
 
 	if (queue->hdr_digest && !req->offset)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 
-	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
-			offset_in_page(pdu) + req->offset, len,  flags);
+	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+	ret = sock_sendmsg(queue->sock, &msg);
 	if (unlikely(ret <= 0))
 		return ret;
 
@@ -1093,6 +1095,8 @@  static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 {
 	struct nvme_tcp_queue *queue = req->queue;
 	struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
+	struct bio_vec bvec;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, };
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 	int len = sizeof(*pdu) - req->offset + hdgst;
 	int ret;
@@ -1101,13 +1105,11 @@  static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 
 	if (!req->h2cdata_left)
-		ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
-				offset_in_page(pdu) + req->offset, len,
-				MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
-	else
-		ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
-				offset_in_page(pdu) + req->offset, len,
-				MSG_DONTWAIT | MSG_MORE);
+		msg.msg_flags |= MSG_SPLICE_PAGES;
+
+	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+	ret = sock_sendmsg(queue->sock, &msg);
 	if (unlikely(ret <= 0))
 		return ret;