David Laight <David.Laight@ACULAB.COM> wrote:
> > Move the iterator functions to a header file so that other operations that
> > need to scan over an iterator can be added. For instance, the rbd driver
> > could use this to scan a buffer to see if it is all zeros and libceph could
> > use this to generate a crc.
>
> These all look a bit big for being more generally inlined.
>
> I know you want to avoid the indirect call in the normal cases,
> but maybe it would be ok for other uses?
So you'd advocate for something like:
size_t generic_iterate(struct iov_iter *iter, size_t len, void *priv,
void *priv2, iov_ustep_f ustep, iov_step_f step)
{
return iterate_and_advance2(iter, len, priv, priv2,
ustep, step);
}
EXPORT_SYMBOL(generic_iterate);
in lib/iov_iter.c and then call that from the places that want to use it?
I tried benchmarking that (see attached patch - it needs to go on top of my
iov patches). Running the insmod thrice and then filtering out and sorting
the results:
iov_kunit_benchmark_bvec: avg 3174 uS, stddev 68 uS
iov_kunit_benchmark_bvec: avg 3176 uS, stddev 61 uS
iov_kunit_benchmark_bvec: avg 3180 uS, stddev 64 uS
iov_kunit_benchmark_bvec_outofline: avg 3678 uS, stddev 4 uS
iov_kunit_benchmark_bvec_outofline: avg 3678 uS, stddev 5 uS
iov_kunit_benchmark_bvec_outofline: avg 3679 uS, stddev 6 uS
iov_kunit_benchmark_xarray: avg 3560 uS, stddev 5 uS
iov_kunit_benchmark_xarray: avg 3560 uS, stddev 6 uS
iov_kunit_benchmark_xarray: avg 3570 uS, stddev 16 uS
iov_kunit_benchmark_xarray_outofline: avg 4125 uS, stddev 13 uS
iov_kunit_benchmark_xarray_outofline: avg 4125 uS, stddev 2 uS
iov_kunit_benchmark_xarray_outofline: avg 4125 uS, stddev 6 uS
It adds almost 16% overhead:
(gdb) p 4125/3560.0
$2 = 1.1587078651685394
(gdb) p 3678/3174.0
$3 = 1.1587901701323251
I'm guessing a lot of that is due to function pointer mitigations.
Now, part of the code size expansion can be mitigated by using, say,
iterate_and_advance_kernel() if you know you aren't going to encounter
user-backed iterators, or even using, say, iterate_bvec() if you know you're
only going to see a specific iterator type.
David
---
iov_iter: Benchmark out of line generic iterator
diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h
index 2ebb86c041b6..8f562e80473b 100644
--- a/include/linux/iov_iter.h
+++ b/include/linux/iov_iter.h
@@ -293,4 +293,7 @@ size_t iterate_and_advance_kernel(struct iov_iter *iter, size_t len, void *priv,
return progress;
}
+size_t generic_iterate(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+ iov_ustep_f ustep, iov_step_f step);
+
#endif /* _LINUX_IOV_ITER_H */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 8f7a10c4a295..f9643dd02676 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1684,3 +1684,10 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i,
return -EFAULT;
}
EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
+
+size_t generic_iterate(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+ iov_ustep_f ustep, iov_step_f step)
+{
+return iterate_and_advance2(iter, len, priv, priv2, ustep, step); /* out-of-line instance of the inline iterator */
+}
+EXPORT_SYMBOL(generic_iterate);
diff --git a/lib/kunit_iov_iter.c b/lib/kunit_iov_iter.c
index cc9c64663a73..f208516a68c9 100644
--- a/lib/kunit_iov_iter.c
+++ b/lib/kunit_iov_iter.c
@@ -18,6 +18,7 @@
#include <linux/writeback.h>
#include <linux/uio.h>
#include <linux/bvec.h>
+#include <linux/iov_iter.h>
#include <kunit/test.h>
MODULE_DESCRIPTION("iov_iter testing");
@@ -1571,6 +1572,124 @@ static void __init iov_kunit_benchmark_xarray(struct kunit *test)
KUNIT_SUCCEED();
}
+static noinline
+size_t shovel_to_user_iter(void __user *iter_to, size_t progress,
+ size_t len, void *from, void *priv2)
+{
+if (should_fail_usercopy())
+return len; /* report the whole segment as unprocessed */
+if (access_ok(iter_to, len)) {
+from += progress; /* @from is the buffer start; @progress locates this segment's data */
+instrument_copy_to_user(iter_to, from, len);
+len = raw_copy_to_user(iter_to, from, len);
+}
+return len; /* handed back to the iterator as the amount not processed */
+}
+
+static noinline
+size_t shovel_to_kernel_iter(void *iter_to, size_t progress,
+ size_t len, void *from, void *priv2)
+{
+memcpy(iter_to, from + progress, len); /* @progress is the offset into the source buffer */
+return 0; /* nothing left unprocessed in this segment */
+}
+
+/*
+ * Time copying 256MiB through an ITER_BVEC via the out-of-line
+ * generic_iterate() with noinline step functions.
+ */
+static void __init iov_kunit_benchmark_bvec_outofline(struct kunit *test)
+{
+	struct iov_iter iter;
+	struct bio_vec *bvec;
+	struct page *page;
+	unsigned int samples[IOV_KUNIT_NR_SAMPLES];
+	ktime_t a, b;
+	ssize_t copied;
+	size_t size = 256 * 1024 * 1024, npages = size / PAGE_SIZE;
+	void *scratch;
+	int i;
+
+	/* Allocate a page and tile it repeatedly in the buffer. */
+	page = alloc_page(GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, page);
+	kunit_add_action_or_reset(test, iov_kunit_free_page, page);
+
+	bvec = kunit_kmalloc_array(test, npages, sizeof(bvec[0]), GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, bvec);
+	for (i = 0; i < npages; i++)
+		bvec_set_page(&bvec[i], page, PAGE_SIZE, 0);
+
+	/* Create a single large buffer to copy to/from. */
+	scratch = iov_kunit_create_source(test, npages);
+
+	/* Time only the generic_iterate() call; the iterator is rebuilt per sample. */
+	kunit_info(test, "Benchmarking generic_iterate() over BVEC:\n");
+	for (i = 0; i < IOV_KUNIT_NR_SAMPLES; i++) {
+		iov_iter_bvec(&iter, ITER_DEST, bvec, npages, size);
+		a = ktime_get_real();
+		copied = generic_iterate(&iter, size, scratch, NULL,
+					 shovel_to_user_iter,
+					 shovel_to_kernel_iter);
+		b = ktime_get_real();
+		KUNIT_EXPECT_EQ(test, copied, size);
+		samples[i] = ktime_to_us(ktime_sub(b, a));
+	}
+
+	iov_kunit_benchmark_print_stats(test, samples);
+	KUNIT_SUCCEED();
+}
+
+/*
+ * Time copying 256MiB through an ITER_XARRAY via the out-of-line
+ * generic_iterate() with noinline step functions.
+ */
+static void __init iov_kunit_benchmark_xarray_outofline(struct kunit *test)
+{
+	struct iov_iter iter;
+	struct xarray *xarray;
+	struct page *page;
+	unsigned int samples[IOV_KUNIT_NR_SAMPLES];
+	ktime_t a, b;
+	ssize_t copied;
+	size_t size = 256 * 1024 * 1024, npages = size / PAGE_SIZE;
+	void *scratch;
+	int i;
+
+	/* Allocate a page and tile it repeatedly in the buffer. */
+	page = alloc_page(GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, page);
+	kunit_add_action_or_reset(test, iov_kunit_free_page, page);
+
+	xarray = iov_kunit_create_xarray(test);
+
+	for (i = 0; i < npages; i++) {
+		void *x = xa_store(xarray, i, page, GFP_KERNEL);
+
+		KUNIT_ASSERT_FALSE(test, xa_is_err(x));
+	}
+
+	/* Create a single large buffer to copy to/from. */
+	scratch = iov_kunit_create_source(test, npages);
+
+	/* Time only the generic_iterate() call; the iterator is rebuilt per sample. */
+	kunit_info(test, "Benchmarking generic_iterate() over XARRAY:\n");
+	for (i = 0; i < IOV_KUNIT_NR_SAMPLES; i++) {
+		iov_iter_xarray(&iter, ITER_DEST, xarray, 0, size);
+		a = ktime_get_real();
+
+		copied = generic_iterate(&iter, size, scratch, NULL,
+					 shovel_to_user_iter,
+					 shovel_to_kernel_iter);
+		b = ktime_get_real();
+		KUNIT_EXPECT_EQ(test, copied, size);
+		samples[i] = ktime_to_us(ktime_sub(b, a));
+	}
+
+	iov_kunit_benchmark_print_stats(test, samples);
+	KUNIT_SUCCEED();
+}
+
static struct kunit_case __refdata iov_kunit_cases[] = {
KUNIT_CASE(iov_kunit_copy_to_ubuf),
KUNIT_CASE(iov_kunit_copy_from_ubuf),
@@ -1593,6 +1712,8 @@ static struct kunit_case __refdata iov_kunit_cases[] = {
KUNIT_CASE(iov_kunit_benchmark_bvec),
KUNIT_CASE(iov_kunit_benchmark_bvec_split),
KUNIT_CASE(iov_kunit_benchmark_xarray),
+ KUNIT_CASE(iov_kunit_benchmark_bvec_outofline),
+ KUNIT_CASE(iov_kunit_benchmark_xarray_outofline),
{}
};
new file mode 100644
@@ -0,0 +1,261 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* I/O iterator iteration building functions.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#ifndef _LINUX_IOV_ITER_H
+#define _LINUX_IOV_ITER_H
+
+#include <linux/uio.h>
+#include <linux/bvec.h>
+
+typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len,
+ void *priv, void *priv2);
+typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, size_t len,
+ void *priv, void *priv2);
+
+/*
+ * Handle ITER_UBUF: one user buffer, so @step is called exactly once.
+ */
+static __always_inline
+size_t iterate_ubuf(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+ iov_ustep_f step)
+{
+void __user *base = iter->ubuf;
+size_t progress = 0, remain;
+
+remain = step(base + iter->iov_offset, 0, len, priv, priv2);
+progress = len - remain; /* @step returns the amount it did not process */
+iter->iov_offset += progress;
+return progress;
+}
+
+/*
+ * Handle ITER_IOVEC: call @step on each user segment; stop on a short step.
+ */
+static __always_inline
+size_t iterate_iovec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		     iov_ustep_f step)
+{
+	const struct iovec *p = iter->__iov;
+	size_t progress = 0, skip = iter->iov_offset;
+
+	do {
+		size_t remain, consumed;
+		size_t part = min(len, p->iov_len - skip);
+
+		if (likely(part)) {
+			remain = step(p->iov_base + skip, progress, part, priv, priv2);
+			consumed = part - remain;
+			progress += consumed;
+			skip += consumed;
+			len -= consumed;
+			if (skip < p->iov_len)	/* segment not finished: stay on it */
+				break;
+		}
+		p++;
+		skip = 0;
+	} while (len);
+
+	iter->nr_segs -= p - iter->__iov;	/* needs the old __iov: do this first */
+	iter->__iov = p;
+	iter->iov_offset = skip;
+	return progress;
+}
+
+/*
+ * Handle ITER_KVEC: call @step on each kernel segment; stop on a short step.
+ */
+static __always_inline
+size_t iterate_kvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+ iov_step_f step)
+{
+const struct kvec *p = iter->kvec;
+size_t progress = 0, skip = iter->iov_offset;
+
+do {
+size_t remain, consumed;
+size_t part = min(len, p->iov_len - skip);
+
+if (likely(part)) {
+remain = step(p->iov_base + skip, progress, part, priv, priv2);
+consumed = part - remain;
+progress += consumed;
+skip += consumed;
+len -= consumed;
+if (skip < p->iov_len) /* segment not finished: stay on it */
+break;
+}
+p++;
+skip = 0;
+} while (len);
+
+iter->nr_segs -= p - iter->kvec; /* computed against the old kvec pointer */
+iter->kvec = p;
+iter->iov_offset = skip;
+return progress;
+}
+
+/*
+ * Handle ITER_BVEC: map one page at a time and call @step on the mapped part.
+ */
+static __always_inline
+size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+ iov_step_f step)
+{
+const struct bio_vec *p = iter->bvec;
+size_t progress = 0, skip = iter->iov_offset;
+
+do {
+size_t remain, consumed;
+size_t offset = p->bv_offset + skip, part;
+void *kaddr = kmap_local_page(p->bv_page + offset / PAGE_SIZE);
+
+part = min3(len, /* limited by @len, the bio_vec remainder and the page end */
+ (size_t)(p->bv_len - skip),
+ (size_t)(PAGE_SIZE - offset % PAGE_SIZE));
+remain = step(kaddr + offset % PAGE_SIZE, progress, part, priv, priv2);
+kunmap_local(kaddr);
+consumed = part - remain;
+len -= consumed;
+progress += consumed;
+skip += consumed;
+if (skip >= p->bv_len) { /* this bio_vec is used up: move to the next */
+skip = 0;
+p++;
+}
+if (remain) /* short step: stop iterating */
+break;
+} while (len);
+
+iter->nr_segs -= p - iter->bvec; /* computed against the old bvec pointer */
+iter->bvec = p;
+iter->iov_offset = skip;
+return progress;
+}
+
+/*
+ * Handle ITER_XARRAY: walk the folios under RCU, mapping a page at a time.
+ */
+static __always_inline
+size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+ iov_step_f step)
+{
+struct folio *folio;
+size_t progress = 0;
+loff_t start = iter->xarray_start + iter->iov_offset;
+pgoff_t index = start / PAGE_SIZE;
+XA_STATE(xas, iter->xarray, index);
+
+rcu_read_lock();
+xas_for_each(&xas, folio, ULONG_MAX) {
+size_t remain, consumed, offset, part, flen;
+
+if (xas_retry(&xas, folio))
+continue;
+if (WARN_ON(xa_is_value(folio)))
+break;
+if (WARN_ON(folio_test_hugetlb(folio)))
+break;
+
+offset = offset_in_folio(folio, start + progress);
+flen = min(folio_size(folio) - offset, len); /* amount to take from this folio */
+
+while (flen) {
+void *base = kmap_local_folio(folio, offset);
+
+part = min_t(size_t, flen, /* never cross a page boundary in one step */
+ PAGE_SIZE - offset_in_page(offset));
+remain = step(base, progress, part, priv, priv2);
+kunmap_local(base);
+
+consumed = part - remain;
+progress += consumed;
+len -= consumed;
+
+if (remain || len == 0) /* short step or all done: leave both loops */
+goto out;
+flen -= consumed;
+offset += consumed;
+}
+}
+
+out:
+rcu_read_unlock();
+iter->iov_offset += progress; /* position is tracked by offset only */
+return progress;
+}
+
+/**
+ * iterate_and_advance2 - Iterate over an iterator
+ * @iter: The iterator to iterate over.
+ * @len: The amount to iterate over.
+ * @priv: Data for the step functions.
+ * @priv2: More data for the step functions.
+ * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
+ * @step: Function for other iterators; given kernel addresses.
+ *
+ * Iterate over the next part of an iterator, up to the specified length. The
+ * buffer is presented in segments, which for kernel iteration are broken up by
+ * physical pages and mapped, with the mapped address being presented.
+ *
+ * Two step functions, @step and @ustep, must be provided, one for handling
+ * mapped kernel addresses and the other is given user addresses which have the
+ * potential to fault since no pinning is performed.
+ *
+ * The step functions are passed the address and length of the segment, @priv,
+ * @priv2 and the amount of data so far iterated over (which can, for example,
+ * be added to @priv to point to the right part of a second buffer). The step
+ * functions should return the amount of the segment they didn't process (i.e. 0
+ * indicates complete processing).
+ *
+ * This function returns the amount of data processed (i.e. 0 means nothing was
+ * processed and the value of @len means processed to completion).
+ */
+static __always_inline
+size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv,
+ void *priv2, iov_ustep_f ustep, iov_step_f step)
+{
+size_t progress;
+
+if (unlikely(iter->count < len)) /* clamp to what the iterator still holds */
+len = iter->count;
+if (unlikely(!len))
+return 0;
+
+if (likely(iter_is_ubuf(iter)))
+progress = iterate_ubuf(iter, len, priv, priv2, ustep);
+else if (likely(iter_is_iovec(iter)))
+progress = iterate_iovec(iter, len, priv, priv2, ustep);
+else if (iov_iter_is_bvec(iter))
+progress = iterate_bvec(iter, len, priv, priv2, step);
+else if (iov_iter_is_kvec(iter))
+progress = iterate_kvec(iter, len, priv, priv2, step);
+else if (iov_iter_is_xarray(iter))
+progress = iterate_xarray(iter, len, priv, priv2, step);
+else
+progress = len; /* any other type: no step call, all counted as done (presumably ITER_DISCARD - confirm) */
+iter->count -= progress;
+return progress;
+}
+
+/**
+ * iterate_and_advance - Iterate over an iterator
+ * @iter: The iterator to iterate over.
+ * @len: The amount to iterate over.
+ * @priv: Data for the step functions.
+ * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
+ * @step: Function for other iterators; given kernel addresses.
+ *
+ * Same as iterate_and_advance2(), but @priv2 is always passed as NULL.
+ */
+static __always_inline
+size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv,
+ iov_ustep_f ustep, iov_step_f step)
+{
+return iterate_and_advance2(iter, len, priv, NULL, ustep, step);
+}
+
+#endif /* _LINUX_IOV_ITER_H */
@@ -13,202 +13,7 @@
#include <net/checksum.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>
-
-typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len,
- void *priv, void *priv2);
-typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, size_t len,
- void *priv, void *priv2);
-
-static __always_inline
-size_t iterate_ubuf(struct iov_iter *iter, size_t len, void *priv, void *priv2,
- iov_ustep_f step)
-{
- void __user *base = iter->ubuf;
- size_t progress = 0, remain;
-
- remain = step(base + iter->iov_offset, 0, len, priv, priv2);
- progress = len - remain;
- iter->iov_offset += progress;
- return progress;
-}
-
-static __always_inline
-size_t iterate_iovec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
- iov_ustep_f step)
-{
- const struct iovec *p = iter->__iov;
- size_t progress = 0, skip = iter->iov_offset;
-
- do {
- size_t remain, consumed;
- size_t part = min(len, p->iov_len - skip);
-
- if (likely(part)) {
- remain = step(p->iov_base + skip, progress, part, priv, priv2);
- consumed = part - remain;
- progress += consumed;
- skip += consumed;
- len -= consumed;
- if (skip < p->iov_len)
- break;
- }
- p++;
- skip = 0;
- } while (len);
-
- iter->__iov = p;
- iter->nr_segs -= p - iter->__iov;
- iter->iov_offset = skip;
- return progress;
-}
-
-static __always_inline
-size_t iterate_kvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
- iov_step_f step)
-{
- const struct kvec *p = iter->kvec;
- size_t progress = 0, skip = iter->iov_offset;
-
- do {
- size_t remain, consumed;
- size_t part = min(len, p->iov_len - skip);
-
- if (likely(part)) {
- remain = step(p->iov_base + skip, progress, part, priv, priv2);
- consumed = part - remain;
- progress += consumed;
- skip += consumed;
- len -= consumed;
- if (skip < p->iov_len)
- break;
- }
- p++;
- skip = 0;
- } while (len);
-
- iter->nr_segs -= p - iter->kvec;
- iter->kvec = p;
- iter->iov_offset = skip;
- return progress;
-}
-
-static __always_inline
-size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
- iov_step_f step)
-{
- const struct bio_vec *p = iter->bvec;
- size_t progress = 0, skip = iter->iov_offset;
-
- do {
- size_t remain, consumed;
- size_t offset = p->bv_offset + skip, part;
- void *kaddr = kmap_local_page(p->bv_page + offset / PAGE_SIZE);
-
- part = min3(len,
- (size_t)(p->bv_len - skip),
- (size_t)(PAGE_SIZE - offset % PAGE_SIZE));
- remain = step(kaddr + offset % PAGE_SIZE, progress, part, priv, priv2);
- kunmap_local(kaddr);
- consumed = part - remain;
- len -= consumed;
- progress += consumed;
- skip += consumed;
- if (skip >= p->bv_len) {
- skip = 0;
- p++;
- }
- if (remain)
- break;
- } while (len);
-
- iter->nr_segs -= p - iter->bvec;
- iter->bvec = p;
- iter->iov_offset = skip;
- return progress;
-}
-
-static __always_inline
-size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void *priv2,
- iov_step_f step)
-{
- struct folio *folio;
- size_t progress = 0;
- loff_t start = iter->xarray_start + iter->iov_offset;
- pgoff_t index = start / PAGE_SIZE;
- XA_STATE(xas, iter->xarray, index);
-
- rcu_read_lock();
- xas_for_each(&xas, folio, ULONG_MAX) {
- size_t remain, consumed, offset, part, flen;
-
- if (xas_retry(&xas, folio))
- continue;
- if (WARN_ON(xa_is_value(folio)))
- break;
- if (WARN_ON(folio_test_hugetlb(folio)))
- break;
-
- offset = offset_in_folio(folio, start + progress);
- flen = min(folio_size(folio) - offset, len);
-
- while (flen) {
- void *base = kmap_local_folio(folio, offset);
-
- part = min_t(size_t, flen,
- PAGE_SIZE - offset_in_page(offset));
- remain = step(base, progress, part, priv, priv2);
- kunmap_local(base);
-
- consumed = part - remain;
- progress += consumed;
- len -= consumed;
-
- if (remain || len == 0)
- goto out;
- flen -= consumed;
- offset += consumed;
- }
- }
-
-out:
- rcu_read_unlock();
- iter->iov_offset += progress;
- return progress;
-}
-
-static __always_inline
-size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv,
- void *priv2, iov_ustep_f ustep, iov_step_f step)
-{
- size_t progress;
-
- if (unlikely(iter->count < len))
- len = iter->count;
- if (unlikely(!len))
- return 0;
-
- if (likely(iter_is_ubuf(iter)))
- progress = iterate_ubuf(iter, len, priv, priv2, ustep);
- else if (likely(iter_is_iovec(iter)))
- progress = iterate_iovec(iter, len, priv, priv2, ustep);
- else if (iov_iter_is_bvec(iter))
- progress = iterate_bvec(iter, len, priv, priv2, step);
- else if (iov_iter_is_kvec(iter))
- progress = iterate_kvec(iter, len, priv, priv2, step);
- else if (iov_iter_is_xarray(iter))
- progress = iterate_xarray(iter, len, priv, priv2, step);
- else
- progress = len;
- iter->count -= progress;
- return progress;
-}
-
-static __always_inline
-size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv,
- iov_ustep_f ustep, iov_step_f step)
-{
- return iterate_and_advance2(iter, len, priv, NULL, ustep, step);
-}
+#include <linux/iov_iter.h>
static __always_inline
size_t copy_to_user_iter(void __user *iter_to, size_t progress,