block: introduce content activity based ioprio

Message ID 20240123093352.3007660-1-zhaoyang.huang@unisoc.com
State New
Headers
Series block: introduce content activity based ioprio |

Commit Message

zhaoyang.huang Jan. 23, 2024, 9:33 a.m. UTC
  From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>

Currently, request's ioprio are set via task's schedule priority(when no
blkcg configured), which has high priority tasks possess the privilege on
both of CPU and IO scheduling.
This commit try to introduce a feature which adjust the request ioprio
based on the folio's activity. The original idea comes from LRU_GEN
which provides more precised folio activity than before. This commit try
to adjust the request's ioprio when certain part of its folios are hot,
which indicate that this request carry important contents and need be
scheduled ealier.

This commit is verified via bellowing script[1] on a v6.6 android system
and get significant improved fault time as expected[2] while dd's cost
also shrink from 55s to 40s.

1. fault_latency.bin is an ebpf based test tool which measure all task's
   iowait latency during page fault when scheduled out/in.
2. costmem generate page fault by mmaping a file and access the VA.
3. dd generate concurrent vfs io.

[1]
/fault_latency.bin 1 5 > /data/dd_costmem &
costmem -c0 -a2048000 -b128000 -o0 1>/dev/null &
costmem -c0 -a2048000 -b128000 -o0 1>/dev/null &
costmem -c0 -a2048000 -b128000 -o0 1>/dev/null &
costmem -c0 -a2048000 -b128000 -o0 1>/dev/null &
dd if=/dev/block/sda of=/data/ddtest bs=1024 count=2048000 &
dd if=/dev/block/sda of=/data/ddtest1 bs=1024 count=2048000 &
dd if=/dev/block/sda of=/data/ddtest2 bs=1024 count=2048000 &
dd if=/dev/block/sda of=/data/ddtest3 bs=1024 count=2048000

[2]
                       mainline		commit
io wait                836us            156us

This commit is also verified by the case of launching camera APP which is
usually considered as heavy working load on both of memory and IO, which
shows 12%-24% improvement.

		ttl = 0		ttl = 50	ttl = 100
mainline        2267ms		2420ms		2316ms
commit          1992ms          1806ms          1998ms

Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
 block/Kconfig               |  8 ++++++++
 block/bio.c                 | 10 ++++++++++
 block/blk-mq.c              | 21 +++++++++++++++++++++
 include/uapi/linux/ioprio.h | 20 +++++++++++++++-----
 4 files changed, 54 insertions(+), 5 deletions(-)
  

Comments

Christoph Hellwig Jan. 23, 2024, 1:06 p.m. UTC | #1
On Tue, Jan 23, 2024 at 05:33:52PM +0800, zhaoyang.huang wrote:
>  #define ALLOC_CACHE_MAX		256
> @@ -1069,12 +1070,21 @@ EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
>  void __bio_add_page(struct bio *bio, struct page *page,
>  		unsigned int len, unsigned int off)
>  {
> +	int class, level, hint, activity;
> +
> +	class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
> +	level = IOPRIO_PRIO_LEVEL(bio->bi_ioprio);
> +	hint = IOPRIO_PRIO_HINT(bio->bi_ioprio);
> +	activity = IOPRIO_PRIO_ACTIVITY(bio->bi_ioprio);
> +
>  	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
>  	WARN_ON_ONCE(bio_full(bio, len));
>  
>  	bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off);
>  	bio->bi_iter.bi_size += len;
>  	bio->bi_vcnt++;
> +	activity += (bio->bi_vcnt <= IOPRIO_NR_ACTIVITY && PageWorkingset(page)) ? 1 : 0;

The block layer must not look at page bits.  I've fixed all this crap
with a lot of work and we're not going to re-add it for another qute
hack.  The place to figure out any kind of I/O priority is the file
system (preferably using generic helpers).
  
Bart Van Assche Jan. 23, 2024, 3:13 p.m. UTC | #2
On 1/23/24 01:33, zhaoyang.huang wrote:
> +	activity += (bio->bi_vcnt <= IOPRIO_NR_ACTIVITY && PageWorkingset(page)) ? 1 : 0;
> +	bio->bi_ioprio = IOPRIO_PRIO_VALUE_ACTIVITY(class, level, hint, activity);

Shouldn't any code that is related to the page cache occur in filesystem code
(fs/) instead of in the block layer (block/)?

Thanks,

Bart.
  
Zhaoyang Huang Jan. 24, 2024, 7:53 a.m. UTC | #3
On Tue, Jan 23, 2024 at 9:06 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Tue, Jan 23, 2024 at 05:33:52PM +0800, zhaoyang.huang wrote:
> >  #define ALLOC_CACHE_MAX              256
> > @@ -1069,12 +1070,21 @@ EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
> >  void __bio_add_page(struct bio *bio, struct page *page,
> >               unsigned int len, unsigned int off)
> >  {
> > +     int class, level, hint, activity;
> > +
> > +     class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
> > +     level = IOPRIO_PRIO_LEVEL(bio->bi_ioprio);
> > +     hint = IOPRIO_PRIO_HINT(bio->bi_ioprio);
> > +     activity = IOPRIO_PRIO_ACTIVITY(bio->bi_ioprio);
> > +
> >       WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
> >       WARN_ON_ONCE(bio_full(bio, len));
> >
> >       bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off);
> >       bio->bi_iter.bi_size += len;
> >       bio->bi_vcnt++;
> > +     activity += (bio->bi_vcnt <= IOPRIO_NR_ACTIVITY && PageWorkingset(page)) ? 1 : 0;
>
> The block layer must not look at page bits.  I've fixed all this crap
> with a lot of work and we're not going to re-add it for another qute
> hack.  The place to figure out any kind of I/O priority is the file
> system (preferably using generic helpers).
ok, will use helper function in next version
>
  

Patch

diff --git a/block/Kconfig b/block/Kconfig
index f1364d1c0d93..8d6075575eae 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -228,6 +228,14 @@  config BLOCK_HOLDER_DEPRECATED
 config BLK_MQ_STACKING
 	bool
 
+config CONTENT_ACT_BASED_IOPRIO
+	bool "Enable content activity based ioprio"
+	depends on LRU_GEN
+	default y
+	help
+	This item enable the feature of adjust bio's priority by
+	calculating its content's activity.
+
 source "block/Kconfig.iosched"
 
 endif # BLOCK
diff --git a/block/bio.c b/block/bio.c
index 816d412c06e9..7fb65745d838 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -24,6 +24,7 @@ 
 #include "blk.h"
 #include "blk-rq-qos.h"
 #include "blk-cgroup.h"
+#include "blk-ioprio.h"
 
 #define ALLOC_CACHE_THRESHOLD	16
 #define ALLOC_CACHE_MAX		256
@@ -1069,12 +1070,21 @@  EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
 void __bio_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int off)
 {
+	int class, level, hint, activity;
+
+	class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
+	level = IOPRIO_PRIO_LEVEL(bio->bi_ioprio);
+	hint = IOPRIO_PRIO_HINT(bio->bi_ioprio);
+	activity = IOPRIO_PRIO_ACTIVITY(bio->bi_ioprio);
+
 	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
 	WARN_ON_ONCE(bio_full(bio, len));
 
 	bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off);
 	bio->bi_iter.bi_size += len;
 	bio->bi_vcnt++;
+	activity += (bio->bi_vcnt <= IOPRIO_NR_ACTIVITY && PageWorkingset(page)) ? 1 : 0;
+	bio->bi_ioprio = IOPRIO_PRIO_VALUE_ACTIVITY(class, level, hint, activity);
 }
 EXPORT_SYMBOL_GPL(__bio_add_page);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1fafd54dce3c..05cdd3adde94 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2939,6 +2939,26 @@  static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
 	return rq;
 }
 
+#ifdef CONFIG_CONTENT_ACT_BASED_IOPRIO
+static void bio_set_ioprio(struct bio *bio)
+{
+	int class, level, hint, activity;
+
+	class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
+	level = IOPRIO_PRIO_LEVEL(bio->bi_ioprio);
+	hint = IOPRIO_PRIO_HINT(bio->bi_ioprio);
+	activity = IOPRIO_PRIO_ACTIVITY(bio->bi_ioprio);
+
+	if (activity >= bio->bi_vcnt / 2)
+		class = IOPRIO_CLASS_RT;
+	else if (activity >= bio->bi_vcnt / 4)
+		class = max(IOPRIO_PRIO_CLASS(get_current_ioprio()), IOPRIO_CLASS_BE);
+
+	bio->bi_ioprio = IOPRIO_PRIO_VALUE_ACTIVITY(class, level, hint, activity);
+
+	blkcg_set_ioprio(bio);
+}
+#else
 static void bio_set_ioprio(struct bio *bio)
 {
 	/* Nobody set ioprio so far? Initialize it based on task's nice value */
@@ -2946,6 +2966,7 @@  static void bio_set_ioprio(struct bio *bio)
 		bio->bi_ioprio = get_current_ioprio();
 	blkcg_set_ioprio(bio);
 }
+#endif
 
 /**
  * blk_mq_submit_bio - Create and send a request to block device.
diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h
index bee2bdb0eedb..d1c6081e796b 100644
--- a/include/uapi/linux/ioprio.h
+++ b/include/uapi/linux/ioprio.h
@@ -71,12 +71,18 @@  enum {
  * class and level.
  */
 #define IOPRIO_HINT_SHIFT		IOPRIO_LEVEL_NR_BITS
-#define IOPRIO_HINT_NR_BITS		10
+#define IOPRIO_HINT_NR_BITS		3
 #define IOPRIO_NR_HINTS			(1 << IOPRIO_HINT_NR_BITS)
 #define IOPRIO_HINT_MASK		(IOPRIO_NR_HINTS - 1)
 #define IOPRIO_PRIO_HINT(ioprio)	\
 	(((ioprio) >> IOPRIO_HINT_SHIFT) & IOPRIO_HINT_MASK)
 
+#define IOPRIO_ACTIVITY_SHIFT		(IOPRIO_HINT_NR_BITS + IOPRIO_LEVEL_NR_BITS)
+#define IOPRIO_ACTIVITY_NR_BITS		7
+#define IOPRIO_NR_ACTIVITY		(1 << IOPRIO_ACTIVITY_NR_BITS)
+#define IOPRIO_ACTIVITY_MASK		(IOPRIO_NR_ACTIVITY - 1)
+#define IOPRIO_PRIO_ACTIVITY(ioprio)	\
+	(((ioprio) >> IOPRIO_ACTIVITY_SHIFT) & IOPRIO_ACTIVITY_MASK)
 /*
  * I/O hints.
  */
@@ -108,20 +114,24 @@  enum {
  * Return an I/O priority value based on a class, a level and a hint.
  */
 static __always_inline __u16 ioprio_value(int prioclass, int priolevel,
-					  int priohint)
+					  int priohint, int activity)
 {
 	if (IOPRIO_BAD_VALUE(prioclass, IOPRIO_NR_CLASSES) ||
 	    IOPRIO_BAD_VALUE(priolevel, IOPRIO_NR_LEVELS) ||
-	    IOPRIO_BAD_VALUE(priohint, IOPRIO_NR_HINTS))
+	    IOPRIO_BAD_VALUE(priohint, IOPRIO_NR_HINTS) ||
+	    IOPRIO_BAD_VALUE(activity, IOPRIO_NR_ACTIVITY))
 		return IOPRIO_CLASS_INVALID << IOPRIO_CLASS_SHIFT;
 
 	return (prioclass << IOPRIO_CLASS_SHIFT) |
+		(activity << IOPRIO_ACTIVITY_SHIFT) |
 		(priohint << IOPRIO_HINT_SHIFT) | priolevel;
 }
 
 #define IOPRIO_PRIO_VALUE(prioclass, priolevel)			\
-	ioprio_value(prioclass, priolevel, IOPRIO_HINT_NONE)
+	ioprio_value(prioclass, priolevel, IOPRIO_HINT_NONE, 0)
 #define IOPRIO_PRIO_VALUE_HINT(prioclass, priolevel, priohint)	\
-	ioprio_value(prioclass, priolevel, priohint)
+	ioprio_value(prioclass, priolevel, priohint, 0)
+#define IOPRIO_PRIO_VALUE_ACTIVITY(prioclass, priolevel, priohint, activity)	\
+	ioprio_value(prioclass, priolevel, priohint, activity)
 
 #endif /* _UAPI_LINUX_IOPRIO_H */