[v2] sched/fair: sanitize vruntime of entity being migrated

Message ID 20230306132418.50389-1-zhangqiao22@huawei.com
State New
Series [v2] sched/fair: sanitize vruntime of entity being migrated

Commit Message

Zhang Qiao March 6, 2023, 1:24 p.m. UTC
  Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
entity being placed") fixes an overflow bug, but overlooks
the case where se->exec_start is reset after a migration.

To fix this case, reset the vruntime of a long-sleeping
task in migrate_task_rq_fair().

Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed")
Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
---

v1 -> v2:
- fix some typos and update comments
- reformat the patch

---
 kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 55 insertions(+), 21 deletions(-)
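
For context on the overflow this series guards against: fair.c compares
vruntimes through a signed delta, so once a long sleeper's stale vruntime
lags cfs_rq->min_vruntime by more than 2^63 the comparison inverts and the
woken task would monopolize the CPU to catch up. Below is a minimal
userspace sketch of that inversion; max_vruntime() is modeled on the kernel
helper of the same name, and the concrete values are made up for
illustration.

#include <stdint.h>
#include <stdio.h>

/* Modeled on max_vruntime() in kernel/sched/fair.c. */
static uint64_t max_vruntime(uint64_t max_vruntime, uint64_t vruntime)
{
	int64_t delta = (int64_t)(vruntime - max_vruntime);

	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

int main(void)
{
	uint64_t sleeper = 1000;                        /* stale vruntime of a long sleeper */
	uint64_t min_vr  = sleeper + (1ULL << 63) + 1;  /* min_vruntime ran far ahead */

	/* Expected: min_vr wins. Actual: sleeper wins, the s64 delta wrapped negative. */
	printf("%llu\n", (unsigned long long)max_vruntime(sleeper, min_vr));
	return 0;
}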
  

Comments

Vincent Guittot March 6, 2023, 1:53 p.m. UTC | #1
On Mon, 6 Mar 2023 at 13:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>
> Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
> entity being placed") fix an overflowing bug, but ignore
> a case that se->exec_start is reset after a migration.
>
> For fixing this case, we reset the vruntime of a long
> sleeping task in migrate_task_rq_fair().
>
> Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed")
> Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
> Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>

Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>

> ---
>
> v1 -> v2:
> - fix some typos and update comments
> - reformat the patch
>
> ---
>  kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++-------------
>  1 file changed, 55 insertions(+), 21 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7a1b1f855b96..74c9918ffe76 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4648,11 +4648,45 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  #endif
>  }
>
> +static inline bool entity_is_long_sleep(struct sched_entity *se)
> +{
> +       struct cfs_rq *cfs_rq;
> +       u64 sleep_time;
> +
> +       if (se->exec_start == 0)
> +               return false;
> +
> +       cfs_rq = cfs_rq_of(se);
> +       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> +       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> +               return true;
> +
> +       return false;
> +}
> +
> +static inline u64 sched_sleeper_credit(struct sched_entity *se)
> +{
> +       unsigned long thresh;
> +
> +       if (se_is_idle(se))
> +               thresh = sysctl_sched_min_granularity;
> +       else
> +               thresh = sysctl_sched_latency;
> +
> +       /*
> +        * Halve their sleep time's effect, to allow
> +        * for a gentler effect of sleepers:
> +        */
> +       if (sched_feat(GENTLE_FAIR_SLEEPERS))
> +               thresh >>= 1;
> +
> +       return thresh;
> +}
> +
>  static void
>  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>  {
>         u64 vruntime = cfs_rq->min_vruntime;
> -       u64 sleep_time;
>
>         /*
>          * The 'current' period is already promised to the current tasks,
> @@ -4664,23 +4698,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>                 vruntime += sched_vslice(cfs_rq, se);
>
>         /* sleeps up to a single latency don't count. */
> -       if (!initial) {
> -               unsigned long thresh;
> -
> -               if (se_is_idle(se))
> -                       thresh = sysctl_sched_min_granularity;
> -               else
> -                       thresh = sysctl_sched_latency;
> -
> -               /*
> -                * Halve their sleep time's effect, to allow
> -                * for a gentler effect of sleepers:
> -                */
> -               if (sched_feat(GENTLE_FAIR_SLEEPERS))
> -                       thresh >>= 1;
> -
> -               vruntime -= thresh;
> -       }
> +       if (!initial)
> +               vruntime -= sched_sleeper_credit(se);
>
>         /*
>          * Pull vruntime of the entity being placed to the base level of
> @@ -4689,8 +4708,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>          * the base as it may be too far off and the comparison may get
>          * inversed due to s64 overflow.
>          */
> -       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> +       if (entity_is_long_sleep(se))
>                 se->vruntime = vruntime;
>         else
>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
> @@ -7635,7 +7653,23 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>         if (READ_ONCE(p->__state) == TASK_WAKING) {
>                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
>
> -               se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> +               /*
> +                * We determine whether a task sleeps for long by checking
> +                * se->exec_start, and if it is, we sanitize its vruntime at
> +                * place_entity(). However, after a migration, this detection
> +                * method fails due to se->exec_start being reset.
> +                *
> +                * For fixing this case, we add the same check here. For a task
> +                * which has slept for a long time, its vruntime should be reset
> +                * to cfs_rq->min_vruntime with a sleep credit. Because waking
> +                * task's vruntime will be added to cfs_rq->min_vruntime when
> +                * enqueue, we only need to reset the se->vruntime of waking task
> +                * to a credit here.
> +                */
> +               if (entity_is_long_sleep(se))
> +                       se->vruntime = -sched_sleeper_credit(se);
> +               else
> +                       se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
>         }
>
>         if (!task_on_rq_migrating(p)) {
> --
> 2.17.1
>
  
kernel test robot March 7, 2023, 2:16 a.m. UTC | #2
Greetings,

FYI, we noticed WARNING:at_kernel/sched/sched.h:#assert_clock_updated due to commit (built with gcc-11):

commit: 77900e3a6934d2a2e33f26775447e1dceeb1c503 ("[PATCH v2] sched/fair: sanitize vruntime of entity being migrated")
url: https://github.com/intel-lab-lkp/linux/commits/Zhang-Qiao/sched-fair-sanitize-vruntime-of-entity-being-migrated/20230306-205822
base: https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git 7c4a5b89a0b5a57a64b601775b296abf77a9fe97
patch link: https://lore.kernel.org/all/20230306132418.50389-1-zhangqiao22@huawei.com/
patch subject: [PATCH v2] sched/fair: sanitize vruntime of entity being migrated

in testcase: boot

on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):


If you fix the issue, kindly add the following tag
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Link: https://lore.kernel.org/oe-lkp/202303070945.c7b65897-oliver.sang@intel.com


[    0.702980][    T1] ------------[ cut here ]------------
[    0.704539][    T1] rq->clock_update_flags < RQCF_ACT_SKIP
[ 0.704551][ T1] WARNING: CPU: 0 PID: 1 at kernel/sched/sched.h:1496 assert_clock_updated+0x21/0x24 
[    0.706944][    T1] Modules linked in:
[    0.706944][    T1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.2.0-rc6-00112-g77900e3a6934 #11
[    0.706944][    T1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-5 04/01/2014
[ 0.706944][ T1] RIP: assert_clock_updated+0x21/0x24 
[ 0.706944][ T1] Code: 01 85 c0 7e 03 ff 47 60 c3 83 ff 01 77 1e 80 3d 1a 1c 89 01 00 75 15 48 c7 c7 ce fd 22 82 c6 05 0a 1c 89 01 01 e8 27 ea fc ff <0f> 0b c3 0f 1f 44 00 00 41 55 41 54 49 89 fc 55 53 48 8b af 30 01
All code
========
   0:	01 85 c0 7e 03 ff    	add    %eax,-0xfc8140(%rbp)
   6:	47 60                	rex.RXB (bad) 
   8:	c3                   	retq   
   9:	83 ff 01             	cmp    $0x1,%edi
   c:	77 1e                	ja     0x2c
   e:	80 3d 1a 1c 89 01 00 	cmpb   $0x0,0x1891c1a(%rip)        # 0x1891c2f
  15:	75 15                	jne    0x2c
  17:	48 c7 c7 ce fd 22 82 	mov    $0xffffffff8222fdce,%rdi
  1e:	c6 05 0a 1c 89 01 01 	movb   $0x1,0x1891c0a(%rip)        # 0x1891c2f
  25:	e8 27 ea fc ff       	callq  0xfffffffffffcea51
  2a:*	0f 0b                	ud2    		<-- trapping instruction
  2c:	c3                   	retq   
  2d:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)
  32:	41 55                	push   %r13
  34:	41 54                	push   %r12
  36:	49 89 fc             	mov    %rdi,%r12
  39:	55                   	push   %rbp
  3a:	53                   	push   %rbx
  3b:	48                   	rex.W
  3c:	8b                   	.byte 0x8b
  3d:	af                   	scas   %es:(%rdi),%eax
  3e:	30 01                	xor    %al,(%rcx)

Code starting with the faulting instruction
===========================================
   0:	0f 0b                	ud2    
   2:	c3                   	retq   
   3:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)
   8:	41 55                	push   %r13
   a:	41 54                	push   %r12
   c:	49 89 fc             	mov    %rdi,%r12
   f:	55                   	push   %rbp
  10:	53                   	push   %rbx
  11:	48                   	rex.W
  12:	8b                   	.byte 0x8b
  13:	af                   	scas   %es:(%rdi),%eax
  14:	30 01                	xor    %al,(%rcx)
[    0.706944][    T1] RSP: 0000:ffffc90000013d28 EFLAGS: 00010082
[    0.706944][    T1] RAX: 0000000000000000 RBX: ffff88810cba1fc0 RCX: 0000000000000003
[    0.706944][    T1] RDX: 0000000000000105 RSI: 0000000000000001 RDI: 0000000000000001
[    0.706944][    T1] RBP: ffff88842fc2c580 R08: 0000000000000000 R09: 0000000000000026
[    0.706944][    T1] R10: 0000000000000000 R11: 000000002d2d2d2d R12: ffff88842fc2c600
[    0.706944][    T1] R13: 0000000000000001 R14: 0000000000000001 R15: ffff88810cba1f40
[    0.706944][    T1] FS:  0000000000000000(0000) GS:ffff88842fc00000(0000) knlGS:0000000000000000
[    0.706944][    T1] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    0.706944][    T1] CR2: ffff88843ffff000 CR3: 0000000002412000 CR4: 00000000000406f0
[    0.706944][    T1] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[    0.706944][    T1] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[    0.706944][    T1] Call Trace:
[    0.706944][    T1]  <TASK>
[ 0.706944][ T1] entity_is_long_sleep+0x1b/0x38 
[ 0.706944][ T1] migrate_task_rq_fair (fair.c:?) 
[ 0.706944][ T1] set_task_cpu (??:?) 
[ 0.706944][ T1] try_to_wake_up (core.c:?) 
[ 0.706944][ T1] ? __kthread_bind_mask (kthread.c:?) 
[ 0.706944][ T1] bringup_cpu (cpu.c:?) 
[ 0.706944][ T1] ? cpu_smt_allowed (cpu.c:?) 
[ 0.706944][ T1] cpuhp_invoke_callback (cpu.c:?) 
[ 0.706944][ T1] __cpuhp_invoke_callback_range (cpu.c:?) 
[ 0.706944][ T1] _cpu_up (cpu.c:?) 
[ 0.706944][ T1] cpu_up (cpu.c:?) 
[ 0.706944][ T1] bringup_nonboot_cpus (??:?) 
[ 0.706944][ T1] smp_init (??:?) 
[ 0.706944][ T1] kernel_init_freeable (main.c:?) 
[ 0.706944][ T1] ? rest_init (main.c:?) 
[ 0.706944][ T1] kernel_init (main.c:?) 
[ 0.706944][ T1] ret_from_fork (??:?) 
[    0.706944][    T1]  </TASK>
[    0.706944][    T1] ---[ end trace 0000000000000000 ]---
[    0.707026][    T1] smp: Brought up 1 node, 2 CPUs
[    0.707887][    T1] smpboot: Max logical packages: 1
[    0.708793][    T1] smpboot: Total of 2 processors activated (8779.66 BogoMIPS)
[    0.711760][    T1] devtmpfs: initialized
[    0.711827][    T1] x86/mm: Memory block size: 128MB
[ 0.716871][ T1] calling ipc_ns_init+0x0/0x18 @ 1 
[ 0.717857][ T1] initcall ipc_ns_init+0x0/0x18 returned 0 after 0 usecs 
[ 0.718963][ T1] calling init_mmap_min_addr+0x0/0x1a @ 1 
[ 0.719972][ T1] initcall init_mmap_min_addr+0x0/0x1a returned 0 after 0 usecs 
[ 0.721432][ T1] calling pci_realloc_setup_params+0x0/0x49 @ 1 
[ 0.722492][ T1] initcall pci_realloc_setup_params+0x0/0x49 returned 0 after 0 usecs 
[ 0.723421][ T1] calling inet_frag_wq_init+0x0/0x41 @ 1 
[ 0.726984][ T1] initcall inet_frag_wq_init+0x0/0x41 returned 0 after 4000 usecs 
[ 0.728809][ T1] calling e820__register_nvs_regions+0x0/0x3c @ 1 
[ 0.729914][ T1] initcall e820__register_nvs_regions+0x0/0x3c returned 0 after 0 usecs 
[ 0.730959][ T1] calling cpufreq_register_tsc_scaling+0x0/0x2e @ 1 
[ 0.732104][ T1] initcall cpufreq_register_tsc_scaling+0x0/0x2e returned 0 after 0 usecs 
[ 0.733576][ T1] calling cache_ap_register+0x0/0x2c @ 1 
[ 0.734609][ T1] initcall cache_ap_register+0x0/0x2c returned 0 after 0 usecs 
[ 0.735412][ T1] calling reboot_init+0x0/0x41 @ 1 
[ 0.736372][ T1] initcall reboot_init+0x0/0x41 returned 0 after 0 usecs 
[ 0.737551][ T1] calling init_lapic_sysfs+0x0/0x25 @ 1 
[ 0.738527][ T1] initcall init_lapic_sysfs+0x0/0x25 returned 0 after 0 usecs 
[ 0.739387][ T1] calling alloc_frozen_cpus+0x0/0xc @ 1 
[ 0.740351][ T1] initcall alloc_frozen_cpus+0x0/0xc returned 0 after 0 usecs 
[ 0.743426][ T1] calling cpu_hotplug_pm_sync_init+0x0/0x18 @ 1 
[ 0.744547][ T1] initcall cpu_hotplug_pm_sync_init+0x0/0x18 returned 0 after 0 usecs 
[ 0.745965][ T1] calling wq_sysfs_init+0x0/0x2f @ 1 
[ 0.746922][ T1] initcall wq_sysfs_init+0x0/0x2f returned 0 after 0 usecs 
[ 0.747307][ T1] calling ksysfs_init+0x0/0x9d @ 1 
[ 0.748263][ T1] initcall ksysfs_init+0x0/0x9d returned 0 after 0 usecs 
[ 0.749522][ T1] calling schedutil_gov_init+0x0/0x15 @ 1 
[ 0.750537][ T1] initcall schedutil_gov_init+0x0/0x15 returned 0 after 0 usecs 
[ 0.751416][ T1] calling pm_init+0x0/0x77 @ 1 
[ 0.752336][ T1] initcall pm_init+0x0/0x77 returned 0 after 0 usecs 
[ 0.753480][ T1] calling pm_disk_init+0x0/0x1c @ 1 
[ 0.754434][ T1] initcall pm_disk_init+0x0/0x1c returned 0 after 0 usecs 
[ 0.755378][ T1] calling swsusp_header_init+0x0/0x30 @ 1 
[ 0.756414][ T1] initcall swsusp_header_init+0x0/0x30 returned 0 after 0 usecs 
[ 0.757816][ T1] calling rcu_set_runtime_mode+0x0/0x1b @ 1 
[ 0.758858][ T1] initcall rcu_set_runtime_mode+0x0/0x1b returned 0 after 0 usecs 
[ 0.759395][ T1] calling init_jiffies_clocksource+0x0/0x1c @ 1 
[    0.760446][    T1] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns
[ 0.762235][ T1] initcall init_jiffies_clocksource+0x0/0x1c returned 0 after 0 usecs 
[ 0.763415][ T1] calling futex_init+0x0/0xcf @ 1 
[    0.764361][    T1] futex hash table entries: 512 (order: 3, 32768 bytes, linear)
[ 0.765718][ T1] initcall futex_init+0x0/0xcf returned 0 after 0 usecs 
[ 0.766890][ T1] calling cgroup_wq_init+0x0/0x2d @ 1 
[ 0.767275][ T1] initcall cgroup_wq_init+0x0/0x2d returned 0 after 0 usecs 
[ 0.768502][ T1] calling cgroup1_wq_init+0x0/0x2d @ 1 
[ 0.769581][ T1] initcall cgroup1_wq_init+0x0/0x2d returned 0 after 0 usecs 
[ 0.770943][ T1] calling ftrace_mod_cmd_init+0x0/0x10 @ 1 
[ 0.772006][ T1] initcall ftrace_mod_cmd_init+0x0/0x10 returned 0 after 0 usecs 
[ 0.773409][ T1] calling init_graph_trace+0x0/0x61 @ 1 
[ 0.774336][ T1] initcall init_graph_trace+0x0/0x61 returned 0 after 0 usecs 
[ 0.778960][ T1] calling trace_events_eprobe_init_early+0x0/0x2b @ 1 
[ 0.780166][ T1] initcall trace_events_eprobe_init_early+0x0/0x2b returned 0 after 0 usecs 
[ 0.781739][ T1] calling trace_events_synth_init_early+0x0/0x2b @ 1 
[ 0.782877][ T1] initcall trace_events_synth_init_early+0x0/0x2b returned 0 after 0 usecs 
[ 0.783421][ T1] calling init_kprobe_trace_early+0x0/0x2a @ 1 
[ 0.784494][ T1] initcall init_kprobe_trace_early+0x0/0x2a returned 0 after 0 usecs 
[ 0.785931][ T1] calling memory_failure_init+0x0/0x9a @ 1 
[ 0.786956][ T1] initcall memory_failure_init+0x0/0x9a returned 0 after 0 usecs 
[ 0.788354][ T1] calling cma_init_reserved_areas+0x0/0x2f @ 1 
[ 0.790834][ T1] initcall cma_init_reserved_areas+0x0/0x2f returned 0 after 0 usecs 
[ 0.791422][ T1] calling fsnotify_init+0x0/0x4d @ 1 
[ 0.792399][ T1] initcall fsnotify_init+0x0/0x4d returned 0 after 0 usecs 
[ 0.793578][ T1] calling filelock_init+0x0/0xa4 @ 1 
[ 0.794580][ T1] initcall filelock_init+0x0/0xa4 returned 0 after 0 usecs 
[ 0.795312][ T1] calling init_script_binfmt+0x0/0x1a @ 1 
[ 0.796310][ T1] initcall init_script_binfmt+0x0/0x1a returned 0 after 0 usecs 
[ 0.797654][ T1] calling init_elf_binfmt+0x0/0x1a @ 1 
[ 0.798613][ T1] initcall init_elf_binfmt+0x0/0x1a returned 0 after 0 usecs 
[ 0.799388][ T1] calling init_compat_elf_binfmt+0x0/0x1a @ 1 
[ 0.800453][ T1] initcall init_compat_elf_binfmt+0x0/0x1a returned 0 after 0 usecs 


To reproduce:

        # build kernel
	cd linux
	cp config-6.2.0-rc6-00112-g77900e3a6934 .config
	make HOSTCC=gcc-11 CC=gcc-11 ARCH=x86_64 olddefconfig prepare modules_prepare bzImage modules
	make HOSTCC=gcc-11 CC=gcc-11 ARCH=x86_64 INSTALL_MOD_PATH=<mod-install-dir> modules_install
	cd <mod-install-dir>
	find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz


        git clone https://github.com/intel/lkp-tests.git
        cd lkp-tests
        bin/lkp qemu -k <bzImage> -m modules.cgz job-script # job-script is attached in this email

        # if come across any failure that blocks the test,
        # please remove ~/.lkp and /lkp dir to run from a clean state.
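
The warning above comes from the clock assertion inside rq_clock_task():
with this patch, migrate_task_rq_fair() calls entity_is_long_sleep(), which
reads rq_clock_task() of the task's old rq, but on this wakeup path (see
try_to_wake_up() -> set_task_cpu() in the trace) the old rq's clock has not
been updated (and its lock is not necessarily held), so the assertion
fires. Roughly, the checks involved look like the sketch below, simplified
from kernel/sched/sched.h rather than copied verbatim:

/* Simplified sketch of the assertions involved (kernel/sched/sched.h). */
static inline void assert_clock_updated(struct rq *rq)
{
	/* Warns unless update_rq_clock() ran since the rq was pinned. */
	SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
}

static inline u64 rq_clock_task(struct rq *rq)
{
	lockdep_assert_rq_held(rq);
	assert_clock_updated(rq);	/* <-- the WARN seen in the log above */
	return rq->clock_task;
}

This is the same limitation Vincent points out further down in the thread
("we can't use rq_clock_task here").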
  
Vincent Guittot March 7, 2023, 10:26 a.m. UTC | #3
On Mon, 6 Mar 2023 at 14:53, Vincent Guittot <vincent.guittot@linaro.org> wrote:
>
> On Mon, 6 Mar 2023 at 13:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> >
> > Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
> > entity being placed") fix an overflowing bug, but ignore
> > a case that se->exec_start is reset after a migration.
> >
> > For fixing this case, we reset the vruntime of a long
> > sleeping task in migrate_task_rq_fair().
> >
> > Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed")
> > Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
> > Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
>
> Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
>
> > ---
> >
> > v1 -> v2:
> > - fix some typos and update comments
> > - reformat the patch
> >
> > ---
> >  kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++-------------
> >  1 file changed, 55 insertions(+), 21 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 7a1b1f855b96..74c9918ffe76 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4648,11 +4648,45 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >  #endif
> >  }
> >
> > +static inline bool entity_is_long_sleep(struct sched_entity *se)
> > +{
> > +       struct cfs_rq *cfs_rq;
> > +       u64 sleep_time;
> > +
> > +       if (se->exec_start == 0)
> > +               return false;
> > +
> > +       cfs_rq = cfs_rq_of(se);
> > +       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> > +       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> > +               return true;
> > +
> > +       return false;
> > +}
> > +
> > +static inline u64 sched_sleeper_credit(struct sched_entity *se)
> > +{
> > +       unsigned long thresh;
> > +
> > +       if (se_is_idle(se))
> > +               thresh = sysctl_sched_min_granularity;
> > +       else
> > +               thresh = sysctl_sched_latency;
> > +
> > +       /*
> > +        * Halve their sleep time's effect, to allow
> > +        * for a gentler effect of sleepers:
> > +        */
> > +       if (sched_feat(GENTLE_FAIR_SLEEPERS))
> > +               thresh >>= 1;
> > +
> > +       return thresh;
> > +}
> > +
> >  static void
> >  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >  {
> >         u64 vruntime = cfs_rq->min_vruntime;
> > -       u64 sleep_time;
> >
> >         /*
> >          * The 'current' period is already promised to the current tasks,
> > @@ -4664,23 +4698,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >                 vruntime += sched_vslice(cfs_rq, se);
> >
> >         /* sleeps up to a single latency don't count. */
> > -       if (!initial) {
> > -               unsigned long thresh;
> > -
> > -               if (se_is_idle(se))
> > -                       thresh = sysctl_sched_min_granularity;
> > -               else
> > -                       thresh = sysctl_sched_latency;
> > -
> > -               /*
> > -                * Halve their sleep time's effect, to allow
> > -                * for a gentler effect of sleepers:
> > -                */
> > -               if (sched_feat(GENTLE_FAIR_SLEEPERS))
> > -                       thresh >>= 1;
> > -
> > -               vruntime -= thresh;
> > -       }
> > +       if (!initial)
> > +               vruntime -= sched_sleeper_credit(se);
> >
> >         /*
> >          * Pull vruntime of the entity being placed to the base level of
> > @@ -4689,8 +4708,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >          * the base as it may be too far off and the comparison may get
> >          * inversed due to s64 overflow.
> >          */
> > -       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> > -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> > +       if (entity_is_long_sleep(se))
> >                 se->vruntime = vruntime;
> >         else
> >                 se->vruntime = max_vruntime(se->vruntime, vruntime);
> > @@ -7635,7 +7653,23 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> >         if (READ_ONCE(p->__state) == TASK_WAKING) {
> >                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
> >
> > -               se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> > +               /*
> > +                * We determine whether a task sleeps for long by checking
> > +                * se->exec_start, and if it is, we sanitize its vruntime at
> > +                * place_entity(). However, after a migration, this detection
> > +                * method fails due to se->exec_start being reset.
> > +                *
> > +                * For fixing this case, we add the same check here. For a task
> > +                * which has slept for a long time, its vruntime should be reset
> > +                * to cfs_rq->min_vruntime with a sleep credit. Because waking
> > +                * task's vruntime will be added to cfs_rq->min_vruntime when
> > +                * enqueue, we only need to reset the se->vruntime of waking task
> > +                * to a credit here.
> > +                */
> > +               if (entity_is_long_sleep(se))

I completely overlooked that we can't use rq_clock_task here. Need to
think a bit more on this

> > +                       se->vruntime = -sched_sleeper_credit(se);
> > +               else
> > +                       se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> >         }
> >
> >         if (!task_on_rq_migrating(p)) {
> > --
> > 2.17.1
> >
  
Zhang Qiao March 7, 2023, 11:05 a.m. UTC | #4
On 2023/3/7 18:26, Vincent Guittot wrote:
> On Mon, 6 Mar 2023 at 14:53, Vincent Guittot <vincent.guittot@linaro.org> wrote:
>>
>> On Mon, 6 Mar 2023 at 13:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>>>
>>> Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
>>> entity being placed") fix an overflowing bug, but ignore
>>> a case that se->exec_start is reset after a migration.
>>>
>>> For fixing this case, we reset the vruntime of a long
>>> sleeping task in migrate_task_rq_fair().
>>>
>>> Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed")
>>> Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
>>> Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
>>
>> Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
>>
>>> ---
>>>
>>> v1 -> v2:
>>> - fix some typos and update comments
>>> - reformat the patch
>>>
>>> ---
>>>  kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++-------------
>>>  1 file changed, 55 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 7a1b1f855b96..74c9918ffe76 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -4648,11 +4648,45 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>>  #endif
>>>  }
>>>
>>> +static inline bool entity_is_long_sleep(struct sched_entity *se)
>>> +{
>>> +       struct cfs_rq *cfs_rq;
>>> +       u64 sleep_time;
>>> +
>>> +       if (se->exec_start == 0)
>>> +               return false;
>>> +
>>> +       cfs_rq = cfs_rq_of(se);
>>> +       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
>>> +       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
>>> +               return true;
>>> +
>>> +       return false;
>>> +}
>>> +
>>> +static inline u64 sched_sleeper_credit(struct sched_entity *se)
>>> +{
>>> +       unsigned long thresh;
>>> +
>>> +       if (se_is_idle(se))
>>> +               thresh = sysctl_sched_min_granularity;
>>> +       else
>>> +               thresh = sysctl_sched_latency;
>>> +
>>> +       /*
>>> +        * Halve their sleep time's effect, to allow
>>> +        * for a gentler effect of sleepers:
>>> +        */
>>> +       if (sched_feat(GENTLE_FAIR_SLEEPERS))
>>> +               thresh >>= 1;
>>> +
>>> +       return thresh;
>>> +}
>>> +
>>>  static void
>>>  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>>  {
>>>         u64 vruntime = cfs_rq->min_vruntime;
>>> -       u64 sleep_time;
>>>
>>>         /*
>>>          * The 'current' period is already promised to the current tasks,
>>> @@ -4664,23 +4698,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>>                 vruntime += sched_vslice(cfs_rq, se);
>>>
>>>         /* sleeps up to a single latency don't count. */
>>> -       if (!initial) {
>>> -               unsigned long thresh;
>>> -
>>> -               if (se_is_idle(se))
>>> -                       thresh = sysctl_sched_min_granularity;
>>> -               else
>>> -                       thresh = sysctl_sched_latency;
>>> -
>>> -               /*
>>> -                * Halve their sleep time's effect, to allow
>>> -                * for a gentler effect of sleepers:
>>> -                */
>>> -               if (sched_feat(GENTLE_FAIR_SLEEPERS))
>>> -                       thresh >>= 1;
>>> -
>>> -               vruntime -= thresh;
>>> -       }
>>> +       if (!initial)
>>> +               vruntime -= sched_sleeper_credit(se);
>>>
>>>         /*
>>>          * Pull vruntime of the entity being placed to the base level of
>>> @@ -4689,8 +4708,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>>          * the base as it may be too far off and the comparison may get
>>>          * inversed due to s64 overflow.
>>>          */
>>> -       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
>>> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
>>> +       if (entity_is_long_sleep(se))
>>>                 se->vruntime = vruntime;
>>>         else
>>>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
>>> @@ -7635,7 +7653,23 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>>>         if (READ_ONCE(p->__state) == TASK_WAKING) {
>>>                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
>>>
>>> -               se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
>>> +               /*
>>> +                * We determine whether a task sleeps for long by checking
>>> +                * se->exec_start, and if it is, we sanitize its vruntime at
>>> +                * place_entity(). However, after a migration, this detection
>>> +                * method fails due to se->exec_start being reset.
>>> +                *
>>> +                * For fixing this case, we add the same check here. For a task
>>> +                * which has slept for a long time, its vruntime should be reset
>>> +                * to cfs_rq->min_vruntime with a sleep credit. Because waking
>>> +                * task's vruntime will be added to cfs_rq->min_vruntime when
>>> +                * enqueue, we only need to reset the se->vruntime of waking task
>>> +                * to a credit here.
>>> +                */
>>> +               if (entity_is_long_sleep(se))
> 
> I completely overlooked that we can't use rq_clock_task here. Need to
> think a bit more on this

Yes, I am trying to solve it.

> 
>>> +                       se->vruntime = -sched_sleeper_credit(se);
>>> +               else
>>> +                       se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
>>>         }
>>>
>>>         if (!task_on_rq_migrating(p)) {
>>> --
>>> 2.17.1
>>>
> .
>
  
Dietmar Eggemann March 7, 2023, 12:45 p.m. UTC | #5
On 06/03/2023 14:24, Zhang Qiao wrote:
> Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
> entity being placed") fix an overflowing bug, but ignore
> a case that se->exec_start is reset after a migration.
> 
> For fixing this case, we reset the vruntime of a long
> sleeping task in migrate_task_rq_fair().
> 
> Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed")
> Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
> Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>

[...]

> @@ -7635,7 +7653,23 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>  	if (READ_ONCE(p->__state) == TASK_WAKING) {
>  		struct cfs_rq *cfs_rq = cfs_rq_of(se);
>  
> -		se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> +		/*
> +		 * We determine whether a task sleeps for long by checking
> +		 * se->exec_start, and if it is, we sanitize its vruntime at
> +		 * place_entity(). However, after a migration, this detection
> +		 * method fails due to se->exec_start being reset.
> +		 *
> +		 * For fixing this case, we add the same check here. For a task
> +		 * which has slept for a long time, its vruntime should be reset
> +		 * to cfs_rq->min_vruntime with a sleep credit. Because waking
> +		 * task's vruntime will be added to cfs_rq->min_vruntime when

Isn't this the other way around? `vruntime += min_vruntime`

> +		 * enqueue, we only need to reset the se->vruntime of waking task
> +		 * to a credit here.

You don't reset it to the credit, you subtract the credit from vruntime?

I assume this is done to have the sleeper credit accounted on both
sides (se->vruntime and vruntime) of `se->vruntime =
max_vruntime(se->vruntime, vruntime)` in place_entity(), since
entity_is_long_sleep(se) is false for a remote wakeup because `se->exec_start=0`.


> +		 */
> +		if (entity_is_long_sleep(se))
> +			se->vruntime = -sched_sleeper_credit(se);
> +		else
> +			se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);

Not sure I understand this part.
Don't we have to do `vruntime -= min_vruntime` here for long sleeping
task as well?

Since we always do the `vruntime += min_vruntime` on the new CPU for a
remote wakeup.

[...]
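
For reference, the convention being discussed: on a wakeup migration the
task's vruntime is made relative to the old cfs_rq in migrate_task_rq_fair()
(subtract that cfs_rq's min_vruntime) and is made absolute again when the
task is enqueued on the new cfs_rq (add the new cfs_rq's min_vruntime). A
toy model of that round trip, with illustrative values rather than real
kernel state:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t old_min = 5000, new_min = 9000;  /* min_vruntime of old/new cfs_rq */
	uint64_t vruntime = 5300;                 /* task's vruntime on the old cfs_rq */

	vruntime -= old_min;   /* migrate_task_rq_fair(): relative vruntime (lag = 300) */
	vruntime += new_min;   /* enqueue_entity() on the new CPU: absolute again */

	printf("lag preserved: %llu\n", (unsigned long long)(vruntime - new_min));
	return 0;
}

With the patch, a long sleeper's relative vruntime is instead set to
-sched_sleeper_credit(se), so after the addition on the new CPU it lands at
the new cfs_rq's min_vruntime minus the sleeper credit, which matches the
placement place_entity() gives a long sleeper it detects itself.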
  
Zhang Qiao March 7, 2023, 1:41 p.m. UTC | #6
On 2023/3/7 18:26, Vincent Guittot wrote:
> On Mon, 6 Mar 2023 at 14:53, Vincent Guittot <vincent.guittot@linaro.org> wrote:
>>
>> On Mon, 6 Mar 2023 at 13:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>>>
>>> Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
>>> entity being placed") fix an overflowing bug, but ignore
>>> a case that se->exec_start is reset after a migration.
>>>
>>> For fixing this case, we reset the vruntime of a long
>>> sleeping task in migrate_task_rq_fair().
>>>
>>> Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed")
>>> Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
>>> Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
>>
>> Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
>>
>>> ---
>>>
>>> v1 -> v2:
>>> - fix some typos and update comments
>>> - reformat the patch
>>>
>>> ---
>>>  kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++-------------
>>>  1 file changed, 55 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 7a1b1f855b96..74c9918ffe76 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -4648,11 +4648,45 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>>  #endif
>>>  }
>>>
>>> +static inline bool entity_is_long_sleep(struct sched_entity *se)
>>> +{
>>> +       struct cfs_rq *cfs_rq;
>>> +       u64 sleep_time;
>>> +
>>> +       if (se->exec_start == 0)
>>> +               return false;
>>> +
>>> +       cfs_rq = cfs_rq_of(se);
>>> +       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
>>> +       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
>>> +               return true;
>>> +
>>> +       return false;
>>> +}
>>> +
>>> +static inline u64 sched_sleeper_credit(struct sched_entity *se)
>>> +{
>>> +       unsigned long thresh;
>>> +
>>> +       if (se_is_idle(se))
>>> +               thresh = sysctl_sched_min_granularity;
>>> +       else
>>> +               thresh = sysctl_sched_latency;
>>> +
>>> +       /*
>>> +        * Halve their sleep time's effect, to allow
>>> +        * for a gentler effect of sleepers:
>>> +        */
>>> +       if (sched_feat(GENTLE_FAIR_SLEEPERS))
>>> +               thresh >>= 1;
>>> +
>>> +       return thresh;
>>> +}
>>> +
>>>  static void
>>>  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>>  {
>>>         u64 vruntime = cfs_rq->min_vruntime;
>>> -       u64 sleep_time;
>>>
>>>         /*
>>>          * The 'current' period is already promised to the current tasks,
>>> @@ -4664,23 +4698,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>>                 vruntime += sched_vslice(cfs_rq, se);
>>>
>>>         /* sleeps up to a single latency don't count. */
>>> -       if (!initial) {
>>> -               unsigned long thresh;
>>> -
>>> -               if (se_is_idle(se))
>>> -                       thresh = sysctl_sched_min_granularity;
>>> -               else
>>> -                       thresh = sysctl_sched_latency;
>>> -
>>> -               /*
>>> -                * Halve their sleep time's effect, to allow
>>> -                * for a gentler effect of sleepers:
>>> -                */
>>> -               if (sched_feat(GENTLE_FAIR_SLEEPERS))
>>> -                       thresh >>= 1;
>>> -
>>> -               vruntime -= thresh;
>>> -       }
>>> +       if (!initial)
>>> +               vruntime -= sched_sleeper_credit(se);
>>>
>>>         /*
>>>          * Pull vruntime of the entity being placed to the base level of
>>> @@ -4689,8 +4708,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>>          * the base as it may be too far off and the comparison may get
>>>          * inversed due to s64 overflow.
>>>          */
>>> -       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
>>> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
>>> +       if (entity_is_long_sleep(se))
>>>                 se->vruntime = vruntime;
>>>         else
>>>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
>>> @@ -7635,7 +7653,23 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>>>         if (READ_ONCE(p->__state) == TASK_WAKING) {
>>>                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
>>>
>>> -               se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
>>> +               /*
>>> +                * We determine whether a task sleeps for long by checking
>>> +                * se->exec_start, and if it is, we sanitize its vruntime at
>>> +                * place_entity(). However, after a migration, this detection
>>> +                * method fails due to se->exec_start being reset.
>>> +                *
>>> +                * For fixing this case, we add the same check here. For a task
>>> +                * which has slept for a long time, its vruntime should be reset
>>> +                * to cfs_rq->min_vruntime with a sleep credit. Because waking
>>> +                * task's vruntime will be added to cfs_rq->min_vruntime when
>>> +                * enqueue, we only need to reset the se->vruntime of waking task
>>> +                * to a credit here.
>>> +                */
>>> +               if (entity_is_long_sleep(se))
> 
> I completely overlooked that we can't use rq_clock_task here. Need to
> think a bit more on this

Hi,Vincent,

How about using the exec_start of the parent sched_entity instead of rq_clock_task()?

If sibling sched_entities were running while the waking sched_entity slept,
then `pse->exec_start` is up to date.
If the cfs_rq was idle, we don't have a problem of possible overflow.

static inline bool vruntime_need_sanitized(struct sched_entity *se)
{
	struct sched_entity *pse = parent_entity(se);
	u64 diff;

	if (se->exec_start == 0 || !pse)
		return false;

	diff = pse->exec_start - se->exec_start;
	if ((s64)diff > 60LL * NSEC_PER_SEC)
		return true;

	return false;
}

Thanks.
ZhangQiao.

> 
>>> +                       se->vruntime = -sched_sleeper_credit(se);
>>> +               else
>>> +                       se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
>>>         }
>>>
>>>         if (!task_on_rq_migrating(p)) {
>>> --
>>> 2.17.1
>>>
> .
>
  
Zhang Qiao March 7, 2023, 2:06 p.m. UTC | #7
On 2023/3/7 20:45, Dietmar Eggemann wrote:
> On 06/03/2023 14:24, Zhang Qiao wrote:
>> Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
>> entity being placed") fix an overflowing bug, but ignore
>> a case that se->exec_start is reset after a migration.
>>
>> For fixing this case, we reset the vruntime of a long
>> sleeping task in migrate_task_rq_fair().
>>
>> Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed")
>> Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
>> Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
> 
> [...]
> 
>> @@ -7635,7 +7653,23 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>>  	if (READ_ONCE(p->__state) == TASK_WAKING) {
>>  		struct cfs_rq *cfs_rq = cfs_rq_of(se);
>>  
>> -		se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
>> +		/*
>> +		 * We determine whether a task sleeps for long by checking
>> +		 * se->exec_start, and if it is, we sanitize its vruntime at
>> +		 * place_entity(). However, after a migration, this detection
>> +		 * method fails due to se->exec_start being reset.
>> +		 *
>> +		 * For fixing this case, we add the same check here. For a task
>> +		 * which has slept for a long time, its vruntime should be reset
>> +		 * to cfs_rq->min_vruntime with a sleep credit. Because waking
>> +		 * task's vruntime will be added to cfs_rq->min_vruntime when
> 
> Isn't this the other way around? `vruntime += min_vruntime`
> 
>> +		 * enqueue, we only need to reset the se->vruntime of waking task
>> +		 * to a credit here.
> 
> You not reset it to credit, you subtract the credit from vruntime ?
> 
> I assume this is done to have sleeper credit accounted on both
> (se->vruntime and vruntime) for `se->vruntime =
> max_vruntime(se->vruntime, vruntime)` in place_entity() since
> entity_is_long_sleep(se)=false for a remove wakeup since `se->exec_start=0`.
> 
> 
>> +		 */
>> +		if (entity_is_long_sleep(se))
>> +			se->vruntime = -sched_sleeper_credit(se);
>> +		else
>> +			se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> 
> Not sure I understand this part.
> Don't we have to do `vruntime -= min_vruntime` here for long sleeping
> task as well?

Hi, Dietmar,

At this time, `se->vruntime - min_vruntime` may be greater than s64 max as well.

thanks,
ZhangQiao

> 
> Since we always do the `vruntime += min_vruntime` on the new CPU for a
> remote wakeup.
> 
> [...]
> 
> .
>
  
Vincent Guittot March 8, 2023, 8:01 a.m. UTC | #8
On Tue, 7 Mar 2023 at 14:41, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>
>
>
> On 2023/3/7 18:26, Vincent Guittot wrote:
> > On Mon, 6 Mar 2023 at 14:53, Vincent Guittot <vincent.guittot@linaro.org> wrote:
> >>
> >> On Mon, 6 Mar 2023 at 13:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> >>>
> >>> Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
> >>> entity being placed") fix an overflowing bug, but ignore
> >>> a case that se->exec_start is reset after a migration.
> >>>
> >>> For fixing this case, we reset the vruntime of a long
> >>> sleeping task in migrate_task_rq_fair().
> >>>
> >>> Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed")
> >>> Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
> >>> Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
> >>
> >> Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
> >>
> >>> ---
> >>>
> >>> v1 -> v2:
> >>> - fix some typos and update comments
> >>> - reformat the patch
> >>>
> >>> ---
> >>>  kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++-------------
> >>>  1 file changed, 55 insertions(+), 21 deletions(-)
> >>>
> >>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>> index 7a1b1f855b96..74c9918ffe76 100644
> >>> --- a/kernel/sched/fair.c
> >>> +++ b/kernel/sched/fair.c
> >>> @@ -4648,11 +4648,45 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >>>  #endif
> >>>  }
> >>>
> >>> +static inline bool entity_is_long_sleep(struct sched_entity *se)
> >>> +{
> >>> +       struct cfs_rq *cfs_rq;
> >>> +       u64 sleep_time;
> >>> +
> >>> +       if (se->exec_start == 0)
> >>> +               return false;
> >>> +
> >>> +       cfs_rq = cfs_rq_of(se);
> >>> +       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> >>> +       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> >>> +               return true;
> >>> +
> >>> +       return false;
> >>> +}
> >>> +
> >>> +static inline u64 sched_sleeper_credit(struct sched_entity *se)
> >>> +{
> >>> +       unsigned long thresh;
> >>> +
> >>> +       if (se_is_idle(se))
> >>> +               thresh = sysctl_sched_min_granularity;
> >>> +       else
> >>> +               thresh = sysctl_sched_latency;
> >>> +
> >>> +       /*
> >>> +        * Halve their sleep time's effect, to allow
> >>> +        * for a gentler effect of sleepers:
> >>> +        */
> >>> +       if (sched_feat(GENTLE_FAIR_SLEEPERS))
> >>> +               thresh >>= 1;
> >>> +
> >>> +       return thresh;
> >>> +}
> >>> +
> >>>  static void
> >>>  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >>>  {
> >>>         u64 vruntime = cfs_rq->min_vruntime;
> >>> -       u64 sleep_time;
> >>>
> >>>         /*
> >>>          * The 'current' period is already promised to the current tasks,
> >>> @@ -4664,23 +4698,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >>>                 vruntime += sched_vslice(cfs_rq, se);
> >>>
> >>>         /* sleeps up to a single latency don't count. */
> >>> -       if (!initial) {
> >>> -               unsigned long thresh;
> >>> -
> >>> -               if (se_is_idle(se))
> >>> -                       thresh = sysctl_sched_min_granularity;
> >>> -               else
> >>> -                       thresh = sysctl_sched_latency;
> >>> -
> >>> -               /*
> >>> -                * Halve their sleep time's effect, to allow
> >>> -                * for a gentler effect of sleepers:
> >>> -                */
> >>> -               if (sched_feat(GENTLE_FAIR_SLEEPERS))
> >>> -                       thresh >>= 1;
> >>> -
> >>> -               vruntime -= thresh;
> >>> -       }
> >>> +       if (!initial)
> >>> +               vruntime -= sched_sleeper_credit(se);
> >>>
> >>>         /*
> >>>          * Pull vruntime of the entity being placed to the base level of
> >>> @@ -4689,8 +4708,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >>>          * the base as it may be too far off and the comparison may get
> >>>          * inversed due to s64 overflow.
> >>>          */
> >>> -       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> >>> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> >>> +       if (entity_is_long_sleep(se))
> >>>                 se->vruntime = vruntime;
> >>>         else
> >>>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
> >>> @@ -7635,7 +7653,23 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> >>>         if (READ_ONCE(p->__state) == TASK_WAKING) {
> >>>                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
> >>>
> >>> -               se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> >>> +               /*
> >>> +                * We determine whether a task sleeps for long by checking
> >>> +                * se->exec_start, and if it is, we sanitize its vruntime at
> >>> +                * place_entity(). However, after a migration, this detection
> >>> +                * method fails due to se->exec_start being reset.
> >>> +                *
> >>> +                * For fixing this case, we add the same check here. For a task
> >>> +                * which has slept for a long time, its vruntime should be reset
> >>> +                * to cfs_rq->min_vruntime with a sleep credit. Because waking
> >>> +                * task's vruntime will be added to cfs_rq->min_vruntime when
> >>> +                * enqueue, we only need to reset the se->vruntime of waking task
> >>> +                * to a credit here.
> >>> +                */
> >>> +               if (entity_is_long_sleep(se))
> >
> > I completely overlooked that we can't use rq_clock_task here. Need to
> > think a bit more on this
>
> Hi,Vincent,
>
> How about using exec_start of the parent sched_entity instant of rq_clock_task()?

How do we handle a sched_entity without a parent sched_entity?



>
> If during the waking sched_entity sleep, there are sibling sched_entity running,
> than `pse->exec_start` was up to date.
> If the cfs_rq was idle, we don't have a problem of possible overflow.
>
> static inline bool vruntime_need_sanitized(struct sched_entity *se)
> {
>         struct sched_entity *pse = parent_entity(se);
>         u64 diff ;
>
>         if (se->exec_start == 0 || !pse)
>                 return false;
>
>         diff = pse->exec_start - se->exec_start;
>         if ((s64)diff > 60LL * NSEC_PER_SEC)
>                 return true;
>
>         return false;
> }
>
> Thanks.
> ZhangQiao.
>
> >
> >>> +                       se->vruntime = -sched_sleeper_credit(se);
> >>> +               else
> >>> +                       se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> >>>         }
> >>>
> >>>         if (!task_on_rq_migrating(p)) {
> >>> --
> >>> 2.17.1
> >>>
> > .
> >
  
Vincent Guittot March 8, 2023, 12:55 p.m. UTC | #9
On Wednesday, 08 March 2023 at 09:01:05 (+0100), Vincent Guittot wrote:
> On Tue, 7 Mar 2023 at 14:41, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> >
> >
> >
> > On 2023/3/7 18:26, Vincent Guittot wrote:
> > > On Mon, 6 Mar 2023 at 14:53, Vincent Guittot <vincent.guittot@linaro.org> wrote:
> > >>
> > >> On Mon, 6 Mar 2023 at 13:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> > >>>
> > >>> Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
> > >>> entity being placed") fix an overflowing bug, but ignore
> > >>> a case that se->exec_start is reset after a migration.
> > >>>
> > >>> For fixing this case, we reset the vruntime of a long
> > >>> sleeping task in migrate_task_rq_fair().
> > >>>
> > >>> Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed")
> > >>> Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
> > >>> Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
> > >>
> > >> Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
> > >>
> > >>> ---
> > >>>
> > >>> v1 -> v2:
> > >>> - fix some typos and update comments
> > >>> - reformat the patch
> > >>>
> > >>> ---
> > >>>  kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++-------------
> > >>>  1 file changed, 55 insertions(+), 21 deletions(-)
> > >>>
> > >>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > >>> index 7a1b1f855b96..74c9918ffe76 100644
> > >>> --- a/kernel/sched/fair.c
> > >>> +++ b/kernel/sched/fair.c
> > >>> @@ -4648,11 +4648,45 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > >>>  #endif
> > >>>  }
> > >>>
> > >>> +static inline bool entity_is_long_sleep(struct sched_entity *se)
> > >>> +{
> > >>> +       struct cfs_rq *cfs_rq;
> > >>> +       u64 sleep_time;
> > >>> +
> > >>> +       if (se->exec_start == 0)
> > >>> +               return false;
> > >>> +
> > >>> +       cfs_rq = cfs_rq_of(se);
> > >>> +       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> > >>> +       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> > >>> +               return true;
> > >>> +
> > >>> +       return false;
> > >>> +}
> > >>> +
> > >>> +static inline u64 sched_sleeper_credit(struct sched_entity *se)
> > >>> +{
> > >>> +       unsigned long thresh;
> > >>> +
> > >>> +       if (se_is_idle(se))
> > >>> +               thresh = sysctl_sched_min_granularity;
> > >>> +       else
> > >>> +               thresh = sysctl_sched_latency;
> > >>> +
> > >>> +       /*
> > >>> +        * Halve their sleep time's effect, to allow
> > >>> +        * for a gentler effect of sleepers:
> > >>> +        */
> > >>> +       if (sched_feat(GENTLE_FAIR_SLEEPERS))
> > >>> +               thresh >>= 1;
> > >>> +
> > >>> +       return thresh;
> > >>> +}
> > >>> +
> > >>>  static void
> > >>>  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> > >>>  {
> > >>>         u64 vruntime = cfs_rq->min_vruntime;
> > >>> -       u64 sleep_time;
> > >>>
> > >>>         /*
> > >>>          * The 'current' period is already promised to the current tasks,
> > >>> @@ -4664,23 +4698,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> > >>>                 vruntime += sched_vslice(cfs_rq, se);
> > >>>
> > >>>         /* sleeps up to a single latency don't count. */
> > >>> -       if (!initial) {
> > >>> -               unsigned long thresh;
> > >>> -
> > >>> -               if (se_is_idle(se))
> > >>> -                       thresh = sysctl_sched_min_granularity;
> > >>> -               else
> > >>> -                       thresh = sysctl_sched_latency;
> > >>> -
> > >>> -               /*
> > >>> -                * Halve their sleep time's effect, to allow
> > >>> -                * for a gentler effect of sleepers:
> > >>> -                */
> > >>> -               if (sched_feat(GENTLE_FAIR_SLEEPERS))
> > >>> -                       thresh >>= 1;
> > >>> -
> > >>> -               vruntime -= thresh;
> > >>> -       }
> > >>> +       if (!initial)
> > >>> +               vruntime -= sched_sleeper_credit(se);
> > >>>
> > >>>         /*
> > >>>          * Pull vruntime of the entity being placed to the base level of
> > >>> @@ -4689,8 +4708,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> > >>>          * the base as it may be too far off and the comparison may get
> > >>>          * inversed due to s64 overflow.
> > >>>          */
> > >>> -       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> > >>> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> > >>> +       if (entity_is_long_sleep(se))
> > >>>                 se->vruntime = vruntime;
> > >>>         else
> > >>>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
> > >>> @@ -7635,7 +7653,23 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> > >>>         if (READ_ONCE(p->__state) == TASK_WAKING) {
> > >>>                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
> > >>>
> > >>> -               se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> > >>> +               /*
> > >>> +                * We determine whether a task sleeps for long by checking
> > >>> +                * se->exec_start, and if it is, we sanitize its vruntime at
> > >>> +                * place_entity(). However, after a migration, this detection
> > >>> +                * method fails due to se->exec_start being reset.
> > >>> +                *
> > >>> +                * For fixing this case, we add the same check here. For a task
> > >>> +                * which has slept for a long time, its vruntime should be reset
> > >>> +                * to cfs_rq->min_vruntime with a sleep credit. Because waking
> > >>> +                * task's vruntime will be added to cfs_rq->min_vruntime when
> > >>> +                * enqueue, we only need to reset the se->vruntime of waking task
> > >>> +                * to a credit here.
> > >>> +                */
> > >>> +               if (entity_is_long_sleep(se))
> > >
> > > I completely overlooked that we can't use rq_clock_task here. Need to
> > > think a bit more on this
> >
> > Hi,Vincent,
> >
> > How about using exec_start of the parent sched_entity instant of rq_clock_task()?
> 
> How do we handle sched_entity without a parent sched_entity ?

The change below should do the trick. Note that we can't use it in place_entity() because
PELT is not enabled in UP.

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 74c9918ffe76..b8b381b0ff20 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7635,6 +7635,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
        return new_cpu;
 }

+static inline bool migrate_long_sleeper(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq;
+       u64 sleep_time;
+
+       if (se->exec_start == 0)
+               return false;
+
+       cfs_rq = cfs_rq_of(se);
+       /*
+        * If the entity slept for a long time, don't even try to normalize its
+        * vruntime with the base as it may be too far off and might generate
+        * wrong decision because of s64 overflow.
+        * We estimate its sleep duration with the last update of se's pelt.
+        * The last update happened before sleeping. The cfs' pelt is not
+        * always updated when cfs is idle but this is not a problem because
+        * its min_vruntime is not updated too, so the situation can't get
+        * worse.
+        */
+       sleep_time = cfs_rq_last_update_time(cfs_rq) - se->avg.last_update_time;
+       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
+               return true;
+
+       return false;
+}
+
 /*
  * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
@@ -7666,7 +7692,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
                 * enqueue, we only need to reset the se->vruntime of waking task
                 * to a credit here.
                 */
-               if (entity_is_long_sleep(se))
+               if (migrate_long_sleeper(se))
                        se->vruntime = -sched_sleeper_credit(se);
                else
                        se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
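
A note on the helper used above: cfs_rq_last_update_time() returns the PELT
timestamp of the cfs_rq, i.e. the last time its load averages were updated.
Unlike rq_clock_task(), it only reads an already-stored value, so it needs
neither the remote rq's lock nor an up-to-date rq clock, which is what makes
it usable from this wakeup/migration path. Its shape is roughly the
following (a simplified sketch; the real helper in fair.c also handles the
32-bit load/copy mechanics):

static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
{
	return cfs_rq->avg.last_update_time;
}

The trade-off, as the comment in the hunk says, is that the estimate can be
stale when the cfs_rq has been idle, but then its min_vruntime has not moved
either, so the situation cannot get worse.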



> 
> 
> 
> > >
> > >>> +                       se->vruntime = -sched_sleeper_credit(se);
> > >>> +               else
> > >>> +                       se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> > >>>         }
> > >>>
> > >>>         if (!task_on_rq_migrating(p)) {
> > >>> --
> > >>> 2.17.1
> > >>>
> > > .
> > >
  
Chen Yu March 8, 2023, 2:33 p.m. UTC | #10
On 2023-03-06 at 21:24:18 +0800, Zhang Qiao wrote:
> Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
> entity being placed") fix an overflowing bug, but ignore
> a case that se->exec_start is reset after a migration.
> 
> For fixing this case, we reset the vruntime of a long
> sleeping task in migrate_task_rq_fair().
> 
> Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed")
> Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
> Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
> ---
> 
> v1 -> v2:
> - fix some typos and update comments
> - reformat the patch
> 
> ---
>  kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++-------------
>  1 file changed, 55 insertions(+), 21 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7a1b1f855b96..74c9918ffe76 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4648,11 +4648,45 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  #endif
>  }
>  
> +static inline bool entity_is_long_sleep(struct sched_entity *se)
> +{
> +	struct cfs_rq *cfs_rq;
> +	u64 sleep_time;
> +
> +	if (se->exec_start == 0)
> +		return false;
> +
Just 2 cents: can we add a comment above to mention that, because
the se is a migrated one and its vruntime has already been taken
care of during migration, there is no need to further massage its
vruntime against cfs_rq->min_vruntime?

thanks,
Chenyu
  
Zhang Qiao March 9, 2023, 8:37 a.m. UTC | #11
On 2023/3/8 20:55, Vincent Guittot wrote:
> On Wednesday, 08 March 2023 at 09:01:05 (+0100), Vincent Guittot wrote:
>> On Tue, 7 Mar 2023 at 14:41, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>>>
>>>
>>>
>>> 在 2023/3/7 18:26, Vincent Guittot 写道:
>>>> On Mon, 6 Mar 2023 at 14:53, Vincent Guittot <vincent.guittot@linaro.org> wrote:
>>>>>
>>>>> On Mon, 6 Mar 2023 at 13:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>>>>>>
>>>>>> Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
>>>>>> entity being placed") fix an overflowing bug, but ignore
>>>>>> a case that se->exec_start is reset after a migration.
>>>>>>
>>>>>> For fixing this case, we reset the vruntime of a long
>>>>>> sleeping task in migrate_task_rq_fair().
>>>>>>
>>>>>> Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed")
>>>>>> Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
>>>>>> Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
>>>>>
>>>>> Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
>>>>>
>>>>>> ---
>>>>>>
>>>>>> v1 -> v2:
>>>>>> - fix some typos and update comments
>>>>>> - reformat the patch
>>>>>>
>>>>>> ---
>>>>>>  kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++-------------
>>>>>>  1 file changed, 55 insertions(+), 21 deletions(-)
>>>>>>
>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>>> index 7a1b1f855b96..74c9918ffe76 100644
>>>>>> --- a/kernel/sched/fair.c
>>>>>> +++ b/kernel/sched/fair.c
>>>>>> @@ -4648,11 +4648,45 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>>>>>  #endif
>>>>>>  }
>>>>>>
>>>>>> +static inline bool entity_is_long_sleep(struct sched_entity *se)
>>>>>> +{
>>>>>> +       struct cfs_rq *cfs_rq;
>>>>>> +       u64 sleep_time;
>>>>>> +
>>>>>> +       if (se->exec_start == 0)
>>>>>> +               return false;
>>>>>> +
>>>>>> +       cfs_rq = cfs_rq_of(se);
>>>>>> +       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
>>>>>> +       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
>>>>>> +               return true;
>>>>>> +
>>>>>> +       return false;
>>>>>> +}
>>>>>> +
>>>>>> +static inline u64 sched_sleeper_credit(struct sched_entity *se)
>>>>>> +{
>>>>>> +       unsigned long thresh;
>>>>>> +
>>>>>> +       if (se_is_idle(se))
>>>>>> +               thresh = sysctl_sched_min_granularity;
>>>>>> +       else
>>>>>> +               thresh = sysctl_sched_latency;
>>>>>> +
>>>>>> +       /*
>>>>>> +        * Halve their sleep time's effect, to allow
>>>>>> +        * for a gentler effect of sleepers:
>>>>>> +        */
>>>>>> +       if (sched_feat(GENTLE_FAIR_SLEEPERS))
>>>>>> +               thresh >>= 1;
>>>>>> +
>>>>>> +       return thresh;
>>>>>> +}
>>>>>> +
>>>>>>  static void
>>>>>>  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>>>>>  {
>>>>>>         u64 vruntime = cfs_rq->min_vruntime;
>>>>>> -       u64 sleep_time;
>>>>>>
>>>>>>         /*
>>>>>>          * The 'current' period is already promised to the current tasks,
>>>>>> @@ -4664,23 +4698,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>>>>>                 vruntime += sched_vslice(cfs_rq, se);
>>>>>>
>>>>>>         /* sleeps up to a single latency don't count. */
>>>>>> -       if (!initial) {
>>>>>> -               unsigned long thresh;
>>>>>> -
>>>>>> -               if (se_is_idle(se))
>>>>>> -                       thresh = sysctl_sched_min_granularity;
>>>>>> -               else
>>>>>> -                       thresh = sysctl_sched_latency;
>>>>>> -
>>>>>> -               /*
>>>>>> -                * Halve their sleep time's effect, to allow
>>>>>> -                * for a gentler effect of sleepers:
>>>>>> -                */
>>>>>> -               if (sched_feat(GENTLE_FAIR_SLEEPERS))
>>>>>> -                       thresh >>= 1;
>>>>>> -
>>>>>> -               vruntime -= thresh;
>>>>>> -       }
>>>>>> +       if (!initial)
>>>>>> +               vruntime -= sched_sleeper_credit(se);
>>>>>>
>>>>>>         /*
>>>>>>          * Pull vruntime of the entity being placed to the base level of
>>>>>> @@ -4689,8 +4708,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>>>>>          * the base as it may be too far off and the comparison may get
>>>>>>          * inversed due to s64 overflow.
>>>>>>          */
>>>>>> -       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
>>>>>> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
>>>>>> +       if (entity_is_long_sleep(se))
>>>>>>                 se->vruntime = vruntime;
>>>>>>         else
>>>>>>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
>>>>>> @@ -7635,7 +7653,23 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>>>>>>         if (READ_ONCE(p->__state) == TASK_WAKING) {
>>>>>>                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
>>>>>>
>>>>>> -               se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
>>>>>> +               /*
>>>>>> +                * We determine whether a task sleeps for long by checking
>>>>>> +                * se->exec_start, and if it is, we sanitize its vruntime at
>>>>>> +                * place_entity(). However, after a migration, this detection
>>>>>> +                * method fails due to se->exec_start being reset.
>>>>>> +                *
>>>>>> +                * For fixing this case, we add the same check here. For a task
>>>>>> +                * which has slept for a long time, its vruntime should be reset
>>>>>> +                * to cfs_rq->min_vruntime with a sleep credit. Because waking
>>>>>> +                * task's vruntime will be added to cfs_rq->min_vruntime when
>>>>>> +                * enqueue, we only need to reset the se->vruntime of waking task
>>>>>> +                * to a credit here.
>>>>>> +                */
>>>>>> +               if (entity_is_long_sleep(se))
>>>>
>>>> I completely overlooked that we can't use rq_clock_task here. Need to
>>>> think a bit more on this
>>>
>>> Hi,Vincent,
>>>
>>> How about using exec_start of the parent sched_entity instant of rq_clock_task()?
>>
>> How do we handle sched_entity without a parent sched_entity ?
> 
> The change below should do the stuff. Not that we can't use it in place entity because
> pelt is not enabled in UP.
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 74c9918ffe76..b8b381b0ff20 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7635,6 +7635,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>         return new_cpu;
>  }
> 
> +static inline bool migrate_long_sleeper(struct sched_entity *se)
> +{
> +       struct cfs_rq *cfs_rq;
> +       u64 sleep_time;
> +
> +       if (se->exec_start == 0)

How about using `se->avg.last_update_time == 0` here?

> +               return false;
> +
> +       cfs_rq = cfs_rq_of(se);
> +       /*
> +        * If the entity slept for a long time, don't even try to normalize its
> +        * vruntime with the base as it may be too far off and might generate
> +        * wrong decision because of s64 overflow.
> +        * We estimate its sleep duration with the last update of se's pelt.
> +        * The last update happened before sleeping. The cfs' pelt is not
> +        * always updated when cfs is idle but this is not a problem because
> +        * its min_vruntime is not updated too, so the situation can't get
> +        * worse.
> +        */
> +       sleep_time = cfs_rq_last_update_time(cfs_rq) - se->avg.last_update_time;
> +       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)

I tested with this patch and found it makes hackbench slower.


> +               return true;
> +
> +       return false;
> +}
> +
>  /*
>   * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
>   * cfs_rq_of(p) references at time of call are still valid and identify the
> @@ -7666,7 +7692,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>                  * enqueue, we only need to reset the se->vruntime of waking task
>                  * to a credit here.
>                  */
> -               if (entity_is_long_sleep(se))
> +               if (migrate_long_sleeper(se))
>                         se->vruntime = -sched_sleeper_credit(se);
>                 else
>                         se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> 
> 
> 
>>
>>
>>
>>>>
>>>>>> +                       se->vruntime = -sched_sleeper_credit(se);
>>>>>> +               else
>>>>>> +                       se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
>>>>>>         }
>>>>>>
>>>>>>         if (!task_on_rq_migrating(p)) {
>>>>>> --
>>>>>> 2.17.1
>>>>>>
>>>> .
>>>>
> .
>
  
Dietmar Eggemann March 9, 2023, 9:09 a.m. UTC | #12
On 09/03/2023 09:37, Zhang Qiao wrote:
> 
> 在 2023/3/8 20:55, Vincent Guittot 写道:
>> Le mercredi 08 mars 2023 à 09:01:05 (+0100), Vincent Guittot a écrit :
>>> On Tue, 7 Mar 2023 at 14:41, Zhang Qiao <zhangqiao22@huawei.com> wrote:

[...]

>>>> 在 2023/3/7 18:26, Vincent Guittot 写道:
>>>>> On Mon, 6 Mar 2023 at 14:53, Vincent Guittot <vincent.guittot@linaro.org> wrote:
>>>>>>
>>>>>> On Mon, 6 Mar 2023 at 13:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:

[...]

>> +static inline bool migrate_long_sleeper(struct sched_entity *se)
>> +{
>> +       struct cfs_rq *cfs_rq;
>> +       u64 sleep_time;
>> +
>> +       if (se->exec_start == 0)
> 
> How about use `se->avg.last_update_time == 0` here?

IMHO, both checks are not needed here since we're still dealing with the
originating CPU of the migration. Both of them are set to 0 only at the
end of migrate_task_rq_fair().
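For reference, the tail of migrate_task_rq_fair() in the tree under
discussion looks roughly like this (abridged sketch, not a patch):

static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
	struct sched_entity *se = &p->se;

	if (READ_ONCE(p->__state) == TASK_WAKING) {
		/*
		 * The vruntime adjustment discussed above happens here,
		 * while se->exec_start and se->avg.last_update_time still
		 * hold the values set on the old CPU.
		 */
	}

	/* load-avg removal for the !task_on_rq_migrating() case elided */

	/* Only now are the two fields cleared, after the adjustment: */
	se->avg.last_update_time = 0;
	se->exec_start = 0;

	update_scan_period(p, new_cpu);
}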


>> +               return false;
>> +
>> +       cfs_rq = cfs_rq_of(se);
>> +       /*
>> +        * If the entity slept for a long time, don't even try to normalize its
>> +        * vruntime with the base as it may be too far off and might generate
>> +        * wrong decision because of s64 overflow.
>> +        * We estimate its sleep duration with the last update of se's pelt.
>> +        * The last update happened before sleeping. The cfs' pelt is not
>> +        * always updated when cfs is idle but this is not a problem because
>> +        * its min_vruntime is not updated too, so the situation can't get
>> +        * worse.
>> +        */
>> +       sleep_time = cfs_rq_last_update_time(cfs_rq) - se->avg.last_update_time;

Looks like this doesn't work for asymmetric CPU capacity systems since
we specifically do a sync_entity_load_avg() in select_task_rq_fair()
(find_energy_efficient_cpu() for EAS and select_idle_sibling() for CAS)
to sync cfs_rq and se (including their last_update_time).
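For context, the helper I mean (as I read it in kernel/sched/fair.c,
worth double-checking against the exact tree):

static void sync_entity_load_avg(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	u64 last_update_time;

	last_update_time = cfs_rq_last_update_time(cfs_rq);
	__update_load_avg_blocked_se(last_update_time, se);
}

After this runs on the wakeup path, se->avg.last_update_time equals
cfs_rq_last_update_time(cfs_rq), so the sleep_time computed above would
come out as ~0 and a long sleeper would never be detected on those
systems.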

[...]
  
Zhang Qiao March 9, 2023, 9:30 a.m. UTC | #13
在 2023/3/9 17:09, Dietmar Eggemann 写道:
> On 09/03/2023 09:37, Zhang Qiao wrote:
>>
>> 在 2023/3/8 20:55, Vincent Guittot 写道:
>>> Le mercredi 08 mars 2023 à 09:01:05 (+0100), Vincent Guittot a écrit :
>>>> On Tue, 7 Mar 2023 at 14:41, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> 
> [...]
> 
>>>>> 在 2023/3/7 18:26, Vincent Guittot 写道:
>>>>>> On Mon, 6 Mar 2023 at 14:53, Vincent Guittot <vincent.guittot@linaro.org> wrote:
>>>>>>>
>>>>>>> On Mon, 6 Mar 2023 at 13:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> 
> [...]
> 
>>> +static inline bool migrate_long_sleeper(struct sched_entity *se)
>>> +{
>>> +       struct cfs_rq *cfs_rq;
>>> +       u64 sleep_time;
>>> +
>>> +       if (se->exec_start == 0)
>>
>> How about use `se->avg.last_update_time == 0` here?
> 
> IMHO, both checks are not needed here since we're still dealing with the
> originating CPU of the migration. Both of them are set to 0 only at the
> end of migrate_task_rq_fair().

Yes, if place_entity() doesn't call migrate_long_sleeper(), the check can be removed.

> 
> 
>>> +               return false;
>>> +
>>> +       cfs_rq = cfs_rq_of(se);
>>> +       /*
>>> +        * If the entity slept for a long time, don't even try to normalize its
>>> +        * vruntime with the base as it may be too far off and might generate
>>> +        * wrong decision because of s64 overflow.
>>> +        * We estimate its sleep duration with the last update of se's pelt.
>>> +        * The last update happened before sleeping. The cfs' pelt is not
>>> +        * always updated when cfs is idle but this is not a problem because
>>> +        * its min_vruntime is not updated too, so the situation can't get
>>> +        * worse.
>>> +        */
>>> +       sleep_time = cfs_rq_last_update_time(cfs_rq) - se->avg.last_update_time;
> 
> Looks like this doesn't work for asymmetric CPU capacity systems since
> we specifically do a sync_entity_load_avg() in select_task_rq_fair()
> (find_energy_efficient_cpu() for EAS and select_idle_sibling() for CAS)
> to sync cfs_rq and se (including their last_update_time).
> 
> [...]
> 
> .
>
  
Zhang Qiao March 9, 2023, 9:43 a.m. UTC | #14
Hi,

在 2023/3/7 20:45, Dietmar Eggemann 写道:
> On 06/03/2023 14:24, Zhang Qiao wrote:
>> Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
>> entity being placed") fix an overflowing bug, but ignore
>> a case that se->exec_start is reset after a migration.
>>
>> For fixing this case, we reset the vruntime of a long
>> sleeping task in migrate_task_rq_fair().
>>
>> Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed")
>> Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
>> Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
> 
> [...]
> 
>> @@ -7635,7 +7653,23 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>>  	if (READ_ONCE(p->__state) == TASK_WAKING) {
>>  		struct cfs_rq *cfs_rq = cfs_rq_of(se);
>>  
>> -		se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
>> +		/*
>> +		 * We determine whether a task sleeps for long by checking
>> +		 * se->exec_start, and if it is, we sanitize its vruntime at
>> +		 * place_entity(). However, after a migration, this detection
>> +		 * method fails due to se->exec_start being reset.
>> +		 *
>> +		 * For fixing this case, we add the same check here. For a task
>> +		 * which has slept for a long time, its vruntime should be reset
>> +		 * to cfs_rq->min_vruntime with a sleep credit. Because waking
>> +		 * task's vruntime will be added to cfs_rq->min_vruntime when
> Isn't this the other way around? `vruntime += min_vruntime`

Yes, you're right, we can  refer to:

  enqueue_entity()

    ...
    if (renorm && !curr) {
      se->vruntime += cfs_rq->min_vruntime;
    ...


> 
>> +		 * enqueue, we only need to reset the se->vruntime of waking task
>> +		 * to a credit here.
> 
> You not reset it to credit, you subtract the credit from vruntime ?
> 
> I assume this is done to have sleeper credit accounted on both
> (se->vruntime and vruntime) for `se->vruntime =
> max_vruntime(se->vruntime, vruntime)` in place_entity() since
> entity_is_long_sleep(se)=false for a remove wakeup since `se->exec_start=0`.
> 
> 
>> +		 */
>> +		if (entity_is_long_sleep(se))
>> +			se->vruntime = -sched_sleeper_credit(se);

We subtract the credit here on the originating CPU since the long
sleeping task which migrates will go through:

  place_entity()

    else
      se->vruntime = max_vruntime(se->vruntime, vruntime (1));

and not the `if (entity_is_long_sleep(se))` path. And the sleeper credit is
also subtracted from vruntime (1) earlier in place_entity().

IOW, we do the same thing in advance in migrate_task_rq_fair().
For the long sleeping task, se->vruntime is equal to vruntime(1) in place_entity().
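Spelling out the arithmetic (my reading of the two paths; 'credit'
stands for sched_sleeper_credit(se) and vruntime(1) is the value
place_entity() computes):

	/* with this patch, remote wakeup of a long sleeper */

	/* migrate_task_rq_fair(), old CPU */
	se->vruntime = -credit;

	/* enqueue_entity(), new CPU, renorm step */
	se->vruntime += cfs_rq->min_vruntime;	/* == min_vruntime - credit */

	/* place_entity(), new CPU */
	vruntime = cfs_rq->min_vruntime - credit;		/* this is vruntime(1) */
	se->vruntime = max_vruntime(se->vruntime, vruntime);	/* both sides equal, no boost */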

Thanks.
ZhangQiao.


>> +		else
>> +			se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> 
> Not sure I understand this part.
> Don't we have to do `vruntime -= min_vruntime` here for long sleeping
> task as well?
> 
> Since we always do the `vruntime += min_vruntime` on the new CPU for a
> remote wakeup.
> 
> [...]
> 
> .
>
  
Vincent Guittot March 9, 2023, 10:48 a.m. UTC | #15
On Thu, 9 Mar 2023 at 09:37, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>
>
>
> 在 2023/3/8 20:55, Vincent Guittot 写道:
> > Le mercredi 08 mars 2023 à 09:01:05 (+0100), Vincent Guittot a écrit :
> >> On Tue, 7 Mar 2023 at 14:41, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> >>>
> >>>
> >>>
> >>> 在 2023/3/7 18:26, Vincent Guittot 写道:
> >>>> On Mon, 6 Mar 2023 at 14:53, Vincent Guittot <vincent.guittot@linaro.org> wrote:
> >>>>>
> >>>>> On Mon, 6 Mar 2023 at 13:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> >>>>>>
> >>>>>> Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
> >>>>>> entity being placed") fix an overflowing bug, but ignore
> >>>>>> a case that se->exec_start is reset after a migration.
> >>>>>>
> >>>>>> For fixing this case, we reset the vruntime of a long
> >>>>>> sleeping task in migrate_task_rq_fair().
> >>>>>>
> >>>>>> Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed")
> >>>>>> Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
> >>>>>> Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
> >>>>>
> >>>>> Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
> >>>>>
> >>>>>> ---
> >>>>>>
> >>>>>> v1 -> v2:
> >>>>>> - fix some typos and update comments
> >>>>>> - reformat the patch
> >>>>>>
> >>>>>> ---
> >>>>>>  kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++-------------
> >>>>>>  1 file changed, 55 insertions(+), 21 deletions(-)
> >>>>>>
> >>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>>>>> index 7a1b1f855b96..74c9918ffe76 100644
> >>>>>> --- a/kernel/sched/fair.c
> >>>>>> +++ b/kernel/sched/fair.c
> >>>>>> @@ -4648,11 +4648,45 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >>>>>>  #endif
> >>>>>>  }
> >>>>>>
> >>>>>> +static inline bool entity_is_long_sleep(struct sched_entity *se)
> >>>>>> +{
> >>>>>> +       struct cfs_rq *cfs_rq;
> >>>>>> +       u64 sleep_time;
> >>>>>> +
> >>>>>> +       if (se->exec_start == 0)
> >>>>>> +               return false;
> >>>>>> +
> >>>>>> +       cfs_rq = cfs_rq_of(se);
> >>>>>> +       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> >>>>>> +       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> >>>>>> +               return true;
> >>>>>> +
> >>>>>> +       return false;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static inline u64 sched_sleeper_credit(struct sched_entity *se)
> >>>>>> +{
> >>>>>> +       unsigned long thresh;
> >>>>>> +
> >>>>>> +       if (se_is_idle(se))
> >>>>>> +               thresh = sysctl_sched_min_granularity;
> >>>>>> +       else
> >>>>>> +               thresh = sysctl_sched_latency;
> >>>>>> +
> >>>>>> +       /*
> >>>>>> +        * Halve their sleep time's effect, to allow
> >>>>>> +        * for a gentler effect of sleepers:
> >>>>>> +        */
> >>>>>> +       if (sched_feat(GENTLE_FAIR_SLEEPERS))
> >>>>>> +               thresh >>= 1;
> >>>>>> +
> >>>>>> +       return thresh;
> >>>>>> +}
> >>>>>> +
> >>>>>>  static void
> >>>>>>  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >>>>>>  {
> >>>>>>         u64 vruntime = cfs_rq->min_vruntime;
> >>>>>> -       u64 sleep_time;
> >>>>>>
> >>>>>>         /*
> >>>>>>          * The 'current' period is already promised to the current tasks,
> >>>>>> @@ -4664,23 +4698,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >>>>>>                 vruntime += sched_vslice(cfs_rq, se);
> >>>>>>
> >>>>>>         /* sleeps up to a single latency don't count. */
> >>>>>> -       if (!initial) {
> >>>>>> -               unsigned long thresh;
> >>>>>> -
> >>>>>> -               if (se_is_idle(se))
> >>>>>> -                       thresh = sysctl_sched_min_granularity;
> >>>>>> -               else
> >>>>>> -                       thresh = sysctl_sched_latency;
> >>>>>> -
> >>>>>> -               /*
> >>>>>> -                * Halve their sleep time's effect, to allow
> >>>>>> -                * for a gentler effect of sleepers:
> >>>>>> -                */
> >>>>>> -               if (sched_feat(GENTLE_FAIR_SLEEPERS))
> >>>>>> -                       thresh >>= 1;
> >>>>>> -
> >>>>>> -               vruntime -= thresh;
> >>>>>> -       }
> >>>>>> +       if (!initial)
> >>>>>> +               vruntime -= sched_sleeper_credit(se);
> >>>>>>
> >>>>>>         /*
> >>>>>>          * Pull vruntime of the entity being placed to the base level of
> >>>>>> @@ -4689,8 +4708,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >>>>>>          * the base as it may be too far off and the comparison may get
> >>>>>>          * inversed due to s64 overflow.
> >>>>>>          */
> >>>>>> -       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> >>>>>> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> >>>>>> +       if (entity_is_long_sleep(se))
> >>>>>>                 se->vruntime = vruntime;
> >>>>>>         else
> >>>>>>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
> >>>>>> @@ -7635,7 +7653,23 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> >>>>>>         if (READ_ONCE(p->__state) == TASK_WAKING) {
> >>>>>>                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
> >>>>>>
> >>>>>> -               se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> >>>>>> +               /*
> >>>>>> +                * We determine whether a task sleeps for long by checking
> >>>>>> +                * se->exec_start, and if it is, we sanitize its vruntime at
> >>>>>> +                * place_entity(). However, after a migration, this detection
> >>>>>> +                * method fails due to se->exec_start being reset.
> >>>>>> +                *
> >>>>>> +                * For fixing this case, we add the same check here. For a task
> >>>>>> +                * which has slept for a long time, its vruntime should be reset
> >>>>>> +                * to cfs_rq->min_vruntime with a sleep credit. Because waking
> >>>>>> +                * task's vruntime will be added to cfs_rq->min_vruntime when
> >>>>>> +                * enqueue, we only need to reset the se->vruntime of waking task
> >>>>>> +                * to a credit here.
> >>>>>> +                */
> >>>>>> +               if (entity_is_long_sleep(se))
> >>>>
> >>>> I completely overlooked that we can't use rq_clock_task here. Need to
> >>>> think a bit more on this
> >>>
> >>> Hi,Vincent,
> >>>
> >>> How about using exec_start of the parent sched_entity instant of rq_clock_task()?
> >>
> >> How do we handle sched_entity without a parent sched_entity ?
> >
> > The change below should do the stuff. Not that we can't use it in place entity because
> > pelt is not enabled in UP.
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 74c9918ffe76..b8b381b0ff20 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -7635,6 +7635,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
> >         return new_cpu;
> >  }
> >
> > +static inline bool migrate_long_sleeper(struct sched_entity *se)
> > +{
> > +       struct cfs_rq *cfs_rq;
> > +       u64 sleep_time;
> > +
> > +       if (se->exec_start == 0)
>
> How about use `se->avg.last_update_time == 0` here?
>
> > +               return false;
> > +
> > +       cfs_rq = cfs_rq_of(se);
> > +       /*
> > +        * If the entity slept for a long time, don't even try to normalize its
> > +        * vruntime with the base as it may be too far off and might generate
> > +        * wrong decision because of s64 overflow.
> > +        * We estimate its sleep duration with the last update of se's pelt.
> > +        * The last update happened before sleeping. The cfs' pelt is not
> > +        * always updated when cfs is idle but this is not a problem because
> > +        * its min_vruntime is not updated too, so the situation can't get
> > +        * worse.
> > +        */
> > +       sleep_time = cfs_rq_last_update_time(cfs_rq) - se->avg.last_update_time;
> > +       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
>
> I tested with this patch and found it make hackbench slower.

Compared to which version ?
- v6.2 + revert commit 829c1651e9c4 ?
- v6.2 ?

v6.2 has a bug because a newly migrated task gets a runtime credit
regardless of its previous vruntime, and hackbench generates a lot of
migrations
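To make that concrete, the v6.2 interaction looks like this (sketch
pieced together from the hunks quoted above, not a patch):

	/* migrate_task_rq_fair(), v6.2 */
	se->exec_start = 0;		/* "no longer consider this task hot" */

	/* place_entity() on the new CPU, after 829c1651e9c4 */
	sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
					/* exec_start == 0 -> sleep_time is huge */
	if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
		/* true for any migrated task once clock_task > 60s */
		se->vruntime = vruntime;	/* old vruntime discarded, full sleeper credit */
	else
		se->vruntime = max_vruntime(se->vruntime, vruntime);

So every migrated waking task is treated as a long sleeper, which is
where the hackbench numbers come from.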



>
>
> > +               return true;
> > +
> > +       return false;
> > +}
> > +
> >  /*
> >   * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
> >   * cfs_rq_of(p) references at time of call are still valid and identify the
> > @@ -7666,7 +7692,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> >                  * enqueue, we only need to reset the se->vruntime of waking task
> >                  * to a credit here.
> >                  */
> > -               if (entity_is_long_sleep(se))
> > +               if (migrate_long_sleeper(se))
> >                         se->vruntime = -sched_sleeper_credit(se);
> >                 else
> >                         se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> >
> >
> >
> >>
> >>
> >>
> >>>>
> >>>>>> +                       se->vruntime = -sched_sleeper_credit(se);
> >>>>>> +               else
> >>>>>> +                       se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> >>>>>>         }
> >>>>>>
> >>>>>>         if (!task_on_rq_migrating(p)) {
> >>>>>> --
> >>>>>> 2.17.1
> >>>>>>
> >>>> .
> >>>>
> > .
> >
  
Peter Zijlstra March 9, 2023, 1:05 p.m. UTC | #16
On Mon, Mar 06, 2023 at 09:24:18PM +0800, Zhang Qiao wrote:
> Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
> entity being placed") fix an overflowing bug, but ignore
> a case that se->exec_start is reset after a migration.
> 
> For fixing this case, we reset the vruntime of a long
> sleeping task in migrate_task_rq_fair().
> 

> @@ -7635,7 +7653,23 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>  	if (READ_ONCE(p->__state) == TASK_WAKING) {
>  		struct cfs_rq *cfs_rq = cfs_rq_of(se);
>  
> -		se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> +		/*
> +		 * We determine whether a task sleeps for long by checking
> +		 * se->exec_start, and if it is, we sanitize its vruntime at
> +		 * place_entity(). However, after a migration, this detection
> +		 * method fails due to se->exec_start being reset.
> +		 *
> +		 * For fixing this case, we add the same check here. For a task
> +		 * which has slept for a long time, its vruntime should be reset
> +		 * to cfs_rq->min_vruntime with a sleep credit. Because waking
> +		 * task's vruntime will be added to cfs_rq->min_vruntime when
> +		 * enqueue, we only need to reset the se->vruntime of waking task
> +		 * to a credit here.
> +		 */
> +		if (entity_is_long_sleep(se))
> +			se->vruntime = -sched_sleeper_credit(se);
> +		else
> +			se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
>  	}

*groan*, that again...

Can't we simply do:

https://lkml.kernel.org/r/68832dfbb60fda030540b5f4e39c5801942689b1.1648228023.git.tim.c.chen@linux.intel.com

?
  
Vincent Guittot March 9, 2023, 1:34 p.m. UTC | #17
On Thu, 9 Mar 2023 at 14:05, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Mar 06, 2023 at 09:24:18PM +0800, Zhang Qiao wrote:
> > Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
> > entity being placed") fix an overflowing bug, but ignore
> > a case that se->exec_start is reset after a migration.
> >
> > For fixing this case, we reset the vruntime of a long
> > sleeping task in migrate_task_rq_fair().
> >
>
> > @@ -7635,7 +7653,23 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> >       if (READ_ONCE(p->__state) == TASK_WAKING) {
> >               struct cfs_rq *cfs_rq = cfs_rq_of(se);
> >
> > -             se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> > +             /*
> > +              * We determine whether a task sleeps for long by checking
> > +              * se->exec_start, and if it is, we sanitize its vruntime at
> > +              * place_entity(). However, after a migration, this detection
> > +              * method fails due to se->exec_start being reset.
> > +              *
> > +              * For fixing this case, we add the same check here. For a task
> > +              * which has slept for a long time, its vruntime should be reset
> > +              * to cfs_rq->min_vruntime with a sleep credit. Because waking
> > +              * task's vruntime will be added to cfs_rq->min_vruntime when
> > +              * enqueue, we only need to reset the se->vruntime of waking task
> > +              * to a credit here.
> > +              */
> > +             if (entity_is_long_sleep(se))
> > +                     se->vruntime = -sched_sleeper_credit(se);
> > +             else
> > +                     se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> >       }
>
> *groan*, that again...
>
> Can't we simply do:
>
> https://lkml.kernel.org/r/68832dfbb60fda030540b5f4e39c5801942689b1.1648228023.git.tim.c.chen@linux.intel.com

I'm not sure this would help here; exec_start == 0 can easily be
managed in place_entity.

The problem remains for a task about to be migrated, which happens
without taking the prev_cpu lock.

Then, even if we don't clear exec_start before migrating and keep the
current value to be used in place_entity on the new cpu, we can't
compare the rq_clock_task(rq_of(cfs_rq)) of 2 different rqs AFAICT.

Side note: why do we reset exec_start ? Is it only to make task_hot
return false ?
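For reference, the check that the reset seems to be aimed at is
task_hot() (abridged; the real function has several more early returns):

static int task_hot(struct task_struct *p, struct lb_env *env)
{
	s64 delta;

	/* unrelated early returns elided */

	if (sysctl_sched_migration_cost == 0)
		return 0;

	delta = rq_clock_task(env->src_rq) - p->se.exec_start;

	return delta < (s64)sysctl_sched_migration_cost;
}

With exec_start == 0 the delta is essentially the whole clock value, so
the comparison is false and the freshly migrated task is never
considered cache hot.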

>
> ?
  
Zhang Qiao March 9, 2023, 2:23 p.m. UTC | #18
在 2023/3/9 18:48, Vincent Guittot 写道:
> On Thu, 9 Mar 2023 at 09:37, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>>
>>
>>
>> 在 2023/3/8 20:55, Vincent Guittot 写道:
>>> Le mercredi 08 mars 2023 à 09:01:05 (+0100), Vincent Guittot a écrit :
>>>> On Tue, 7 Mar 2023 at 14:41, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>>>>>
>>>>>
>>>>>
>>>>> 在 2023/3/7 18:26, Vincent Guittot 写道:
>>>>>> On Mon, 6 Mar 2023 at 14:53, Vincent Guittot <vincent.guittot@linaro.org> wrote:
>>>>>>>
>>>>>>> On Mon, 6 Mar 2023 at 13:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>>>>>>>>
>>>>>>>> Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of
>>>>>>>> entity being placed") fix an overflowing bug, but ignore
>>>>>>>> a case that se->exec_start is reset after a migration.
>>>>>>>>
>>>>>>>> For fixing this case, we reset the vruntime of a long
>>>>>>>> sleeping task in migrate_task_rq_fair().
>>>>>>>>
>>>>>>>> Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed")
>>>>>>>> Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
>>>>>>>> Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
>>>>>>>
>>>>>>> Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
>>>>>>>
>>>>>>>> ---
>>>>>>>>
>>>>>>>> v1 -> v2:
>>>>>>>> - fix some typos and update comments
>>>>>>>> - reformat the patch
>>>>>>>>
>>>>>>>> ---
>>>>>>>>  kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++-------------
>>>>>>>>  1 file changed, 55 insertions(+), 21 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>>>>> index 7a1b1f855b96..74c9918ffe76 100644
>>>>>>>> --- a/kernel/sched/fair.c
>>>>>>>> +++ b/kernel/sched/fair.c
>>>>>>>> @@ -4648,11 +4648,45 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>>>>>>>  #endif
>>>>>>>>  }
>>>>>>>>
>>>>>>>> +static inline bool entity_is_long_sleep(struct sched_entity *se)
>>>>>>>> +{
>>>>>>>> +       struct cfs_rq *cfs_rq;
>>>>>>>> +       u64 sleep_time;
>>>>>>>> +
>>>>>>>> +       if (se->exec_start == 0)
>>>>>>>> +               return false;
>>>>>>>> +
>>>>>>>> +       cfs_rq = cfs_rq_of(se);
>>>>>>>> +       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
>>>>>>>> +       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
>>>>>>>> +               return true;
>>>>>>>> +
>>>>>>>> +       return false;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static inline u64 sched_sleeper_credit(struct sched_entity *se)
>>>>>>>> +{
>>>>>>>> +       unsigned long thresh;
>>>>>>>> +
>>>>>>>> +       if (se_is_idle(se))
>>>>>>>> +               thresh = sysctl_sched_min_granularity;
>>>>>>>> +       else
>>>>>>>> +               thresh = sysctl_sched_latency;
>>>>>>>> +
>>>>>>>> +       /*
>>>>>>>> +        * Halve their sleep time's effect, to allow
>>>>>>>> +        * for a gentler effect of sleepers:
>>>>>>>> +        */
>>>>>>>> +       if (sched_feat(GENTLE_FAIR_SLEEPERS))
>>>>>>>> +               thresh >>= 1;
>>>>>>>> +
>>>>>>>> +       return thresh;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>  static void
>>>>>>>>  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>>>>>>>  {
>>>>>>>>         u64 vruntime = cfs_rq->min_vruntime;
>>>>>>>> -       u64 sleep_time;
>>>>>>>>
>>>>>>>>         /*
>>>>>>>>          * The 'current' period is already promised to the current tasks,
>>>>>>>> @@ -4664,23 +4698,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>>>>>>>                 vruntime += sched_vslice(cfs_rq, se);
>>>>>>>>
>>>>>>>>         /* sleeps up to a single latency don't count. */
>>>>>>>> -       if (!initial) {
>>>>>>>> -               unsigned long thresh;
>>>>>>>> -
>>>>>>>> -               if (se_is_idle(se))
>>>>>>>> -                       thresh = sysctl_sched_min_granularity;
>>>>>>>> -               else
>>>>>>>> -                       thresh = sysctl_sched_latency;
>>>>>>>> -
>>>>>>>> -               /*
>>>>>>>> -                * Halve their sleep time's effect, to allow
>>>>>>>> -                * for a gentler effect of sleepers:
>>>>>>>> -                */
>>>>>>>> -               if (sched_feat(GENTLE_FAIR_SLEEPERS))
>>>>>>>> -                       thresh >>= 1;
>>>>>>>> -
>>>>>>>> -               vruntime -= thresh;
>>>>>>>> -       }
>>>>>>>> +       if (!initial)
>>>>>>>> +               vruntime -= sched_sleeper_credit(se);
>>>>>>>>
>>>>>>>>         /*
>>>>>>>>          * Pull vruntime of the entity being placed to the base level of
>>>>>>>> @@ -4689,8 +4708,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>>>>>>>          * the base as it may be too far off and the comparison may get
>>>>>>>>          * inversed due to s64 overflow.
>>>>>>>>          */
>>>>>>>> -       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
>>>>>>>> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
>>>>>>>> +       if (entity_is_long_sleep(se))
>>>>>>>>                 se->vruntime = vruntime;
>>>>>>>>         else
>>>>>>>>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
>>>>>>>> @@ -7635,7 +7653,23 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>>>>>>>>         if (READ_ONCE(p->__state) == TASK_WAKING) {
>>>>>>>>                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
>>>>>>>>
>>>>>>>> -               se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
>>>>>>>> +               /*
>>>>>>>> +                * We determine whether a task sleeps for long by checking
>>>>>>>> +                * se->exec_start, and if it is, we sanitize its vruntime at
>>>>>>>> +                * place_entity(). However, after a migration, this detection
>>>>>>>> +                * method fails due to se->exec_start being reset.
>>>>>>>> +                *
>>>>>>>> +                * For fixing this case, we add the same check here. For a task
>>>>>>>> +                * which has slept for a long time, its vruntime should be reset
>>>>>>>> +                * to cfs_rq->min_vruntime with a sleep credit. Because waking
>>>>>>>> +                * task's vruntime will be added to cfs_rq->min_vruntime when
>>>>>>>> +                * enqueue, we only need to reset the se->vruntime of waking task
>>>>>>>> +                * to a credit here.
>>>>>>>> +                */
>>>>>>>> +               if (entity_is_long_sleep(se))
>>>>>>
>>>>>> I completely overlooked that we can't use rq_clock_task here. Need to
>>>>>> think a bit more on this
>>>>>
>>>>> Hi,Vincent,
>>>>>
>>>>> How about using exec_start of the parent sched_entity instant of rq_clock_task()?
>>>>
>>>> How do we handle sched_entity without a parent sched_entity ?
>>>
>>> The change below should do the stuff. Not that we can't use it in place entity because
>>> pelt is not enabled in UP.
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 74c9918ffe76..b8b381b0ff20 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -7635,6 +7635,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>>>         return new_cpu;
>>>  }
>>>
>>> +static inline bool migrate_long_sleeper(struct sched_entity *se)
>>> +{
>>> +       struct cfs_rq *cfs_rq;
>>> +       u64 sleep_time;
>>> +
>>> +       if (se->exec_start == 0)
>>
>> How about use `se->avg.last_update_time == 0` here?
>>
>>> +               return false;
>>> +
>>> +       cfs_rq = cfs_rq_of(se);
>>> +       /*
>>> +        * If the entity slept for a long time, don't even try to normalize its
>>> +        * vruntime with the base as it may be too far off and might generate
>>> +        * wrong decision because of s64 overflow.
>>> +        * We estimate its sleep duration with the last update of se's pelt.
>>> +        * The last update happened before sleeping. The cfs' pelt is not
>>> +        * always updated when cfs is idle but this is not a problem because
>>> +        * its min_vruntime is not updated too, so the situation can't get
>>> +        * worse.
>>> +        */
>>> +       sleep_time = cfs_rq_last_update_time(cfs_rq) - se->avg.last_update_time;
>>> +       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
>>
>> I tested with this patch and found it make hackbench slower.
> 
> Compared to which version ?
> - v6.2 + revert commit 829c1651e9c4 ?
> - v6.2 ?
> 
> v6.2 has a bug because newly migrated task gets a runtime credit
> whatever its previous vruntime and hackbench generates a lot of
> migration
> 

Sorry, when I was testing I forgot to add the check 'se->exec_start == 0' in place_entity().
After re-testing, the hackbench result is OK.


> 
> 
>>
>>
>>> +               return true;
>>> +
>>> +       return false;
>>> +}
>>> +
>>>  /*
>>>   * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
>>>   * cfs_rq_of(p) references at time of call are still valid and identify the
>>> @@ -7666,7 +7692,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>>>                  * enqueue, we only need to reset the se->vruntime of waking task
>>>                  * to a credit here.
>>>                  */
>>> -               if (entity_is_long_sleep(se))
>>> +               if (migrate_long_sleeper(se))
>>>                         se->vruntime = -sched_sleeper_credit(se);
>>>                 else
>>>                         se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
>>>
>>>
>>>
>>>>
>>>>
>>>>
>>>>>>
>>>>>>>> +                       se->vruntime = -sched_sleeper_credit(se);
>>>>>>>> +               else
>>>>>>>> +                       se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
>>>>>>>>         }
>>>>>>>>
>>>>>>>>         if (!task_on_rq_migrating(p)) {
>>>>>>>> --
>>>>>>>> 2.17.1
>>>>>>>>
>>>>>> .
>>>>>>
>>> .
>>>
> .
>
  
Peter Zijlstra March 9, 2023, 2:28 p.m. UTC | #19
On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:

> Then, even if we don't clear exec_start before migrating and keep
> current value to be used in place_entity on the new cpu, we can't
> compare the rq_clock_task(rq_of(cfs_rq)) of 2 different rqs AFAICT

Blergh -- indeed, irq and steal time can skew them between CPUs :/
I suppose we can fudge that... wait_start (which is basically what we're
making it do) also does that IIRC.
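Back-of-the-envelope sizing of that skew (my numbers, only to show the
order of magnitude): clock_task is the rq clock minus the per-rq irq
and steal time, so two CPUs drift apart roughly by the difference of
those fractions:

	skew rate ~= (irq+steal fraction on CPU A) - (irq+steal fraction on CPU B)
	e.g. 5% vs 0%  ->  ~50 ms per second of wall time  ->  ~3 s per minute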

I really dislike having this placement muck spread out like proposed.

> side note, why do we reset exec_start ? Is it only to make task_hot
> returns false ?

Yeah, god-awful hack.
  
Peter Zijlstra March 9, 2023, 2:36 p.m. UTC | #20
On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
> On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:
> 
> > Then, even if we don't clear exec_start before migrating and keep
> > current value to be used in place_entity on the new cpu, we can't
> > compare the rq_clock_task(rq_of(cfs_rq)) of 2 different rqs AFAICT
> 
> Blergh -- indeed, irq and steal time can skew them between CPUs :/
> I suppose we can fudge that... wait_start (which is basically what we're
> making it do) also does that IIRC.
> 
> I really dislike having this placement muck spreadout like proposed.

Also, I think we might be over-engineering this, we don't care about
accuracy at all, all we really care about is 'long-time'.
  
Vincent Guittot March 9, 2023, 3:14 p.m. UTC | #21
On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
> > On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:
> >
> > > Then, even if we don't clear exec_start before migrating and keep
> > > current value to be used in place_entity on the new cpu, we can't
> > > compare the rq_clock_task(rq_of(cfs_rq)) of 2 different rqs AFAICT
> >
> > Blergh -- indeed, irq and steal time can skew them between CPUs :/
> > I suppose we can fudge that... wait_start (which is basically what we're
> > making it do) also does that IIRC.
> >
> > I really dislike having this placement muck spreadout like proposed.
>
> Also, I think we might be over-engineering this, we don't care about
> accuracy at all, all we really care about is 'long-time'.

you mean taking the patch 1/2 that you mentioned here to add a
migrated field:
https://lore.kernel.org/all/68832dfbb60fda030540b5f4e39c5801942689b1.1648228023.git.tim.c.chen@linux.intel.com/T/#ma5637eb8010f3f4a4abff778af8db705429d003b

And assume that the divergence between the rq_clock_task() can be ignored ?

That could probably work but we need to replace the (60LL *
NSEC_PER_SEC) by ((1ULL << 63) / NICE_0_LOAD) because 60sec divergence
would not be unrealistic.
And add a comment to explain why it's acceptable.


>
>
  
Vincent Guittot March 10, 2023, 2:29 p.m. UTC | #22
Le jeudi 09 mars 2023 à 16:14:38 (+0100), Vincent Guittot a écrit :
> On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
> > > On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:
> > >
> > > > Then, even if we don't clear exec_start before migrating and keep
> > > > current value to be used in place_entity on the new cpu, we can't
> > > > compare the rq_clock_task(rq_of(cfs_rq)) of 2 different rqs AFAICT
> > >
> > > Blergh -- indeed, irq and steal time can skew them between CPUs :/
> > > I suppose we can fudge that... wait_start (which is basically what we're
> > > making it do) also does that IIRC.
> > >
> > > I really dislike having this placement muck spreadout like proposed.
> >
> > Also, I think we might be over-engineering this, we don't care about
> > accuracy at all, all we really care about is 'long-time'.
> 
> you mean taking the patch 1/2 that you mentioned here to add a
> migrated field:
> https://lore.kernel.org/all/68832dfbb60fda030540b5f4e39c5801942689b1.1648228023.git.tim.c.chen@linux.intel.com/T/#ma5637eb8010f3f4a4abff778af8db705429d003b
> 
> And assume that the divergence between the rq_clock_task() can be ignored ?
> 
> That could probably work but we need to replace the (60LL *
> NSEC_PER_SEC) by ((1ULL << 63) / NICE_0_LOAD) because 60sec divergence
> would not be unrealistic.
> and a comment to explain why it's acceptable

Zhang,

Could you try the patch below ?
This is a rebase/merge/update of:
-patch 1/2 above and 
-https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/

The proposal accepts a divergence of up to 52 days between the 2 rqs.
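For the record, the 52 days fall out of the cutoff arithmetic (my
arithmetic, plugging in the scale_load_down() value of NICE_0_LOAD,
i.e. 1024; whether the patch should spell scale_load_down(NICE_0_LOAD)
is discussed further down the thread):

	2^63 / 1024 ns  =  2^53 ns  ~=  104 days  (sleep cutoff, safe even for MIN_SHARES entities)
	extra  / 2      =  2^52 ns  ~=   52 days  (headroom for prev/new rq clock_task divergence)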

If this works, we will prepare a proper patch.

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..cb8af0a137f7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -550,6 +550,7 @@ struct sched_entity {
        struct rb_node                  run_node;
        struct list_head                group_node;
        unsigned int                    on_rq;
+       unsigned int                    migrated;

        u64                             exec_start;
        u64                             sum_exec_runtime;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a1b1f855b96..36acd9598b40 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1057,6 +1057,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
        /*
         * We are starting a new run period:
         */
+       se->migrated = 0;
        se->exec_start = rq_clock_task(rq_of(cfs_rq));
 }

@@ -4684,13 +4685,23 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)

        /*
         * Pull vruntime of the entity being placed to the base level of
-        * cfs_rq, to prevent boosting it if placed backwards.  If the entity
-        * slept for a long time, don't even try to compare its vruntime with
-        * the base as it may be too far off and the comparison may get
-        * inversed due to s64 overflow.
+        * cfs_rq, to prevent boosting it if placed backwards.
+        * However, min_vruntime can advance much faster than real time, with
+        * the extreme being when an entity with the minimal weight always runs
+        * on the cfs_rq. If the new entity slept for long, its vruntime
+        * difference from min_vruntime may overflow s64 and their comparison
+        * may get inversed, so ignore the entity's original vruntime in that
+        * case.
+        * The maximal vruntime speedup is given by the ratio of normal to
+        * minimal weight: NICE_0_LOAD / MIN_SHARES, so cutting off on the
+        * sleep time of 2^63 / NICE_0_LOAD should be safe.
+        * When placing a migrated waking entity, its exec_start has been set
+        * from a different rq. In order to take into account a possible
+        * divergence between new and prev rq's clocks task because of irq and
+        * stolen time, we take an additional margin.
         */
        sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
-       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
+       if ((s64)sleep_time > (1ULL << 63) / NICE_0_LOAD / 2)
                se->vruntime = vruntime;
        else
                se->vruntime = max_vruntime(se->vruntime, vruntime);
@@ -7658,7 +7669,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
        se->avg.last_update_time = 0;

        /* We have migrated, no longer consider this task hot */
-       se->exec_start = 0;
+       se->migrated = 1;

        update_scan_period(p, new_cpu);
 }
@@ -8344,6 +8355,9 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
        if (sysctl_sched_migration_cost == 0)
                return 0;

+       if (p->se.migrated)
+               return 0;
+
        delta = rq_clock_task(env->src_rq) - p->se.exec_start;

        return delta < (s64)sysctl_sched_migration_cost;



> 
> 
> >
> >
  
Zhang Qiao March 11, 2023, 9:57 a.m. UTC | #23
在 2023/3/10 22:29, Vincent Guittot 写道:
> Le jeudi 09 mars 2023 à 16:14:38 (+0100), Vincent Guittot a écrit :
>> On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@infradead.org> wrote:
>>>
>>> On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
>>>> On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:
>>>>
>>>>> Then, even if we don't clear exec_start before migrating and keep
>>>>> current value to be used in place_entity on the new cpu, we can't
>>>>> compare the rq_clock_task(rq_of(cfs_rq)) of 2 different rqs AFAICT
>>>>
>>>> Blergh -- indeed, irq and steal time can skew them between CPUs :/
>>>> I suppose we can fudge that... wait_start (which is basically what we're
>>>> making it do) also does that IIRC.
>>>>
>>>> I really dislike having this placement muck spreadout like proposed.
>>>
>>> Also, I think we might be over-engineering this, we don't care about
>>> accuracy at all, all we really care about is 'long-time'.
>>
>> you mean taking the patch 1/2 that you mentioned here to add a
>> migrated field:
>> https://lore.kernel.org/all/68832dfbb60fda030540b5f4e39c5801942689b1.1648228023.git.tim.c.chen@linux.intel.com/T/#ma5637eb8010f3f4a4abff778af8db705429d003b
>>
>> And assume that the divergence between the rq_clock_task() can be ignored ?
>>
>> That could probably work but we need to replace the (60LL *
>> NSEC_PER_SEC) by ((1ULL << 63) / NICE_0_LOAD) because 60sec divergence
>> would not be unrealistic.
>> and a comment to explain why it's acceptable
> 
> Zhang,
> 
> Could you try the patch below ?
> This is a rebase/merge/update of:
> -patch 1/2 above and 
> -https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/


I applied and tested this patch, and it makes hackbench slower.
According to my previous test results, the good result is 82.1(s),
but the result of this patch is 108.725(s).


> version1: v6.2
> version2: v6.2 + commit 829c1651e9c4
> version3: v6.2 + commit 829c1651e9c4 + this patch
>
> -------------------------------------------------
> 	version1	version2	version3
> test1	81.0 		118.1 		82.1
> test2	82.1 		116.9 		80.3
> test3	83.2 		103.9 		83.3
> avg(s)	82.1 		113.0 		81.9
>
> -------------------------------------------------
> 
> The proposal accepts a divergence of up to 52 days between the 2 rqs.
> 
> If this work, we will prepare a proper patch
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 63d242164b1a..cb8af0a137f7 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -550,6 +550,7 @@ struct sched_entity {
>         struct rb_node                  run_node;
>         struct list_head                group_node;
>         unsigned int                    on_rq;
> +       unsigned int                    migrated;
> 
>         u64                             exec_start;
>         u64                             sum_exec_runtime;
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7a1b1f855b96..36acd9598b40 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1057,6 +1057,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
>         /*
>          * We are starting a new run period:
>          */
> +       se->migrated = 0;
>         se->exec_start = rq_clock_task(rq_of(cfs_rq));
>  }
> 
> @@ -4684,13 +4685,23 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> 
>         /*
>          * Pull vruntime of the entity being placed to the base level of
> -        * cfs_rq, to prevent boosting it if placed backwards.  If the entity
> -        * slept for a long time, don't even try to compare its vruntime with
> -        * the base as it may be too far off and the comparison may get
> -        * inversed due to s64 overflow.
> +        * cfs_rq, to prevent boosting it if placed backwards.
> +        * However, min_vruntime can advance much faster than real time, with
> +        * the exterme being when an entity with the minimal weight always runs
> +        * on the cfs_rq. If the new entity slept for long, its vruntime
> +        * difference from min_vruntime may overflow s64 and their comparison
> +        * may get inversed, so ignore the entity's original vruntime in that
> +        * case.
> +        * The maximal vruntime speedup is given by the ratio of normal to
> +        * minimal weight: NICE_0_LOAD / MIN_SHARES, so cutting off on the

Why not use `scale_load_down(NICE_0_LOAD) / MIN_SHARES` here?


> +        * sleep time of 2^63 / NICE_0_LOAD should be safe.
> +        * When placing a migrated waking entity, its exec_start has been set
> +        * from a different rq. In order to take into account a possible
> +        * divergence between new and prev rq's clocks task because of irq and

This divergence might be larger; it could cause `sleep_time` to be negative.

> +        * stolen time, we take an additional margin.
>          */
>         sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> +       if ((s64)sleep_time > (1ULL << 63) / NICE_0_LOAD / 2)
>                 se->vruntime = vruntime;
>         else
>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
> @@ -7658,7 +7669,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>         se->avg.last_update_time = 0;
> 
>         /* We have migrated, no longer consider this task hot */
> -       se->exec_start = 0;
> +       se->migrated = 1;
> 
>         update_scan_period(p, new_cpu);
>  }
> @@ -8344,6 +8355,9 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>         if (sysctl_sched_migration_cost == 0)
>                 return 0;
> 
> +       if (p->se.migrated)
> +               return 0;
> +
>         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
> 
>         return delta < (s64)sysctl_sched_migration_cost;
> 
> 
> 
>>
>>
>>>
>>>
> .
>
  
Dietmar Eggemann March 13, 2023, 9:06 a.m. UTC | #24
On 10/03/2023 15:29, Vincent Guittot wrote:
> Le jeudi 09 mars 2023 à 16:14:38 (+0100), Vincent Guittot a écrit :
>> On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@infradead.org> wrote:
>>>
>>> On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
>>>> On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:

[...]

>> you mean taking the patch 1/2 that you mentioned here to add a
>> migrated field:
>> https://lore.kernel.org/all/68832dfbb60fda030540b5f4e39c5801942689b1.1648228023.git.tim.c.chen@linux.intel.com/T/#ma5637eb8010f3f4a4abff778af8db705429d003b
>>
>> And assume that the divergence between the rq_clock_task() can be ignored ?
>>
>> That could probably work but we need to replace the (60LL *
>> NSEC_PER_SEC) by ((1ULL << 63) / NICE_0_LOAD) because 60sec divergence
>> would not be unrealistic.
>> and a comment to explain why it's acceptable
> 
> Zhang,
> 
> Could you try the patch below ?
> This is a rebase/merge/update of:
> -patch 1/2 above and 
> -https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
> 
> The proposal accepts a divergence of up to 52 days between the 2 rqs.
> 
> If this work, we will prepare a proper patch

Looks to me that this patch brings back the old numbers:

model name	: Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz

perf stat --null --repeat 10 -- perf bench sched messaging -g 50 -l 5000

tip sched/core

a2e90611b9f4 - sched/fair: Remove capacity inversion detection
(2023-02-11 Vincent Guittot)

  5.7295 +- 0.0219 seconds time elapsed  ( +-  0.38% )

829c1651e9c4 - sched/fair: sanitize vruntime of entity being placed
(2023-02-11 Zhang Qiao)

  6.0961 +- 0.0297 seconds time elapsed  ( +-  0.49% )

this patch on top 829c1651e9c4

  5.7165 +- 0.0231 seconds time elapsed  ( +-  0.40% )

[...]
  
Vincent Guittot March 13, 2023, 2:23 p.m. UTC | #25
On Sat, 11 Mar 2023 at 10:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>
>
>
> 在 2023/3/10 22:29, Vincent Guittot 写道:
> > Le jeudi 09 mars 2023 à 16:14:38 (+0100), Vincent Guittot a écrit :
> >> On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@infradead.org> wrote:
> >>>
> >>> On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
> >>>> On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:
> >>>>
> >>>>> Then, even if we don't clear exec_start before migrating and keep
> >>>>> current value to be used in place_entity on the new cpu, we can't
> >>>>> compare the rq_clock_task(rq_of(cfs_rq)) of 2 different rqs AFAICT
> >>>>
> >>>> Blergh -- indeed, irq and steal time can skew them between CPUs :/
> >>>> I suppose we can fudge that... wait_start (which is basically what we're
> >>>> making it do) also does that IIRC.
> >>>>
> >>>> I really dislike having this placement muck spreadout like proposed.
> >>>
> >>> Also, I think we might be over-engineering this, we don't care about
> >>> accuracy at all, all we really care about is 'long-time'.
> >>
> >> you mean taking the patch 1/2 that you mentioned here to add a
> >> migrated field:
> >> https://lore.kernel.org/all/68832dfbb60fda030540b5f4e39c5801942689b1.1648228023.git.tim.c.chen@linux.intel.com/T/#ma5637eb8010f3f4a4abff778af8db705429d003b
> >>
> >> And assume that the divergence between the rq_clock_task() can be ignored ?
> >>
> >> That could probably work but we need to replace the (60LL *
> >> NSEC_PER_SEC) by ((1ULL << 63) / NICE_0_LOAD) because 60sec divergence
> >> would not be unrealistic.
> >> and a comment to explain why it's acceptable
> >
> > Zhang,
> >
> > Could you try the patch below ?
> > This is a rebase/merge/update of:
> > -patch 1/2 above and
> > -https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
>
>
> I applyed and tested this patch, and it make hackbench slower.
> According to my previous test results. The good result is 82.1(s).
> But the result of this patch is 108.725(s).

By "the result of this patch is 108.725(s)",  you mean the result of
https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
alone, don't you ?

>
>
> > version1: v6.2
> > version2: v6.2 + commit 829c1651e9c4
> > version3: v6.2 + commit 829c1651e9c4 + this patch
> >
> > -------------------------------------------------
> >       version1        version2        version3
> > test1 81.0            118.1           82.1
> > test2 82.1            116.9           80.3
> > test3 83.2            103.9           83.3
> > avg(s)        82.1            113.0           81.9

Ok, it looks like we are back to normal figures

> >
> > -------------------------------------------------
> >
> > The proposal accepts a divergence of up to 52 days between the 2 rqs.
> >
> > If this work, we will prepare a proper patch
> >
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 63d242164b1a..cb8af0a137f7 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -550,6 +550,7 @@ struct sched_entity {
> >         struct rb_node                  run_node;
> >         struct list_head                group_node;
> >         unsigned int                    on_rq;
> > +       unsigned int                    migrated;
> >
> >         u64                             exec_start;
> >         u64                             sum_exec_runtime;
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 7a1b1f855b96..36acd9598b40 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -1057,6 +1057,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >         /*
> >          * We are starting a new run period:
> >          */
> > +       se->migrated = 0;
> >         se->exec_start = rq_clock_task(rq_of(cfs_rq));
> >  }
> >
> > @@ -4684,13 +4685,23 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >
> >         /*
> >          * Pull vruntime of the entity being placed to the base level of
> > -        * cfs_rq, to prevent boosting it if placed backwards.  If the entity
> > -        * slept for a long time, don't even try to compare its vruntime with
> > -        * the base as it may be too far off and the comparison may get
> > -        * inversed due to s64 overflow.
> > +        * cfs_rq, to prevent boosting it if placed backwards.
> > +        * However, min_vruntime can advance much faster than real time, with
> > +        * the exterme being when an entity with the minimal weight always runs
> > +        * on the cfs_rq. If the new entity slept for long, its vruntime
> > +        * difference from min_vruntime may overflow s64 and their comparison
> > +        * may get inversed, so ignore the entity's original vruntime in that
> > +        * case.
> > +        * The maximal vruntime speedup is given by the ratio of normal to
> > +        * minimal weight: NICE_0_LOAD / MIN_SHARES, so cutting off on the
>
> why not is `scale_load_down(NICE_0_LOAD) / MIN_SHARES` here ?

yes, you are right.

>
>
> > +        * sleep time of 2^63 / NICE_0_LOAD should be safe.
> > +        * When placing a migrated waking entity, its exec_start has been set
> > +        * from a different rq. In order to take into account a possible
> > +        * divergence between new and prev rq's clocks task because of irq and
>
> This divergence might be larger, it cause `sleep_time` maybe negative.

AFAICT, we are safe with ((1ULL << 63) / scale_load_down(NICE_0_LOAD)
/ 2) as long as the divergence between the two rqs' task clocks is lower
than 2^52 nsec. Do you expect a divergence higher than 2^52 nsec
(around 52 days)?

We can probably keep using (1ULL << 63) / scale_load_down(NICE_0_LOAD)
which is already half the max value if needed.

the fact that sleep_time can be negative is not a problem as the
(s64)sleep_time cast in the comparison will take care of this.
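
For reference, here is a quick standalone sketch of the arithmetic behind
the 52-day figure above (plain userspace C, not kernel code; it assumes
scale_load_down(NICE_0_LOAD) is 1024, as on 64-bit builds):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* assumption: scale_load_down(NICE_0_LOAD) is 1024 on 64-bit kernels */
	const uint64_t nice_0_load = 1024;

	/* proposed cut-off: (1ULL << 63) / NICE_0_LOAD / 2 == 2^52 ns */
	uint64_t limit_ns = (1ULL << 63) / nice_0_load / 2;

	printf("limit: %llu ns\n", (unsigned long long)limit_ns);
	printf("limit: ~%.1f days\n", limit_ns / 1e9 / 3600 / 24); /* ~52.1 days */

	return 0;
}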

>
> > +        * stolen time, we take an additional margin.
> >          */
> >         sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> > -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> > +       if ((s64)sleep_time > (1ULL << 63) / NICE_0_LOAD / 2)>                 se->vruntime = vruntime;
> >         else
> >                 se->vruntime = max_vruntime(se->vruntime, vruntime);
> > @@ -7658,7 +7669,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> >         se->avg.last_update_time = 0;
> >
> >         /* We have migrated, no longer consider this task hot */
> > -       se->exec_start = 0;
> > +       se->migrated = 1;
> >
> >         update_scan_period(p, new_cpu);
> >  }
> > @@ -8344,6 +8355,9 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
> >         if (sysctl_sched_migration_cost == 0)
> >                 return 0;
> >
> > +       if (p->se.migrated)
> > +               return 0;
> > +
> >         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
> >
> >         return delta < (s64)sysctl_sched_migration_cost;
> >
> >
> >
> >>
> >>
> >>>
> >>>
> > .
> >
  
Dietmar Eggemann March 13, 2023, 6:17 p.m. UTC | #26
On 13/03/2023 10:06, Dietmar Eggemann wrote:
> On 10/03/2023 15:29, Vincent Guittot wrote:
>> On Thursday 9 March 2023 at 16:14:38 (+0100), Vincent Guittot wrote:
>>> On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@infradead.org> wrote:
>>>>
>>>> On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
>>>>> On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:

[...]

> Looks to me that this patch brings back the old numbers:
> 
> model name	: Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz
> 
> perf stat --null --repeat 10 -- perf bench sched messaging -g 50 -l 5000
> 
> tip sched/core
> 
> a2e90611b9f4 - sched/fair: Remove capacity inversion detection
> (2023-02-11 Vincent Guittot)
> 
>   5.7295 +- 0.0219 seconds time elapsed  ( +-  0.38% )
> 
> 829c1651e9c4 - sched/fair: sanitize vruntime of entity being placed
> (2023-02-11 Zhang Qiao)
> 
>   6.0961 +- 0.0297 seconds time elapsed  ( +-  0.49% )
> 
> this patch on top 829c1651e9c4
> 
>   5.7165 +- 0.0231 seconds time elapsed  ( +-  0.40% )
> 
> [...]

Couldn't we just defer setting `se->exec_start = 0` until the end of
place_entity() for ENQUEUE_MIGRATED, instead of adding this extra se flag
`migrated`?

-->8--

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0c70c558b12c..4df2b3e76b30 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -550,7 +550,6 @@ struct sched_entity {
        struct rb_node                  run_node;
        struct list_head                group_node;
        unsigned int                    on_rq;
-       unsigned int                    migrated;
 
        u64                             exec_start;
        u64                             sum_exec_runtime;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a8aa8cd3c745..365ee548e9f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1057,7 +1057,6 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
        /*
         * We are starting a new run period:
         */
-       se->migrated = 0;
        se->exec_start = rq_clock_task(rq_of(cfs_rq));
 }
 
@@ -4649,8 +4648,8 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 #endif
 }
 
-static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+                        int flags, int initial)
 {
        u64 vruntime = cfs_rq->min_vruntime;
        u64 sleep_time;
@@ -4705,6 +4704,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
                se->vruntime = vruntime;
        else
                se->vruntime = max_vruntime(se->vruntime, vruntime);
+
+       if (flags & ENQUEUE_MIGRATED)
+               se->exec_start = 0;
 }
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -4780,7 +4782,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        account_entity_enqueue(cfs_rq, se);
 
        if (flags & ENQUEUE_WAKEUP)
-               place_entity(cfs_rq, se, 0);
+               place_entity(cfs_rq, se, flags, 0);
 
        check_schedstat_required();
        update_stats_enqueue_fair(cfs_rq, se, flags);
@@ -7668,9 +7670,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
        /* Tell new CPU we are migrated */
        se->avg.last_update_time = 0;
 
-       /* We have migrated, no longer consider this task hot */
-       se->migrated = 1;
-
        update_scan_period(p, new_cpu);
 }
 
@@ -8355,9 +8354,6 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
        if (sysctl_sched_migration_cost == 0)
                return 0;
 
-       if (p->se.migrated)
-               return 0;
-
        delta = rq_clock_task(env->src_rq) - p->se.exec_start;
 
        return delta < (s64)sysctl_sched_migration_cost;
@@ -11999,7 +11995,7 @@ static void task_fork_fair(struct task_struct *p)
                update_curr(cfs_rq);
                se->vruntime = curr->vruntime;
        }
-       place_entity(cfs_rq, se, 1);
+       place_entity(cfs_rq, se, 0, 1);
 
        if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
                /*
@@ -12144,7 +12140,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
                 * Fix up our vruntime so that the current sleep doesn't
                 * cause 'unlimited' sleep bonus.
                 */
-               place_entity(cfs_rq, se, 0);
+               place_entity(cfs_rq, se, 0, 0);
                se->vruntime -= cfs_rq->min_vruntime;
        }
  
Vincent Guittot March 14, 2023, 7:41 a.m. UTC | #27
On Mon, 13 Mar 2023 at 19:17, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>
> On 13/03/2023 10:06, Dietmar Eggemann wrote:
> > On 10/03/2023 15:29, Vincent Guittot wrote:
> >> On Thursday 9 March 2023 at 16:14:38 (+0100), Vincent Guittot wrote:
> >>> On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@infradead.org> wrote:
> >>>>
> >>>> On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
> >>>>> On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:
>
> [...]
>
> > Looks to me that this patch brings back the old numbers:
> >
> > model name    : Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz
> >
> > perf stat --null --repeat 10 -- perf bench sched messaging -g 50 -l 5000
> >
> > tip sched/core
> >
> > a2e90611b9f4 - sched/fair: Remove capacity inversion detection
> > (2023-02-11 Vincent Guittot)
> >
> >   5.7295 +- 0.0219 seconds time elapsed  ( +-  0.38% )
> >
> > 829c1651e9c4 - sched/fair: sanitize vruntime of entity being placed
> > (2023-02-11 Zhang Qiao)
> >
> >   6.0961 +- 0.0297 seconds time elapsed  ( +-  0.49% )
> >
> > this patch on top 829c1651e9c4
> >
> >   5.7165 +- 0.0231 seconds time elapsed  ( +-  0.40% )
> >
> > [...]
>
> Couldn't we just defer setting `se->exec_start = 0` until the end of
> place_entity() for ENQUEUE_MIGRATED, instead of adding this extra se flag
> `migrated`?

Yes, that's a good point.

I'm going to use something a bit different from your proposal below by
merging 'initial' and the flags into a single argument:
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity
*se, int flags)

with flags:
0 for initial placement
ENQUEUE_WAKEUP for wakeup
ENQUEUE_MIGRATED for migrated task
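
To make that mapping concrete, a rough sketch of how the merged helper
could look (illustration only, not the actual patch; the empty branches
stand in for the existing placement logic):

static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
			 int flags)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (!(flags & ENQUEUE_WAKEUP)) {
		/* flags == 0: initial placement at fork time */
	} else {
		/* ENQUEUE_WAKEUP: apply the usual sleeper credit */

		if (flags & ENQUEUE_MIGRATED) {
			/*
			 * ENQUEUE_MIGRATED: the entity just migrated, so its
			 * exec_start still reflects the previous rq's clock
			 * and any long-sleep check must allow for that.
			 */
		}
	}

	/* never let the entity gain time by being placed backwards */
	se->vruntime = max_vruntime(se->vruntime, vruntime);
}

Call sites would then pass 0 from task_fork_fair(), and the enqueue flags
(ENQUEUE_WAKEUP, possibly with ENQUEUE_MIGRATED) from enqueue_entity().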

>
> -->8--
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0c70c558b12c..4df2b3e76b30 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -550,7 +550,6 @@ struct sched_entity {
>         struct rb_node                  run_node;
>         struct list_head                group_node;
>         unsigned int                    on_rq;
> -       unsigned int                    migrated;
>
>         u64                             exec_start;
>         u64                             sum_exec_runtime;
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index a8aa8cd3c745..365ee548e9f0 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1057,7 +1057,6 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
>         /*
>          * We are starting a new run period:
>          */
> -       se->migrated = 0;
>         se->exec_start = rq_clock_task(rq_of(cfs_rq));
>  }
>
> @@ -4649,8 +4648,8 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  #endif
>  }
>
> -static void
> -place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> +static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
> +                        int flags, int initial)
>  {
>         u64 vruntime = cfs_rq->min_vruntime;
>         u64 sleep_time;
> @@ -4705,6 +4704,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>                 se->vruntime = vruntime;
>         else
>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
> +
> +       if (flags & ENQUEUE_MIGRATED)
> +               se->exec_start = 0;
>  }
>
>  static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
> @@ -4780,7 +4782,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>         account_entity_enqueue(cfs_rq, se);
>
>         if (flags & ENQUEUE_WAKEUP)
> -               place_entity(cfs_rq, se, 0);
> +               place_entity(cfs_rq, se, flags, 0);
>
>         check_schedstat_required();
>         update_stats_enqueue_fair(cfs_rq, se, flags);
> @@ -7668,9 +7670,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>         /* Tell new CPU we are migrated */
>         se->avg.last_update_time = 0;
>
> -       /* We have migrated, no longer consider this task hot */
> -       se->migrated = 1;
> -
>         update_scan_period(p, new_cpu);
>  }
>
> @@ -8355,9 +8354,6 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>         if (sysctl_sched_migration_cost == 0)
>                 return 0;
>
> -       if (p->se.migrated)
> -               return 0;
> -
>         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
>
>         return delta < (s64)sysctl_sched_migration_cost;
> @@ -11999,7 +11995,7 @@ static void task_fork_fair(struct task_struct *p)
>                 update_curr(cfs_rq);
>                 se->vruntime = curr->vruntime;
>         }
> -       place_entity(cfs_rq, se, 1);
> +       place_entity(cfs_rq, se, 0, 1);
>
>         if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
>                 /*
> @@ -12144,7 +12140,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
>                  * Fix up our vruntime so that the current sleep doesn't
>                  * cause 'unlimited' sleep bonus.
>                  */
> -               place_entity(cfs_rq, se, 0);
> +               place_entity(cfs_rq, se, 0, 0);
>                 se->vruntime -= cfs_rq->min_vruntime;
>         }
>
>
  
Zhang Qiao March 14, 2023, 11:03 a.m. UTC | #28
On 2023/3/13 22:23, Vincent Guittot wrote:
> On Sat, 11 Mar 2023 at 10:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>>
>>
>>
>> On 2023/3/10 22:29, Vincent Guittot wrote:
>>> On Thursday 9 March 2023 at 16:14:38 (+0100), Vincent Guittot wrote:
>>>> On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@infradead.org> wrote:
>>>>>
>>>>> On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
>>>>>> On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:
>>>>>>
>>>>>>> Then, even if we don't clear exec_start before migrating and keep
>>>>>>> current value to be used in place_entity on the new cpu, we can't
>>>>>>> compare the rq_clock_task(rq_of(cfs_rq)) of 2 different rqs AFAICT
>>>>>>
>>>>>> Blergh -- indeed, irq and steal time can skew them between CPUs :/
>>>>>> I suppose we can fudge that... wait_start (which is basically what we're
>>>>>> making it do) also does that IIRC.
>>>>>>
>>>>>> I really dislike having this placement muck spreadout like proposed.
>>>>>
>>>>> Also, I think we might be over-engineering this, we don't care about
>>>>> accuracy at all, all we really care about is 'long-time'.
>>>>
>>>> you mean taking the patch 1/2 that you mentioned here to add a
>>>> migrated field:
>>>> https://lore.kernel.org/all/68832dfbb60fda030540b5f4e39c5801942689b1.1648228023.git.tim.c.chen@linux.intel.com/T/#ma5637eb8010f3f4a4abff778af8db705429d003b
>>>>
>>>> And assume that the divergence between the rq_clock_task() can be ignored ?
>>>>
>>>> That could probably work but we need to replace the (60LL *
>>>> NSEC_PER_SEC) by ((1ULL << 63) / NICE_0_LOAD) because 60sec divergence
>>>> would not be unrealistic.
>>>> and a comment to explain why it's acceptable
>>>
>>> Zhang,
>>>
>>> Could you try the patch below ?
>>> This is a rebase/merge/update of:
>>> -patch 1/2 above and
>>> -https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
>>
>>
>> I applyed and tested this patch, and it make hackbench slower.
>> According to my previous test results. The good result is 82.1(s).
>> But the result of this patch is 108.725(s).
> 
> By "the result of this patch is 108.725(s)",  you mean the result of
> https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
> alone, don't you ?

No, with your patch, the test result is 108.725(s).

git diff:

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..93a3909ae4c4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -550,6 +550,7 @@ struct sched_entity {
        struct rb_node                  run_node;
        struct list_head                group_node;
        unsigned int                    on_rq;
+       unsigned int                    migrated;

        u64                             exec_start;
        u64                             sum_exec_runtime;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff4dbbae3b10..e60defc39f6e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1057,6 +1057,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
        /*
         * We are starting a new run period:
         */
+       se->migrated = 0;
        se->exec_start = rq_clock_task(rq_of(cfs_rq));
 }

@@ -4690,9 +4691,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
         * inversed due to s64 overflow.
         */
        sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
-       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
+       if ((s64)sleep_time > (1ULL << 63) / scale_load_down(NICE_0_LOAD) / 2) {
                se->vruntime = vruntime;
-       else
+       } else
                se->vruntime = max_vruntime(se->vruntime, vruntime);
 }

@@ -7658,8 +7659,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
        se->avg.last_update_time = 0;

        /* We have migrated, no longer consider this task hot */
-       se->exec_start = 0;
-
+       se->migrated = 1;
        update_scan_period(p, new_cpu);
 }

@@ -8343,6 +8343,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)

        if (sysctl_sched_migration_cost == 0)
                return 0;
+       if (p->se.migrated)
+               return 0;

        delta = rq_clock_task(env->src_rq) - p->se.exec_start;



> 
>>
>>
>>> version1: v6.2
>>> version2: v6.2 + commit 829c1651e9c4
>>> version3: v6.2 + commit 829c1651e9c4 + this patch
>>>
>>> -------------------------------------------------
>>>       version1        version2        version3
>>> test1 81.0            118.1           82.1
>>> test2 82.1            116.9           80.3
>>> test3 83.2            103.9           83.3
>>> avg(s)        82.1            113.0           81.9
> 
> Ok, it looks like we are back to normal figures
> 
>>>
>>> -------------------------------------------------
>>>
>>> The proposal accepts a divergence of up to 52 days between the 2 rqs.
>>>
>>> If this work, we will prepare a proper patch
>>>
>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>> index 63d242164b1a..cb8af0a137f7 100644
>>> --- a/include/linux/sched.h
>>> +++ b/include/linux/sched.h
>>> @@ -550,6 +550,7 @@ struct sched_entity {
>>>         struct rb_node                  run_node;
>>>         struct list_head                group_node;
>>>         unsigned int                    on_rq;
>>> +       unsigned int                    migrated;
>>>
>>>         u64                             exec_start;
>>>         u64                             sum_exec_runtime;
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 7a1b1f855b96..36acd9598b40 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -1057,6 +1057,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>>         /*
>>>          * We are starting a new run period:
>>>          */
>>> +       se->migrated = 0;
>>>         se->exec_start = rq_clock_task(rq_of(cfs_rq));
>>>  }
>>>
>>> @@ -4684,13 +4685,23 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>>
>>>         /*
>>>          * Pull vruntime of the entity being placed to the base level of
>>> -        * cfs_rq, to prevent boosting it if placed backwards.  If the entity
>>> -        * slept for a long time, don't even try to compare its vruntime with
>>> -        * the base as it may be too far off and the comparison may get
>>> -        * inversed due to s64 overflow.
>>> +        * cfs_rq, to prevent boosting it if placed backwards.
>>> +        * However, min_vruntime can advance much faster than real time, with
>>> +        * the exterme being when an entity with the minimal weight always runs
>>> +        * on the cfs_rq. If the new entity slept for long, its vruntime
>>> +        * difference from min_vruntime may overflow s64 and their comparison
>>> +        * may get inversed, so ignore the entity's original vruntime in that
>>> +        * case.
>>> +        * The maximal vruntime speedup is given by the ratio of normal to
>>> +        * minimal weight: NICE_0_LOAD / MIN_SHARES, so cutting off on the
>>
>> why not is `scale_load_down(NICE_0_LOAD) / MIN_SHARES` here ?
> 
> yes, you are right.
> 
>>
>>
>>> +        * sleep time of 2^63 / NICE_0_LOAD should be safe.
>>> +        * When placing a migrated waking entity, its exec_start has been set
>>> +        * from a different rq. In order to take into account a possible
>>> +        * divergence between new and prev rq's clocks task because of irq and
>>
>> This divergence might be larger, it cause `sleep_time` maybe negative.
> 
> AFAICT, we are safe with ((1ULL << 63) / scale_load_down(NICE_0_LOAD)
> / 2) as long as the divergence between the 2 rqs clocks task is lower
> than 2^52nsec. Do you expect a divergence higher than 2^52 nsec
> (around 52 days)?
> 
> We can probably keep using (1ULL << 63) / scale_load_down(NICE_0_LOAD)
> which is already half the max value if needed.
> 
> the fact that sleep_time can be negative is not a problem as
> s64)sleep_time > will take care of this.

In my opinion, when a signed value is compared with an unsigned value of the same width, the compiler converts the signed operand to unsigned.
So, if sleep_time < 0, "(s64)sleep_time > (1ULL << 63) / NICE_0_LOAD / 2" will be true.
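
A quick standalone illustration of that conversion rule (userspace C, not
kernel code; gcc flags the first comparison with -Wsign-compare):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t sleep_time = -1000;			/* e.g. negative due to clock skew */
	uint64_t limit = (1ULL << 63) / 1024 / 2;	/* unsigned right-hand side, 2^52 */

	/*
	 * The usual arithmetic conversions turn the signed left-hand side
	 * into an unsigned value, so a negative sleep_time compares as a
	 * huge positive number and the test is true.
	 */
	if ((int64_t)sleep_time > limit)
		printf("negative sleep_time compares as greater\n");

	/* with a signed right-hand side the comparison behaves as expected */
	if ((int64_t)sleep_time > 60LL * 1000000000LL)
		printf("not reached\n");
	else
		printf("signed comparison works as intended\n");

	return 0;
}

So with an unsigned cut-off, a negative sleep_time would wrongly take the
"long sleep" branch.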

> 
>>
>>> +        * stolen time, we take an additional margin.
>>>          */
>>>         sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
>>> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
>>> +       if ((s64)sleep_time > (1ULL << 63) / NICE_0_LOAD / 2)>                 se->vruntime = vruntime;
>>>         else
>>>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
>>> @@ -7658,7 +7669,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>>>         se->avg.last_update_time = 0;
>>>
>>>         /* We have migrated, no longer consider this task hot */
>>> -       se->exec_start = 0;
>>> +       se->migrated = 1;
>>>
>>>         update_scan_period(p, new_cpu);
>>>  }
>>> @@ -8344,6 +8355,9 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>>>         if (sysctl_sched_migration_cost == 0)
>>>                 return 0;
>>>
>>> +       if (p->se.migrated)
>>> +               return 0;
>>> +
>>>         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
>>>
>>>         return delta < (s64)sysctl_sched_migration_cost;
>>>
>>>
>>>
>>>>
>>>>
>>>>>
>>>>>
>>> .
>>>
> .
>
  
Peter Zijlstra March 14, 2023, 12:07 p.m. UTC | #29
On Tue, Mar 14, 2023 at 08:41:30AM +0100, Vincent Guittot wrote:

> I'm going to use something a bit different from your proposal below by
> merging initial and flag
> static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity
> *se, int flags)
> 
> with flags:
> 0 for initial placement
> ENQUEUE_WAKEUP for wakeup
> ENQUEUE_MIGRATED for migrated task

So when a task has not been running for a long time (our case at hand),
there are two cases:

 - it wakes up locally and place_entity() gets to reset vruntime;
 - it wakes up remotely and migrate_task_rq_fair() can reset vruntime.

So if we can rely on ENQUEUE_MIGRATED to differentiate between these
cases, when wouldn't something like this work?

---

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a1b1f855b96..a0d00b6a8bc6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4648,11 +4648,31 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 #endif
 }
 
+static bool reset_vruntime(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	const s64 limit = 60LL * NSEC_PER_SEC;
+	s64 sleep_time;
+
+	/*
+	 * Pull vruntime of the entity being placed to the base level of
+	 * cfs_rq, to prevent boosting it if placed backwards.  If the entity
+	 * slept for a long time, don't even try to compare its vruntime with
+	 * the base as it may be too far off and the comparison may get
+	 * inversed due to s64 overflow.
+	 */
+	sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
+	if (unlikely(sleep_time > limit)) {
+		se->vruntime = cfs_rq->min_vruntime - calc_delta_fair(limit, se);
+		return true;
+	}
+
+	return false;
+}
+
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
 	u64 vruntime = cfs_rq->min_vruntime;
-	u64 sleep_time;
 
 	/*
 	 * The 'current' period is already promised to the current tasks,
@@ -4682,18 +4702,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 		vruntime -= thresh;
 	}
 
-	/*
-	 * Pull vruntime of the entity being placed to the base level of
-	 * cfs_rq, to prevent boosting it if placed backwards.  If the entity
-	 * slept for a long time, don't even try to compare its vruntime with
-	 * the base as it may be too far off and the comparison may get
-	 * inversed due to s64 overflow.
-	 */
-	sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
-	if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
-		se->vruntime = vruntime;
-	else
-		se->vruntime = max_vruntime(se->vruntime, vruntime);
+	/* ensure we don't gain time by being placed backwards */
+	se->vruntime = max_vruntime(se->vruntime, vruntime);
 }
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -4768,8 +4778,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_cfs_group(se);
 	account_entity_enqueue(cfs_rq, se);
 
-	if (flags & ENQUEUE_WAKEUP)
+	if (flags & ENQUEUE_WAKEUP) {
+		if (!(flags & ENQUEUE_MIGRATED))
+			reset_vruntime(cfs_rq, se);
 		place_entity(cfs_rq, se, 0);
+	}
 
 	check_schedstat_required();
 	update_stats_enqueue_fair(cfs_rq, se, flags);
@@ -7625,6 +7638,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 {
 	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 	/*
 	 * As blocked tasks retain absolute vruntime the migration needs to
@@ -7632,11 +7646,8 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 	 * min_vruntime -- the latter is done by enqueue_entity() when placing
 	 * the task on the new runqueue.
 	 */
-	if (READ_ONCE(p->__state) == TASK_WAKING) {
-		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
+	if (READ_ONCE(p->__state) == TASK_WAKING || reset_vruntime(cfs_rq, se))
 		se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
-	}
 
 	if (!task_on_rq_migrating(p)) {
 		remove_entity_load_avg(se);
  
Vincent Guittot March 14, 2023, 1:24 p.m. UTC | #30
On Tue, 14 Mar 2023 at 13:07, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Mar 14, 2023 at 08:41:30AM +0100, Vincent Guittot wrote:
>
> > I'm going to use something a bit different from your proposal below by
> > merging initial and flag
> > static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity
> > *se, int flags)
> >
> > with flags:
> > 0 for initial placement
> > ENQUEUE_WAKEUP for wakeup
> > ENQUEUE_MIGRATED for migrated task
>
> So when a task is not running for a long time (our case at hand), then
> there's two cases:
>
>  - it wakes up locally and place_entity() gets to reset vruntime;
>  - it wakes up remotely and migrate_task_rq_fair() can reset vruntime.
>
> So if we can rely on ENQUEUE_MIGRATED to differentiate between these
> cases, when wouldn't something like this work?
>
> ---
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7a1b1f855b96..a0d00b6a8bc6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4648,11 +4648,31 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  #endif
>  }
>
> +static bool reset_vruntime(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> +       const s64 limit = 60LL * NSEC_PER_SEC;
> +       s64 sleep_time;
> +
> +       /*
> +        * Pull vruntime of the entity being placed to the base level of
> +        * cfs_rq, to prevent boosting it if placed backwards.  If the entity
> +        * slept for a long time, don't even try to compare its vruntime with
> +        * the base as it may be too far off and the comparison may get
> +        * inversed due to s64 overflow.
> +        */
> +       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> +       if (unlikely(sleep_time > limit)) {
> +               se->vruntime = cfs_rq->min_vruntime - calc_delta_fair(limit, se);
> +               return true;
> +       }
> +
> +       return false;
> +}
> +
>  static void
>  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>  {
>         u64 vruntime = cfs_rq->min_vruntime;
> -       u64 sleep_time;
>
>         /*
>          * The 'current' period is already promised to the current tasks,
> @@ -4682,18 +4702,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>                 vruntime -= thresh;
>         }
>
> -       /*
> -        * Pull vruntime of the entity being placed to the base level of
> -        * cfs_rq, to prevent boosting it if placed backwards.  If the entity
> -        * slept for a long time, don't even try to compare its vruntime with
> -        * the base as it may be too far off and the comparison may get
> -        * inversed due to s64 overflow.
> -        */
> -       sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> -               se->vruntime = vruntime;
> -       else
> -               se->vruntime = max_vruntime(se->vruntime, vruntime);
> +       /* ensure we don't gain time by being placed backwards */
> +       se->vruntime = max_vruntime(se->vruntime, vruntime);
>  }
>
>  static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
> @@ -4768,8 +4778,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>         update_cfs_group(se);
>         account_entity_enqueue(cfs_rq, se);
>
> -       if (flags & ENQUEUE_WAKEUP)
> +       if (flags & ENQUEUE_WAKEUP) {
> +               if (!(flags & ENQUEUE_MIGRATED))
> +                       reset_vruntime(cfs_rq, se);
>                 place_entity(cfs_rq, se, 0);
> +       }
>
>         check_schedstat_required();
>         update_stats_enqueue_fair(cfs_rq, se, flags);
> @@ -7625,6 +7638,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>  static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>  {
>         struct sched_entity *se = &p->se;
> +       struct cfs_rq *cfs_rq = cfs_rq_of(se);
>
>         /*
>          * As blocked tasks retain absolute vruntime the migration needs to
> @@ -7632,11 +7646,8 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>          * min_vruntime -- the latter is done by enqueue_entity() when placing
>          * the task on the new runqueue.
>          */
> -       if (READ_ONCE(p->__state) == TASK_WAKING) {
> -               struct cfs_rq *cfs_rq = cfs_rq_of(se);
> -
> +       if (READ_ONCE(p->__state) == TASK_WAKING || reset_vruntime(cfs_rq, se))

That's somewhat like what was proposed in one of the previous versions, but
we can't call rq_clock_task(rq_of(cfs_rq)) here because the rq lock might not
be held and the rq task clock might not have been updated before being used.
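
For context, rq_clock_task() in kernel/sched/sched.h is roughly the
following (paraphrased; the exact assertion helpers may differ between
kernel versions):

static inline u64 rq_clock_task(struct rq *rq)
{
	lockdep_assert_rq_held(rq);	/* caller must hold the rq lock */
	assert_clock_updated(rq);	/* update_rq_clock() must have run */

	return rq->clock_task;
}

Neither condition is guaranteed at that point in migrate_task_rq_fair(),
hence the concern above.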

>                 se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> -       }
>
>         if (!task_on_rq_migrating(p)) {
>                 remove_entity_load_avg(se);
  
Vincent Guittot March 14, 2023, 1:26 p.m. UTC | #31
On Tue, 14 Mar 2023 at 12:03, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>
>
>
> 在 2023/3/13 22:23, Vincent Guittot 写道:
> > On Sat, 11 Mar 2023 at 10:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> >>
> >>
> >>
> >> On 2023/3/10 22:29, Vincent Guittot wrote:
> >>> On Thursday 9 March 2023 at 16:14:38 (+0100), Vincent Guittot wrote:
> >>>> On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@infradead.org> wrote:
> >>>>>
> >>>>> On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
> >>>>>> On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:
> >>>>>>
> >>>>>>> Then, even if we don't clear exec_start before migrating and keep
> >>>>>>> current value to be used in place_entity on the new cpu, we can't
> >>>>>>> compare the rq_clock_task(rq_of(cfs_rq)) of 2 different rqs AFAICT
> >>>>>>
> >>>>>> Blergh -- indeed, irq and steal time can skew them between CPUs :/
> >>>>>> I suppose we can fudge that... wait_start (which is basically what we're
> >>>>>> making it do) also does that IIRC.
> >>>>>>
> >>>>>> I really dislike having this placement muck spreadout like proposed.
> >>>>>
> >>>>> Also, I think we might be over-engineering this, we don't care about
> >>>>> accuracy at all, all we really care about is 'long-time'.
> >>>>
> >>>> you mean taking the patch 1/2 that you mentioned here to add a
> >>>> migrated field:
> >>>> https://lore.kernel.org/all/68832dfbb60fda030540b5f4e39c5801942689b1.1648228023.git.tim.c.chen@linux.intel.com/T/#ma5637eb8010f3f4a4abff778af8db705429d003b
> >>>>
> >>>> And assume that the divergence between the rq_clock_task() can be ignored ?
> >>>>
> >>>> That could probably work but we need to replace the (60LL *
> >>>> NSEC_PER_SEC) by ((1ULL << 63) / NICE_0_LOAD) because 60sec divergence
> >>>> would not be unrealistic.
> >>>> and a comment to explain why it's acceptable
> >>>
> >>> Zhang,
> >>>
> >>> Could you try the patch below ?
> >>> This is a rebase/merge/update of:
> >>> -patch 1/2 above and
> >>> -https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
> >>
> >>
> >> I applyed and tested this patch, and it make hackbench slower.
> >> According to my previous test results. The good result is 82.1(s).
> >> But the result of this patch is 108.725(s).
> >
> > By "the result of this patch is 108.725(s)",  you mean the result of
> > https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
> > alone, don't you ?
>
> No, with your patch, the test results is 108.725(s),

Ok

>
> git diff:
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 63d242164b1a..93a3909ae4c4 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -550,6 +550,7 @@ struct sched_entity {
>         struct rb_node                  run_node;
>         struct list_head                group_node;
>         unsigned int                    on_rq;
> +       unsigned int                    migrated;
>
>         u64                             exec_start;
>         u64                             sum_exec_runtime;
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ff4dbbae3b10..e60defc39f6e 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1057,6 +1057,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
>         /*
>          * We are starting a new run period:
>          */
> +       se->migrated = 0;
>         se->exec_start = rq_clock_task(rq_of(cfs_rq));
>  }
>
> @@ -4690,9 +4691,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>          * inversed due to s64 overflow.
>          */
>         sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> +       if ((s64)sleep_time > (1ULL << 63) / scale_load_down(NICE_0_LOAD) / 2) {
>                 se->vruntime = vruntime;
> -       else
> +       } else
>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
>  }
>
> @@ -7658,8 +7659,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>         se->avg.last_update_time = 0;
>
>         /* We have migrated, no longer consider this task hot */
> -       se->exec_start = 0;
> -
> +       se->migrated = 1;
>         update_scan_period(p, new_cpu);
>  }
>
> @@ -8343,6 +8343,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>
>         if (sysctl_sched_migration_cost == 0)
>                 return 0;
> +       if (p->se.migrated)
> +               return 0;
>
>         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
>
>
>
> >
> >>
> >>
> >>> version1: v6.2
> >>> version2: v6.2 + commit 829c1651e9c4
> >>> version3: v6.2 + commit 829c1651e9c4 + this patch
> >>>
> >>> -------------------------------------------------
> >>>       version1        version2        version3
> >>> test1 81.0            118.1           82.1
> >>> test2 82.1            116.9           80.3
> >>> test3 83.2            103.9           83.3
> >>> avg(s)        82.1            113.0           81.9
> >
> > Ok, it looks like we are back to normal figures

What do those results refer to then ?


> >
> >>>
> >>> -------------------------------------------------
> >>>
> >>> The proposal accepts a divergence of up to 52 days between the 2 rqs.
> >>>
> >>> If this work, we will prepare a proper patch
> >>>
> >>> diff --git a/include/linux/sched.h b/include/linux/sched.h
> >>> index 63d242164b1a..cb8af0a137f7 100644
> >>> --- a/include/linux/sched.h
> >>> +++ b/include/linux/sched.h
> >>> @@ -550,6 +550,7 @@ struct sched_entity {
> >>>         struct rb_node                  run_node;
> >>>         struct list_head                group_node;
> >>>         unsigned int                    on_rq;
> >>> +       unsigned int                    migrated;
> >>>
> >>>         u64                             exec_start;
> >>>         u64                             sum_exec_runtime;
> >>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>> index 7a1b1f855b96..36acd9598b40 100644
> >>> --- a/kernel/sched/fair.c
> >>> +++ b/kernel/sched/fair.c
> >>> @@ -1057,6 +1057,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >>>         /*
> >>>          * We are starting a new run period:
> >>>          */
> >>> +       se->migrated = 0;
> >>>         se->exec_start = rq_clock_task(rq_of(cfs_rq));
> >>>  }
> >>>
> >>> @@ -4684,13 +4685,23 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >>>
> >>>         /*
> >>>          * Pull vruntime of the entity being placed to the base level of
> >>> -        * cfs_rq, to prevent boosting it if placed backwards.  If the entity
> >>> -        * slept for a long time, don't even try to compare its vruntime with
> >>> -        * the base as it may be too far off and the comparison may get
> >>> -        * inversed due to s64 overflow.
> >>> +        * cfs_rq, to prevent boosting it if placed backwards.
> >>> +        * However, min_vruntime can advance much faster than real time, with
> >>> +        * the exterme being when an entity with the minimal weight always runs
> >>> +        * on the cfs_rq. If the new entity slept for long, its vruntime
> >>> +        * difference from min_vruntime may overflow s64 and their comparison
> >>> +        * may get inversed, so ignore the entity's original vruntime in that
> >>> +        * case.
> >>> +        * The maximal vruntime speedup is given by the ratio of normal to
> >>> +        * minimal weight: NICE_0_LOAD / MIN_SHARES, so cutting off on the
> >>
> >> why not is `scale_load_down(NICE_0_LOAD) / MIN_SHARES` here ?
> >
> > yes, you are right.
> >
> >>
> >>
> >>> +        * sleep time of 2^63 / NICE_0_LOAD should be safe.
> >>> +        * When placing a migrated waking entity, its exec_start has been set
> >>> +        * from a different rq. In order to take into account a possible
> >>> +        * divergence between new and prev rq's clocks task because of irq and
> >>
> >> This divergence might be larger, it cause `sleep_time` maybe negative.
> >
> > AFAICT, we are safe with ((1ULL << 63) / scale_load_down(NICE_0_LOAD)
> > / 2) as long as the divergence between the 2 rqs clocks task is lower
> > than 2^52nsec. Do you expect a divergence higher than 2^52 nsec
> > (around 52 days)?
> >
> > We can probably keep using (1ULL << 63) / scale_load_down(NICE_0_LOAD)
> > which is already half the max value if needed.
> >
> > the fact that sleep_time can be negative is not a problem as
> > s64)sleep_time > will take care of this.
>
> In my opinion, when comparing signed with unsigned, the compiler converts the signed value to unsigned.
> So, if sleep_time < 0, "(s64)sleep_time > (1ULL << 63) / NICE_0_LOAD / 2" will be true.
>
> >
> >>
> >>> +        * stolen time, we take an additional margin.
> >>>          */
> >>>         sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> >>> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> >>> +       if ((s64)sleep_time > (1ULL << 63) / NICE_0_LOAD / 2)
> >>>                 se->vruntime = vruntime;
> >>>         else
> >>>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
> >>> @@ -7658,7 +7669,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> >>>         se->avg.last_update_time = 0;
> >>>
> >>>         /* We have migrated, no longer consider this task hot */
> >>> -       se->exec_start = 0;
> >>> +       se->migrated = 1;
> >>>
> >>>         update_scan_period(p, new_cpu);
> >>>  }
> >>> @@ -8344,6 +8355,9 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
> >>>         if (sysctl_sched_migration_cost == 0)
> >>>                 return 0;
> >>>
> >>> +       if (p->se.migrated)
> >>> +               return 0;
> >>> +
> >>>         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
> >>>
> >>>         return delta < (s64)sysctl_sched_migration_cost;
> >>>
> >>>
> >>>
> >>>>
> >>>>
> >>>>>
> >>>>>
> >>> .
> >>>
> > .
> >
  
Dietmar Eggemann March 14, 2023, 1:29 p.m. UTC | #32
On 14/03/2023 13:07, Peter Zijlstra wrote:
> On Tue, Mar 14, 2023 at 08:41:30AM +0100, Vincent Guittot wrote:
> 
>> I'm going to use something a bit different from your proposal below by
>> merging initial and flag
>> static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity
>> *se, int flags)
>>
>> with flags:
>> 0 for initial placement
>> ENQUEUE_WAKEUP for wakeup
>> ENQUEUE_MIGRATED for migrated task
> 
> So when a task is not running for a long time (our case at hand), then
> there's two cases:
> 
>  - it wakes up locally and place_entity() gets to reset vruntime;
>  - it wakes up remotely and migrate_task_rq_fair() can reset vruntime.
> 
> So if we can rely on ENQUEUE_MIGRATED to differentiate between these
> cases, when wouldn't something like this work?

I guess so. We would avoid rq_clock_task() skews, or being forced to pass
along state saying that the migrating se's vruntime is too old.

[...]

> @@ -7632,11 +7646,8 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>  	 * min_vruntime -- the latter is done by enqueue_entity() when placing
>  	 * the task on the new runqueue.
>  	 */
> -	if (READ_ONCE(p->__state) == TASK_WAKING) {
> -		struct cfs_rq *cfs_rq = cfs_rq_of(se);
> -
> +	if (READ_ONCE(p->__state) == TASK_WAKING || reset_vruntime(cfs_rq, se))

Don't you want to call reset_vruntime() specifically on the waking task?

>  		se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
> -	}
>  
>  	if (!task_on_rq_migrating(p)) {
>  		remove_entity_load_avg(se);
  
Dietmar Eggemann March 14, 2023, 1:37 p.m. UTC | #33
On 14/03/2023 14:29, Dietmar Eggemann wrote:
> On 14/03/2023 13:07, Peter Zijlstra wrote:
>> On Tue, Mar 14, 2023 at 08:41:30AM +0100, Vincent Guittot wrote:

[...]

>> So when a task is not running for a long time (our case at hand), then
>> there's two cases:
>>
>>  - it wakes up locally and place_entity() gets to reset vruntime;
>>  - it wakes up remotely and migrate_task_rq_fair() can reset vruntime.
>>
>> So if we can rely on ENQUEUE_MIGRATED to differentiate between these
>> cases, when wouldn't something like this work?
> 
> I guess so. We would avoid rq_clock_task() skews, or being forced to pass
> along state saying that the migrating se's vruntime is too old.

... just saw Vincent's reply ... I forgot the limitation that we can't
call rq_clock_task() in migrate_task_rq_fair() again.

[...]
  
Zhang Qiao March 14, 2023, 1:38 p.m. UTC | #34
On 2023/3/14 21:26, Vincent Guittot wrote:
> On Tue, 14 Mar 2023 at 12:03, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>>
>>
>>
>> On 2023/3/13 22:23, Vincent Guittot wrote:
>>> On Sat, 11 Mar 2023 at 10:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>>>>
>>>>
>>>>
>>>> On 2023/3/10 22:29, Vincent Guittot wrote:
>>>>> On Thursday 9 March 2023 at 16:14:38 (+0100), Vincent Guittot wrote:
>>>>>> On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@infradead.org> wrote:
>>>>>>>
>>>>>>> On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
>>>>>>>> On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:
>>>>>>>>
>>>>>>>>> Then, even if we don't clear exec_start before migrating and keep
>>>>>>>>> current value to be used in place_entity on the new cpu, we can't
>>>>>>>>> compare the rq_clock_task(rq_of(cfs_rq)) of 2 different rqs AFAICT
>>>>>>>>
>>>>>>>> Blergh -- indeed, irq and steal time can skew them between CPUs :/
>>>>>>>> I suppose we can fudge that... wait_start (which is basically what we're
>>>>>>>> making it do) also does that IIRC.
>>>>>>>>
>>>>>>>> I really dislike having this placement muck spreadout like proposed.
>>>>>>>
>>>>>>> Also, I think we might be over-engineering this, we don't care about
>>>>>>> accuracy at all, all we really care about is 'long-time'.
>>>>>>
>>>>>> you mean taking the patch 1/2 that you mentioned here to add a
>>>>>> migrated field:
>>>>>> https://lore.kernel.org/all/68832dfbb60fda030540b5f4e39c5801942689b1.1648228023.git.tim.c.chen@linux.intel.com/T/#ma5637eb8010f3f4a4abff778af8db705429d003b
>>>>>>
>>>>>> And assume that the divergence between the rq_clock_task() can be ignored ?
>>>>>>
>>>>>> That could probably work but we need to replace the (60LL *
>>>>>> NSEC_PER_SEC) by ((1ULL << 63) / NICE_0_LOAD) because 60sec divergence
>>>>>> would not be unrealistic.
>>>>>> and a comment to explain why it's acceptable
>>>>>
>>>>> Zhang,
>>>>>
>>>>> Could you try the patch below ?
>>>>> This is a rebase/merge/update of:
>>>>> -patch 1/2 above and
>>>>> -https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
>>>>
>>>>
>>>> I applyed and tested this patch, and it make hackbench slower.
>>>> According to my previous test results. The good result is 82.1(s).
>>>> But the result of this patch is 108.725(s).
>>>
>>> By "the result of this patch is 108.725(s)",  you mean the result of
>>> https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
>>> alone, don't you ?
>>
>> No, with your patch, the test results is 108.725(s),
> 
> Ok
> 
>>
>> git diff:
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 63d242164b1a..93a3909ae4c4 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -550,6 +550,7 @@ struct sched_entity {
>>         struct rb_node                  run_node;
>>         struct list_head                group_node;
>>         unsigned int                    on_rq;
>> +       unsigned int                    migrated;
>>
>>         u64                             exec_start;
>>         u64                             sum_exec_runtime;
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index ff4dbbae3b10..e60defc39f6e 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -1057,6 +1057,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>         /*
>>          * We are starting a new run period:
>>          */
>> +       se->migrated = 0;
>>         se->exec_start = rq_clock_task(rq_of(cfs_rq));
>>  }
>>
>> @@ -4690,9 +4691,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>          * inversed due to s64 overflow.
>>          */
>>         sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
>> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
>> +       if ((s64)sleep_time > (1ULL << 63) / scale_load_down(NICE_0_LOAD) / 2) {
>>                 se->vruntime = vruntime;
>> -       else
>> +       } else
>>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
>>  }
>>
>> @@ -7658,8 +7659,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>>         se->avg.last_update_time = 0;
>>
>>         /* We have migrated, no longer consider this task hot */
>> -       se->exec_start = 0;
>> -
>> +       se->migrated = 1;
>>         update_scan_period(p, new_cpu);
>>  }
>>
>> @@ -8343,6 +8343,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>>
>>         if (sysctl_sched_migration_cost == 0)
>>                 return 0;
>> +       if (p->se.migrated)
>> +               return 0;
>>
>>         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
>>
>>
>>
>>>
>>>>
>>>>
>>>>> version1: v6.2
>>>>> version2: v6.2 + commit 829c1651e9c4
>>>>> version3: v6.2 + commit 829c1651e9c4 + this patch
>>>>>
>>>>> -------------------------------------------------
>>>>>       version1        version2        version3
>>>>> test1 81.0            118.1           82.1
>>>>> test2 82.1            116.9           80.3
>>>>> test3 83.2            103.9           83.3
>>>>> avg(s)        82.1            113.0           81.9
>>>
>>> Ok, it looks like we are back to normal figures
> 
> What do those results refer to then ?

Quote from this email (https://lore.kernel.org/lkml/1cd19d3f-18c4-92f9-257a-378cc18cfbc7@huawei.com/).

> 
> 
>>>
>>>>>
>>>>> -------------------------------------------------
>>>>>
>>>>> The proposal accepts a divergence of up to 52 days between the 2 rqs.
>>>>>
>>>>> If this work, we will prepare a proper patch
>>>>>
>>>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>>>> index 63d242164b1a..cb8af0a137f7 100644
>>>>> --- a/include/linux/sched.h
>>>>> +++ b/include/linux/sched.h
>>>>> @@ -550,6 +550,7 @@ struct sched_entity {
>>>>>         struct rb_node                  run_node;
>>>>>         struct list_head                group_node;
>>>>>         unsigned int                    on_rq;
>>>>> +       unsigned int                    migrated;
>>>>>
>>>>>         u64                             exec_start;
>>>>>         u64                             sum_exec_runtime;
>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>> index 7a1b1f855b96..36acd9598b40 100644
>>>>> --- a/kernel/sched/fair.c
>>>>> +++ b/kernel/sched/fair.c
>>>>> @@ -1057,6 +1057,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>>>>         /*
>>>>>          * We are starting a new run period:
>>>>>          */
>>>>> +       se->migrated = 0;
>>>>>         se->exec_start = rq_clock_task(rq_of(cfs_rq));
>>>>>  }
>>>>>
>>>>> @@ -4684,13 +4685,23 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>>>>
>>>>>         /*
>>>>>          * Pull vruntime of the entity being placed to the base level of
>>>>> -        * cfs_rq, to prevent boosting it if placed backwards.  If the entity
>>>>> -        * slept for a long time, don't even try to compare its vruntime with
>>>>> -        * the base as it may be too far off and the comparison may get
>>>>> -        * inversed due to s64 overflow.
>>>>> +        * cfs_rq, to prevent boosting it if placed backwards.
>>>>> +        * However, min_vruntime can advance much faster than real time, with
>>>>> +        * the exterme being when an entity with the minimal weight always runs
>>>>> +        * on the cfs_rq. If the new entity slept for long, its vruntime
>>>>> +        * difference from min_vruntime may overflow s64 and their comparison
>>>>> +        * may get inversed, so ignore the entity's original vruntime in that
>>>>> +        * case.
>>>>> +        * The maximal vruntime speedup is given by the ratio of normal to
>>>>> +        * minimal weight: NICE_0_LOAD / MIN_SHARES, so cutting off on the
>>>>
>>>> why not is `scale_load_down(NICE_0_LOAD) / MIN_SHARES` here ?
>>>
>>> yes, you are right.
>>>
>>>>
>>>>
>>>>> +        * sleep time of 2^63 / NICE_0_LOAD should be safe.
>>>>> +        * When placing a migrated waking entity, its exec_start has been set
>>>>> +        * from a different rq. In order to take into account a possible
>>>>> +        * divergence between new and prev rq's clocks task because of irq and
>>>>
>>>> This divergence might be larger, it cause `sleep_time` maybe negative.
>>>
>>> AFAICT, we are safe with ((1ULL << 63) / scale_load_down(NICE_0_LOAD)
>>> / 2) as long as the divergence between the 2 rqs clocks task is lower
>>> than 2^52nsec. Do you expect a divergence higher than 2^52 nsec
>>> (around 52 days)?
>>>
>>> We can probably keep using (1ULL << 63) / scale_load_down(NICE_0_LOAD)
>>> which is already half the max value if needed.
>>>
>>> the fact that sleep_time can be negative is not a problem as
>>> s64)sleep_time > will take care of this.
>>
>> In my opinion, when comparing signed with unsigned, the compiler converts the signed value to unsigned.
>> So, if sleep_time < 0, "(s64)sleep_time > (1ULL << 63) / NICE_0_LOAD / 2" will be true.
>>
>>>
>>>>
>>>>> +        * stolen time, we take an additional margin.
>>>>>          */
>>>>>         sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
>>>>> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
>>>>> +       if ((s64)sleep_time > (1ULL << 63) / NICE_0_LOAD / 2)
>>>>>                 se->vruntime = vruntime;
>>>>>         else
>>>>>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
>>>>> @@ -7658,7 +7669,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>>>>>         se->avg.last_update_time = 0;
>>>>>
>>>>>         /* We have migrated, no longer consider this task hot */
>>>>> -       se->exec_start = 0;
>>>>> +       se->migrated = 1;
>>>>>
>>>>>         update_scan_period(p, new_cpu);
>>>>>  }
>>>>> @@ -8344,6 +8355,9 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>>>>>         if (sysctl_sched_migration_cost == 0)
>>>>>                 return 0;
>>>>>
>>>>> +       if (p->se.migrated)
>>>>> +               return 0;
>>>>> +
>>>>>         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
>>>>>
>>>>>         return delta < (s64)sysctl_sched_migration_cost;
>>>>>
>>>>>
>>>>>
>>>>>>
>>>>>>
>>>>>>>
>>>>>>>
>>>>> .
>>>>>
>>> .
>>>
> .
>
  
Vincent Guittot March 14, 2023, 1:39 p.m. UTC | #35
On Tue, 14 Mar 2023 at 14:38, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>
>
>
> > On 2023/3/14 21:26, Vincent Guittot wrote:
> > On Tue, 14 Mar 2023 at 12:03, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> >>
> >>
> >>
> >> On 2023/3/13 22:23, Vincent Guittot wrote:
> >>> On Sat, 11 Mar 2023 at 10:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 2023/3/10 22:29, Vincent Guittot wrote:
> >>>>> On Thursday 9 March 2023 at 16:14:38 (+0100), Vincent Guittot wrote:
> >>>>>> On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@infradead.org> wrote:
> >>>>>>>
> >>>>>>> On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
> >>>>>>>> On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:
> >>>>>>>>
> >>>>>>>>> Then, even if we don't clear exec_start before migrating and keep
> >>>>>>>>> current value to be used in place_entity on the new cpu, we can't
> >>>>>>>>> compare the rq_clock_task(rq_of(cfs_rq)) of 2 different rqs AFAICT
> >>>>>>>>
> >>>>>>>> Blergh -- indeed, irq and steal time can skew them between CPUs :/
> >>>>>>>> I suppose we can fudge that... wait_start (which is basically what we're
> >>>>>>>> making it do) also does that IIRC.
> >>>>>>>>
> >>>>>>>> I really dislike having this placement muck spreadout like proposed.
> >>>>>>>
> >>>>>>> Also, I think we might be over-engineering this, we don't care about
> >>>>>>> accuracy at all, all we really care about is 'long-time'.
> >>>>>>
> >>>>>> you mean taking the patch 1/2 that you mentioned here to add a
> >>>>>> migrated field:
> >>>>>> https://lore.kernel.org/all/68832dfbb60fda030540b5f4e39c5801942689b1.1648228023.git.tim.c.chen@linux.intel.com/T/#ma5637eb8010f3f4a4abff778af8db705429d003b
> >>>>>>
> >>>>>> And assume that the divergence between the rq_clock_task() can be ignored ?
> >>>>>>
> >>>>>> That could probably work but we need to replace the (60LL *
> >>>>>> NSEC_PER_SEC) by ((1ULL << 63) / NICE_0_LOAD) because 60sec divergence
> >>>>>> would not be unrealistic.
> >>>>>> and a comment to explain why it's acceptable
> >>>>>
> >>>>> Zhang,
> >>>>>
> >>>>> Could you try the patch below ?
> >>>>> This is a rebase/merge/update of:
> >>>>> -patch 1/2 above and
> >>>>> -https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
> >>>>
> >>>>
> >>>> I applyed and tested this patch, and it make hackbench slower.
> >>>> According to my previous test results. The good result is 82.1(s).
> >>>> But the result of this patch is 108.725(s).
> >>>
> >>> By "the result of this patch is 108.725(s)",  you mean the result of
> >>> https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
> >>> alone, don't you ?
> >>
> >> No, with your patch, the test results is 108.725(s),
> >
> > Ok
> >
> >>
> >> git diff:
> >>
> >> diff --git a/include/linux/sched.h b/include/linux/sched.h
> >> index 63d242164b1a..93a3909ae4c4 100644
> >> --- a/include/linux/sched.h
> >> +++ b/include/linux/sched.h
> >> @@ -550,6 +550,7 @@ struct sched_entity {
> >>         struct rb_node                  run_node;
> >>         struct list_head                group_node;
> >>         unsigned int                    on_rq;
> >> +       unsigned int                    migrated;
> >>
> >>         u64                             exec_start;
> >>         u64                             sum_exec_runtime;
> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >> index ff4dbbae3b10..e60defc39f6e 100644
> >> --- a/kernel/sched/fair.c
> >> +++ b/kernel/sched/fair.c
> >> @@ -1057,6 +1057,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >>         /*
> >>          * We are starting a new run period:
> >>          */
> >> +       se->migrated = 0;
> >>         se->exec_start = rq_clock_task(rq_of(cfs_rq));
> >>  }
> >>
> >> @@ -4690,9 +4691,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >>          * inversed due to s64 overflow.
> >>          */
> >>         sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> >> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> >> +       if ((s64)sleep_time > (1ULL << 63) / scale_load_down(NICE_0_LOAD) / 2) {
> >>                 se->vruntime = vruntime;
> >> -       else
> >> +       } else
> >>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
> >>  }
> >>
> >> @@ -7658,8 +7659,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> >>         se->avg.last_update_time = 0;
> >>
> >>         /* We have migrated, no longer consider this task hot */
> >> -       se->exec_start = 0;
> >> -
> >> +       se->migrated = 1;
> >>         update_scan_period(p, new_cpu);
> >>  }
> >>
> >> @@ -8343,6 +8343,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
> >>
> >>         if (sysctl_sched_migration_cost == 0)
> >>                 return 0;
> >> +       if (p->se.migrated)
> >> +               return 0;
> >>
> >>         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
> >>
> >>
> >>
> >>>
> >>>>
> >>>>
> >>>>> version1: v6.2
> >>>>> version2: v6.2 + commit 829c1651e9c4
> >>>>> version3: v6.2 + commit 829c1651e9c4 + this patch
> >>>>>
> >>>>> -------------------------------------------------
> >>>>>       version1        version2        version3
> >>>>> test1 81.0            118.1           82.1
> >>>>> test2 82.1            116.9           80.3
> >>>>> test3 83.2            103.9           83.3
> >>>>> avg(s)        82.1            113.0           81.9
> >>>
> >>> Ok, it looks like we are back to normal figures
> >
> > What do those results refer to then ?
>
> Quote from this email (https://lore.kernel.org/lkml/1cd19d3f-18c4-92f9-257a-378cc18cfbc7@huawei.com/).

ok.

Then, there is something wrong in my patch. Let me look at it more deeply

>
> >
> >
> >>>
> >>>>>
> >>>>> -------------------------------------------------
> >>>>>
> >>>>> The proposal accepts a divergence of up to 52 days between the 2 rqs.
> >>>>>
> >>>>> If this work, we will prepare a proper patch
> >>>>>
> >>>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
> >>>>> index 63d242164b1a..cb8af0a137f7 100644
> >>>>> --- a/include/linux/sched.h
> >>>>> +++ b/include/linux/sched.h
> >>>>> @@ -550,6 +550,7 @@ struct sched_entity {
> >>>>>         struct rb_node                  run_node;
> >>>>>         struct list_head                group_node;
> >>>>>         unsigned int                    on_rq;
> >>>>> +       unsigned int                    migrated;
> >>>>>
> >>>>>         u64                             exec_start;
> >>>>>         u64                             sum_exec_runtime;
> >>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>>>> index 7a1b1f855b96..36acd9598b40 100644
> >>>>> --- a/kernel/sched/fair.c
> >>>>> +++ b/kernel/sched/fair.c
> >>>>> @@ -1057,6 +1057,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >>>>>         /*
> >>>>>          * We are starting a new run period:
> >>>>>          */
> >>>>> +       se->migrated = 0;
> >>>>>         se->exec_start = rq_clock_task(rq_of(cfs_rq));
> >>>>>  }
> >>>>>
> >>>>> @@ -4684,13 +4685,23 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >>>>>
> >>>>>         /*
> >>>>>          * Pull vruntime of the entity being placed to the base level of
> >>>>> -        * cfs_rq, to prevent boosting it if placed backwards.  If the entity
> >>>>> -        * slept for a long time, don't even try to compare its vruntime with
> >>>>> -        * the base as it may be too far off and the comparison may get
> >>>>> -        * inversed due to s64 overflow.
> >>>>> +        * cfs_rq, to prevent boosting it if placed backwards.
> >>>>> +        * However, min_vruntime can advance much faster than real time, with
> >>>>> +        * the exterme being when an entity with the minimal weight always runs
> >>>>> +        * on the cfs_rq. If the new entity slept for long, its vruntime
> >>>>> +        * difference from min_vruntime may overflow s64 and their comparison
> >>>>> +        * may get inversed, so ignore the entity's original vruntime in that
> >>>>> +        * case.
> >>>>> +        * The maximal vruntime speedup is given by the ratio of normal to
> >>>>> +        * minimal weight: NICE_0_LOAD / MIN_SHARES, so cutting off on the
> >>>>
> >>>> why not is `scale_load_down(NICE_0_LOAD) / MIN_SHARES` here ?
> >>>
> >>> yes, you are right.
> >>>
> >>>>
> >>>>
> >>>>> +        * sleep time of 2^63 / NICE_0_LOAD should be safe.
> >>>>> +        * When placing a migrated waking entity, its exec_start has been set
> >>>>> +        * from a different rq. In order to take into account a possible
> >>>>> +        * divergence between new and prev rq's clocks task because of irq and
> >>>>
> >>>> This divergence might be larger, it cause `sleep_time` maybe negative.
> >>>
> >>> AFAICT, we are safe with ((1ULL << 63) / scale_load_down(NICE_0_LOAD)
> >>> / 2) as long as the divergence between the 2 rqs clocks task is lower
> >>> than 2^52nsec. Do you expect a divergence higher than 2^52 nsec
> >>> (around 52 days)?
> >>>
> >>> We can probably keep using (1ULL << 63) / scale_load_down(NICE_0_LOAD)
> >>> which is already half the max value if needed.
> >>>
> >>> the fact that sleep_time can be negative is not a problem as
> >>> s64)sleep_time > will take care of this.
> >>
> >> In my opinion, when comparing signed with unsigned, the compiler converts the signed value to unsigned.
> >> So, if sleep_time < 0, "(s64)sleep_time > (1ULL << 63) / NICE_0_LOAD / 2" will be true.
> >>
> >>>
> >>>>
> >>>>> +        * stolen time, we take an additional margin.
> >>>>>          */
> >>>>>         sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> >>>>> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> >>>>> +       if ((s64)sleep_time > (1ULL << 63) / NICE_0_LOAD / 2)
> >>>>>                 se->vruntime = vruntime;
> >>>>>         else
> >>>>>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
> >>>>> @@ -7658,7 +7669,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> >>>>>         se->avg.last_update_time = 0;
> >>>>>
> >>>>>         /* We have migrated, no longer consider this task hot */
> >>>>> -       se->exec_start = 0;
> >>>>> +       se->migrated = 1;
> >>>>>
> >>>>>         update_scan_period(p, new_cpu);
> >>>>>  }
> >>>>> @@ -8344,6 +8355,9 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
> >>>>>         if (sysctl_sched_migration_cost == 0)
> >>>>>                 return 0;
> >>>>>
> >>>>> +       if (p->se.migrated)
> >>>>> +               return 0;
> >>>>> +
> >>>>>         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
> >>>>>
> >>>>>         return delta < (s64)sysctl_sched_migration_cost;
> >>>>>
> >>>>>
> >>>>>
> >>>>>>
> >>>>>>
> >>>>>>>
> >>>>>>>
> >>>>> .
> >>>>>
> >>> .
> >>>
> > .
> >
  
Vincent Guittot March 14, 2023, 3:32 p.m. UTC | #36
On Tuesday 14 March 2023 at 14:39:49 (+0100), Vincent Guittot wrote:
> On Tue, 14 Mar 2023 at 14:38, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> >
> >
> >
> > 在 2023/3/14 21:26, Vincent Guittot 写道:
> > > On Tue, 14 Mar 2023 at 12:03, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> > >>
> > >>
> > >>
> > >> 在 2023/3/13 22:23, Vincent Guittot 写道:
> > >>> On Sat, 11 Mar 2023 at 10:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> > >>>>
> > >>>>
> > >>>>
> > >>>> 在 2023/3/10 22:29, Vincent Guittot 写道:
> > >>>>> Le jeudi 09 mars 2023 à 16:14:38 (+0100), Vincent Guittot a écrit :
> > >>>>>> On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@infradead.org> wrote:
> > >>>>>>>
> > >>>>>>> On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
> > >>>>>>>> On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:
> > >>>>>>>>
> > >>>>>>>>> Then, even if we don't clear exec_start before migrating and keep
> > >>>>>>>>> current value to be used in place_entity on the new cpu, we can't
> > >>>>>>>>> compare the rq_clock_task(rq_of(cfs_rq)) of 2 different rqs AFAICT
> > >>>>>>>>
> > >>>>>>>> Blergh -- indeed, irq and steal time can skew them between CPUs :/
> > >>>>>>>> I suppose we can fudge that... wait_start (which is basically what we're
> > >>>>>>>> making it do) also does that IIRC.
> > >>>>>>>>
> > >>>>>>>> I really dislike having this placement muck spreadout like proposed.
> > >>>>>>>
> > >>>>>>> Also, I think we might be over-engineering this, we don't care about
> > >>>>>>> accuracy at all, all we really care about is 'long-time'.
> > >>>>>>
> > >>>>>> you mean taking the patch 1/2 that you mentioned here to add a
> > >>>>>> migrated field:
> > >>>>>> https://lore.kernel.org/all/68832dfbb60fda030540b5f4e39c5801942689b1.1648228023.git.tim.c.chen@linux.intel.com/T/#ma5637eb8010f3f4a4abff778af8db705429d003b
> > >>>>>>
> > >>>>>> And assume that the divergence between the rq_clock_task() can be ignored ?
> > >>>>>>
> > >>>>>> That could probably work but we need to replace the (60LL *
> > >>>>>> NSEC_PER_SEC) by ((1ULL << 63) / NICE_0_LOAD) because 60sec divergence
> > >>>>>> would not be unrealistic.
> > >>>>>> and a comment to explain why it's acceptable
> > >>>>>
> > >>>>> Zhang,
> > >>>>>
> > >>>>> Could you try the patch below ?
> > >>>>> This is a rebase/merge/update of:
> > >>>>> -patch 1/2 above and
> > >>>>> -https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
> > >>>>
> > >>>>
> > >>>> I applyed and tested this patch, and it make hackbench slower.
> > >>>> According to my previous test results. The good result is 82.1(s).
> > >>>> But the result of this patch is 108.725(s).
> > >>>
> > >>> By "the result of this patch is 108.725(s)",  you mean the result of
> > >>> https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
> > >>> alone, don't you ?
> > >>
> > >> No, with your patch, the test results is 108.725(s),
> > >
> > > Ok
> > >
> > >>
> > >> git diff:
> > >>
> > >> diff --git a/include/linux/sched.h b/include/linux/sched.h
> > >> index 63d242164b1a..93a3909ae4c4 100644
> > >> --- a/include/linux/sched.h
> > >> +++ b/include/linux/sched.h
> > >> @@ -550,6 +550,7 @@ struct sched_entity {
> > >>         struct rb_node                  run_node;
> > >>         struct list_head                group_node;
> > >>         unsigned int                    on_rq;
> > >> +       unsigned int                    migrated;
> > >>
> > >>         u64                             exec_start;
> > >>         u64                             sum_exec_runtime;
> > >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > >> index ff4dbbae3b10..e60defc39f6e 100644
> > >> --- a/kernel/sched/fair.c
> > >> +++ b/kernel/sched/fair.c
> > >> @@ -1057,6 +1057,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > >>         /*
> > >>          * We are starting a new run period:
> > >>          */
> > >> +       se->migrated = 0;
> > >>         se->exec_start = rq_clock_task(rq_of(cfs_rq));
> > >>  }
> > >>
> > >> @@ -4690,9 +4691,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> > >>          * inversed due to s64 overflow.
> > >>          */
> > >>         sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> > >> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> > >> +       if ((s64)sleep_time > (1ULL << 63) / scale_load_down(NICE_0_LOAD) / 2) {
> > >>                 se->vruntime = vruntime;
> > >> -       else
> > >> +       } else
> > >>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
> > >>  }
> > >>
> > >> @@ -7658,8 +7659,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> > >>         se->avg.last_update_time = 0;
> > >>
> > >>         /* We have migrated, no longer consider this task hot */
> > >> -       se->exec_start = 0;
> > >> -
> > >> +       se->migrated = 1;
> > >>         update_scan_period(p, new_cpu);
> > >>  }
> > >>
> > >> @@ -8343,6 +8343,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
> > >>
> > >>         if (sysctl_sched_migration_cost == 0)
> > >>                 return 0;
> > >> +       if (p->se.migrated)
> > >> +               return 0;
> > >>
> > >>         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
> > >>
> > >>
> > >>
> > >>>
> > >>>>
> > >>>>
> > >>>>> version1: v6.2
> > >>>>> version2: v6.2 + commit 829c1651e9c4
> > >>>>> version3: v6.2 + commit 829c1651e9c4 + this patch
> > >>>>>
> > >>>>> -------------------------------------------------
> > >>>>>       version1        version2        version3
> > >>>>> test1 81.0            118.1           82.1
> > >>>>> test2 82.1            116.9           80.3
> > >>>>> test3 83.2            103.9           83.3
> > >>>>> avg(s)        82.1            113.0           81.9
> > >>>
> > >>> Ok, it looks like we are back to normal figures
> > >
> > > What do those results refer to then ?
> >
> > Quote from this email (https://lore.kernel.org/lkml/1cd19d3f-18c4-92f9-257a-378cc18cfbc7@huawei.com/).
> 
> ok.
> 
> Then, there is something wrong in my patch. Let me look at it more deeply

Could you try the patch below? It fixes the problem on my system

---
 kernel/sched/fair.c | 84 +++++++++++++++++++++++++++++++--------------
 1 file changed, 59 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0f499e9a74b5..f8722e47bb0b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4648,23 +4648,36 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 #endif
 }

+static inline bool entity_is_long_sleeper(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq;
+       u64 sleep_time;
+
+       if (se->exec_start == 0)
+               return false;
+
+       cfs_rq = cfs_rq_of(se);
+
+       sleep_time = rq_clock_task(rq_of(cfs_rq));
+
+       /* Happen while migrating because of clock task divergence */
+       if (sleep_time <= se->exec_start)
+	       return false;
+
+       sleep_time -= se->exec_start;
+       if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
+               return true;
+
+       return false;
+}
+
 static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
 	u64 vruntime = cfs_rq->min_vruntime;
-	u64 sleep_time;
-
-	/*
-	 * The 'current' period is already promised to the current tasks,
-	 * however the extra weight of the new task will slow them down a
-	 * little, place the new task so that it fits in the slot that
-	 * stays open at the end.
-	 */
-	if (initial && sched_feat(START_DEBIT))
-		vruntime += sched_vslice(cfs_rq, se);

 	/* sleeps up to a single latency don't count. */
-	if (!initial) {
+	if (flags & ENQUEUE_WAKEUP) {
 		unsigned long thresh;

 		if (se_is_idle(se))
@@ -4680,20 +4693,43 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 			thresh >>= 1;

 		vruntime -= thresh;
+	} else if sched_feat(START_DEBIT) {
+		/*
+		 * The 'current' period is already promised to the current tasks,
+		 * however the extra weight of the new task will slow them down a
+		 * little, place the new task so that it fits in the slot that
+		 * stays open at the end.
+		 */
+		vruntime += sched_vslice(cfs_rq, se);
 	}

 	/*
 	 * Pull vruntime of the entity being placed to the base level of
-	 * cfs_rq, to prevent boosting it if placed backwards.  If the entity
-	 * slept for a long time, don't even try to compare its vruntime with
-	 * the base as it may be too far off and the comparison may get
-	 * inversed due to s64 overflow.
-	 */
-	sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
-	if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
+	 * cfs_rq, to prevent boosting it if placed backwards.
+	 * However, min_vruntime can advance much faster than real time, with
+	 * the exterme being when an entity with the minimal weight always runs
+	 * on the cfs_rq. If the new entity slept for long, its vruntime
+	 * difference from min_vruntime may overflow s64 and their comparison
+	 * may get inversed, so ignore the entity's original vruntime in that
+	 * case.
+	 * The maximal vruntime speedup is given by the ratio of normal to
+	 * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
+	 * When placing a migrated waking entity, its exec_start has been set
+	 * from a different rq. In order to take into account a possible
+	 * divergence between new and prev rq's clocks task because of irq and
+	 * stolen time, we take an additional margin.
+	 * So, cutting off on the sleep time of
+	 *     2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
+	 * should be safe.
+
+	 */
+	if (entity_is_long_sleeper(se))
 		se->vruntime = vruntime;
 	else
 		se->vruntime = max_vruntime(se->vruntime, vruntime);
+
+	if (flags & ENQUEUE_MIGRATED)
+		se->exec_start = 0;
 }

 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -4769,7 +4805,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	account_entity_enqueue(cfs_rq, se);

 	if (flags & ENQUEUE_WAKEUP)
-		place_entity(cfs_rq, se, 0);
+		place_entity(cfs_rq, se, flags);

 	check_schedstat_required();
 	update_stats_enqueue_fair(cfs_rq, se, flags);
@@ -7665,9 +7701,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 	/* Tell new CPU we are migrated */
 	se->avg.last_update_time = 0;

-	/* We have migrated, no longer consider this task hot */
-	se->exec_start = 0;
-
 	update_scan_period(p, new_cpu);
 }

@@ -11993,7 +12026,7 @@ static void task_fork_fair(struct task_struct *p)
 		update_curr(cfs_rq);
 		se->vruntime = curr->vruntime;
 	}
-	place_entity(cfs_rq, se, 1);
+	place_entity(cfs_rq, se, 0);

 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
 		/*
@@ -12137,8 +12170,9 @@ static void detach_task_cfs_rq(struct task_struct *p)
 		/*
 		 * Fix up our vruntime so that the current sleep doesn't
 		 * cause 'unlimited' sleep bonus.
+		 * This is the same as placing a waking task.
 		 */
-		place_entity(cfs_rq, se, 0);
+		place_entity(cfs_rq, se, ENQUEUE_WAKEUP);
 		se->vruntime -= cfs_rq->min_vruntime;
 	}

--
2.34.1
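
[ Back-of-the-envelope check of the margin described in the place_entity()
  comment above, taking scale_load_down(NICE_0_LOAD) == 1024 and
  MIN_SHARES == 2 as in mainline:

      max vruntime speedup = 1024 / 2 = 512 = 2^9
      cutoff sleep time    = 2^63 / 1024 ns = 2^53 ns ~= 104 days
      worst-case advance   = 2^9 * 2^53 ns  = 2^62 ns

  which stays below the 2^63 point where the s64 comparison against
  min_vruntime would get inversed. ]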


> 
> >
> > >
  
Peter Zijlstra March 14, 2023, 5:16 p.m. UTC | #37
On Tue, Mar 14, 2023 at 02:24:37PM +0100, Vincent Guittot wrote:

> > @@ -7632,11 +7646,8 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> >          * min_vruntime -- the latter is done by enqueue_entity() when placing
> >          * the task on the new runqueue.
> >          */
> > -       if (READ_ONCE(p->__state) == TASK_WAKING) {
> > -               struct cfs_rq *cfs_rq = cfs_rq_of(se);
> > -
> > +       if (READ_ONCE(p->__state) == TASK_WAKING || reset_vruntime(cfs_rq, se))
> 
> That's somehow what was proposed in one of the previous proposals but
> we can't call rq_clock_task(rq_of(cfs_rq)) because rq lock might not
> be hold and rq task clock has not been updated before being used

Argh indeed. I spent a lot of time ensuring we didn't take the old rq
lock on wakeup -- and then a lot of time cursing about how we don't :-)

Now, if we could rely on the rq-clock being no more than 1 tick behind
current, this would still be entirely sufficient to catch the long sleep
case.

Except I suppose that NOHZ can bite us here. If the old CPU is idle, the
timestamps can be arbitrarily old. Mooo :/
  
Vincent Guittot March 15, 2023, 7:18 a.m. UTC | #38
On Tue, 14 Mar 2023 at 18:16, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Mar 14, 2023 at 02:24:37PM +0100, Vincent Guittot wrote:
>
> > > @@ -7632,11 +7646,8 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> > >          * min_vruntime -- the latter is done by enqueue_entity() when placing
> > >          * the task on the new runqueue.
> > >          */
> > > -       if (READ_ONCE(p->__state) == TASK_WAKING) {
> > > -               struct cfs_rq *cfs_rq = cfs_rq_of(se);
> > > -
> > > +       if (READ_ONCE(p->__state) == TASK_WAKING || reset_vruntime(cfs_rq, se))
> >
> > That's somehow what was proposed in one of the previous proposals but
> > we can't call rq_clock_task(rq_of(cfs_rq)) because rq lock might not
> > be hold and rq task clock has not been updated before being used
>
> Argh indeed. I spend a lot of time ensuring we didn't take the old rq
> lock on wakeup -- and then a lot of time cursing about how we don't :-)
>
> Now, if we could rely on the rq-clock being no more than 1 tick behind
> current, this would still be entirely sufficient to catch the long sleep
> case.

We should also take care when loading rq_clock_task that we are not
racing with an update, especially on a 32-bit system, as with pelt
last_update_time

>
> Except I suppose that NOHZ can bite us here. If the old CPU is idle, the
> timestamps can be arbitrarily old. Mooo :/

That should not be a real problem because if the cpu is idle and the
rq clock is not updated, the min_vruntime will not move forward so we
are "safe" in regard to the overflow.

That's what was done in the v2 and v3 of this patch


>
>
  
Vincent Guittot March 15, 2023, 8:42 a.m. UTC | #39
On Wed, 15 Mar 2023 at 08:18, Vincent Guittot
<vincent.guittot@linaro.org> wrote:
>
> On Tue, 14 Mar 2023 at 18:16, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Tue, Mar 14, 2023 at 02:24:37PM +0100, Vincent Guittot wrote:
> >
> > > > @@ -7632,11 +7646,8 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> > > >          * min_vruntime -- the latter is done by enqueue_entity() when placing
> > > >          * the task on the new runqueue.
> > > >          */
> > > > -       if (READ_ONCE(p->__state) == TASK_WAKING) {
> > > > -               struct cfs_rq *cfs_rq = cfs_rq_of(se);
> > > > -
> > > > +       if (READ_ONCE(p->__state) == TASK_WAKING || reset_vruntime(cfs_rq, se))
> > >
> > > That's somehow what was proposed in one of the previous proposals but
> > > we can't call rq_clock_task(rq_of(cfs_rq)) because rq lock might not
> > > be hold and rq task clock has not been updated before being used
> >
> > Argh indeed. I spend a lot of time ensuring we didn't take the old rq
> > lock on wakeup -- and then a lot of time cursing about how we don't :-)
> >
> > Now, if we could rely on the rq-clock being no more than 1 tick behind
> > current, this would still be entirely sufficient to catch the long sleep
> > case.
>
> We should also take care when loading rq_clock_task that we are not
> racing with an update especially for a 32bits system like pelt
> last_update_time

We still have this possibility:
https://lore.kernel.org/lkml/ZAiFxWLSb9HDazSI@vingu-book/

which uses pelt last_update_time when migrating and keeps using
rq_clock_task in place_entity

>
> >
> > Except I suppose that NOHZ can bite us here. If the old CPU is idle, the
> > timestamps can be arbitrarily old. Mooo :/
>
> That should not be a real problem because if the cpu is idle and the
> rq clock is not updated, the min_vruntime will not move forward so we
> are "safe" in regard to the overflow.
>
> That's what was done in the v2 and v3 of this patch
>
>
> >
> >
  
Zhang Qiao March 15, 2023, 9:16 a.m. UTC | #40
On 2023/3/14 23:32, Vincent Guittot wrote:
> Le mardi 14 mars 2023 à 14:39:49 (+0100), Vincent Guittot a écrit :
>> On Tue, 14 Mar 2023 at 14:38, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>>>
>>>
>>>
>>> 在 2023/3/14 21:26, Vincent Guittot 写道:
>>>> On Tue, 14 Mar 2023 at 12:03, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>>>>>
>>>>>
>>>>>
>>>>> 在 2023/3/13 22:23, Vincent Guittot 写道:
>>>>>> On Sat, 11 Mar 2023 at 10:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> 在 2023/3/10 22:29, Vincent Guittot 写道:
>>>>>>>> Le jeudi 09 mars 2023 à 16:14:38 (+0100), Vincent Guittot a écrit :
>>>>>>>>> On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@infradead.org> wrote:
>>>>>>>>>>
>>>>>>>>>> On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
>>>>>>>>>>> On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:
>>>>>>>>>>>
>>>>>>>>>>>> Then, even if we don't clear exec_start before migrating and keep
>>>>>>>>>>>> current value to be used in place_entity on the new cpu, we can't
>>>>>>>>>>>> compare the rq_clock_task(rq_of(cfs_rq)) of 2 different rqs AFAICT
>>>>>>>>>>>
>>>>>>>>>>> Blergh -- indeed, irq and steal time can skew them between CPUs :/
>>>>>>>>>>> I suppose we can fudge that... wait_start (which is basically what we're
>>>>>>>>>>> making it do) also does that IIRC.
>>>>>>>>>>>
>>>>>>>>>>> I really dislike having this placement muck spreadout like proposed.
>>>>>>>>>>
>>>>>>>>>> Also, I think we might be over-engineering this, we don't care about
>>>>>>>>>> accuracy at all, all we really care about is 'long-time'.
>>>>>>>>>
>>>>>>>>> you mean taking the patch 1/2 that you mentioned here to add a
>>>>>>>>> migrated field:
>>>>>>>>> https://lore.kernel.org/all/68832dfbb60fda030540b5f4e39c5801942689b1.1648228023.git.tim.c.chen@linux.intel.com/T/#ma5637eb8010f3f4a4abff778af8db705429d003b
>>>>>>>>>
>>>>>>>>> And assume that the divergence between the rq_clock_task() can be ignored ?
>>>>>>>>>
>>>>>>>>> That could probably work but we need to replace the (60LL *
>>>>>>>>> NSEC_PER_SEC) by ((1ULL << 63) / NICE_0_LOAD) because 60sec divergence
>>>>>>>>> would not be unrealistic.
>>>>>>>>> and a comment to explain why it's acceptable
>>>>>>>>
>>>>>>>> Zhang,
>>>>>>>>
>>>>>>>> Could you try the patch below ?
>>>>>>>> This is a rebase/merge/update of:
>>>>>>>> -patch 1/2 above and
>>>>>>>> -https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
>>>>>>>
>>>>>>>
>>>>>>> I applyed and tested this patch, and it make hackbench slower.
>>>>>>> According to my previous test results. The good result is 82.1(s).
>>>>>>> But the result of this patch is 108.725(s).
>>>>>>
>>>>>> By "the result of this patch is 108.725(s)",  you mean the result of
>>>>>> https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
>>>>>> alone, don't you ?
>>>>>
>>>>> No, with your patch, the test results is 108.725(s),
>>>>
>>>> Ok
>>>>
>>>>>
>>>>> git diff:
>>>>>
>>>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>>>> index 63d242164b1a..93a3909ae4c4 100644
>>>>> --- a/include/linux/sched.h
>>>>> +++ b/include/linux/sched.h
>>>>> @@ -550,6 +550,7 @@ struct sched_entity {
>>>>>         struct rb_node                  run_node;
>>>>>         struct list_head                group_node;
>>>>>         unsigned int                    on_rq;
>>>>> +       unsigned int                    migrated;
>>>>>
>>>>>         u64                             exec_start;
>>>>>         u64                             sum_exec_runtime;
>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>> index ff4dbbae3b10..e60defc39f6e 100644
>>>>> --- a/kernel/sched/fair.c
>>>>> +++ b/kernel/sched/fair.c
>>>>> @@ -1057,6 +1057,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>>>>         /*
>>>>>          * We are starting a new run period:
>>>>>          */
>>>>> +       se->migrated = 0;
>>>>>         se->exec_start = rq_clock_task(rq_of(cfs_rq));
>>>>>  }
>>>>>
>>>>> @@ -4690,9 +4691,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>>>>>          * inversed due to s64 overflow.
>>>>>          */
>>>>>         sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
>>>>> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
>>>>> +       if ((s64)sleep_time > (1ULL << 63) / scale_load_down(NICE_0_LOAD) / 2) {
>>>>>                 se->vruntime = vruntime;
>>>>> -       else
>>>>> +       } else
>>>>>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
>>>>>  }
>>>>>
>>>>> @@ -7658,8 +7659,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>>>>>         se->avg.last_update_time = 0;
>>>>>
>>>>>         /* We have migrated, no longer consider this task hot */
>>>>> -       se->exec_start = 0;
>>>>> -
>>>>> +       se->migrated = 1;
>>>>>         update_scan_period(p, new_cpu);
>>>>>  }
>>>>>
>>>>> @@ -8343,6 +8343,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>>>>>
>>>>>         if (sysctl_sched_migration_cost == 0)
>>>>>                 return 0;
>>>>> +       if (p->se.migrated)
>>>>> +               return 0;
>>>>>
>>>>>         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
>>>>>
>>>>>
>>>>>
>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> version1: v6.2
>>>>>>>> version2: v6.2 + commit 829c1651e9c4
>>>>>>>> version3: v6.2 + commit 829c1651e9c4 + this patch
>>>>>>>>
>>>>>>>> -------------------------------------------------
>>>>>>>>       version1        version2        version3
>>>>>>>> test1 81.0            118.1           82.1
>>>>>>>> test2 82.1            116.9           80.3
>>>>>>>> test3 83.2            103.9           83.3
>>>>>>>> avg(s)        82.1            113.0           81.9
>>>>>>
>>>>>> Ok, it looks like we are back to normal figures
>>>>
>>>> What do those results refer to then ?
>>>
>>> Quote from this email (https://lore.kernel.org/lkml/1cd19d3f-18c4-92f9-257a-378cc18cfbc7@huawei.com/).
>>
>> ok.
>>
>> Then, there is something wrong in my patch. Let me look at it more deeply
> 
> Coudl you try the patc below. It fixes the problem on my system
> 

Yes, it fixes the problem.

------

 Performance counter stats for 'hackbench -g 44 -f 20 --process --pipe -l 60000 -s 100' (10 runs):

             79.53 +- 1.21 seconds time elapsed  ( +-  1.52% )



> ---
>  kernel/sched/fair.c | 84 +++++++++++++++++++++++++++++++--------------
>  1 file changed, 59 insertions(+), 25 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 0f499e9a74b5..f8722e47bb0b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4648,23 +4648,36 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  #endif
>  }
> 
> +static inline bool entity_is_long_sleeper(struct sched_entity *se)
> +{
> +       struct cfs_rq *cfs_rq;
> +       u64 sleep_time;
> +
> +       if (se->exec_start == 0)
> +               return false;
> +
> +       cfs_rq = cfs_rq_of(se);
> +
> +       sleep_time = rq_clock_task(rq_of(cfs_rq));
> +
> +       /* Happen while migrating because of clock task divergence */
> +       if (sleep_time <= se->exec_start)
> +	       return false;
> +
> +       sleep_time -= se->exec_start;
> +       if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
> +               return true;
> +
> +       return false;
> +}
> +
>  static void
> -place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  {
>  	u64 vruntime = cfs_rq->min_vruntime;
> -	u64 sleep_time;
> -
> -	/*
> -	 * The 'current' period is already promised to the current tasks,
> -	 * however the extra weight of the new task will slow them down a
> -	 * little, place the new task so that it fits in the slot that
> -	 * stays open at the end.
> -	 */
> -	if (initial && sched_feat(START_DEBIT))
> -		vruntime += sched_vslice(cfs_rq, se);
> 
>  	/* sleeps up to a single latency don't count. */
> -	if (!initial) {
> +	if (flags & ENQUEUE_WAKEUP) {
>  		unsigned long thresh;
> 
>  		if (se_is_idle(se))
> @@ -4680,20 +4693,43 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>  			thresh >>= 1;
> 
>  		vruntime -= thresh;
> +	} else if sched_feat(START_DEBIT) {

There's a parenthesis missing here.


> +		/*
> +		 * The 'current' period is already promised to the current tasks,
> +		 * however the extra weight of the new task will slow them down a
> +		 * little, place the new task so that it fits in the slot that
> +		 * stays open at the end.
> +		 */
> +		vruntime += sched_vslice(cfs_rq, se);
>  	}
> 
>  	/*
>  	 * Pull vruntime of the entity being placed to the base level of
> -	 * cfs_rq, to prevent boosting it if placed backwards.  If the entity
> -	 * slept for a long time, don't even try to compare its vruntime with
> -	 * the base as it may be too far off and the comparison may get
> -	 * inversed due to s64 overflow.
> -	 */
> -	sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> -	if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> +	 * cfs_rq, to prevent boosting it if placed backwards.
> +	 * However, min_vruntime can advance much faster than real time, with
> +	 * the exterme being when an entity with the minimal weight always runs
> +	 * on the cfs_rq. If the new entity slept for long, its vruntime
> +	 * difference from min_vruntime may overflow s64 and their comparison
> +	 * may get inversed, so ignore the entity's original vruntime in that
> +	 * case.
> +	 * The maximal vruntime speedup is given by the ratio of normal to
> +	 * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
> +	 * When placing a migrated waking entity, its exec_start has been set
> +	 * from a different rq. In order to take into account a possible
> +	 * divergence between new and prev rq's clocks task because of irq and
> +	 * stolen time, we take an additional margin.
> +	 * So, cutting off on the sleep time of
> +	 *     2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
> +	 * should be safe.
> +
> +	 */
> +	if (entity_is_long_sleeper(se))
>  		se->vruntime = vruntime;
>  	else
>  		se->vruntime = max_vruntime(se->vruntime, vruntime);
> +
> +	if (flags & ENQUEUE_MIGRATED)
> +		se->exec_start = 0;
>  }
> 
>  static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
> @@ -4769,7 +4805,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  	account_entity_enqueue(cfs_rq, se);
> 
>  	if (flags & ENQUEUE_WAKEUP)
> -		place_entity(cfs_rq, se, 0);
> +		place_entity(cfs_rq, se, flags);
> 
>  	check_schedstat_required();
>  	update_stats_enqueue_fair(cfs_rq, se, flags);
> @@ -7665,9 +7701,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>  	/* Tell new CPU we are migrated */
>  	se->avg.last_update_time = 0;
> 
> -	/* We have migrated, no longer consider this task hot */
> -	se->exec_start = 0;
> -
>  	update_scan_period(p, new_cpu);
>  }
> 
> @@ -11993,7 +12026,7 @@ static void task_fork_fair(struct task_struct *p)
>  		update_curr(cfs_rq);
>  		se->vruntime = curr->vruntime;
>  	}
> -	place_entity(cfs_rq, se, 1);
> +	place_entity(cfs_rq, se, 0);
> 
>  	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
>  		/*
> @@ -12137,8 +12170,9 @@ static void detach_task_cfs_rq(struct task_struct *p)
>  		/*
>  		 * Fix up our vruntime so that the current sleep doesn't
>  		 * cause 'unlimited' sleep bonus.
> +		 * This is the same as placing a waking task.
>  		 */
> -		place_entity(cfs_rq, se, 0);
> +		place_entity(cfs_rq, se, ENQUEUE_WAKEUP);
>  		se->vruntime -= cfs_rq->min_vruntime;
>  	}
> 
> --
> 2.34.1
> 
> 
>>
>>>
>>>>
> .
>
  
Dietmar Eggemann March 15, 2023, 10:15 a.m. UTC | #41
On 15/03/2023 09:42, Vincent Guittot wrote:
> On Wed, 15 Mar 2023 at 08:18, Vincent Guittot
> <vincent.guittot@linaro.org> wrote:
>>
>> On Tue, 14 Mar 2023 at 18:16, Peter Zijlstra <peterz@infradead.org> wrote:
>>>
>>> On Tue, Mar 14, 2023 at 02:24:37PM +0100, Vincent Guittot wrote:
>>>
>>>>> @@ -7632,11 +7646,8 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
>>>>>          * min_vruntime -- the latter is done by enqueue_entity() when placing
>>>>>          * the task on the new runqueue.
>>>>>          */
>>>>> -       if (READ_ONCE(p->__state) == TASK_WAKING) {
>>>>> -               struct cfs_rq *cfs_rq = cfs_rq_of(se);
>>>>> -
>>>>> +       if (READ_ONCE(p->__state) == TASK_WAKING || reset_vruntime(cfs_rq, se))
>>>>
>>>> That's somehow what was proposed in one of the previous proposals but
>>>> we can't call rq_clock_task(rq_of(cfs_rq)) because rq lock might not
>>>> be hold and rq task clock has not been updated before being used
>>>
>>> Argh indeed. I spend a lot of time ensuring we didn't take the old rq
>>> lock on wakeup -- and then a lot of time cursing about how we don't :-)
>>>
>>> Now, if we could rely on the rq-clock being no more than 1 tick behind
>>> current, this would still be entirely sufficient to catch the long sleep
>>> case.
>>
>> We should also take care when loading rq_clock_task that we are not
>> racing with an update especially for a 32bits system like pelt
>> last_update_time
> 
> We still have this possibility:
> https://lore.kernel.org/lkml/ZAiFxWLSb9HDazSI@vingu-book/
> 
> which uses pelt last_update_time when migrating and keep using
> rq_clock_task in place_entity

Isn't there an issue with this approach on asymmetric CPU capacity systems?

We do a sync_entity_load_avg() in select_task_rq_fair()
(find_energy_efficient_cpu() for EAS and select_idle_sibling() for CAS)
to sync cfs_rq and se.

[...]
  
Vincent Guittot March 15, 2023, 10:21 a.m. UTC | #42
On Wed, 15 Mar 2023 at 11:15, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>
> On 15/03/2023 09:42, Vincent Guittot wrote:
> > On Wed, 15 Mar 2023 at 08:18, Vincent Guittot
> > <vincent.guittot@linaro.org> wrote:
> >>
> >> On Tue, 14 Mar 2023 at 18:16, Peter Zijlstra <peterz@infradead.org> wrote:
> >>>
> >>> On Tue, Mar 14, 2023 at 02:24:37PM +0100, Vincent Guittot wrote:
> >>>
> >>>>> @@ -7632,11 +7646,8 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> >>>>>          * min_vruntime -- the latter is done by enqueue_entity() when placing
> >>>>>          * the task on the new runqueue.
> >>>>>          */
> >>>>> -       if (READ_ONCE(p->__state) == TASK_WAKING) {
> >>>>> -               struct cfs_rq *cfs_rq = cfs_rq_of(se);
> >>>>> -
> >>>>> +       if (READ_ONCE(p->__state) == TASK_WAKING || reset_vruntime(cfs_rq, se))
> >>>>
> >>>> That's somehow what was proposed in one of the previous proposals but
> >>>> we can't call rq_clock_task(rq_of(cfs_rq)) because rq lock might not
> >>>> be hold and rq task clock has not been updated before being used
> >>>
> >>> Argh indeed. I spend a lot of time ensuring we didn't take the old rq
> >>> lock on wakeup -- and then a lot of time cursing about how we don't :-)
> >>>
> >>> Now, if we could rely on the rq-clock being no more than 1 tick behind
> >>> current, this would still be entirely sufficient to catch the long sleep
> >>> case.
> >>
> >> We should also take care when loading rq_clock_task that we are not
> >> racing with an update especially for a 32bits system like pelt
> >> last_update_time
> >
> > We still have this possibility:
> > https://lore.kernel.org/lkml/ZAiFxWLSb9HDazSI@vingu-book/
> >
> > which uses pelt last_update_time when migrating and keep using
> > rq_clock_task in place_entity
>
> Isn't there an issue with this approach on asymmetric CPU capacity systems?
>
> We do a sync_entity_load_avg() in select_task_rq_fair()
> (find_energy_efficient_cpu() for EAS and select_idle_sibling() for CAS)
> to sync cfs_rq and se.

ah yes, I forgot this point.
That being said, is it a valid problem for an EAS-based system? I mean
we are trying to fix a vruntime comparison overflow that can happen
with a very long sleeping task (around 200 days) while only a very low
weight entity is always running  during those 200 days.

>
> [...]
>
  
Dietmar Eggemann March 15, 2023, 1:35 p.m. UTC | #43
On 15/03/2023 11:21, Vincent Guittot wrote:
> On Wed, 15 Mar 2023 at 11:15, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>>
>> On 15/03/2023 09:42, Vincent Guittot wrote:
>>> On Wed, 15 Mar 2023 at 08:18, Vincent Guittot
>>> <vincent.guittot@linaro.org> wrote:
>>>>
>>>> On Tue, 14 Mar 2023 at 18:16, Peter Zijlstra <peterz@infradead.org> wrote:
>>>>>
>>>>> On Tue, Mar 14, 2023 at 02:24:37PM +0100, Vincent Guittot wrote:
>>>>>

[...]

>> Isn't there an issue with this approach on asymmetric CPU capacity systems?
>>
>> We do a sync_entity_load_avg() in select_task_rq_fair()
>> (find_energy_efficient_cpu() for EAS and select_idle_sibling() for CAS)
>> to sync cfs_rq and se.
> 
> ah yes, I forgot this point.
> That being said, is it a valid problem for EAS based system ? I mean
> we are trying to fix a vruntime comparison overflow that can happen
> with a very long sleeping task (around 200 days) while only a very low
> weight entity is always running  during those 200 days.

True. Definitely very unlikely. But it's not only EAS, any asymmetric
CPU capacity wakeup wouldn't have this check in this case.

This dependency between sync_entity_load_avg() and the overflow
detection would be very hard to spot though (in case
sync_entity_load_avg() would get introduced in more common wakeup paths
later).
  
Vincent Guittot March 15, 2023, 3:30 p.m. UTC | #44
On Wed, 15 Mar 2023 at 10:16, Zhang Qiao <zhangqiao22@huawei.com> wrote:
>
>
>
> 在 2023/3/14 23:32, Vincent Guittot 写道:
> > Le mardi 14 mars 2023 à 14:39:49 (+0100), Vincent Guittot a écrit :
> >> On Tue, 14 Mar 2023 at 14:38, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> >>>
> >>>
> >>>
> >>> 在 2023/3/14 21:26, Vincent Guittot 写道:
> >>>> On Tue, 14 Mar 2023 at 12:03, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> >>>>>
> >>>>>
> >>>>>
> >>>>> 在 2023/3/13 22:23, Vincent Guittot 写道:
> >>>>>> On Sat, 11 Mar 2023 at 10:57, Zhang Qiao <zhangqiao22@huawei.com> wrote:
> >>>>>>>
> >>>>>>>
> >>>>>>>
> >>>>>>> 在 2023/3/10 22:29, Vincent Guittot 写道:
> >>>>>>>> Le jeudi 09 mars 2023 à 16:14:38 (+0100), Vincent Guittot a écrit :
> >>>>>>>>> On Thu, 9 Mar 2023 at 15:37, Peter Zijlstra <peterz@infradead.org> wrote:
> >>>>>>>>>>
> >>>>>>>>>> On Thu, Mar 09, 2023 at 03:28:25PM +0100, Peter Zijlstra wrote:
> >>>>>>>>>>> On Thu, Mar 09, 2023 at 02:34:05PM +0100, Vincent Guittot wrote:
> >>>>>>>>>>>
> >>>>>>>>>>>> Then, even if we don't clear exec_start before migrating and keep
> >>>>>>>>>>>> current value to be used in place_entity on the new cpu, we can't
> >>>>>>>>>>>> compare the rq_clock_task(rq_of(cfs_rq)) of 2 different rqs AFAICT
> >>>>>>>>>>>
> >>>>>>>>>>> Blergh -- indeed, irq and steal time can skew them between CPUs :/
> >>>>>>>>>>> I suppose we can fudge that... wait_start (which is basically what we're
> >>>>>>>>>>> making it do) also does that IIRC.
> >>>>>>>>>>>
> >>>>>>>>>>> I really dislike having this placement muck spreadout like proposed.
> >>>>>>>>>>
> >>>>>>>>>> Also, I think we might be over-engineering this, we don't care about
> >>>>>>>>>> accuracy at all, all we really care about is 'long-time'.
> >>>>>>>>>
> >>>>>>>>> you mean taking the patch 1/2 that you mentioned here to add a
> >>>>>>>>> migrated field:
> >>>>>>>>> https://lore.kernel.org/all/68832dfbb60fda030540b5f4e39c5801942689b1.1648228023.git.tim.c.chen@linux.intel.com/T/#ma5637eb8010f3f4a4abff778af8db705429d003b
> >>>>>>>>>
> >>>>>>>>> And assume that the divergence between the rq_clock_task() can be ignored ?
> >>>>>>>>>
> >>>>>>>>> That could probably work but we need to replace the (60LL *
> >>>>>>>>> NSEC_PER_SEC) by ((1ULL << 63) / NICE_0_LOAD) because 60sec divergence
> >>>>>>>>> would not be unrealistic.
> >>>>>>>>> and a comment to explain why it's acceptable
> >>>>>>>>
> >>>>>>>> Zhang,
> >>>>>>>>
> >>>>>>>> Could you try the patch below ?
> >>>>>>>> This is a rebase/merge/update of:
> >>>>>>>> -patch 1/2 above and
> >>>>>>>> -https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
> >>>>>>>
> >>>>>>>
> >>>>>>> I applyed and tested this patch, and it make hackbench slower.
> >>>>>>> According to my previous test results. The good result is 82.1(s).
> >>>>>>> But the result of this patch is 108.725(s).
> >>>>>>
> >>>>>> By "the result of this patch is 108.725(s)",  you mean the result of
> >>>>>> https://lore.kernel.org/lkml/20230209193107.1432770-1-rkagan@amazon.de/
> >>>>>> alone, don't you ?
> >>>>>
> >>>>> No, with your patch, the test results is 108.725(s),
> >>>>
> >>>> Ok
> >>>>
> >>>>>
> >>>>> git diff:
> >>>>>
> >>>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
> >>>>> index 63d242164b1a..93a3909ae4c4 100644
> >>>>> --- a/include/linux/sched.h
> >>>>> +++ b/include/linux/sched.h
> >>>>> @@ -550,6 +550,7 @@ struct sched_entity {
> >>>>>         struct rb_node                  run_node;
> >>>>>         struct list_head                group_node;
> >>>>>         unsigned int                    on_rq;
> >>>>> +       unsigned int                    migrated;
> >>>>>
> >>>>>         u64                             exec_start;
> >>>>>         u64                             sum_exec_runtime;
> >>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>>>> index ff4dbbae3b10..e60defc39f6e 100644
> >>>>> --- a/kernel/sched/fair.c
> >>>>> +++ b/kernel/sched/fair.c
> >>>>> @@ -1057,6 +1057,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >>>>>         /*
> >>>>>          * We are starting a new run period:
> >>>>>          */
> >>>>> +       se->migrated = 0;
> >>>>>         se->exec_start = rq_clock_task(rq_of(cfs_rq));
> >>>>>  }
> >>>>>
> >>>>> @@ -4690,9 +4691,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >>>>>          * inversed due to s64 overflow.
> >>>>>          */
> >>>>>         sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> >>>>> -       if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> >>>>> +       if ((s64)sleep_time > (1ULL << 63) / scale_load_down(NICE_0_LOAD) / 2) {
> >>>>>                 se->vruntime = vruntime;
> >>>>> -       else
> >>>>> +       } else
> >>>>>                 se->vruntime = max_vruntime(se->vruntime, vruntime);
> >>>>>  }
> >>>>>
> >>>>> @@ -7658,8 +7659,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> >>>>>         se->avg.last_update_time = 0;
> >>>>>
> >>>>>         /* We have migrated, no longer consider this task hot */
> >>>>> -       se->exec_start = 0;
> >>>>> -
> >>>>> +       se->migrated = 1;
> >>>>>         update_scan_period(p, new_cpu);
> >>>>>  }
> >>>>>
> >>>>> @@ -8343,6 +8343,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
> >>>>>
> >>>>>         if (sysctl_sched_migration_cost == 0)
> >>>>>                 return 0;
> >>>>> +       if (p->se.migrated)
> >>>>> +               return 0;
> >>>>>
> >>>>>         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
> >>>>>
> >>>>>
> >>>>>
> >>>>>>
> >>>>>>>
> >>>>>>>
> >>>>>>>> version1: v6.2
> >>>>>>>> version2: v6.2 + commit 829c1651e9c4
> >>>>>>>> version3: v6.2 + commit 829c1651e9c4 + this patch
> >>>>>>>>
> >>>>>>>> -------------------------------------------------
> >>>>>>>>       version1        version2        version3
> >>>>>>>> test1 81.0            118.1           82.1
> >>>>>>>> test2 82.1            116.9           80.3
> >>>>>>>> test3 83.2            103.9           83.3
> >>>>>>>> avg(s)        82.1            113.0           81.9
> >>>>>>
> >>>>>> Ok, it looks like we are back to normal figures
> >>>>
> >>>> What do those results refer to then ?
> >>>
> >>> Quote from this email (https://lore.kernel.org/lkml/1cd19d3f-18c4-92f9-257a-378cc18cfbc7@huawei.com/).
> >>
> >> ok.
> >>
> >> Then, there is something wrong in my patch. Let me look at it more deeply
> >
> > Coudl you try the patc below. It fixes the problem on my system
> >
>
> Yes, it fixes the problem.
>
> ------
>
>  Performance counter stats for 'hackbench -g 44 -f 20 --process --pipe -l 60000 -s 100' (10 runs):
>
>              79.53 +- 1.21 seconds time elapsed  ( +-  1.52% )

Thanks for testing

>
>
>
> > ---
> >  kernel/sched/fair.c | 84 +++++++++++++++++++++++++++++++--------------
> >  1 file changed, 59 insertions(+), 25 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 0f499e9a74b5..f8722e47bb0b 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4648,23 +4648,36 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >  #endif
> >  }
> >
> > +static inline bool entity_is_long_sleeper(struct sched_entity *se)
> > +{
> > +       struct cfs_rq *cfs_rq;
> > +       u64 sleep_time;
> > +
> > +       if (se->exec_start == 0)
> > +               return false;
> > +
> > +       cfs_rq = cfs_rq_of(se);
> > +
> > +       sleep_time = rq_clock_task(rq_of(cfs_rq));
> > +
> > +       /* Happen while migrating because of clock task divergence */
> > +       if (sleep_time <= se->exec_start)
> > +            return false;
> > +
> > +       sleep_time -= se->exec_start;
> > +       if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
> > +               return true;
> > +
> > +       return false;
> > +}
> > +
> >  static void
> > -place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> > +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> >  {
> >       u64 vruntime = cfs_rq->min_vruntime;
> > -     u64 sleep_time;
> > -
> > -     /*
> > -      * The 'current' period is already promised to the current tasks,
> > -      * however the extra weight of the new task will slow them down a
> > -      * little, place the new task so that it fits in the slot that
> > -      * stays open at the end.
> > -      */
> > -     if (initial && sched_feat(START_DEBIT))
> > -             vruntime += sched_vslice(cfs_rq, se);
> >
> >       /* sleeps up to a single latency don't count. */
> > -     if (!initial) {
> > +     if (flags & ENQUEUE_WAKEUP) {
> >               unsigned long thresh;
> >
> >               if (se_is_idle(se))
> > @@ -4680,20 +4693,43 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >                       thresh >>= 1;
> >
> >               vruntime -= thresh;
> > +     } else if sched_feat(START_DEBIT) {
>
> There's a parenthesis missing here.

+1

>
>
> > +             /*
> > +              * The 'current' period is already promised to the current tasks,
> > +              * however the extra weight of the new task will slow them down a
> > +              * little, place the new task so that it fits in the slot that
> > +              * stays open at the end.
> > +              */
> > +             vruntime += sched_vslice(cfs_rq, se);
> >       }
> >
> >       /*
> >        * Pull vruntime of the entity being placed to the base level of
> > -      * cfs_rq, to prevent boosting it if placed backwards.  If the entity
> > -      * slept for a long time, don't even try to compare its vruntime with
> > -      * the base as it may be too far off and the comparison may get
> > -      * inversed due to s64 overflow.
> > -      */
> > -     sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
> > -     if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
> > +      * cfs_rq, to prevent boosting it if placed backwards.
> > +      * However, min_vruntime can advance much faster than real time, with
> > +      * the exterme being when an entity with the minimal weight always runs
> > +      * on the cfs_rq. If the new entity slept for long, its vruntime
> > +      * difference from min_vruntime may overflow s64 and their comparison
> > +      * may get inversed, so ignore the entity's original vruntime in that
> > +      * case.
> > +      * The maximal vruntime speedup is given by the ratio of normal to
> > +      * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
> > +      * When placing a migrated waking entity, its exec_start has been set
> > +      * from a different rq. In order to take into account a possible
> > +      * divergence between new and prev rq's clocks task because of irq and
> > +      * stolen time, we take an additional margin.
> > +      * So, cutting off on the sleep time of
> > +      *     2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
> > +      * should be safe.
> > +
> > +      */
> > +     if (entity_is_long_sleeper(se))
> >               se->vruntime = vruntime;
> >       else
> >               se->vruntime = max_vruntime(se->vruntime, vruntime);
> > +
> > +     if (flags & ENQUEUE_MIGRATED)
> > +             se->exec_start = 0;
> >  }
> >
> >  static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
> > @@ -4769,7 +4805,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> >       account_entity_enqueue(cfs_rq, se);
> >
> >       if (flags & ENQUEUE_WAKEUP)
> > -             place_entity(cfs_rq, se, 0);
> > +             place_entity(cfs_rq, se, flags);
> >
> >       check_schedstat_required();
> >       update_stats_enqueue_fair(cfs_rq, se, flags);
> > @@ -7665,9 +7701,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> >       /* Tell new CPU we are migrated */
> >       se->avg.last_update_time = 0;
> >
> > -     /* We have migrated, no longer consider this task hot */
> > -     se->exec_start = 0;
> > -
> >       update_scan_period(p, new_cpu);
> >  }
> >
> > @@ -11993,7 +12026,7 @@ static void task_fork_fair(struct task_struct *p)
> >               update_curr(cfs_rq);
> >               se->vruntime = curr->vruntime;
> >       }
> > -     place_entity(cfs_rq, se, 1);
> > +     place_entity(cfs_rq, se, 0);
> >
> >       if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
> >               /*
> > @@ -12137,8 +12170,9 @@ static void detach_task_cfs_rq(struct task_struct *p)
> >               /*
> >                * Fix up our vruntime so that the current sleep doesn't
> >                * cause 'unlimited' sleep bonus.
> > +              * This is the same as placing a waking task.
> >                */
> > -             place_entity(cfs_rq, se, 0);
> > +             place_entity(cfs_rq, se, ENQUEUE_WAKEUP);
> >               se->vruntime -= cfs_rq->min_vruntime;
> >       }
> >
> > --
> > 2.34.1
> >
> >
> >>
> >>>
> >>>>
> > .
> >
  
Vincent Guittot March 15, 2023, 3:32 p.m. UTC | #45
On Wed, 15 Mar 2023 at 14:36, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>
> On 15/03/2023 11:21, Vincent Guittot wrote:
> > On Wed, 15 Mar 2023 at 11:15, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> >>
> >> On 15/03/2023 09:42, Vincent Guittot wrote:
> >>> On Wed, 15 Mar 2023 at 08:18, Vincent Guittot
> >>> <vincent.guittot@linaro.org> wrote:
> >>>>
> >>>> On Tue, 14 Mar 2023 at 18:16, Peter Zijlstra <peterz@infradead.org> wrote:
> >>>>>
> >>>>> On Tue, Mar 14, 2023 at 02:24:37PM +0100, Vincent Guittot wrote:
> >>>>>
>
> [...]
>
> >> Isn't there an issue with this approach on asymmetric CPU capacity systems?
> >>
> >> We do a sync_entity_load_avg() in select_task_rq_fair()
> >> (find_energy_efficient_cpu() for EAS and select_idle_sibling() for CAS)
> >> to sync cfs_rq and se.
> >
> > ah yes, I forgot this point.
> > That being said, is it a valid problem for an EAS-based system? I mean
> > we are trying to fix a vruntime comparison overflow that can happen
> > with a very long sleeping task (around 200 days) while only a very
> > low-weight entity is always running during those 200 days.
>
> True. Definitely very unlikely. But it's not only EAS; any asymmetric
> CPU capacity wakeup wouldn't have this check in this case.
>
> This dependency between sync_entity_load_avg() and the overflow
> detection would be very hard to spot, though (especially if
> sync_entity_load_avg() were later introduced in more common wakeup
> paths).

fair enough
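
One point worth spelling out for the migrate_task_rq_fair() hunk in the
patch below: for a long-sleeping waking task, it is enough to store only
the negative sleeper credit at migration time, because enqueue_entity()
will add the destination cfs_rq's min_vruntime back when the task is
enqueued. As a rough sketch of the arithmetic (following the comment in
that hunk):

	at migration:  se->vruntime  = -sched_sleeper_credit(se)
	at enqueue:    se->vruntime += cfs_rq->min_vruntime
	                             =  cfs_rq->min_vruntime - sched_sleeper_credit(se)

which matches the placement a long sleeper would get from place_entity()
on the destination cfs_rq.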
  

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a1b1f855b96..74c9918ffe76 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4648,11 +4648,45 @@  static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 #endif
 }
 
+static inline bool entity_is_long_sleep(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq;
+	u64 sleep_time;
+
+	if (se->exec_start == 0)
+		return false;
+
+	cfs_rq = cfs_rq_of(se);
+	sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
+	if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
+		return true;
+
+	return false;
+}
+
+static inline u64 sched_sleeper_credit(struct sched_entity *se)
+{
+	unsigned long thresh;
+
+	if (se_is_idle(se))
+		thresh = sysctl_sched_min_granularity;
+	else
+		thresh = sysctl_sched_latency;
+
+	/*
+	 * Halve their sleep time's effect, to allow
+	 * for a gentler effect of sleepers:
+	 */
+	if (sched_feat(GENTLE_FAIR_SLEEPERS))
+		thresh >>= 1;
+
+	return thresh;
+}
+
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
 	u64 vruntime = cfs_rq->min_vruntime;
-	u64 sleep_time;
 
 	/*
 	 * The 'current' period is already promised to the current tasks,
@@ -4664,23 +4698,8 @@  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 		vruntime += sched_vslice(cfs_rq, se);
 
 	/* sleeps up to a single latency don't count. */
-	if (!initial) {
-		unsigned long thresh;
-
-		if (se_is_idle(se))
-			thresh = sysctl_sched_min_granularity;
-		else
-			thresh = sysctl_sched_latency;
-
-		/*
-		 * Halve their sleep time's effect, to allow
-		 * for a gentler effect of sleepers:
-		 */
-		if (sched_feat(GENTLE_FAIR_SLEEPERS))
-			thresh >>= 1;
-
-		vruntime -= thresh;
-	}
+	if (!initial)
+		vruntime -= sched_sleeper_credit(se);
 
 	/*
 	 * Pull vruntime of the entity being placed to the base level of
@@ -4689,8 +4708,7 @@  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	 * the base as it may be too far off and the comparison may get
 	 * inversed due to s64 overflow.
 	 */
-	sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
-	if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
+	if (entity_is_long_sleep(se))
 		se->vruntime = vruntime;
 	else
 		se->vruntime = max_vruntime(se->vruntime, vruntime);
@@ -7635,7 +7653,23 @@  static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 	if (READ_ONCE(p->__state) == TASK_WAKING) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-		se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
+		/*
+		 * We determine whether a task has slept for long by checking
+		 * se->exec_start, and if so, we sanitize its vruntime in
+		 * place_entity(). However, after a migration this detection
+		 * method fails because se->exec_start is reset.
+		 *
+		 * To fix this case, we add the same check here. For a task
+		 * which has slept for a long time, its vruntime should be reset
+		 * to cfs_rq->min_vruntime with a sleep credit. Because the
+		 * waking task's vruntime will be added to cfs_rq->min_vruntime
+		 * at enqueue time, we only need to reset the se->vruntime of
+		 * the waking task to a negative credit here.
+		 */
+		if (entity_is_long_sleep(se))
+			se->vruntime = -sched_sleeper_credit(se);
+		else
+			se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
 	}
 
 	if (!task_on_rq_migrating(p)) {