Message ID | 20230117193159.22816-1-jonas.oberhauser@huaweicloud.com |
---|---|
State | New |
Headers |
Return-Path: <linux-kernel-owner@vger.kernel.org>
From: Jonas Oberhauser <jonas.oberhauser@huaweicloud.com>
To: paulmck@kernel.org
Cc: stern@rowland.harvard.edu, parri.andrea@gmail.com, will@kernel.org, peterz@infradead.org, boqun.feng@gmail.com, npiggin@gmail.com, dhowells@redhat.com, j.alglave@ucl.ac.uk, luc.maranget@inria.fr, akiyks@gmail.com, dlustig@nvidia.com, joel@joelfernandes.org, urezki@gmail.com, quic_neeraju@quicinc.com, frederic@kernel.org, linux-kernel@vger.kernel.org, Jonas Oberhauser <jonas.oberhauser@huaweicloud.com>
Subject: [PATCH] tools/memory-model: Make ppo a subrelation of po
Date: Tue, 17 Jan 2023 20:31:59 +0100
Message-Id: <20230117193159.22816-1-jonas.oberhauser@huaweicloud.com> |
Series |
tools/memory-model: Make ppo a subrelation of po
Commit Message
Jonas Oberhauser
Jan. 17, 2023, 7:31 p.m. UTC
As stated in the documentation and implied by its name, the ppo
(preserved program order) relation is intended to link po-earlier
to po-later instructions under certain conditions. However, a
corner case currently allows instructions to be linked by ppo that
are not executed by the same thread, i.e., instructions are being
linked that have no po relation.
This happens due to the mb/strong-fence relations, which (as one
case) provide order when locks are passed between threads followed
by an smp_mb__after_unlock_lock() fence. This is illustrated in
the following litmus test (as can be seen when using herd7 with
`doshow ppo`):
P0(int *x, int *y)
{
	spin_lock(x);
	spin_unlock(x);
}

P1(int *x, int *y)
{
	spin_lock(x);
	smp_mb__after_unlock_lock();
	*y = 1;
}
The ppo relation will link P0's spin_lock(x) and P1's *y=1,
because P0 passes a lock to P1 which then uses this fence.
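For reference, a complete litmus file along the lines of the snippet above might look as follows; the test name, the spinlock_t parameter type, the closing spin_unlock() in P1, and the exists clause are illustrative additions rather than part of the patch:

	C after-unlock-lock-ppo

	(* Illustrative sketch: P0 releases the lock that P1 then takes,
	   so the smp_mb__after_unlock_lock() in P1 orders events across
	   the two threads. *)

	{}

	P0(spinlock_t *x, int *y)
	{
		spin_lock(x);
		spin_unlock(x);
	}

	P1(spinlock_t *x, int *y)
	{
		spin_lock(x);
		smp_mb__after_unlock_lock();
		*y = 1;
		spin_unlock(x);
	}

	exists (y=1)

Feeding this to herd7 with the linux-kernel model and asking it to display ppo (for example with something like `herd7 -conf linux-kernel.cfg -doshow ppo -gv after-unlock-lock-ppo.litmus`; the exact flags depend on the local setup) should draw the cross-thread ppo edge from P0's spin_lock(x) to P1's store to y.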
The patch makes ppo a subrelation of po by eliminating this
possibility from mb and strong-fence, and instead introduces the
notion of strong ordering operations, which are allowed to link
events of distinct threads.
Signed-off-by: Jonas Oberhauser <jonas.oberhauser@huaweicloud.com>
---
.../Documentation/explanation.txt | 101 +++++++++++-------
tools/memory-model/linux-kernel.cat | 20 ++--
2 files changed, 76 insertions(+), 45 deletions(-)
Comments
On Tue, Jan 17, 2023 at 08:31:59PM +0100, Jonas Oberhauser wrote: > As stated in the documentation and implied by its name, the ppo > (preserved program order) relation is intended to link po-earlier > to po-later instructions under certain conditions. However, a > corner case currently allows instructions to be linked by ppo that > are not executed by the same thread, i.e., instructions are being > linked that have no po relation. > > This happens due to the mb/strong-fence relations, which (as one > case) provide order when locks are passed between threads followed > by an smp_mb__after_unlock_lock() fence. This is illustrated in > the following litmus test (as can be seen when using herd7 with > `doshow ppo`): > > P0(int *x, int *y) > { > spin_lock(x); > spin_unlock(x); > } > > P1(int *x, int *y) > { > spin_lock(x); > smp_mb__after_unlock_lock(); > *y = 1; > } > > The ppo relation will link P0's spin_lock(x) and P1's *y=1, > because P0 passes a lock to P1 which then uses this fence. > > The patch makes ppo a subrelation of po by eliminating this > possibility from mb and strong-fence, and instead introduces the > notion of strong ordering operations, which are allowed to link > events of distinct threads. > > Signed-off-by: Jonas Oberhauser <jonas.oberhauser@huaweicloud.com> > --- > .../Documentation/explanation.txt | 101 +++++++++++------- > tools/memory-model/linux-kernel.cat | 20 ++-- I've reviewed the cat change part, and it looks good to me. Now going to catch up with the interesting discussion on rcu-order... One more thing, do we want something in the cat file to describe our "internal axioms" as follow? (* Model internal invariants *) (* ppo is the subset of po *) flag ~empty (ppo \ po) as INTERNAL-ERROR-PPO Maybe it's helpful for people working on other tools to understand LKMM cat file? Anyway, with or without it: Acked-by: Boqun Feng <boqun.feng@gmail.com> Regards, Boqun > 2 files changed, 76 insertions(+), 45 deletions(-) > > diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt > index e901b47236c3..4f5f6ac098de 100644 > --- a/tools/memory-model/Documentation/explanation.txt > +++ b/tools/memory-model/Documentation/explanation.txt > @@ -865,14 +865,44 @@ propagates stores. When a fence instruction is executed on CPU C: > propagate to all other CPUs before any instructions po-after > the strong fence are executed on C. > > -The propagation ordering enforced by release fences and strong fences > -affects stores from other CPUs that propagate to CPU C before the > -fence is executed, as well as stores that are executed on C before the > -fence. We describe this property by saying that release fences and > -strong fences are A-cumulative. By contrast, smp_wmb() fences are not > -A-cumulative; they only affect the propagation of stores that are > -executed on C before the fence (i.e., those which precede the fence in > -program order). > + Whenever any CPU C' executes an unlock operation U such that > + CPU C executes a lock operation L followed by a po-later > + smp_mb__after_unlock_lock() fence, and L is either a later lock > + operation on the lock released by U or is po-after U, then any > + store that propagates to C' before U must propagate to all other > + CPUs before any instructions po-after the fence are executed on C. 
> + > +While smp_wmb() and release fences only force certain earlier stores > +to propagate to another CPU C' before certain later stores propagate > +to the same CPU C', strong fences and smp_mb__after_unlock_lock() > +force those stores to propagate to all other CPUs before any later > +instruction is executed. We collectively refer to the latter > +operations as strong ordering operations, as they provide much > +stronger ordering in two ways: > + > + Firstly, strong ordering operations also create order between > + earlier stores and later reads. > + > + Secondly, strong ordering operations create a form of global > + ordering: When an earlier store W propagates to CPU C and is > + ordered by a strong ordering operation with a store W' of C, > + and another CPU C' observes W' and in response issues yet > + another store W'', then W'' also can not propagate to any CPU > + before W. By contrast, a release fence or smp_wmb() would only > + order W and W', but not force any ordering between W and W''. > + To summarize, the ordering forced by strong ordering operations > + extends to later stores of all CPUs, while other fences only > + force ordering with relation to stores on the CPU that executes > + the fence. > + > +The propagation ordering enforced by release fences and strong ordering > +operations affects stores from other CPUs that propagate to CPU C before > +the fence is executed, as well as stores that are executed on C before > +the fence. We describe this property by saying that release fences and > +strong ordering operations are A-cumulative. By contrast, smp_wmb() > +fences are not A-cumulative; they only affect the propagation of stores > +that are executed on C before the fence (i.e., those which precede the > +fence in program order). > > rcu_read_lock(), rcu_read_unlock(), and synchronize_rcu() fences have > other properties which we discuss later. > @@ -1425,36 +1455,36 @@ requirement is the content of the LKMM's "happens-before" axiom. > > The LKMM defines yet another relation connected to times of > instruction execution, but it is not included in hb. It relies on the > -particular properties of strong fences, which we cover in the next > -section. > +particular properties of strong ordering operations, which we cover in the > +next section. > > > THE PROPAGATES-BEFORE RELATION: pb > ---------------------------------- > > The propagates-before (pb) relation capitalizes on the special > -features of strong fences. It links two events E and F whenever some > -store is coherence-later than E and propagates to every CPU and to RAM > -before F executes. The formal definition requires that E be linked to > -F via a coe or fre link, an arbitrary number of cumul-fences, an > -optional rfe link, a strong fence, and an arbitrary number of hb > -links. Let's see how this definition works out. > +features of strong ordering operations. It links two events E and F > +whenever some store is coherence-later than E and propagates to every > +CPU and to RAM before F executes. The formal definition requires that > +E be linked to F via a coe or fre link, an arbitrary number of > +cumul-fences, an optional rfe link, a strong ordering operation, and an > +arbitrary number of hb links. Let's see how this definition works out. > > Consider first the case where E is a store (implying that the sequence > of links begins with coe). Then there are events W, X, Y, and Z such > that: > > - E ->coe W ->cumul-fence* X ->rfe? Y ->strong-fence Z ->hb* F, > + E ->coe W ->cumul-fence* X ->rfe? 
Y ->strong-order Z ->hb* F, > > where the * suffix indicates an arbitrary number of links of the > specified type, and the ? suffix indicates the link is optional (Y may > be equal to X). Because of the cumul-fence links, we know that W will > propagate to Y's CPU before X does, hence before Y executes and hence > -before the strong fence executes. Because this fence is strong, we > -know that W will propagate to every CPU and to RAM before Z executes. > -And because of the hb links, we know that Z will execute before F. > -Thus W, which comes later than E in the coherence order, will > -propagate to every CPU and to RAM before F executes. > +before the strong ordering operation executes. Because of the strong > +ordering, we know that W will propagate to every CPU and to RAM before > +Z executes. And because of the hb links, we know that Z will execute > +before F. Thus W, which comes later than E in the coherence order, > +will propagate to every CPU and to RAM before F executes. > > The case where E is a load is exactly the same, except that the first > link in the sequence is fre instead of coe. > @@ -1637,8 +1667,8 @@ does not imply X ->rcu-order V, because the sequence contains only > one rcu-gp link but two rcu-rscsi links. > > The rcu-order relation is important because the Grace Period Guarantee > -means that rcu-order links act kind of like strong fences. In > -particular, E ->rcu-order F implies not only that E begins before F > +means that rcu-order links act kind of like a strong ordering operation. > +In particular, E ->rcu-order F implies not only that E begins before F > ends, but also that any write po-before E will propagate to every CPU > before any instruction po-after F can execute. (However, it does not > imply that E must execute before F; in fact, each synchronize_rcu() > @@ -1675,24 +1705,23 @@ The rcu-fence relation is a simple extension of rcu-order. While > rcu-order only links certain fence events (calls to synchronize_rcu(), > rcu_read_lock(), or rcu_read_unlock()), rcu-fence links any events > that are separated by an rcu-order link. This is analogous to the way > -the strong-fence relation links events that are separated by an > +the strong-order relation links events that are separated by an > smp_mb() fence event (as mentioned above, rcu-order links act kind of > -like strong fences). Written symbolically, X ->rcu-fence Y means > -there are fence events E and F such that: > +like strong ordering operations). Written symbolically, X ->rcu-fence Y > +means there are fence events E and F such that: > > X ->po E ->rcu-order F ->po Y. > > From the discussion above, we see this implies not only that X > executes before Y, but also (if X is a store) that X propagates to > -every CPU before Y executes. Thus rcu-fence is sort of a > -"super-strong" fence: Unlike the original strong fences (smp_mb() and > +every CPU before Y executes. Thus unlike strong fences (smp_mb() and > synchronize_rcu()), rcu-fence is able to link events on different > CPUs. (Perhaps this fact should lead us to say that rcu-fence isn't > really a fence at all!) > > Finally, the LKMM defines the RCU-before (rb) relation in terms of > rcu-fence. This is done in essentially the same way as the pb > -relation was defined in terms of strong-fence. We will omit the > +relation was defined in terms of strong-order. We will omit the > details; the end result is that E ->rb F implies E must execute > before F, just as E ->pb F does (and for much the same reasons). 
> > @@ -2134,7 +2163,7 @@ intermediate event Z such that: > > and either: > > - Z is connected to Y by a strong-fence link followed by a > + Z is connected to Y by a strong-order link followed by a > possibly empty sequence of xb links, > > or: > @@ -2153,10 +2182,10 @@ The motivations behind this definition are straightforward: > from W, which certainly means that W must have propagated to > R's CPU before R executed. > > - strong-fence memory barriers force stores that are po-before > - the barrier, or that propagate to the barrier's CPU before the > - barrier executes, to propagate to all CPUs before any events > - po-after the barrier can execute. > + Strong ordering operations force stores that are po-before > + the operation or that propagate to the CPU that begins the > + operation before the operation executes, to propagate to all > + CPUs before any events po-after the operation execute. > > To see how this works out in practice, consider our old friend, the MP > pattern (with fences and statement labels, but without the conditional > @@ -2485,7 +2514,7 @@ sequence or if W can be linked to R by a > sequence. For the cases involving a vis link, the LKMM also accepts > sequences in which W is linked to W' or R by a > > - strong-fence ; xb* ; {w and/or r}-pre-bounded > + strong-order ; xb* ; {w and/or r}-pre-bounded > > sequence with no post-bounding, and in every case the LKMM also allows > the link simply to be a fence with no bounding at all. If no sequence > diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat > index 07f884f9b2bf..1d91edbc10fe 100644 > --- a/tools/memory-model/linux-kernel.cat > +++ b/tools/memory-model/linux-kernel.cat > @@ -36,9 +36,7 @@ let wmb = [W] ; fencerel(Wmb) ; [W] > let mb = ([M] ; fencerel(Mb) ; [M]) | > ([M] ; fencerel(Before-atomic) ; [RMW] ; po? ; [M]) | > ([M] ; po? ; [RMW] ; fencerel(After-atomic) ; [M]) | > - ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | > - ([M] ; po ; [UL] ; (co | po) ; [LKW] ; > - fencerel(After-unlock-lock) ; [M]) > + ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) > let gp = po ; [Sync-rcu | Sync-srcu] ; po? > let strong-fence = mb | gp > > @@ -91,8 +89,12 @@ acyclic hb as happens-before > (* Write and fence propagation ordering *) > (****************************************) > > -(* Propagation: Each non-rf link needs a strong fence. *) > -let pb = prop ; strong-fence ; hb* ; [Marked] > +(* Strong ordering operations *) > +let strong-order = strong-fence | ([M] ; po-unlock-lock-po ; > + [After-unlock-lock] ; po ; [M]) > + > +(* Propagation: Each non-rf link needs a strong ordering operation. *) > +let pb = prop ; strong-order ; hb* ; [Marked] > acyclic pb as propagation > > (*******) > @@ -141,7 +143,7 @@ let rec rcu-order = rcu-gp | srcu-gp | > (rcu-order ; rcu-link ; rcu-order) > let rcu-fence = po ; rcu-order ; po? > let fence = fence | rcu-fence > -let strong-fence = strong-fence | rcu-fence > +let strong-order = strong-order | rcu-fence > > (* rb orders instructions just as pb does *) > let rb = prop ; rcu-fence ; hb* ; pb* ; [Marked] > @@ -169,7 +171,7 @@ flag ~empty mixed-accesses as mixed-accesses > (* Executes-before and visibility *) > let xbstar = (hb | pb | rb)* > let vis = cumul-fence* ; rfe? ; [Marked] ; > - ((strong-fence ; [Marked] ; xbstar) | (xbstar & int)) > + ((strong-order ; [Marked] ; xbstar) | (xbstar & int)) > > (* Boundaries for lifetimes of plain accesses *) > let w-pre-bounded = [Marked] ; (addr | fence)? 
> @@ -180,9 +182,9 @@ let r-post-bounded = (nonrw-fence | ([~Noreturn] ; fencerel(Rmb) ; [R4rmb]))? ; > [Marked] > > (* Visibility and executes-before for plain accesses *) > -let ww-vis = fence | (strong-fence ; xbstar ; w-pre-bounded) | > +let ww-vis = fence | (strong-order ; xbstar ; w-pre-bounded) | > (w-post-bounded ; vis ; w-pre-bounded) > -let wr-vis = fence | (strong-fence ; xbstar ; r-pre-bounded) | > +let wr-vis = fence | (strong-order ; xbstar ; r-pre-bounded) | > (w-post-bounded ; vis ; r-pre-bounded) > let rw-xbstar = fence | (r-post-bounded ; xbstar ; w-pre-bounded) > > -- > 2.17.1 >
On 1/18/2023 12:45 AM, Boqun Feng wrote: > On Tue, Jan 17, 2023 at 08:31:59PM +0100, Jonas Oberhauser wrote: >> --- >> .../Documentation/explanation.txt | 101 +++++++++++------- >> tools/memory-model/linux-kernel.cat | 20 ++-- > I've reviewed the cat change part, and it looks good to me. It's not the cat part that worries me! I'm worried that I made the explanation inconsistent somewhere, or that it's not very understandable. If you have time, please take a look at that as well. > One more thing, do we want something in the cat file to describe our > "internal axioms" as follow? > > (* Model internal invariants *) > (* ppo is the subset of po *) > flag ~empty (ppo \ po) as INTERNAL-ERROR-PPO > > Maybe it's helpful for people working on other tools to understand LKMM > cat file? I've thought that there should be a way to continuously test/prove properties of LKMM so that future changes to LKMM can't silently invalidate these properites (like what happened when this fence was added to LKMM). I'm not sure yet what's the best way to do this. I suppose a simple first step could be to have an additional cat file lkmm-properties.cat that flags all violations of the properties in the litmus tests, but I'm not sure how much that really helps: I think many violations are not present in any of the existing litmus tests. And some properties are hard to state as a cat flag. best wishes and thanks for reviewing, jonas
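To make that first step concrete, a minimal sketch of such a file could simply layer Boqun's flag on top of the main model; the file name, the include-based layering, and whether herd7's configuration would accept it in exactly this form are assumptions here, not something settled in the thread:

	(* lkmm-properties.cat -- hypothetical property checks for the LKMM *)
	include "linux-kernel.cat"

	(* Model internal invariants *)
	(* ppo is a subset of po *)
	flag ~empty (ppo \ po) as INTERNAL-ERROR-PPO

Appending the same comment lines and flag directly to the end of linux-kernel.cat, as Boqun suggested, would achieve the same check without a separate file.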
On Tue, Jan 17, 2023 at 08:31:59PM +0100, Jonas Oberhauser wrote: > As stated in the documentation and implied by its name, the ppo > (preserved program order) relation is intended to link po-earlier > to po-later instructions under certain conditions. However, a > corner case currently allows instructions to be linked by ppo that > are not executed by the same thread, i.e., instructions are being > linked that have no po relation. > > This happens due to the mb/strong-fence relations, which (as one > case) provide order when locks are passed between threads followed > by an smp_mb__after_unlock_lock() fence. This is illustrated in > the following litmus test (as can be seen when using herd7 with > `doshow ppo`): > > P0(int *x, int *y) > { > spin_lock(x); > spin_unlock(x); > } > > P1(int *x, int *y) > { > spin_lock(x); > smp_mb__after_unlock_lock(); > *y = 1; > } > > The ppo relation will link P0's spin_lock(x) and P1's *y=1, > because P0 passes a lock to P1 which then uses this fence. > > The patch makes ppo a subrelation of po by eliminating this > possibility from mb and strong-fence, and instead introduces the > notion of strong ordering operations, which are allowed to link > events of distinct threads. > > Signed-off-by: Jonas Oberhauser <jonas.oberhauser@huaweicloud.com> > --- I'm just going to comment on the changes to the cat file. I'll review the documentation changes later. > diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat > index 07f884f9b2bf..1d91edbc10fe 100644 > --- a/tools/memory-model/linux-kernel.cat > +++ b/tools/memory-model/linux-kernel.cat > @@ -36,9 +36,7 @@ let wmb = [W] ; fencerel(Wmb) ; [W] > let mb = ([M] ; fencerel(Mb) ; [M]) | > ([M] ; fencerel(Before-atomic) ; [RMW] ; po? ; [M]) | > ([M] ; po? ; [RMW] ; fencerel(After-atomic) ; [M]) | > - ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | > - ([M] ; po ; [UL] ; (co | po) ; [LKW] ; > - fencerel(After-unlock-lock) ; [M]) > + ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) Shouldn't the po case of (co | po) remain intact here? > let gp = po ; [Sync-rcu | Sync-srcu] ; po? > let strong-fence = mb | gp > > @@ -91,8 +89,12 @@ acyclic hb as happens-before > (* Write and fence propagation ordering *) > (****************************************) > > -(* Propagation: Each non-rf link needs a strong fence. *) > -let pb = prop ; strong-fence ; hb* ; [Marked] > +(* Strong ordering operations *) > +let strong-order = strong-fence | ([M] ; po-unlock-lock-po ; > + [After-unlock-lock] ; po ; [M]) This is not the same as the definition removed above. In particular, po-unlock-lock-po only allows one step along the locking chain -- it has rf where the definition above has co. > + > +(* Propagation: Each non-rf link needs a strong ordering operation. *) > +let pb = prop ; strong-order ; hb* ; [Marked] > acyclic pb as propagation > > (*******) > @@ -141,7 +143,7 @@ let rec rcu-order = rcu-gp | srcu-gp | > (rcu-order ; rcu-link ; rcu-order) > let rcu-fence = po ; rcu-order ; po? > let fence = fence | rcu-fence It would be nice here to have a separate term for a potentially cross-CPU fence. In fact, why don't we make a concerted effort to straighten out the terminology more fully? Now seems like a good time to do it. To begin with, let's be more careful about the difference between an order-inducing object (an event or pair of events) and the relation of being ordered by such an object. 
For instance, given: A: WRITE_ONCE(x, 1); B: smp_mb(); C: r1 = READ_ONCE(y); then B is an order-inducing object (a memory barrier), and (A,C) is a pair of events ordered by that object. In general, an order is related to an order-inducing object by: order = po ; [order-inducing object] ; po with suitable modifications for things like smp_store_release where one of the events being ordered _is_ the order-inducing event. So for example, we could consistently refer to all order-inducing events as either barriers or fences, and all order-reflecting relations as orders. This would require widespread changes to the .cat file, but I think it would be worthwhile. (Treating "barrier" and "fence" as synonyms seems to be too deeply entrenched to try and fight against.) Once that is straightened out, we can distinguish between fences or orders that are weak vs. strong. And then we can divide up strong fences/orders into single-CPU vs. cross-CPU, if we want to. How does that sound? Alan > -let strong-fence = strong-fence | rcu-fence > +let strong-order = strong-order | rcu-fence > > (* rb orders instructions just as pb does *) > let rb = prop ; rcu-fence ; hb* ; pb* ; [Marked] > @@ -169,7 +171,7 @@ flag ~empty mixed-accesses as mixed-accesses > (* Executes-before and visibility *) > let xbstar = (hb | pb | rb)* > let vis = cumul-fence* ; rfe? ; [Marked] ; > - ((strong-fence ; [Marked] ; xbstar) | (xbstar & int)) > + ((strong-order ; [Marked] ; xbstar) | (xbstar & int)) > > (* Boundaries for lifetimes of plain accesses *) > let w-pre-bounded = [Marked] ; (addr | fence)? > @@ -180,9 +182,9 @@ let r-post-bounded = (nonrw-fence | ([~Noreturn] ; fencerel(Rmb) ; [R4rmb]))? ; > [Marked] > > (* Visibility and executes-before for plain accesses *) > -let ww-vis = fence | (strong-fence ; xbstar ; w-pre-bounded) | > +let ww-vis = fence | (strong-order ; xbstar ; w-pre-bounded) | > (w-post-bounded ; vis ; w-pre-bounded) > -let wr-vis = fence | (strong-fence ; xbstar ; r-pre-bounded) | > +let wr-vis = fence | (strong-order ; xbstar ; r-pre-bounded) | > (w-post-bounded ; vis ; r-pre-bounded) > let rw-xbstar = fence | (r-post-bounded ; xbstar ; w-pre-bounded) > > -- > 2.17.1 >
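Expressed in cat syntax, the convention Alan sketches would roughly look like the following (purely illustrative names, not proposed code; note that fencerel(B) in herd's standard library already expands to essentially po ; [B] ; po):

	(* hypothetical naming sketch only *)
	(* the event set Mb is the order-inducing object (the smp_mb() fence event), *)
	(* and mb-order is the relation of being ordered by such a fence *)
	let mb-order = po ; [Mb] ; po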
> It would be nice here to have a separate term for a potentially > cross-CPU fence. > > In fact, why don't we make a concerted effort to straighten out the > terminology more fully? Now seems like a good time to do it. > > To begin with, let's be more careful about the difference between an > order-inducing object (an event or pair of events) and the relation of > being ordered by such an object. For instance, given: > > A: WRITE_ONCE(x, 1); > B: smp_mb(); > C: r1 = READ_ONCE(y); > > then B is an order-inducing object (a memory barrier), and (A,C) is a > pair of events ordered by that object. In general, an order is related > to an order-inducing object by: > > order = po ; [order-inducing object] ; po > > with suitable modifications for things like smp_store_release where > one of the events being ordered _is_ the order-inducing event. > > So for example, we could consistently refer to all order-inducing events > as either barriers or fences, and all order-reflecting relations as > orders. This would require widespread changes to the .cat file, but I > think it would be worthwhile. > > (Treating "barrier" and "fence" as synonyms seems to be too deeply > entrenched to try and fight against.) > > Once that is straightened out, we can distinguish between fences or > orders that are weak vs. strong. And then we can divide up strong > fences/orders into single-CPU vs. cross-CPU, if we want to. > > How does that sound? Sounds like a lot of work, renaming and review, for no clear win to me. :-) But hey, if other are into it... Andrea
On Tue, Jan 17, 2023 at 08:31:59PM +0100, Jonas Oberhauser wrote: > As stated in the documentation and implied by its name, the ppo > (preserved program order) relation is intended to link po-earlier > to po-later instructions under certain conditions. However, a > corner case currently allows instructions to be linked by ppo that > are not executed by the same thread, i.e., instructions are being > linked that have no po relation. > > This happens due to the mb/strong-fence relations, which (as one > case) provide order when locks are passed between threads followed > by an smp_mb__after_unlock_lock() fence. This is illustrated in > the following litmus test (as can be seen when using herd7 with > `doshow ppo`): > > P0(int *x, int *y) spinlock_t > { > spin_lock(x); > spin_unlock(x); > } > > P1(int *x, int *y) > { > spin_lock(x); > smp_mb__after_unlock_lock(); > *y = 1; > } > > The ppo relation will link P0's spin_lock(x) and P1's *y=1, > because P0 passes a lock to P1 which then uses this fence. > > The patch makes ppo a subrelation of po by eliminating this > possibility from mb and strong-fence, and instead introduces the > notion of strong ordering operations, which are allowed to link > events of distinct threads. > > Signed-off-by: Jonas Oberhauser <jonas.oberhauser@huaweicloud.com> > --- > + Whenever any CPU C' executes an unlock operation U such that > + CPU C executes a lock operation L followed by a po-later > + smp_mb__after_unlock_lock() fence, and L is either a later lock > + operation on the lock released by U or is po-after U, then any > + store that propagates to C' before U must propagate to all other > + CPUs before any instructions po-after the fence are executed on C. The barrier is never mentioned in this document. This is a relatively oddball/rare barrier. Also, IMO, this description doesn't add much to the notions of execution and propagation being introduced. I'd rather move it (or parts of it) to ODDS AND ENDS where smp_mb__after_spinlock() and other smp_mb__*() are currently briefly described. > +While smp_wmb() and release fences only force certain earlier stores > +to propagate to another CPU C' before certain later stores propagate > +to the same CPU C', If "earlier" means po-earlier, this statement is wrong, cf. the comment about A-cumulativity. IAC, it should be clarified. > strong fences and smp_mb__after_unlock_lock() > +force those stores to propagate to all other CPUs before any later > +instruction is executed. We collectively refer to the latter > +operations as strong ordering operations, as they provide much > +stronger ordering in two ways: > + > + Firstly, strong ordering operations also create order between > + earlier stores and later reads. Switching back to "execution order" I guess; need clarification. > + > + Secondly, strong ordering operations create a form of global > + ordering: When an earlier store W propagates to CPU C and is > + ordered by a strong ordering operation with a store W' of C, > + and another CPU C' observes W' and in response issues yet > + another store W'', then W'' also can not propagate to any CPU > + before W. By contrast, a release fence or smp_wmb() would only > + order W and W', but not force any ordering between W and W''. > + To summarize, the ordering forced by strong ordering operations > + extends to later stores of all CPUs, while other fences only > + force ordering with relation to stores on the CPU that executes > + the fence. 
> + > +The propagation ordering enforced by release fences and strong ordering > +operations affects stores from other CPUs that propagate to CPU C before > +the fence is executed, as well as stores that are executed on C before > +the fence. We describe this property by saying that release fences and > +strong ordering operations are A-cumulative. By contrast, smp_wmb() > +fences are not A-cumulative; they only affect the propagation of stores > +that are executed on C before the fence (i.e., those which precede the > +fence in program order). [lots of renaming unless I missed something] > diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat > index 07f884f9b2bf..1d91edbc10fe 100644 > --- a/tools/memory-model/linux-kernel.cat > +++ b/tools/memory-model/linux-kernel.cat > @@ -36,9 +36,7 @@ let wmb = [W] ; fencerel(Wmb) ; [W] > let mb = ([M] ; fencerel(Mb) ; [M]) | > ([M] ; fencerel(Before-atomic) ; [RMW] ; po? ; [M]) | > ([M] ; po? ; [RMW] ; fencerel(After-atomic) ; [M]) | > - ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | > - ([M] ; po ; [UL] ; (co | po) ; [LKW] ; > - fencerel(After-unlock-lock) ; [M]) > + ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) > let gp = po ; [Sync-rcu | Sync-srcu] ; po? > let strong-fence = mb | gp > > @@ -91,8 +89,12 @@ acyclic hb as happens-before > (* Write and fence propagation ordering *) > (****************************************) > > -(* Propagation: Each non-rf link needs a strong fence. *) > -let pb = prop ; strong-fence ; hb* ; [Marked] > +(* Strong ordering operations *) > +let strong-order = strong-fence | ([M] ; po-unlock-lock-po ; > + [After-unlock-lock] ; po ; [M]) > + > +(* Propagation: Each non-rf link needs a strong ordering operation. *) > +let pb = prop ; strong-order ; hb* ; [Marked] > acyclic pb as propagation > > (*******) > @@ -141,7 +143,7 @@ let rec rcu-order = rcu-gp | srcu-gp | > (rcu-order ; rcu-link ; rcu-order) > let rcu-fence = po ; rcu-order ; po? > let fence = fence | rcu-fence > -let strong-fence = strong-fence | rcu-fence > +let strong-order = strong-order | rcu-fence > > (* rb orders instructions just as pb does *) > let rb = prop ; rcu-fence ; hb* ; pb* ; [Marked] > @@ -169,7 +171,7 @@ flag ~empty mixed-accesses as mixed-accesses > (* Executes-before and visibility *) > let xbstar = (hb | pb | rb)* > let vis = cumul-fence* ; rfe? ; [Marked] ; > - ((strong-fence ; [Marked] ; xbstar) | (xbstar & int)) > + ((strong-order ; [Marked] ; xbstar) | (xbstar & int)) > > (* Boundaries for lifetimes of plain accesses *) > let w-pre-bounded = [Marked] ; (addr | fence)? > @@ -180,9 +182,9 @@ let r-post-bounded = (nonrw-fence | ([~Noreturn] ; fencerel(Rmb) ; [R4rmb]))? 
; > [Marked] > > (* Visibility and executes-before for plain accesses *) > -let ww-vis = fence | (strong-fence ; xbstar ; w-pre-bounded) | > +let ww-vis = fence | (strong-order ; xbstar ; w-pre-bounded) | > (w-post-bounded ; vis ; w-pre-bounded) > -let wr-vis = fence | (strong-fence ; xbstar ; r-pre-bounded) | > +let wr-vis = fence | (strong-order ; xbstar ; r-pre-bounded) | > (w-post-bounded ; vis ; r-pre-bounded) > let rw-xbstar = fence | (r-post-bounded ; xbstar ; w-pre-bounded) If "making ppo a subrelation of po" is the goal, why not something like: diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat index d70315fddef6e..6e08e92323b5d 100644 --- a/tools/memory-model/linux-kernel.cat +++ b/tools/memory-model/linux-kernel.cat @@ -36,9 +36,7 @@ let wmb = [W] ; fencerel(Wmb) ; [W] let mb = ([M] ; fencerel(Mb) ; [M]) | ([M] ; fencerel(Before-atomic) ; [RMW] ; po? ; [M]) | ([M] ; po? ; [RMW] ; fencerel(After-atomic) ; [M]) | - ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | - ([M] ; po ; [UL] ; (co | po) ; [LKW] ; - fencerel(After-unlock-lock) ; [M]) + ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) let gp = po ; [Sync-rcu | Sync-srcu] ; po? let strong-fence = mb | gp @@ -90,6 +88,9 @@ acyclic hb as happens-before (* Write and fence propagation ordering *) (****************************************) +let strong-fence = strong-fence | ([M] ; po-unlock-lock-po ; + [After-unlock-lock] ; po ; [M]) + (* Propagation: Each non-rf link needs a strong fence. *) let pb = prop ; strong-fence ; hb* ; [Marked] acyclic pb as propagation No other changes to the CAT/TXT files (or, if you like, a short addition to ODDS AND ENDS). Sorry, this doesn't really seem to be worth the noise/diff... Andrea
On 1/18/2023 8:52 PM, Alan Stern wrote: > On Tue, Jan 17, 2023 at 08:31:59PM +0100, Jonas Oberhauser wrote: >> As stated in the documentation and implied by its name, the ppo >> (preserved program order) relation is intended to link po-earlier >> to po-later instructions under certain conditions. However, a >> corner case currently allows instructions to be linked by ppo that >> are not executed by the same thread, i.e., instructions are being >> linked that have no po relation. >> >> This happens due to the mb/strong-fence relations, which (as one >> case) provide order when locks are passed between threads followed >> by an smp_mb__after_unlock_lock() fence. This is illustrated in >> the following litmus test (as can be seen when using herd7 with >> `doshow ppo`): >> >> P0(int *x, int *y) >> { >> spin_lock(x); >> spin_unlock(x); >> } >> >> P1(int *x, int *y) >> { >> spin_lock(x); >> smp_mb__after_unlock_lock(); >> *y = 1; >> } >> >> The ppo relation will link P0's spin_lock(x) and P1's *y=1, >> because P0 passes a lock to P1 which then uses this fence. >> >> The patch makes ppo a subrelation of po by eliminating this >> possibility from mb and strong-fence, and instead introduces the >> notion of strong ordering operations, which are allowed to link >> events of distinct threads. >> >> Signed-off-by: Jonas Oberhauser <jonas.oberhauser@huaweicloud.com> >> --- > I'm just going to comment on the changes to the cat file. I'll review > the documentation changes later. > >> diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat >> index 07f884f9b2bf..1d91edbc10fe 100644 >> --- a/tools/memory-model/linux-kernel.cat >> +++ b/tools/memory-model/linux-kernel.cat >> @@ -36,9 +36,7 @@ let wmb = [W] ; fencerel(Wmb) ; [W] >> let mb = ([M] ; fencerel(Mb) ; [M]) | >> ([M] ; fencerel(Before-atomic) ; [RMW] ; po? ; [M]) | >> ([M] ; po? ; [RMW] ; fencerel(After-atomic) ; [M]) | >> - ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | >> - ([M] ; po ; [UL] ; (co | po) ; [LKW] ; >> - fencerel(After-unlock-lock) ; [M]) >> + ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) > Shouldn't the po case of (co | po) remain intact here? You can leave it here, but it is already covered by two other parts: the ordering given through ppo/hb is covered by the po-unlock-lock-po & int in ppo, and the ordering given through pb is covered by its inclusion in strong-order. Now whether it should be included in strong-order or not is a matter of grouping: is it better to leave all cases ordered by the [After-unlock-lock] fence together, or is it better to keep the <=po parts of the fences together and the external parts together? Right now I prefer to group things around the fences because that is more of an isolated idea, rather than around whether they order internally or externally. As a side bonus I like that po-unlock-lock-po ; [After-unlock-lock] "rhymes" nicely. If we split the strong-order definition and move the po part back into mb, that would disappear. >> let gp = po ; [Sync-rcu | Sync-srcu] ; po? >> let strong-fence = mb | gp >> >> @@ -91,8 +89,12 @@ acyclic hb as happens-before >> (* Write and fence propagation ordering *) >> (****************************************) >> >> -(* Propagation: Each non-rf link needs a strong fence. *) >> -let pb = prop ; strong-fence ; hb* ; [Marked] >> +(* Strong ordering operations *) >> +let strong-order = strong-fence | ([M] ; po-unlock-lock-po ; >> + [After-unlock-lock] ; po ; [M]) > This is not the same as the definition removed above. 
In particular, > po-unlock-lock-po only allows one step along the locking chain -- it has > rf where the definition above has co. Indeed it is not, but the subsequent lock-unlock sequences are in hb*. For this reason it can be simplified to just consider the directly following unlock(). As a consequence, LKW becomes de-emphasized. Note that if you remove the explicit references to LKW from the model, you can argue about programs that don't use trylock as though LKW didn't exist, which is what some people in academia do, e.g., https://people.mpi-sws.org/~viktor/papers/oopsla2019-lapor.pdf (I added Viktor in CC, in case he wants to add something). Since it doesn't cost anything I decided to include it like this. I don't feel extremely strong about either point, but I've written them according to my preference. > >> + >> +(* Propagation: Each non-rf link needs a strong ordering operation. *) >> +let pb = prop ; strong-order ; hb* ; [Marked] >> acyclic pb as propagation >> >> (*******) >> @@ -141,7 +143,7 @@ let rec rcu-order = rcu-gp | srcu-gp | >> (rcu-order ; rcu-link ; rcu-order) >> let rcu-fence = po ; rcu-order ; po? >> let fence = fence | rcu-fence > It would be nice here to have a separate term for a potentially > cross-CPU fence. > > In fact, why don't we make a concerted effort to straighten out the > terminology more fully? Now seems like a good time to do it. I agree; wrapping my head around this terminology-space is the whole reason why I started looking into the formalization of rcu, and I'm beginning to understand it a little bit. > To begin with, let's be more careful about the difference between an > order-inducing object (an event or pair of events) and the relation of > being ordered by such an object. For instance, given: > > A: WRITE_ONCE(x, 1); > B: smp_mb(); > C: r1 = READ_ONCE(y); > > then B is an order-inducing object (a memory barrier), and (A,C) is a > pair of events ordered by that object. In general, an order is related > to an order-inducing object by: > > order = po ; [order-inducing object] ; po Yes! That's what I was trying to say in the rcu-order/rcu-fence discussion. (I would call it an operation rather than an object, since it may be something involving steps of multiple threads, like rcu, but let's stick with object for now). However, while trying to rewrite the definition, I noticed that there *is* something around rb which requires playing with po?, just not in the current definition of rcu-fence or gp. This is currently covered in having po <= rcu-link. A good example would be ... ->prop X ->po;rcu-rcsc;rcu-link;rcu-gp Y ->po Z ->rcu-gp;rcu-link;rcu-rscs;po ... (I think this happens to be very similar if not the same as the code you sent earlier!). in my view this includes two ordering objects (the two RCU law instances rcu-rcsc;rcu-link;rcu-gp and rcu-gp;rcu-link;rcu-rscs), but there's only one po! It can't be made to quite belong to either ordering object, so you don't have order;order here. So I started wondering why the same isn't the case for pb, and whether perhaps it should be order = po ; [order-inducing object] ; po? This way you'd get order;order here quite neatly. I figured out that if we replace the first ordering object here with a pb related one like ... ->prop X ->po;[mb fence] Y ->po Z ->rcu-gp;rcu-link;rcu-rscs;po ... then we can just forget about the first ordering object and turn the whole thing into ... ->prop X ->po Z ->rcu-gp;rcu-link;rcu-rscs;po ... 
So in this case, the po can always be "stolen" by the second ordering object and we can completely forget about the first one, because the prop;po already has an ordering effect together with the second ordering object by itself. This is also the case if the second ordering object is a pb-related one. For this reason, the issue of po? never comes up when sticking to hb and pb, only when looking at rb. That at least explains why using po is ok for defining strong-order. I'm still not sure if just defining order = po ; [order-inducing object] ; po? would also be ok for defining strong-order. This would have the benefit that it would just work for for rb-related order-inducing objects as well. Do you remember why the current definition does not have a po? at the end? > > with suitable modifications for things like smp_store_release where > one of the events being ordered _is_ the order-inducing event. > > So for example, we could consistently refer to all order-inducing events > as either barriers or fences, and all order-reflecting relations as orders. This would require widespread changes to the .cat file, but I > think it would be worthwhile. I agree that having a uniform language for this is worthwhile. However you just dropped the cases of order-inducing objects that are not just a single event. I am completely fine calling the individual *events* barriers. (I don't like calling every barrier a fence though; Arm very explicitly doesn't do this and internally we don't either. However for LKMM I don't mind sticking to existing terminology here and calling them all fences; but to me a fence is something that orders certain things po-before the fence with certain things po-after). But I would like to have a name for order-inducing objects that link two events. I would call them an "ordering operation" where the first event is the event that "starts the operation", and the second event is the event that "completes that operation". Then we can say things like "when CPU C starts an ordering operation of type blah that is completed by CPU C', then any store that propagates to C before C starts the operation must propagate to any CPU C'' before any store of C' that is executed after C' completes the execution propagates to C'' ". This would be a weak ordering operation, and would probably imply that blah-order is a kind of happens-before order and would appear in cumul-fence. Or "when CPU C starts an ordering operation of type blubb that is completed by CPU C', then any store that propagates to C before C starts the operation must propagate to any other CPU before any instruction of C' po-after C' completes the execution is executed". This would be a strong ordering operation which would imply that prop ; blubb-order is a kind of happens-before order. > (Treating "barrier" and "fence" as synonyms seems to be too deeply > entrenched to try and fight against.) > > Once that is straightened out, we can distinguish between fences or > orders that are weak vs. strong. And then we can divide up strong > fences/orders into single-CPU vs. cross-CPU, if we want to. With the distinction above, barriers and fences are always single-CPU, while ordering operations can be either cross-CPU or single-CPU. I'm not sure if there's still a need to distinguish more carefully than that. Hope that makes sense, jonas
Thanks for reviewing the documentation. You made me realize that the patch is already doing two things -- trying to fix the incorrectness of the documentation where it claims that fences like strong-fence only relate po-earlier to po-later events, and trying to make ppo a subrelation of po. Perhaps it would be better to do this in two steps. First like you suggest only do the ppo fix, and then in a second step (after agreeing with Alan on terminology) fix the documentation in a unified way (instead of only for strong-fence like in this patch). Of course you're free to re-state your disagreement about such a change then :D Either way, the specific comments are helpful. On 1/18/2023 10:30 PM, Andrea Parri wrote: >> + Whenever any CPU C' executes an unlock operation U such that >> + CPU C executes a lock operation L followed by a po-later >> + smp_mb__after_unlock_lock() fence, and L is either a later lock >> + operation on the lock released by U or is po-after U, then any >> + store that propagates to C' before U must propagate to all other >> + CPUs before any instructions po-after the fence are executed on C. > The barrier is never mentioned in this document. This is a relatively > oddball/rare barrier. Also, IMO, this description doesn't add much to > the notions of execution and propagation being introduced. I'd rather > move it (or parts of it) to ODDS AND ENDS where smp_mb__after_spinlock() > and other smp_mb__*() are currently briefly described. I understand your concern. However, I think the extended strong-order relation needs to be mentioned for defining pb. Having a strong ordering operation at this point of the manual also helps introducing rcu-fence later which works similarly. I'm hoping if we can make a single renaming patch, we can essentially kill most of the explanation of how rcu-fence links events by different threads by just pointing to how strong-order is doing the same thing. >> +While smp_wmb() and release fences only force certain earlier stores >> +to propagate to another CPU C' before certain later stores propagate >> +to the same CPU C', > If "earlier" means po-earlier, this statement is wrong, cf. the comment > about A-cumulativity. IAC, it should be clarified. Indeed I don't mean po-earlier, and agree it should be clarified. But I'm not sure yet how to clarify "earlier" and "later" considering that the precise definition of earlier and later depends on the barrier. > > >> strong fences and smp_mb__after_unlock_lock() >> +force those stores to propagate to all other CPUs before any later >> +instruction is executed. We collectively refer to the latter >> +operations as strong ordering operations, as they provide much >> +stronger ordering in two ways: >> + >> + Firstly, strong ordering operations also create order between >> + earlier stores and later reads. > Switching back to "execution order" I guess; need clarification. > (Same as above) >> + >> + Secondly, strong ordering operations create a form of global >> + ordering: When an earlier store W propagates to CPU C and is >> + ordered by a strong ordering operation with a store W' of C, >> + and another CPU C' observes W' and in response issues yet >> + another store W'', then W'' also can not propagate to any CPU >> + before W. By contrast, a release fence or smp_wmb() would only >> + order W and W', but not force any ordering between W and W''. 
>> + To summarize, the ordering forced by strong ordering operations >> + extends to later stores of all CPUs, while other fences only >> + force ordering with relation to stores on the CPU that executes >> + the fence. >> + >> +The propagation ordering enforced by release fences and strong ordering >> +operations affects stores from other CPUs that propagate to CPU C before >> +the fence is executed, as well as stores that are executed on C before >> +the fence. We describe this property by saying that release fences and >> +strong ordering operations are A-cumulative. By contrast, smp_wmb() >> +fences are not A-cumulative; they only affect the propagation of stores >> +that are executed on C before the fence (i.e., those which precede the >> +fence in program order). > [lots of renaming unless I missed something] The second paragraph is just renaming, but the first part is new. Best wishes and let me know if you agree on rearranging the submission like that, jonas
On Wed, Jan 18, 2023 at 10:38:11PM +0100, Jonas Oberhauser wrote: > > > On 1/18/2023 8:52 PM, Alan Stern wrote: > > On Tue, Jan 17, 2023 at 08:31:59PM +0100, Jonas Oberhauser wrote: > > > - ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | > > > - ([M] ; po ; [UL] ; (co | po) ; [LKW] ; > > > - fencerel(After-unlock-lock) ; [M]) > > > + ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) > > Shouldn't the po case of (co | po) remain intact here? > > You can leave it here, but it is already covered by two other parts: the > ordering given through ppo/hb is covered by the po-unlock-lock-po & int in > ppo, and the ordering given through pb is covered by its inclusion in > strong-order. What about the ordering given through A-cumul(strong-fence)/cumul-fence/prop/hb? I suppose that might be superseded by pb as well, but it seems odd not to have it in hb. > Now whether it should be included in strong-order or not is a matter of > grouping: is it better to leave all cases ordered by the [After-unlock-lock] > fence together, or is it better to keep the <=po parts of the fences > together and the external parts together? > Right now I prefer to group things around the fences because that is more of > an isolated idea, rather than around whether they order internally or > externally. In general, the idea in the memory model is that hb ordering relies on the non-strong properties of fences, whereas pb relies on the properties of strong fences, and rb relies on the properties of the RCU fences. It's sort of a hierarchy. In principle these relations should all be merged together (like xb does), but it turns out to work okay in practice because pb will absorb an hb on its right, and rb will absorb a pb on its right. Besides, merging them all together would involve an uncomfortably large recursive definition. > > > @@ -91,8 +89,12 @@ acyclic hb as happens-before > > > (* Write and fence propagation ordering *) > > > (****************************************) > > > -(* Propagation: Each non-rf link needs a strong fence. *) > > > -let pb = prop ; strong-fence ; hb* ; [Marked] > > > +(* Strong ordering operations *) > > > +let strong-order = strong-fence | ([M] ; po-unlock-lock-po ; > > > + [After-unlock-lock] ; po ; [M]) > > This is not the same as the definition removed above. In particular, > > po-unlock-lock-po only allows one step along the locking chain -- it has > > rf where the definition above has co. > > Indeed it is not, but the subsequent lock-unlock sequences are in hb*. For > this reason it can be simplified to just consider the directly following > unlock(). I'm not sure that argument is right. The smp_mb__after_unlock_lock needs to go after the _last_ lock in the sequence, not after the first. So you don't have to worry about subsequent lock-unlock sequences; you have to worry about preceding lock-unlock sequences. > > In fact, why don't we make a concerted effort to straighten out the > > terminology more fully? Now seems like a good time to do it. > > I agree; wrapping my head around this terminology-space is the whole reason > why I started looking into the formalization of rcu, and I'm beginning to > understand it a little bit. > > > To begin with, let's be more careful about the difference between an > > order-inducing object (an event or pair of events) and the relation of > > being ordered by such an object. 
For instance, given: > > > > A: WRITE_ONCE(x, 1); > > B: smp_mb(); > > C: r1 = READ_ONCE(y); > > > > then B is an order-inducing object (a memory barrier), and (A,C) is a > > pair of events ordered by that object. In general, an order is related > > to an order-inducing object by: > > > > order = po ; [order-inducing object] ; po > > Yes! That's what I was trying to say in the rcu-order/rcu-fence discussion. > (I would call it an operation rather than an object, since it may be > something involving steps of multiple threads, like rcu, but let's stick > with object for now). Like I said above, the object could be a pair of events. But "operation" is probably a better term. > > with suitable modifications for things like smp_store_release where > > one of the events being ordered _is_ the order-inducing event. > > > > So for example, we could consistently refer to all order-inducing events > > as either barriers or fences, and all order-reflecting relations as orders. This would require widespread changes to the .cat file, but I > > think it would be worthwhile. > > I agree that having a uniform language for this is worthwhile. > However you just dropped the cases of order-inducing objects that are not > just a single event. Let's agree to talk about order-inducing operations instead, and take {order-inducing operation} to be the relation linking the start event of the operation to the end event. So for operations that are a single object, such as a memory barrier, it is the same as [order-inducing object]. Then we have order = po ; {order-inducing operation} ; po > I am completely fine calling the individual *events* barriers. > (I don't like calling every barrier a fence though; Arm very explicitly > doesn't do this and internally we don't either. However for LKMM I don't > mind sticking to existing terminology here and calling them all fences; but > to me a fence is something that orders certain things po-before the fence > with certain things po-after). Maybe we should also agree to reserve "barrier" for compiler barriers, and use "fence" for all memory barriers. (Yes, I realize this isn't totally consistent but it's the best I can think of.) Then all fence instructions would be barriers but not vice versa. > But I would like to have a name for order-inducing objects that link two > events. > > I would call them an "ordering operation" where the first event is the event > that "starts the operation", and the second event is the event that > "completes that operation". > > Then we can say things like "when CPU C starts an ordering operation of type > blah that is completed by CPU C', then any store that propagates to C before > C starts the operation must propagate to any CPU C'' before any store of C' > that is executed after C' completes the execution propagates to C'' ". > This would be a weak ordering operation, and would probably imply that > blah-order is a kind of happens-before order and would appear in > cumul-fence. > > Or "when CPU C starts an ordering operation of type blubb that is completed > by CPU C', then any store that propagates to C before C starts the operation > must propagate to any other CPU before any instruction of C' po-after C' > completes the execution is executed". > This would be a strong ordering operation which would imply that prop ; > blubb-order is a kind of happens-before order. Right. > > (Treating "barrier" and "fence" as synonyms seems to be too deeply > > entrenched to try and fight against.) 
> > > > Once that is straightened out, we can distinguish between fences or > > orders that are weak vs. strong. And then we can divide up strong > > fences/orders into single-CPU vs. cross-CPU, if we want to. > With the distinction above, barriers and fences are always single-CPU, while > ordering operations can be either cross-CPU or single-CPU. I'm not sure if > there's still a need to distinguish more carefully than that. We'll see. But we need a shorter name than "ordering-operation" for use in the memory model. How about "xfence"? Or maybe use "xfence" for cross-CPU ordering operations, which are always strong -- I don't know what we should call multi-instruction single-CPU ordering operations (of which po-unlock-lock-po seems to be the only one). Alan
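To make the distinction concrete in the notation the .cat file already uses: a single-event barrier induces an order of the shape po ; [object] ; po (the fencerel() pattern), while a two-event ordering operation relates its start event to its end event, and the induced order wraps that relation in po on both sides. The sketch below is only an illustration; the *-sketch names are invented here, and the po-unlock-lock-po shape (with po | rf, as quoted in this thread) is reproduced from memory.

    (* single-event case: the order induced by smp_mb(), the same shape
       as fencerel(Mb) *)
    let mb-order-sketch = po ; [Mb] ; po

    (* two-event case: the operation runs from the unlock (its start
       event) to the lock (its end event); po-unlock-lock-po is this
       operation with po added on both sides *)
    let unlock-lock-op-sketch = [UL] ; (po | rf) ; [LKR]
    let unlock-lock-order-sketch = po ; unlock-lock-op-sketch ; po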
On 1/19/2023 4:13 AM, Alan Stern wrote: > On Wed, Jan 18, 2023 at 10:38:11PM +0100, Jonas Oberhauser wrote: >> >> On 1/18/2023 8:52 PM, Alan Stern wrote: >>> On Tue, Jan 17, 2023 at 08:31:59PM +0100, Jonas Oberhauser wrote: >>>> - ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | >>>> - ([M] ; po ; [UL] ; (co | po) ; [LKW] ; >>>> - fencerel(After-unlock-lock) ; [M]) >>>> + ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) >>> Shouldn't the po case of (co | po) remain intact here? >> You can leave it here, but it is already covered by two other parts: the >> ordering given through ppo/hb is covered by the po-unlock-lock-po & int in >> ppo, and the ordering given through pb is covered by its inclusion in >> strong-order. > What about the ordering given through > A-cumul(strong-fence)/cumul-fence/prop/hb? I suppose that might be > superseded by pb as well, Indeed, in fact all of A-cumul(strong-fence) is already covered through pb. > but it seems odd not to have it in hb. > In general, the idea in the memory model is that hb ordering relies on > the non-strong properties of fences, whereas pb relies on the properties > of strong fences, and rb relies on the properties of the RCU fences. I agree in the sense that all strong-ordering operations are A-cumulative and not including them in A-cumul is weird. On the other hand, the model becomes a tiny bit smaller and simpler when all ordering of prop;strong-ordering goes through a single place (pb). If you want, you could think of the A-cumulativity of strong ordering operations as being a consequence of their strong properties. Mathematically it already is the case, since overwrite & ext ; cumul-fence* ; rfe ; strong-fence ; cumul-fence* ; rfe? is a subset of prop ; strong-fence ; hb* >>>> @@ -91,8 +89,12 @@ acyclic hb as happens-before >>>> (* Write and fence propagation ordering *) >>>> (****************************************) >>>> -(* Propagation: Each non-rf link needs a strong fence. *) >>>> -let pb = prop ; strong-fence ; hb* ; [Marked] >>>> +(* Strong ordering operations *) >>>> +let strong-order = strong-fence | ([M] ; po-unlock-lock-po ; >>>> + [After-unlock-lock] ; po ; [M]) >>> This is not the same as the definition removed above. In particular, >>> po-unlock-lock-po only allows one step along the locking chain -- it has >>> rf where the definition above has co. >> Indeed it is not, but the subsequent lock-unlock sequences are in hb*. For >> this reason it can be simplified to just consider the directly following >> unlock(). > I'm not sure that argument is right. The smp_mb__after_unlock_lock > needs to go after the _last_ lock in the sequence, not after the first. > So you don't have to worry about subsequent lock-unlock sequences; you > have to worry about preceding lock-unlock sequences. I formalized a proof of equivalence in Coq a few months ago, but I was recalling the argument incorrectly from memory. I think the correct argument is that the previous po-unlock-lock-po links form a cumul-fence*;rfe;po sequence that starts with a po-rel. So any prop ; ... ; co ; ... ; this fence ; ... can be rewritten to prop ; cumul-fence* ; po-unlock-lock-po ; this fence ; ... and because the first cumul-fence here needs to start with a po-release, the prop ; cumul-fence* can be merged into a larger prop, leaving prop ; po-unlock-lock-po ; this fence ; ... Best wishes, jonas
On Thu, Jan 19, 2023 at 04:05:38PM +0100, Jonas Oberhauser wrote: > > > On 1/19/2023 4:13 AM, Alan Stern wrote: > > On Wed, Jan 18, 2023 at 10:38:11PM +0100, Jonas Oberhauser wrote: > > > > > > On 1/18/2023 8:52 PM, Alan Stern wrote: > > > > On Tue, Jan 17, 2023 at 08:31:59PM +0100, Jonas Oberhauser wrote: > > > > > - ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | > > > > > - ([M] ; po ; [UL] ; (co | po) ; [LKW] ; > > > > > - fencerel(After-unlock-lock) ; [M]) > > > > > + ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) > > > > Shouldn't the po case of (co | po) remain intact here? > > > You can leave it here, but it is already covered by two other parts: the > > > ordering given through ppo/hb is covered by the po-unlock-lock-po & int in > > > ppo, and the ordering given through pb is covered by its inclusion in > > > strong-order. > > What about the ordering given through > > A-cumul(strong-fence)/cumul-fence/prop/hb? I suppose that might be > > superseded by pb as well, > Indeed, in fact all of A-cumul(strong-fence) is already covered through pb. > > but it seems odd not to have it in hb. > > In general, the idea in the memory model is that hb ordering relies on > > the non-strong properties of fences, whereas pb relies on the properties > > of strong fences, and rb relies on the properties of the RCU fences. > > I agree in the sense that all strong-ordering operations are A-cumulative > and not including them in A-cumul is weird. The reason for including A-cumul(strong-fence | po-rel) in the definition of cumul-fence had nothing to do with the fact that the fences needed to be strong. It was simply a convenient way to list all the A-cumulative fences. It could just as well have been written A-cumul(mb | gp | po-rel). > On the other side the model becomes a tiny bit smaller and simpler when all > ordering of prop;strong-ordering goes through a single place (pb). > > If you want, you could think of the A-cumulativity of strong ordering > operations as being a consequence of their strong properties. Mathematically That's backward logic. Being strong isn't the reason the fences are A-cumulative. Indeed, the model could have been written not to assume that strong fences are A-cumulative. > it already is the case, since > overwrite&ext ; cumul-fence* ; rfe ; strong-fence ; cumul-fence* ; rfe? > is a subset of > prop ; strong-fence ; hb* Invalid reasoning. If strong fences had not been A-cumulative then this inclusion wouldn't matter, because the pb relation would have been defined differently. > > > > > @@ -91,8 +89,12 @@ acyclic hb as happens-before > > > > > (* Write and fence propagation ordering *) > > > > > (****************************************) > > > > > -(* Propagation: Each non-rf link needs a strong fence. *) > > > > > -let pb = prop ; strong-fence ; hb* ; [Marked] > > > > > +(* Strong ordering operations *) > > > > > +let strong-order = strong-fence | ([M] ; po-unlock-lock-po ; > > > > > + [After-unlock-lock] ; po ; [M]) > > > > This is not the same as the definition removed above. In particular, > > > > po-unlock-lock-po only allows one step along the locking chain -- it has > > > > rf where the definition above has co. > > > Indeed it is not, but the subsequent lock-unlock sequences are in hb*. For > > > this reason it can be simplified to just consider the directly following > > > unlock(). > > I'm not sure that argument is right. The smp_mb__after_unlock_lock > > needs to go after the _last_ lock in the sequence, not after the first. 
> > So you don't have to worry about subsequent lock-unlock sequences; you > > have to worry about preceding lock-unlock sequences. > > I formalized a proof of equivalence in Coq a few months ago, but I was > recalling the argument incorrectly from memory. > I think the correct argument is that the previous po-unlock-lock-po form a > cumul-fence*;rfe;po sequence that starts with a po-rel. > so any > prop; .... ; co ; ... ; this fence ;... > can be rewritten to > prop; cumul_fence* ; po-unlock-lock-po ; this fence ;... > and because the the first cumul-fence here needs to start with po-release, > the prop ;cumul-fence* can be merged into a larger prop, leaving > prop; po-unlock-lock-po ; this fence ;... This may be so, but I would like to point out that the memory model was not particularly designed to be as short or as simple as possible, but more to reflect transparently the intuitions that kernel programmers have about the ways ordering should work in the kernel. It may very well include redundancies as a result. I don't think that's a bad point. For example, the prop relation is meant to cover all fences that order store propagations in the usual way (basically all fences except rmb). This includes both weak and strong fences; the fact that strong fences also have other ordering properties doesn't mean they should be kept out of prop. Alan
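For orientation in the exchange that follows, the definitions being referred to are roughly these (a paraphrase assembled from the fragments quoted in this thread; the [Marked] restrictions and some cumul-fence cases are elided, so treat it as a sketch rather than the authoritative linux-kernel.cat text):

    let A-cumul(r) = rfe? ; r
    let cumul-fence = A-cumul(strong-fence | po-rel) | wmb | ...
    let prop = (overwrite & ext)? ; cumul-fence* ; rfe?
    let hb = ppo | rfe | ((prop \ id) & int)
    acyclic hb as happens-before
    (* Propagation: Each non-rf link needs a strong fence. *)
    let pb = prop ; strong-fence ; hb* ; [Marked]

The redundancy under discussion is that a strong fence contributes ordering both through cumul-fence/prop/hb and, more directly, through pb.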
On 1/19/2023 9:06 PM, Alan Stern wrote: > On Thu, Jan 19, 2023 at 04:05:38PM +0100, Jonas Oberhauser wrote: >> >> On 1/19/2023 4:13 AM, Alan Stern wrote: >>> On Wed, Jan 18, 2023 at 10:38:11PM +0100, Jonas Oberhauser wrote: >>>> On 1/18/2023 8:52 PM, Alan Stern wrote: >>>>> On Tue, Jan 17, 2023 at 08:31:59PM +0100, Jonas Oberhauser wrote: >>>>>> - ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | >>>>>> - ([M] ; po ; [UL] ; (co | po) ; [LKW] ; >>>>>> - fencerel(After-unlock-lock) ; [M]) >>>>>> + ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) >>>>> Shouldn't the po case of (co | po) remain intact here? >>>> You can leave it here, but it is already covered by two other parts: the >>>> ordering given through ppo/hb is covered by the po-unlock-lock-po & int in >>>> ppo, and the ordering given through pb is covered by its inclusion in >>>> strong-order. >>> What about the ordering given through >>> A-cumul(strong-fence)/cumul-fence/prop/hb? I suppose that might be >>> superseded by pb as well, >> Indeed, in fact all of A-cumul(strong-fence) is already covered through pb. >>> but it seems odd not to have it in hb. >>> In general, the idea in the memory model is that hb ordering relies on >>> the non-strong properties of fences, whereas pb relies on the properties >>> of strong fences, and rb relies on the properties of the RCU fences. >> I agree in the sense that all strong-ordering operations are A-cumulative >> and not including them in A-cumul is weird. > >> On the other side the model becomes a tiny bit smaller and simpler when all >> ordering of prop;strong-ordering goes through a single place (pb). >> >> If you want, you could think of the A-cumulativity of strong ordering >> operations as being a consequence of their strong properties. Mathematically > That's backward logic. Being strong isn't the reason the fences are > A-cumulative. Indeed, the model could have been written not to assume > that strong fences are A-cumulative. It's not just the model, it's also how strong fences are introduced in the documentation. I agree though that you could decouple the notion of strong from A-cumulativity. But would anyone want a strong fence that is not A-cumulative? It's a bit like asking in C11 for a barrier that has the sequential consistency guarantee of appearing in the global order S, but doesn't have release or acquire semantics. Yes you could define that, but would it really make sense? >>>>>> @@ -91,8 +89,12 @@ acyclic hb as happens-before >>>>>> (* Write and fence propagation ordering *) >>>>>> (****************************************) >>>>>> -(* Propagation: Each non-rf link needs a strong fence. *) >>>>>> -let pb = prop ; strong-fence ; hb* ; [Marked] >>>>>> +(* Strong ordering operations *) >>>>>> +let strong-order = strong-fence | ([M] ; po-unlock-lock-po ; >>>>>> + [After-unlock-lock] ; po ; [M]) >>>>> This is not the same as the definition removed above. In particular, >>>>> po-unlock-lock-po only allows one step along the locking chain -- it has >>>>> rf where the definition above has co. >>>> Indeed it is not, but the subsequent lock-unlock sequences are in hb*. For >>>> this reason it can be simplified to just consider the directly following >>>> unlock(). >>> I'm not sure that argument is right. The smp_mb__after_unlock_lock >>> needs to go after the _last_ lock in the sequence, not after the first. >>> So you don't have to worry about subsequent lock-unlock sequences; you >>> have to worry about preceding lock-unlock sequences. 
>> I formalized a proof of equivalence in Coq a few months ago, but I was >> recalling the argument incorrectly from memory. >> I think the correct argument is that the previous po-unlock-lock-po form a >> cumul-fence*;rfe;po sequence that starts with a po-rel. >> so any >> prop; .... ; co ; ... ; this fence ;... >> can be rewritten to >> prop; cumul_fence* ; po-unlock-lock-po ; this fence ;... >> and because the the first cumul-fence here needs to start with po-release, >> the prop ;cumul-fence* can be merged into a larger prop, leaving >> prop; po-unlock-lock-po ; this fence ;... > This may be so, but I would like to point out that the memory model was > not particularly designed to be as short or as simple as possible, but > more to reflect transparently the intuitions that kernel programmers > have about the ways ordering should work in the kernel. It may very > well include redundancies as a result. I don't think that's a bad > point. I agree that sometimes redundancies have value. But I think having, where possible, a kind of minimal responsibility principle where each fence appears in as few relations as possible also has value. I've seen that when I try to help people in my team learn to use LKMM: they see a specific type of fence and get stuck for a while chasing one of its uses. For example, they may chase a long prop link using the only strong fence in the example somewhere in the middle, which will later turn out to be a dead end because what they need to use is pb. For someone who doesn't know that we never need to rely on that use to get ordering, it basically doubles the amount of time spent looking at the graph and chasing definitions. And for very good reasons there already are a lot of definitions even when redundancies are eliminated. Perhaps I would say that including these redundancies is good for understanding why the formalization makes sense, but excluding them is better for actually using the formalization. This includes both when looking at code while having a printout of the model next to you, and when trying to reason about LKMM itself (e.g., what one might do when changes are made to LKMM and one wants to check that they interact well with the existing parts of LKMM). I think in the long term, increasing the usability is more important than obvious correctness. But maybe I'm biased because I'm mostly on the user side of LKMM :D
On Fri, Jan 20, 2023 at 12:12:50PM +0100, Jonas Oberhauser wrote: > > > On 1/19/2023 9:06 PM, Alan Stern wrote: > > That's backward logic. Being strong isn't the reason the fences are > > A-cumulative. Indeed, the model could have been written not to assume > > that strong fences are A-cumulative. > > It's not just the model, it's also how strong fences are introduced in the > documentation. The documentation can be updated. > I agree though that you could decouple the notion of strong from > A-cumulativity. > But would anyone want a strong fence that is not A-cumulative? Why would anyone want a weak fence that isn't A-cumulative? :-) Architecture designers sometimes do strange things... > It's a bit like asking in C11 for a barrier that has the sequential > consistency guarantee of appearing in the global order S, but doesn't have > release or acquire semantics. Yes you could define that, but would it really > make sense? You're still missing the point. The important aspect of the fences in cumul-fence is that they are A-cumulative, not whether they are strong. You're fixating on an irrelevancy. > > This may be so, but I would like to point out that the memory model was > > not particularly designed to be as short or as simple as possible, but > > more to reflect transparently the intuitions that kernel programmers > > have about the ways ordering should work in the kernel. It may very > > well include redundancies as a result. I don't think that's a bad > > point. > > I agree that sometimes redundancies have value. But I think having, where > possible, a kind of minimal responsibility principle where each fence > appears in as few relations as possible also has value. > I've seen that when I try to help people in my team learn to use LKMM: they > see a specific type of fence and get stuck for a while chasing one of its > uses. For example, they may chase a long prop link using the only strong > fence in the example somewhere in the middle, which will then later turn out > to be a dead-end because what they need to use is pb. People who make that particular mistake should take a lesson from it: The presence of a strong fence should point them toward looking for an application of pb before looking at prop, because pb is based on the special properties of strong fences whereas prop is not. > For someone who doesn't know that we never need to rely on that use to get > ordering, it basically doubles the amount of time spent looking at the graph > and chasing definitions. And for very good reasons there already are alot of > definitions even when redundancies are eliminated. IMO, people who try to use the memory model this way need to develop a good understanding of it first. (Although perhaps carrying out these sorts of exercises is _how_ people develop their understanding...) I don't regard it as a bad thing that by making a mistake through pursuing a redundant pathway, people can deepen their understanding. That's how one learns. > Perhaps I would say that including these redundancies is good for > understanding why the formalization makes sense, but excluding them is > better for actually using the formalization. > This includes both when looking at code while having a printout of the model > next to you, but also when trying to reason about LKMM itself (e.g., what > one might do when changes are made to LKMM and one wants to check that they > interact well with the existing parts of LKMM). Not necessarily so.
You might _want_ a change to affect one of the redundant paths but not the other. > I think in the long term, increasing the usability is more important than > the obvious correctness. But maybe I'm biased because I'm mostly on the user > side of LKMM :D No doubt I'm guilty of having my own set of biases... Alan
On 1/20/2023 5:32 PM, Alan Stern wrote: > On Fri, Jan 20, 2023 at 12:12:50PM +0100, Jonas Oberhauser wrote: >> >> On 1/19/2023 9:06 PM, Alan Stern wrote: >>> That's backward logic. Being strong isn't the reason the fences are >>> A-cumulative. Indeed, the model could have been written not to assume >>> that strong fences are A-cumulative. >> It's not just the model, it's also how strong fences are introduced in the >> documentation. > The documentation can be updated. Sure. But what would be the benefit? Anyways, what I meant is something else. There are (at least) two interpretations of what a strong-fence is. The first that you have in mind as far as I understand, that every store affected by the strong-fence must propagate to every other CPU before any instruction behind the strong-fence executes. (but it doesn't talk about which stores are being affected). The second one is that a strong-fence is an A-cumulative fence which additionally has that guarantee. What I meant is that reading the documentation or the model, one might come to the conclusion that it means the second thing, and in that interpretation fences that are strong must be A-cumulative. I don't see anything wrong with that, especially since I don't think strong-fence is a standard term that exists in a vacuum and means the first thing by convention. Obviously there's some elegance in making things orthogonal rather than hierarchical. So I can understand why you defend the first interpretation. But there's really only a benefit if that opens up some interesting design space. I just don't see that right now. So if hypothetically you were ok to change the meaning of strong fence to include A-cumulativity -- and I think from the model and documentation perspective it doesn't take much to do that if anything -- then saying "all strong-fence properties are covered exactly in pb" isn't a big step. > >> I agree though that you could decouple the notion of strong from >> A-cumulativity. >> But would anyone want a strong fence that is not A-cumulative? > Why would anyone want a weak fence that isn't A-cumulative? :-) > Architecture designers sometimes do strange things... (as a side note and out of curiosity: Which architecture has a weak fence that isn't A-cumulative? Is it alpha?) As for strong fences that aren't A-cumulative, I remember someone telling me not too long ago that one shouldn't worry about strange hypothetical architectures ; ) More to the point, I find it improbable that the performance benefit of this vs just using however smp_mb() maps on that architecture would ever warrant the inclusion of such a fence in the LKMM. >> It's a bit like asking in C11 for a barrier that has the sequential >> consistency guarantee of appearing in the global order S, but doesn't have >> release or acquire semantics. Yes you could define that, but would it really >> make sense? > You're still missing the point. The important aspect of the fences in > cumul-fence is that they are A-cumulative, not whether they are strong. I completely understand that. I just don't think there's anything fundamentally wrong with alternatively creating a more disjoint hierarchy of - fences that aren't A-cumulative but order propagation (in cumul-fence, without A-cumul) - fences that are A-cumulative but aren't strong (in cumul-fence, with A-cumul) - fences that are strong (in pb) Right now, both pb and cumul-fence deal with strong fences. 
And again, I understand that one point of view here is that this is not because strong fences need to inherently be A-cumulative and included in cumul-fence by using the strong-fence identifier. Indeed one could have, in theory, strong fences that aren't A-cumulative, and using strong-fence is as you wrote just a convenient way to list all the A-cumulative strong fences because that currently happens to be all of the strong fences. Another POV is that one might instead formally describe things in terms of these three more disjoint classes, adapt the documentation of cumul-fence to say that it does not deal with strong fences because those are dealt with in a later relation, and not worry about hypothetical barriers that currently don't have a justifying use case. (And I suppose to some extent you already don't worry about it, because pb would have to be defined differently if such fences ever made their way into LKMM.) Now cumul-fence/prop cares about the A-cumulativity aspect, and pb about the strong-fence aspect, but of course the A-cumulativity also appears in pb, just hidden right now through the additional rfe? at the end of prop (if there were non-A-cumulative strong fences, I think it would need to be pb = overwrite & ext ; cumul-fence* ; (A-cumul(...) | ...)). So I don't think one can draw the A-cumulativity aspect delineation between the relations clearly (when allowing for orthogonality). I'm proposing instead to make A-cumulativity a part of being a strong-fence and drawing the strong-fence delineation clearly. > You're fixating on an irrelevancy. The only thing I'm fixating on is whether it makes sense to remove certain redundancies in LKMM. And I don't think that's irrelevant; it's about reducing friction and making thinking about LKMM faster in the long run. >>> This may be so, but I would like to point out that the memory model was >>> not particularly designed to be as short or as simple as possible, but >>> more to reflect transparently the intuitions that kernel programmers >>> have about the ways ordering should work in the kernel. It may very >>> well include redundancies as a result. I don't think that's a bad >>> point. >> I agree that sometimes redundancies have value. But I think having, where >> possible, a kind of minimal responsibility principle where each fence >> appears in as few relations as possible also has value. >> I've seen that when I try to help people in my team learn to use LKMM: they >> see a specific type of fence and get stuck for a while chasing one of its >> uses. For example, they may chase a long prop link using the only strong >> fence in the example somewhere in the middle, which will then later turn out >> to be a dead-end because what they need to use is pb. > People who make that particular mistake should take a lesson from it: > The presence of a strong fence should point them toward looking for an > application of pb before looking at prop, because pb is is based on the > special properties of strong fences whereas prop is not. Yes, with two caveats: - you can remove the "before looking at prop" since there's never ever any point of looking at (extending) prop when you have a strong fence - why give people the opportunity for that mistake in the first place? ... > [...] by making a mistake through pursuing > a redundant pathway, people can deepen their understanding. That's how > one learns. ... 
I do feel reminded about the discussion about building character : ) Honestly I would easily see your point if there were some uncommon reasons to use the strong fence to extend prop. Then the lesson is what you mentioned: usually, strong fences should be looked at through the pb lense, and only if you get stuck that way it makes sense to look through the prop lense (and probably one could develop a good intuition for which situation calls for which after some time). But that's not the case here. There's nothing to learn here except that one should pretend that strong-fence didn't exist in prop. That lesson could also be learned by not having it there in the first place. And I think you can easily present LKMM in a way that makes this look reasonable. >> For someone who doesn't know that we never need to rely on that use to get >> ordering, it basically doubles the amount of time spent looking at the graph >> and chasing definitions. And for very good reasons there already are alot of >> definitions even when redundancies are eliminated. > IMO, people who try to use the memory model this way need to develop a > good understanding of it first. I disagree; both for the reason you mention later, but also because IMHO the big advantage of a formal model is you don't need to get a good understanding before you can start going and tackling issues. In German we say "entlanghangeln" which literally means "to make one's way hand over hand along a rope", as a metaphor for following formal definitions carefully step by step when one doesn't yet have a strong intuition to get everything right without needing to look at the formalism; the formalism is kind of a safety line that prevents one from falling into the abyss. (and in the spirit of what you said below, while doing that with an attentive mind one builds intuition and understanding). > (Although perhaps carrying out these > sorts of exercises is _how_ people develop their understanding...) I > don't regard it as a bad thing that by making a mistake through pursuing > a redundant pathway, people can deepen their understanding. That's how > one learns. I agree. >> Perhaps I would say that including these redundancies is good for >> understanding why the formalization makes sense, but excluding them is >> better for actually using the formalization. >> This includes both when looking at code while having a printout of the model >> next to you, but also when trying to reason about LKMM itself (e.g., what >> one might do when changes are made to LKMM and one wants to check that they >> interact well with the existing parts of LKMM). > Not necessarily so. You might _want_ a change to affect one of the > redundant paths but not the other. I definitely agree that it might very well be that the redundancy is dissolved at a later point in time through such discovery. In this case one would have simplified too much : ) But what I mean is, for example, when introducing rmw-sequences a question that came up is whether they are sufficient to ensure monotonicity. Analyzing stuff like this is much easier on a simplified model (including some other simplifications that are all equivalent to LKMM as written), because there are a lot fewer cases to cover. It turns a proof that feels like a bookkeeping nightmare into something that I can manage in a few pages of paper. 
Similarly, when thinking about whether rcu could be understood through a lens that is closer to the Grace Period Guarantee rather than counting gps and rscs, I do this on the simplified model, because those equivalence proofs become easier. I'm wondering a little if there's some middle ground, e.g., writing short comments in the model wherever something is redundant. Something like (* note: strong-fence is included here for completeness, and can be safely ignored *). Best wishes, jonas
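A minimal sketch of what such an annotation could look like, using the comment text proposed above; the placement next to cumul-fence is only an assumption about where the redundant use of strong-fence would be flagged, not an actual patch:

    let cumul-fence = A-cumul(strong-fence | po-rel) | wmb | ...
        (* note: strong-fence is included here for completeness, and can be
           safely ignored; the ordering it provides is also available via pb *)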
On Sat, Jan 21, 2023 at 01:41:16AM +0100, Jonas Oberhauser wrote: > > > On 1/20/2023 5:32 PM, Alan Stern wrote: > > On Fri, Jan 20, 2023 at 12:12:50PM +0100, Jonas Oberhauser wrote: > > > > > > On 1/19/2023 9:06 PM, Alan Stern wrote: > > > > That's backward logic. Being strong isn't the reason the fences are > > > > A-cumulative. Indeed, the model could have been written not to assume > > > > that strong fences are A-cumulative. > > > It's not just the model, it's also how strong fences are introduced in the > > > documentation. > > The documentation can be updated. > > Sure. But what would be the benefit? Aren't you interested in making the memory model more understandable to students? See also the end of this email (comments count somewhat as documentation). > Anyways, what I meant is something else. > There are (at least) two interpretations of what a strong-fence is. > The first that you have in mind as far as I understand, that every store > affected by the strong-fence must propagate to every other CPU before any > instruction behind the strong-fence executes. (but it doesn't talk about > which stores are being affected). Yes, that's what I have in mind. > The second one is that a strong-fence is an A-cumulative fence which > additionally has that guarantee. > > What I meant is that reading the documentation or the model, one might come > to the conclusion that it means the second thing, and in that interpretation > fences that are strong must be A-cumulative. Okay, so let's change the documentation/model to ensure this doesn't happen. > I don't see anything wrong with that, especially since I don't think > strong-fence is a standard term that exists in a vacuum and means the first > thing by convention. > > Obviously there's some elegance in making things orthogonal rather than > hierarchical. > So I can understand why you defend the first interpretation. > But there's really only a benefit if that opens up some interesting design > space. I just don't see that right now. > > So if hypothetically you were ok to change the meaning of strong fence to > include A-cumulativity -- and I think from the model and documentation > perspective it doesn't take much to do that if anything -- then saying "all > strong-fence properties are covered exactly in pb" isn't a big step. I believe that the difference between strong and weak fences is much more fundamental and important than the difference between A-cumulative and non-A-cumulative fences. Consider an analogy: Some animals are capable of walking around, but no plants are. Would you ever want to say that a plant is a non-walking living thing with various properties that differentiate it from animals? Or does it make more sense to say that plants are living things with various fundamental properties, and in addition some animals can walk? > > > I agree though that you could decouple the notion of strong from > > > A-cumulativity. > > > But would anyone want a strong fence that is not A-cumulative? > > Why would anyone want a weak fence that isn't A-cumulative? :-) > > Architecture designers sometimes do strange things... > > (as a side note and out of curiosity: Which architecture has a weak fence > that isn't A-cumulative? Is it alpha?) I believe that's right. It still exists in the kernel as smp_wmb(). 
> As for strong fences that aren't A-cumulative, I remember someone telling me > not too long ago that one shouldn't worry about strange hypothetical > architectures ; ) > More to the point, I find it improbable that the performance benefit of this > vs just using however smp_mb() maps on that architecture would ever warrant > the inclusion of such a fence in the LKMM. Agreed; I can't imagine any good reason for having a non-A-cumulative strong fence either. But it doesn't matter; when learning or using the LKMM it's much more important to focus on strong vs. weak than on A-cumulative vs. non-A-cumulative. > > > It's a bit like asking in C11 for a barrier that has the sequential > > > consistency guarantee of appearing in the global order S, but doesn't have > > > release or acquire semantics. Yes you could define that, but would it really > > > make sense? > > You're still missing the point. The important aspect of the fences in > > cumul-fence is that they are A-cumulative, not whether they are strong. > > I completely understand that. I just don't think there's anything > fundamentally wrong with alternatively creating a more disjoint hierarchy of > - fences that aren't A-cumulative but order propagation (in cumul-fence, > without A-cumul) > - fences that are A-cumulative but aren't strong (in cumul-fence, with > A-cumul) > - fences that are strong (in pb) There is yet another level of fences in the hierarchy: those which order instruction execution but not propagation (smp_rmb() and acquire). One of the important points about cumul-fence is that it excludes this level. That's for a functional reason -- prop simply doesn't work for those fences, so it has to exclude them. But it does work for strong fences, so excluding them would be an artificial restriction. > Right now, both pb and cumul-fence deal with strong fences. And again, I I would say that cumul-fence _allows_ strong fences, or _can work with_ strong fences. And I would never want to say that cumul-fence and prop can't be used with strong fences. In fact, if you find a situation where this happens, it might incline you to consider if the fence could be replaced with a weaker one. > understand that one point of view here is that this is not because strong > fences need to inherently be A-cumulative and included in cumul-fence by > using the strong-fence identifier. > Indeed one could have, in theory, strong fences that aren't A-cumulative, > and using strong-fence is as you wrote just a convenient way to list all the > A-cumulative strong fences because that currently happens to be all of the > strong fences. > > Another POV is that one might instead formally describe things in terms of > these three more disjoint classes, adapt the documentation of cumul-fence to > say that it does not deal with strong fences because those are dealt with in > a later relation, and not worry about hypothetical barriers that currently > don't have a justifying use case. > (And I suppose to some extent you already don't worry about it, because pb > would have to be defined differently if such fences ever made their way into > LKMM.) > > Now cumul-fence/prop cares about the A-cumulativity aspect, and pb about the > strong-fence aspect, but of course the A-cumulativity also appears in pb, > just hidden right now through the additional rfe? at the end of prop (if > there were non-A-cumulative strong fences, I think it would need to be pb = > overwrite & ext ; cumul-fence* ; (A-cumul(...) | ...)). So I don't think one Not quite right. 
A hypothetical non-A-cumulative case for pb would have to omit the cumul-fence term entirely. > can draw the A-cumulativity aspect delineation between the relations clearly > (when allowing for orthogonality). I'm proposing instead to make > A-cumulativity a part of being a strong-fence and drawing the strong-fence > delineation clearly. Maybe so, in some sense. But in practice you're asking to have strong fences removed from cumul-fence and prop. I don't want to do that, even at the cost of some redundancy. Consider the RB pattern as another example. Suppose the read -> write ordering on one or both sides is provided by a fence rather than a dependency or some such. Would you want to keep such cycles out of the (ppo | rfe)+ part of hb+, on the basis that they also can be covered by ((prop \ id) & int)? > > People who make that particular mistake should take a lesson from it: > > The presence of a strong fence should point them toward looking for an > > application of pb before looking at prop, because pb is is based on the > > special properties of strong fences whereas prop is not. > > Yes, with two caveats: > - you can remove the "before looking at prop" since there's never ever any > point of looking at (extending) prop when you have a strong fence > - why give people the opportunity for that mistake in the first place? ... > > [...] by making a mistake through pursuing > > a redundant pathway, people can deepen their understanding. That's how > > one learns. > ... I do feel reminded about the discussion about building character : ) > > Honestly I would easily see your point if there were some uncommon reasons > to use the strong fence to extend prop. Then the lesson is what you > mentioned: usually, strong fences should be looked at through the pb lense, > and only if you get stuck that way it makes sense to look through the prop > lense (and probably one could develop a good intuition for which situation > calls for which after some time). > > But that's not the case here. There's nothing to learn here except that one > should pretend that strong-fence didn't exist in prop. > That lesson could also be learned by not having it there in the first place. > And I think you can easily present LKMM in a way that makes this look > reasonable. In fact, I wouldn't mind removing the happens-before, propagation, and rcu axioms from LKMM entirely, replacing them with the single executes-before axiom. > I'm wondering a little if there's some way in the middle, e.g., by writting > short comments in the model wherever something is redundant. Something like > (* note: strong-fence is included here for completeness, and can be safely > ignored *). I have no objection to doing that. It seems like a good idea. Alan
On 1/21/2023 9:56 PM, Alan Stern wrote: > On Sat, Jan 21, 2023 at 01:41:16AM +0100, Jonas Oberhauser wrote: >> >> On 1/20/2023 5:32 PM, Alan Stern wrote: >>> On Fri, Jan 20, 2023 at 12:12:50PM +0100, Jonas Oberhauser wrote: >>>> On 1/19/2023 9:06 PM, Alan Stern wrote: >>>>> That's backward logic. Being strong isn't the reason the fences are >>>>> A-cumulative. Indeed, the model could have been written not to assume >>>>> that strong fences are A-cumulative. >>>> It's not just the model, it's also how strong fences are introduced in the >>>> documentation. >>> The documentation can be updated. >> Sure. But what would be the benefit? > Aren't you interested in making the memory model more understandable to > students? Of course : ) >> The second one is that a strong-fence is an A-cumulative fence which >> additionally has that guarantee. >> >> What I meant is that reading the documentation or the model, one might come >> to the conclusion that it means the second thing, and in that interpretation >> fences that are strong must be A-cumulative. > Okay, so let's change the documentation/model to ensure this doesn't > happen. > >> I don't see anything wrong with that, especially since I don't think >> strong-fence is a standard term that exists in a vacuum and means the first >> thing by convention. >> >> Obviously there's some elegance in making things orthogonal rather than >> hierarchical. >> So I can understand why you defend the first interpretation. >> But there's really only a benefit if that opens up some interesting design >> space. I just don't see that right now. >> >> So if hypothetically you were ok to change the meaning of strong fence to >> include A-cumulativity -- and I think from the model and documentation >> perspective it doesn't take much to do that if anything -- then saying "all >> strong-fence properties are covered exactly in pb" isn't a big step. > I believe that the difference between strong and weak fences is much > more fundamental and important than the difference between A-cumulative > and non-A-cumulative fences. > > Consider an analogy: Some animals are capable of walking around, but no > plants are. Would you ever want to say that a plant is a non-walking > living thing with various properties that differentiate it from animals? > Or does it make more sense to say that plants are living things with > various fundamental properties, and in addition some animals can walk? I agree that the difference between (the strict definition of) strong and weak is more important than between A-cumulative and not. The analogy doesn't work well for me though. Being A-cumulative is also a form of "strength". Definitely if you replace a non-A-cumulative fence with an otherwise equivalent but A-cumulative one (assume such fences exist), then you can never get more behaviors, but you might get fewer. I think that's why it's more natural for me to think of strong fences in terms of having multiple strong properties, the chief among which is that it requires "earlier" stores to propagate before po-later instructions execute but another one being A-cumulativity, than it would be of thinking of plants as being things that don't walk and have some other properties. >>>> It's a bit like asking in C11 for a barrier that has the sequential >>>> consistency guarantee of appearing in the global order S, but doesn't have >>>> release or acquire semantics. Yes you could define that, but would it really >>>> make sense? >>> You're still missing the point. 
The important aspect of the fences in >>> cumul-fence is that they are A-cumulative, not whether they are strong. >> I completely understand that. I just don't think there's anything >> fundamentally wrong with alternatively creating a more disjoint hierarchy of >> - fences that aren't A-cumulative but order propagation (in cumul-fence, >> without A-cumul) >> - fences that are A-cumulative but aren't strong (in cumul-fence, with >> A-cumul) >> - fences that are strong (in pb) > There is yet another level of fences in the hierarchy: those which order > instruction execution but not propagation (smp_rmb() and acquire). One > of the important points about cumul-fence is that it excludes this > level. > > That's for a functional reason -- prop simply doesn't work for those > fences, so it has to exclude them. But it does work for strong fences, > so excluding them would be an artificial restriction. Hm, so could we say some fences order 1) propagation with propagation (weak fences) 2) execution with execution (rmb, acquire) 3) propagation with execution (strong fences) where ordering with execution implicitly orders with propagation as well because things can only propagate after they execute. However, the 4th possibility (execution with only propagation) happens not to exist. I'm not sure if it would even be distinguishable from the second type. In the operational model, can you forward from stores that have not executed yet? > >> Right now, both pb and cumul-fence deal with strong fences. And again, I > I would say that cumul-fence _allows_ strong fences, or _can work with_ > strong fences. And I would never want to say that cumul-fence and prop > can't be used with strong fences. In fact, if you find a situation > where this happens, it might incline you to consider if the fence could > be replaced with a weaker one. Can you explain the latter part? What does it mean to 'find a situation where this happens'? As I understand the sentence, in current LKMM I don't think this is possible. Do you mean that if you find a case where you could make a cycle with cumul-fence/prop using strong fences, you might just rely on a weaker fence instead? > >> understand that one point of view here is that this is not because strong >> fences need to inherently be A-cumulative and included in cumul-fence by >> using the strong-fence identifier. >> Indeed one could have, in theory, strong fences that aren't A-cumulative, >> and using strong-fence is as you wrote just a convenient way to list all the >> A-cumulative strong fences because that currently happens to be all of the >> strong fences. >> >> Another POV is that one might instead formally describe things in terms of >> these three more disjoint classes, adapt the documentation of cumul-fence to >> say that it does not deal with strong fences because those are dealt with in >> a later relation, and not worry about hypothetical barriers that currently >> don't have a justifying use case. >> (And I suppose to some extent you already don't worry about it, because pb >> would have to be defined differently if such fences ever made their way into >> LKMM.) >> >> Now cumul-fence/prop cares about the A-cumulativity aspect, and pb about the >> strong-fence aspect, but of course the A-cumulativity also appears in pb, >> just hidden right now through the additional rfe? at the end of prop (if >> there were non-A-cumulative strong fences, I think it would need to be pb = >> overwrite & ext ; cumul-fence* ; (A-cumul(...) | ...)). 
>> So I don't think one > Not quite right. A hypothetical non-A-cumulative case for pb would have > to omit the cumul-fence term entirely. Wouldn't that violate the transitivity of "X is required to propagate before Y"? If I have X ->cumul-fence+ Y ->weird-strong-fence Z, doesn't that mean that for every CPU C, 1. X is required to propagate to C before Y propagates to C, and 2. Y is required to propagate to C before any instruction po-after Z executes? But then also X is required to propagate to C before any instruction po-after Z executes. How is this enforced if there is no cumul-fence* in the new pb? Thinking about prop and pb along these lines gives me a weird feeling. Trying to pin it down, it seems a little bit weird that A-cumul doesn't appear around the strong-fence in pb. Of course it should not appear after prop which already has an rfe? at the end. Nevertheless, having the rfe? at the end is clearly important to representing the idea behind prop. If it weren't for the fact that A-cumul is needed to define prop, it almost makes me think that it would be nice to express the difference between A-cumulative and non-A-cumulative fences (that order propagation) by saying that an A-cumulative fence has prop ; a-cumul-fence ; rfe? <= prop while the non-A-cumulative fence has prop-without-rfe ; non-a-cumul-fence <= prop-without-rfe where prop links E and F if there is some coherence-later store after E that propagates to F's CPU by the time F executes, and prop-without-rfe links them if that store propagates to any CPU before F propagates to that CPU. (Of course this ignores the interplay between these two relations that happens if you have a mix of a-cumul-fences and non-a-cumul-fences.) > >> can draw the A-cumulativity aspect delineation between the relations clearly >> (when allowing for orthogonality). I'm proposing instead to make >> A-cumulativity a part of being a strong-fence and drawing the strong-fence >> delineation clearly. > Maybe so, in some sense. But in practice you're asking to have strong > fences removed from cumul-fence and prop. I don't want to do that, even > at the cost of some redundancy. > > Consider the RB pattern as another example. Suppose the read -> write > ordering on one or both sides is provided by a fence rather than a > dependency or some such. Would you want to keep such cycles out of the > (ppo | rfe)+ part of hb+, on the basis that they also can be covered by > ((prop \ id) & int)? This is the kind of case I mentioned before, where there are still good uses for every individual part (i.e., if you have a fence, it might be important for ppo or prop to complete a cycle), and the existence of the fences might just draw one in a more likely direction. Besides, the model one obtains by trying to remove this redundancy just becomes more complicated, which is the opposite of what I want. Another good example is the many different fences in cumul-fence, such as in a thread that looks like Rx;mb();Wy;wmb();Wz...: here one can use either one or two cumul-fence edges (link the Wx that Rx reads from with Wy and then Wy with Wz, or link it directly to Wz), but trying to eliminate this redundancy just makes the model more complicated. I'm not against this partially overlapping kind of redundancy, but I dislike the subsuming kind of redundancy where some branches of the logic just never need to be used. > In fact, I wouldn't mind removing the happens-before, propagation, and > rcu axioms from LKMM entirely, replacing them with the single > executes-before axiom.
I was planning to propose the same thing; however, I would also propose to redefine pb and rb by dropping the hb/pb parts at the end of these relations:

hb = ....
pb = prop ; strong-fence ; [Marked]
rb = prop ; rcu-fence ; [Marked]

xb = hb|pb|rb
acyclic xb

> >> I'm wondering a little if there's some way in the middle, e.g., by writting >> short comments in the model wherever something is redundant. Something like >> (* note: strong-fence is included here for completeness, and can be safely >> ignored *). > I have no objection to doing that. It seems like a good idea. > > Alan Perhaps we can start a new thread then to discuss a few points where redundancies might be annotated this way or eliminated. Best wishes, jonas
On Mon, Jan 23, 2023 at 02:59:37PM +0100, Jonas Oberhauser wrote: > > > On 1/21/2023 9:56 PM, Alan Stern wrote: > > There is yet another level of fences in the hierarchy: those which order > > instruction execution but not propagation (smp_rmb() and acquire). One > > of the important points about cumul-fence is that it excludes this > > level. > > > > That's for a functional reason -- prop simply doesn't work for those > > fences, so it has to exclude them. But it does work for strong fences, > > so excluding them would be an artificial restriction. > > Hm, so could we say some fences order > 1) propagation with propagation (weak fences) > 2) execution with execution (rmb, acquire) > 3) propagation with execution (strong fences) > > where ordering with execution implicitly orders with propagation as well > because things can only propagate after they execute. > However, the 4th possibility (execution with only propagation) happens not > to exist. I'm not sure if it would even be distinguishable from the second > type. Only in that such a memory barrier would order po-earlier anything against po-later stores, whereas rmb orders loads against loads and acquire orders loads against anything. > In the operational model, can you forward from stores that have not > executed yet? Yes, it is explicitly allowed. But forwarding doesn't apply in this situation because stores can be forwarded only to po-later loads, not to po-earlier ones. > > > Right now, both pb and cumul-fence deal with strong fences. And again, I > > I would say that cumul-fence _allows_ strong fences, or _can work with_ > > strong fences. And I would never want to say that cumul-fence and prop > > can't be used with strong fences. In fact, if you find a situation > > where this happens, it might incline you to consider if the fence could > > be replaced with a weaker one. > > Can you explain the latter part? > What does it mean to 'find a situation where this happens'? > As I understand the sentence, in current LKMM I don't think this is > possible. > Do you mean that if you find a case where you could make a cycle with > cumul-fence/prop using strong fences, you might just rely on a weaker fence > instead? Exactly. > > Not quite right. A hypothetical non-A-cumulative case for pb would have > > to omit the cumul-fence term entirely. > > Wouldn't that violate the transitivity of "X is required to propagate before > Y" ? > If I have > X ->cumul-fence+ Y ->weird-strong-fence Z > doesn't that mean that for every CPU C, > 1. X is required to propagate to C before Y propagates to C > 2. Y is required to propagate to C before any instruction po-after Z > executes Not if Y is a load. > But then also X is required to pragate to C before any instruction po-after > Z executes. > How is this enforced if there is no cumul-fence* in the new pb? You do have a point. I guess one would have to put (cumul-fence+ ; [W])? or something like it in the definition. > Thinking about prop and pb along these lines gives me a weird feeling. > Trying to pinpoint it down, it seems a little bit weird that A-cumul doesn't > appear around the strong-fence in pb. I think the reason it got left out was because all strong fences are A-cumulative. If some of them weren't, it would have to appear there in some form. > Of course it should not appear after > prop which already has an rfe? at the end. Nevertheless, having the rfe? at > the end is clearly important to representing the idea behind prop. 
If it > weren't for the fact that A-cumul is needed to define prop, it almost makes > me think that it would be nice to express the difference between > A-cumulative and non-A-cumulative fences (that order propagation) by saying > that an A-cumulative fence has > prop ; a-cumul-fence;rfe? <= prop > while the non-A-cumulative fence has > prop-without-rfe ; non-a-cumul-fence <= prop-without-rfe Isn't this just a more complicated way of saying what the A-cumul() macro expresses? > I'm not against this partially overlapping kind of redundancy, but I dislike > subsuming kind of redundancy where some branches of the logic just never > need to be used. Consider: Could we remove all propagation-ordering fences from ppo because they are subsumed by prop? (Or is that just wrong?) > > In fact, I wouldn't mind removing the happens-before, propagation, and > > rcu axioms from LKMM entirely, replacing them with the single > > executes-before axiom. > > I was planning to propose the same thing, however, I would also propose to > redefine hb and rb by dropping the hb/pb parts at the end of these > relations. > > hb = .... > pb = prop ; strong-fence ; [Marked] > rb = prop ; rcu-fence ; [Marked] > > xb = hb|pb|rb > acyclic xb I'm not so sure that's a good idea. For instance, it would require the definitions of rcu-link and rb to be changed from having (hb* ; pb*) to having (hb | pb)*. Also, although it's not mentioned anywhere, the definition of xbstar could be changed to hb* ; pb* ; rb* because each of these relations absorbs a weaker one to its right. > > > I'm wondering a little if there's some way in the middle, e.g., by writting > > > short comments in the model wherever something is redundant. Something like > > > (* note: strong-fence is included here for completeness, and can be safely > > > ignored *). > > I have no objection to doing that. It seems like a good idea. > > > > Alan > > Perhaps we can start a new thread then to discuss a few points where > redundancies might be annotated this way or eliminated. Sure, go ahead. Alan
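To line up the two shapes being compared (again a sketch from memory, not the verbatim .cat text): in the current structure pb and rb each absorb the weaker relations to their right and are checked by separate axioms, while the proposal folds everything into one combined axiom. The name executes-before for that axiom is taken from the discussion above and is not an existing identifier.

    (* current structure, roughly *)
    let pb = prop ; strong-fence ; hb* ; [Marked]
    let rb = prop ; rcu-fence ; hb* ; pb* ; [Marked]
    let xbstar = (hb | pb | rb)*

    (* proposed alternative from this thread *)
    let pb = prop ; strong-fence ; [Marked]
    let rb = prop ; rcu-fence ; [Marked]
    let xb = hb | pb | rb
    acyclic xb as executes-before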
Alright, after some synchronization in the other parts of this thread I am beginning to prepare the next iteration of the patch. On 1/19/2023 4:13 AM, Alan Stern wrote: > On Wed, Jan 18, 2023 at 10:38:11PM +0100, Jonas Oberhauser wrote: >> >> On 1/18/2023 8:52 PM, Alan Stern wrote: >>> On Tue, Jan 17, 2023 at 08:31:59PM +0100, Jonas Oberhauser wrote: >>>> - ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | >>>> - ([M] ; po ; [UL] ; (co | po) ; [LKW] ; >>>> - fencerel(After-unlock-lock) ; [M]) >>>> + ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) >>> Shouldn't the po case of (co | po) remain intact here? >> You can leave it here, but it is already covered by two other parts: the >> ordering given through ppo/hb is covered by the po-unlock-lock-po & int in >> ppo, and the ordering given through pb is covered by its inclusion in >> strong-order. > What about the ordering given through > A-cumul(strong-fence)/cumul-fence/prop/hb? I suppose that might be > superseded by pb as well, but it seems odd not to have it in hb. How should we resolve this? My current favorite (compromise :D) solution would be to

1. still eliminate both the po and co cases from the first definition of strong-fence, which is used in ppo,
2. define a relation equal to the strong-order in this patch (with po|rf) but call it strong-fence for now (in response to Andrea's valid criticism that this patch is maybe doing more than just fixing ppo),
3. use the extended strong-fence in the definitions of cumul-fence and pb.

So I'd still simplify po|co to po|rf and drop the po case from ppo, but return both of those cases in cumul-fence, to be consistent with the idea that cumul-fence should deal with the weak properties of the fences, including this after-unlock-lock fence. Would that be acceptable? jonas
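A rough sketch of what those three steps could look like in the .cat file; this is a guess at the shape of the next iteration rather than the actual v2 patch, and the strong-order name is kept as an intermediate here even though the plan above is to call the combined relation strong-fence:

    (* step 1: drop the After-unlock-lock case from mb, so the base
       strong-fence used via ppo no longer contains it *)
    let mb = ([M] ; fencerel(Mb) ; [M]) |
        ... |
        ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M])
    let strong-fence = mb | gp

    (* step 2: the extended relation, with po | rf via po-unlock-lock-po *)
    let strong-order = strong-fence | ([M] ; po-unlock-lock-po ;
        [After-unlock-lock] ; po ; [M])

    (* step 3: cumul-fence and pb use the extended relation *)
    let cumul-fence = A-cumul(strong-order | po-rel) | wmb | ...
    let pb = prop ; strong-order ; hb* ; [Marked]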
On 1/23/2023 6:28 PM, Alan Stern wrote: > On Mon, Jan 23, 2023 at 02:59:37PM +0100, Jonas Oberhauser wrote: >> >> On 1/21/2023 9:56 PM, Alan Stern wrote: >>> There is yet another level of fences in the hierarchy: those which order >>> instruction execution but not propagation (smp_rmb() and acquire). One >>> of the important points about cumul-fence is that it excludes this >>> level. >>> >>> That's for a functional reason -- prop simply doesn't work for those >>> fences, so it has to exclude them. But it does work for strong fences, >>> so excluding them would be an artificial restriction. >> Hm, so could we say some fences order >> 1) propagation with propagation (weak fences) >> 2) execution with execution (rmb, acquire) >> 3) propagation with execution (strong fences) >> >> where ordering with execution implicitly orders with propagation as well >> because things can only propagate after they execute. >> However, the 4th possibility (execution with only propagation) happens not >> to exist. I'm not sure if it would even be distinguishable from the second >> type. > Only in that such a memory barrier would order po-earlier anything > against po-later stores, whereas rmb orders loads against loads and > acquire orders loads against anything. > >> In the operational model, can you forward from stores that have not >> executed yet? > Yes, it is explicitly allowed. But forwarding doesn't apply in this > situation because stores can be forwarded only to po-later loads, not to > po-earlier ones. The reason I was asking is because if forwarding was forbidden from non-executed stores, execute-to-prop frences could potentially have observably different behavior from comparable execute-to-execute cases. It's moot because it's not forbidden, but if you want to see the reasoning, consider a case like this: load from y ; execute-to-prop-fence ; store to x ; ... ; load from x load from y ; execute-to-execute-fence ; store to x ; ... ; load from x (where both fences only order load->store). In the first case, x could execute before the load from y and the load from x could already execute. In the second case, x couldn't execute before the load from y and so (assuming you couldn't forward from non-executed stores) x couldn't execute. As a result, the second type of fence would have ordered the loads but the first one wouldn't. >>> Not quite right. A hypothetical non-A-cumulative case for pb would have >>> to omit the cumul-fence term entirely. >> Wouldn't that violate the transitivity of "X is required to propagate before >> Y" ? >> If I have >> X ->cumul-fence+ Y ->weird-strong-fence Z >> doesn't that mean that for every CPU C, >> 1. X is required to propagate to C before Y propagates to C >> 2. Y is required to propagate to C before any instruction po-after Z >> executes > Not if Y is a load. > > I guess one would have to put > > (cumul-fence+ ; [W])? > > or something like it in the definition. I suppose it's true that Y being a load would be an exception, but that would only be if the cumul-fence+ sequence either ends in a strong-fence, or in po-unlock-lock-po. We can ignore the first case (and the ordering would be provided anyways through pb at that point). For the po-unlock-lock-po, you can just take Y:=the LKW event of the unlock and repeat the argument. So I don't think the [W] is necessary. (and if it was maybe it would also be necessary in the definition of prop/cumul-fence itself, to account for all the non-A-cumulative fences in there). 
>
>
>> Thinking about prop and pb along these lines gives me a weird feeling.
>> Trying to pinpoint it down, it seems a little bit weird that A-cumul doesn't
>> appear around the strong-fence in pb.
> I think the reason it got left out was because all strong fences are
> A-cumulative. If some of them weren't, it would have to appear there in
> some form.
>
>> Of course it should not appear after
>> prop which already has an rfe? at the end. Nevertheless, having the rfe? at
>> the end is clearly important to representing the idea behind prop. If it
>> weren't for the fact that A-cumul is needed to define prop, it almost makes
>> me think that it would be nice to express the difference between
>> A-cumulative and non-A-cumulative fences (that order propagation) by saying
>> that an A-cumulative fence has
>> prop ; a-cumul-fence;rfe? <= prop
>> while the non-A-cumulative fence has
>> prop-without-rfe ; non-a-cumul-fence <= prop-without-rfe
> Isn't this just a more complicated way of saying what the A-cumul()
> macro expresses?

In the sense that I'm just stating some consequences of how A-cumul works
inside the model, yes. But at a syntactic level, no.
The A-cumul macro puts the rfe? at the front. Here I put the rfe? behind the
A-cumulative fence.
And I distinguish between a prop that may have rfe? at the end, and one that
doesn't, while the use of A-cumul only ever applies it to the
"prop-without-rfe", in the sense of
prop-without-rfe ; (A-cumul(...) | ...) <= prop-without-rfe

I think part of my weird feeling comes from this asymmetry between A-cumul()
putting the rfe? to the left and prop putting the rfe? to the right. Or more
precisely, that the latter is sometimes in anticipation of an A-cumulative
fence (where the A-cumul would normally take it to the left of that fence)
and sometimes just to express the idea of propagation, and that these are
the same, which should somehow lead to a simpler definition but doesn't.

>> I'm not against this partially overlapping kind of redundancy, but I dislike
>> subsuming kind of redundancy where some branches of the logic just never
>> need to be used.
> Consider: Could we remove all propagation-ordering fences from ppo
> because they are subsumed by prop? (Or is that just wrong?)

Surely not, since prop doesn't usually provide ordering by itself.

>>> In fact, I wouldn't mind removing the happens-before, propagation, and
>>> rcu axioms from LKMM entirely, replacing them with the single
>>> executes-before axiom.
>> I was planning to propose the same thing, however, I would also propose to
>> redefine hb and rb by dropping the hb/pb parts at the end of these
>> relations.
>>
>> hb = ....
>> pb = prop ; strong-fence ; [Marked]
>> rb = prop ; rcu-fence ; [Marked]
>>
>> xb = hb|pb|rb
>> acyclic xb
> I'm not so sure that's a good idea. For instance, it would require the
> definitions of rcu-link and rb to be changed from having (hb* ; pb*) to
> having (hb | pb)*.

I think that's an improvement. It's obvious that (hb | pb)* is right and so
is (pb | hb)*.
For (hb* ; pb*), the first reaction is "why do all the hb edges need to be
before the pb edges?", until one realizes that pb actually allows hb* at the
end, so in a sense this is hb* ; (pb ; hb*)*, and then one has to
understand that this means that the prop;strong-fence edges can appear any
number of times at arbitrary locations. It just seems like defining (pb |
hb)* with extra steps.
The order of nesting seems to be also somewhat a matter of preference, perhaps in some weird alternative universe the LKMM says pb = (prop\id)&int | prop;strong-fence and hb = (rfe | ppo);pb*. (Personally I think the current way is more reasonable than this one, but that might be because our preferences happen to align in this instance.) > Also, although it's not mentioned anywhere, the > definition of xbstar could be changed to hb* ; pb* ; rb* because each of > these relations absorbs a weaker one to its right. I wouldn't want to need to do this reasoning just to understand that it has arbitrarily many hb, pb, and rb edges. >>>> I'm wondering a little if there's some way in the middle, e.g., by writting >>>> short comments in the model wherever something is redundant. Something like >>>> (* note: strong-fence is included here for completeness, and can be safely >>>> ignored *). >>> I have no objection to doing that. It seems like a good idea. >>> >>> Alan >> Perhaps we can start a new thread then to discuss a few points where >> redundancies might be annotated this way or eliminated. > Sure, go ahead. I'll put it on my to-do-list, let's converge on some topics first :D best wishes, jonas
On Mon, Jan 23, 2023 at 07:25:48PM +0100, Jonas Oberhauser wrote: > Alright, after some synchronization in the other parts of this thread I am > beginning to prepare the next iteration of the patch. > > On 1/19/2023 4:13 AM, Alan Stern wrote: > > On Wed, Jan 18, 2023 at 10:38:11PM +0100, Jonas Oberhauser wrote: > > > > > > On 1/18/2023 8:52 PM, Alan Stern wrote: > > > > On Tue, Jan 17, 2023 at 08:31:59PM +0100, Jonas Oberhauser wrote: > > > > > - ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | > > > > > - ([M] ; po ; [UL] ; (co | po) ; [LKW] ; > > > > > - fencerel(After-unlock-lock) ; [M]) > > > > > + ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) > > > > Shouldn't the po case of (co | po) remain intact here? > > > You can leave it here, but it is already covered by two other parts: the > > > ordering given through ppo/hb is covered by the po-unlock-lock-po & int in > > > ppo, and the ordering given through pb is covered by its inclusion in > > > strong-order. > > What about the ordering given through > > A-cumul(strong-fence)/cumul-fence/prop/hb? I suppose that might be > > superseded by pb as well, but it seems odd not to have it in hb. > > How should we resolve this? > My current favorite (compromise :D) solution would be to > 1. still eliminate both po and co cases from first definition of > strong-fence which is used in ppo, > 2. define a relation equal to the strong-order in this patch (with po|rf) Wouldn't it need to have po|co? Consider: Wx=1 Rx=1 Ry=1 Rz=1 lock(s) lock(s) lock(s) unlock(s) unlock(s) unlock(s) Wy=1 Wz=1 smp_mb__after_unlock_lock Rx=0 With the co term this is forbidden. With only the rf term it is allowed, because po-unlock-lock-po isn't A-cumulative. > but call it strong-fence for now (in response to Andrea's valid criticism > that this patch is doing maybe more than just fix ppo) > 3. use the extended strong-fence in the definition of cumul-fence and pb > > So I'd still simplify po|co to po|rf and drop the po case from ppo, but > return both of those cases in cumul-fence, to be consistent with the idea > that cumul-fence should deal with the weak properties of the fences > including this after-unlock-lock fence. > > > Would that be acceptable? Subject to the point mentioned above, yes. Alan
On Mon, Jan 23, 2023 at 08:33:42PM +0100, Jonas Oberhauser wrote: > > > On 1/23/2023 6:28 PM, Alan Stern wrote: > The reason I was asking is because if forwarding was forbidden from > non-executed stores, execute-to-prop frences could potentially have > observably different behavior from comparable execute-to-execute cases. It's > moot because it's not forbidden, but if you want to see the reasoning, > consider a case like this: > > load from y ; execute-to-prop-fence ; store to x ; ... ; load from x > load from y ; execute-to-execute-fence ; store to x ; ... ; load from x > > (where both fences only order load->store). > In the first case, x could execute before the load from y and the load from > x could already execute. > In the second case, x couldn't execute before the load from y and so > (assuming you couldn't forward from non-executed stores) x couldn't execute. > As a result, the second type of fence would have ordered the loads but the > first one wouldn't. The prototype example demonstrating that store forwarding really happens looks like this: read y ; ctrl-dep ; write x ; read x ; addr-dep ; read z where forwarding allows x and z to be read before y. > > > > Not quite right. A hypothetical non-A-cumulative case for pb would have > > > > to omit the cumul-fence term entirely. > > > Wouldn't that violate the transitivity of "X is required to propagate before > > > Y" ? > > > If I have > > > X ->cumul-fence+ Y ->weird-strong-fence Z > > > doesn't that mean that for every CPU C, > > > 1. X is required to propagate to C before Y propagates to C > > > 2. Y is required to propagate to C before any instruction po-after Z > > > executes > > Not if Y is a load. > > > > I guess one would have to put > > > > (cumul-fence+ ; [W])? > > > > or something like it in the definition. > > I suppose it's true that Y being a load would be an exception, but that > would only be if the cumul-fence+ sequence either ends in a strong-fence, or > in po-unlock-lock-po. > > We can ignore the first case (and the ordering would be provided anyways > through pb at that point). > For the po-unlock-lock-po, you can just take Y:=the LKW event of the unlock > and repeat the argument. And yet you complained about the reasoning needed to understand that (pb ; hb) <= pb! Not to mention the brittleness of this argument; what if in the future cumul-fence gets another term ending in a load? > So I don't think the [W] is necessary. (and if it was maybe it would also be > necessary in the definition of prop/cumul-fence itself, to account for all > the non-A-cumulative fences in there). > > I think part of my weird feeling comes from this asymmetry between A-cumul() > putting the rfe? to the left and prop putting the rfe? to the right. Or more > precisely, that the latter is sometimes in anticipation of an A-cumulative > fence (where the A-cumul would normally take it to the left of that fence) > and sometimes just to express the idea of propagation, and that these are > the same, which should somehow lead to a simpler definition but doesn't. Well, consider that maybe they aren't the same. :-) The definition of prop is a little more complicated than one might expect, because the overwrite and cumul-fence parts are both optional. Leaving one or both of them out is valid, but it requires a little extra thought to see why. > > > I'm not against this partially overlapping kind of redundancy, but I dislike > > > subsuming kind of redundancy where some branches of the logic just never > > > need to be used. 
> > Consider: Could we remove all propagation-ordering fences from ppo > > because they are subsumed by prop? (Or is that just wrong?) > > Surely not, since prop doesn't usually provide ordering by itself. Sorry, I meant the prop-related non-ppo parts of hb and pb. > > > > In fact, I wouldn't mind removing the happens-before, propagation, and > > > > rcu axioms from LKMM entirely, replacing them with the single > > > > executes-before axiom. > > > I was planning to propose the same thing, however, I would also propose to > > > redefine hb and rb by dropping the hb/pb parts at the end of these > > > relations. > > > > > > hb = .... > > > pb = prop ; strong-fence ; [Marked] > > > rb = prop ; rcu-fence ; [Marked] > > > > > > xb = hb|pb|rb > > > acyclic xb > > I'm not so sure that's a good idea. For instance, it would require the > > definitions of rcu-link and rb to be changed from having (hb* ; pb*) to > > having (hb | pb)*. > I think that's an improvement. It's obvious that (hb | pb)* is right and so > is (pb | hb)*. > For (hb* ; pb*), the first reaction is "why do all the hb edges need to be > before the pb edges?", until one realizes that pb actually allows hb* at the > end, so in a sense this is hb* ; (pb ; hb*)*, and then one has to > understand that this means that the prop;strong-fence edges can appear any > number of times at arbitrary locations. It just seems like defining (pb | > hb)* with extra steps. This can be mentioned explicitly as a comment or in explanation.txt. > The order of nesting seems to be also somewhat a matter of preference, > perhaps in some weird alternative universe the LKMM says pb = (prop\id)&int > | prop;strong-fence and hb = (rfe | ppo);pb*. (Personally I think the > current way is more reasonable than this one, but that might be because our > preferences happen to align in this instance.) You can't define hb that way, because the definition of hb appears before the definition of pb. And it has to be this way, because hb is used in the definition of pb. Alan
On 1/23/2023 9:25 PM, Alan Stern wrote: > On Mon, Jan 23, 2023 at 07:25:48PM +0100, Jonas Oberhauser wrote: >> Alright, after some synchronization in the other parts of this thread I am >> beginning to prepare the next iteration of the patch. >> >> On 1/19/2023 4:13 AM, Alan Stern wrote: >>> On Wed, Jan 18, 2023 at 10:38:11PM +0100, Jonas Oberhauser wrote: >>>> On 1/18/2023 8:52 PM, Alan Stern wrote: >>>>> On Tue, Jan 17, 2023 at 08:31:59PM +0100, Jonas Oberhauser wrote: >>>>>> - ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | >>>>>> - ([M] ; po ; [UL] ; (co | po) ; [LKW] ; >>>>>> - fencerel(After-unlock-lock) ; [M]) >>>>>> + ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) >>>>> Shouldn't the po case of (co | po) remain intact here? >>>> You can leave it here, but it is already covered by two other parts: the >>>> ordering given through ppo/hb is covered by the po-unlock-lock-po & int in >>>> ppo, and the ordering given through pb is covered by its inclusion in >>>> strong-order. >>> What about the ordering given through >>> A-cumul(strong-fence)/cumul-fence/prop/hb? I suppose that might be >>> superseded by pb as well, but it seems odd not to have it in hb. >> How should we resolve this? >> My current favorite (compromise :D) solution would be to >> 1. still eliminate both po and co cases from first definition of >> strong-fence which is used in ppo, >> 2. define a relation equal to the strong-order in this patch (with po|rf) > Wouldn't it need to have po|co? Consider: > > Wx=1 Rx=1 Ry=1 Rz=1 > lock(s) lock(s) lock(s) > unlock(s) unlock(s) unlock(s) > Wy=1 Wz=1 smp_mb__after_unlock_lock > Rx=0 > > With the co term this is forbidden. With only the rf term it is > allowed, because po-unlock-lock-po isn't A-cumulative. No, but unlock() is ( https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git/tree/tools/memory-model/lock.cat?h=dev.2023.01.19a#n67 ). So you get Rx=0 ->overwrite Wx=1 ->rfe Rx1 ->po-rel T1:unlock(s) ->rfe T2:lock(s) ->po-unlock-lock-po;after ... fence;po Rx=0 which is Rx=0 ->prop ; po-unlock-lock-po;after ... fence;po Rx=0 Are you ok going forward like this then? If not, I might prefer to redefine po-unlock-lock-po into something that works for all its use cases if possible. I think | po ; [UL] ; (po|co?;rf) ; [LKR] ; po |might be such a definition but haven't fully thought about it. best wishes, jonas
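For reference, side by side (the first line is the current definition in
lock.cat as far as I remember it; the second is the untested variant I have
in mind):

    (* current, lock.cat *)
    let po-unlock-lock-po = po ; [UL] ; (po | rf) ; [LKR] ; po

    (* hypothetical variant that would also cover the co case *)
    let po-unlock-lock-po = po ; [UL] ; (po | co? ; rf) ; [LKR] ; po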
On 1/23/2023 10:10 PM, Alan Stern wrote: > On Mon, Jan 23, 2023 at 08:33:42PM +0100, Jonas Oberhauser wrote: >> >> On 1/23/2023 6:28 PM, Alan Stern wrote: >> >>> >>> I guess one would have to put >>> >>> (cumul-fence+ ; [W])? >>> >>> or something like it in the definition. >> I suppose it's true that Y being a load would be an exception, but that >> would only be if the cumul-fence+ sequence either ends in a strong-fence, or >> in po-unlock-lock-po. >> >> We can ignore the first case (and the ordering would be provided anyways >> through pb at that point). >> For the po-unlock-lock-po, you can just take Y:=the LKW event of the unlock >> and repeat the argument. > And yet you complained about the reasoning needed to understand that > (pb ; hb) <= pb! Eh, I can't help it, my first instinct is always going to be to make things shorter :D > Not to mention the brittleness of this argument; what > if in the future cumul-fence gets another term ending in a load? After mulling it over a bit in my big old head, I consider that even though dropping the [W] may be shorter, it might make for the simpler model by excluding lots of cases. That makes me think you should do it for real in the definition of prop. And not just at the very end, because in fact each cumul-fence link might come from a non-A-cumulative fence. So the same argument you are giving should be applied recursively. Either prop = (overwrite & ext)? ; (cumul-fence; [W])* ; rfe? or integrate it directly into cumul-fence. >> So I don't think the [W] is necessary. (and if it was maybe it would also be >> necessary in the definition of prop/cumul-fence itself, to account for all >> the non-A-cumulative fences in there). >> >> I think part of my weird feeling comes from this asymmetry between A-cumul() >> putting the rfe? to the left and prop putting the rfe? to the right. Or more >> precisely, that the latter is sometimes in anticipation of an A-cumulative >> fence (where the A-cumul would normally take it to the left of that fence) >> and sometimes just to express the idea of propagation, and that these are >> the same, which should somehow lead to a simpler definition but doesn't. > Well, consider that maybe they aren't the same. :-) > > The definition of prop is a little more complicated than one might > expect, because the overwrite and cumul-fence parts are both optional. > Leaving one or both of them out is valid, but it requires a little extra > thought to see why. Let's at this point in time not get started on the overwrite part being optional :D (see, this is me successfully holding myself back from opening another discussion! I can do it!). > >>>> I'm not against this partially overlapping kind of redundancy, but I dislike >>>> subsuming kind of redundancy where some branches of the logic just never >>>> need to be used. >>> Consider: Could we remove all propagation-ordering fences from ppo >>> because they are subsumed by prop? (Or is that just wrong?) >> Surely not, since prop doesn't usually provide ordering by itself. > Sorry, I meant the prop-related non-ppo parts of hb and pb. I still don't follow :( Can you write some equations to show me what you mean? >>>>> In fact, I wouldn't mind removing the happens-before, propagation, and >>>>> rcu axioms from LKMM entirely, replacing them with the single >>>>> executes-before axiom. >>>> I was planning to propose the same thing, however, I would also propose to >>>> redefine hb and rb by dropping the hb/pb parts at the end of these >>>> relations. >>>> >>>> hb = .... 
>>>> pb = prop ; strong-fence ; [Marked] >>>> rb = prop ; rcu-fence ; [Marked] >>>> >>>> xb = hb|pb|rb >>>> acyclic xb >>> I'm not so sure that's a good idea. For instance, it would require the >>> definitions of rcu-link and rb to be changed from having (hb* ; pb*) to >>> having (hb | pb)*. >> I think that's an improvement. It's obvious that (hb | pb)* is right and so >> is (pb | hb)*. >> For (hb* ; pb*), the first reaction is "why do all the hb edges need to be >> before the pb edges?", until one realizes that pb actually allows hb* at the >> end, so in a sense this is hb* ; (pb ; hb*)*, and then one has to >> understand that this means that the prop;strong-fence edges can appear any >> number of times at arbitrary locations. It just seems like defining (pb | >> hb)* with extra steps. > This can be mentioned explicitly as a comment or in explanation.txt. Ok, but why not just use (pb|hb)* and (pb|hb|rb)* and not worry about having to explain anything? And make the hb and rb definitions simpler at the same time? >> The order of nesting seems to be also somewhat a matter of preference, >> perhaps in some weird alternative universe the LKMM says pb = (prop\id)&int >> | prop;strong-fence and hb = (rfe | ppo);pb*. (Personally I think the >> current way is more reasonable than this one, but that might be because our >> preferences happen to align in this instance.) > You can't define hb that way, because the definition of hb appears > before the definition of pb. And it has to be this way, because hb is > used in the definition of pb. Note that in that alternative universe, pb = (prop\id)&int | prop;strong-fence which doesn't require any definition of hb. Best wishes, jonas
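To spell out what I mean, the simplified core would be roughly this (sketch
only; hb itself stays as it is, and I'm ignoring the plain-accesses part of
the model):

    (* the "basic blocks", without the trailing hb*/pb* *)
    let pb = prop ; strong-fence ; [Marked]
    let rb = prop ; rcu-fence ; [Marked]

    (* one executes-before axiom instead of the hb/pb/rb axioms *)
    let xb = hb | pb | rb
    acyclic xb as executes-before

    (* users of hb* ; pb* then just take the union, e.g. *)
    let xbstar = (hb | pb | rb)*
    (* assuming rcu-link otherwise keeps its current shape *)
    let rcu-link = po? ; (hb | pb)* ; prop ; po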
On Tue, Jan 24, 2023 at 01:54:14PM +0100, Jonas Oberhauser wrote: > > > On 1/23/2023 9:25 PM, Alan Stern wrote: > > On Mon, Jan 23, 2023 at 07:25:48PM +0100, Jonas Oberhauser wrote: > > > Alright, after some synchronization in the other parts of this thread I am > > > beginning to prepare the next iteration of the patch. > > > > > > On 1/19/2023 4:13 AM, Alan Stern wrote: > > > > On Wed, Jan 18, 2023 at 10:38:11PM +0100, Jonas Oberhauser wrote: > > > > > On 1/18/2023 8:52 PM, Alan Stern wrote: > > > > > > On Tue, Jan 17, 2023 at 08:31:59PM +0100, Jonas Oberhauser wrote: > > > > > > > - ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | > > > > > > > - ([M] ; po ; [UL] ; (co | po) ; [LKW] ; > > > > > > > - fencerel(After-unlock-lock) ; [M]) > > > > > > > + ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) > > > > > > Shouldn't the po case of (co | po) remain intact here? > > > > > You can leave it here, but it is already covered by two other parts: the > > > > > ordering given through ppo/hb is covered by the po-unlock-lock-po & int in > > > > > ppo, and the ordering given through pb is covered by its inclusion in > > > > > strong-order. > > > > What about the ordering given through > > > > A-cumul(strong-fence)/cumul-fence/prop/hb? I suppose that might be > > > > superseded by pb as well, but it seems odd not to have it in hb. > > > How should we resolve this? > > > My current favorite (compromise :D) solution would be to > > > 1. still eliminate both po and co cases from first definition of > > > strong-fence which is used in ppo, > > > 2. define a relation equal to the strong-order in this patch (with po|rf) > > Wouldn't it need to have po|co? Consider: > > > > Wx=1 Rx=1 Ry=1 Rz=1 > > lock(s) lock(s) lock(s) > > unlock(s) unlock(s) unlock(s) > > Wy=1 Wz=1 smp_mb__after_unlock_lock > > Rx=0 > > > > With the co term this is forbidden. With only the rf term it is > > allowed, because po-unlock-lock-po isn't A-cumulative. > No, but unlock() is ( https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git/tree/tools/memory-model/lock.cat?h=dev.2023.01.19a#n67 > ). So you get So it is. I had forgotten about that. The model is getting too complicated to fit entirely in my mind... > Rx=0 ->overwrite Wx=1 ->rfe Rx1 ->po-rel T1:unlock(s) ->rfe T2:lock(s) > ->po-unlock-lock-po;after ... fence;po Rx=0 > which is > Rx=0 ->prop ; po-unlock-lock-po;after > ... fence;po Rx=0 > > Are you ok going forward like this then? I guess so, provided we mention somewhere in the code or documentation that this relation extends beyond a single rf. Alan
On Tue, Jan 24, 2023 at 02:14:03PM +0100, Jonas Oberhauser wrote: > After mulling it over a bit in my big old head, I consider that even though > dropping the [W] may be shorter, it might make for the simpler model by > excluding lots of cases. > That makes me think you should do it for real in the definition of prop. And > not just at the very end, because in fact each cumul-fence link might come > from a non-A-cumulative fence. So the same argument you are giving should be > applied recursively. > Either > > prop = (overwrite & ext)? ; (cumul-fence; [W])* ; rfe? > > or integrate it directly into cumul-fence. I dislike this sort of argument. I understand the formal memory model by relating it to the informal operational model. Thus, cumul-fence links a write W to another event E when the fence guarantees that W will propagate to E's CPU before E executes. That's how the memory model expresses the propagation properties of these fences. I don't want to rule out the possibility that E is a read merely because cumul-fence might be followed by another, A-cumulative fence. If E=read were ruled out then cumul-fence would not properly express the propagation properties of the fences. > > > > Consider: Could we remove all propagation-ordering fences from ppo > > > > because they are subsumed by prop? (Or is that just wrong?) > > > Surely not, since prop doesn't usually provide ordering by itself. > > Sorry, I meant the prop-related non-ppo parts of hb and pb. > > I still don't follow :( Can you write some equations to show me what you > mean? Consider: Rx=1 Ry=1 Wrelease Y=1 Wx=1 Here we have Wx=1 ->hb* Ry=1 by (prop \ id) & int, using the fact that Wy=1 is an A-cumulative release fence. But we also have Wx=1 ->rfe Rx=1 ->ppo Wy=1 ->rfe Ry=1. Thus there are two distinct ways of proving that Wx=1 ->hb* Ry=1. If we removed the fence term from the definition of ppo (or weakened it to just rmb | acq), we would eliminate the second, redundant proof. Is this the sort of thing you think we should do? > > > > > > In fact, I wouldn't mind removing the happens-before, propagation, and > > > > > > rcu axioms from LKMM entirely, replacing them with the single > > > > > > executes-before axiom. > > > > > I was planning to propose the same thing, however, I would also propose to > > > > > redefine hb and rb by dropping the hb/pb parts at the end of these > > > > > relations. > > > > > > > > > > hb = .... > > > > > pb = prop ; strong-fence ; [Marked] > > > > > rb = prop ; rcu-fence ; [Marked] > > > > > > > > > > xb = hb|pb|rb > > > > > acyclic xb > > > > I'm not so sure that's a good idea. For instance, it would require the > > > > definitions of rcu-link and rb to be changed from having (hb* ; pb*) to > > > > having (hb | pb)*. > > > I think that's an improvement. It's obvious that (hb | pb)* is right and so > > > is (pb | hb)*. > > > For (hb* ; pb*), the first reaction is "why do all the hb edges need to be > > > before the pb edges?", until one realizes that pb actually allows hb* at the > > > end, so in a sense this is hb* ; (pb ; hb*)*, and then one has to > > > understand that this means that the prop;strong-fence edges can appear any > > > number of times at arbitrary locations. It just seems like defining (pb | > > > hb)* with extra steps. > > This can be mentioned explicitly as a comment or in explanation.txt. > Ok, but why not just use (pb|hb)* and (pb|hb|rb)* and not worry about > having to explain anything? > And make the hb and rb definitions simpler at the same time? 
Do you think (pb | hb)* is simpler than pb* (as in the statement of the propagation axiom)? Besides, remember what I said about understanding the formal memory model in terms of the operational model. pb relates a write W to another event E when the strong fence guarantees that W will propagate to E's CPU before E executes. If the hb* term were omitted from the definition of pb, this wouldn't be true any more. Or at least, not as true as it should be. Alan
On 1/24/2023 6:14 PM, Alan Stern wrote: > On Tue, Jan 24, 2023 at 02:14:03PM +0100, Jonas Oberhauser wrote: >> After mulling it over a bit in my big old head, I consider that even though >> dropping the [W] may be shorter, it might make for the simpler model by >> excluding lots of cases. >> That makes me think you should do it for real in the definition of prop. And >> not just at the very end, because in fact each cumul-fence link might come >> from a non-A-cumulative fence. So the same argument you are giving should be >> applied recursively. >> Either >> >> prop = (overwrite & ext)? ; (cumul-fence; [W])* ; rfe? >> >> or integrate it directly into cumul-fence. > I dislike this sort of argument. I understand the formal memory model > by relating it to the informal operational model. Thus, cumul-fence > links a write W to another event E when the fence guarantees that W will > propagate to E's CPU before E executes. I later wondered why it's not defined like this and realized that prop means that it's before E executes. > That's how the memory model > expresses the propagation properties of these fences. I don't think that's really a perfect match though. For example, W ->wmb E (and thus cumul-fence) does guarantee that W propagates to E's CPU before E executes. But the propagation property of wmb is that W propagates to every CPU before E propagates to that CPU. It just so happens that the time E propagates to E's CPU is the time it executes. Indeed, looking at the non-strong properties of fences only, should give rise to a relation that only says "W propagates to any CPU before E propagates to that CPU" and that is a relation between stores. And quite different from "W propagates to E's CPU before E executes". I believe that relation is (cumul-fence;[W])+. Then X ->(overwrite&ext);(cumul-fence;[W])* E means that there is some W co-after X which propagates to any CPU no later than E due to the weak properties of fences along that path. And X ->(overwrite&ext);(cumul-fence;[W])*;rfe? E implies that there is some W co-after X which propagates to the CPU executing E no later than E executes. (because E observes or executes and hence propapagated to itself a store that must propagate to E's CPU no earlier than W). I think this is closer to the idea of expressing the (non-strong) propagation properties of the fences. > I don't want to > rule out the possibility that E is a read merely because cumul-fence > might be followed by another, A-cumulative fence. Perhaps you mean non-A-cumulative fence? The A-cumulative fences (when their A-cumulativity is actually used) already rule out reads because they use overwrite;cumul-fence*;rfe;(the a-cumulativity) > >>>>> Consider: Could we remove all propagation-ordering fences from ppo >>>>> because they are subsumed by prop? (Or is that just wrong?) >>>> Surely not, since prop doesn't usually provide ordering by itself. >>> Sorry, I meant the prop-related non-ppo parts of hb and pb. >> I still don't follow :( Can you write some equations to show me what you >> mean? > Consider: > > Rx=1 Ry=1 > Wrelease Y=1 Wx=1 > > Here we have Wx=1 ->hb* Ry=1 by (prop \ id) & int, using the fact that > Wy=1 is an A-cumulative release fence. But we also have > > Wx=1 ->rfe Rx=1 ->ppo Wy=1 ->rfe Ry=1. > > Thus there are two distinct ways of proving that Wx=1 ->hb* Ry=1. If we > removed the fence term from the definition of ppo (or weakened it to > just rmb | acq), we would eliminate the second, redundant proof. Is > this the sort of thing you think we should do? 
The reason I wouldn't do something like that is that, firstly, the fence does
preserve the program order, and secondly, there are proofs where you need to
use that fact.

>>>>>>> In fact, I wouldn't mind removing the happens-before, propagation, and
>>>>>>> rcu axioms from LKMM entirely, replacing them with the single
>>>>>>> executes-before axiom.
>>>>>> I was planning to propose the same thing, however, I would also propose to
>>>>>> redefine hb and rb by dropping the hb/pb parts at the end of these
>>>>>> relations.
>>>>>>
>>>>>> hb = ....
>>>>>> pb = prop ; strong-fence ; [Marked]
>>>>>> rb = prop ; rcu-fence ; [Marked]
>>>>>>
>>>>>> xb = hb|pb|rb
>>>>>> acyclic xb
>>>>> I'm not so sure that's a good idea. For instance, it would require the
>>>>> definitions of rcu-link and rb to be changed from having (hb* ; pb*) to
>>>>> having (hb | pb)*.
>>>> I think that's an improvement. It's obvious that (hb | pb)* is right and so
>>>> is (pb | hb)*.
>>>> For (hb* ; pb*), the first reaction is "why do all the hb edges need to be
>>>> before the pb edges?", until one realizes that pb actually allows hb* at the
>>>> end, so in a sense this is hb* ; (pb ; hb*)*, and then one has to
>>>> understand that this means that the prop;strong-fence edges can appear any
>>>> number of times at arbitrary locations. It just seems like defining (pb |
>>>> hb)* with extra steps.
>>> This can be mentioned explicitly as a comment or in explanation.txt.
>> Ok, but why not just use (pb|hb)* and (pb|hb|rb)* and not worry about
>> having to explain anything?
>> And make the hb and rb definitions simpler at the same time?
> Do you think (pb | hb)* is simpler than pb* (as in the statement of the
> propagation axiom)?

pb+; however, aren't you thinking of getting rid of the propagation axiom?
I still think (pb' | hb)+, where pb' is the simpler definition of pb, is
simpler than pb*, where pb = pb' ; hb*.

> Besides, remember what I said about understanding the formal memory
> model in terms of the operational model. pb relates a write W to
> another event E when the strong fence guarantees that W will propagate
> to E's CPU before E executes.

I suppose you mean to every CPU before E executes?

> If the hb* term were omitted from the definition of pb, this wouldn't be
> true any more. Or at least, not as true as it should be.

Why is that the right level of how true it should be?
Doesn't W ->pb;rb E also guarantee that W will propagate to E's CPU before E
executes? Or even just W ->pb;pb E? Why only consider W->pb;hb E?

I'd rather think of it in terms of "this is the basic block that implies
that W executes before E, because it propagates to every CPU before E
executes", and then you can of course extend it by adding any of pb, rb, and
hb at the end to get a longer "executes before".

Best wishes, jonas
On Tue, Jan 24, 2023 at 09:23:02PM +0100, Jonas Oberhauser wrote: > > > On 1/24/2023 6:14 PM, Alan Stern wrote: > > On Tue, Jan 24, 2023 at 02:14:03PM +0100, Jonas Oberhauser wrote: > > > After mulling it over a bit in my big old head, I consider that even though > > > dropping the [W] may be shorter, it might make for the simpler model by > > > excluding lots of cases. > > > That makes me think you should do it for real in the definition of prop. And > > > not just at the very end, because in fact each cumul-fence link might come > > > from a non-A-cumulative fence. So the same argument you are giving should be > > > applied recursively. > > > Either > > > > > > prop = (overwrite & ext)? ; (cumul-fence; [W])* ; rfe? > > > > > > or integrate it directly into cumul-fence. > > I dislike this sort of argument. I understand the formal memory model > > by relating it to the informal operational model. Thus, cumul-fence > > links a write W to another event E when the fence guarantees that W will > > propagate to E's CPU before E executes. > > I later wondered why it's not defined like this and realized that prop means > that it's before E executes. > > > That's how the memory model > > expresses the propagation properties of these fences. > > I don't think that's really a perfect match though. > For example, W ->wmb E (and thus cumul-fence) does guarantee that W > propagates to E's CPU before E executes. > But the propagation property of wmb is that W propagates to every CPU before > E propagates to that CPU. > It just so happens that the time E propagates to E's CPU is the time it > executes. > > Indeed, looking at the non-strong properties of fences only, should give > rise to a relation that only says "W propagates to any CPU before E > propagates to that CPU" and that is a relation between stores. And quite > different from "W propagates to E's CPU before E executes". > > I believe that relation is (cumul-fence;[W])+. Add an rfe? to the end and you get the "before E executes" version. Or more accurately (rfe? ; ppo*). Hmmm, the only reason for omitting that ppo* term in the model is that it would never be needed. So maybe we should after all do the same for the hb* term at the end of pb and the (hb* | pb*) part at the end of rb. Starting from first principles, it's apparent that each of these types of propagation fences is associated with two relations: one involving propagation order and a companion relation involving execution order. Here's what I mean. For the sake of discussion let's define several classes of fences: efences are those which constrain execution order; pfences are those which constrain propagation order; sfences are those which strongly constrain propagation order. Each class includes the following ones. (And if you like, you can insert afences between pfences and sfences -- they would be the A-cumulative fences.) Now, the memory model builds up successively more inclusive notions of execution order. This process starts with execution of instructions in the same CPU not involving fences. Thus we have the ppo relations: dependencies and a few oddball things like ((overwrite ; rfe) & int) or ([UL] ; po ; [LKR]). Next, the efences also restrict single-CPU execution order. These fences only need to have one associated relation since they don't specifically involve propagation. Adding rfe to the list gives us inter-CPU ordering. Then associated with pfences we have the relation you've been talking about: W propagates to each CPU before W' does. This is (cumul-fence ; [W]). 
Perhaps a better name for it would be wprop. Given this relation, we obtain a companion relation that restricts execution order: ((overwrite & ext) ; wprop+ ; rfe) & int. (Note that the overall form is the same for afences as for pfences.) Adding this companion relation into the mix gives us essentially hb. For sfences the associated relation expresses: W propagates to every CPU before Y executes. This is basically (wprop* ; rfe? ; sfence) (using the fact that all sfences are A-cumulative) -- or if you prefer, (wprop* ; cumul-sfence). We can call this sprop. Then the companion relation restricting execution order is: (overwrite & ext) ; sprop For RCU, the associated relation expressing t2(A) < t1(B) is rcu-order and the companion relation is rcu-fence. Putting all those execution-order relations together gives us xb, the executes-before relation. Then the only axiom we need for all of this that xb is acyclic. Of course, I have left out a lot of details. Still, how does that sound as a scheme for rationalizing the memory model? Alan
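In cat-like notation, the scheme might come out roughly as below. Every name
here (wprop, wprop-xb, sprop, sprop-xb) is made up purely for illustration,
and the details would still need to be checked against the real model:

    (* pfences: W propagates to each CPU before W' does *)
    let wprop = cumul-fence ; [W]
    (* companion execution-order relation for the pfences *)
    let wprop-xb = ((overwrite & ext) ; wprop+ ; rfe) & int

    (* sfences: W propagates to every CPU before Y executes *)
    let sprop = wprop* ; rfe? ; strong-fence
    (* companion execution-order relation for the sfences *)
    let sprop-xb = (overwrite & ext) ; sprop

    (* executes-before: intra-CPU ordering (ppo, here including the
       efences), rfe, and the companion relations *)
    let xb = ppo | rfe | wprop-xb | sprop-xb | rcu-fence
    acyclic xb as executes-before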
On 1/25/2023 3:57 AM, Alan Stern wrote: > On Tue, Jan 24, 2023 at 09:23:02PM +0100, Jonas Oberhauser wrote: >> >> On 1/24/2023 6:14 PM, Alan Stern wrote: >>> On Tue, Jan 24, 2023 at 02:14:03PM +0100, Jonas Oberhauser wrote: >>>> After mulling it over a bit in my big old head, I consider that even though >>>> dropping the [W] may be shorter, it might make for the simpler model by >>>> excluding lots of cases. >>>> That makes me think you should do it for real in the definition of prop. And >>>> not just at the very end, because in fact each cumul-fence link might come >>>> from a non-A-cumulative fence. So the same argument you are giving should be >>>> applied recursively. >>>> Either >>>> >>>> prop = (overwrite & ext)? ; (cumul-fence; [W])* ; rfe? >>>> >>>> or integrate it directly into cumul-fence. >>> I dislike this sort of argument. I understand the formal memory model >>> by relating it to the informal operational model. Thus, cumul-fence >>> links a write W to another event E when the fence guarantees that W will >>> propagate to E's CPU before E executes. >> I later wondered why it's not defined like this and realized that prop means >> that it's before E executes. >> >>> That's how the memory model >>> expresses the propagation properties of these fences. >> I don't think that's really a perfect match though. >> For example, W ->wmb E (and thus cumul-fence) does guarantee that W >> propagates to E's CPU before E executes. >> But the propagation property of wmb is that W propagates to every CPU before >> E propagates to that CPU. >> It just so happens that the time E propagates to E's CPU is the time it >> executes. >> >> Indeed, looking at the non-strong properties of fences only, should give >> rise to a relation that only says "W propagates to any CPU before E >> propagates to that CPU" and that is a relation between stores. And quite >> different from "W propagates to E's CPU before E executes". >> >> I believe that relation is (cumul-fence;[W])+. > Add an rfe? to the end and you get the "before E executes" version. Yes, but with the minor caveat that this is only for the "because of the weak propagation ordering of fences (pfence)" case. Current prop also includes some other "before E executes" cases, e.g., when the last fence is po-unlock-lock-po or a strong-fence. > Or more accurately (rfe? ; ppo*). Hmmm, the only reason for omitting that > ppo* term in the model is that it would never be needed. So maybe we > should after all do the same for the hb* term at the end of pb and the > (hb* | pb*) part at the end of rb. > > > Starting from first principles, it's apparent that each of these types > of propagation fences is associated with two relations: one involving > propagation order and a companion relation involving execution order. > > Here's what I mean. For the sake of discussion let's define several > classes of fences: > > efences are those which constrain execution order; > > pfences are those which constrain propagation order; > > sfences are those which strongly constrain propagation order. > > Each class includes the following ones. (And if you like, you can > insert afences between pfences and sfences -- they would be the > A-cumulative fences.) > > Now, the memory model builds up successively more inclusive notions of > execution order. This process starts with execution of instructions in > the same CPU not involving fences. Thus we have the ppo relations: > dependencies and a few oddball things like ((overwrite ; rfe) & int) or > ([UL] ; po ; [LKR]). 
> > Next, the efences also restrict single-CPU execution order. These > fences only need to have one associated relation since they don't > specifically involve propagation. Adding rfe to the list gives us > inter-CPU ordering. > > Then associated with pfences we have the relation you've been talking > about: > > W propagates to each CPU before W' does. > > This is (cumul-fence ; [W]). Perhaps a better name for it would be > wprop. Given this relation, we obtain a companion relation that > restricts execution order: > > ((overwrite & ext) ; wprop+ ; rfe) & int. > > (Note that the overall form is the same for afences as for pfences.) > Adding this companion relation into the mix gives us essentially hb. > > For sfences the associated relation expresses: > > W propagates to every CPU before Y executes. > > This is basically (wprop* ; rfe? ; sfence) (using the fact that all > sfences are A-cumulative) -- or if you prefer, (wprop* ; cumul-sfence). > We can call this sprop. Then the companion relation restricting > execution order is: > > (overwrite & ext) ; sprop > > For RCU, the associated relation expressing t2(A) < t1(B) is rcu-order > and the companion relation is rcu-fence. Do we put rcu-order under sprop as well? Otherwise you need (overwrite & ext)? ; rcu-fence to express the full companion relation. > > Putting all those execution-order relations together gives us xb, the > executes-before relation. Then the only axiom we need for all of this > that xb is acyclic. > > Of course, I have left out a lot of details. Still, how does that sound > as a scheme for rationalizing the memory model? It seems like we're on the same page! It would be an honor for me to fill in the details and propose a patch, if you're interested. Have fun, jonas
On Wed, Jan 25, 2023 at 02:06:01PM +0100, Jonas Oberhauser wrote: > > > On 1/25/2023 3:57 AM, Alan Stern wrote: > > Starting from first principles, it's apparent that each of these types > > of propagation fences is associated with two relations: one involving > > propagation order and a companion relation involving execution order. > > > > Here's what I mean. For the sake of discussion let's define several > > classes of fences: > > > > efences are those which constrain execution order; > > > > pfences are those which constrain propagation order; > > > > sfences are those which strongly constrain propagation order. > > > > Each class includes the following ones. (And if you like, you can > > insert afences between pfences and sfences -- they would be the > > A-cumulative fences.) > > > > Now, the memory model builds up successively more inclusive notions of > > execution order. This process starts with execution of instructions in > > the same CPU not involving fences. Thus we have the ppo relations: > > dependencies and a few oddball things like ((overwrite ; rfe) & int) or > > ([UL] ; po ; [LKR]). > > > > Next, the efences also restrict single-CPU execution order. These > > fences only need to have one associated relation since they don't > > specifically involve propagation. Adding rfe to the list gives us > > inter-CPU ordering. > > > > Then associated with pfences we have the relation you've been talking > > about: > > > > W propagates to each CPU before W' does. > > > > This is (cumul-fence ; [W]). Perhaps a better name for it would be > > wprop. Given this relation, we obtain a companion relation that > > restricts execution order: > > > > ((overwrite & ext) ; wprop+ ; rfe) & int. > > > > (Note that the overall form is the same for afences as for pfences.) > > Adding this companion relation into the mix gives us essentially hb. > > > > For sfences the associated relation expresses: > > > > W propagates to every CPU before Y executes. > > > > This is basically (wprop* ; rfe? ; sfence) (using the fact that all > > sfences are A-cumulative) -- or if you prefer, (wprop* ; cumul-sfence). > > We can call this sprop. Then the companion relation restricting > > execution order is: > > > > (overwrite & ext) ; sprop > > > > For RCU, the associated relation expressing t2(A) < t1(B) is rcu-order > > and the companion relation is rcu-fence. > Do we put rcu-order under sprop as well? Otherwise you need > > (overwrite & ext)? ; rcu-fence > > to express the full companion relation. My mistake; what I meant was something a little smaller than rb. That is, (overwrite & ext) ; wprop* ; rfe? ; rcu-fence In other words, a relation expressing that rcu-fence acts as a strong fence. But also something expressing that rcu-fence acts as an efence? -- I guess this is covered if the (overwrite & ext) part above is made optional, although that feels a little artificial. I don't think we will be able to include rcu-fence in sprop, because we will need to use sprop in the definition of rcu-fence. Oh yes, one other piece of terminology I forgot to mention. The things you referred to before as "ordering operations" could instead be called "extended fences". What do you think? > > Putting all those execution-order relations together gives us xb, the > > executes-before relation. Then the only axiom we need for all of this > > that xb is acyclic. > > > > Of course, I have left out a lot of details. Still, how does that sound > > as a scheme for rationalizing the memory model? 
> > It seems like we're on the same page! > It would be an honor for me to fill in the details and propose a patch, if > you're interested. An invasive, multi-faceted change like this has to be broken down into multiple patches, each doing only one thing and each easily verified as not changing the meaning of the code. Feel free to go ahead and work out a proposal. Alan
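Picking up the illustrative names from the sketch in my earlier message, the
RCU piece would then be something like (again, only a sketch):

    (* rcu-fence acting as a strong (extended) fence *)
    let rcu-sprop-xb = (overwrite & ext) ; wprop* ; rfe? ; rcu-fence

    (* making the initial (overwrite & ext) optional would also cover
       rcu-fence acting as a plain efence *)
    let xb = ppo | rfe | wprop-xb | sprop-xb | rcu-fence | rcu-sprop-xb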
diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt index e901b47236c3..4f5f6ac098de 100644 --- a/tools/memory-model/Documentation/explanation.txt +++ b/tools/memory-model/Documentation/explanation.txt @@ -865,14 +865,44 @@ propagates stores. When a fence instruction is executed on CPU C: propagate to all other CPUs before any instructions po-after the strong fence are executed on C. -The propagation ordering enforced by release fences and strong fences -affects stores from other CPUs that propagate to CPU C before the -fence is executed, as well as stores that are executed on C before the -fence. We describe this property by saying that release fences and -strong fences are A-cumulative. By contrast, smp_wmb() fences are not -A-cumulative; they only affect the propagation of stores that are -executed on C before the fence (i.e., those which precede the fence in -program order). + Whenever any CPU C' executes an unlock operation U such that + CPU C executes a lock operation L followed by a po-later + smp_mb__after_unlock_lock() fence, and L is either a later lock + operation on the lock released by U or is po-after U, then any + store that propagates to C' before U must propagate to all other + CPUs before any instructions po-after the fence are executed on C. + +While smp_wmb() and release fences only force certain earlier stores +to propagate to another CPU C' before certain later stores propagate +to the same CPU C', strong fences and smp_mb__after_unlock_lock() +force those stores to propagate to all other CPUs before any later +instruction is executed. We collectively refer to the latter +operations as strong ordering operations, as they provide much +stronger ordering in two ways: + + Firstly, strong ordering operations also create order between + earlier stores and later reads. + + Secondly, strong ordering operations create a form of global + ordering: When an earlier store W propagates to CPU C and is + ordered by a strong ordering operation with a store W' of C, + and another CPU C' observes W' and in response issues yet + another store W'', then W'' also can not propagate to any CPU + before W. By contrast, a release fence or smp_wmb() would only + order W and W', but not force any ordering between W and W''. + To summarize, the ordering forced by strong ordering operations + extends to later stores of all CPUs, while other fences only + force ordering with relation to stores on the CPU that executes + the fence. + +The propagation ordering enforced by release fences and strong ordering +operations affects stores from other CPUs that propagate to CPU C before +the fence is executed, as well as stores that are executed on C before +the fence. We describe this property by saying that release fences and +strong ordering operations are A-cumulative. By contrast, smp_wmb() +fences are not A-cumulative; they only affect the propagation of stores +that are executed on C before the fence (i.e., those which precede the +fence in program order). rcu_read_lock(), rcu_read_unlock(), and synchronize_rcu() fences have other properties which we discuss later. @@ -1425,36 +1455,36 @@ requirement is the content of the LKMM's "happens-before" axiom. The LKMM defines yet another relation connected to times of instruction execution, but it is not included in hb. It relies on the -particular properties of strong fences, which we cover in the next -section. 
+particular properties of strong ordering operations, which we cover in the +next section. THE PROPAGATES-BEFORE RELATION: pb ---------------------------------- The propagates-before (pb) relation capitalizes on the special -features of strong fences. It links two events E and F whenever some -store is coherence-later than E and propagates to every CPU and to RAM -before F executes. The formal definition requires that E be linked to -F via a coe or fre link, an arbitrary number of cumul-fences, an -optional rfe link, a strong fence, and an arbitrary number of hb -links. Let's see how this definition works out. +features of strong ordering operations. It links two events E and F +whenever some store is coherence-later than E and propagates to every +CPU and to RAM before F executes. The formal definition requires that +E be linked to F via a coe or fre link, an arbitrary number of +cumul-fences, an optional rfe link, a strong ordering operation, and an +arbitrary number of hb links. Let's see how this definition works out. Consider first the case where E is a store (implying that the sequence of links begins with coe). Then there are events W, X, Y, and Z such that: - E ->coe W ->cumul-fence* X ->rfe? Y ->strong-fence Z ->hb* F, + E ->coe W ->cumul-fence* X ->rfe? Y ->strong-order Z ->hb* F, where the * suffix indicates an arbitrary number of links of the specified type, and the ? suffix indicates the link is optional (Y may be equal to X). Because of the cumul-fence links, we know that W will propagate to Y's CPU before X does, hence before Y executes and hence -before the strong fence executes. Because this fence is strong, we -know that W will propagate to every CPU and to RAM before Z executes. -And because of the hb links, we know that Z will execute before F. -Thus W, which comes later than E in the coherence order, will -propagate to every CPU and to RAM before F executes. +before the strong ordering operation executes. Because of the strong +ordering, we know that W will propagate to every CPU and to RAM before +Z executes. And because of the hb links, we know that Z will execute +before F. Thus W, which comes later than E in the coherence order, +will propagate to every CPU and to RAM before F executes. The case where E is a load is exactly the same, except that the first link in the sequence is fre instead of coe. @@ -1637,8 +1667,8 @@ does not imply X ->rcu-order V, because the sequence contains only one rcu-gp link but two rcu-rscsi links. The rcu-order relation is important because the Grace Period Guarantee -means that rcu-order links act kind of like strong fences. In -particular, E ->rcu-order F implies not only that E begins before F +means that rcu-order links act kind of like a strong ordering operation. +In particular, E ->rcu-order F implies not only that E begins before F ends, but also that any write po-before E will propagate to every CPU before any instruction po-after F can execute. (However, it does not imply that E must execute before F; in fact, each synchronize_rcu() @@ -1675,24 +1705,23 @@ The rcu-fence relation is a simple extension of rcu-order. While rcu-order only links certain fence events (calls to synchronize_rcu(), rcu_read_lock(), or rcu_read_unlock()), rcu-fence links any events that are separated by an rcu-order link. 
-the strong-fence relation links events that are separated by an
+the strong-order relation links events that are separated by an
 smp_mb() fence event (as mentioned above, rcu-order links act kind of
-like strong fences).  Written symbolically, X ->rcu-fence Y means
-there are fence events E and F such that:
+like strong ordering operations).  Written symbolically, X ->rcu-fence Y
+means there are fence events E and F such that:
 
 	X ->po E ->rcu-order F ->po Y.
 
 From the discussion above, we see this implies not only that X
 executes before Y, but also (if X is a store) that X propagates to
-every CPU before Y executes.  Thus rcu-fence is sort of a
-"super-strong" fence: Unlike the original strong fences (smp_mb() and
+every CPU before Y executes.  Thus unlike strong fences (smp_mb() and
 synchronize_rcu()), rcu-fence is able to link events on different
 CPUs.  (Perhaps this fact should lead us to say that rcu-fence isn't
 really a fence at all!)
 
 Finally, the LKMM defines the RCU-before (rb) relation in terms of
 rcu-fence.  This is done in essentially the same way as the pb
-relation was defined in terms of strong-fence.  We will omit the
+relation was defined in terms of strong-order.  We will omit the
 details; the end result is that E ->rb F implies E must execute before
 F, just as E ->pb F does (and for much the same reasons).
 
@@ -2134,7 +2163,7 @@ intermediate event Z such that:
 
 	and either:
 
-		Z is connected to Y by a strong-fence link followed by a
+		Z is connected to Y by a strong-order link followed by a
 		possibly empty sequence of xb links,
 
 	or:
@@ -2153,10 +2182,10 @@ The motivations behind this definition are straightforward:
 	from W, which certainly means that W must have propagated to
 	R's CPU before R executed.
 
-	strong-fence memory barriers force stores that are po-before
-	the barrier, or that propagate to the barrier's CPU before the
-	barrier executes, to propagate to all CPUs before any events
-	po-after the barrier can execute.
+	Strong ordering operations force stores that are po-before
+	the operation, or that propagate to the CPU that begins the
+	operation before the operation executes, to propagate to all
+	CPUs before any events po-after the operation execute.
 
 To see how this works out in practice, consider our old friend, the MP
 pattern (with fences and statement labels, but without the conditional
@@ -2485,7 +2514,7 @@ sequence or if W can be linked to R by a
 sequence.  For the cases involving a vis link, the LKMM also accepts
 sequences in which W is linked to W' or R by a
 
-	strong-fence ; xb* ; {w and/or r}-pre-bounded
+	strong-order ; xb* ; {w and/or r}-pre-bounded
 
 sequence with no post-bounding, and in every case the LKMM also allows
 the link simply to be a fence with no bounding at all.  If no sequence
diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat
index 07f884f9b2bf..1d91edbc10fe 100644
--- a/tools/memory-model/linux-kernel.cat
+++ b/tools/memory-model/linux-kernel.cat
@@ -36,9 +36,7 @@ let wmb = [W] ; fencerel(Wmb) ; [W]
 let mb = ([M] ; fencerel(Mb) ; [M]) |
 	([M] ; fencerel(Before-atomic) ; [RMW] ; po? ; [M]) |
 	([M] ; po? ; [RMW] ; fencerel(After-atomic) ; [M]) |
-	([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) |
-	([M] ; po ; [UL] ; (co | po) ; [LKW] ;
-		fencerel(After-unlock-lock) ; [M])
+	([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M])
 let gp = po ; [Sync-rcu | Sync-srcu] ; po?
 
 let strong-fence = mb | gp
 
@@ -91,8 +89,12 @@ acyclic hb as happens-before
 (****************************************)
 (* Write and fence propagation ordering *)
 (****************************************)
-(* Propagation: Each non-rf link needs a strong fence. *)
-let pb = prop ; strong-fence ; hb* ; [Marked]
+(* Strong ordering operations *)
+let strong-order = strong-fence | ([M] ; po-unlock-lock-po ;
+		[After-unlock-lock] ; po ; [M])
+
+(* Propagation: Each non-rf link needs a strong ordering operation. *)
+let pb = prop ; strong-order ; hb* ; [Marked]
 acyclic pb as propagation
 
 (*******)
@@ -141,7 +143,7 @@ let rec rcu-order = rcu-gp | srcu-gp |
 	(rcu-order ; rcu-link ; rcu-order)
 let rcu-fence = po ; rcu-order ; po?
 let fence = fence | rcu-fence
-let strong-fence = strong-fence | rcu-fence
+let strong-order = strong-order | rcu-fence
 
 (* rb orders instructions just as pb does *)
 let rb = prop ; rcu-fence ; hb* ; pb* ; [Marked]
@@ -169,7 +171,7 @@ flag ~empty mixed-accesses as mixed-accesses
 (* Executes-before and visibility *)
 let xbstar = (hb | pb | rb)*
 let vis = cumul-fence* ; rfe? ; [Marked] ;
-	((strong-fence ; [Marked] ; xbstar) | (xbstar & int))
+	((strong-order ; [Marked] ; xbstar) | (xbstar & int))
 
 (* Boundaries for lifetimes of plain accesses *)
 let w-pre-bounded = [Marked] ; (addr | fence)?
@@ -180,9 +182,9 @@ let r-post-bounded = (nonrw-fence |
 	([~Noreturn] ; fencerel(Rmb) ; [R4rmb]))? ; [Marked]
 
 (* Visibility and executes-before for plain accesses *)
-let ww-vis = fence | (strong-fence ; xbstar ; w-pre-bounded) |
+let ww-vis = fence | (strong-order ; xbstar ; w-pre-bounded) |
 	(w-post-bounded ; vis ; w-pre-bounded)
-let wr-vis = fence | (strong-fence ; xbstar ; r-pre-bounded) |
+let wr-vis = fence | (strong-order ; xbstar ; r-pre-bounded) |
 	(w-post-bounded ; vis ; r-pre-bounded)
 let rw-xbstar = fence | (r-post-bounded ; xbstar ; w-pre-bounded)
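
To make the smp_mb__after_unlock_lock() guarantee above concrete, here is
an illustrative litmus-test-style sketch; the test name, its shape, and
the exists clause are mine rather than part of the patch.  The 1:r0=1
conjunct selects executions in which P1's critical section follows P0's,
so that P0's unlock plays the role of U and P1's lock the role of L in
the rule quoted in the documentation change; the store to x must then
propagate to every CPU before P1's store to y executes, and with
smp_rmb() ordering P2's reads the full outcome should be forbidden:

C after-unlock-lock-strong-order-sketch

(*
 * Illustrative sketch, not taken from the patch: 1:r0=1 forces P1's
 * critical section to come after P0's, so the strong-order rule says
 * P0's store to x must reach every CPU before P1's store to y
 * executes, making the outcome below forbidden.
 *)

{}

P0(int *x, spinlock_t *s)
{
	spin_lock(s);
	WRITE_ONCE(*x, 1);
	spin_unlock(s);
}

P1(int *x, int *y, spinlock_t *s)
{
	int r0;

	spin_lock(s);
	smp_mb__after_unlock_lock();
	r0 = READ_ONCE(*x);
	WRITE_ONCE(*y, 1);
	spin_unlock(s);
}

P2(int *x, int *y)
{
	int r1;
	int r2;

	r1 = READ_ONCE(*y);
	smp_rmb();
	r2 = READ_ONCE(*x);
}

exists (1:r0=1 /\ 2:r1=1 /\ 2:r2=0)

Viewed through the pb relation described in the documentation change,
this outcome would require the cycle R(x=0) ->fre W(x=1) ->strong-order
W(y=1) ->rfe R(y=1) ->ppo R(x=0), which the propagation axiom (acyclic
pb) rules out.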
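The "global ordering" property claimed for strong ordering operations can
be sketched the same way; again this test is illustrative and not part of
the patch.  Here W is P0's store to x, W' is P1's store to y, and W'' is
P2's store to x, issued in response to observing W':

C strong-order-global-ordering-sketch

(*
 * Illustrative sketch, not taken from the patch: because P1 separates
 * its observation of W (r1 = x) from W' (y = 1) with smp_mb(), W''
 * (P2's x = 2) cannot propagate to any CPU before W, so the outcome
 * below, in which W'' is nevertheless coherence-before W (final value
 * x=1), should be forbidden.
 *)

{}

P0(int *x)
{
	WRITE_ONCE(*x, 1);
}

P1(int *x, int *y)
{
	int r1;

	r1 = READ_ONCE(*x);
	smp_mb();
	WRITE_ONCE(*y, 1);
}

P2(int *x, int *y)
{
	int r2;

	r2 = READ_ONCE(*y);
	if (r2)
		WRITE_ONCE(*x, 2);
}

exists (1:r1=1 /\ 2:r2=1 /\ x=1)

Replacing smp_mb() on P1 with smp_wmb() or with a release store of y
should leave this outcome allowed, which is exactly the contrast the new
text draws between strong ordering operations and the weaker fences.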