[PATCHv14,08/17] x86/mm: Reduce untagged_addr() overhead until the first LAM user

Message ID 20230111123736.20025-9-kirill.shutemov@linux.intel.com
State New
Headers
Series Linear Address Masking enabling |

Commit Message

Kirill A. Shutemov Jan. 11, 2023, 12:37 p.m. UTC
  Use static key to reduce untagged_addr() overhead.

The key only gets enabled when the first process enables LAM.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 arch/x86/include/asm/uaccess.h | 8 ++++++--
 arch/x86/kernel/process_64.c   | 4 ++++
 2 files changed, 10 insertions(+), 2 deletions(-)
  

Comments

Peter Zijlstra Jan. 17, 2023, 1:05 p.m. UTC | #1
On Wed, Jan 11, 2023 at 03:37:27PM +0300, Kirill A. Shutemov wrote:

>  #define __untagged_addr(untag_mask, addr)	({			\
>  	u64 __addr = (__force u64)(addr);				\
> -	s64 sign = (s64)__addr >> 63;					\
> -	__addr &= untag_mask | sign;					\
> +	if (static_branch_likely(&tagged_addr_key)) {			\
> +		s64 sign = (s64)__addr >> 63;				\
> +		__addr &= untag_mask | sign;				\
> +	}								\
>  	(__force __typeof__(addr))__addr;				\
>  })
>  
> #define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr)

Is the compiler clever enough to put the memop inside the branch?
  
Kirill A. Shutemov Jan. 17, 2023, 1:57 p.m. UTC | #2
On Tue, Jan 17, 2023 at 02:05:22PM +0100, Peter Zijlstra wrote:
> On Wed, Jan 11, 2023 at 03:37:27PM +0300, Kirill A. Shutemov wrote:
> 
> >  #define __untagged_addr(untag_mask, addr)
> >  	u64 __addr = (__force u64)(addr);				\
> > -	s64 sign = (s64)__addr >> 63;					\
> > -	__addr &= untag_mask | sign;					\
> > +	if (static_branch_likely(&tagged_addr_key)) {			\
> > +		s64 sign = (s64)__addr >> 63;				\
> > +		__addr &= untag_mask | sign;				\
> > +	}								\
> >  	(__force __typeof__(addr))__addr;				\
> >  })
> >  
> > #define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr)
> 
> Is the compiler clever enough to put the memop inside the branch?

Hm. You mean current_untag_mask() inside static_branch_likely()?

But it is preprocessor who does this, not compiler. So, yes, the memop is
inside the branch.

Or I didn't understand your question.
  
Peter Zijlstra Jan. 17, 2023, 3:02 p.m. UTC | #3
On Tue, Jan 17, 2023 at 04:57:03PM +0300, Kirill A. Shutemov wrote:
> On Tue, Jan 17, 2023 at 02:05:22PM +0100, Peter Zijlstra wrote:
> > On Wed, Jan 11, 2023 at 03:37:27PM +0300, Kirill A. Shutemov wrote:
> > 
> > >  #define __untagged_addr(untag_mask, addr)
> > >  	u64 __addr = (__force u64)(addr);				\
> > > -	s64 sign = (s64)__addr >> 63;					\
> > > -	__addr &= untag_mask | sign;					\
> > > +	if (static_branch_likely(&tagged_addr_key)) {			\
> > > +		s64 sign = (s64)__addr >> 63;				\
> > > +		__addr &= untag_mask | sign;				\
> > > +	}								\
> > >  	(__force __typeof__(addr))__addr;				\
> > >  })
> > >  
> > > #define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr)
> > 
> > Is the compiler clever enough to put the memop inside the branch?
> 
> Hm. You mean current_untag_mask() inside static_branch_likely()?
> 
> But it is preprocessor who does this, not compiler. So, yes, the memop is
> inside the branch.
> 
> Or I didn't understand your question.

Nah, call it a pre-lunch dip, I overlooked the whole CPP angle -- d'0h.

That said, I did just put it through a compiler to see wth it did and it
is pretty gross:

GCC-12.2:

0000 00000000000023b0 <write_ok_or_segv>:
0000     23b0:  48 89 fa                mov    %rdi,%rdx
0003     23b3:  eb 76                   jmp    242b <write_ok_or_segv+0x7b>
0005     23b5:  65 48 8b 0d 00 00 00 00         mov    %gs:0x0(%rip),%rcx        # 23bd <write_ok_or_segv+0xd>  23b9: R_X86_64_PC32     tlbstate_untag_mask-0x4
000d     23bd:  48 89 f8                mov    %rdi,%rax
0010     23c0:  48 c1 f8 3f             sar    $0x3f,%rax
0014     23c4:  48 09 c8                or     %rcx,%rax
0017     23c7:  48 21 f8                and    %rdi,%rax
001a     23ca:  48 b9 00 f0 ff ff ff 7f 00 00   movabs $0x7ffffffff000,%rcx
0024     23d4:  48 39 f1                cmp    %rsi,%rcx
0027     23d7:  72 14                   jb     23ed <write_ok_or_segv+0x3d>
0029     23d9:  48 29 f1                sub    %rsi,%rcx
002c     23dc:  be 01 00 00 00          mov    $0x1,%esi
0031     23e1:  48 39 c1                cmp    %rax,%rcx
0034     23e4:  72 07                   jb     23ed <write_ok_or_segv+0x3d>
0036     23e6:  89 f0                   mov    %esi,%eax
0038     23e8:  e9 00 00 00 00          jmp    23ed <write_ok_or_segv+0x3d>     23e9: R_X86_64_PLT32    __x86_return_thunk-0x4
003d     23ed:  65 48 8b 04 25 00 00 00 00      mov    %gs:0x0,%rax     23f2: R_X86_64_32S      pcpu_hot
0046     23f6:  48 89 90 68 0b 00 00    mov    %rdx,0xb68(%rax)
004d     23fd:  be 01 00 00 00          mov    $0x1,%esi
0052     2402:  bf 0b 00 00 00          mov    $0xb,%edi
0057     2407:  48 c7 80 78 0b 00 00 06 00 00 00        movq   $0x6,0xb78(%rax)
0062     2412:  48 c7 80 70 0b 00 00 0e 00 00 00        movq   $0xe,0xb70(%rax)
006d     241d:  e8 00 00 00 00          call   2422 <write_ok_or_segv+0x72>     241e: R_X86_64_PLT32    force_sig_fault-0x4
0072     2422:  31 f6                   xor    %esi,%esi
0074     2424:  89 f0                   mov    %esi,%eax
0076     2426:  e9 00 00 00 00          jmp    242b <write_ok_or_segv+0x7b>     2427: R_X86_64_PLT32    __x86_return_thunk-0x4
007b     242b:  48 89 f8                mov    %rdi,%rax
007e     242e:  eb 9a                   jmp    23ca <write_ok_or_segv+0x1a>

Note the stupid jump to the end and back. Not all sites do this mind
you, but a fair number seem to do it.

Let me try llvm to see if it is any smarter.

CLANG-16:

0000 0000000000002d50 <write_ok_or_segv>:
0000     2d50:	41 57                	push   %r15
0002     2d52:	41 56                	push   %r14
0004     2d54:	41 54                	push   %r12
0006     2d56:	53                   	push   %rbx
0007     2d57:	48 89 f3             	mov    %rsi,%rbx
000a     2d5a:	48 89 fa             	mov    %rdi,%rdx
000d     2d5d:	49 89 fe             	mov    %rdi,%r14
0010     2d60:	eb 15                	jmp    2d77 <write_ok_or_segv+0x27>
0012     2d62:	48 89 d0             	mov    %rdx,%rax
0015     2d65:	48 c1 f8 3f          	sar    $0x3f,%rax
0019     2d69:	65 4c 8b 35 00 00 00 00 	mov    %gs:0x0(%rip),%r14        # 2d71 <write_ok_or_segv+0x21>	2d6d: R_X86_64_PC32	tlbstate_untag_mask-0x4
0021     2d71:	49 09 c6             	or     %rax,%r14
0024     2d74:	49 21 d6             	and    %rdx,%r14
0027     2d77:	f3 0f 1e fa          	endbr64
002b     2d7b:	49 bf 00 f0 ff ff ff 7f 00 00 	movabs $0x7ffffffff000,%r15
0035     2d85:	4d 89 fc             	mov    %r15,%r12
0038     2d88:	49 29 dc             	sub    %rbx,%r12
003b     2d8b:	72 05                	jb     2d92 <write_ok_or_segv+0x42>
003d     2d8d:	4d 39 f4             	cmp    %r14,%r12
0040     2d90:	73 34                	jae    2dc6 <write_ok_or_segv+0x76>
0042     2d92:	65 48 8b 05 00 00 00 00 	mov    %gs:0x0(%rip),%rax        # 2d9a <write_ok_or_segv+0x4a>	2d96: R_X86_64_PC32	pcpu_hot-0x4
004a     2d9a:	48 c7 80 78 0b 00 00 06 00 00 00 	movq   $0x6,0xb78(%rax)
0055     2da5:	48 89 90 68 0b 00 00 	mov    %rdx,0xb68(%rax)
005c     2dac:	48 c7 80 70 0b 00 00 0e 00 00 00 	movq   $0xe,0xb70(%rax)
0067     2db7:	bf 0b 00 00 00       	mov    $0xb,%edi
006c     2dbc:	be 01 00 00 00       	mov    $0x1,%esi
0071     2dc1:	e8 00 00 00 00       	call   2dc6 <write_ok_or_segv+0x76>	2dc2: R_X86_64_PLT32	force_sig_fault-0x4
0076     2dc6:	4d 39 f4             	cmp    %r14,%r12
0079     2dc9:	0f 93 c1             	setae  %cl
007c     2dcc:	49 39 df             	cmp    %rbx,%r15
007f     2dcf:	0f 93 c0             	setae  %al
0082     2dd2:	20 c8                	and    %cl,%al
0084     2dd4:	5b                   	pop    %rbx
0085     2dd5:	41 5c                	pop    %r12
0087     2dd7:	41 5e                	pop    %r14
0089     2dd9:	41 5f                	pop    %r15
008b     2ddb:	e9 00 00 00 00       	jmp    2de0 <__pfx_get_gate_vma>	2ddc: R_X86_64_PLT32	__x86_return_thunk-0x4

Well, it got the untag right, but OMG.. :-( Joao, Sami, any idea why it
put an ENDBR in there?
  
Linus Torvalds Jan. 17, 2023, 5:18 p.m. UTC | #4
On Tue, Jan 17, 2023 at 7:02 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Jan 17, 2023 at 04:57:03PM +0300, Kirill A. Shutemov wrote:
> > On Tue, Jan 17, 2023 at 02:05:22PM +0100, Peter Zijlstra wrote:
> > > On Wed, Jan 11, 2023 at 03:37:27PM +0300, Kirill A. Shutemov wrote:
> > >
> > > >  #define __untagged_addr(untag_mask, addr)
> > > >   u64 __addr = (__force u64)(addr);                               \
> > > > - s64 sign = (s64)__addr >> 63;                                   \
> > > > - __addr &= untag_mask | sign;                                    \
> > > > + if (static_branch_likely(&tagged_addr_key)) {                   \
> > > > +         s64 sign = (s64)__addr >> 63;                           \
> > > > +         __addr &= untag_mask | sign;                            \
> > > > + }                                                               \
> > > >   (__force __typeof__(addr))__addr;                               \
> > > >  })
> > > >
> > > > #define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr)
> > >
> > > Is the compiler clever enough to put the memop inside the branch?
> >
> > Hm. You mean current_untag_mask() inside static_branch_likely()?
> >
> > But it is preprocessor who does this, not compiler. So, yes, the memop is
> > inside the branch.
> >
> > Or I didn't understand your question.
>
> Nah, call it a pre-lunch dip, I overlooked the whole CPP angle -- d'0h.
>
> That said, I did just put it through a compiler to see wth it did and it
> is pretty gross:

Yeah, I think the static branch likely just makes things worse.

And if we really want to make the "no untag mask exists" case better,
I think the code should probably use static_branch_unlikely() rather
than *_likely(). That should make it jump to the masking code, and
leave the unmasked code as a fallthrough, no?

The reason clang seems to generate saner code is that clang seems to
largely ignore the whole "__builtin_expect()", at least not to the
point where it tries to make the unlikely case be out-of-line.

But on the whole, I think we'd be better off without this whole static branch.

The cost of "untagged_addr()" generally shouldn't be worth this. There
are few performance-crticial users - the most common case is, I think,
just mmap() and friends, and the single load is going to be a
non-issue there.

Looking around, I think the only situation where we may care is
strnlen_user() and strncpy_from_user(). Those *can* be
performance-critical. They're used for paths and for execve() strings,
and can be a bit hot.

And both of those cases actually just use it because of the whole
"maximum address" calculation to avoid traversing into kernel
addresses, so I wonder if we could use alternatives there, kind of
like the get_user/put_user cases did. Except it's generic code, so ..

But maybe even those aren't worth worrying about. At least they do the
unmasking outside the loop - although then in the case of execve(),
the string copies themselves are obviously done in a loop anyway.

Kirill, do you have clear numbers for that static key being a noticeable win?

                 Linus
  
Linus Torvalds Jan. 17, 2023, 5:28 p.m. UTC | #5
On Tue, Jan 17, 2023 at 9:18 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> The reason clang seems to generate saner code is that clang seems to
> largely ignore the whole "__builtin_expect()", at least not to the
> point where it tries to make the unlikely case be out-of-line.

Side note: that's not something new or unusual. It's been the case
since I started testing clang - we have several code-paths where we
use "unlikely()" to try to get very unlikely cases to be out-of-line,
and clang just mostly ignores it, or treats it as a very weak hint. I
think the only way to get clang to treat it as a *strong* hint is to
use PGO.

And in this case it actually made code generation look better,
probably because this particular use of static_branch_likely() is a
bit confused about which side should be the preferred one.  It's using
the static branch to make the old case not have the masked load, but
then it's saying that the new case is the likely one.

So clang ignoring the likely() hint is probably the right thing here,
and then the wrong thing in some other places.

              Linus
  
Peter Zijlstra Jan. 17, 2023, 6:14 p.m. UTC | #6
On Tue, Jan 17, 2023 at 09:18:01AM -0800, Linus Torvalds wrote:

> The reason clang seems to generate saner code is that clang seems to
> largely ignore the whole "__builtin_expect()", at least not to the
> point where it tries to make the unlikely case be out-of-line.

So in this case there is only a 'likely' hint, we're explicitly trying
to keep the thing in-line so we can jump over it.

It is GCC that generated an implicit else (and marked it 'unlikely' --
which we didn't ask for), but worse, it failed to spot the else case is
in fact shared with the normal case and it could've simply lifted that
mov instruction.

That is, instead of this:

0003     23b3:  eb 76                   jmp    242b <write_ok_or_segv+0x7b>
0005     23b5:  65 48 8b 0d 00 00 00 00         mov    %gs:0x0(%rip),%rcx        # 23bd <write_ok_or_segv+0xd>  23b9: R_X86_64_PC32     tlbstate_untag_mask-0x4
000d     23bd:  48 89 f8                mov    %rdi,%rax
0010     23c0:  48 c1 f8 3f             sar    $0x3f,%rax
0014     23c4:  48 09 c8                or     %rcx,%rax
0017     23c7:  48 21 f8                and    %rdi,%rax

001a     23ca:  48 b9 00 f0 ff ff ff 7f 00 00   movabs $0x7ffffffff000,%rcx

007b     242b:  48 89 f8                mov    %rdi,%rax
007e     242e:  eb 9a                   jmp    23ca <write_ok_or_segv+0x1a>

It could've just done:

0003     48 89 f8                mov    %rdi,%rax
0006     eb 76                   jmp    +18
0008     65 48 8b 0d 00 00 00 00         mov    %gs:0x0(%rip),%rcx        # 23bd <write_ok_or_segv+0xd>  23b9: R_X86_64_PC32     tlbstate_untag_mask-0x4
0010     48 c1 f8 3f             sar    $0x3f,%rax
0014     48 09 c8                or     %rcx,%rax
0017     48 21 f8                and    %rdi,%rax

001a     48 b9 00 f0 ff ff ff 7f 00 00   movabs $0x7ffffffff000,%rcx

and everything would've been good. In all the cases I've seen it do
this, it was the same, it has this silly move out of line that's also
part of the regular branch.

That is, I like __builtin_expect() to be a strong hint. If I don't want
things out of line, I shouldn't have put unlikely on it. What I don't
like is that implicit else branches get the opposite strong hint.

What I like even less is that it found it needed that else branch at
all.
  
Peter Zijlstra Jan. 17, 2023, 6:21 p.m. UTC | #7
On Tue, Jan 17, 2023 at 09:18:01AM -0800, Linus Torvalds wrote:

> But maybe even those aren't worth worrying about. At least they do the
> unmasking outside the loop - although then in the case of execve(),
> the string copies themselves are obviously done in a loop anyway.
> 
> Kirill, do you have clear numbers for that static key being a noticeable win?

Numbers would be good; I think this all started as a precaution, but
clearly that's not really working out so well :/
  
Nick Desaulniers Jan. 17, 2023, 6:26 p.m. UTC | #8
On Tue, Jan 17, 2023 at 9:29 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Tue, Jan 17, 2023 at 9:18 AM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > The reason clang seems to generate saner code is that clang seems to
> > largely ignore the whole "__builtin_expect()", at least not to the
> > point where it tries to make the unlikely case be out-of-line.
>
> Side note: that's not something new or unusual. It's been the case
> since I started testing clang - we have several code-paths where we
> use "unlikely()" to try to get very unlikely cases to be out-of-line,
> and clang just mostly ignores it, or treats it as a very weak hint. I
> think the only way to get clang to treat it as a *strong* hint is to
> use PGO.

I'd be surprised if that were intentional or by design.

Do you guys have a bug report we could look at?

> So clang ignoring the likely() hint is probably the right thing here,
> and then the wrong thing in some other places.
  
Linus Torvalds Jan. 17, 2023, 6:33 p.m. UTC | #9
On Tue, Jan 17, 2023 at 10:26 AM Nick Desaulniers
<ndesaulniers@google.com> wrote:
>
> On Tue, Jan 17, 2023 at 9:29 AM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > Side note: that's not something new or unusual. It's been the case
> > since I started testing clang - we have several code-paths where we
> > use "unlikely()" to try to get very unlikely cases to be out-of-line,
> > and clang just mostly ignores it, or treats it as a very weak hint. I
> > think the only way to get clang to treat it as a *strong* hint is to
> > use PGO.
>
> I'd be surprised if that were intentional or by design.
>
> Do you guys have a bug report we could look at?

Heh. I actually sent you an example long ago. Let me go fish it out of
my mail archives and quote some of it below so that you can find it in
yours..

              Linus

[ Time passes. Found this email to you and Bill Wendling from Feb 16,
2020, Message-ID
CAHk-=wigVshsByCMjkUiZyQSR5N5zi2aAeQc+VJCzQV=nm8E7g@mail.gmail.com ]:

  Anyway, I'm looking at clang code generation, and comparing it with
  gcc on one of my "this has been optimized to hell and back" functions:
  __d_lookup_rcu().

  It looks like clang does frame pointers, and ignores our
  likely/unlikely annotations.

  So this code:

                if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
                        int tlen;
                        const char *tname;
                        ......

  doesn't actually jump out of line, but instead generates the unlikely
  case as the fallthrough:

        testb   $2, (%r12)
        je      .LBB50_9
        ... unlikely code goes here...

  and then the likely case ends up having unfortunate reloads inside the
  hot loop. Possibly because it has one fewer free registers than gcc
  because of the frame pointer.

  I didn't look into _why_ clang generates frame pointers but gcc
  doesn't. It may be just a compiler default, I think we don't end up
  explicitly asking either way.

[ And then Bill replied with this ]

  It's not a no-op. We add branch probabilities to the IR, whether
  they're honored or not depends on the transformation. But they
  *should* be honored when available. I've seen in the past that instead
  of moving unlikely blocks out of the way (like gcc, which moves them
  below the function's "ret" instruction), LLVM will do something like
  this:

    <normal code>
    <jmp to loop conditional test>
        <unlikely code>
        <more likely code>
    <loop conditional test>
    <...>

  I.e. the loop is rotated and the unlikely code is first and the hotter
  code is closer together but between the unlikely and conditional test.
  Could this be what's going on? Otherwise, maybe clang decided that
  it's not beneficial to move the code out-of-line because the benefit
  was minimal? (I'm guessing here.)
  
Nick Desaulniers Jan. 17, 2023, 7:17 p.m. UTC | #10
On Tue, Jan 17, 2023 at 10:34 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Tue, Jan 17, 2023 at 10:26 AM Nick Desaulniers
> <ndesaulniers@google.com> wrote:
> >
> > On Tue, Jan 17, 2023 at 9:29 AM Linus Torvalds
> > <torvalds@linux-foundation.org> wrote:
> > >
> > > Side note: that's not something new or unusual. It's been the case
> > > since I started testing clang - we have several code-paths where we
> > > use "unlikely()" to try to get very unlikely cases to be out-of-line,
> > > and clang just mostly ignores it, or treats it as a very weak hint. I
> > > think the only way to get clang to treat it as a *strong* hint is to
> > > use PGO.
> >
> > I'd be surprised if that were intentional or by design.
> >
> > Do you guys have a bug report we could look at?
>
> Heh. I actually sent you an example long ago. Let me go fish it out of
> my mail archives and quote some of it below so that you can find it in
> yours..
>
>               Linus
>
> [ Time passes. Found this email to you and Bill Wendling from Feb 16,
> 2020, Message-ID
> CAHk-=wigVshsByCMjkUiZyQSR5N5zi2aAeQc+VJCzQV=nm8E7g@mail.gmail.com ]:
>
>   Anyway, I'm looking at clang code generation, and comparing it with
>   gcc on one of my "this has been optimized to hell and back" functions:
>   __d_lookup_rcu().
>
>   It looks like clang does frame pointers, and ignores our
>   likely/unlikely annotations.
>
>   So this code:
>
>                 if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
>                         int tlen;
>                         const char *tname;
>                         ......
>
>   doesn't actually jump out of line, but instead generates the unlikely
>   case as the fallthrough:
>
>         testb   $2, (%r12)
>         je      .LBB50_9
>         ... unlikely code goes here...


Perhaps that was compiler version or config specific?

$ make LLVM=1 -j128 defconfig fs/dcache.o
$ llvm-objdump -d  --no-show-raw-insn
--disassemble-symbols=__d_lookup_rcu fs/dcache.o
0000000000003210 <__d_lookup_rcu>:
    3210:      endbr64
    3214:      pushq %rbp
    3215:      pushq %r15
    3217:      pushq %r14
    3219:      pushq %r12
    321b:      pushq %rbx
    321c:      testb $0x2, (%rdi)
    321f:      jne 0x32d7 <__d_lookup_rcu+0xc7>
...
    32d7:      popq %rbx
    32d8:      popq %r12
    32da:      popq %r14
    32dc:      popq %r15
    32de:      popq %rbp
    32df:      jmp 0x3300 <__d_lookup_rcu_op_compare>

That looks like what you want, yeah?

Your original report was from nearly 3 years ago; could have fixed a
few instances of branch weights not getting propagated since then.

What's going on in this case in this thread? I know we don't support
hot/cold attributes on labels yet, but if static_branch_likely (or
friends) is being used, we assign the indirect branches a 0%
likeliness/branch-weight.

>
>   and then the likely case ends up having unfortunate reloads inside the
>   hot loop. Possibly because it has one fewer free registers than gcc
>   because of the frame pointer.
>
>   I didn't look into _why_ clang generates frame pointers but gcc
>   doesn't. It may be just a compiler default, I think we don't end up
>   explicitly asking either way.
>
> [ And then Bill replied with this ]
>
>   It's not a no-op. We add branch probabilities to the IR, whether
>   they're honored or not depends on the transformation. But they
>   *should* be honored when available. I've seen in the past that instead
>   of moving unlikely blocks out of the way (like gcc, which moves them
>   below the function's "ret" instruction), LLVM will do something like
>   this:
>
>     <normal code>
>     <jmp to loop conditional test>
>         <unlikely code>
>         <more likely code>
>     <loop conditional test>
>     <...>
>
>   I.e. the loop is rotated and the unlikely code is first and the hotter
>   code is closer together but between the unlikely and conditional test.
>   Could this be what's going on? Otherwise, maybe clang decided that
>   it's not beneficial to move the code out-of-line because the benefit
>   was minimal? (I'm guessing here.)
  
Linus Torvalds Jan. 17, 2023, 8:10 p.m. UTC | #11
On Tue, Jan 17, 2023 at 11:17 AM Nick Desaulniers
<ndesaulniers@google.com> wrote:
>
> Perhaps that was compiler version or config specific?

Possible, but...

The clang code generation annoyed me enough that I actually ended up
rewriting the unlikely test to be outside the loop in commit
ae2a823643d7 ("dcache: move the DCACHE_OP_COMPARE case out of the
__d_lookup_rcu loop").

I think that then made clang no longer have the whole "rotate loop
with unlikely case in the middle" issue.

And then because clang *still* messed up by trying to be too clever (see

    https://lore.kernel.org/all/CAHk-=wjyOB66pofW0mfzDN7SO8zS1EMRZuR-_2aHeO+7kuSrAg@mail.gmail.com/

for details), I also ended up doing commit c4e34dd99f2e ("x86:
simplify load_unaligned_zeropad() implementation").

The end result is that now the compiler almost *cannot* mess up any more.

So the reason clang now does a good job on __d_lookup_rcu() is largely
that I took away all the places where it did badly ;)

That said, clang still generates more register pressure than gcc,
causing the function prologue and epilogue to be rather bigger
(pushing and popping six registers, as opposed to gcc that only needs
three)

Gcc is also better able to schedule the prologue and epilogue together
with the work of the function, which clang seems to always do it as a
"push all" and "pop all" sequence.

That scheduling doesn't matter in that particular place (although it
does make the unlikely case of calling __d_lookup_rcu_op_compare
pointlessly push all regs only to then pop them), but I've seen a few
other cases where it ends up meaning that it always does that full
function prologue even when the *likely* case then returns early and
doesn't actually need any of that work because it didn't use any of
those registers.

But yeah, the RCU pathname lookup looks fine these days. And I don't
actually think it was due to clang changes ;)

                Linus
  
Linus Torvalds Jan. 17, 2023, 8:43 p.m. UTC | #12
On Tue, Jan 17, 2023 at 12:10 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> That said, clang still generates more register pressure than gcc,
> causing the function prologue and epilogue to be rather bigger
> (pushing and popping six registers, as opposed to gcc that only needs
> three)

.. and at least part of that is the same thing with the bad byte mask
generation (see that "clang *still* messes up" link for details).

Basically, the byte mask is computed by

        mask = bytemask_from_count(tcount);

where we have

   #define bytemask_from_count(cnt)       (~(~0ul << (cnt)*8))

and clang tries very very hard to avoid that "multiply by 8", so
instead it keeps a shadow copy of that "(cnt)*8" value in the loop.

That is wrong for a couple of reasons:

 (a) it adds register pressure for no good reason

 (b) when you shift left by that value, only the low 6 bits of that
value matters

And guess how that "tcount" is updated? It's this:

                tcount -= sizeof(unsigned long);

in the loop, and thus the update of that shadow value of "(cnt)*8" is done as

        addl    $-64, %ecx

inside that loop.

This is truly stupid and wasted work, because the low 6 bits of the
value - remember, the only part that matters - DOES NOT CHANGE when
you do that.

So clang has decided that it needs to

 (a) avoid the "expensive" multiply-by-8 at the end by turning it into
a repeated "add $-64" inside the loop

 (b) added register pressure and one extra instruction inside the loop

 (c) not realized that that extra instruction doesn't actually *do*
anything, because it only affects the bits that don't actually matter
in the end.

which is all kind of silly, wouldn't you agree. Every single step
there was pointless.

But with my other simplifications, the fact that clang does these
extra things is no longer all that noticeable. It *used* to be a
horrible disaster because the extra register pressure ended up meaning
that you had spills and all kinds of nastiness. Now the function is
simple enough that even with the extra register pressure, there's no
need for spills.

.. until you look at the 32-bit version, which still needs spills. Gcc
does too, but clang just makes it worse by having the extra pointless
shadow variable.

If I cared about 32-bit, I might write up a bugzilla entry. As it is,
it's just "clang tries to be clever, and in the process is actually
being stupid".

              Linus
  
Kirill A. Shutemov Jan. 19, 2023, 11:06 p.m. UTC | #13
On Tue, Jan 17, 2023 at 04:02:06PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 17, 2023 at 04:57:03PM +0300, Kirill A. Shutemov wrote:
> > On Tue, Jan 17, 2023 at 02:05:22PM +0100, Peter Zijlstra wrote:
> > > On Wed, Jan 11, 2023 at 03:37:27PM +0300, Kirill A. Shutemov wrote:
> > > 
> > > >  #define __untagged_addr(untag_mask, addr)
> > > >  	u64 __addr = (__force u64)(addr);				\
> > > > -	s64 sign = (s64)__addr >> 63;					\
> > > > -	__addr &= untag_mask | sign;					\
> > > > +	if (static_branch_likely(&tagged_addr_key)) {			\
> > > > +		s64 sign = (s64)__addr >> 63;				\
> > > > +		__addr &= untag_mask | sign;				\
> > > > +	}								\
> > > >  	(__force __typeof__(addr))__addr;				\
> > > >  })
> > > >  
> > > > #define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr)
> > > 
> > > Is the compiler clever enough to put the memop inside the branch?
> > 
> > Hm. You mean current_untag_mask() inside static_branch_likely()?
> > 
> > But it is preprocessor who does this, not compiler. So, yes, the memop is
> > inside the branch.
> > 
> > Or I didn't understand your question.
> 
> Nah, call it a pre-lunch dip, I overlooked the whole CPP angle -- d'0h.
> 
> That said, I did just put it through a compiler to see wth it did and it
> is pretty gross:

I tried to replace static branch with alternative. It kinda works, but
required few hack. Thanks to Andrew Cooper for helping to untangle them.

I am not sure if it worth the effort. I don't have any evidence that it
helps. untagged_addr() overhead is rather small and hides in noise of
syscall cost.

I only made alternative for untagged_addr(), but not for
untagged_addr_remote(). _remote() case has very few users.

BTW, it would be nice to be able to apply alternative later, delaying it
until the first user of LAM, like I did with static_branch.
We don't have a way to do this right?

Any opinions? I am okay dropping the patch altogether.

diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index c44b56f7ffba..3f0c31044f02 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -75,6 +75,12 @@
 # define DISABLE_CALL_DEPTH_TRACKING	(1 << (X86_FEATURE_CALL_DEPTH & 31))
 #endif
 
+#ifdef CONFIG_ADDRESS_MASKING
+# define DISABLE_LAM		0
+#else
+# define DISABLE_LAM		(1 << (X86_FEATURE_LAM & 31))
+#endif
+
 #ifdef CONFIG_INTEL_IOMMU_SVM
 # define DISABLE_ENQCMD		0
 #else
@@ -115,7 +121,7 @@
 #define DISABLED_MASK10	0
 #define DISABLED_MASK11	(DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
 			 DISABLE_CALL_DEPTH_TRACKING)
-#define DISABLED_MASK12	0
+#define DISABLED_MASK12	(DISABLE_LAM)
 #define DISABLED_MASK13	0
 #define DISABLED_MASK14	0
 #define DISABLED_MASK15	0
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index f9f85d596581..57ccb91fcccf 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -9,6 +9,7 @@
 #include <linux/kasan-checks.h>
 #include <linux/mm_types.h>
 #include <linux/string.h>
+#include <linux/mmap_lock.h>
 #include <asm/asm.h>
 #include <asm/page.h>
 #include <asm/smap.h>
@@ -24,28 +25,48 @@ static inline bool pagefault_disabled(void);
 #endif
 
 #ifdef CONFIG_ADDRESS_MASKING
-DECLARE_STATIC_KEY_FALSE(tagged_addr_key);
+static inline unsigned long __untagged_addr(unsigned long addr)
+{
+	/*
+	 * Magic with the 'sign' allows to untag userspace pointer without
+	 * any branches while leaving kernel addresses intact.
+	 */
+	long sign;
+
+	/*
+	 * Refer tlbstate_untag_mask directly to avoid RIP-relative relocation
+	 * in alternative instructions. The relocation gets wrong when gets
+	 * copied to the target place.
+	 */
+	asm (ALTERNATIVE("",
+			 "sar $63, %[sign]\n\t" /* user_ptr ? 0 : -1UL */
+			 "or %%gs:tlbstate_untag_mask, %[sign]\n\t"
+			 "and %[sign], %[addr]\n\t", X86_FEATURE_LAM)
+	     : [addr] "+r" (addr), [sign] "=r" (sign)
+	     : "m" (tlbstate_untag_mask), "[sign]" (addr));
+
+	return addr;
+}
 
-/*
- * Mask out tag bits from the address.
- *
- * Magic with the 'sign' allows to untag userspace pointer without any branches
- * while leaving kernel addresses intact.
- */
-#define __untagged_addr(untag_mask, addr)	({			\
-	u64 __addr = (__force u64)(addr);				\
-	if (static_branch_likely(&tagged_addr_key)) {			\
-		s64 sign = (s64)__addr >> 63;				\
-		__addr &= untag_mask | sign;				\
-	}								\
-	(__force __typeof__(addr))__addr;				\
+#define untagged_addr(addr)	({					\
+	unsigned long __addr = (__force unsigned long)(addr);		\
+	(__force __typeof__(addr))__untagged_addr(__addr);		\
 })
 
-#define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr)
+static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
+						   unsigned long addr)
+{
+	long sign = addr >> 63;
+
+	mmap_assert_locked(mm);
+	addr &= (mm)->context.untag_mask | sign;
+
+	return addr;
+}
 
 #define untagged_addr_remote(mm, addr)	({				\
-	mmap_assert_locked(mm);						\
-	__untagged_addr((mm)->context.untag_mask, addr);		\
+	unsigned long __addr = (__force unsigned long)(addr);		\
+	(__force __typeof__(addr))__untagged_addr_remote(mm, __addr);	\
 })
 
 #else
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 0831d2be190f..e006725afdf1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -745,9 +745,6 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
 
 #ifdef CONFIG_ADDRESS_MASKING
 
-DEFINE_STATIC_KEY_FALSE(tagged_addr_key);
-EXPORT_SYMBOL_GPL(tagged_addr_key);
-
 #define LAM_U57_BITS 6
 
 static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
@@ -787,8 +784,6 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
 	set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
 
 	mmap_write_unlock(mm);
-
-	static_branch_enable(&tagged_addr_key);
 	return 0;
 }
 #endif
  

Patch

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 32c9dd052e43..f9f85d596581 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -24,6 +24,8 @@  static inline bool pagefault_disabled(void);
 #endif
 
 #ifdef CONFIG_ADDRESS_MASKING
+DECLARE_STATIC_KEY_FALSE(tagged_addr_key);
+
 /*
  * Mask out tag bits from the address.
  *
@@ -32,8 +34,10 @@  static inline bool pagefault_disabled(void);
  */
 #define __untagged_addr(untag_mask, addr)	({			\
 	u64 __addr = (__force u64)(addr);				\
-	s64 sign = (s64)__addr >> 63;					\
-	__addr &= untag_mask | sign;					\
+	if (static_branch_likely(&tagged_addr_key)) {			\
+		s64 sign = (s64)__addr >> 63;				\
+		__addr &= untag_mask | sign;				\
+	}								\
 	(__force __typeof__(addr))__addr;				\
 })
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 88aae519c8f8..1b41c60ebf6e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -745,6 +745,9 @@  static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
 
 #ifdef CONFIG_ADDRESS_MASKING
 
+DEFINE_STATIC_KEY_FALSE(tagged_addr_key);
+EXPORT_SYMBOL_GPL(tagged_addr_key);
+
 #define LAM_U57_BITS 6
 
 static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
@@ -781,6 +784,7 @@  static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
 
 	mmap_write_unlock(mm);
 
+	static_branch_enable(&tagged_addr_key);
 	return 0;
 }
 #endif