Effect of @inline(...) on nested functions

This program:

/// Creates a generator function that owns its own mutable 64-bit state.
///
/// - Parameter state: The initial state; defaults to a random `UInt64` over the full range.
/// - Returns: A function that advances the captured state and returns the next `UInt64` each time it is called.
@inline(__always)
func makeWyRandPRNG(state: UInt64 = UInt64.random(in: .min ... .max)) -> () -> UInt64 {
    // Shadow the parameter with a mutable copy; the nested function below captures and mutates it.
    var state = state
    @inline(__always)
    func nextWyRand() -> UInt64 {
        // Wrapping add of a fixed increment advances the state.
        state &+= 0xa0761d6478bd642f
        // Full-width (128-bit) product of the state and the state XORed with a second constant.
        let mul = state.multipliedFullWidth(by: state ^ 0xe7037ed1a0b428db)
        // Fold the high and low halves of the product into a single 64-bit output.
        return mul.high ^ mul.low
    }
    return nextWyRand
}

/// Times 100,000,000 calls of the function returned by `makeWyRandPRNG()` and prints
/// the elapsed time in seconds together with a checksum of all generated values.
@inline(never)
func test() {
    // The generator is created once, outside the timed region.
    let nextWyRand = makeWyRandPRNG()
    // Wrapping sum of every output; printed with the timing below.
    var cs: UInt64 = 0
    let t0 = DispatchTime.now().uptimeNanoseconds
    for _ in 0 ..< 100_000_000 { cs &+= nextWyRand() }
    let t1 = DispatchTime.now().uptimeNanoseconds
    // Convert the nanosecond difference to seconds for display.
    print(Double(t1-t0)/1e9, "seconds (checksum: \(cs))")
}
// Run the benchmark four times; each run prints its own timing line.
for _ in 0 ..< 4 { test() }

will output the following (on my MBP, compiled with -O):

0.230478786 seconds (checksum: 18255195438169662781)
0.090597126 seconds (checksum: 4070770879551161749)
0.089199336 seconds (checksum: 8926320280989155142)
0.089612531 seconds (checksum: 5118214779124164664)

So, the hundred million calls take around 0.09 s.


Now, if I change the @inline for the outer function to never, like so:

@inline(never)
func makeWyRandPRNG(state: UInt64 = UInt64.random(in: .min ... .max)) -> () -> UInt64 {
    var state = state
    @inline(__always)
    func nextWyRand() -> UInt64 {
        state &+= 0xa0761d6478bd642f
        ...

the output will be:

0.352346736 seconds (checksum: 4559576502277926132)
0.347921636 seconds (checksum: 9148822367661849503)
0.35059418 seconds (checksum: 12610521329567560262)
0.346017439 seconds (checksum: 11761609518189602487)

i.e., the returned inner function (which is still @inline(__always)) is now more than 3 times slower.

Is this working as intended? (I expected it to make no difference, since the inner function should still be inlined, and the outer function is just called once, outside the timed loop.)


Here's the disassembly for the timed loop:

When the outer function has @inline(__always):
    0x1000014b1 <+97>:  callq  0x1000a9ec0               ; symbol stub for: static Dispatch.DispatchTime.now() -> Dispatch.DispatchTime
    0x1000014b6 <+102>: callq  0x1000a9eba               ; symbol stub for: Dispatch.DispatchTime.uptimeNanoseconds.getter : Swift.UInt64
    0x1000014bb <+107>: movq   %rax, -0x60(%rbp)
    0x1000014bf <+111>: movq   0x8(%rbx), %rax
    0x1000014c3 <+115>: movq   %r13, %rdi
    0x1000014c6 <+118>: movq   %r15, %rsi
    0x1000014c9 <+121>: movq   %rax, -0x68(%rbp)
    0x1000014cd <+125>: callq  *%rax
    0x1000014cf <+127>: leaq   -0x48(%rbp), %rdi
    0x1000014d3 <+131>: leaq   -0x88(%rbp), %rsi
    0x1000014da <+138>: movl   $0x1, %edx
    0x1000014df <+143>: xorl   %ecx, %ecx
    0x1000014e1 <+145>: callq  0x1000aa64c               ; symbol stub for: swift_beginAccess
    0x1000014e6 <+150>: movq   -0x48(%rbp), %r11
    0x1000014ea <+154>: movq   %r11, %xmm0
    0x1000014ef <+159>: pshufd $0x44, %xmm0, %xmm0       ; xmm0 = xmm0[0,1,0,1] 
    0x1000014f4 <+164>: paddq  0xaa0b4(%rip), %xmm0
    0x1000014fc <+172>: pxor   %xmm7, %xmm7
    0x100001500 <+176>: movl   $0x5f5e100, %r14d         ; imm = 0x5F5E100 
    0x100001506 <+182>: movdqa 0xaa0b1(%rip), %xmm8
    0x10000150f <+191>: movdqa 0xaa0b9(%rip), %xmm2
    0x100001517 <+199>: movdqa 0xaa0c1(%rip), %xmm3
    0x10000151f <+207>: movdqa 0xaa0c9(%rip), %xmm4
    0x100001527 <+215>: pxor   %xmm1, %xmm1
    0x10000152b <+219>: nopl   (%rax,%rax)
    0x100001530 <+224>: movdqa %xmm0, %xmm5
    0x100001534 <+228>: paddq  %xmm8, %xmm5
    0x100001539 <+233>: movdqa %xmm0, %xmm6
    0x10000153d <+237>: paddq  %xmm2, %xmm6
    0x100001541 <+241>: movq   %xmm5, %rsi
    0x100001546 <+246>: pextrq $0x1, %xmm5, %rdx
    0x10000154d <+253>: pxor   %xmm3, %xmm5
    0x100001551 <+257>: movq   %xmm6, %r12
    0x100001556 <+262>: pextrq $0x1, %xmm6, %r10
    0x10000155d <+269>: pxor   %xmm3, %xmm6
    0x100001561 <+273>: movq   %xmm5, %rcx
    0x100001566 <+278>: pextrq $0x1, %xmm5, %rax
    0x10000156d <+285>: movq   %xmm6, %r9
    0x100001572 <+290>: pextrq $0x1, %xmm6, %rdi
    0x100001579 <+297>: mulq   %rdx
    0x10000157c <+300>: movq   %rdx, %rbx
    0x10000157f <+303>: movq   %rax, %r8
    0x100001582 <+306>: movq   %rcx, %rax
    0x100001585 <+309>: mulq   %rsi
    0x100001588 <+312>: movq   %rdx, %rcx
    0x10000158b <+315>: movq   %rax, %rsi
    0x10000158e <+318>: movq   %rdi, %rax
    0x100001591 <+321>: mulq   %r10
    0x100001594 <+324>: movq   %rdx, %rdi
    0x100001597 <+327>: movq   %rax, %r10
    0x10000159a <+330>: movq   %r9, %rax
    0x10000159d <+333>: mulq   %r12
    0x1000015a0 <+336>: xorq   %rsi, %rcx
    0x1000015a3 <+339>: xorq   %r8, %rbx
    0x1000015a6 <+342>: xorq   %rax, %rdx
    0x1000015a9 <+345>: xorq   %r10, %rdi
    0x1000015ac <+348>: movq   %rbx, %xmm5
    0x1000015b1 <+353>: movq   %rcx, %xmm6
    0x1000015b6 <+358>: punpcklqdq %xmm5, %xmm6              ; xmm6 = xmm6[0],xmm5[0] 
    0x1000015ba <+362>: paddq  %xmm6, %xmm7
    0x1000015be <+366>: movq   %rdi, %xmm5
    0x1000015c3 <+371>: movq   %rdx, %xmm6
    0x1000015c8 <+376>: punpcklqdq %xmm5, %xmm6              ; xmm6 = xmm6[0],xmm5[0] 
    0x1000015cc <+380>: paddq  %xmm6, %xmm1
    0x1000015d0 <+384>: paddq  %xmm4, %xmm0
    0x1000015d4 <+388>: addq   $-0x4, %r14
    0x1000015d8 <+392>: jne    0x100001530               ; <+224> [inlined] function signature specialization <Arg[0] = Stack Promoted from Box> of nextWyRand() -> Swift.UInt64 + 5 at main.swift:50
    0x1000015de <+398>: movdqa %xmm1, -0xa0(%rbp)
    0x1000015e6 <+406>: movdqa %xmm7, -0xb0(%rbp)
    0x1000015ee <+414>: movabsq $-0x18a771abbef7b100, %rax ; imm = 0xE7588E5441084F00 
    0x1000015f8 <+424>: addq   %rax, %r11
    0x1000015fb <+427>: movq   %r11, -0x48(%rbp)
    0x1000015ff <+431>: movq   %r13, %rax
    0x100001602 <+434>: callq  0x1000a9ec0               ; symbol stub for: static Dispatch.DispatchTime.now() -> Dispatch.DispatchTime
    0x100001607 <+439>: callq  0x1000a9eba               ; symbol stub for: Dispatch.DispatchTime.uptimeNanoseconds.getter : Swift.UInt64
When the outer function has @inline(never):
    0x10000154a <+74>:  callq  0x1000a9ef0               ; symbol stub for: static Dispatch.DispatchTime.now() -> Dispatch.DispatchTime
    0x10000154f <+79>:  callq  0x1000a9eea               ; symbol stub for: Dispatch.DispatchTime.uptimeNanoseconds.getter : Swift.UInt64
    0x100001554 <+84>:  movq   %rax, -0x50(%rbp)
    0x100001558 <+88>:  movq   0x8(%r14), %rax
    0x10000155c <+92>:  movq   %r13, -0x58(%rbp)
    0x100001560 <+96>:  movq   %r13, %rdi
    0x100001563 <+99>:  movq   %r12, -0x60(%rbp)
    0x100001567 <+103>: movq   %r12, %rsi
    0x10000156a <+106>: movq   %rax, -0x48(%rbp)
    0x10000156e <+110>: callq  *%rax
    0x100001570 <+112>: movq   %rbx, -0x68(%rbp)
    0x100001574 <+116>: addq   $0x10, %rbx
    0x100001578 <+120>: movl   $0x5f5e100, %r12d         ; imm = 0x5F5E100 
    0x10000157e <+126>: movabsq $-0x5f89e29b87429bd1, %r13 ; imm = 0xA0761D6478BD642F 
    0x100001588 <+136>: movabsq $-0x18fc812e5f4bd725, %r14 ; imm = 0xE7037ED1A0B428DB 
    0x100001592 <+146>: nopw   %cs:(%rax,%rax)
    0x10000159c <+156>: nopl   (%rax)
    0x1000015a0 <+160>: movl   $0x1, %edx
    0x1000015a5 <+165>: movq   %rbx, %rdi
    0x1000015a8 <+168>: leaq   -0x88(%rbp), %rsi
    0x1000015af <+175>: xorl   %ecx, %ecx
    0x1000015b1 <+177>: callq  0x1000aa67c               ; symbol stub for: swift_beginAccess
    0x1000015b6 <+182>: movq   (%rbx), %rcx
    0x1000015b9 <+185>: addq   %r13, %rcx
    0x1000015bc <+188>: movq   %rcx, %rax
    0x1000015bf <+191>: xorq   %r14, %rax
    0x1000015c2 <+194>: mulq   %rcx
    0x1000015c5 <+197>: movq   %rcx, (%rbx)
    0x1000015c8 <+200>: xorq   %rax, %rdx
    0x1000015cb <+203>: addq   %rdx, %r15
    0x1000015ce <+206>: decq   %r12
    0x1000015d1 <+209>: jne    0x1000015a0               ; <+160> [inlined] nextWyRand() -> Swift.UInt64 + 14
    0x1000015d3 <+211>: movq   -0x58(%rbp), %r13
    0x1000015d7 <+215>: movq   %r13, %rax
    0x1000015da <+218>: callq  0x1000a9ef0               ; symbol stub for: static Dispatch.DispatchTime.now() -> Dispatch.DispatchTime
    0x1000015df <+223>: callq  0x1000a9eea               ; symbol stub for: Dispatch.DispatchTime.uptimeNanoseconds.getter : Swift.UInt64

This looks correct.

The body of the outer function is opaque at the callsite.

I expect that if you use the inner function within the outer function, then it would be inlined.

• • •

Also, I don’t think the nested-ness matters.

If the inner function were instead a global function, sites where the outer function is called still would not be able to know what the outer function returns, because the outer function is never inlined.

I'm probably missing something, but:

AFAICS, the following results suggest that the nestedness matters, in the sense that it's not only the outer function's @inline(X) that decides:

outer inner time
__always __always 0.09s
__always never 0.73s
never __always 0.34s
never never 0.44s

Do those tests all have the inner function nested inside the outer function?

If so, then you have not actually tested whether or not nested-ness matters.

I am suggesting that if you move the inner function out to the top level and run the same 4 tests again, you are likely to see similar results.

In other words, I suspect that the results you are seeing are purely a reflection of the interaction between the two inlining levels, and not at all dependent on whether the inner function is physically nested inside the outer function.

I’m happy to be proven wrong though.

• • •

Edit: oh, I see now the inner function captures state from the outer. That makes it tricky to separate.

Correct. I've just run the test program as it is in the OP, but with __always or never for the outer and inner functions as described by the table.

But (ignoring whether or not nested-ness matters) why should case 2 (0.73s) be twice as slow as case 4 (0.44s) in the table above?

While I don’t know the specific reason here, it is certainly plausible that some functions will be slowed down by inlining in some situations.

So, if you’re in a situation where inlining slows down outer but massively speeds up inner, then the results you see are comprehensible.

It is impossible to inline inner unless outer is also inlinable (because if outer is not inlinable then the body of inner cannot be seen at all), so cases 3 & 4 should be statistically indistinguishable.

The fact that 2 is slower than 4 follows from outer being slowed down by inlining. The fact that 1 is faster follows from inner being massively sped up by inlining.

I can’t speak as to “why” inlining helps or hurts any particular function though.