This program:
/// Creates a WyRand-style pseudo-random number generator and hands it back as
/// a plain function.
///
/// Each call to the returned function advances the captured 64-bit state and
/// produces the next value of the sequence.
///
/// - Parameter state: The initial generator state. Defaults to a random value
///   drawn from the full `UInt64` range.
/// - Returns: A function that yields the next pseudo-random `UInt64` per call.
@inline(__always)
func makeWyRandPRNG(state: UInt64 = UInt64.random(in: .min ... .max)) -> () -> UInt64 {
    // Parameters are immutable, so shadow `state` with a mutable copy that the
    // nested function captures and updates between calls.
    var state = state

    @inline(__always)
    func nextWyRand() -> UInt64 {
        // Step the state by a fixed odd increment, wrapping on overflow.
        state &+= 0xa0761d6478bd642f
        // Multiply the state by an XOR-masked copy of itself (64x64 -> 128 bit)
        // and fold the two 64-bit halves of the product together.
        let (high, low) = state.multipliedFullWidth(by: state ^ 0xe7037ed1a0b428db)
        return high ^ low
    }

    return nextWyRand
}
/// Times one hundred million calls to a freshly created generator.
///
/// Every generated value is added (with wrapping) into a checksum, and the
/// elapsed time in seconds is printed together with that checksum.
@inline(never)
func test() {
    let nextWyRand = makeWyRandPRNG()
    var cs: UInt64 = 0

    // Only the loop sits between the two clock readings; the generator itself
    // was created above, outside the timed region.
    let startNanos = DispatchTime.now().uptimeNanoseconds
    for _ in 0 ..< 100_000_000 {
        cs &+= nextWyRand()
    }
    let endNanos = DispatchTime.now().uptimeNanoseconds

    // Convert the nanosecond difference to seconds for display.
    let elapsedSeconds = Double(endNanos - startNanos) / 1e9
    print(elapsedSeconds, "seconds (checksum: \(cs))")
}
// Run the benchmark four times; each run prints its own timing line.
for _ in 1 ... 4 {
    test()
}
will output the following (on my MBP, compiled with -O):
0.230478786 seconds (checksum: 18255195438169662781)
0.090597126 seconds (checksum: 4070770879551161749)
0.089199336 seconds (checksum: 8926320280989155142)
0.089612531 seconds (checksum: 5118214779124164664)
So, the hundred million calls take around 0.09 s.
Now, if I change the `@inline` attribute of the outer function to `never`, like so:
@inline(never)
func makeWyRandPRNG(state: UInt64 = UInt64.random(in: .min ... .max)) -> () -> UInt64 {
var state = state
@inline(__always)
func nextWyRand() -> UInt64 {
state &+= 0xa0761d6478bd642f
...
the output will be:
0.352346736 seconds (checksum: 4559576502277926132)
0.347921636 seconds (checksum: 9148822367661849503)
0.35059418 seconds (checksum: 12610521329567560262)
0.346017439 seconds (checksum: 11761609518189602487)
i.e., the returned inner function (which is still `@inline(__always)`) is now more than 3 times slower.
Is this working as intended? (I expected it to make no difference, since the inner function should still be inlined, and the outer function is just called once, outside the timed loop.)
Here's the disassembly for the timed loop:
When the outer function has `@inline(__always)`:
0x1000014b1 <+97>: callq 0x1000a9ec0 ; symbol stub for: static Dispatch.DispatchTime.now() -> Dispatch.DispatchTime
0x1000014b6 <+102>: callq 0x1000a9eba ; symbol stub for: Dispatch.DispatchTime.uptimeNanoseconds.getter : Swift.UInt64
0x1000014bb <+107>: movq %rax, -0x60(%rbp)
0x1000014bf <+111>: movq 0x8(%rbx), %rax
0x1000014c3 <+115>: movq %r13, %rdi
0x1000014c6 <+118>: movq %r15, %rsi
0x1000014c9 <+121>: movq %rax, -0x68(%rbp)
0x1000014cd <+125>: callq *%rax
0x1000014cf <+127>: leaq -0x48(%rbp), %rdi
0x1000014d3 <+131>: leaq -0x88(%rbp), %rsi
0x1000014da <+138>: movl $0x1, %edx
0x1000014df <+143>: xorl %ecx, %ecx
0x1000014e1 <+145>: callq 0x1000aa64c ; symbol stub for: swift_beginAccess
0x1000014e6 <+150>: movq -0x48(%rbp), %r11
0x1000014ea <+154>: movq %r11, %xmm0
0x1000014ef <+159>: pshufd $0x44, %xmm0, %xmm0 ; xmm0 = xmm0[0,1,0,1]
0x1000014f4 <+164>: paddq 0xaa0b4(%rip), %xmm0
0x1000014fc <+172>: pxor %xmm7, %xmm7
0x100001500 <+176>: movl $0x5f5e100, %r14d ; imm = 0x5F5E100
0x100001506 <+182>: movdqa 0xaa0b1(%rip), %xmm8
0x10000150f <+191>: movdqa 0xaa0b9(%rip), %xmm2
0x100001517 <+199>: movdqa 0xaa0c1(%rip), %xmm3
0x10000151f <+207>: movdqa 0xaa0c9(%rip), %xmm4
0x100001527 <+215>: pxor %xmm1, %xmm1
0x10000152b <+219>: nopl (%rax,%rax)
0x100001530 <+224>: movdqa %xmm0, %xmm5
0x100001534 <+228>: paddq %xmm8, %xmm5
0x100001539 <+233>: movdqa %xmm0, %xmm6
0x10000153d <+237>: paddq %xmm2, %xmm6
0x100001541 <+241>: movq %xmm5, %rsi
0x100001546 <+246>: pextrq $0x1, %xmm5, %rdx
0x10000154d <+253>: pxor %xmm3, %xmm5
0x100001551 <+257>: movq %xmm6, %r12
0x100001556 <+262>: pextrq $0x1, %xmm6, %r10
0x10000155d <+269>: pxor %xmm3, %xmm6
0x100001561 <+273>: movq %xmm5, %rcx
0x100001566 <+278>: pextrq $0x1, %xmm5, %rax
0x10000156d <+285>: movq %xmm6, %r9
0x100001572 <+290>: pextrq $0x1, %xmm6, %rdi
0x100001579 <+297>: mulq %rdx
0x10000157c <+300>: movq %rdx, %rbx
0x10000157f <+303>: movq %rax, %r8
0x100001582 <+306>: movq %rcx, %rax
0x100001585 <+309>: mulq %rsi
0x100001588 <+312>: movq %rdx, %rcx
0x10000158b <+315>: movq %rax, %rsi
0x10000158e <+318>: movq %rdi, %rax
0x100001591 <+321>: mulq %r10
0x100001594 <+324>: movq %rdx, %rdi
0x100001597 <+327>: movq %rax, %r10
0x10000159a <+330>: movq %r9, %rax
0x10000159d <+333>: mulq %r12
0x1000015a0 <+336>: xorq %rsi, %rcx
0x1000015a3 <+339>: xorq %r8, %rbx
0x1000015a6 <+342>: xorq %rax, %rdx
0x1000015a9 <+345>: xorq %r10, %rdi
0x1000015ac <+348>: movq %rbx, %xmm5
0x1000015b1 <+353>: movq %rcx, %xmm6
0x1000015b6 <+358>: punpcklqdq %xmm5, %xmm6 ; xmm6 = xmm6[0],xmm5[0]
0x1000015ba <+362>: paddq %xmm6, %xmm7
0x1000015be <+366>: movq %rdi, %xmm5
0x1000015c3 <+371>: movq %rdx, %xmm6
0x1000015c8 <+376>: punpcklqdq %xmm5, %xmm6 ; xmm6 = xmm6[0],xmm5[0]
0x1000015cc <+380>: paddq %xmm6, %xmm1
0x1000015d0 <+384>: paddq %xmm4, %xmm0
0x1000015d4 <+388>: addq $-0x4, %r14
0x1000015d8 <+392>: jne 0x100001530 ; <+224> [inlined] function signature specialization <Arg[0] = Stack Promoted from Box> of nextWyRand() -> Swift.UInt64 + 5 at main.swift:50
0x1000015de <+398>: movdqa %xmm1, -0xa0(%rbp)
0x1000015e6 <+406>: movdqa %xmm7, -0xb0(%rbp)
0x1000015ee <+414>: movabsq $-0x18a771abbef7b100, %rax ; imm = 0xE7588E5441084F00
0x1000015f8 <+424>: addq %rax, %r11
0x1000015fb <+427>: movq %r11, -0x48(%rbp)
0x1000015ff <+431>: movq %r13, %rax
0x100001602 <+434>: callq 0x1000a9ec0 ; symbol stub for: static Dispatch.DispatchTime.now() -> Dispatch.DispatchTime
0x100001607 <+439>: callq 0x1000a9eba ; symbol stub for: Dispatch.DispatchTime.uptimeNanoseconds.getter : Swift.UInt64
When the outer function has `@inline(never)`:
0x10000154a <+74>: callq 0x1000a9ef0 ; symbol stub for: static Dispatch.DispatchTime.now() -> Dispatch.DispatchTime
0x10000154f <+79>: callq 0x1000a9eea ; symbol stub for: Dispatch.DispatchTime.uptimeNanoseconds.getter : Swift.UInt64
0x100001554 <+84>: movq %rax, -0x50(%rbp)
0x100001558 <+88>: movq 0x8(%r14), %rax
0x10000155c <+92>: movq %r13, -0x58(%rbp)
0x100001560 <+96>: movq %r13, %rdi
0x100001563 <+99>: movq %r12, -0x60(%rbp)
0x100001567 <+103>: movq %r12, %rsi
0x10000156a <+106>: movq %rax, -0x48(%rbp)
0x10000156e <+110>: callq *%rax
0x100001570 <+112>: movq %rbx, -0x68(%rbp)
0x100001574 <+116>: addq $0x10, %rbx
0x100001578 <+120>: movl $0x5f5e100, %r12d ; imm = 0x5F5E100
0x10000157e <+126>: movabsq $-0x5f89e29b87429bd1, %r13 ; imm = 0xA0761D6478BD642F
0x100001588 <+136>: movabsq $-0x18fc812e5f4bd725, %r14 ; imm = 0xE7037ED1A0B428DB
0x100001592 <+146>: nopw %cs:(%rax,%rax)
0x10000159c <+156>: nopl (%rax)
0x1000015a0 <+160>: movl $0x1, %edx
0x1000015a5 <+165>: movq %rbx, %rdi
0x1000015a8 <+168>: leaq -0x88(%rbp), %rsi
0x1000015af <+175>: xorl %ecx, %ecx
0x1000015b1 <+177>: callq 0x1000aa67c ; symbol stub for: swift_beginAccess
0x1000015b6 <+182>: movq (%rbx), %rcx
0x1000015b9 <+185>: addq %r13, %rcx
0x1000015bc <+188>: movq %rcx, %rax
0x1000015bf <+191>: xorq %r14, %rax
0x1000015c2 <+194>: mulq %rcx
0x1000015c5 <+197>: movq %rcx, (%rbx)
0x1000015c8 <+200>: xorq %rax, %rdx
0x1000015cb <+203>: addq %rdx, %r15
0x1000015ce <+206>: decq %r12
0x1000015d1 <+209>: jne 0x1000015a0 ; <+160> [inlined] nextWyRand() -> Swift.UInt64 + 14
0x1000015d3 <+211>: movq -0x58(%rbp), %r13
0x1000015d7 <+215>: movq %r13, %rax
0x1000015da <+218>: callq 0x1000a9ef0 ; symbol stub for: static Dispatch.DispatchTime.now() -> Dispatch.DispatchTime
0x1000015df <+223>: callq 0x1000a9eea ; symbol stub for: Dispatch.DispatchTime.uptimeNanoseconds.getter : Swift.UInt64