while trying to simplify a lot of duplicated code in one of our code bases, i wondered what the performance impact of factoring the code into more concise forms would be.
i was dismayed to find that the most straightforward refactorizations generate rather poor machine code, even today on 5.9 with optimization enabled.
here’s a simplified model of the verbose code that i was trying to simplify:
public
func f1(a:[Int], b:[Int], g:(Int) throws -> ()) rethrows
{
for x:Int in a
{
try g(x)
}
for x:Int in b
{
try g(x)
}
}
it generates excellent assembly which is more or less what you would expect for what is written here.
output.f1(a: [Swift.Int], b: [Swift.Int], g: (Swift.Int) throws -> ()) throws -> ():
push rbp
push r15
push r14
push r13
push rbx
sub rsp, 16
mov rbx, r12
mov r13, rcx
mov qword ptr [rsp + 8], rdx
mov qword ptr [rsp], rsi
mov r14, qword ptr [rdi + 16]
test r14, r14
je .LBB1_5
mov rbp, rdi
call swift_retain@PLT
xor r15d, r15d
.LBB1_2:
mov rdi, qword ptr [rbp + 8*r15 + 32]
mov r12, rbx
call qword ptr [rsp + 8]
mov rbx, r12
test r12, r12
jne .LBB1_9
inc r15
cmp r14, r15
jne .LBB1_2
mov rdi, rbp
call swift_release@PLT
.LBB1_5:
mov rbp, qword ptr [rsp]
mov r14, qword ptr [rbp + 16]
test r14, r14
je .LBB1_10
mov rdi, rbp
call swift_retain@PLT
xor r15d, r15d
.LBB1_7:
mov rdi, qword ptr [rbp + 8*r15 + 32]
mov r12, rbx
call qword ptr [rsp + 8]
mov rbx, r12
test r12, r12
jne .LBB1_9
inc r15
cmp r14, r15
jne .LBB1_7
.LBB1_9:
mov rdi, rbp
call swift_release@PLT
.LBB1_10:
mov r12, rbx
add rsp, 16
pop rbx
pop r13
pop r14
pop r15
pop rbp
ret
here are two ways to rewrite this code, using tools that are available in the standard library (no third-party packages!):
public
func f2(a:[Int], b:[Int], g:(Int) throws -> ()) rethrows
{
for x:Int in a + b
{
try g(x)
}
}
public
func f3(a:[Int], b:[Int], g:(Int) throws -> ()) rethrows
{
for x:Int in [a, b].joined()
{
try g(x)
}
}
as you might expect from the title of this thread, they generate very poor assembly - f2
actually allocates a new array to store the concatenated arrays, and f3
seems to be not using any generic specialization of JoinedSequence
.
output.f2(a: [Swift.Int], b: [Swift.Int], g: (Swift.Int) throws -> ()) throws -> ():
push rbp
push r15
push r14
push r13
push rbx
sub rsp, 32
mov rbx, r12
mov qword ptr [rsp + 24], rcx
mov qword ptr [rsp + 16], rdx
mov r14, rsi
mov rbp, rdi
mov qword ptr [rsp + 8], rdi
mov rdi, rsi
call swift_retain@PLT
mov rdi, rbp
call swift_retain@PLT
lea r13, [rsp + 8]
mov rdi, r14
call (generic specialization <Swift.Int, [Swift.Int]> of Swift.Array.append<A where A == A1.Element, A1: Swift.Sequence>(contentsOf: __owned A1) -> ())
mov rbp, qword ptr [rsp + 8]
mov r14, qword ptr [rbp + 16]
test r14, r14
je .LBB2_4
dec r14
xor r15d, r15d
mov r13, qword ptr [rsp + 24]
.LBB2_2:
mov rdi, qword ptr [rbp + 8*r15 + 32]
mov r12, rbx
call qword ptr [rsp + 16]
mov rbx, r12
test r12, r12
jne .LBB2_4
lea rax, [r15 + 1]
cmp r14, r15
mov r15, rax
jne .LBB2_2
.LBB2_4:
mov rdi, rbp
call swift_release@PLT
mov r12, rbx
add rsp, 32
pop rbx
pop r13
pop r14
pop r15
pop rbp
ret
output.f3(a: [Swift.Int], b: [Swift.Int], g: (Swift.Int) throws -> ()) throws -> ():
push rbp
push r15
push r14
push r13
push rbx
sub rsp, 64
mov rbx, r12
mov r13, rcx
mov rbp, rdx
mov r15, rsi
mov r14, rdi
lea rdi, [rip + (demangling cache variable for type metadata for Swift._ContiguousArrayStorage<[Swift.Int]>)]
call __swift_instantiateConcreteTypeFromMangledName
lea rsi, [rsp + 16]
mov rdi, rax
call swift_initStackObject@PLT
mov rcx, rax
mov qword ptr [rax + 16], 2
mov qword ptr [rax + 24], 4
add rax, 32
mov qword ptr [rsp], rax
mov qword ptr [rcx + 32], r14
mov qword ptr [rsp + 8], rcx
mov qword ptr [rcx + 40], r15
mov rdi, r14
mov esi, 2
call swift_retain_n@PLT
mov rdi, r15
call swift_retain@PLT
test r14, r14
je .LBB3_5
xor r15d, r15d
.LBB3_2:
cmp r15, qword ptr [r14 + 16]
je .LBB3_5
jae .LBB3_14
mov rdi, qword ptr [r14 + 8*r15 + 32]
inc r15
mov r12, rbx
call rbp
mov rbx, r12
test r12, r12
je .LBB3_2
jmp .LBB3_12
.LBB3_5:
mov rax, qword ptr [rsp + 8]
mov r15, qword ptr [rax + 40]
mov rdi, r15
call swift_retain@PLT
mov rdi, r14
call swift_release@PLT
test r15, r15
je .LBB3_6
xor r14d, r14d
.LBB3_8:
cmp r14, qword ptr [r15 + 16]
je .LBB3_6
jae .LBB3_14
mov rdi, qword ptr [r15 + 8*r14 + 32]
inc r14
mov r12, rbx
call rbp
mov rbx, r12
test r12, r12
je .LBB3_8
mov r14, r15
.LBB3_12:
mov r15, qword ptr [rsp]
mov rdi, qword ptr [rsp + 8]
call swift_setDeallocating@PLT
lea rdi, [rip + (demangling cache variable for type metadata for [Swift.Int])]
call __swift_instantiateConcreteTypeFromMangledName
mov esi, 2
mov rdi, r15
mov rdx, rax
call swift_arrayDestroy@PLT
mov rdi, r14
jmp .LBB3_13
.LBB3_6:
mov r14, qword ptr [rsp]
mov rdi, qword ptr [rsp + 8]
call swift_setDeallocating@PLT
lea rdi, [rip + (demangling cache variable for type metadata for [Swift.Int])]
call __swift_instantiateConcreteTypeFromMangledName
mov esi, 2
mov rdi, r14
mov rdx, rax
call swift_arrayDestroy@PLT
mov rdi, r15
.LBB3_13:
call swift_release@PLT
mov r12, rbx
add rsp, 64
pop rbx
pop r13
pop r14
pop r15
pop rbp
ret
.LBB3_14:
ud2
here is the full godbolt.
what is going on here?