Efficiently chaining two arrays?

While trying to simplify a lot of duplicated code in one of our code bases, I wondered what the performance impact of factoring the code into more concise forms would be.

I was dismayed to find that the most straightforward refactorings generate rather poor machine code, even today on Swift 5.9 with optimization enabled.

Here's a simplified model of the verbose code that I was trying to simplify:

public 
func f1(a:[Int], b:[Int], g:(Int) throws -> ()) rethrows
{
    for x:Int in a
    {
        try g(x)
    }
    for x:Int in b
    {
        try g(x)
    }
}

It generates excellent assembly, which is more or less what you would expect for what is written here.

output.f1(a: [Swift.Int], b: [Swift.Int], g: (Swift.Int) throws -> ()) throws -> ():
        push    rbp
        push    r15
        push    r14
        push    r13
        push    rbx
        sub     rsp, 16
        mov     rbx, r12
        mov     r13, rcx
        mov     qword ptr [rsp + 8], rdx
        mov     qword ptr [rsp], rsi
        mov     r14, qword ptr [rdi + 16]
        test    r14, r14
        je      .LBB1_5
        mov     rbp, rdi
        call    swift_retain@PLT
        xor     r15d, r15d
.LBB1_2:
        mov     rdi, qword ptr [rbp + 8*r15 + 32]
        mov     r12, rbx
        call    qword ptr [rsp + 8]
        mov     rbx, r12
        test    r12, r12
        jne     .LBB1_9
        inc     r15
        cmp     r14, r15
        jne     .LBB1_2
        mov     rdi, rbp
        call    swift_release@PLT
.LBB1_5:
        mov     rbp, qword ptr [rsp]
        mov     r14, qword ptr [rbp + 16]
        test    r14, r14
        je      .LBB1_10
        mov     rdi, rbp
        call    swift_retain@PLT
        xor     r15d, r15d
.LBB1_7:
        mov     rdi, qword ptr [rbp + 8*r15 + 32]
        mov     r12, rbx
        call    qword ptr [rsp + 8]
        mov     rbx, r12
        test    r12, r12
        jne     .LBB1_9
        inc     r15
        cmp     r14, r15
        jne     .LBB1_7
.LBB1_9:
        mov     rdi, rbp
        call    swift_release@PLT
.LBB1_10:
        mov     r12, rbx
        add     rsp, 16
        pop     rbx
        pop     r13
        pop     r14
        pop     r15
        pop     rbp
        ret

Here are two ways to rewrite this code, using tools that are available in the standard library (no third-party packages!):

public 
func f2(a:[Int], b:[Int], g:(Int) throws -> ()) rethrows
{
    for x:Int in a + b
    {
        try g(x)
    }
}
public 
func f3(a:[Int], b:[Int], g:(Int) throws -> ()) rethrows
{
    for x:Int in [a, b].joined()
    {
        try g(x)
    }
}

As you might expect from the title of this thread, they generate very poor assembly: f2 actually allocates a new array to store the concatenated arrays, and f3 does not seem to use any generic specialization of JoinedSequence.

output.f2(a: [Swift.Int], b: [Swift.Int], g: (Swift.Int) throws -> ()) throws -> ():
        push    rbp
        push    r15
        push    r14
        push    r13
        push    rbx
        sub     rsp, 32
        mov     rbx, r12
        mov     qword ptr [rsp + 24], rcx
        mov     qword ptr [rsp + 16], rdx
        mov     r14, rsi
        mov     rbp, rdi
        mov     qword ptr [rsp + 8], rdi
        mov     rdi, rsi
        call    swift_retain@PLT
        mov     rdi, rbp
        call    swift_retain@PLT
        lea     r13, [rsp + 8]
        mov     rdi, r14
        call    (generic specialization <Swift.Int, [Swift.Int]> of Swift.Array.append<A where A == A1.Element, A1: Swift.Sequence>(contentsOf: __owned A1) -> ())
        mov     rbp, qword ptr [rsp + 8]
        mov     r14, qword ptr [rbp + 16]
        test    r14, r14
        je      .LBB2_4
        dec     r14
        xor     r15d, r15d
        mov     r13, qword ptr [rsp + 24]
.LBB2_2:
        mov     rdi, qword ptr [rbp + 8*r15 + 32]
        mov     r12, rbx
        call    qword ptr [rsp + 16]
        mov     rbx, r12
        test    r12, r12
        jne     .LBB2_4
        lea     rax, [r15 + 1]
        cmp     r14, r15
        mov     r15, rax
        jne     .LBB2_2
.LBB2_4:
        mov     rdi, rbp
        call    swift_release@PLT
        mov     r12, rbx
        add     rsp, 32
        pop     rbx
        pop     r13
        pop     r14
        pop     r15
        pop     rbp
        ret

output.f3(a: [Swift.Int], b: [Swift.Int], g: (Swift.Int) throws -> ()) throws -> ():
        push    rbp
        push    r15
        push    r14
        push    r13
        push    rbx
        sub     rsp, 64
        mov     rbx, r12
        mov     r13, rcx
        mov     rbp, rdx
        mov     r15, rsi
        mov     r14, rdi
        lea     rdi, [rip + (demangling cache variable for type metadata for Swift._ContiguousArrayStorage<[Swift.Int]>)]
        call    __swift_instantiateConcreteTypeFromMangledName
        lea     rsi, [rsp + 16]
        mov     rdi, rax
        call    swift_initStackObject@PLT
        mov     rcx, rax
        mov     qword ptr [rax + 16], 2
        mov     qword ptr [rax + 24], 4
        add     rax, 32
        mov     qword ptr [rsp], rax
        mov     qword ptr [rcx + 32], r14
        mov     qword ptr [rsp + 8], rcx
        mov     qword ptr [rcx + 40], r15
        mov     rdi, r14
        mov     esi, 2
        call    swift_retain_n@PLT
        mov     rdi, r15
        call    swift_retain@PLT
        test    r14, r14
        je      .LBB3_5
        xor     r15d, r15d
.LBB3_2:
        cmp     r15, qword ptr [r14 + 16]
        je      .LBB3_5
        jae     .LBB3_14
        mov     rdi, qword ptr [r14 + 8*r15 + 32]
        inc     r15
        mov     r12, rbx
        call    rbp
        mov     rbx, r12
        test    r12, r12
        je      .LBB3_2
        jmp     .LBB3_12
.LBB3_5:
        mov     rax, qword ptr [rsp + 8]
        mov     r15, qword ptr [rax + 40]
        mov     rdi, r15
        call    swift_retain@PLT
        mov     rdi, r14
        call    swift_release@PLT
        test    r15, r15
        je      .LBB3_6
        xor     r14d, r14d
.LBB3_8:
        cmp     r14, qword ptr [r15 + 16]
        je      .LBB3_6
        jae     .LBB3_14
        mov     rdi, qword ptr [r15 + 8*r14 + 32]
        inc     r14
        mov     r12, rbx
        call    rbp
        mov     rbx, r12
        test    r12, r12
        je      .LBB3_8
        mov     r14, r15
.LBB3_12:
        mov     r15, qword ptr [rsp]
        mov     rdi, qword ptr [rsp + 8]
        call    swift_setDeallocating@PLT
        lea     rdi, [rip + (demangling cache variable for type metadata for [Swift.Int])]
        call    __swift_instantiateConcreteTypeFromMangledName
        mov     esi, 2
        mov     rdi, r15
        mov     rdx, rax
        call    swift_arrayDestroy@PLT
        mov     rdi, r14
        jmp     .LBB3_13
.LBB3_6:
        mov     r14, qword ptr [rsp]
        mov     rdi, qword ptr [rsp + 8]
        call    swift_setDeallocating@PLT
        lea     rdi, [rip + (demangling cache variable for type metadata for [Swift.Int])]
        call    __swift_instantiateConcreteTypeFromMangledName
        mov     esi, 2
        mov     rdi, r14
        mov     rdx, rax
        call    swift_arrayDestroy@PLT
        mov     rdi, r15
.LBB3_13:
        call    swift_release@PLT
        mov     r12, rbx
        add     rsp, 64
        pop     rbx
        pop     r13
        pop     r14
        pop     r15
        pop     rbp
        ret
.LBB3_14:
        ud2

Here is the full godbolt.

What is going on here?

I don't know why, but f3 seems to take the longest time.

--> f1 delta: 25.43048906326294  M = 100000000 N = 100000000
--> f2 delta: 26.214691996574402 M = 100000000 N = 100000000
--> f3 delta: 72.22976696491241  M = 100000000 N = 100000000
--> f4 delta: 26.386988043785095 M = 100000000 N = 100000000

import Foundation

@main
enum TestBench {
    static func main () async throws {
        try time (which: "f1", f:f1)
        try time (which: "f2", f:f2)
        try time (which: "f3", f:f3)
        try time (which: "f4", f:f4)
    }
}

func time (which:String, f:([Int], [Int], (Int) throws -> ()) throws -> Void) throws {
    let M = 100_000_000
    let N = M
    
    let u = Array <Int> (repeating: 0, count: M)
    let v = Array <Int> (repeating: 1, count: N)
    
    let start = Date ()
    try f (u, v, {(u:Int) throws in})
    let delta = Date().timeIntervalSince (start)
    print("--> \(which) delta: \(delta) M = \(M) N = \(N)")

}

func f1 (a:[Int], b:[Int], g:(Int) throws -> ()) rethrows
{
    for x:Int in a
    {
        try g(x)
    }
    for x:Int in b
    {
        try g(x)
    }
}
    
func f2(a:[Int], b:[Int], g:(Int) throws -> ()) rethrows
{
    for x:Int in a + b
    {
        try g(x)
    }
}

func f3(a:[Int], b:[Int], g:(Int) throws  -> ()) rethrows
{
    for x:Int in [a, b].joined()
    {
        try g(x)
    }
}

func f4(a:[Int], b:[Int], g:(Int) throws  -> ()) rethrows
{
    var u = a
    u.append (contentsOf: b)
    
    for x:Int in u
    {
        try g(x)
    }
}

If you use the swift-algorithms package, you could use chain(_:_:).

for x in chain(a, b) {
  try g(x)
}

I’m not sure why you find this surprising. That is what + does.

You might hope that it builds the array on the stack, but being surprised that the compiler doesn't elide the array entirely seems like an unreasonable optimization expectation.

Maybe it's an optimization the compiler _could_ do, but tbh that is a slippery slope where people come to expect to be able to write increasingly less-efficient code, and then become disappointed when the heroic optimizations fail (which they always will, at the margin).
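
For reference, the standard library's + on collections is essentially the following (a simplification of the actual RangeReplaceableCollection implementation in the stdlib), so the copy is baked into the semantics:

// Roughly what the standard library does for a + b (simplified sketch):
func concatenating<T>(_ lhs: [T], _ rhs: [T]) -> [T] {
    var result = lhs                 // CoW copy: still shares lhs's storage here
    result.append(contentsOf: rhs)   // forces unique, resized storage: a fresh
    return result                    // allocation plus copies of both operands
}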

How about this?

    try a.forEach(g)
    try b.forEach(g)

Still a repetition but quite concise.

The g closure does not actually exist in the code base; it is just a test case to see how the compiler would handle arbitrary code duplicated across two for loops.

I think that, while this is a reasonable explanation for +, you are missing the bigger point, which is that the “lazy”/“no defragmentation” pattern - that is, [a, b].joined() - performs so poorly relative to a reallocate-and-defragment strategy.
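
To spell out what I mean: conceptually, the lazy pattern is just two nested loops that never materialize a combined buffer. Here is a sketch of what I would expect [a, b].joined() to boil down to (the two-element outer array is tiny, and the f3 assembly shows it can even be stack-promoted):

func f3Expected(a:[Int], b:[Int], g:(Int) throws -> ()) rethrows
{
    for array:[Int] in [a, b]
    {
        for x:Int in array
        {
            try g(x)
        }
    }
}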

Part of me agrees with this - call it the principled part, I suppose - but another part disagrees - perhaps the pragmatic part.

Swift is a pretty high-level language. Some constructs happen to be thin abstractions over the machine code, but most are not. I think that - like it or not - we have to rely a lot on the optimiser to bridge high-level luxury to viable real-world performance. Even for relatively basic aspects, like Collections and Sequences, or ref-counting, or even just method calls themselves (turn off inlining and see what happens 😝).

While admittedly I was not surprised that a + b behaves the way it does in this case, I can't explain why the compiler didn't optimise it better. Is it semantic constraints? The need to preserve the possibility of side-effects? Or just a missing feature of the optimiser? (rhetorical, for our purposes here - although tangentially I am of course genuinely curious)

I don't have any actual explanation for why someone shouldn't assume that a + b optimises better, just some vague intuition - which I suspect comes from a history of lower-level languages; dumber languages, with simpler compilers. An increasingly uncommon background.

I think my point is just that we shouldn't be too critical of anyone's expectations or presumptions around these things - what does or doesn't get optimised, and to what degree, is super complicated.

The Swift compiler is a bit like LLMs, actually - it does such a surprisingly good job sometimes that it's easy to then expect stellar performance all the time, and it seems so inexplicable as to when it nails it or seemingly falls on its face. 😆

Out of curiosity, I gave it a try to compare its performance with that of a custom one I rolled myself.


--> f1 delta: 0.42064404487609863 M = 1000000 N = 1000000
--> f2 delta: 0.26962804794311523 M = 1000000 N = 1000000

import Foundation
import Algorithms

@main enum ChainTest {
    static func main () async throws {
        try time (which: "f1", f:f1)
        try time (which: "f2", f:f2)
    }
}

func time (which:String, f:([Int], [Int], (Int) throws -> ()) throws -> Void) throws {
    let M = 1_000_000
    let N = M
    
    let u = Array <Int> (repeating: 0, count: M)
    let v = Array <Int> (repeating: 1, count: N)
    
    let start = Date ()
    try f (u, v, {(u:Int) throws in})
    let delta = Date().timeIntervalSince (start)
    print("--> \(which) delta: \(delta) M = \(M) N = \(N)")

}

func f1 (a:[Int], b:[Int], g:(Int) throws  -> ()) rethrows {
    for x:Int in chain (a, b) {
        try g(x)
    }
}

func f2 (a:[Int], b:[Int], g:(Int) throws  -> ()) rethrows {
    for x:Int in custom_chain (a, b) {
        try g(x)
    }
}

func custom_chain <T> (_ u: [T], _ v: [T]) -> CustomChain <T> {
    CustomChain (u, v)
}

class CustomChain <T>: Sequence, IteratorProtocol {
    let u: [T]
    let v: [T]
    private var index: [T].Index
    // State machine: points at u_next while u is being consumed, then at v_next.
    // Captured with [unowned self] - a bare `self.u_next` would retain self and leak.
    private var _next: (()->T?)?

    init (_ u: [T], _ v: [T]) {
        self.u = u
        self.v = v
        self.index = u.startIndex
        self._next = {[unowned self] in self.u_next ()}
    }
    
    func next() -> T? {
        _next? ()
    }
    
    private func u_next() -> T? {
        if index < u.endIndex {
            defer {index = index.advanced (by: 1)}
            return u [index]
        } else {
            // u is exhausted: hand off to v and continue from its start.
            index = v.startIndex
            _next = {[unowned self] in self.v_next ()}
            return _next? ()
        }
    }
    
    private func v_next() -> T? {
        if index < v.endIndex {
            defer {index = index.advanced (by: 1)}
            return v [index]
        } else {
            return nil
        }
    }
}

swift-algorithms' chain is more flexible - it doesn't require that the chained Sequences be the same type (only that their Elements be the same type). I suspect that's ultimately why it's slower.

It looks at a glance like it's essentially fully inlinable, so the fact that it crosses module boundaries doesn't look like a factor in this case.

However, one obvious thing it does differently is that it always calls next on the first Sequence's iterator before trying the second Sequence. So, depending on the implementation of your Sequences, that might waste significant time on every iteration. Your approach uses more memory to avoid this - which IMO seems like the obvious best option here. However, you can do that easily because you're constraining the two Sequences to the same type. Since swift-algorithms' chain does not, I assume it'd have to use an existential to do the same thing, which introduces its own overhead.

(that said, it could have a simple boolean to cache the notion that the first iterator is exhausted, thus eliminating any possible performance overhead from the first Sequence's iterator's next method)
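
A sketch of that boolean-flag idea (hypothetical - not swift-algorithms' actual implementation):

struct Chain2<S1: Sequence, S2: Sequence>: Sequence where S1.Element == S2.Element {
    let s1: S1
    let s2: S2

    struct Iterator: IteratorProtocol {
        var it1: S1.Iterator
        var it2: S2.Iterator
        var firstExhausted = false

        mutating func next() -> S1.Element? {
            if !firstExhausted {
                if let x = it1.next() { return x }
                firstExhausted = true  // never call it1.next() again once it returns nil
            }
            return it2.next()
        }
    }

    func makeIterator() -> Iterator {
        Iterator(it1: s1.makeIterator(), it2: s2.makeIterator())
    }
}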

It's a pity there's no specialisation for the case where the input Sequences are identical.

@ibex10 you are running this with a debug build. Try running with a release build (i.e. with optimisations turned on), e.g.

$ swift run -c=release
--> f1 delta: 2.396106719970703e-05 M = 1000000 N = 1000000
--> f2 delta: 0.02014899253845215 M = 1000000 N = 1000000

Note the scientific notation of the f1 delta; it's 0.00002396106719970703 in decimal.


@taylorswift AFAICT the machine code for [a, b].joined() fully specialises (it just has an allocation for the temporary outer array - I think even on the stack), and chain(a, b) doesn't allocate at all.
Runtime performance seems fine as well, though one should probably write a proper package-benchmark test (a sketch follows the numbers below).

for in a; for in b; duration: 0.004177208 seconds
a + b               duration: 0.006087875 seconds
[a, b].joined()     duration: 0.004152667 seconds
chain(a, b)         duration: 0.00395725 seconds
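
For completeness, a proper benchmark target could look something like this minimal sketch. It assumes the ordo-one/package-benchmark package (which provides the Benchmark module and the blackHole helper) plus swift-algorithms as dependencies; the target name and file layout are made up for illustration:

// Benchmarks/ChainBenchmarks/ChainBenchmarks.swift (hypothetical layout)
import Algorithms
import Benchmark

let benchmarks = {
    // Shared inputs, built once, outside the measured closures.
    let a = Array(repeating: 0, count: 1_000_000)
    let b = Array(repeating: 1, count: 1_000_000)

    Benchmark("chain(a, b)") { benchmark in
        var sum = 0
        for x in chain(a, b) { sum &+= x }
        blackHole(sum)  // keeps the optimizer from deleting the loop
    }

    Benchmark("a + b") { benchmark in
        var sum = 0
        for x in a + b { sum &+= x }
        blackHole(sum)
    }
}

Run it (once the plugin is set up in Package.swift) with swift package benchmark.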

Compiler Explorer

import Algorithms

@main struct Main {
    static func main() {
        let a = (0..<1_000_000).map { _ in Int.random(in: 0..<255) }
        let b = (0..<1_000_000).map { _ in Int.random(in: 0..<255) }
        
        func measure(f: ([Int], [Int], (Int) -> Void) -> Void, name: String) -> Int {
            var sum = 0
            let duration = ContinuousClock().measure {
                f(a, b) { number in
                    sum += number
                }
            }
            print("\(name) duration: \(duration)")
            return sum
        }
        
        
        let total1 = measure(f: f1, name: "for in a; for in b;")
        let total2 = measure(f: f2, name: "a + b              ")
        let total3 = measure(f: f3, name: "[a, b].joined()    ")
        let total4 = measure(f: f4, name: "chain(a, b)        ")
        
        precondition(total1 == total2)
        precondition(total2 == total3)
        precondition(total3 == total4)
    }
}

func f1(a:[Int], b:[Int], g:(Int) -> ()){
    for x in a {
        g(x)
    }
    for x in b {
        g(x)
    }
}

func f2(a:[Int], b:[Int], g:(Int) -> ()) {
    for x in a + b {
        g(x)
    }
}

func f3(a:[Int], b:[Int], g:(Int) -> ()){
    for x in [a, b].joined() {
        g(x)
    }
}

func f4(a:[Int], b:[Int], g:(Int) -> ()){
    for x in chain(a, b) {
        g(x)
    }
}

You are right.

Thank you for pointing that out, and thank you for the example code.

I was running it from Xcode in release mode. I tried it from the command line with swift run -c=release, which made all the difference.

> swift run -c=release
...
--> f1 delta: 0.00044608116149902344 M = 1000000 N = 1000000
--> f2 delta: 0.03287100791931152   M = 1000000 N = 1000000