ARC, tail recursion

While looking at a stack overflow crash, it looks like

var leftMost: Expr { count == 0 ? self : args[0].leftMost }

is not benefiting from tail call optimization.

final class Expr : Sequence, Equatable, Hashable {
    unowned(unsafe) var parent: Expr? = nil
    var op: OpCode
    var args: ContiguousArray<Expr> 
...
}

Is the retain/release inhibiting tail-call optimization?

Graphing Calculator`Expr.leftMost.getter:
    0x102eb6900 <+0>:  stp    x20, x19, [sp, #-0x20]!
    0x102eb6904 <+4>:  stp    x29, x30, [sp, #0x10]
    0x102eb6908 <+8>:  add    x29, sp, #0x10
->  0x102eb690c <+12>: ldr    x8, [x20, #0x38]
    0x102eb6910 <+16>: ldr    x9, [x8, #0x10]
    0x102eb6914 <+20>: cbz    x9, 0x102eb6944           ; <+68> at Expr.swift:218:40
    0x102eb6918 <+24>: ldr    x20, [x8, #0x20]
    0x102eb691c <+28>: mov    x0, x20
    0x102eb6920 <+32>: bl     0x1033a9688               ; symbol stub for: swift_retain
    0x102eb6924 <+36>: bl     0x102eb6900               ; <+0> at Expr.swift:218
    0x102eb6928 <+40>: mov    x19, x0
    0x102eb692c <+44>: mov    x0, x20
    0x102eb6930 <+48>: bl     0x1033a9670               ; symbol stub for: swift_release
    0x102eb6934 <+52>: mov    x0, x19
    0x102eb6938 <+56>: ldp    x29, x30, [sp, #0x10]
    0x102eb693c <+60>: ldp    x20, x19, [sp], #0x20
    0x102eb6940 <+64>: ret    
    0x102eb6944 <+68>: mov    x0, x20
    0x102eb6948 <+72>: ldp    x29, x30, [sp, #0x10]
    0x102eb694c <+76>: ldp    x20, x19, [sp], #0x20
    0x102eb6950 <+80>: b      0x1033a9688               ; symbol stub for: swift_retain

It is straightforward to avoid the recursion manually with

    var leftMost: Expr {
        var expr = self
        while expr.count > 0 { expr = expr.args[0] }
        return expr
    }

but is there any way to avoid the retain/release while stepping down the child nodes?

2 Likes

Try unowned var expr = self

What's the max child count in a node (if any)?

No maximum. In practice, the number is typically very small.

This particular function is one of a dozen with the same issue, so it would take some work to change them all and measure the effect. Looking at the generated code:

    @inline(never)
    var leftMost: Expr {
        var expr = self
        while expr.count > 0 { expr = expr.args[0] }
        return expr
    }

generates

Calculator`Expr.leftMost.getter:
    0x1030cf244 <+0>:  stp    x22, x21, [sp, #-0x30]!
    0x1030cf248 <+4>:  stp    x20, x19, [sp, #0x10]
    0x1030cf24c <+8>:  stp    x29, x30, [sp, #0x20]
    0x1030cf250 <+12>: add    x29, sp, #0x20
    0x1030cf254 <+16>: ldr    x21, [x20, #0x38]
    0x1030cf258 <+20>: ldr    x19, [x21, #0x10]
    0x1030cf25c <+24>: mov    x0, x20
    0x1030cf260 <+28>: bl     0x1035c1688               ; symbol stub for: swift_retain
->  0x1030cf264 <+32>: cbz    x19, 0x1030cf290          ; <+76> at Expr.swift
    0x1030cf268 <+36>: ldr    x19, [x21, #0x20]
    0x1030cf26c <+40>: mov    x0, x19
    0x1030cf270 <+44>: bl     0x1035c1688               ; symbol stub for: swift_retain
    0x1030cf274 <+48>: mov    x0, x20
    0x1030cf278 <+52>: bl     0x1035c1670               ; symbol stub for: swift_release
    0x1030cf27c <+56>: ldr    x21, [x19, #0x38]
    0x1030cf280 <+60>: ldr    x8, [x21, #0x10]
    0x1030cf284 <+64>: mov    x20, x19
    0x1030cf288 <+68>: cbnz   x8, 0x1030cf268           ; <+36> [inlined] generic specialization <Graphing_Calculator.Expr> of Swift.ContiguousArray.subscript.getter : (Swift.Int) -> τ_0_0 at <compiler-generated>
    0x1030cf28c <+72>: b      0x1030cf294               ; <+80> at Expr.swift:223:9
    0x1030cf290 <+76>: mov    x19, x20
    0x1030cf294 <+80>: mov    x0, x19
    0x1030cf298 <+84>: ldp    x29, x30, [sp, #0x20]
    0x1030cf29c <+88>: ldp    x20, x19, [sp, #0x10]
    0x1030cf2a0 <+92>: ldp    x22, x21, [sp], #0x30
    0x1030cf2a4 <+

while

    @inline(never)
    var leftMost: Expr {
        unowned var expr = self
        while expr.count > 0 { expr = expr.args[0] }
        return expr

generates

Graphing Calculator`Expr.leftMost.getter:
    0x100ba77e4 <+0>:   stp    x22, x21, [sp, #-0x30]!
    0x100ba77e8 <+4>:   stp    x20, x19, [sp, #0x10]
    0x100ba77ec <+8>:   stp    x29, x30, [sp, #0x20]
    0x100ba77f0 <+12>:  add    x29, sp, #0x20
->  0x100ba77f4 <+16>:  mov    x0, x20
    0x100ba77f8 <+20>:  bl     0x1010997c0               ; symbol stub for: swift_unownedRetainStrong
    0x100ba77fc <+24>:  ldr    x8, [x20, #0x38]
    0x100ba7800 <+28>:  ldr    x19, [x8, #0x10]
    0x100ba7804 <+32>:  bl     0x1010997b4               ; symbol stub for: swift_unownedRetain
    0x100ba7808 <+36>:  bl     0x101099670               ; symbol stub for: swift_release
    0x100ba780c <+40>:  cbz    x19, 0x100ba7860          ; <+124> at Expr.swift
    0x100ba7810 <+44>:  mov    x0, x20
    0x100ba7814 <+48>:  bl     0x1010997c0               ; symbol stub for: swift_unownedRetainStrong
    0x100ba7818 <+52>:  ldr    x8, [x20, #0x38]
    0x100ba781c <+56>:  ldr    x9, [x8, #0x10]
    0x100ba7820 <+60>:  cbz    x9, 0x100ba7884           ; <+160> [inlined] Swift runtime failure: Index out of range at <compiler-generated>
    0x100ba7824 <+64>:  ldr    x19, [x8, #0x20]
    0x100ba7828 <+68>:  mov    x0, x19
    0x100ba782c <+72>:  bl     0x1010997b4               ; symbol stub for: swift_unownedRetain
    0x100ba7830 <+76>:  mov    x0, x20
    0x100ba7834 <+80>:  bl     0x1010997a8               ; symbol stub for: swift_unownedRelease
    0x100ba7838 <+84>:  mov    x0, x20
    0x100ba783c <+88>:  bl     0x101099670               ; symbol stub for: swift_release
    0x100ba7840 <+92>:  mov    x0, x19
    0x100ba7844 <+96>:  bl     0x1010997c0               ; symbol stub for: swift_unownedRetainStrong
    0x100ba7848 <+100>: ldr    x8, [x19, #0x38]
    0x100ba784c <+104>: ldr    x21, [x8, #0x10]
    0x100ba7850 <+108>: bl     0x101099670               ; symbol stub for: swift_release
    0x100ba7854 <+112>: mov    x20, x19
    0x100ba7858 <+116>: cbnz   x21, 0x100ba7810          ; <+44> at Expr.swift:222:39
    0x100ba785c <+120>: b      0x100ba7864               ; <+128> at Expr.swift:223:16
    0x100ba7860 <+124>: mov    x19, x20
    0x100ba7864 <+128>: mov    x0, x19
    0x100ba7868 <+132>: bl     0x1010997c0               ; symbol stub for: swift_unownedRetainStrong
    0x100ba786c <+136>: bl     0x1010997a8               ; symbol stub for: swift_unownedRelease
    0x100ba7870 <+140>: mov    x0, x19
    0x100ba7874 <+144>: ldp    x29, x30, [sp, #0x20]
    0x100ba7878 <+148>: ldp    x20, x19, [sp, #0x10]
    0x100ba787c <+152>: ldp    x22, x21, [sp], #0x30
    0x100ba7880 <+156>: ret    
    0x100ba7884 <+160>: brk    #0x1

but I am in over my depths to interpret any of that.

What's count, is it a computed variable, can it cause ARC traffic?

would that make a difference?

  unowned(unsafe) var e = expr

ps. i presume the above output is for "release" mode?

pps. if count is a wrapper for "args.count" (as it seems) then you may probably change the code to use "args.first" instead.

The count property is just shorthand for args.count

Yes, definitely.

    @inline(never)
    var leftMost: Expr {
        unowned(unsafe) var expr = self
        while expr.count > 0 { expr = expr.args.first! }
        return expr
    }

generates

Graphing Calculator`Expr.leftMost.getter:
    0x104c137e4 <+0>:  stp    x20, x19, [sp, #-0x20]!
    0x104c137e8 <+4>:  stp    x29, x30, [sp, #0x10]
    0x104c137ec <+8>:  add    x29, sp, #0x10
    0x104c137f0 <+12>: ldr    x0, [x20, #0x38]
    0x104c137f4 <+16>: ldr    x8, [x0, #0x10]
->  0x104c137f8 <+20>: cbz    x8, 0x104c1380c           ; <+40> at Expr.swift:223:16
    0x104c137fc <+24>: bl     0x105105688               ; symbol stub for: swift_retain
    0x104c13800 <+28>: ldr    x20, [x0, #0x20]
    0x104c13804 <+32>: bl     0x105105670               ; symbol stub for: swift_release
    0x104c13808 <+36>: b      0x104c137f0               ; <+12> [inlined] Graphing_Calculator.Expr.count.getter : Swift.Int at Expr.swift:124:22
    0x104c1380c <+40>: mov    x0, x20
    0x104c13810 <+44>: ldp    x29, x30, [sp, #0x10]
    0x104c13814 <+48>: ldp    x20, x19, [sp], #0x20
    0x104c13818 <+52>: b      0x105105688               ; symbol stub for: swift_retain

and this one any better?

while expr.args.first != nil { expr = expr.args.first! }

you may later on optimise this to calculate the expression once, perhaps by using another unowned variable.

ps. also, consider switching to value types! It might be a big change to your code base but the end result might well worth it.

Swift will not let you do anything with a value without it being retained, or without a promise that someone else is retaining it in the mean time. There aren't many generally supported ways to spell the latter if it can't be inferred from the structure of the code, though there have been various proposals to add such functionality (which I'm having trouble finding right now).

In this case, the code in question is enough to guarantee that, since the compiler can see that indeed no mutations can happen at all here. So I'd say this is a missed optimization opportunity and leave it at that.

3 Likes

I wonder if this is a regression. I thought I inspected this about a year ago and found that the recursive version both elided out the retain/release and also did the tail-recursion optimization. But I didn't take exhaustive notes at the time, so I may be misremembering.

It's certainly possible, I bet some of the work for A roadmap for improving Swift performance predictability: ARC improvements and ownership control resulted in regressions that weren't strictly related to that model change.

2 Likes

That generated identical code to while expr.count > 0 { expr = expr.args.first! }

I've been considering it in depth while rewriting all the ancient C++ code over the last year, but I fear I would only be trading one set of issues for another. I couldn't see how to do it in a way that simplified the structure of the various parts using Expr. On the one hand, the C++ is significantly faster. However, it was not thread-safe and it was extremely fragile doing all memory management manually. The Swift rewrite using ARC does the same work in about 25% the lines of source code and is significantly less fragile to use, so I'm willing to accept a performance penalty for using ARC if necessary. Having said that, I keep posting these questions to ensure that I'm not leaving any performance gains on the ground just out of ignorance.

Should I post this issue to bugs.swift.org? Or is that on github now?

I see. Yet another option could be manual linked lists instead of arrays.

What is the slowdown compared to C++?

Swift Bugs will suggest you using GitHub Issues on the first page.

It depends on what the user is doing. For most things: none. Most things are compute-bound in numeric computations, for which the Swift code has comparable performance to the C++ code now (after a learning curve using the Unsafe APIs.) The ARC issues only arise where the code is compute bound on symbolic computation of expression trees, which can arise from documents with ridiculously large equations or a complex series of definitions that generate large equations internally. https://twitter.com/RonAvitzur/status/1461023215724097536 details diving into one such example. The last document I measured that was compute bound in symbolic code showed a 3x slowdown in the Swift version, but I haven't looked at enough cases to know if that is typical.

Submitted to missed optimization opportunity for elimination of retain/release and tail recursion · Issue #58549 · apple/swift · GitHub

2 Likes

You may find the following applicable to your case. I played with several tree implementations, value and reference types, binary and non binary, here is the result of that microbenchmark (in each case it's creating a tree with 16M nodes and doing various typical operations on the tree: traverse, reverse, clone, delete). All times are in ms (the smaller the better):

				create  travers reverse	clone 	delete
RefTree			2258	716		1143	9541	3661
RefBinTree		1114	390		532		1891	2246
ValueTree		1113	371		350		2159	1547
ValueBinTree 	1104	474		3537	3400	3656
RawBinTree		258		224		176		0		23
The main data structures used.
class RefTree: Tree {
    var id: Int
    var value: Int
    var nodes: [RefTree]
}

class RefBinTree: Tree {
    var id: Int
    var value: Int
    var left: RefBinTree?
    var right: RefBinTree?
}

struct ValueTree: Tree {
    var id: Int
    var value: Int
    var nodes: [ValueTree]
}

struct ValueBinTree: Tree {
    var id: Int
    var value: Int
    var left: MyOptional<ValueBinTree>
    var right: MyOptional<ValueBinTree>
}

struct RawBinTreeNode {
    var value: Int
    var left: Int = 0
    var right: Int = 0
}

struct RawBinTree: Tree {
    var id: Int
}

MyOptional is like Optional - I wasn't use Optional in a recursive data structure - not possible to put "indirect" on a struct or it's member.

Note on the RawBinTree.

RawBinTree is a custom data structure that's neither value nor reference - it's using fixed sized nodes, referenced by their Int ids, nodes are allocated from an arena (which is resized via realloc), deleted nodes are put on a linked list of free nodes. The reason clone / delete is fast is because it is a single realloc + memmove. Raw tree can only hold simple value types (like ints / enums without associated values or with simple associated values), more complex dynamic types like String or Array are not supported although fixed sized short strings (e.g. typical formula labels like "x") can be implemented "inline" and more complex types can be placed in an out of place table with entries referenced via short integer indices. If I were to implement a non binary variant of "RawBinTree" I'd probably go with a fixed sized nodes still:

struct RawTreeNode {
    var value: Int
    var right: Int = 0 // sibling nodes, 0 terminated
    var child: Int = 0 // first child node
}

1 Like