I had not. When I tried that in the reduced example, it was the slowest variation, which I do not understand at all. But when used with an .withUnsafeMutableBufferPointer closure it is substantially faster.
Would that require passing it as an inout parameter to avoid making a local copy?
It is a property of the CompiledExpr class that holds the program representation of an Expr.
Yes. I've included it in the test cases below.
Here is a reduced example that shows some of the context and illustrates the performance differences. The performance-critical section is copy/pasted six times with variations.
When the code
Array was a var
property, timing was substantially slower. I should understand why, but I do not.
// ArrayOverheadTimingTests Created by avitzur on 11/19/21.
import Foundation
/// An attempt to extract a skeleton of Graphing Calculator's performance critical innermost loop
/// to investigate abstraction penalties imposed by using Swift Array
///
/// For context:
/// Expr is the tree representation of a user-inputted equation (not necessary for this sample)
/// CompiledExpr holds a linear representation optimized for evaluation
/// Program is the list of instructions
///
/// a single compiledExpr instance can be called to evaluate on different numeric types (and also, not shown here, on a vector of input values)
///
/// program.run is copy/pasted with six variations below to compare timing of different ways
///
/// Building this as a command-line tool, optimized for speed, with Safety Checks disabled, outputs this on my M1 Mini:
/// run0 returns 10100.0 in 1.96s: Local Arrays
/// run1 returns 10100.0 in 1.41s: Local Arrays, pre-allocated, manually indexed
/// run2 returns 10100.0 in 2.13s: Combined Local Array, pre-allocated, manually indexed
/// run3 returns 10100.0 in 0.63s: Combined Local Array, pre-allocated, manually indexed, withUnsafeMutableBufferPointer
/// run4 returns 10100.0 in 0.58s: Passing pre-allocated UnsafeMutableRawBufferPointer
/// run5 returns 10100.0 in 0.57s: Passing pre-allocated UnsafeMutableRawBufferPointer without rebasing
/// run0 returns Interval(min: 10100.0, max: 10100.0) in 2.18s: Local Arrays
/// run1 returns Interval(min: 10100.0, max: 10100.0) in 1.51s: Local Arrays pre-allocated, manually indexed
/// run2 returns Interval(min: 10100.0, max: 10100.0) in 2.12s: Combined Local Array pre-allocated, manually indexed
/// run3 returns Interval(min: 10100.0, max: 10100.0) in 0.78s: Combined Local Array pre-allocated, manually indexed, withUnsafeMutableBufferPointer
/// run4 returns Interval(min: 10100.0, max: 10100.0) in 0.76s: Passing pre-allocated UnsafeMutableRawBufferPointer
/// run5 returns Interval(min: 10100.0, max: 10100.0) in 0.72s: Passing pre-allocated UnsafeMutableRawBufferPointer without rebasing
/// Program ended with exit code: 0
// Since I'm unsure whether or in what ways generic specialization contributes to performance issues,
// here is a toy protocol with two numeric types for timing comparison
protocol Computable {
static var nan: Self { get }
var isNaN: Bool { get }
init(_ x: Double)
static func +(a: Self, b: Self) -> Self
static func /(a: Self, b: Self) -> Self
static func >(a: Self, b: Self) -> Bool
}
extension Double: Computable {}
struct Interval : Computable {
var min: Double
var max: Double
var isNaN: Bool { min.isNaN || max.isNaN }
static let nan = Interval(.nan)
init(_ x: Double) {
self.min = x
self.max = x
}
static func +(a: Self, b: Self) -> Self { Interval(a.min + b.min) }
static func /(a: Self, b: Self) -> Self { Interval(a.min / b.min) }
static func >(a: Self, b: Self) -> Bool { a.min > b.min }
}
enum Instruction: Equatable {
case constant(Double) // push constant onto stack
case x // push graphing variable onto stack
case getVariable(Int) // push CSE variable onto stack
case setVariable(Int) // copy top-of-stack into CSE variable
case getLoopVariable(Int) // push loop variable onto stack
case setLoopVariable(Int) // copy top-of-stack into loop variable
case pop // discard top-of-stack
case loopIf(_ variable: Int, _ branchDistance: Int) // for looping on sums and products
case incrementAndLoop(_ variable: Int, _ branchDistance: Int)
case plus
case result
}
class Program {
let code: [Instruction] // TODO: change to var code and observe timing changes
let numCSEvariables: Int // size of local cseVariables in run()
let numLoopVariables: Int // size of local loopVariables in run()
let maxStackDepth: Int // number of elements needed for stack[]
var stack: UnsafeMutableRawBufferPointer // pre-allocate memory needed by program.run
init() {
code = [ // Sample program to evaluate the (sum of k from 0 to x of k) + (sum of k from 0 to x of k)
.constant(0),
.setLoopVariable(0),
.x,
.setLoopVariable(1),
.pop,
.loopIf(0, 3),
.getLoopVariable(0),
.plus,
.incrementAndLoop(0, 3),
.setVariable(0),
.getVariable(0),
.plus,
.result
]
numCSEvariables = 1
numLoopVariables = 2
maxStackDepth = 4
let n = maxStackDepth + numCSEvariables + numLoopVariables
self.stack = UnsafeMutableRawBufferPointer.allocate(byteCount: n * MemoryLayout<Interval>.stride, alignment: MemoryLayout<Double>.alignment)
}
// Using Array<T> in the most natural way
func run0<T: Computable>(x: T) -> T {
var stack: [T] = []
var cseVariables = [T](repeating: T.nan, count: numCSEvariables)
var loopVariables = [T](repeating: T.nan, count: numLoopVariables)
var y = T.nan // top of stack held separately
var pop: T { stack.removeLast() }
func push(_ x: T) { stack.append(x) }
var pc = 0
execution: while true {
switch code[pc] {
case .constant(let x): push(y); y = T(x)
case .x: push(y); y = x
case .getVariable(let i): push(y); y = cseVariables[i]
case .getLoopVariable(let i): push(y); y = loopVariables[i]
case .setVariable(let i): cseVariables[i] = y
case .setLoopVariable(let i): loopVariables[i] = y
case .plus: y = pop + y
case .pop: y = pop
case .result: break execution
case .loopIf(let loopIndex, let branchDistance):
let k = loopVariables[loopIndex]
let kMax = loopVariables[loopIndex + 1]
if k > kMax { pc += branchDistance }
case .incrementAndLoop(let loopIndex, let branchDistance):
loopVariables[loopIndex] = loopVariables[loopIndex] + T(1)
pc -= branchDistance
continue // skip pc += 1
}
pc += 1
if !(y is Double) && y.isNaN && !stack.isEmpty {
return .nan
}
}
return y
}
// Pre-allocate stack to avoid over overhead imposed by append/removeLast, keep track of position with stack pointer index sp
func run1<T: Computable>(x: T) -> T {
var stack = [T](repeating: T.nan, count: maxStackDepth)
var cseVariables = [T](repeating: T.nan, count: numCSEvariables)
var loopVariables = [T](repeating: T.nan, count: numLoopVariables)
var sp = 0
var y = T.nan // top of stack held separately
var pop: T { sp -= 1; return stack[sp + 1] }
func push(_ x: T) { sp += 1; stack[sp] = x }
var pc = 0
execution: while true {
switch code[pc] {
case .constant(let x): push(y); y = T(x)
case .x: push(y); y = x
case .getVariable(let i): push(y); y = cseVariables[i]
case .getLoopVariable(let i): push(y); y = loopVariables[i]
case .setVariable(let i): cseVariables[i] = y
case .setLoopVariable(let i): loopVariables[i] = y
case .plus: y = pop + y
case .pop: y = pop
case .result: break execution
case .loopIf(let loopIndex, let branchDistance):
let k = loopVariables[loopIndex]
let kMax = loopVariables[loopIndex + 1]
if k > kMax { pc += branchDistance }
case .incrementAndLoop(let loopIndex, let branchDistance):
loopVariables[loopIndex] = loopVariables[loopIndex] + T(1)
pc -= branchDistance
continue // skip pc += 1
}
pc += 1
if !(y is Double) && y.isNaN && sp != 0 {
return .nan
}
}
return y
}
// combine the 3 Array<T> to reduce construction/destruction overhead
func run2<T: Computable>(x: T) -> T {
var stack = [T](repeating: T.nan, count: maxStackDepth + numCSEvariables + numLoopVariables)
var sp = 0
var y = T.nan // top of stack held separately
var pop: T { sp -= 1; return stack[sp + 1] }
func push(_ x: T) { sp += 1; stack[sp] = x }
var pc = 0
execution: while true {
switch code[pc] {
case .constant(let x): push(y); y = T(x)
case .x: push(y); y = x
case .getVariable(let i): push(y); y = stack[i + maxStackDepth]
case .getLoopVariable(let i): push(y); y = stack[i + maxStackDepth + numCSEvariables]
case .setVariable(let i): stack[i + maxStackDepth] = y
case .setLoopVariable(let i): stack[i + maxStackDepth + numCSEvariables] = y
case .plus: y = pop + y
case .pop: y = pop
case .result: break execution
case .loopIf(let loopIndex, let branchDistance):
let k = stack[maxStackDepth + numCSEvariables + loopIndex]
let kMax = stack[maxStackDepth + numCSEvariables + loopIndex + 1]
if k > kMax { pc += branchDistance }
case .incrementAndLoop(let loopIndex, let branchDistance):
stack[maxStackDepth + numCSEvariables + loopIndex] = stack[maxStackDepth + numCSEvariables + loopIndex] + T(1)
pc -= branchDistance
continue // skip pc += 1
}
pc += 1
if !(y is Double) && y.isNaN && sp != 0 {
return .nan
}
}
return y
}
// access stack .withUnsafeMutableBufferPointer
func run3<T: Computable>(x: T) -> T {
var stack = [T](repeating: T.nan, count: maxStackDepth + numCSEvariables + numLoopVariables)
return stack.withUnsafeMutableBufferPointer{ stack in
var sp = 0
var y = T.nan // top of stack held separately
var pop: T { sp -= 1; return stack[sp + 1] }
func push(_ x: T) { sp += 1; stack[sp] = x }
var pc = 0
execution: while true {
switch code[pc] {
case .constant(let x): push(y); y = T(x)
case .x: push(y); y = x
case .getVariable(let i): push(y); y = stack[i + maxStackDepth]
case .getLoopVariable(let i): push(y); y = stack[i + maxStackDepth + numCSEvariables]
case .setVariable(let i): stack[i + maxStackDepth] = y
case .setLoopVariable(let i): stack[i + maxStackDepth + numCSEvariables] = y
case .plus: y = pop + y
case .pop: y = pop
case .result: break execution
case .loopIf(let loopIndex, let branchDistance):
let k = stack[maxStackDepth + numCSEvariables + loopIndex]
let kMax = stack[maxStackDepth + numCSEvariables + loopIndex + 1]
if k > kMax { pc += branchDistance }
case .incrementAndLoop(let loopIndex, let branchDistance):
stack[maxStackDepth + numCSEvariables + loopIndex] = stack[maxStackDepth + numCSEvariables + loopIndex] + T(1)
pc -= branchDistance
continue // skip pc += 1
}
pc += 1
if !(y is Double) && y.isNaN && sp != 0 {
return .nan
}
}
return y
}
}
// pre-allocate stack outside of call
func run4<T: Computable>(x: T, on stack: UnsafeMutableBufferPointer<T>) -> T {
let cseVariables = UnsafeMutableBufferPointer(rebasing: stack[maxStackDepth...])
let loopVariables = UnsafeMutableBufferPointer(rebasing: stack[(maxStackDepth + numCSEvariables)...])
var sp = 0
var y = T.nan // top of stack held separately
var pop: T { sp -= 1; return stack[sp + 1] }
func push(_ x: T) { sp += 1; stack[sp] = x }
var pc = 0
execution: while true {
switch code[pc] {
case .constant(let x): push(y); y = T(x)
case .x: push(y); y = x
case .getVariable(let i): push(y); y = cseVariables[i]
case .getLoopVariable(let i): push(y); y = loopVariables[i]
case .setVariable(let i): cseVariables[i] = y
case .setLoopVariable(let i): loopVariables[i] = y
case .plus: y = pop + y
case .pop: y = pop
case .result: break execution
case .loopIf(let loopIndex, let branchDistance):
let k = loopVariables[loopIndex]
let kMax = loopVariables[loopIndex + 1]
if k > kMax { pc += branchDistance }
case .incrementAndLoop(let loopIndex, let branchDistance):
loopVariables[loopIndex] = loopVariables[loopIndex] + T(1)
pc -= branchDistance
continue // skip pc += 1
}
pc += 1
if !(y is Double) && y.isNaN && sp != 0 {
return .nan
}
}
return y
}
// pre-allocate stack outside of call, single array without rebasing
func run5<T: Computable>(x: T, on stack: UnsafeMutableBufferPointer<T>) -> T {
var sp = 0
var y = T.nan // top of stack held separately
var pop: T { sp -= 1; return stack[sp + 1] }
func push(_ x: T) { sp += 1; stack[sp] = x }
var pc = 0
execution: while true {
switch code[pc] {
case .constant(let x): push(y); y = T(x)
case .x: push(y); y = x
case .getVariable(let i): push(y); y = stack[i + maxStackDepth]
case .getLoopVariable(let i): push(y); y = stack[i + maxStackDepth + numCSEvariables]
case .setVariable(let i): stack[i + maxStackDepth] = y
case .setLoopVariable(let i): stack[i + maxStackDepth + numCSEvariables] = y
case .plus: y = pop + y
case .pop: y = pop
case .result: break execution
case .loopIf(let loopIndex, let branchDistance):
let k = stack[maxStackDepth + numCSEvariables + loopIndex]
let kMax = stack[maxStackDepth + numCSEvariables + loopIndex + 1]
if k > kMax { pc += branchDistance }
case .incrementAndLoop(let loopIndex, let branchDistance):
stack[maxStackDepth + numCSEvariables + loopIndex] = stack[maxStackDepth + numCSEvariables + loopIndex] + T(1)
pc -= branchDistance
continue // skip pc += 1
}
pc += 1
if !(y is Double) && y.isNaN && sp != 0 {
return .nan
}
}
return y
}
}
class CompiledExpr {
var program: Program = Program()
func evaluate0(x: Double) -> Double { program.run0(x: x) }
func evaluate1(x: Double) -> Double { program.run1(x: x) }
func evaluate2(x: Double) -> Double { program.run2(x: x) }
func evaluate3(x: Double) -> Double { program.run3(x: x) }
func evaluate4(x: Double) -> Double { program.run4(x: x, on: program.stack.bindMemory(to: Double.self)) }
func evaluate5(x: Double) -> Double { program.run5(x: x, on: program.stack.bindMemory(to: Double.self)) }
func evaluateAsInterval0(x: Interval) -> Interval { program.run0(x: x) }
func evaluateAsInterval1(x: Interval) -> Interval { program.run1(x: x) }
func evaluateAsInterval2(x: Interval) -> Interval { program.run2(x: x) }
func evaluateAsInterval3(x: Interval) -> Interval { program.run3(x: x) }
func evaluateAsInterval4(x: Interval) -> Interval { program.run4(x: x, on: program.stack.bindMemory(to: Interval.self)) }
func evaluateAsInterval5(x: Interval) -> Interval { program.run5(x: x, on: program.stack.bindMemory(to: Interval.self)) }
}
func time<T: Computable>(_ functionName: String, _ description: String, function: () -> T) {
let iterations = 1000000
let start = DispatchTime.now().uptimeNanoseconds
var sum: T = T(0)
for _ in 1 ... iterations {
sum = sum + function()
}
sum = sum / T(Double(iterations))
let end = DispatchTime.now().uptimeNanoseconds
print("\(functionName) returns \(sum) in \(String(format: "%.2f", Double(end - start)/1.0e9))s: \(description)")
}
func doTimingComparisons() {
let f = CompiledExpr()
time("run0", "Local Arrays") { f.evaluate0(x: 100) }
time("run1", "Local Arrays, pre-allocated, manually indexed") { f.evaluate1(x: 100) }
time("run2", "Combined Local Array, pre-allocated, manually indexed") { f.evaluate2(x: 100) }
time("run3", "Combined Local Array, pre-allocated, manually indexed, withUnsafeMutableBufferPointer") { f.evaluate3(x: 100) }
time("run4", "Passing pre-allocated UnsafeMutableRawBufferPointer") { f.evaluate4(x: 100) }
time("run5", "Passing pre-allocated UnsafeMutableRawBufferPointer without rebasing") { f.evaluate5(x: 100) }
time("run0", "Local Arrays") { f.evaluateAsInterval0(x: Interval(100)) }
time("run1", "Local Arrays pre-allocated, manually indexed") { f.evaluateAsInterval1(x: Interval(100)) }
time("run2", "Combined Local Array pre-allocated, manually indexed") { f.evaluateAsInterval2(x: Interval(100)) }
time("run3", "Combined Local Array pre-allocated, manually indexed, withUnsafeMutableBufferPointer") { f.evaluateAsInterval3(x: Interval(100)) }
time("run4", "Passing pre-allocated UnsafeMutableRawBufferPointer") { f.evaluateAsInterval4(x: Interval(100)) }
time("run5", "Passing pre-allocated UnsafeMutableRawBufferPointer without rebasing") { f.evaluateAsInterval5(x: Interval(100)) }
}
doTimingComparisons()