SIMD performance question

According to my test program below (when run on my MacBook Pro late 2013, using dev snapshot 2018-02-21)

x = simd_fract(x)

is slower than

x = x - floor(x)

where x is a simd float2.

I would have expected the opposite.

Also, simd_fast_recip doesn't seem to be any faster than simd_precise_recip.

Can anyone explain these results?


Demonstration program:

import AppKit
import simd

// First some code for generating random float2, not very relevant for this
// particular question, but including it was the quickest way for me to put
// together a self-contained demonstration program:

protocol RandomGenerator : class {
    func next() -> UInt64
}
protocol PseudoRandomGenerator : RandomGenerator {
    associatedtype State
    var state: State { get }
    init?(state: State)
    init(seed: UInt64)
}
final class SplitMix64 : PseudoRandomGenerator {
    var state: UInt64
    /// Every UInt64 value is a valid SplitMix64 state.
    init(state: UInt64) { self.state = state }
    init(seed: UInt64) { self.state = seed }
    func next() -> UInt64 {
        state = state &+ 0x9E3779B97F4A7C15
        var z = state
        z = (z ^ (z >> UInt64(30))) &* 0xBF58476D1CE4E5B9
        z = (z ^ (z >> UInt64(27))) &* 0x94D049BB133111EB
        return z ^ (z >> UInt64(31))
    }
}
final class Xoroshiro128Plus : PseudoRandomGenerator {
    var state: (UInt64, UInt64)
    /// The state of Xoroshiro128Plus must not be everywhere zero.
    init?(state: (UInt64, UInt64)) {
        guard state.0 != 0 || state.1 != 0 else { return nil }
        self.state = state
    }
    init(seed: UInt64) {
        // Uses SplitMix64 to scramble the given seed into a valid state:
        let sm = SplitMix64(seed: seed)
        state = (sm.next(), sm.next())
    }
    func next() -> UInt64 {
        func rol55(_ x: UInt64) -> UInt64 {
            return (x << UInt64(55)) | (x >> UInt64(9))
        }
        func rol36(_ x: UInt64) -> UInt64 {
            return (x << UInt64(36)) | (x >> UInt64(28))
        }
        let result = state.0 &+ state.1
        let t = state.1 ^ state.0
        state = (rol55(state.0) ^ t ^ (t << UInt64(14)), rol36(t))
        return result
    }
}
extension PseudoRandomGenerator {
    init() {
        var seed: UInt64 = 0
        withUnsafeMutableBytes(of: &seed) { (ptr) -> Void in
            let sc = SecRandomCopyBytes(nil, ptr.count, ptr.baseAddress!)
            precondition(sc == errSecSuccess)
        }
        self.init(seed: seed)
    }
}

extension float2 {
    init(unitRange value: UInt64) {
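        // Split the 64 random bits into two 32-bit halves, keep the top
        // 24 bits of each half (a Float significand holds 24 bits counting
        // the implicit bit), and scale by 2^-24 to land in [0, 1).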
        let v = unsafeBitCast(value, to: (UInt32, UInt32).self)
        let shifts = 32 &- (Float.significandBitCount &+ 1)
        let x = Float(v.0 >> shifts)
        let y = Float(v.1 >> shifts)
        self = float2(x, y) * (Float.ulpOfOne / 2)
    }
}
extension UInt64 {
    var float2InUnitRange: float2 { return float2(unitRange: self) }
}

// Code more relevant for the question starts here:

extension float2 {
    func wrapped(toLowerBound lowerBound: float2,
                 upperBound: float2) -> float2
    {
        let measure = upperBound - lowerBound
        let recipMeasure = simd_precise_recip(measure)
        let scaled = self * recipMeasure
        let scaledFract = scaled - floor(scaled)
        // Why is the above faster than the following line?
        //let scaledFract = simd_fract(scaled)
        return simd_muladd(scaledFract, measure, lowerBound)
    }
    
    // I would expect the following version to be faster than the above,
    // but the opposite turns out to be the case when I run this program.
    func fastwrapped(toLowerBound lowerBound: float2,
                 upperBound: float2) -> float2
    {
        let measure = upperBound - lowerBound
        let recipMeasure = simd_fast_recip(measure)
        // Using simd_fast_recip doesn't seem to make any difference … why?
        let scaled = self * recipMeasure
        let scaledFract = simd_fract(scaled)
        // Using simd_fract instead of scaled - floor(scaled) seems to make
        // it slower rather than faster as I would have expected, why?
        return simd_muladd(scaledFract, measure, lowerBound)
    }
}

@_transparent
func test(label: String,
          brownianMotionSteps: Int,
          fn: (float2) -> float2) -> Double
{
    print("Running test \"\(label)\":")
    var minTime = Double.infinity
    let rg = Xoroshiro128Plus()
    for trial in 0 ..< 5 {
        var sum = float2(0, 0)
        var pos = rg.next().float2InUnitRange * float2(2, 2) - float2(1, 1)
        let t0 = CACurrentMediaTime()
        for _ in 0 ..< brownianMotionSteps {
            let deltaPos = simd_muladd(rg.next().float2InUnitRange,
                                       float2(2, 2), float2(-1, -1))
            pos = pos + deltaPos
            pos = fn(pos)
            sum = sum + pos // (just to prevent possible dead code removal)
        }
        let t1 = CACurrentMediaTime()
        minTime = min(minTime, t1 - t0)
        print("  trial #\(trial), time: ", t1 - t0, "seconds, sum:", sum)
    }
    return minTime
}

let num = 100_000_000
let baselineMinTime = test(label: "baseline", brownianMotionSteps: num) {
    $0
}
let wrappedMinTime = test(label: "wrapped", brownianMotionSteps: num) {
    $0.wrapped(toLowerBound: float2(-1, -1), upperBound: float2(1, 1))
}
let fastwrappedMinTime = test(label: "fastwrapped", brownianMotionSteps: num) {
    $0.fastwrapped(toLowerBound: float2(-1, -1), upperBound: float2(1, 1))
}

let correlatedWrappedTime = wrappedMinTime - baselineMinTime
let correlatedFastwrappedTime = fastwrappedMinTime - baselineMinTime

print("\nResult:")
print(String(format:
    "  The execution time of fastwrapped is %.3f times that of wrapped",
             correlatedFastwrappedTime / correlatedWrappedTime))

Compiling and running it (on my MacBook Pro late 2013, 2 GHz Intel Core i7):

› /Library/Developer/Toolchains/swift-DEVELOPMENT-SNAPSHOT-2018-02-21-a.xctoolchain/usr/bin/swiftc --version
Apple Swift version 4.1-dev (LLVM c4ec2ab808, Clang d8b11579e8, Swift c8e37242fe)
Target: x86_64-apple-darwin17.4.0

› /Library/Developer/Toolchains/swift-DEVELOPMENT-SNAPSHOT-2018-02-21-a.xctoolchain/usr/bin/swiftc \
-sdk /Applications/Xcode-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.13.sdk \
-O -whole-module-optimization -gnone -static-stdlib test.swift

› ./test
Running test "baseline":
  trial #0, time:  0.188109429000178 seconds, sum: float2(-1.31112e+11, 2.74878e+11)
  trial #1, time:  0.186443000995496 seconds, sum: float2(1.69388e+10, -7.23415e+09)
  trial #2, time:  0.180021314001351 seconds, sum: float2(-2.51869e+10, 9.0047e+10)
  trial #3, time:  0.185760033004044 seconds, sum: float2(-1.37439e+11, 1.37439e+11)
  trial #4, time:  0.194156596997345 seconds, sum: float2(2.74878e+11, 2.74878e+11)
Running test "wrapped":
  trial #0, time:  0.77032812100515 seconds, sum: float2(-3730.34, -7811.03)
  trial #1, time:  0.766150342999026 seconds, sum: float2(9844.67, -2780.4)
  trial #2, time:  0.779085449998092 seconds, sum: float2(4719.87, -3515.87)
  trial #3, time:  0.769905059998564 seconds, sum: float2(2734.79, -6225.55)
  trial #4, time:  0.767759139998816 seconds, sum: float2(-3272.91, -2379.1)
Running test "fastwrapped":
  trial #0, time:  0.869073567999294 seconds, sum: float2(3486.23, -3510.17)
  trial #1, time:  0.873843446999672 seconds, sum: float2(-3849.97, -267.973)
  trial #2, time:  0.862285786002758 seconds, sum: float2(3000.18, 8486.01)
  trial #3, time:  0.861077888999716 seconds, sum: float2(-9836.21, -5104.47)
  trial #4, time:  0.870769528002711 seconds, sum: float2(-1675.35, 2734.05)

Result:
  The execution time of fastwrapped is 1.162 times that of wrapped

The definition of simd_fract(x) is precisely "x - floor(x) clamped to [0,1)" (fract is defined this way because without the clamp, x - floor(x) produces 1.0 for tiny negative numbers, which breaks some algorithms). Why do you expect it to be faster with an additional clamp? (I'm genuinely curious, because it seems like a documentation bug if this isn't clear).
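
For concreteness, here's a scalar sketch of that definition (a minimal illustration of the semantics, not the actual library source):

// x - floor(x), clamped so the result stays strictly below 1:
func fractSketch(_ x: Float) -> Float {
    let f = x - x.rounded(.down)         // x - floor(x)
    return min(f, Float(1).nextDown)     // nextDown(1) == 0x1.fffffep-1
}

let tiny: Float = -1e-9
print(tiny - tiny.rounded(.down))  // 1.0 (x - floor(x) rounds up to 1 here)
print(fractSketch(tiny))           // 0.99999994 (the clamp keeps it in [0, 1))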

As for the lack of difference between fast_recip and precise_recip: the argument to the function is a compile-time constant, so I expect that the optimizer is able to propagate it through so that neither is actually evaluated at run time. I'm not set up to test this right now, but a quick glance at the disassembly would confirm it.
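
If you want to check, one way is to make the bounds opaque to the optimizer so that the reciprocal actually has to be computed at run time. A minimal sketch against the test harness above (reading the bounds from the command line is just an arbitrary way to defeat constant folding):

// Hypothetical change to the test harness: the bounds now come from
// outside the compilation unit, so simd_fast_recip(upperBound - lowerBound)
// can no longer be folded away at compile time.
let args = CommandLine.arguments
let lo = Float(args.count > 1 ? args[1] : "-1")!
let hi = Float(args.count > 2 ? args[2] : "1")!
let lowerBound = float2(lo, lo)
let upperBound = float2(hi, hi)

Passing these in place of the float2(-1, -1) and float2(1, 1) literals should expose any real difference between the two reciprocals.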


I haven't looked at your actual algorithm, but I can say this from past experience: be careful about register traffic.

If you end up moving numbers between ordinary floating-point registers and vector registers frequently, using SIMD instructions can actually reduce performance. You will need to use the SIMD versions of many operations (ones you would not normally use SIMD instructions for) to ensure intermediate results stay in vector registers. Such instructions may not have an inherent performance advantage, but by keeping intermediate results in vector registers they improve overall performance.
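
As a hypothetical illustration of the pattern to watch for (the optimizer may well clean up a case this trivial, but the shape is the point):

import simd

// Lane-by-lane math forces the value out of the vector registers and back:
func scaledLanewise(_ v: float2, by s: Float) -> float2 {
    let x = v.x * s      // vector -> scalar register
    let y = v.y * s
    return float2(x, y)  // scalar -> vector register
}

// The all-vector form keeps the intermediate result in vector registers:
func scaledVectorwise(_ v: float2, by s: Float) -> float2 {
    return v * s
}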


Thanks! That certainly explains it. I simply hadn't looked up the definition of simd_fract(x), and didn't realize that it is implemented on top of floor with an added clamp. I naively assumed that simd_fract(x) involved fewer operations (one) than x - floor(x) (two); I see now that that was a strange assumption to make. The only documentation I had read was this:

/*! @abstract The "fractional part" of x, lying in the range [0, 1).
 *  @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
 *  positive and finite, then the two values are exactly equal.               */
public func simd_fract(_ x: simd_float2) -> simd_float2

EDIT: I've now realized that there is another (the same?) function, called just fract, whose documentation is more detailed and mentions both x - floor(x) and the clamp:

/// `x - floor(x)`, clamped to lie in the range [0,1).  Without this clamp step,
/// the result would be 1.0 when `x` is a very small negative number, which may
/// result in out-of-bounds table accesses in common usage.
public func fract(_ x: float2) -> float2
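
That clamp is easy to motivate with the table-lookup pattern the comment alludes to; here's a sketch with a hypothetical four-entry table:

import simd

let table: [Float] = [10, 20, 30, 40]          // valid indices are 0...3

func lookup(_ x: Float) -> Float {
    let f = simd_fract(x)                      // guaranteed to lie in [0, 1)
    return table[Int(f * Float(table.count))]  // so the index is always 0...3
}

// With unclamped x - floor(x), a tiny negative x would give 1.0 and thus
// index 4, one past the end of the table.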

Of course, thanks!

Yeah, fract is just a transparent Swift wrapper around simd_fract. There's some cleanup that needs to happen here.
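
Presumably something like this (a guess at the shape, not the actual overlay source):

@_transparent
public func fract(_ x: float2) -> float2 {
    return simd_fract(x)
}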

Yes, I noticed that simd_fract has versions for a single Float or Double argument, whereas the fract versions only take simd vectors.