According to my test program below (when run on my MacBook Pro late 2013, using dev snapshot 2018-02-21)
x = simd_fract(x)
is slower than
x = x - floor(x)`
x
being a simd float2.
I would have expected the opposite.
And also, simd_fast_recip doesn't seem to be faster than simd_precise_recip.
Can anyone explain these results?
Demonstration program:
import AppKit
import simd
// First some code for generating random float2, not very relevant for this
// particular question, but including it was the quickest way for me to put
// together a self-contained demonstration program:
protocol RandomGenerator : class {
func next() -> UInt64
}
protocol PseudoRandomGenerator : RandomGenerator {
associatedtype State
var state: State { get }
init?(state: State)
init(seed: UInt64)
}
final class SplitMix64 : PseudoRandomGenerator {
var state: UInt64
/// Every UInt64 value is a valid SplitMix64 state.
init(state: UInt64) { self.state = state }
init(seed: UInt64) { self.state = seed }
func next() -> UInt64 {
state = state &+ 0x9E3779B97F4A7C15
var z = state
z = (z ^ (z >> UInt64(30))) &* 0xBF58476D1CE4E5B9
z = (z ^ (z >> UInt64(27))) &* 0x94D049BB133111EB
return z ^ (z >> UInt64(31))
}
}
final class Xoroshiro128Plus : PseudoRandomGenerator {
var state: (UInt64, UInt64)
/// The state of Xoroshiro128Plus must not be everywhere zero.
init?(state: (UInt64, UInt64)) {
guard state.0 != 0 || state.1 != 0 else { return nil }
self.state = state
}
init(seed: UInt64) {
// Uses SplitMix64 to scramble the given seed into a valid state:
let sm = SplitMix64(seed: seed)
state = (sm.next(), sm.next())
}
func next() -> UInt64 {
func rol55(_ x: UInt64) -> UInt64 {
return (x << UInt64(55)) | (x >> UInt64(9))
}
func rol36(_ x: UInt64) -> UInt64 {
return (x << UInt64(36)) | (x >> UInt64(28))
}
let result = state.0 &+ state.1
let t = state.1 ^ state.0
state = (rol55(state.0) ^ t ^ (t << UInt64(14)), rol36(t))
return result
}
}
extension PseudoRandomGenerator {
init() {
var seed: UInt64 = 0
withUnsafeMutableBytes(of: &seed) { (ptr) -> Void in
let sc = SecRandomCopyBytes(nil, ptr.count, ptr.baseAddress!)
precondition(sc == errSecSuccess)
}
self.init(seed: seed)
}
}
extension float2 {
init(unitRange value: UInt64) {
let v = unsafeBitCast(value, to: (UInt32, UInt32).self)
let shifts = 32 &- (Float.significandBitCount &+ 1)
let x = Float(v.0 >> shifts)
let y = Float(v.1 >> shifts)
self = float2(x, y) * (Float.ulpOfOne / 2)
}
}
extension UInt64 {
var float2InUnitRange: float2 { return float2(unitRange: self) }
}
// Code more relevant for the question starts here:
extension float2 {
func wrapped(toLowerBound lowerBound: float2,
upperBound: float2) -> float2
{
let measure = upperBound - lowerBound
let recipMeasure = simd_precise_recip(measure)
let scaled = self * recipMeasure
let scaledFract = scaled - floor(scaled)
// Why is the above faster than the following line?
//let scaledFract = simd_fract(scaled)
return simd_muladd(scaledFract, measure, lowerBound)
}
// I would expect the following version to be faster than the above,
// but the opposite shows to be the case when I run this program.
func fastwrapped(toLowerBound lowerBound: float2,
upperBound: float2) -> float2
{
let measure = upperBound - lowerBound
let recipMeasure = simd_fast_recip(measure)
// Using simd_fast_recip doesn't seem to make any difference … why?
let scaled = self * recipMeasure
let scaledFract = simd_fract(scaled)
// Using simd_fract instead of scaled - floor(scaled) seems to make
// it slower rather than faster as I would have expected, why?
return simd_muladd(scaledFract, measure, lowerBound)
}
}
@_transparent
func test(label: String,
brownianMotionSteps: Int,
fn: (float2) -> float2) -> Double
{
print("Running test \"\(label)\":")
var minTime = Double.infinity
let rg = Xoroshiro128Plus()
for trial in 0 ..< 5 {
var sum = float2(0, 0)
var pos = rg.next().float2InUnitRange * float2(2, 2) - float2(1, 1)
let t0 = CACurrentMediaTime()
for _ in 0 ..< brownianMotionSteps {
let deltaPos = simd_muladd(rg.next().float2InUnitRange,
float2(2, 2), float2(-1, -1))
pos = pos + deltaPos
pos = fn(pos)
sum = sum + pos // (just to prevent possible dead code removal)
}
let t1 = CACurrentMediaTime()
minTime = min(minTime, t1 - t0)
print(" trial #\(trial), time: ", t1 - t0, "seconds, sum:", sum)
}
return minTime
}
let num = 100_000_000
let baselineMinTime = test(label: "baseline", brownianMotionSteps: num) {
$0
}
let wrappedMinTime = test(label: "wrapped", brownianMotionSteps: num) {
$0.wrapped(toLowerBound: float2(-1, -1), upperBound: float2(1, 1))
}
let fastwrappedMinTime = test(label: "fastwrapped", brownianMotionSteps: num) {
$0.fastwrapped(toLowerBound: float2(-1, -1), upperBound: float2(1, 1))
}
let correlatedWrappedTime = wrappedMinTime - baselineMinTime
let correlatedFastwrappedTime = fastwrappedMinTime - baselineMinTime
print("\nResult:")
print(String(format:
" The execution time of fastwrapped is %.3f times that of wrapped",
correlatedFastwrappedTime / correlatedWrappedTime))
Compiling and running it (on my MacBook Pro late 2013, 2 GHz Intel Core i7):
› /Library/Developer/Toolchains/swift-DEVELOPMENT-SNAPSHOT-2018-02-21-a.xctoolchain/usr/bin/swiftc --version
Apple Swift version 4.1-dev (LLVM c4ec2ab808, Clang d8b11579e8, Swift c8e37242fe)
Target: x86_64-apple-darwin17.4.0
› /Library/Developer/Toolchains/swift-DEVELOPMENT-SNAPSHOT-2018-02-21-a.xctoolchain/usr/bin/swiftc \
-sdk /Applications/Xcode-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.13.sdk \
-O -whole-module-optimization -gnone -static-stdlib test.swift
› ./test
Running test "baseline":
trial #0, time: 0.188109429000178 seconds, sum: float2(-1.31112e+11, 2.74878e+11)
trial #1, time: 0.186443000995496 seconds, sum: float2(1.69388e+10, -7.23415e+09)
trial #2, time: 0.180021314001351 seconds, sum: float2(-2.51869e+10, 9.0047e+10)
trial #3, time: 0.185760033004044 seconds, sum: float2(-1.37439e+11, 1.37439e+11)
trial #4, time: 0.194156596997345 seconds, sum: float2(2.74878e+11, 2.74878e+11)
Running test "wrapped":
trial #0, time: 0.77032812100515 seconds, sum: float2(-3730.34, -7811.03)
trial #1, time: 0.766150342999026 seconds, sum: float2(9844.67, -2780.4)
trial #2, time: 0.779085449998092 seconds, sum: float2(4719.87, -3515.87)
trial #3, time: 0.769905059998564 seconds, sum: float2(2734.79, -6225.55)
trial #4, time: 0.767759139998816 seconds, sum: float2(-3272.91, -2379.1)
Running test "fastwrapped":
trial #0, time: 0.869073567999294 seconds, sum: float2(3486.23, -3510.17)
trial #1, time: 0.873843446999672 seconds, sum: float2(-3849.97, -267.973)
trial #2, time: 0.862285786002758 seconds, sum: float2(3000.18, 8486.01)
trial #3, time: 0.861077888999716 seconds, sum: float2(-9836.21, -5104.47)
trial #4, time: 0.870769528002711 seconds, sum: float2(-1675.35, 2734.05)
Result:
The execution time of fastwrapped is 1.162 times that of wrapped