I've previously reported .ulpOfOne
being unnecessarily slow (SR-6919), and below is a program that demonstrates a similar issue: Float
's .nextUp
being about 50 times slower than an alternative implementation.
The situation is probably the same for Double
's .nextUp
and for .nextDown
, and, as mentioned in the comments of the program, .greatestFiniteMagnitude
and .infinity
are also unexpectedly slow, and I assume there are many more.
It's kind of a sad situation, that standard things like these can't be trusted performance-wise, and that I have to spend large amounts of my time investigating and writing alternative implementations of them (instead of writing my actual algorithms and apps).
Since I'd rather not prepare and file a detailed SR for every similar case, I'm just posting this example here and asking these questions:
-
Is there a known common cause for these issues (eg some planned optimization that is not yet performed by the compiler)?
-
Is there a priority or road-map for fixing/improving them?
-
In what not-very-time-consuming way(s) can I help getting them fixed/improved?
I'd also like to know if this microbenchmark is misleading in any way!
The program:
import AppKit
extension Float {
/// Example of an alternative implementation of `.nextUp`, aiming to
/// demonstrate that the standard `.nextUp` implementation is probably in
/// the order of 50 times slower than necessary.
var altNextUp: Float {
if self.isFinite {
let s = self + 0.0 // if self is -0.0 { s = 0.0 } else { s = self }
let signBit = s.bitPattern >> 31
return Float(bitPattern:
s.bitPattern &+ (UInt32.max &- 1) &* signBit &+ 1)
} else if self.bitPattern == 0b1_11111111_00000000000000000000000 {
// For -inf, return -greatestFiniteMagnitude:
return Float(bitPattern: 0b1_11111110_11111111111111111111111)
} else {
// Can only be +inf or nan here, result should be self for both.
return self
}
// Regarding the binary bit patterns above: Using the equivalent
// "self == -Float.infinity" and
// "return -Float.greatestFiniteMagnitude"
// proved to be a lot slower than the above formulation.
// (So .infinity and .greatestFiniteMagnitude seems to be slower than
// necessary too. But self.isFinite (at the top) seems to be as fast
// as the equivalent: "self.bitPattern & 0x7f800000 != 0x7f800000".)
}
}
func validateAltNextUp() {
print("Checking that f.altNextUp matches f.nextUp for all Float values f:")
for i in UInt32.min ... UInt32.max {
let f = Float(bitPattern: i)
if f.altNextUp.bitPattern != f.nextUp.bitPattern {
print(" Invalid value for Float(bitPattern: \(i)).myNextUp")
exit(1)
}
if i & 0x1fffffff == 0x1fffffff {
let progress = (Double(i) / Double(UInt32.max) * 100).rounded()
print(" \(Int(progress))% OK")
}
}
}
func benchmark(_ label: String,
category: RandomSource.RandomFloatValueCategory,
fn: (Float) -> Float) -> Double
{
print(" \(label):")
var minTime = Double.infinity
let testCount = 5
var randomFloats = [Float](repeating: 0.0, count: 100_000_000)
let rs = RandomSource()
for _ in 0 ..< testCount {
for i in randomFloats.indices {
randomFloats[i] = rs.randomFloatValue(category: category)
}
var checksum = UInt32(0)
let t0 = CACurrentMediaTime()
for f in randomFloats {
let r = fn(f)
checksum ^= r.bitPattern
}
let t1 = CACurrentMediaTime()
print(" time:", t1 - t0, "seconds (checksum: \(checksum))")
minTime = min(minTime, t1 - t0)
}
return minTime
}
final class RandomSource {
let bufferSize = 1024 * 1024
var buffer: UnsafeMutableRawPointer
var offset = 0
init() {
buffer = .allocate(byteCount: bufferSize, alignment: 64)
reset()
}
deinit {
buffer.deallocate()
}
private func reset() {
let r = SecRandomCopyBytes(nil, bufferSize, buffer)
precondition(r == errSecSuccess)
offset = 0
}
func unsafeNextRandomBitPattern<T>(asType _: T.Type) -> T {
let byteCount = MemoryLayout<T>.size
if offset &+ byteCount > bufferSize { reset() }
let value = buffer.load(fromByteOffset: offset, as: T.self)
offset = offset &+ byteCount
return value
}
}
extension RandomSource {
enum RandomFloatValueCategory {
/// Random `Float` values (including NaN and infinity).
/// The distribution of different kinds of values looks
/// something like this:
///
/// .isFinite: 99.610000 %
/// .isNaN: 0.390000 %
/// .isInfinite: 0.000001 %
/// .isNormal: 99.220000 %
/// .isSubnormal: 0.390000 %
case any
/// Random finite `Float` values (any value other than NaN and infinity).
case finite
/// Uniformly distributed random unit range [0.0, 1.0) `Float` values.
case unitRange
/// Uniformly distributed random unit range [-1.0, 1.0) `Float` values.
case symmetricUnitRange
/// Random NaN `Float` values.
case nan
/// Random ±infinity `Float` values.
case infinity
}
func randomFloatValue(category: RandomFloatValueCategory) -> Float {
while true {
let bp = unsafeNextRandomBitPattern(asType: UInt32.self)
switch category {
case .any:
return Float(bitPattern: bp)
case .finite:
let f = Float(bitPattern: bp)
if f.isFinite { return f }
case .unitRange:
let shifts = 32 &- (Float.significandBitCount &+ 1)
// Since .ulpOfOne is also slower than necessary (SR-6919):
let altUlpOfOne = Float(bitPattern: 0x34000000)
return Float(bp >> shifts) * (altUlpOfOne / 2.0)
case .symmetricUnitRange:
let shifts = 32 &- (Float.significandBitCount &+ 1)
let altUlpOfOne = Float(bitPattern: 0x34000000)
let f = Float(bp >> shifts) * (altUlpOfOne / 2.0)
return f * 2.0 - 1.0
case .nan:
let f = Float(bitPattern: bp | 0x7f800000)
// f can be ±inf if all significand-bits happen to be zero, so:
if f.isNaN { return f }
case .infinity:
return Float(bitPattern: (bp & 0x80000000) | 0x7f800000)
}
}
}
}
validateAltNextUp()
let floatValueCategoriesToTest: [RandomSource.RandomFloatValueCategory] = [
.any,
.finite,
.unitRange,
.symmetricUnitRange,
.nan,
.infinity
]
var summary = [(RandomSource.RandomFloatValueCategory, Double)]()
for c in floatValueCategoriesToTest {
print("--------------------")
print("Using random Float values of category .\(c):")
let baselineTime =
benchmark("r = f", category: c) { $0 }
let altNextUpTime =
benchmark("r = f.altNextUp", category: c) { $0.altNextUp }
let stdNextUpTime =
benchmark("r = f.nextUp", category: c) { $0.nextUp }
let onlyStdTime = (stdNextUpTime - baselineTime)
let onlyAltTime = (altNextUpTime - baselineTime)
let speedup = (onlyStdTime / onlyAltTime)
print(" Result for category .\(c):")
print(String(format: " .altNextUp is %5.1f times faster than .nextUp",
speedup))
summary.append((c, speedup))
}
print("--------------------\nSummary:")
for (c, speedup) in summary {
print(String(format: ".altNextUp is %5.1f times faster than" +
" .nextUp for category \(c)", speedup))
}
Compiling and running on my MBP, Intel Core i7, macOS 10.13.3 (summary at the end):
› swiftc --version
Apple Swift version 4.1 (swiftlang-902.0.38 clang-902.0.30)
Target: x86_64-apple-darwin17.4.0
› swiftc -O -whole-module-optimization -static-stdlib AltNextUp.swift
› ./AltNextUp
Checking that f.altNextUp matches f.nextUp for all Float values f:
12% OK
25% OK
37% OK
50% OK
62% OK
75% OK
87% OK
100% OK
--------------------
Using random Float values of category .any:
r = f:
time: 0.029838053000276 seconds (checksum: 58711825)
time: 0.0299741519993404 seconds (checksum: 861671730)
time: 0.0297457659908105 seconds (checksum: 472713438)
time: 0.029883098002756 seconds (checksum: 1638835220)
time: 0.0300683309906162 seconds (checksum: 1286754611)
r = f.altNextUp:
time: 0.0558166010014247 seconds (checksum: 2306590007)
time: 0.0554675339953974 seconds (checksum: 230564091)
time: 0.0638001959887333 seconds (checksum: 3920135602)
time: 0.0569571730011376 seconds (checksum: 2110971917)
time: 0.0554473879892612 seconds (checksum: 2870614279)
r = f.nextUp:
time: 1.46318522699585 seconds (checksum: 3427652541)
time: 1.46466239400615 seconds (checksum: 2812842025)
time: 1.4613655909925 seconds (checksum: 3476030149)
time: 1.46923679100291 seconds (checksum: 3277928121)
time: 1.50366488700092 seconds (checksum: 2510324098)
Result for category .any:
.altNextUp is 55.7 times faster than .nextUp
--------------------
Using random Float values of category .finite:
r = f:
time: 0.0298321669979487 seconds (checksum: 1878953528)
time: 0.0296727499953704 seconds (checksum: 1010194629)
time: 0.0298948949930491 seconds (checksum: 2903609226)
time: 0.0298710869974457 seconds (checksum: 3785817386)
time: 0.0302293090062449 seconds (checksum: 2749474380)
r = f.altNextUp:
time: 0.0554891810024856 seconds (checksum: 2876972754)
time: 0.0605894990003435 seconds (checksum: 1773123711)
time: 0.0554058050038293 seconds (checksum: 2952190057)
time: 0.0637177680036984 seconds (checksum: 3590646689)
time: 0.055678935998003 seconds (checksum: 2588428598)
r = f.nextUp:
time: 1.45651687799545 seconds (checksum: 2799961604)
time: 1.50189002101251 seconds (checksum: 1149146560)
time: 1.45686126200599 seconds (checksum: 673653893)
time: 1.46648709299916 seconds (checksum: 1404830140)
time: 1.46890417800751 seconds (checksum: 1460930821)
Result for category .finite:
.altNextUp is 55.4 times faster than .nextUp
--------------------
Using random Float values of category .unitRange:
r = f:
time: 0.0295948119892273 seconds (checksum: 945924168)
time: 0.0300074789993232 seconds (checksum: 268231395)
time: 0.0297075080015929 seconds (checksum: 1067833450)
time: 0.0298581460083369 seconds (checksum: 257380703)
time: 0.0308274239941966 seconds (checksum: 177206134)
r = f.altNextUp:
time: 0.0554944409959717 seconds (checksum: 186425451)
time: 0.055514250008855 seconds (checksum: 1032097886)
time: 0.0554864440055098 seconds (checksum: 836568423)
time: 0.0640538520092377 seconds (checksum: 47486903)
time: 0.0557622570049716 seconds (checksum: 208578365)
r = f.nextUp:
time: 0.908307685996988 seconds (checksum: 75489059)
time: 0.906268719991203 seconds (checksum: 976704978)
time: 0.900013104008394 seconds (checksum: 820958178)
time: 0.919965298002353 seconds (checksum: 217238336)
time: 0.907456795001053 seconds (checksum: 947691156)
Result for category .unitRange:
.altNextUp is 33.6 times faster than .nextUp
--------------------
Using random Float values of category .symmetricUnitRange:
r = f:
time: 0.0298502350051422 seconds (checksum: 223901436)
time: 0.0299190469959285 seconds (checksum: 2315488200)
time: 0.0303898870042758 seconds (checksum: 3147309888)
time: 0.0307191220053937 seconds (checksum: 2287779874)
time: 0.030207210991648 seconds (checksum: 2330834048)
r = f.altNextUp:
time: 0.0552963619993534 seconds (checksum: 2164446496)
time: 0.0637335259962128 seconds (checksum: 232036324)
time: 0.0551901809958508 seconds (checksum: 196426914)
time: 0.0638436360022752 seconds (checksum: 841488406)
time: 0.055314992001513 seconds (checksum: 2341243640)
r = f.nextUp:
time: 1.48364534600114 seconds (checksum: 854273000)
time: 1.50064570399991 seconds (checksum: 2252985600)
time: 1.46703579300083 seconds (checksum: 3156200310)
time: 1.46095811099804 seconds (checksum: 155718182)
time: 1.463854540998 seconds (checksum: 2358879464)
Result for category .symmetricUnitRange:
.altNextUp is 56.5 times faster than .nextUp
--------------------
Using random Float values of category .nan:
r = f:
time: 0.0300197009928524 seconds (checksum: 5060929)
time: 0.0298186400032137 seconds (checksum: 8170168)
time: 0.02986538199184 seconds (checksum: 2473471)
time: 0.0300926060008351 seconds (checksum: 2154221388)
time: 0.0316904770006659 seconds (checksum: 2148593027)
r = f.altNextUp:
time: 0.0636590059875743 seconds (checksum: 2152642469)
time: 0.0550635370018426 seconds (checksum: 5603112)
time: 0.0640082820027601 seconds (checksum: 2151118778)
time: 0.0554347750003217 seconds (checksum: 2151997615)
time: 0.0554645879892632 seconds (checksum: 4792910)
r = f.nextUp:
time: 0.121845918998588 seconds (checksum: 2153716152)
time: 0.120195970011991 seconds (checksum: 2585770)
time: 0.126004698002362 seconds (checksum: 6207349)
time: 0.122530396998627 seconds (checksum: 454112)
time: 0.121163736999733 seconds (checksum: 2148300345)
Result for category .nan:
.altNextUp is 3.6 times faster than .nextUp
--------------------
Using random Float values of category .infinity:
r = f:
time: 0.0341460189956706 seconds (checksum: 0)
time: 0.0298047859978396 seconds (checksum: 2147483648)
time: 0.0300058279972291 seconds (checksum: 2147483648)
time: 0.0297330600005807 seconds (checksum: 2147483648)
time: 0.0304830780078191 seconds (checksum: 2147483648)
r = f.altNextUp:
time: 0.0565835360030178 seconds (checksum: 2164260863)
time: 0.0639977930113673 seconds (checksum: 2164260863)
time: 0.0553346960077761 seconds (checksum: 2164260863)
time: 0.0637900139990961 seconds (checksum: 2164260863)
time: 0.06371855600446 seconds (checksum: 2164260863)
r = f.nextUp:
time: 1.29966003900336 seconds (checksum: 0)
time: 1.28218217800895 seconds (checksum: 2164260863)
time: 1.29585642099846 seconds (checksum: 0)
time: 1.30232698800683 seconds (checksum: 0)
time: 1.29522193400771 seconds (checksum: 2164260863)
Result for category .infinity:
.altNextUp is 48.9 times faster than .nextUp
--------------------
Summary:
.altNextUp is 55.7 times faster than .nextUp for category any
.altNextUp is 55.4 times faster than .nextUp for category finite
.altNextUp is 33.6 times faster than .nextUp for category unitRange
.altNextUp is 56.5 times faster than .nextUp for category symmetricUnitRange
.altNextUp is 3.6 times faster than .nextUp for category nan
.altNextUp is 48.9 times faster than .nextUp for category infinity