What is the cause and expected future of optimization glitches like this?

This will be easiest to explain by writing the rest of this post in the comments of the following self contained demonstration program.

(A tiny and seemingly irrelevant change to the program causes the compiler to either perform or miss an optimization.)

// This program is a reduced version of the situation where I most
// recently ran into this issue.

import Foundation


// I was about to see if I could come up with an improved implementation of
// the `next(upperBound:)` method of the Random API in the standard library.
//
// So, the aim was to improve the efficiency of `myNext(upperBound:)` here:

public extension RandomNumberGenerator {
    mutating func myNext<T>(upperBound: T) -> T
        where T : FixedWidthInteger, T : UnsignedInteger
    {
        // Start out by just copying the stdlib implementation:
        var random: T = next()
        var m = random.multipliedFullWidth(by: upperBound)
        if m.low < upperBound {
            let t = (0 &- upperBound) % upperBound
            while m.low < t {
                random = next()
                m = random.multipliedFullWidth(by: upperBound)
            }
        }
        return m.high
    }
}

// I'd have to benchmark it for both slow and fast generators (because an
// improved version should be faster for both). The slow generator will be
// system random number generator and the fast generator will be this:

struct WyRand : RandomNumberGenerator {
    // Translated from:
    // https://github.com/lemire/testingRNG/blob/master/source/wyrand.h

    private var state: UInt64

    init(state: UInt64 = UInt64.random(in: .min ... .max)) {
        self.state = state // All 64 bit values are valid states.
    }

    mutating func next() -> UInt64 {
        state &+= 0xa0761d6478bd642f
        let mul = state.multipliedFullWidth(by: state ^ 0xe7037ed1a0b428db)
        return mul.high ^ mul.low
    }
}


// And finally, the benchmarking code:

func testBoundedRandomNumber(_ upperBound: UInt64) {
    let iterationCount = 1024*1024*32
    print("upperBound:", upperBound)
    //var generator = SystemRandomNumberGenerator()
    var generator = WyRand(state: 12345678901234567)
    //var generator = WyRand()
    var cs = UInt64(0)
    var hist = [UInt64 : UInt64]()
    let t0 = DispatchTime.now().uptimeNanoseconds
    for _ in 0 ..< iterationCount {
        let x = generator.next(upperBound: upperBound)
        // let x = generator.myNext(upperBound: upperBound)
        cs &+= x
        //hist[x, default: 0] &+= 1
    }
    let t1 = DispatchTime.now().uptimeNanoseconds
    // for k in hist.keys.sorted() { let v = hist[k]!; print("  ", k, v) }
    print("  time:", Double(t1-t0)/1e9, "seconds | checksum:", cs)
}

testBoundedRandomNumber((UInt64(1) &<< 63) &- 1)
testBoundedRandomNumber((UInt64(1) &<< 63) &+ 0)
testBoundedRandomNumber((UInt64(1) &<< 63) &+ 1)
testBoundedRandomNumber((UInt64(1) &<< 63) &+ (UInt64(1) &<< 62))
testBoundedRandomNumber(UInt64.max)

// Now, if I run this program as is
// (release build, defailt toolchain of Xcode 11.4,
//  note that the warning on `hist` should be there, ie the exact formulation
//  including what's commented out and not is important)
// I'll see this:
//
//   upperBound: 9223372036854775807
//     time: 0.3350296 seconds | checksum: 2825020404886721512
//   upperBound: 9223372036854775808
//     time: 0.336002166 seconds | checksum: 2825020404911887093
//   upperBound: 9223372036854775809
//     time: 0.508603183 seconds | checksum: 6443194833870232561
//   upperBound: 13835058055282163712
//     time: 0.451878394 seconds | checksum: 3387194916848432580
//   upperBound: 18446744073709551615
//     time: 0.352567688 seconds | checksum: 5650040809806994650
//
// And I can verify that the speed of `next` and `myNext` will be the same by
// commenting out either of them.
//
// BUT (here's the tiny seemingly irrelevant modification), note what happens
// when I comment out the line that would print the histogram (had it not been
// empty / unused), ie after I do this: --.
//   .------------------------------------'
//   ↓
//   // for k in hist.keys.sorted() { let v = hist[k]!; print("  ", k, v) }
//
// The optimizer is now able to do a significantly better job:
//
//   upperBound: 9223372036854775807
//     time: 0.041308889 seconds | checksum: 2825020404886721512
//   upperBound: 9223372036854775808
//     time: 0.036839298 seconds | checksum: 2825020404911887093
//   upperBound: 9223372036854775809
//     time: 0.363419656 seconds | checksum: 6443194833870232561
//   upperBound: 13835058055282163712
//     time: 0.164354209 seconds | checksum: 3387194916848432580
//   upperBound: 18446744073709551615
//     time: 0.039010151 seconds | checksum: 5650040809806994650
//
// Note that for some of the ranges, it's 10x faster now, and this will be
// true for both `next` and `myNext`.
//
// I also noted than an alternative way to enable the optimization is to
// annotate `testBoundedRandomNumber` with `@inline(__always)`, in which case
// the histogram-printing line does not have to be commented out.

Questions:

  • What causes optimization glitches like this?

  • Should they be considered bugs?

  • Will they be less common in future versions of the compiler?

2 Likes