Creating an array of random numbers in Swift is slow

I'm trying to create a Swift function that creates an array of random numbers similar to the numpy.random.rand function in the NumPy package.

This example creates a large NumPy array with random numbers. I am using uv to run the Python code.

# Run with  ->  uv run main.py

import numpy as np

def main():
    n = 100_000_000
    a = np.random.rand(n)
    print("first value is", a[0])
    print("last value is", a[n - 1])

if __name__ == "__main__":
    main()

Here is my attempt to do the same thing in Swift.

// Build with  ->  swiftc main.swift -Ounchecked
// Run with    ->  ./main

func randomArray(_ count: Int) -> [Double] {
    let result = Array<Double>(unsafeUninitializedCapacity: count) { buffer, initCount in
        for i in 0..<count {
            buffer[i] = Double.random(in: 0...1)
        }
        initCount = count
    }
    return result
}

func main() {
    let n = 100_000_000
    let a = randomArray(n)
    print("first value is \(a[0])")
    print("last value is \(a[n - 1])")
}

main()

The Swift code is much slower than the NumPy code. Is there a better way to create a random array in Swift? Does Accelerate provide any functions that could speed this up?

There are two pitches that would address this:

The later also has an implementation that you could use with some tweaks:

1 Like

Another issue is that I believe Swift's default RNG is more comparable to Python's SecureRandom, rather than the default numpy random, so you may want to compare that performance.

8 Likes

This will probably get re-pitched sometime in the future now that ~Copyable constraints (which this really wants to use) are becoming viable. (Specifically, I either want to relax the RandomNumberGenerator protocol to allow non-copyable conformers, or introduce a new protocol that does that, and I've been waiting for the compiler support for such things to get ironed out).

@Jon_Shier's point is really important, though. You can use a custom random source for this use case, and it will be much, much faster than using the SystemRNG.

7 Likes

After reading the replies from @Jon_Shier and @scanon, I ditched the system random number generator Double.random(in: 0...1) and replaced it with drand48() as shown below. This runs about 1.5x faster than the NumPy code. I guess I could write a custom random generator using the RandomNumberGenerator protocol but now I know to avoid the system generator.

import Accelerate

func randomArray(_ count: Int) -> [Double] {
    let result = Array<Double>(unsafeUninitializedCapacity: count) { buffer, initCount in
        for i in 0..<count {
            buffer[i] = drand48()
        }
        initCount = count
    }
    return result
}

func main() {
    let n = 100_000_000
    let a = randomArray(n)
    print("first value is \(a[0])")
    print("last value is \(a[n - 1])")
}

main()
3 Likes

I discovered this blog post from Daniel Lemire about a hash function called wyhash. The original wyhash function and associated wyrand function are available in this wyhash repo. The relevant code is in wyhash.h. Below are snippets from wyhash.h related to the wyrand function:

// 128bit multiply function
static inline uint64_t _wyrot(uint64_t x) { return (x>>32)|(x<<32); }

static inline void _wymum(uint64_t *A, uint64_t *B){
#if(WYHASH_32BIT_MUM)
  uint64_t hh=(*A>>32)*(*B>>32), hl=(*A>>32)*(uint32_t)*B, lh=(uint32_t)*A*(*B>>32), ll=(uint64_t)(uint32_t)*A*(uint32_t)*B;
  #if(WYHASH_CONDOM>1)
  *A^=_wyrot(hl)^hh; *B^=_wyrot(lh)^ll;
  #else
  *A=_wyrot(hl)^hh; *B=_wyrot(lh)^ll;
  #endif
#elif defined(__SIZEOF_INT128__)
  __uint128_t r=*A; r*=*B;
  #if(WYHASH_CONDOM>1)
  *A^=(uint64_t)r; *B^=(uint64_t)(r>>64);
  #else
  *A=(uint64_t)r; *B=(uint64_t)(r>>64);
  #endif
#elif defined(_MSC_VER) && defined(_M_X64)
  #if(WYHASH_CONDOM>1)
  uint64_t  a,  b;
  a=_umul128(*A,*B,&b);
  *A^=a;  *B^=b;
  #else
  *A=_umul128(*A,*B,B);
  #endif
#else
  uint64_t ha=*A>>32, hb=*B>>32, la=(uint32_t)*A, lb=(uint32_t)*B, hi, lo;
  uint64_t rh=ha*hb, rm0=ha*lb, rm1=hb*la, rl=la*lb, t=rl+(rm0<<32), c=t<rl;
  lo=t+(rm1<<32); c+=lo<t; hi=rh+(rm0>>32)+(rm1>>32)+c;
  #if(WYHASH_CONDOM>1)
  *A^=lo;  *B^=hi;
  #else
  *A=lo;  *B=hi;
  #endif
#endif
}

// Multiply and xor mix function, aka MUM
static inline uint64_t _wymix(uint64_t A, uint64_t B) {
  _wymum(&A, &B);
  return A^B;
}

//The wyrand PRNG that pass BigCrush and PractRand
static inline uint64_t wyrand(uint64_t *seed) {
  *seed+=0x2d358dccaa6c78a5ull;
  return _wymix(*seed, *seed^0x8bb84b93962eacc9ull);
}

The blog post links to a Swift implementation of wyrand that is available at the SwiftWyhash repo. The code in the SwiftWyhash repo lags behind recent changes in the wyhash repo by several years. So I attempted to update the Swift implementation based on the latest code in the wyhash repo. Below is my Swift attempt:

import Foundation

struct WyRand: RandomNumberGenerator {
    private var state : UInt64

    init(seed: UInt64 = mach_absolute_time()) {
        state = seed
    }

    mutating func next() -> UInt64 {
        state &+= 0x2d358dccaa6c78a5
        let mul = state.multipliedFullWidth(by: state ^ 0x8bb84b93962eacc9)
        return mul.low ^ mul.high
    }
}

Using this random number generator to fill an array with random values is shown below. This is significantly faster than using Swift's default Double.random.

let n = 100_000_000
var rng = WyRand()

let result = Array<Double>(unsafeUninitializedCapacity: n) { buffer, initCount in
    for i in 0..<n {
        buffer[i] = Double.random(in: 0..<1, using: &rng)
    }
    initCount = n
}

print(result[0])

I'm not great with C/C++ code so can someone take a look at the wyrand function in wyhash.h and let me know if the WyRand struct is a good Swift representation of the wyrand function?

2 Likes

I compared the Swift Wyrand version to the Rust wyrand version using a seed of 42 and got the same results. So I guess the Swift code is working fine.

1 Like

How does the speed of this compare to drand48? I occasionally need a fast, insecure, low-quality random number generator, so this could be useful to keep in mind. Thanks!

It is about 2.2x faster than drand48 and 42x faster than Double.random when filling a Double array of 100,000,000 elements. This was measured on an M4 Mac, compiling with -Ounchecked, and using hyperfine for the benchmark. I will write a short article with all the details soon.

5 Likes

I dug around the Accelerate framework and found the dlaruv function which is a LAPACK function that returns a vector of double precision numbers from a uniform distribution. The code below runs very fast and is 84x faster than the wyrand example I shared above. How is dlaruv so fast? Or am I doing something wrong here and the result is not comparable to the previous examples?

// Compile    -->  swiftc -Xcc -DACCELERATE_NEW_LAPACK -Ounchecked accel.swift
// Run        -->  ./accel
// Benchmark  -->  hyperfine --warmup 3 './accel'

import Accelerate

func main() {
    let n = 100_000_000

    let result = Array<Double>(unsafeUninitializedCapacity: n) { buffer, initCount in
        var iseed: [Int32] = (0..<3).map { _ in Int32.random(in: 1..<4095) }
        iseed += [2 * (Int32.random(in: 1..<4095) / 2) + 1 ]

        var nn = Int32(n)
        dlaruv_(&iseed, &nn, buffer.baseAddress)
        initCount = n
    }

    print(result[0])
}

main()
2 Likes

A more fair comparison would be to wrap that into a generator and generate a single value in a loop like you did before:

    for i in 0..<n {
        buffer[i] = Double.random(in: 0..<1, using: &rng)
    }
1 Like

Well, my goal is to create an array of random values, not a single random number. The previous attempts in this post accomplished that by creating a single random number then assigning that number to an array element. The LAPACK dlaruv function seems to do this much more efficiently for an entire array. So unless Swift has the ability to generate an array of random numbers (not just a single random number) then I think the LAPACK dlaruv function is the fastest option.

I understand... I merely suggested to first make it fair for both tests – my bet if you wrap it up in a generator and draw one value at a time from that in a loop you'd see the second test being as slow as the first test (†). As a second step, if you remove the generator from the first test and fill the array directly (with UInt64 valuers for simplicity) it would be as fast as the second test (†). Then what's left is to figure out how to fill the array of Doubles in a bulk manner, perhaps using some bulk version of generator API (if exists).

(† or the first test is still slower than the second no matter what you do, we just don't know it yet before doing a fair comparison.)

1 Like

dlaruv_ only generates up to 128 random numbers in one go. So this array has random numbers for the first 128 indices, then uninitialized memory afterwards.

2 Likes

Happened to have investigated into this. My two cents: fill the buffer with an RNG, treat as UInt64 and create a Double out of it. This also has the (un)desirable property that each number in [0,1) has a 1/2^64 chance to appear – 1.0f also appears, after rounding, with approximately 1/2 the chance of the number immediately below it.

A even better algo would give you even finer numbers near 0.0, but it either takes more than 64-bits of randomness, or takes a variable number of bits so have to manage the RNG state.

import Darwin // For arc4random_buf, an Secure Random Generator 
              // that fills a buffer, see arc4random(3)
func randomArray(_ count: Int) -> [Double] {
    let result = Array<Double>(unsafeUninitializedCapacity: count) { buffer, initCount in
        // assert(MemoryLayout<UInt64>.stride == MemoryLayout<Double>.stride)
        arc4random_buf(a.baseAddress!, a.count * MemoryLayout<Double>.stride)
        for i in a.indices {
            a[i] = Double(a[i].bitPattern) *
                   Double(sign:.plus, exponent: -64, significand: 1.0)
        }
        initCount = count
    }
    return result
}

You might be tempted to operate on the mantissa for the *2^-64 part, but that introduces a branch (0.0 is 0x0, and needs separate operation). The f-multiply code is much cleaner: load pair, convert to float, multiply, store pair, flush, repeat.

LBB19_6:
	ldp	q1, q0, [x26]
	ucvtf.2d	v1, v1
	ucvtf.2d	v0, v0
	fmul.2d	v0, v0, v2
	fmul.2d	v1, v1, v2
	stp	q1, q0, [x26], #32
	subs	x8, x8, #1
	b.ne	LBB19_6

It generates 100000000 doubles in 230ms. While not as fast as your dlaruv_ (some 6.3 ms for me), it's not bad and gives a Secure Random distribution.

I've also tried to do streaming conversion (fill a small buffer, convert, repeat). It gets slower as the buffer size decreases! 512 doubles (32KiB) is the minimum buffer size that isn't slower than just to fill the entire buffer and convert in a second pass. Your mileage may vary.

I should read the LAPACK documentation more carefully, but you are right, dlaruv is only for 128 elements. I checked the elements in the dlaruv example I posted above and confirmed that values after the 127th element are just zero. The LAPACK function that I should use is dlarnv. The revised LAPACK version is given below. This is faster than using the default Double.random, but slower than the WyRand and drand48 examples I posted above.

import Accelerate

let n = 100_000_000

let result = Array<Double>(unsafeUninitializedCapacity: n) { buffer, initCount in
    var idist: Int32 = 1
    var nn = Int32(n)

    var iseed: [Int32] = (0..<3).map { _ in Int32.random(in: 1..<4095) }
    iseed += [2 * (Int32.random(in: 1..<4095) / 2) + 1 ]

    dlarnv_(&idist, &iseed, &nn, buffer.baseAddress)

    initCount = n
}

Appendix: streaming convert

// Compile    -->  swiftc -parse-as-library -Ounchecked arc4streaming128k8e.swift -o accel
// Run        -->  ./accel
// Benchmark  -->  hyperfine --shell=none --warmup 3 './accel'
// Have fun experimenting with the typealiases and ChunkSizes, and also
// devising your own UCVTF_pack()

@main struct randomTest{
typealias T = Streaming

static func main() {
    let n = 100_000_000
    let a = randomArray(n)
    print("first value is \(a[0])")
    print("last value is \(a[n - 1])")
}

static func randomArray(_ count: Int) -> [Double] {
    let result = Array<Double>(unsafeUninitializedCapacity: count) { buffer, initCount in
        T.randomFillDouble(buffer)
        initCount = count
    }
    return result
}
}

protocol RandomFillDouble {
    static func randomFillDouble(_ a: UnsafeMutableBufferPointer<Double>)
}

import Darwin
import simd

struct Streaming : RandomFillDouble {
static var fillChunkSize = 2048 // 128KB
typealias simd = SIMD8          // 8 elements
typealias u = UInt64
typealias i = Int64
typealias f = Double
typealias SIMD_u = simd<u>
typealias SIMD_i = simd<i>
typealias SIMD_f = simd<f>
static var procChunkSize = MemoryLayout<SIMD_f>.size / MemoryLayout<f>.size

// In-place convert Uint64 in [0,MAX] into Double in [0,1)
// a most convoluted and slow implementation
@inlinable static func UCVTF_pack(_ pack : inout SIMD_u) {
    pack = SIMD_u(clamping:
               unsafeBitCast(
                   unsafeBitCast(SIMD_f(pack), to: SIMD_u.self) // convert
                   &- // divide by 2^64
                   Double(sign: FloatingPointSign.plus,
                          exponentBitPattern: UInt(64),
                          significandBitPattern: UInt64(0))
                   .bitPattern
                   , to: SIMD_i.self) // unsafeBitCast
        ) // SIMD_u(clamping:) deals with the special case 0 (negative to 0)
}

@inlinable static func randomFillDouble(_ a: UnsafeMutableBufferPointer<Double>) {

    // Process the arrays in chunks
    let fillRemainderStartIndex = a.count - (a.count % fillChunkSize)

    // Chunk filling
    for i in stride(from: 0, to: fillRemainderStartIndex, by: fillChunkSize) {
        let chunk = UnsafeMutableBufferPointer(rebasing: a[i ..< i+fillChunkSize])
        arc4random_buf(chunk.baseAddress, chunk.count * MemoryLayout<Double>.stride)
        chunk.withMemoryRebound(to: SIMD_u.self) { pack in
            for i in pack.indices{
                UCVTF_pack(&pack[i])
            }
        }
    }

    // Filling the rest
    let buf = UnsafeMutableBufferPointer(rebasing: a[fillRemainderStartIndex ..< a.count])
    arc4random_buf(buf.baseAddress, buf.count * MemoryLayout<UInt64>.stride)
    buf.withMemoryRebound(to: SIMD_u.self) { pack in
        for i in pack.indices {
            UCVTF_pack(&pack[i])
        }
    }

    // The rest of the rest isn't affected by choices of UCVTF_pack. Insignificant.
    let procRemainderStartIndex = buf.count - (buf.count % procChunkSize)
    for i in procRemainderStartIndex ..< buf.count {
        buf[i] = switch(buf[i]) {
                 case Double(0): 
                     buf[i] // Generates fcmeq.2d add.2d bif.16b
                 default: 
                     Double(bitPattern: 
                            Double(buf[i]).bitPattern
                            &-
                            Double(sign: .plus,
                            exponentBitPattern: UInt(64),
                            significandBitPattern: UInt64(0)).bitPattern
                            )
        }
    }
    }

}

Thanks for sharing this. What are the specs of the Mac you are running this code on? What are your hyperfine results?

M3pro. hyperfine reports ~236 ms for two-pass, ~421 ms for 4KiB, ~310 ms for 8 KiB, ~253 ms for 16KiB, ~225 ms for 32 KiB, ~220 ms for 64 KiB, ~221 ms for 128 KiB. The SIMD size doesn't seem to matter (2 vs. 4 vs. 8)

EDIT: the time was measured when UCVTF_pack() was like Double(in) / 2^64. Not the bit-manipulation in my post above, which repeatedly calls an external auto-generated function SIMD<UInt64>.init(clamping:SIMD<Int64>)

EDIT2: wyrand() manages to achieve 72ms for the same array size, 3x my speed. Since it generates random numbers on the stack, it's explained by having one single store vs. store-load-store.
Yet if I use Double.random(in: 0..<1, using: &rng), instead of dividing by 2^64, it's 87 ms! Swift's Double.random(in:using:) is slow after all.

Given what I know about random number generation, I would suspect that Swift is generating slightly better numbers in return for being slower, but neither one is actually generating a uniformly random number. The current state of the art, AIUI, is described in this paper.

I suspect that the largest source of slowdown in the standard library function is that it has to generate each number one 64-bit word at a time, which isn't really fixable unless we add a fill(_ bytes: {Output/Mutable}RawSpan) requirement to RandomNumberGenerator.

For reference: the source code to BinaryFloatingPoint.random(in:using:)
extension BinaryFloatingPoint where Self.RawSignificand: FixedWidthInteger {

  /// Returns a random value within the specified range, using the given
  /// generator as a source for randomness.
  ///
  /// Use this method to generate a floating-point value within a specific
  /// range when you are using a custom random number generator. This example
  /// creates three new values in the range `10.0 ..< 20.0`.
  ///
  ///     for _ in 1...3 {
  ///         print(Double.random(in: 10.0 ..< 20.0, using: &myGenerator))
  ///     }
  ///     // Prints "18.1900709259179"
  ///     // Prints "14.2286325689993"
  ///     // Prints "13.1485686260762"
  ///
  /// The `random(in:using:)` static method chooses a random value from a
  /// continuous uniform distribution in `range`, and then converts that value
  /// to the nearest representable value in this type. Depending on the size
  /// and span of `range`, some concrete values may be represented more
  /// frequently than others.
  ///
  /// - Note: The algorithm used to create random values may change in a future
  ///   version of Swift. If you're passing a generator that results in the
  ///   same sequence of floating-point values each time you run your program,
  ///   that sequence may change when your program is compiled using a
  ///   different version of Swift.
  ///
  /// - Parameters:
  ///   - range: The range in which to create a random value.
  ///     `range` must be finite and non-empty.
  ///   - generator: The random number generator to use when creating the
  ///     new random value.
  /// - Returns: A random value within the bounds of `range`.
  @inlinable
  public static func random<T: RandomNumberGenerator>(
    in range: Range<Self>,
    using generator: inout T
  ) -> Self {
    _precondition(
      !range.isEmpty,
      "Can't get random value with an empty range"
    )
    let delta = range.upperBound - range.lowerBound
    //  TODO: this still isn't quite right, because the computation of delta
    //  can overflow (e.g. if .upperBound = .maximumFiniteMagnitude and
    //  .lowerBound = -.upperBound); this should be re-written with an
    //  algorithm that handles that case correctly, but this precondition
    //  is an acceptable short-term fix.
    _precondition(
      delta.isFinite,
      "There is no uniform distribution on an infinite range"
    )
    let rand: Self.RawSignificand
    if Self.RawSignificand.bitWidth == Self.significandBitCount + 1 {
      rand = generator.next()
    } else {
      let significandCount = Self.significandBitCount + 1
      let maxSignificand: Self.RawSignificand = 1 << significandCount
      // Rather than use .next(upperBound:), which has to work with arbitrary
      // upper bounds, and therefore does extra work to avoid bias, we can take
      // a shortcut because we know that maxSignificand is a power of two.
      rand = generator.next() & (maxSignificand - 1)
    }
    let unitRandom = Self.init(rand) * (Self.ulpOfOne / 2)
    let randFloat = delta * unitRandom + range.lowerBound
    if randFloat == range.upperBound {
      return Self.random(in: range, using: &generator)
    }
    return randFloat
  }

PS. I think the best case implementation would generate N doubles worth of entropy in bulk then additional entropy one double at a time to reject or correct any out of bound samples.

PPS. On the other hand that does mean looping over the array twice, once to fill it and then again to debias it, so perhaps it is better to do one at a time.