While implementing a 2D hierarchical hash grid / spatial hashing data structure in Swift, I was trying to optimize the following function (which is defined only for i in 0 ... 6):
// Start index of grid `i` in the flattened global cell array (i in 0 ... 6).
// How it works: 0b1...1 (17 ones) shifted right by 2*i and inverted keeps only
// the top 2*i bits; ANDing with 0b10101010101000000 — where bit (16 - 2k) is
// exactly the cell count of grid k (256*256 = 2^16, 128*128 = 2^14, ...) —
// therefore sums the cell counts of grids 0 ..< i.
func globalCellIndex(forGridIndex i: UInt8) -> UInt32 {
let mask = ~(UInt32(0b11111111111111111) &>> (i &* 2))
return UInt32(0b10101010101000000) & mask
}
I ran into the following two questions.
Some further background details follow.
The hierarchical hash grid is optimized for speed and a specific use case, so it's hardcoded to work only with bounding circles that have their centers in 0 ..< 256 and radii in 0.25 ..< 32.
gridIndex   cellSize   resolution   radius range   global cellIndex
0 1 256 x 256 0.25 ..< 0.5 0
1 2 128 x 128 0.5 ..< 1.0 65536
2 4 64 x 64 1.0 ..< 2.0 81920
3 8 32 x 32 2.0 ..< 4.0 86016
4 16 16 x 16 4.0 ..< 8.0 87040
5 32 8 x 8 8.0 ..< 16.0 87296
6 64 4 x 4 16.0 ..< 32.0 87360
The globalCellIndex
is the start index for a given grid, and is calculated as follows:
globalCellIndex(forGridIndex: 0) == 0
globalCellIndex(forGridIndex: 1) == 65536 == 256*256
globalCellIndex(forGridIndex: 2) == 81920 == 256*256 + 128*128
...
globalCellIndex(forGridIndex: 6) == 87360 == 256*256 + 128*128 + 64*64 + 32*32 + 16*16 + 8*8 + 4*4
This function can be written like this:
// Bit-twiddling form: mask off all but the top 2*i bits of the 17-bit pattern,
// so the AND keeps one set bit per grid below i (bit 16-2k == grid k's cell count).
func globalCellIndex(forGridIndex i: UInt8) -> UInt32 {
let mask = ~(UInt32(0b11111111111111111) &>> (i &* 2))
return UInt32(0b10101010101000000) & mask
}
The following makes it clear how it works:
0 -> 0 == 0b00000000000000000
1 -> 65536 == 0b10000000000000000
2 -> 81920 == 0b10100000000000000
3 -> 86016 == 0b10101000000000000
4 -> 87040 == 0b10101010000000000
5 -> 87296 == 0b10101010100000000
6 -> 87360 == 0b10101010101000000
Question 1
I noticed that the following alternative formulation of the function is a bit faster:
// Lookup-table form: a homogeneous 7-tuple of UInt32 is laid out as 7
// contiguous UInt32 values, so the answer is a single indexed load from
// the tuple's raw bytes.
// NOTE(review): no bounds check — i must be in 0 ... 6, otherwise the load
// reads past the 28-byte tuple.
func globalCellIndexB(forGridIndex i: UInt8) -> UInt32 {
let results = (UInt32(0), UInt32(65536), UInt32(81920), UInt32(86016),
UInt32(87040), UInt32(87296), UInt32(87360))
return withUnsafeBytes(of: results) {
$0.bindMemory(to: UInt32.self)[Int(i)]
}
}
according to
this benchmark program.
import Foundation
// Variant A under test: bit-twiddling (see explanation above the benchmark).
func globalCellIndex(forGridIndex i: UInt8) -> UInt32 {
let mask = ~(UInt32(0b11111111111111111) &>> (i &* 2))
return UInt32(0b10101010101000000) & mask
}
// Variant B under test: indexed load from the raw bytes of a constant tuple.
// Precondition (unchecked): i in 0 ... 6.
func globalCellIndexB(forGridIndex i: UInt8) -> UInt32 {
let results = (UInt32(0), UInt32(65536), UInt32(81920), UInt32(86016),
UInt32(87040), UInt32(87296), UInt32(87360))
return withUnsafeBytes(of: results) {
$0.bindMemory(to: UInt32.self)[Int(i)]
}
}
/// Benchmark driver: times `globalCellIndex(forGridIndex:)` — or the
/// table-based `globalCellIndexB`, when the commented line is swapped in —
/// over 8 MB of random grid indices, repeated 5 times.
/// The running checksum is printed so the optimizer cannot discard the loop;
/// it deliberately carries over between runs, so each run prints a
/// different (but deterministic for a given input) value.
func test() {
    // Fixed the mojibake trailing character ("β¦" was a misdecoded "…").
    print("Generating 8MB of random gridIndices (bytes in range 0 ... 6) …")
    let count = 1024 * 1024 * 8
    let gridIndices: [UInt8] = (0 ..< count).map { _ in .random(in: 0 ... 6) }
    print("Done! Performing test, 5 times:")
    var checksum: UInt32 = 0
    for _ in 0 ..< 5 {
        let t0 = DispatchTime.now().uptimeNanoseconds
        for gi in gridIndices {
            checksum &+= globalCellIndex(forGridIndex: gi)
            //checksum &+= globalCellIndexB(forGridIndex: gi)
        }
        let t1 = DispatchTime.now().uptimeNanoseconds
        print(Double(t1 - t0) / 1e9, "seconds (\(checksum))")
    }
}
test()
Can anyone explain why this function is faster than the former bit manipulating one? I did look at them in godbolt but I lack the skills needed to make sense of their disassembly.
Question 2
Is there any difference I should know about between the following three ways of writing the read-data-from-pointer-code? They are equivalent (all three result in identical code being generated) according to godbolt:
/// Same tuple lookup as `globalCellIndexB`; the three labeled alternatives
/// are interchangeable ways to read the i-th UInt32 out of the tuple's raw
/// bytes. Exactly one may be active at a time (a multi-statement closure has
/// no implicit return), so alternatives 2 and 3 are commented out here —
/// uncomment one and comment out the others to compare.
/// Precondition (unchecked): i in 0 ... 6.
func globalCellIndexB(forGridIndex i: UInt8) -> UInt32 {
    let results = (UInt32(0), UInt32(65536), UInt32(81920), UInt32(86016),
                   UInt32(87040), UInt32(87296), UInt32(87360))
    return withUnsafeBytes(of: results) {
        // Alternative 1:
        $0.bindMemory(to: UInt32.self)[Int(i)]
        // Alternative 2:
        //$0.baseAddress!.assumingMemoryBound(to: UInt32.self)[Int(i)]
        // Alternative 3:
        //$0.load(fromByteOffset: Int(i) &* 4, as: UInt32.self)
    }
}