Why does adding conformance to RandomAccessCollection cause a performance regression?

I tried some things with the code (since I've ran into similar issues that I never managed to isolate), but the only thing I could find out was that the regression disappears if you annotate the get and set of BitVectors subscript with @_transparent. I don't know why this is, but I guess it does demonstrate a bug (or at least an opportunity for improvement) in the optimizer. But it would be nice if someone more skilled could find a smaller reproducing example, cc @Slava_Pestov, @Joe_Groff, @Andrew_Trick.

I'm attaching the code (it's the files of your MWE-branch, joined into one) in the state where I left it after finding this workaround (and after doing some other things that I tested but which made no difference, eg removing all public, sticking the test code in a func, fixing the warnings that then appeared about changing some vars to lets etc.).

import Dispatch
import QuartzCore

struct BitVector {
    private var bits: Array<Int>
    
    private let blockSize = Int.bitWidth
    
    init(repeating: Bool, count: Int) {
        let fillInt = repeating ? ~0 : 0
        let nBlocks: Int = (count / blockSize) + 1
        bits = [Int](repeating: fillInt, count: nBlocks)
    }
    
    private func getBlockAndOffset(of bitIndex: Int) -> (Int, Int) {
        return (bitIndex / blockSize, bitIndex % blockSize)
    }
    
    var startIndex: Int { return 0 }
    var endIndex: Int { return bits.count * blockSize }
    
    // NOTE: Adding @_transparent to the get and set of this subscript
    // removes the slowdown that is otherwise magically introduced by
    // conforming BitVector to RandomAccessCollection (see further down).
    // So if you comment out the two @_transparent and keep the conformance
    // to RandomAccessCollection, the slowdown will be there again.
    subscript(_ bitIndex: Int) -> Bool {
        @_transparent
        get {
            let (block, offset) = getBlockAndOffset(of: bitIndex)
            let mask = 1 << offset
            return mask & bits[block] != 0
        }
        @_transparent
        set {
            let (block, offset) = getBlockAndOffset(of: bitIndex)
            let mask = 1 << offset
            if newValue {
                bits[block] |= mask
            } else {
                bits[block] &= ~mask
            }
        }
    }
}

struct Edge<T: BinaryInteger> {
    let src: T
    let dst: T
    
    init(_ s: T, _ d: T) {
        src = s
        dst = d
    }
    
    var ordered: Edge {
        if src > dst {
            return Edge(dst, src)
        } else { return self }
    }
    
    var reverse: Edge {
        return Edge(dst, src)
    }
}

struct Graph<T: BinaryInteger> {
    let rowidx: Array<T>
    let colptr: Array<Array<T>.Index>
    
    static var eltype: Any.Type { return T.self }
    var nv: T { return T((colptr.count - 1)) }
    var ne: Int { return rowidx.count / 2 }
    
    var vertices: StrideTo<T> {
        return stride(from: T(0), to: nv, by: +1)
    }
    
    init(fromBinaryFile fileName: String) {
        let file = URL(fileURLWithPath: fileName)
        let fileHandle = try! FileHandle(forReadingFrom: file)
        let magic = fileHandle.readData(ofLength: 4)
        guard magic.elementsEqual("GRPH".utf8) else {
            fatalError("\(file) was not a graph file")
        }
        let colSize = fileHandle.readData(ofLength: MemoryLayout<UInt32>.size).withUnsafeBytes { (ptr: UnsafePointer<UInt32>) -> Int in
            return Int(ptr.pointee.bigEndian)
        }
        colptr = fileHandle.readData(ofLength: MemoryLayout<UInt32>.size * colSize).withUnsafeBytes({ (ptr: UnsafePointer<UInt32>) -> [Int] in
            let bufferPointer = UnsafeBufferPointer(start: ptr, count: colSize)
            return [Int](bufferPointer.lazy.map { Int($0.bigEndian) })
        })
        let rowSize = fileHandle.readData(ofLength: MemoryLayout<UInt32>.size).withUnsafeBytes { (ptr: UnsafePointer<UInt32>) -> Int in
            return Int(ptr.pointee.bigEndian)
        }
        rowidx = fileHandle.readData(ofLength: MemoryLayout<UInt32>.size * rowSize).withUnsafeBytes({ (ptr: UnsafePointer<UInt32>) -> [T] in
            let bufferPointer = UnsafeBufferPointer(start: ptr, count: rowSize)
            return [T](bufferPointer.lazy.map { T($0.bigEndian) })
        })
    }
    
    private func vecRange(_ s: Array<T>.Index) -> Range<Array<T>.Index> {
        let rStart = colptr[s]
        let rEnd = colptr[s + 1]
        return rStart ..< rEnd
    }
    
    func neighbors(of vertex: T) -> ArraySlice<T> {
        let range = vecRange(Array<T>.Index(vertex))
        return rowidx[range]
    }
    
    func BFS(from sourceVertex: Int) -> [T] {
        let numVertices = Int(nv)
        let maxT = ~T()
        var visited = BitVector(repeating: false, count: numVertices)
        var vertLevel = Array<T>(repeating: maxT, count: numVertices)
        var nLevel: T = 1
        var curLevel = [T]()
        curLevel.reserveCapacity(numVertices)
        var nextLevel = [T]()
        nextLevel.reserveCapacity(numVertices)
        
        visited[sourceVertex] = true
        vertLevel[sourceVertex] = 0
        curLevel.append(T(sourceVertex))
        
        while !curLevel.isEmpty {
            for vertex in curLevel {
                for neighbor in neighbors(of: vertex) {
                    if !visited[Int(neighbor)] {
                        visited[Int(neighbor)] = true
                        nextLevel.append(neighbor)
                        vertLevel[Int(neighbor)] = nLevel
                    }
                }
            }
            nLevel += 1
            curLevel.removeAll(keepingCapacity: true)
            (curLevel, nextLevel) = (nextLevel, curLevel)
            curLevel.sort()
        }
        return vertLevel
    }
}

extension Graph: CustomStringConvertible {
    var description: String {
        return "{\(nv), \(ne)} Graph"
    }
}

// NOTE: This conformance is not necessary for the test to succeed, but
// without the workaround of adding @_transparent to the get and set of
// BitVector's subscript (see above), this conformance will somehow cause
// a performance regression in the below test.
extension BitVector : RandomAccessCollection {}

func test() {
    let start = DispatchTime.now()
    let h = Graph<UInt32>(fromBinaryFile: "/Users/jens/Downloads/indptrvecs-4m-30m.0based.bin")
    let end = DispatchTime.now()
    print("graph read took \(Double(end.uptimeNanoseconds - start.uptimeNanoseconds) / 1_000_000.0) ms")
    print(h)
    var times = [Double]()
    for i in 1 ... 40 {
        let start = CACurrentMediaTime()
        let bfs1 = h.BFS(from: 0)
        let end = CACurrentMediaTime()
        let timediff = (end - start) * 1000
        print("Run \(i): BFS from vertex 0 to \(timediff) ms; sum = \(bfs1.reduce(0, +))")
        times.append(timediff)
    }
    
    let timeAvg = times.reduce(0, +) / Double(times.count)
    let sumOfSquaredAvgDiff = times.map { pow($0 - timeAvg, 2.0) }.reduce(0, +)
    let timeStd = sqrt(sumOfSquaredAvgDiff / Double(times.count - 1))
    print("Times: min: \(times.min()!), max: \(times.max()!), avg: \(timeAvg), std: \(timeStd)")
}
test()

For anyone wanting to look into this: In order to run the above demo program, you also need the graph data file linked to here.