What is the best way to work with the file system?

What is the quickest way to read a large file?

That's `mmap` (1x or base time)
/// Streams every byte of the file at `url` to `callback` through a read-only,
/// private memory mapping.
///
/// An empty file produces no callbacks. Failing to open, stat or map the file
/// stops the process with a message carrying the `errno` description.
///
/// - Parameters:
///   - url: File URL of the file to read.
///   - callback: Invoked once per byte, in file order.
func readWithMMap(_ url: URL, callback: (UInt8) -> Void) async {
    let file = open(url.path, O_RDONLY)
    guard file >= 0 else {
        let reason = String(cString: strerror(errno))
        fatalError("open(\(url.path)) failed: \(reason)")
    }
    defer { close(file) }

    var s = stat()
    guard fstat(file, &s) == 0 else {
        let reason = String(cString: strerror(errno))
        fatalError("fstat(\(url.path)) failed: \(reason)")
    }
    let size = Int(s.st_size)
    // A zero-length mapping is rejected by mmap, so treat an empty file as "no bytes".
    guard size > 0 else { return }

    // mmap signals failure with (void *)-1 (MAP_FAILED), not with a null pointer,
    // so a nil check alone would let a failed mapping through.
    guard let mapping = mmap(nil, size, PROT_READ, MAP_PRIVATE, file, 0),
          mapping != UnsafeMutableRawPointer(bitPattern: -1) else {
        let reason = String(cString: strerror(errno))
        fatalError("mmap(\(url.path)) failed: \(reason)")
    }
    defer { munmap(mapping, size) }

    let bytes = mapping.assumingMemoryBound(to: UInt8.self)
    for i in 0 ..< size {
        callback(bytes[i])
    }
}
Followed by `read`ing / `fread`ing the file in large chunks (1.2x base time)
/// Streams every byte of the file at `url` to `callback`, fetching the data
/// with unbuffered `read(2)` calls into a 10 KiB block.
///
/// Reading continues until `read` reports end-of-file (a return value of 0),
/// so a short read does not end the loop early. Failing to open or read the
/// file stops the process with a message carrying the `errno` description.
///
/// - Parameters:
///   - url: File URL of the file to read.
///   - callback: Invoked once per byte, in file order.
func readFileByBlocks(_ url: URL, callback: (UInt8) -> Void) async {
    let blockSize = 10*1024
    let file = open(url.path, O_RDONLY)
    guard file >= 0 else {
        let reason = String(cString: strerror(errno))
        fatalError("open(\(url.path)) failed: \(reason)")
    }
    defer { close(file) }

    let mem = UnsafeMutablePointer<UInt8>.allocate(capacity: blockSize)
    defer { mem.deallocate() }

    while true {
        let count = read(file, mem, blockSize)
        if count < 0 {
            // A negative count must never reach `0 ..< count`, which would trap.
            if errno == EINTR { continue }
            let reason = String(cString: strerror(errno))
            fatalError("read(\(url.path)) failed: \(reason)")
        }
        if count == 0 { break } // end of file
        for i in 0 ..< count { callback(mem[i]) }
    }
}

/// Streams every byte of the file at `url` to `callback`, fetching the data
/// with buffered `fread` calls of `blockSize` bytes each.
///
/// - Parameters:
///   - url: File URL of the file to read.
///   - blockSize: Number of bytes requested per `fread` call; must be positive.
///   - callback: Invoked once per byte, in file order.
func freadFileByBlocks(_ url: URL, blockSize: Int = 10*1024, callback: (UInt8) -> Void) async {
    // With a zero block size `fread` always returns 0 and the "short read"
    // exit condition below could never become true.
    precondition(blockSize > 0, "blockSize must be positive")

    guard let file = fopen(url.path, "rb") else {
        let reason = String(cString: strerror(errno))
        fatalError("fopen(\(url.path)) failed: \(reason)")
    }
    defer { fclose(file) }

    let mem = UnsafeMutablePointer<UInt8>.allocate(capacity: blockSize)
    defer { mem.deallocate() }

    var count = 0
    repeat {
        count = fread(mem, 1, blockSize, file)
        for i in 0 ..< count { callback(mem[i]) }
        // A block shorter than requested means end-of-file (or a stream error).
    } while count == blockSize
}
Followed by Data(contentsOf:) (1.4x base time)
/// Loads the whole file at `url` into a `Data` value and then hands each byte
/// to `callback`, in order. Crashes (via `try!`) if the file cannot be loaded.
///
/// - Parameters:
///   - url: File URL of the file to read.
///   - callback: Invoked once per byte, in file order.
func readWithData(_ url: URL, callback: (UInt8) -> Void) async {
    let contents = try! Data(contentsOf: url)
    contents.withUnsafeBytes { (buffer: UnsafeRawBufferPointer) in
        // The raw buffer is itself a collection of UInt8, so no index math is needed.
        for byte in buffer {
            callback(byte)
        }
    }
}
Followed by URL.resourceBytes, well done! (2.8x base time)
/// Iterates the asynchronous byte sequence `url.resourceBytes` and hands each
/// byte to `callback`, in order.
///
/// Any error thrown while iterating stops the process; the error is included
/// in the crash message so the failure can be diagnosed.
///
/// - Parameters:
///   - url: URL of the resource to read.
///   - callback: Invoked once per byte, in sequence order.
func readWithResourceBytes(_ url: URL, callback: @escaping (UInt8) -> Void) async {
    do {
        for try await byte in url.resourceBytes {
            callback(byte)
        }
    } catch {
        fatalError("Reading \(url) via URL.resourceBytes failed: \(error)")
    }
}
Followed by URLSession.data (7.5x base time)
/// Fetches the whole resource at `url` with the async `URLSession.shared.data(from:)`
/// call and then hands each byte of the returned data to `callback`, in order.
/// Crashes (via `try!`) if the load throws.
///
/// - Parameters:
///   - url: URL of the resource to load.
///   - callback: Invoked once per byte, in order.
func readWithURLSessionAsync(_ url: URL, callback: @escaping (UInt8) -> Void) async {
    // The URLResponse half of the tuple is not needed here.
    let (payload, _) = try! await URLSession.shared.data(from: url)
    payload.withUnsafeBytes { raw in
        raw.forEach(callback)
    }
}
Followed by URLSession.dataTask, no idea why it is slower (30x base time)
/// Fetches the whole resource at `url` with a completion-handler based
/// `URLSession.shared.dataTask` and hands each byte of the result to
/// `callback`, in order, from inside the completion handler.
///
/// The function returns once the completion handler has delivered every byte.
/// If the task completes without data the process stops with a message that
/// includes the reported error.
///
/// - Parameters:
///   - url: URL of the resource to load.
///   - callback: Invoked once per byte, in order.
func readWithURLSession(_ url: URL, callback: @escaping (UInt8) -> Void) async {
    await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
        let task = URLSession.shared.dataTask(with: URLRequest(url: url)) { data, _, error in
            guard let data = data else {
                let reason = error.map { String(describing: $0) } ?? "no data and no error reported"
                fatalError("Loading \(url) with URLSession.dataTask failed: \(reason)")
            }
            data.withUnsafeBytes { bytes in
                for byte in bytes {
                    callback(byte)
                }
            }
            // Resumed exactly once, after the last byte has been delivered.
            continuation.resume()
        }
        task.resume()
    }
}
Followed by `fread`ing the file byte by byte (130x base time)
/// Reads the file at `url` one byte per `fread` call and hands each byte to
/// `callback`, in order. Crashes if `fopen` returns nil.
///
/// - Parameters:
///   - url: File URL of the file to read.
///   - callback: Invoked once per byte, in file order.
func freadFileByBytes(_ url: URL, callback: (UInt8) -> Void) async {
    let stream = fopen(url.path, "rb")!
    defer { fclose(stream) }

    var current: UInt8 = 0
    // A single-item fread returns 1 on success; anything else ends the loop.
    while fread(&current, 1, 1, stream) == 1 {
        callback(current)
    }
}
Followed by reading the file byte by byte with no buffering (2900x base time)
/// Reads the file at `url` one byte per `read(2)` call and hands each byte to
/// `callback`, in order. The loop ends as soon as `read` returns less than 1.
///
/// - Parameters:
///   - url: File URL of the file to read.
///   - callback: Invoked once per byte, in file order.
func readFileByBytes(_ url: URL, callback: (UInt8) -> Void) async {
    let descriptor = open(url.path, O_RDONLY)
    defer { close(descriptor) }

    var current: UInt8 = 0
    // Covers both end-of-file (0) and an error return (-1).
    while read(descriptor, &current, 1) >= 1 {
        callback(current)
    }
}
Then something is seriously wrong with this AsyncStream implementation (7600x base time)

see the fragment at the bottom of this post.

Rest of the code if you want to try it
/// Runs the whole benchmark inside an unstructured `Task`.
///
/// Writes a temporary file of 100_000_001 bytes, each 0xAD, times every reader
/// against it with `measure` (using the `readWithMMap` timing as the baseline
/// for the reported factors), then deletes the file and prints "done".
func test() {
    Task {
        let url = URL.temporaryDirectory.appending(component: UUID().uuidString)
        let data = Data(repeating: 0xAD, count: 100_000_001)
        try! data.write(to: url)
        
        let base = await measure(url, nil, "readWithMMap", readWithMMap)
        await measure(url, base, "readFileByBlocks", readFileByBlocks)
        // freadFileByBlocks carries an extra `blockSize` parameter in its type, so it is
        // wrapped in a closure that relies on the default block size.
        await measure(url, base, "freadFileByBlocks", { await freadFileByBlocks($0, callback: $1) })
        await measure(url, base, "readWithData", readWithData)
        await measure(url, base, "readWithResourceBytes", readWithResourceBytes)
        await measure(url, base, "readWithURLSessionAsync", readWithURLSessionAsync)
        await measure(url, base, "readWithURLSession", readWithURLSession)
        await measure(url, base, "freadFileByBytes", freadFileByBytes)
        await measure(url, base, "readFileByBytes", readFileByBytes)
        await measure(url, base, "readWithAsyncStream", readWithAsyncStream)
        try! FileManager.default.removeItem(at: url)
        print("done")
    }
}

/// Times one reader implementation and prints its elapsed time and its
/// slowdown factor relative to a baseline.
///
/// The reader is given a callback that sums every byte it receives (with
/// wrapping addition); the sum is checked against `expectedChecksum` so a
/// reader that drops or duplicates bytes stops the benchmark.
///
/// - Parameters:
///   - url: File passed through to `execute`.
///   - base: Baseline elapsed time in seconds, or `nil` when this run is the
///     baseline itself (its factor is then reported as 1).
///   - title: Label printed in front of the timing.
///   - execute: The reader under test.
///   - expectedChecksum: Required sum of all delivered bytes. The default,
///     17_300_000_173, equals 100_000_001 bytes of value 0xAD (173).
/// - Returns: The elapsed wall-clock time in seconds.
@discardableResult func measure(_ url: URL, _ base: Double?, _ title: String, _ execute: (URL, @escaping (UInt8) -> Void) async -> Void, expectedChecksum: Int = 17_300_000_173) async -> Double {
    let start = Date()
    var result = 0
    await execute(url) { result &+= Int($0) }
    let elapsed = Date().timeIntervalSince(start)
    precondition(result == expectedChecksum, "\(title): byte sum \(result) does not match expected \(expectedChecksum)")
    // Without a baseline this run is the baseline; avoids 0/0 when elapsed rounds to zero.
    let factor = base.map { elapsed / $0 } ?? 1.0
    let elapsedString = String(format: "%.3f", elapsed)
    let factorString = String(format: "%.3f", factor)
    print("\(title): \(elapsedString)sec, \(factorString)x")
    return elapsed
}

// Kick off the benchmark; test() only starts a Task and returns immediately.
test()
// Keep running the current run loop so the script does not fall off the end
// of top-level code before the benchmark task has finished.
RunLoop.current.run(until: .distantFuture)

This is the one that takes a ridiculous amount of time (7600x base time):

/// Loads the whole file at `url` into a `Data` value, pushes every byte
/// through an `AsyncStream<UInt8>` one element at a time, and hands each byte
/// received from the stream to `callback`, in order.
///
/// NOTE(review): the stream is created with the default buffering policy and
/// every byte is yielded from the builder closure before `finish()`; this
/// presumably queues the entire file as individual elements before the
/// consuming loop starts — confirm against the AsyncStream documentation.
///
/// - Parameters:
///   - url: File URL of the file to read.
///   - callback: Invoked once per byte, in file order.
func readWithAsyncStream(_ url: URL, callback: @escaping (UInt8) -> Void) async {
    let byteStream = AsyncStream<UInt8> { continuation in
        let contents = try! Data(contentsOf: url)
        contents.withUnsafeBytes { (buffer: UnsafeRawBufferPointer) in
            for byte in buffer {
                continuation.yield(byte)
            }
        }
        continuation.finish()
    }

    var iterator = byteStream.makeAsyncIterator()
    while let byte = await iterator.next() {
        callback(byte)
    }
}

I don't know whether `AsyncStream` / continuations are inherently slow or whether I am using them incorrectly.

All tests were done with -O and there was no significant difference between mac and iOS device.

21 Likes