I'm trying to write a simple binary stream reader for `Data` that will let me pick off basic `FixedWidthInteger` values. Unfortunately, the resulting code makes a LOT of calls that I think should be avoidable. This is the basic code:
/// A sequential reader that decodes fixed-width integers from a `Data` buffer.
///
/// `idx` is a zero-based byte offset from the start of `data`, regardless of
/// the `startIndex` of `data` (so slices of a larger `Data` are handled
/// correctly). Values are decoded as big-endian when `bigEndian` is `true`
/// and as little-endian otherwise.
struct BinaryReader {
    /// Creates a reader positioned at the first byte of `inData`.
    ///
    /// - Parameter inData: The bytes to read from.
    init(data inData: Data) {
        self.data = inData
    }

    /// Reads the next `MemoryLayout<T>.size` bytes as a `T` and advances
    /// `idx` past them.
    ///
    /// The bytes are copied straight out of the buffer exposed by
    /// `Data.withUnsafeBytes`, so no intermediate `Data` is created per read
    /// and the read is valid at any byte offset (no alignment requirement).
    ///
    /// - Returns: The decoded value, byte-swapped as needed according to
    ///   `bigEndian`.
    /// - Precondition: At least `MemoryLayout<T>.size` bytes remain at `idx`.
    mutating func get<T>() -> T where T: FixedWidthInteger {
        let size = MemoryLayout<T>.size
        let start = idx
        // Written as a subtraction so the check itself cannot overflow.
        precondition(start >= 0 && size <= data.count - start,
                     "get() out of bounds")
        let end = start + size

        var raw: T = 0
        data.withUnsafeBytes { src in
            withUnsafeMutableBytes(of: &raw) { dst in
                // `src` is zero-based even when `data` is a slice; a byte
                // copy avoids the alignment requirement of `load(as:)`.
                dst.copyMemory(from: UnsafeRawBufferPointer(rebasing: src[start..<end]))
            }
        }
        idx = end

        return bigEndian ? T(bigEndian: raw) : T(littleEndian: raw)
    }

    /// Moves the read position by `inDelta` bytes (negative values move
    /// backwards).
    ///
    /// The new position is validated before `idx` is updated. Positioning at
    /// `data.count` (one past the last byte) is allowed, since that is where
    /// `get()` leaves the reader after consuming the final value.
    ///
    /// - Parameter inDelta: The signed number of bytes to move by.
    /// - Precondition: The resulting position lies in `0...data.count`.
    mutating func seek(by inDelta: Int) {
        let target = idx + inDelta
        precondition(target >= 0 && target <= data.count,
                     "seek(by: \(inDelta)) out of bounds")
        idx = target
    }

    /// The bytes being read.
    let data: Data
    /// Zero-based byte offset of the next read, relative to the start of `data`.
    var idx = 0
    /// When `true`, values are decoded as big-endian; otherwise little-endian.
    var bigEndian = true
}
I had hoped that at least the `init(bigEndian:)`/`init(littleEndian:)` calls would optimize away, but they don’t. Looking at the assembly using `xcrun -sdk macosx swiftc [-O] -emit-assembly BinaryReader.swift`, I get this:
Without `-O`:
...
testb $1, 24(%rdx)
jne LBB7_10
jmp LBB7_11
LBB7_10:
movq -160(%rbp), %rdi
movq -184(%rbp), %rsi
movq -112(%rbp), %rdx
movq -136(%rbp), %rax
callq *16(%rax)
movq -88(%rbp), %rcx
movq %rax, -288(%rbp)
movq %rcx, %rax
movq -160(%rbp), %rdi
movq -112(%rbp), %r13
movq -112(%rbp), %rsi
movq -104(%rbp), %rdx
callq _$ss17FixedWidthIntegerP9bigEndianxx_tcfCTj
movq -184(%rbp), %rdi
movq -112(%rbp), %rsi
movq -136(%rbp), %rax
callq *8(%rax)
jmp LBB7_12
LBB7_11:
movq -160(%rbp), %rdi
movq -184(%rbp), %rsi
movq -112(%rbp), %rdx
movq -136(%rbp), %rax
callq *16(%rax)
movq -88(%rbp), %rcx
movq %rax, -296(%rbp)
movq %rcx, %rax
movq -160(%rbp), %rdi
movq -112(%rbp), %r13
movq -112(%rbp), %rsi
movq -104(%rbp), %rdx
callq _$ss17FixedWidthIntegerP12littleEndianxx_tcfCTj
movq -184(%rbp), %rdi
movq -112(%rbp), %rsi
movq -136(%rbp), %rax
callq *8(%rax)
LBB7_12:
leaq -24(%rbp), %rsp
popq %rbx
popq %r12
popq %r13
popq %rbp
retq
...
And with `-O`:
...
movq -88(%rbp), %rax
jne LBB4_5
movq -48(%rbp), %rdi
movq -56(%rbp), %rsi
movq %rsi, %r13
movq %r15, %rdx
callq _$ss17FixedWidthIntegerP9bigEndianxx_tcfCTj
jmp LBB4_6
LBB4_5:
movq -48(%rbp), %rdi
movq -56(%rbp), %rsi
movq %rsi, %r13
movq %r15, %rdx
callq _$ss17FixedWidthIntegerP12littleEndianxx_tcfCTj
LBB4_6:
leaq -40(%rbp), %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq
...
There are additional calls for `withUnsafeBytes` and `load`. It seems that all of this should be possible to optimize away. Is there a different way to implement this that would result in better performance? I’ve got to parse a lot of big files.