Hmm, this ended up being a curious little micro-optimisation puzzle.
Basically, if you write:
for i in 0..<arr.flatCount {
let result = arr[flatIndex: i]
blackHole(result)
}
You'll get this output (where I believe it is re-calculating the offset every iteration of the loop):
push rbp
mov rbp, rsp
push r15
push r14
push rbx
push rax
mov r15, qword ptr [rdi + 16]
shl r15, 2
test r15, r15
je .LBB5_3
mov r14, rdi
add r14, 32
xor eax, eax
.LBB5_2:
mov rcx, rax
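// IIUC, all of this shifting, subtracting and adding is recalculating the offset: it is a signed divide-by-4 (rounding toward zero) to pick the 16-byte group, plus the matching remainder to pick the element within it.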
sar rcx, 63
shr rcx, 62
add rcx, rax
sar rcx, 2
lea edx, [4*rcx]
lea rbx, [rax + 1]
sub eax, edx
shl rcx, 4
add rcx, r14
cdqe
movss xmm0, dword ptr [rcx + 4*rax]
call (output.blackHole(Swift.Float) -> ())
mov rax, rbx
cmp r15, rbx
jne .LBB5_2
.LBB5_3:
add rsp, 8
pop rbx
pop r14
pop r15
pop rbp
ret
However, writing it as a `while` loop:
var i = 0
while i < arr.flatCount {
let result = arr[flatIndex: i]
blackHole(result)
i += 1
}
Generates fewer, more pleasing instructions:
push rbp
mov rbp, rsp
push r15
push r14
push r12
push rbx
mov r15, qword ptr [rdi + 16]
shl r15, 2
test r15, r15
jle .LBB5_3
mov r14, rdi
add r14, 32
xor ebx, ebx
movabs r12, 4611686018427387900
.LBB5_2:
mov rax, rbx
and rax, r12
lea rax, [r14 + 4*rax]
mov ecx, ebx
and ecx, 3
movss xmm0, dword ptr [rax + 4*rcx]
call (output.blackHole(Swift.Float) -> ())
add rbx, 1
cmp r15, rbx
jne .LBB5_2
.LBB5_3:
pop rbx
pop r12
pop r14
pop r15
pop rbp
ret
I very much doubt you'll get anything nicer than the latter result, even from C.