There are no function calls inside the loop in the resulting (optimised) code:
// Branch-free loop body (AArch64, compiler output as quoted).
// Each iteration loads one 64-bit word through x10, byte-reverses it, and adds
// the leading-zero count of the reversed word to the running total in x8 only
// when all of the following hold:
//   - the loaded word is non-zero,
//   - the reversed word is >= 0x80 (unsigned),
//   - bits 15:8 of the reversed word are < bits 7:0 (unsigned).
// x9 is a down-counter. The initial values of x8, x9 and x10 are set outside
// this listing.
loc_10008cd7c:
ldr x11, [x10], #0x8            // x11 = 64-bit load from [x10]; then x10 += 8 (post-index)
rev x12, x11                    // x12 = x11 with its byte order reversed
ubfx x13, x12, #0x8, #0x8       // x13 = bits 15:8 of the reversed word
and x14, x12, #0xff             // x14 = bits 7:0 of the reversed word
cmp x12, #0x80                  // hs (carry set) iff reversed word >= 0x80, unsigned
ccmp x13, x14, #0x2, hs         // if hs: flags = cmp x13, x14; otherwise NZCV = 0b0010 (C set)
                                // => hs afterwards iff x12 < 0x80 or x13 >= x14
clz x12, x12                    // computed unconditionally; does not touch the flags
add x12, x12, x8                // candidate total = x8 + clz (non-flag-setting add)
csel x12, x8, x12, hs           // hs: discard the candidate and keep x8
cmp x11, #0x0                   // was the loaded word zero?
csel x8, x8, x12, eq            // zero word: x8 unchanged; otherwise commit x12
subs x9, x9, #0x1               // decrement the counter and set flags for the branch
b.ne loc_10008cd7c              // loop until x9 reaches zero
In comparison, the imperative version is:
// Branching loop (AArch64, compiler output as quoted).
// x9 is the element index and x8 the running total. For each non-zero 64-bit
// word, the word is byte-reversed and its leading-zero count is added to x8
// only when bits 15:8 of the reversed word are < bits 7:0 (unsigned) and the
// reversed word is >= 0x80. The base/limit registers (x10-x13) are set up
// outside this listing, as are the targets loc_10008d338 and loc_10008d410.
loc_10008d3a8:
cmp x9, x10
b.eq loc_10008d338              // index == x10: leave the loop (target not shown)
b.hs loc_10008d410              // index > x10 (unsigned): target not shown;
                                // presumably an out-of-bounds failure path - confirm
ldr x14, [x11, x9, lsl #3]      // x14 = 64-bit load from x11 + x9 * 8
cbnz x14, loc_10008d3d0         // non-zero word: go and process it
// Skip loop for zero words. NOTE(review): presumably x12 = x11 + 8 and
// x13 = x10 - 1, so that this walks the elements following index x9 - confirm
// against the code preceding this listing.
loc_10008d3bc:
cmp x13, x9
b.eq loc_10008d338              // limit reached: leave the loop
ldr x14, [x12, x9, lsl #3]      // x14 = 64-bit load from x12 + x9 * 8
add x9, x9, #0x1
cbz x14, loc_10008d3bc          // still zero: keep skipping
// Process a non-zero word held in x14.
loc_10008d3d0:
add x9, x9, #0x1                // advance the index past this word
rev x14, x14                    // x14 = word with its byte order reversed
ubfx x15, x14, #0x8, #0x8       // x15 = bits 15:8 of the reversed word
cmp x15, w14, uxtb              // compare against bits 7:0 of the reversed word
b.hs loc_10008d3a8              // bits 15:8 >= bits 7:0: nothing to add
cmp x14, #0x80
b.lo loc_10008d3a8              // reversed word < 0x80: nothing to add
clz x14, x14                    // x14 = leading-zero count of the reversed word
add x8, x14, x8                 // accumulate into the running total
b loc_10008d3a8