I’m looking at some Swift-generated assembly for code that uses SIMD operations, and a lot of it just seems badly broken? As in, the compiler only seems to use the SIMD registers to receive the arguments, and then unpacks them into general-purpose registers and does all the operations byte by byte.
For example, this really trivial function
func add(a: SIMD16<UInt8>, b: SIMD16<UInt8>) -> SIMD16<UInt8>
{
a &+ b
}
just turns into this:
// stack setup
pushq %rbp
movq %rsp, %rbp
// save registers
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
// store arguments as local stack vars
movaps %xmm0, -64(%rbp)
movaps %xmm1, -80(%rbp)
// reload elements b[0] and b[1] ... why???
movb -80(%rbp), %al
movb -79(%rbp), %cl
// add element a[0] to b[0]
addb -64(%rbp), %al
// spill a[0] + b[0]
movb %al, -42(%rbp)
// add element a[1] to b[1]
addb -63(%rbp), %cl
// spill a[1] + b[1]
movb %cl, -41(%rbp)
// same dance for the remaining 14 elements, except these
// sums stay in registers instead of being spilled
movb -78(%rbp), %r8b
addb -62(%rbp), %r8b
movb -77(%rbp), %r9b
addb -61(%rbp), %r9b
movb -76(%rbp), %r10b
addb -60(%rbp), %r10b
movb -75(%rbp), %r11b
addb -59(%rbp), %r11b
movb -74(%rbp), %r14b
addb -58(%rbp), %r14b
movb -73(%rbp), %r15b
addb -57(%rbp), %r15b
movb -72(%rbp), %r12b
addb -56(%rbp), %r12b
movb -71(%rbp), %r13b
addb -55(%rbp), %r13b
movb -70(%rbp), %sil
addb -54(%rbp), %sil
movb -69(%rbp), %cl
addb -53(%rbp), %cl
movb -68(%rbp), %dl
addb -52(%rbp), %dl
movb -67(%rbp), %bl
addb -51(%rbp), %bl
movb -66(%rbp), %al
addb -50(%rbp), %al
movb -65(%rbp), %dil
addb -49(%rbp), %dil
// move... each byte... back into the SIMD registers,
// one by one for some reason
movzbl %dil, %edi
movd %edi, %xmm0
movzbl %al, %eax
movd %eax, %xmm1
punpcklbw %xmm0, %xmm1
movzbl %bl, %eax
movd %eax, %xmm0
movzbl %dl, %eax
movd %eax, %xmm2
punpcklbw %xmm0, %xmm2
punpcklwd %xmm1, %xmm2
movzbl %cl, %eax
movd %eax, %xmm0
movzbl %sil, %eax
movd %eax, %xmm3
punpcklbw %xmm0, %xmm3
movzbl %r13b, %eax
movd %eax, %xmm0
movzbl %r12b, %eax
movd %eax, %xmm1
punpcklbw %xmm0, %xmm1
punpcklwd %xmm3, %xmm1
punpckldq %xmm2, %xmm1
movzbl %r15b, %eax
movd %eax, %xmm0
movzbl %r14b, %eax
movd %eax, %xmm2
punpcklbw %xmm0, %xmm2
movzbl %r11b, %eax
movd %eax, %xmm0
movzbl %r10b, %eax
movd %eax, %xmm3
punpcklbw %xmm0, %xmm3
punpcklwd %xmm2, %xmm3
movzbl %r9b, %eax
movd %eax, %xmm0
movzbl %r8b, %eax
movd %eax, %xmm2
punpcklbw %xmm0, %xmm2
// reload the spilled sums, and move them into the
// SIMD registers, individually
movzbl -41(%rbp), %eax
movd %eax, %xmm4
movzbl -42(%rbp), %eax
movd %eax, %xmm0
// interleave the partially-packed vectors in xmm1–xmm4
// back together into xmm0
punpcklbw %xmm4, %xmm0
punpcklwd %xmm2, %xmm0
punpckldq %xmm3, %xmm0
punpcklqdq %xmm1, %xmm0
// restore registers
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
// leave
popq %rbp
retq
This ate up fourteen general-purpose registers and five vector registers for no reason, and still ended up spilling onto the stack…
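For reference, here’s what I’d have expected the whole thing to compile to (untested, just extrapolating from the SIMD8 case below, where the arguments land in xmm0 and xmm1 and the result comes back in xmm0):
// stack setup
pushq %rbp
movq %rsp, %rbp
// xmm0 += xmm1, all 16 bytes at once
paddb %xmm1, %xmm0
// leave
popq %rbp
retq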
Weirdly, this doesn’t seem to be a problem with eight-byte SIMD operations (maybe because a SIMD8 fits into a single general-purpose argument-passing register like rdi or rsi?):
func add(a: SIMD8<UInt8>, b: SIMD8<UInt8>) -> SIMD8<UInt8>
{
a &+ b
}
// stack setup
pushq %rbp
movq %rsp, %rbp
// load `a` into xmm0, as it should
movq %rdi, %xmm0
// load `b` into xmm1, as it should
movq %rsi, %xmm1
// xmm1 += xmm0
paddb %xmm0, %xmm1
// move the sum back into rax to return it
movq %xmm1, %rax
// leave
popq %rbp
retq
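As a hypothetical workaround (untested, just extrapolating from the SIMD8 behavior above — `add2` is my own name, not anything standard), I wonder if routing the add through the lowHalf/highHalf views would dodge the scalarization:
func add2(a: SIMD16<UInt8>, b: SIMD16<UInt8>) -> SIMD16<UInt8>
{
    var c = SIMD16<UInt8>()
    // each half is a SIMD8, which (per the above) does get vectorized
    c.lowHalf  = a.lowHalf  &+ b.lowHalf
    c.highHalf = a.highHalf &+ b.highHalf
    return c
}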