My usecase is doing parallel comparisons against a vector of byte-tags. unfortunately, this “simple” searching code does not vectorize on its own.
func find(_ tag:UInt8, in buffer:UnsafePointer<UInt8>) -> UInt16
{
(0 ..< 16).reduce(0){ $0 | (buffer[$1] == tag ? 1 << $1 : 0) }
}
output.find(_: Swift.UInt8, in: Swift.UnsafePointer<Swift.UInt8>)
-> Swift.UInt16:
pushq %rbp
movq %rsp, %rbp
xorl %edx, %edx
movl $1, %r8d
xorl %r9d, %r9d
jmp .LBB1_1
.LBB1_9:
movl %ecx, %eax
andb $15, %al
shlxl %eax, %r8d, %eax
.LBB1_10:
orl %r9d, %eax
leaq 1(%rcx), %rdx
movl %eax, %r9d
cmpq $15, %rcx
je .LBB1_7
.LBB1_1:
movq %rdx, %rcx
.LBB1_2:
leaq 1(%rcx), %rax
cmpb %dil, (%rsi,%rcx)
jne .LBB1_11
testq %rcx, %rcx
js .LBB1_4
cmpq $15, %rcx
jg .LBB1_12
jmp .LBB1_9
.LBB1_4:
cmpq $-15, %rcx
jge .LBB1_5
.LBB1_11:
cmpq $15, %rcx
.LBB1_12:
movq %rax, %rcx
jne .LBB1_2
jmp .LBB1_6
.LBB1_5:
movl %ecx, %eax
negb %al
andb $15, %al
shrxl %eax, %r8d, %eax
jmp .LBB1_10
.LBB1_6:
movl %r9d, %eax
.LBB1_7:
popq %rbp
retq
so far, the only way i’ve been able to get this to vectorize is to use the builtin intrinsics as @scanon suggested in a different thread:
#if arch(x86_64)
import _Builtin_intrinsics.intel
extension SIMD16 where Scalar == UInt8
{
func find(_ scalar:UInt8) -> UInt16
{
let repeated:Self = .init(repeating: scalar)
let mask:SIMD2<Int64> = _mm_cmpeq_epi8(
unsafeBitCast(self, to: SIMD2<Int64>.self),
unsafeBitCast(repeated, to: SIMD2<Int64>.self))
return .init(truncatingIfNeeded: _mm_movemask_epi8(mask))
}
}
#endif
(extension in output):Swift.SIMD16< where A == Swift.UInt8>.find(Swift.UInt8)
-> Swift.UInt16:
pushq %rbp
movq %rsp, %rbp
vpbroadcastb %edi, %xmm1
vpcmpeqb %xmm0, %xmm1, %k0
kmovd %k0, %eax
popq %rbp
retq
that constitutes one half of my use case. now i’m trying to figure out how to set the byte values in the vector to be compared against.
this works! again though, can we assume the layout of the SIMD elements in memory?