As was suggested in this thread, i implemented a move-mask operation by importing the x86 _mm_cmpeq_epi8 and _mm_movemask_epi8 intrinsics.
#if arch(x86_64)
import _Builtin_intrinsics.intel
extension SIMD16 where Scalar == UInt8
{
    func find(_ scalar:UInt8) -> UInt16
    {
        let repeated:Self = .init(repeating: scalar)
        let mask:SIMD2<Int64> = _mm_cmpeq_epi8(
            unsafeBitCast(self, to: SIMD2<Int64>.self),
            unsafeBitCast(repeated, to: SIMD2<Int64>.self))
        return .init(truncatingIfNeeded: _mm_movemask_epi8(mask))
    }
}
#endif
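for reference, a minimal usage sketch of the intrinsic-based method (only valid under the same #if arch(x86_64) condition, and using an example vector i made up for illustration). _mm_movemask_epi8 packs lane i into bit i of the result, so walking the mask from its lowest set bit visits the matching lanes in ascending order:

let vector:SIMD16<UInt8> = .init(1, 5, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 5, 1, 1, 5)
var mask:UInt16 = vector.find(5)
while mask != 0
{
    print(mask.trailingZeroBitCount) // 1, 4, 5, 12, 15
    mask &= mask &- 1 // clear the lowest set bit
}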
now i’m looking to make this work on all Swift platforms, not just the ones where #if arch(x86_64) evaluates to true. does anyone have any recommendations for implementing efficient fallbacks for this operation?
i have generic Swift SIMD{N}
code here:
func find(_ key:UInt8, in vector:SIMD16<UInt8>, _ body:(Int) -> ())
{
    // (key: 5, vector: (1, 5, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 5, 1, 1, 5))
    // weight each lane with a power of two; lanes 0 ... 7 will form the high
    // byte of the 16-bit mask, lanes 8 ... 15 the low byte
    let places:SIMD16<UInt8> =
        .init(128, 64, 32, 16, 8, 4, 2, 1, 128, 64, 32, 16, 8, 4, 2, 1),
        match:SIMD16<UInt8> = places.replacing(with: 0, where: vector .!= key)
    // match: ( 0, 64, 0, 0, 8, 4, 0, 0, 0, 0, 0, 0, 8, 0, 0, 1)
    // horizontal OR-reduction down to two bytes
    let r8:SIMD8<UInt8> = match.evenHalf | match.oddHalf,
        r4:SIMD4<UInt8> = r8.evenHalf | r8.oddHalf,
        r2:SIMD2<UInt8> = r4.evenHalf | r4.oddHalf
    let r:UInt16 = .init(r2.x) << 8 | .init(r2.y)
    // r: 0b0100_1100_0000_1001
    // scan r from the most-significant bit, calling body(_:) with the index
    // of each set bit, which is the index of a matching lane
    var i:Int = r.leadingZeroBitCount
    while i < 16
    {
        body(i)
        i += 1 + ((r << i) << 1).leadingZeroBitCount
    }
}
but it generates very inefficient assembly.
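to show the expected behavior of the generic version, here is a small driver using the same values as the trace comments above; the closure just collects the matching indices in ascending order:

let vector:SIMD16<UInt8> = .init(1, 5, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 5, 1, 1, 5)
var indices:[Int] = []
find(5, in: vector){ indices.append($0) }
print(indices) // [1, 4, 5, 12, 15]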