To rule out an LLVM bug, I wrote some C code to do the same thing and compiled it with clang. Sure enough, we get `zip1.8h` and `zip2.8h`, just as expected.
zip_example.c
// Compiled with `clang -S zip_example.c -O2 -o -`
#include <stdio.h>
#include <stdint.h>
#include <arm_neon.h>
// Prints the eight lanes of `v` in hexadecimal as a parenthesised,
// comma-separated tuple, with no trailing newline.
// NOTE: despite the "uint8x8" in the name, the argument is a uint16x8_t
// (eight 16-bit lanes).
void print_uint8x8(uint16x8_t v) {
    // Copy the lanes out first so the printf call below stays readable.
    const uint16_t lanes[8] = {
        v[0], v[1], v[2], v[3],
        v[4], v[5], v[6], v[7],
    };

    printf("(%x, %x, %x, %x, %x, %x, %x, %x)",
           lanes[0], lanes[1], lanes[2], lanes[3],
           lanes[4], lanes[5], lanes[6], lanes[7]);
}
// Thin wrapper around the `vzip1q_u16` NEON intrinsic. It is kept out of line
// so that the code generated for it can be inspected on its own.
__attribute__ ((noinline)) // Just so we can see these instructions in isolation
uint16x8_t zip1_8h(const uint16x8_t a, const uint16x8_t b) {
// Assembly emitted for this function; the whole body is a single `zip1.8h`:
// .globl _zip1_8h ; -- Begin function zip1_8h
// .p2align 2
// _zip1_8h: ; @zip1_8h
// .cfi_startproc
// ; %bb.0:
// zip1.8h v0, v0, v1
// ret
// .cfi_endproc
// ; -- End function
return vzip1q_u16(a, b);
}
// Thin wrapper around the `vzip2q_u16` NEON intrinsic. It is kept out of line
// so that the code generated for it can be inspected on its own.
__attribute__ ((noinline)) // Just so we can see these instructions in isolation
uint16x8_t zip2_8h(const uint16x8_t a, const uint16x8_t b) {
// Assembly emitted for this function; the whole body is a single `zip2.8h`:
// .globl _zip2_8h ; -- Begin function zip2_8h
// .p2align 2
// _zip2_8h: ; @zip2_8h
// .cfi_startproc
// ; %bb.0:
// zip2.8h v0, v0, v1
// ret
// .cfi_endproc
// ; -- End function
return vzip2q_u16(a, b);
}
// Zips two sample vectors with zip1_8h and zip2_8h and prints both results
// on a single line under a "Zipped output:" heading.
int main() {
    // One input holds the even values and the other the odd values, so the
    // interleaved result is expected to count up from 0 to 0xF.
    const uint16x8_t evens = {0, 2, 4, 6, 8, 0xA, 0xC, 0xE};
    const uint16x8_t odds = {1, 3, 5, 7, 9, 0xB, 0xD, 0xF};

    // Compute both halves before printing anything.
    const uint16x8_t zippedLow = zip1_8h(evens, odds);
    const uint16x8_t zippedHigh = zip2_8h(evens, odds);

    printf("Zipped output:\n");
    print_uint8x8(zippedLow);
    printf(" ");
    print_uint8x8(zippedHigh);
    printf("\n");

    return 0;
}
For reference, here's a minimal Swift comparison:
minimal_demo.swift
// Compiled with:
// - xcrun swiftc --target=arm64-apple-macos12 -emit-assembly -O ./minimal_demo.swift
// - xcrun swiftc --target=arm64-apple-macos12 -emit-sil -O ./minimal_demo.swift
// - xcrun swiftc --target=arm64-apple-macos12 -emit-ir -O ./minimal_demo.swift
/// Interleaves the *first* 4 elements of `a` with the *first* 4 elements of `b`,
/// producing the 8-element vector `(a[0], b[0], a[1], b[1], a[2], b[2], a[3], b[3])`.
/// The *last* 4 elements of `a` and `b` are ignored.
/// As the disassembly in the body shows, this compiles to a single `zip1.8h` instruction.
@inline(never) // Just so we can see these instructions in isolation
func zip1_8h(_ a: SIMD8<UInt16>, _ b: SIMD8<UInt16>) -> SIMD8<UInt16> {
// Disassembly of SIMDPlayground`zip1_8h(_:_:):
// 0x100003edc <+0>: zip1.8h v0, v0, v1
// 0x100003ee0 <+4>: ret
return SIMD8<UInt16>(
a[0], b[0],
a[1], b[1],
a[2], b[2],
a[3], b[3]
)
}
/// Interleaves the *last* 4 elements of `a` with the *last* 4 elements of `b`,
/// producing the 8-element vector `(a[4], b[4], a[5], b[5], a[6], b[6], a[7], b[7])`.
/// The *first* 4 elements of `a` and `b` are ignored.
/// This should compile to a single `zip2.8h` instruction, but the disassembly in the body
/// shows a `dup.8h` followed by seven `mov.h` lane inserts instead.
@inline(never) // Just so we can see these instructions in isolation
func zip2_8h(_ a: SIMD8<UInt16>, _ b: SIMD8<UInt16>) -> SIMD8<UInt16> {
// Disassembly of SIMDPlayground`zip2_8h(_:_:):
// 0x100003ebc <+0>: dup.8h v2, v0[4]
// 0x100003ec0 <+4>: mov.h v2[1], v1[4]
// 0x100003ec4 <+8>: mov.h v2[2], v0[5]
// 0x100003ec8 <+12>: mov.h v2[3], v1[5]
// 0x100003ecc <+16>: mov.h v2[4], v0[6]
// 0x100003ed0 <+20>: mov.h v2[5], v1[6]
// 0x100003ed4 <+24>: mov.h v2[6], v0[7]
// 0x100003ed8 <+28>: mov.h v2[7], v1[7]
// 0x100003edc <+32>: mov.16b v0, v2
// 0x100003ee0 <+36>: ret
return SIMD8<UInt16>(
a[4], b[4],
a[5], b[5],
a[6], b[6],
a[7], b[7]
)
}
// Sample inputs: `a` holds the even values and `b` the odd values, so
// interleaving them counts up from 0 to 0xF.
let a = SIMD8<UInt16>(0, 2, 4, 6, 8, 0xA, 0xC, 0xE)
let b = SIMD8<UInt16>(1, 3, 5, 7, 9, 0xB, 0xD, 0xF)
// Lanes 0...3 of each input are the zip1.8h half; lanes 4...7 are the zip2.8h half.
print(zip1_8h(a, b)) // Ignores the values 8...0xF (lanes 4...7 of `a` and `b`)
// Prints: SIMD8<UInt16>(0, 1, 2, 3, 4, 5, 6, 7)
print(zip2_8h(a, b)) // Ignores the values 0...7 (lanes 0...3 of `a` and `b`)
// Prints: SIMD8<UInt16>(8, 9, 10, 11, 12, 13, 14, 15)
SIL
// zip1_8h(_:_:)
// Optimized SIL for the Swift `zip1_8h`: it extracts lanes 0...3 from both
// arguments and inserts them alternately (a, b, a, b, ...) into lanes 0...7
// of a zero-initialized vector.
sil hidden [noinline] @$s12minimal_demo7zip1_8hys5SIMD8Vys6UInt16VGAG_AGtF : $@convention(thin) (SIMD8<UInt16>, SIMD8<UInt16>) -> SIMD8<UInt16> {
[global: ]
// %0 "a" // users: %4, %2
// %1 "b" // users: %8, %3
bb0(%0 : $SIMD8<UInt16>, %1 : $SIMD8<UInt16>):
debug_value %0 : $SIMD8<UInt16>, let, name "a", argno 1 // id: %2
debug_value %1 : $SIMD8<UInt16>, let, name "b", argno 2 // id: %3
// Unwrap the builtin vectors (%5 from `a`, %9 from `b`) and read lanes 0...3 of each.
// The index literals %6, %11, %14 and %17 (0...3) are reused below as insert positions.
%4 = struct_extract %0 : $SIMD8<UInt16>, #SIMD8._storage // user: %5
%5 = struct_extract %4 : $UInt16.SIMD8Storage, #UInt16.SIMD8Storage._value // users: %18, %15, %12, %7
%6 = integer_literal $Builtin.Int32, 0 // users: %21, %10, %7
%7 = builtin "extractelement_Vec8xInt16_Int32"(%5 : $Builtin.Vec8xInt16, %6 : $Builtin.Int32) : $Builtin.Int16 // user: %21
%8 = struct_extract %1 : $SIMD8<UInt16>, #SIMD8._storage // user: %9
%9 = struct_extract %8 : $UInt16.SIMD8Storage, #UInt16.SIMD8Storage._value // users: %19, %16, %13, %10
%10 = builtin "extractelement_Vec8xInt16_Int32"(%9 : $Builtin.Vec8xInt16, %6 : $Builtin.Int32) : $Builtin.Int16 // user: %22
%11 = integer_literal $Builtin.Int32, 1 // users: %22, %13, %12
%12 = builtin "extractelement_Vec8xInt16_Int32"(%5 : $Builtin.Vec8xInt16, %11 : $Builtin.Int32) : $Builtin.Int16 // user: %23
%13 = builtin "extractelement_Vec8xInt16_Int32"(%9 : $Builtin.Vec8xInt16, %11 : $Builtin.Int32) : $Builtin.Int16 // user: %24
%14 = integer_literal $Builtin.Int32, 2 // users: %23, %16, %15
%15 = builtin "extractelement_Vec8xInt16_Int32"(%5 : $Builtin.Vec8xInt16, %14 : $Builtin.Int32) : $Builtin.Int16 // user: %26
%16 = builtin "extractelement_Vec8xInt16_Int32"(%9 : $Builtin.Vec8xInt16, %14 : $Builtin.Int32) : $Builtin.Int16 // user: %28
%17 = integer_literal $Builtin.Int32, 3 // users: %24, %19, %18
%18 = builtin "extractelement_Vec8xInt16_Int32"(%5 : $Builtin.Vec8xInt16, %17 : $Builtin.Int32) : $Builtin.Int16 // user: %30
%19 = builtin "extractelement_Vec8xInt16_Int32"(%9 : $Builtin.Vec8xInt16, %17 : $Builtin.Int32) : $Builtin.Int16 // user: %32
// Build the result one lane at a time, starting from an all-zero vector.
%20 = builtin "zeroInitializer"<Builtin.Vec8xInt16>() : $Builtin.Vec8xInt16 // user: %21
%21 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%20 : $Builtin.Vec8xInt16, %7 : $Builtin.Int16, %6 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %22
%22 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%21 : $Builtin.Vec8xInt16, %10 : $Builtin.Int16, %11 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %23
%23 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%22 : $Builtin.Vec8xInt16, %12 : $Builtin.Int16, %14 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %24
%24 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%23 : $Builtin.Vec8xInt16, %13 : $Builtin.Int16, %17 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %26
%25 = integer_literal $Builtin.Int32, 4 // user: %26
%26 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%24 : $Builtin.Vec8xInt16, %15 : $Builtin.Int16, %25 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %28
%27 = integer_literal $Builtin.Int32, 5 // user: %28
%28 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%26 : $Builtin.Vec8xInt16, %16 : $Builtin.Int16, %27 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %30
%29 = integer_literal $Builtin.Int32, 6 // user: %30
%30 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%28 : $Builtin.Vec8xInt16, %18 : $Builtin.Int16, %29 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %32
%31 = integer_literal $Builtin.Int32, 7 // user: %32
%32 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%30 : $Builtin.Vec8xInt16, %19 : $Builtin.Int16, %31 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %33
// Re-wrap the builtin vector as SIMD8<UInt16> and return it.
%33 = struct $UInt16.SIMD8Storage (%32 : $Builtin.Vec8xInt16) // user: %34
%34 = struct $SIMD8<UInt16> (%33 : $UInt16.SIMD8Storage) // user: %35
return %34 : $SIMD8<UInt16> // id: %35
} // end sil function '$s12minimal_demo7zip1_8hys5SIMD8Vys6UInt16VGAG_AGtF'
// print(_:separator:terminator:)
sil @$ss5print_9separator10terminatoryypd_S2StF : $@convention(thin) (@guaranteed Array<Any>, @guaranteed String, @guaranteed String) -> ()
// zip2_8h(_:_:)
// Optimized SIL for the Swift `zip2_8h`: it extracts lanes 4...7 from both
// arguments and inserts them alternately (a, b, a, b, ...) into lanes 0...7
// of a zero-initialized vector.
sil hidden [noinline] @$s12minimal_demo7zip2_8hys5SIMD8Vys6UInt16VGAG_AGtF : $@convention(thin) (SIMD8<UInt16>, SIMD8<UInt16>) -> SIMD8<UInt16> {
[global: ]
// %0 "a" // users: %4, %2
// %1 "b" // users: %8, %3
bb0(%0 : $SIMD8<UInt16>, %1 : $SIMD8<UInt16>):
debug_value %0 : $SIMD8<UInt16>, let, name "a", argno 1 // id: %2
debug_value %1 : $SIMD8<UInt16>, let, name "b", argno 2 // id: %3
// Unwrap the builtin vectors (%5 from `a`, %9 from `b`) and read lanes 4...7 of each.
// The index literals %6, %11, %14 and %17 (4...7) are reused below as insert positions.
%4 = struct_extract %0 : $SIMD8<UInt16>, #SIMD8._storage // user: %5
%5 = struct_extract %4 : $UInt16.SIMD8Storage, #UInt16.SIMD8Storage._value // users: %18, %15, %12, %7
%6 = integer_literal $Builtin.Int32, 4 // users: %29, %10, %7
%7 = builtin "extractelement_Vec8xInt16_Int32"(%5 : $Builtin.Vec8xInt16, %6 : $Builtin.Int32) : $Builtin.Int16 // user: %22
%8 = struct_extract %1 : $SIMD8<UInt16>, #SIMD8._storage // user: %9
%9 = struct_extract %8 : $UInt16.SIMD8Storage, #UInt16.SIMD8Storage._value // users: %19, %16, %13, %10
%10 = builtin "extractelement_Vec8xInt16_Int32"(%9 : $Builtin.Vec8xInt16, %6 : $Builtin.Int32) : $Builtin.Int16 // user: %24
%11 = integer_literal $Builtin.Int32, 5 // users: %30, %13, %12
%12 = builtin "extractelement_Vec8xInt16_Int32"(%5 : $Builtin.Vec8xInt16, %11 : $Builtin.Int32) : $Builtin.Int16 // user: %26
%13 = builtin "extractelement_Vec8xInt16_Int32"(%9 : $Builtin.Vec8xInt16, %11 : $Builtin.Int32) : $Builtin.Int16 // user: %28
%14 = integer_literal $Builtin.Int32, 6 // users: %31, %16, %15
%15 = builtin "extractelement_Vec8xInt16_Int32"(%5 : $Builtin.Vec8xInt16, %14 : $Builtin.Int32) : $Builtin.Int16 // user: %29
%16 = builtin "extractelement_Vec8xInt16_Int32"(%9 : $Builtin.Vec8xInt16, %14 : $Builtin.Int32) : $Builtin.Int16 // user: %30
%17 = integer_literal $Builtin.Int32, 7 // users: %32, %19, %18
%18 = builtin "extractelement_Vec8xInt16_Int32"(%5 : $Builtin.Vec8xInt16, %17 : $Builtin.Int32) : $Builtin.Int16 // user: %31
%19 = builtin "extractelement_Vec8xInt16_Int32"(%9 : $Builtin.Vec8xInt16, %17 : $Builtin.Int32) : $Builtin.Int16 // user: %32
// Build the result one lane at a time, starting from an all-zero vector.
// Lanes 0...3 need fresh index literals (%21, %23, %25, %27).
%20 = builtin "zeroInitializer"<Builtin.Vec8xInt16>() : $Builtin.Vec8xInt16 // user: %22
%21 = integer_literal $Builtin.Int32, 0 // user: %22
%22 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%20 : $Builtin.Vec8xInt16, %7 : $Builtin.Int16, %21 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %24
%23 = integer_literal $Builtin.Int32, 1 // user: %24
%24 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%22 : $Builtin.Vec8xInt16, %10 : $Builtin.Int16, %23 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %26
%25 = integer_literal $Builtin.Int32, 2 // user: %26
%26 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%24 : $Builtin.Vec8xInt16, %12 : $Builtin.Int16, %25 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %28
%27 = integer_literal $Builtin.Int32, 3 // user: %28
%28 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%26 : $Builtin.Vec8xInt16, %13 : $Builtin.Int16, %27 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %29
%29 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%28 : $Builtin.Vec8xInt16, %15 : $Builtin.Int16, %6 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %30
%30 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%29 : $Builtin.Vec8xInt16, %16 : $Builtin.Int16, %11 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %31
%31 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%30 : $Builtin.Vec8xInt16, %18 : $Builtin.Int16, %14 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %32
%32 = builtin "insertelement_Vec8xInt16_Int16_Int32"(%31 : $Builtin.Vec8xInt16, %19 : $Builtin.Int16, %17 : $Builtin.Int32) : $Builtin.Vec8xInt16 // user: %33
// Re-wrap the builtin vector as SIMD8<UInt16> and return it.
%33 = struct $UInt16.SIMD8Storage (%32 : $Builtin.Vec8xInt16) // user: %34
%34 = struct $SIMD8<UInt16> (%33 : $UInt16.SIMD8Storage) // user: %35
return %34 : $SIMD8<UInt16> // id: %35
} // end sil function '$s12minimal_demo7zip2_8hys5SIMD8Vys6UInt16VGAG_AGtF'
LLVM IR
; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none)
; LLVM IR for the Swift zip1_8h; %0 is `a` and %1 is `b`.
; In each shufflevector mask, indices 0-7 select from the first operand and
; 8-15 from the second. Every step keeps the lanes built so far and fills
; one more lane from `a` or `b`.
define hidden swiftcc <8 x i16> @"$s12minimal_demo7zip1_8hys5SIMD8Vys6UInt16VGAG_AGtF"(<8 x i16> %0, <8 x i16> %1) local_unnamed_addr #4 {
entry:
; lane 0 <- a[0], lane 1 <- b[0] (both filled by this one shuffle)
%2 = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; lane 2 <- a[1]
%3 = shufflevector <8 x i16> %2, <8 x i16> %0, <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; lane 3 <- b[1]
%4 = shufflevector <8 x i16> %3, <8 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
; lane 4 <- a[2]
%5 = shufflevector <8 x i16> %4, <8 x i16> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 poison, i32 poison, i32 poison>
; lane 5 <- b[2]
%6 = shufflevector <8 x i16> %5, <8 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 10, i32 poison, i32 poison>
; lane 6 <- a[3]
%7 = shufflevector <8 x i16> %6, <8 x i16> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 11, i32 poison>
; lane 7 <- b[3]
%8 = shufflevector <8 x i16> %7, <8 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
ret <8 x i16> %8
}
; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none)
; LLVM IR for the Swift zip2_8h; %0 is `a` and %1 is `b`.
; In each shufflevector mask, indices 0-7 select from the first operand and
; 8-15 from the second. Every step keeps the lanes built so far and fills
; one more lane from `a` or `b`.
define hidden swiftcc <8 x i16> @"$s12minimal_demo7zip2_8hys5SIMD8Vys6UInt16VGAG_AGtF"(<8 x i16> %0, <8 x i16> %1) local_unnamed_addr #4 {
entry:
; lane 0 <- a[4]. This single-source shuffle has no counterpart in zip1_8h,
; whose first shuffle fills lanes 0 and 1 together.
%2 = shufflevector <8 x i16> %0, <8 x i16> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; lane 1 <- b[4]
%3 = shufflevector <8 x i16> %2, <8 x i16> %1, <8 x i32> <i32 0, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; lane 2 <- a[5]
%4 = shufflevector <8 x i16> %3, <8 x i16> %0, <8 x i32> <i32 0, i32 1, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; lane 3 <- b[5]
%5 = shufflevector <8 x i16> %4, <8 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 poison, i32 poison, i32 poison, i32 poison>
; lane 4 <- a[6]
%6 = shufflevector <8 x i16> %5, <8 x i16> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 14, i32 poison, i32 poison, i32 poison>
; lane 5 <- b[6]
%7 = shufflevector <8 x i16> %6, <8 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 14, i32 poison, i32 poison>
; lane 6 <- a[7]
%8 = shufflevector <8 x i16> %7, <8 x i16> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 15, i32 poison>
; lane 7 <- b[7]
%9 = shufflevector <8 x i16> %8, <8 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
ret <8 x i16> %9
}
The LLVM IR might shed some light. The `zip2_8h` function has this extra value (`%2`), which doesn't have a counterpart in `zip1_8h`. I'm not sure what to make of it:
%2 = shufflevector <8 x i16> %0, <8 x i16> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>