How to enable SIMD in Swift WebAssembly

posting this here because it took me a while to figure this out and hopefully it helps someone else.

i noticed from playing around with Swift WebAssembly that the Swift compiler (at least the 6.1.2 compiler with a recent Wasm SDK) does not emit any actual SIMD instructions in the generated WebAssembly by default. for example, take this small program and build it in release mode:

import Vector

let x: Vector3 = .init(1, 2, 3)
let y: Vector3 = .init(4, 5, 6)
print(x + y)

$ swift build --swift-sdk wasm32-unknown-wasi -c release --product vector
Building for production...
[7/7] Linking vector.wasm
Build of product 'vector' complete! (1.56s)


$ wasm2wat .build/release/vector.wasm | grep -E "(v128|simd|f32x4|i32x4)"
            call $udiv128by64to64
          call $udiv128by64to64
  (func $udiv128by64to64 (type 254) (param i64 i64 i64 i32) (result i64)
    call $udiv128by64to64default
  (func $udiv128by64to64default (type 254) (param i64 i64 i64 i32) (result i64)

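there are no SIMD instructions in this binary – the only matches are for udiv128by64to64, which grep picks up purely because its name contains the substring "v128" (udiv128). it is just a BigInt-style helper (128-bit by 64-bit integer division) provided by the runtime, not a SIMD routine.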

to coax Swift into actually using SIMD instructions, you have to pass -Xcc -msimd128 to swift build:

$ swift build --swift-sdk wasm32-unknown-wasi -c release --product vector \
    -Xcc -msimd128

then you get all of this stuff:

$ wasm2wat .build/release/vector.wasm | grep -E "(v128|simd|f32x4|i32x4)"
  (type (;59;) (func (param v128 v128 i32 i32)))
  (type (;60;) (func (param i32 v128 v128 i32 i32)))
  (type (;61;) (func (param v128 v128 v128 v128 i32 i32) (result i32)))
  (type (;62;) (func (param v128 v128 i32 i32) (result i32)))
  (type (;64;) (func (param v128 v128 i32 i32) (result f64)))
  (type (;66;) (func (param i32 v128 v128 v128 v128 i32 i32)))
  (type (;67;) (func (param i32 v128 v128 f64 i32 i32)))
  (type (;68;) (func (param i32 f64 v128 v128 i32 i32)))
  (func $$s6Vector7Vector3V7storages5SIMD3VySdGvs (type 59) (param v128 v128 i32 i32)
    v128.store offset=16
    v128.store)
  (func $$s6Vector7Vector3V7storageACs5SIMD3VySdG_tcfC (type 60) (param i32 v128 v128 i32 i32)
    v128.store offset=16
    v128.store)
  (func $$s6Vector7Vector3V23__derived_struct_equalsySbAC_ACtFZ (type 61) (param v128 v128 v128 v128 i32 i32) (result i32)
      i32x4.extract_lane 0
  (func $$s6Vector7Vector3V4hash4intoys6HasherVz_tF (type 60) (param i32 v128 v128 i32 i32)
  (func $$ss4SIMDPsE4hash4intoys6HasherVz_tFs5SIMD3VySdG_Tg5 (type 60) (param i32 v128 v128 i32 i32)
  (func $$s6Vector7Vector3V9hashValueSivg (type 62) (param v128 v128 i32 i32) (result i32)
    (local i32 v128 v128 v128 v128)
    v128.load offset=16
    v128.load
    v128.load offset=16
    v128.load
            v128.store
            v128.store offset=16
            v128.store offset=32
            v128.store offset=48
    (local i32 v128 v128)
    v128.load offset=16
    v128.load
    v128.load
    v128.load offset=16
    (local i32 v128 v128)
    v128.load offset=16
    v128.load
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16
    v128.store)
    (local v128)
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16
    v128.store)
  (func $$s6Vector7Vector3V3sumSdvg (type 64) (param v128 v128 i32 i32) (result f64)
  (func $$s6Vector7Vector3V2peoiyyACz_ACtFZ (type 60) (param i32 v128 v128 i32 i32)
    v128.load
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
  (func $$s6Vector7Vector3V2seoiyyACz_ACtFZ (type 60) (param i32 v128 v128 i32 i32)
    v128.load
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
  (func $$s6Vector7Vector3V2meoiyyACz_ACtFZ (type 60) (param i32 v128 v128 i32 i32)
    v128.load
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
  (func $$s6Vector7Vector3V2deoiyyACz_ACtFZ (type 60) (param i32 v128 v128 i32 i32)
    v128.load
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
    v128.load
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
    v128.load
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
    v128.load
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
    v128.load
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
  (func $$s6Vector7Vector3V1poiyA2C_ACtFZ (type 66) (param i32 v128 v128 v128 v128 i32 i32)
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16
    v128.store)
  (func $$s6Vector7Vector3V1soiyA2C_ACtFZ (type 66) (param i32 v128 v128 v128 v128 i32 i32)
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16
    v128.store)
  (func $$s6Vector7Vector3V1moiyA2C_ACtFZ (type 66) (param i32 v128 v128 v128 v128 i32 i32)
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16
    v128.store)
  (func $$s6Vector7Vector3V1doiyA2C_ACtFZ (type 66) (param i32 v128 v128 v128 v128 i32 i32)
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16
    v128.store)
  (func $$s6Vector7Vector3V1poiyA2C_SdtFZ (type 67) (param i32 v128 v128 f64 i32 i32)
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
  (func $$s6Vector7Vector3V1soiyA2C_SdtFZ (type 67) (param i32 v128 v128 f64 i32 i32)
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
  (func $$s6Vector7Vector3V1moiyA2C_SdtFZ (type 67) (param i32 v128 v128 f64 i32 i32)
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
  (func $$s6Vector7Vector3V1doiyA2C_SdtFZ (type 67) (param i32 v128 v128 f64 i32 i32)
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
  (func $$s6Vector7Vector3V1poiyACSd_ACtFZ (type 68) (param i32 f64 v128 v128 i32 i32)
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
  (func $$s6Vector7Vector3V1soiyACSd_ACtFZ (type 68) (param i32 f64 v128 v128 i32 i32)
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
  (func $$s6Vector7Vector3V1moiyACSd_ACtFZ (type 68) (param i32 f64 v128 v128 i32 i32)
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
  (func $$s6Vector7Vector3V1doiyACSd_ACtFZ (type 68) (param i32 f64 v128 v128 i32 i32)
    v128.store
    v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
    v128.store offset=16)
    v128.load
    v128.store
    v128.load
    v128.store
        v128.const i32x4 0x00000000 0x00000000 0x00000000 0x00000000
        v128.store offset=8 align=8
  (func $$s6Vector7Vector3V7storages5SIMD3VySdGvg (type 60) (param i32 v128 v128 i32 i32)
    v128.load
    v128.load offset=16
    v128.store offset=16
    v128.store
  (func $$s6Vector2oooPyAA24_AbsoluteValueExpressionVyAA7Vector3VGAFF (type 60) (param i32 v128 v128 i32 i32)
    v128.store offset=16
    v128.store)
  (func $$s6Vector2ooopySdAA24_AbsoluteValueExpressionVyAA7Vector3VGF (type 64) (param v128 v128 i32 i32) (result f64)
    v128.const i32x4 0x00000000 0x40180000 0x00000000 0x00000000
    v128.store offset=16
    v128.const i32x4 0x00000000 0x40100000 0x00000000 0x40140000
    v128.store
    v128.const i32x4 0x00000000 0x40080000 0x00000000 0x00000000
    v128.store offset=16
    v128.const i32x4 0x00000000 0x3ff00000 0x00000000 0x40000000
    v128.store
    v128.const i32x4 0x00000000 0x40220000 0x00000000 0x00000000
    v128.store offset=32
    v128.const i32x4 0x00000000 0x40140000 0x00000000 0x401c0000
    v128.store offset=16
5 Likes

Not sure if this is in any way related, but where does the Vector3 type/Vector package come from?

FWIW, it's no different from enabling other LLVM backend features for any other instruction set; there's nothing exclusive to Wasm here in terms of passing the options. See the rest of the target-dependent -m and -f options supported by Clang; there is a Wasm section there too. There was a thread or two here on the forums about controlling CPU-specific architecture features, which is done in exactly the same way.

1 Like

Vector is just a private library i wrote, which is little more than a wrapper around SIMD3<Double>.

1 Like