In the example below I use SIMD and Accelerate to add two large vectors. The SIMD4 and SIMD64 examples take much longer to run than the Accelerate example. Maybe Accelerate is doing something under the hood to optimize this problem. Anyway, I have never used SIMD types in Swift before, so can someone comment on my use of SIMD here and on whether I used it correctly?

```
import Accelerate
/// Adds `a` and `b` element-wise, four lanes at a time, using `SIMD4<Double>`,
/// and prints the elapsed wall-clock time of the addition.
///
/// The output array is allocated before the timer starts, so only the chunked
/// loop and the scalar tail loop are timed.
///
/// - Parameters:
///   - a: First operand.
///   - b: Second operand; must contain exactly as many elements as `a`.
/// - Precondition: `a.count == b.count`.
func runSimd4(_ a: [Double], _ b: [Double]) {
    // Without this check a shorter `b` traps with a bare index-out-of-range
    // error, and a longer `b` has its extra elements silently ignored.
    precondition(a.count == b.count, "runSimd4 requires vectors of equal length")

    var c = [Double](repeating: 0.0, count: a.count)
    let tic = Date.now

    // Chunk size is the lane count of the vector type (4 for SIMD4<Double>).
    let chunkSize = SIMD4<Double>.scalarCount
    // First index that does not belong to a complete chunk.
    let remainderStartIndex = a.count - (a.count % chunkSize)

    for i in stride(from: 0, to: remainderStartIndex, by: chunkSize) {
        // Gather four consecutive scalars from each input into a vector.
        let simdA = SIMD4<Double>(a[i], a[i + 1], a[i + 2], a[i + 3])
        let simdB = SIMD4<Double>(b[i], b[i + 1], b[i + 2], b[i + 3])

        // Lane-wise addition.
        let simdResult = simdA + simdB

        // Scatter the four lanes back into the output array.
        c[i] = simdResult[0]
        c[i + 1] = simdResult[1]
        c[i + 2] = simdResult[2]
        c[i + 3] = simdResult[3]
    }

    // Scalar tail: the elements left over after the last complete chunk.
    for i in remainderStartIndex..<a.count {
        c[i] = a[i] + b[i]
    }

    // `tic` is in the past, so the interval is negative; `.magnitude` flips the sign.
    let toc = tic.timeIntervalSinceNow.magnitude
    print("SIMD4 elapsed time ........ \(toc) s")
    //print("SIMD4 result ........", c)
}
/// Adds `a` and `b` element-wise, sixty-four lanes at a time, using
/// `SIMD64<Double>`, and prints the elapsed wall-clock time of the addition.
///
/// The output array is allocated before the timer starts, so only the chunked
/// loop and the scalar tail loop are timed.
///
/// - Parameters:
///   - a: First operand.
///   - b: Second operand; must contain exactly as many elements as `a`.
/// - Precondition: `a.count == b.count`.
func runSimd64(_ a: [Double], _ b: [Double]) {
    // Without this check a shorter `b` traps with a bare index-out-of-range
    // error, and a longer `b` has its extra elements silently ignored.
    precondition(a.count == b.count, "runSimd64 requires vectors of equal length")

    var c = [Double](repeating: 0.0, count: a.count)
    let tic = Date.now

    // Chunk size is the lane count of the vector type (64 for SIMD64<Double>).
    let chunkSize = SIMD64<Double>.scalarCount
    // First index that does not belong to a complete chunk.
    let remainderStartIndex = a.count - (a.count % chunkSize)

    for i in stride(from: 0, to: remainderStartIndex, by: chunkSize) {
        // Each complete chunk supplies one scalar per lane.
        let chunk = i..<(i + chunkSize)
        let simdA = SIMD64<Double>(a[chunk])
        let simdB = SIMD64<Double>(b[chunk])

        // Lane-wise addition.
        let simdResult = simdA + simdB

        // Copy every lane back into the matching slot of the output array.
        for j in simdResult.indices {
            c[i + j] = simdResult[j]
        }
    }

    // Scalar tail: the elements left over after the last complete chunk.
    for i in remainderStartIndex..<a.count {
        c[i] = a[i] + b[i]
    }

    // `tic` is in the past, so the interval is negative; `.magnitude` flips the sign.
    let toc = tic.timeIntervalSinceNow.magnitude
    print("SIMD64 elapsed time ....... \(toc) s")
    //print("SIMD64 result ........", c)
}
/// Adds `a` and `b` element-wise with Accelerate's `vDSP.add` and prints the
/// elapsed wall-clock time of that single call.
///
/// - Parameters:
///   - a: First operand.
///   - b: Second operand.
func runAccelerate(_ a: [Double], _ b: [Double]) {
    let start = Date.now
    let sum = vDSP.add(a, b)
    // `start` is in the past, so the interval is negative; take its absolute value.
    let elapsed = abs(start.timeIntervalSinceNow)
    print("Accelerate elapsed time ... \(elapsed) s")
    //print("Accelerate result ...", sum)
}
// --- Run examples ---
// Small hand-checkable inputs (disabled):
//let a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10.0]
//let b = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11.0]

// Two vectors of `n` random values drawn from 0...1.
let n = 100_000
let a: [Double] = (0..<n).map { _ in .random(in: 0...1) }
let b: [Double] = (0..<n).map { _ in .random(in: 0...1) }

// Time each implementation on the same pair of inputs.
runAccelerate(a, b)
runSimd4(a, b)
runSimd64(a, b)
```

The elapsed times from an Intel Mac are shown below. I did not run the code with any optimizations enabled.

```
Accelerate elapsed time ... 0.0004929 s
SIMD4 elapsed time ........ 0.01182 s
SIMD64 elapsed time ....... 0.09705 s
```