I'm performing matrix multiplication in Swift using the `vDSP_mmulD`

and `cblas_dgemm`

functions from Accelerate as shown below. Each matrix is a square N x N matrix. The code is in a file named `main.swift`

and is compiled with `swiftc -Xcc -DACCELERATE_NEW_LAPACK -Ounchecked main.swift`

then executed with `./main`

.

```
// main.swift
import Accelerate
func random(_ n: Int) -> [Double] {
var idist: Int32 = 1
var iseed: [Int32] = (0..<3).map { _ in Int32.random(in: 1..<4095) }
iseed += [2 * (Int32.random(in: 1..<4095) / 2) + 1 ]
var num = Int32(n)
var vec = [Double](repeating: 0.0, count: n)
dlarnv_(&idist, &iseed, &num, &vec)
return vec
}
func mmul(_ a: [Double], _ b: [Double], size: Int) -> [Double] {
let len = vDSP_Length(size)
let m: vDSP_Length = len
let n: vDSP_Length = len
let p: vDSP_Length = len
let stride = 1
var c = [Double](repeating: 0, count: a.count)
vDSP_mmulD(a, stride, b, stride, &c, stride, m, n, p)
return c
}
func dgemm(_ a: inout [Double], _ b: inout [Double], size: Int32) -> [Double] {
let m: Int32 = size
let n: Int32 = size
let k: Int32 = size
var c = [Double](repeating: 0, count: a.count)
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, &a, size, &b, size, 0.0, &c, size)
return c
}
// Example to compare with Python result
// Notice that n is for NxN matrix
func run_mmul_example() {
let n = 3
let a: [Double] = [1, 2, 3, 4, 5, 6, 7, 8, 9]
let b: [Double] = [1, 2, 3, 4, 5, 6, 7, 8, 9]
let c = mmul(a, b, size: n)
print(c)
}
// Example to compare with Python result
// Notice that n is for NxN matrix
func run_dgemm_example() {
let n: Int32 = 3 // where n is for NxN matrix
var a: [Double] = [1, 2, 3, 4, 5, 6, 7, 8, 9]
var b: [Double] = [1, 2, 3, 4, 5, 6, 7, 8, 9]
let c = dgemm(&a, &b, size: n)
print(c)
}
func run_mmul_benchmark() {
let tic = ContinuousClock().now
let n = 8000
let a = random(n * n)
let b = random(n * n)
let c = mmul(a, b, size: n)
print(c[0])
print(c.last!)
let toc = ContinuousClock().now
print("elapsed \(toc - tic)")
}
func run_dgemm_benchmark() {
let tic = ContinuousClock().now
let n = 8000
var a = random(n * n)
var b = random(n * n)
let c = dgemm(&a, &b, size: Int32(n))
print(c[0])
print(c.last!)
let toc = ContinuousClock().now
print("elapsed \(toc - tic)")
}
run_mmul_example()
run_dgemm_example()
run_mmul_benchmark()
run_dgemm_benchmark()
```

I compared the Swift/Accelerate results to Python using the NumPy package. See Python code below. The code is in a file named `main.py`

and executed with `python main.py`

. The Swift and Python results are equivalent, so I feel confident that each code is doing the same calculation.

```
# main.py
import numpy as np
import time
def run_example():
# Example to compare with Swift result
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
c = np.matmul(a, b)
print(c)
def run_benchmark():
# Benchmark
tic = time.perf_counter()
n = 8000
a = np.random.rand(n, n)
b = np.random.rand(n, n)
c = np.matmul(a, b)
print(c[0, 0])
print(c[-1, -1])
toc = time.perf_counter()
print(f"elapsed {toc - tic} sec")
if __name__ == "__main__":
run_example()
run_benchmark()
```

However, for a large 8000 x 8000 matrix, the Swift/Accelerate code is about one second slower than the Python/NumPy code. The table below shows the elapsed time in seconds from the benchmark functions for three runs. The results are from a 2019 MacBook Pro with a 2.6 GHz 6-core Intel CPU and 32 GB of RAM running on macOS 14.4.1 Sonoma.

Run | vDSP_mmulD | cblas_dgemm | NumPy |
---|---|---|---|

1 | 5.69 | 5.38 | 4.42 |

2 | 5.77 | 5.51 | 4.40 |

3 | 5.73 | 5.48 | 4.43 |

Even if I ignore the generation of the random matrices, the Swift code is still slower than the Python code. I know NumPy has some optimized code under-the-hood but I'm surprised it's faster than Accelerate. I thought Accelerate is optimized for Apple hardware so I was expecting to get better performance than NumPy. Is there something else I should do to improve performance in Swift/Accelerate for matrix multiplication?