I'm performing matrix multiplication in Swift using the vDSP_mmulD and cblas_dgemm functions from Accelerate, as shown below. Each matrix is a square N x N matrix. The code is in a file named main.swift; it is compiled with swiftc -Xcc -DACCELERATE_NEW_LAPACK -Ounchecked main.swift and then executed with ./main.
// main.swift
import Accelerate

// Fill a vector of length n with uniform random values using LAPACK's dlarnv.
// The seed array must hold four values in [0, 4095], and the last value must be odd.
func random(_ n: Int) -> [Double] {
    var idist: Int32 = 1
    var iseed: [Int32] = (0..<3).map { _ in Int32.random(in: 1..<4095) }
    iseed += [2 * (Int32.random(in: 1..<4095) / 2) + 1]
    var num = Int32(n)
    var vec = [Double](repeating: 0.0, count: n)
    dlarnv_(&idist, &iseed, &num, &vec)
    return vec
}
// Multiply two square matrices stored as flat, row-major arrays using vDSP_mmulD: C = A * B.
func mmul(_ a: [Double], _ b: [Double], size: Int) -> [Double] {
    let len = vDSP_Length(size)
    let m: vDSP_Length = len
    let n: vDSP_Length = len
    let p: vDSP_Length = len
    let stride = 1
    var c = [Double](repeating: 0, count: a.count)
    vDSP_mmulD(a, stride, b, stride, &c, stride, m, n, p)
    return c
}
// Multiply two square matrices stored as flat, row-major arrays using cblas_dgemm:
// C = 1.0 * A * B + 0.0 * C.
func dgemm(_ a: inout [Double], _ b: inout [Double], size: Int32) -> [Double] {
    let m: Int32 = size
    let n: Int32 = size
    let k: Int32 = size
    var c = [Double](repeating: 0, count: a.count)
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, &a, size, &b, size, 0.0, &c, size)
    return c
}
// Example to compare with the Python result.
// Note: n is the dimension of the n x n matrices.
func run_mmul_example() {
    let n = 3
    let a: [Double] = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    let b: [Double] = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    let c = mmul(a, b, size: n)
    print(c)
}

// Example to compare with the Python result.
// Note: n is the dimension of the n x n matrices.
func run_dgemm_example() {
    let n: Int32 = 3
    var a: [Double] = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    var b: [Double] = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    let c = dgemm(&a, &b, size: n)
    print(c)
}
// Benchmark vDSP_mmulD with 8000 x 8000 matrices.
func run_mmul_benchmark() {
    let tic = ContinuousClock().now
    let n = 8000
    let a = random(n * n)
    let b = random(n * n)
    let c = mmul(a, b, size: n)
    print(c[0])
    print(c.last!)
    let toc = ContinuousClock().now
    print("elapsed \(toc - tic)")
}

// Benchmark cblas_dgemm with 8000 x 8000 matrices.
func run_dgemm_benchmark() {
    let tic = ContinuousClock().now
    let n = 8000
    var a = random(n * n)
    var b = random(n * n)
    let c = dgemm(&a, &b, size: Int32(n))
    print(c[0])
    print(c.last!)
    let toc = ContinuousClock().now
    print("elapsed \(toc - tic)")
}
run_mmul_example()
run_dgemm_example()
run_mmul_benchmark()
run_dgemm_benchmark()
I compared the Swift/Accelerate results to Python using the NumPy package; see the Python code below. The code is in a file named main.py and is executed with python main.py. The Swift and Python results are equivalent (the expected 3 x 3 product is noted after the Python listing), so I feel confident that both programs are doing the same calculation.
# main.py
import numpy as np
import time

def run_example():
    # Example to compare with Swift result
    a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    c = np.matmul(a, b)
    print(c)

def run_benchmark():
    # Benchmark
    tic = time.perf_counter()
    n = 8000
    a = np.random.rand(n, n)
    b = np.random.rand(n, n)
    c = np.matmul(a, b)
    print(c[0, 0])
    print(c[-1, -1])
    toc = time.perf_counter()
    print(f"elapsed {toc - tic} sec")

if __name__ == "__main__":
    run_example()
    run_benchmark()
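For reference, both 3 x 3 examples should print the same product, differing only in formatting. The Swift functions print the flat row-major array

[30.0, 36.0, 42.0, 66.0, 81.0, 96.0, 102.0, 126.0, 150.0]

and NumPy prints the equivalent 2-D array

[[ 30  36  42]
 [ 66  81  96]
 [102 126 150]]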
However, for a large 8000 x 8000 matrix, the Swift/Accelerate code is about one second slower than the Python/NumPy code. The table below shows the elapsed times in seconds reported by the benchmark functions over three runs. The results are from a 2019 MacBook Pro with a 2.6 GHz 6-core Intel CPU and 32 GB of RAM, running macOS 14.4.1 Sonoma.
| Run | vDSP_mmulD | cblas_dgemm | NumPy |
|-----|------------|-------------|-------|
| 1   | 5.69       | 5.38        | 4.42  |
| 2   | 5.77       | 5.51        | 4.40  |
| 3   | 5.73       | 5.48        | 4.43  |
Even when I ignore the generation of the random matrices and time only the multiplication call (see the sketch at the end of this post), the Swift code is still slower than the Python code. I know NumPy has some optimized code under the hood, but I'm surprised it's faster than Accelerate. I thought Accelerate was optimized for Apple hardware, so I was expecting better performance than NumPy. Is there something else I should do to improve the performance of matrix multiplication in Swift/Accelerate?
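For reference, this is the kind of change I mean by ignoring the random-matrix generation: a minimal variation of run_mmul_benchmark above in which only the clock placement moves, so the random vectors are created before timing starts (the function name here is just illustrative).

// Variation of run_mmul_benchmark that times only the multiplication.
func run_mmul_benchmark_multiply_only() {
    let n = 8000
    let a = random(n * n)
    let b = random(n * n)
    // Start the clock only after the input matrices have been generated.
    let tic = ContinuousClock().now
    let c = mmul(a, b, size: n)
    let toc = ContinuousClock().now
    print(c[0])
    print(c.last!)
    print("elapsed \(toc - tic)")
}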