I ran your Metal example and didn't see any speedup compared to Accelerate on my Intel Mac. The exact code I ran is shown below.
import MetalPerformanceShaders
/// Convenience helpers for building `Float` matrices backed by Metal buffers
/// and multiplying them with `MPSMatrixMultiplication`.
extension MPSMatrix {
    /// Returns the system default Metal device, stopping with a clear message
    /// when the machine has none.
    private static func requireDefaultDevice() -> MTLDevice {
        guard let device = MTLCreateSystemDefaultDevice() else {
            fatalError("No Metal device is available on this system")
        }
        return device
    }

    /// Copies `floats` into a newly allocated Metal buffer on the system default device.
    ///
    /// - Parameters:
    ///   - floats: Matrix elements laid out row after row; must contain at least
    ///     `rows * columns` values (only that many are copied).
    ///   - rows: Number of matrix rows; must be positive.
    ///   - columns: Number of matrix columns; must be positive.
    /// - Returns: A buffer of `rows * columns * MemoryLayout<Float>.stride` bytes
    ///   holding a copy of the leading elements of `floats`.
    static func makeBuffer(floats: [Float], rows: Int, columns: Int) -> MTLBuffer {
        precondition(rows > 0 && columns > 0, "Matrix dimensions must be positive")
        // Without this check the copy below would read past the end of `floats`.
        precondition(floats.count >= rows * columns, "floats must contain at least rows * columns elements")
        let rowBytes = columns * MemoryLayout<Float>.stride
        let device = requireDefaultDevice()
        return floats.withUnsafeBufferPointer { bp -> MTLBuffer in
            guard let base = bp.baseAddress,
                  let buffer = device.makeBuffer(bytes: base, length: rows * rowBytes, options: [])
            else {
                fatalError("Failed to allocate a \(rows * rowBytes)-byte Metal buffer")
            }
            return buffer
        }
    }

    /// Creates a `rows` x `columns` float32 matrix initialised with a copy of `floats`.
    ///
    /// - Parameters:
    ///   - floats: Matrix elements laid out row after row; at least `rows * columns` values.
    ///   - rows: Number of matrix rows; must be positive.
    ///   - columns: Number of matrix columns; must be positive.
    convenience init(floats: [Float], rows: Int, columns: Int) {
        let buffer = Self.makeBuffer(floats: floats, rows: rows, columns: columns)
        // Rows are tightly packed: the stride between rows is exactly one row of floats.
        let rowBytes = columns * MemoryLayout<Float>.stride
        let descriptor = MPSMatrixDescriptor(rows: rows, columns: columns, rowBytes: rowBytes, dataType: .float32)
        self.init(buffer: buffer, descriptor: descriptor)
    }

    /// Creates a `rows` x `columns` float32 matrix backed by a freshly allocated
    /// buffer on the system default device.
    ///
    /// - Parameters:
    ///   - rows: Number of matrix rows; must be positive.
    ///   - columns: Number of matrix columns; must be positive.
    convenience init(rows: Int, columns: Int) {
        precondition(rows > 0 && columns > 0, "Matrix dimensions must be positive")
        let rowBytes = columns * MemoryLayout<Float>.stride
        guard let buffer = Self.requireDefaultDevice().makeBuffer(length: rows * rowBytes, options: []) else {
            fatalError("Failed to allocate a \(rows * rowBytes)-byte Metal buffer")
        }
        let descriptor = MPSMatrixDescriptor(rows: rows, columns: columns, rowBytes: rowBytes, dataType: .float32)
        self.init(buffer: buffer, descriptor: descriptor)
    }

    /// Computes the matrix product `lhs * rhs` on the GPU and blocks until it has finished.
    ///
    /// Both operands must live on the same Metal device and `lhs.columns` must
    /// equal `rhs.rows`. A new command queue is created on every call.
    ///
    /// - Returns: A new `lhs.rows` x `rhs.columns` float32 matrix holding the product.
    static func * (lhs: MPSMatrix, rhs: MPSMatrix) -> MPSMatrix {
        let device = lhs.device
        precondition(device === rhs.device)
        precondition(lhs.columns == rhs.rows)

        // Allocate the result on the operands' device rather than the system
        // default one, so the kernel never writes into a buffer owned by a
        // different GPU.
        let rowBytes = rhs.columns * MemoryLayout<Float>.stride
        guard let resultBuffer = device.makeBuffer(length: lhs.rows * rowBytes, options: []) else {
            fatalError("Failed to allocate a \(lhs.rows * rowBytes)-byte Metal buffer for the result")
        }
        let descriptor = MPSMatrixDescriptor(rows: lhs.rows, columns: rhs.columns, rowBytes: rowBytes, dataType: .float32)
        let result = MPSMatrix(buffer: resultBuffer, descriptor: descriptor)

        guard let queue = device.makeCommandQueue(),
              let commandBuffer = queue.makeCommandBuffer()
        else {
            fatalError("Failed to create a Metal command queue or command buffer")
        }
        let mul = MPSMatrixMultiplication(device: device, resultRows: lhs.rows, resultColumns: rhs.columns, interiorColumns: lhs.columns)
        mul.encode(commandBuffer: commandBuffer, leftMatrix: lhs, rightMatrix: rhs, resultMatrix: result)
        commandBuffer.commit()
        commandBuffer.waitUntilCompleted()

        // A failed command buffer leaves `result` unwritten; surface that
        // instead of silently returning it.
        if let error = commandBuffer.error {
            fatalError("GPU matrix multiplication failed: \(error)")
        }
        return result
    }
}
/// Multiplies a fixed 3x3 matrix by an identical one on the GPU and prints the
/// nine elements of the product on a single line, separated by spaces.
func runExample() {
    let side = 3
    let elements: [Float] = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    let lhs = MPSMatrix(floats: elements, rows: side, columns: side)
    let rhs = MPSMatrix(floats: elements, rows: side, columns: side)
    let product = lhs * rhs

    // View the result buffer's contents as `side * side` consecutive floats.
    let count = side * side
    let base = product.data.contents().bindMemory(to: Float.self, capacity: count)
    for value in UnsafeBufferPointer(start: base, count: count) {
        print(value, terminator: " ")
    }
    print("")
}
/// Times a single 8000x8000 GPU matrix multiplication of two all-zero matrices
/// and prints the elapsed wall-clock duration.
///
/// Only the `*` call is timed; building the input matrices happens before the
/// clock is read.
func runBenchmark() {
    let dimension = 8000
    let zeros = [Float](repeating: 0, count: dimension * dimension)
    let lhs = MPSMatrix(floats: zeros, rows: dimension, columns: dimension)
    let rhs = MPSMatrix(floats: zeros, rows: dimension, columns: dimension)

    let clock = ContinuousClock()
    let start = clock.now
    _ = lhs * rhs
    let elapsed = clock.now - start
    print("metal elapsed \(elapsed) (w/o random)")
}
// Script entry point: print the small example product, then time the large multiplication.
runExample()
runBenchmark()
Compile and run with:
swiftc -framework CoreGraphics -Ounchecked mainmetal.swift
./mainmetal
This gives the following output:
30.0 36.0 42.0 66.0 81.0 96.0 102.0 126.0 150.0
metal elapsed 4.259647533 seconds (w/o random)