the following test program just doesn’t seem to be benefiting from TaskGroup
parallelism at all (beyond a marginal improvement from n=1 to n=2 cores), despite running on an 8-core machine.
Although I have marked `Partition` as `@unchecked Sendable` in this example, each partition is assigned to, and manipulated by, exactly one child task.
/// A value-type wrapper around a privately owned `[Int]` buffer that the
/// benchmark rewrites in place.
struct PartitionValues {
    /// Backing storage; not accessible outside this type.
    private var buffer: [Int]

    /// Wraps an already-built buffer.
    private init(buffer: [Int]) {
        self.buffer = buffer
    }

    /// Builds a buffer of `count * scale` elements: `scale` random values
    /// drawn from `0 ..< 100`, with that run repeated `count` times.
    init(count: Int, scale: Int) {
        let basis: [Int] = (0 ..< scale).map { _ in Int.random(in: 0 ..< 100) }
        let tiled = repeatElement(basis, count: count).joined()
        self.init(buffer: Array(tiled))
    }

    /// Replaces every element `x` with `2 * x + 1` in place, then prints the
    /// element count and how long the loop took, measured with
    /// `SuspendingClock` instants.
    mutating func process() {
        let began: SuspendingClock.Instant = .now
        for index in buffer.indices {
            // Checked (trapping) arithmetic, same as the original expression.
            buffer[index] = 2 * buffer[index] + 1
        }
        let finished: SuspendingClock.Instant = .now
        print("processed \(self.buffer.count) elements in \(finished - began)")
    }
}
/// Reference-type owner of a single `PartitionValues`.
///
/// `@unchecked Sendable` turns off the compiler's sendability checking for the
/// mutable `values` property; nothing in this class synchronizes access to it.
final class Partition: @unchecked Sendable {
    /// The owned values; readable from outside, writable only from within.
    private(set) var values: PartitionValues

    /// Stores the given values.
    private init(values: PartitionValues) {
        self.values = values
    }

    /// Makes a partition whose values are built with `count = 40320 / n`
    /// (integer division) and the given `scale`.
    static func create(n: Int, scale: Int) -> Partition {
        let values = PartitionValues(count: 40320 / n, scale: scale)
        return Partition(values: values)
    }

    /// Forwards to `process()` on the stored values, mutating them in place.
    func process() {
        values.process()
    }
}
/// Runs the buffer benchmark: for each scale (5000, then 10000) and each task
/// count `n` (1, 2, 4, 8), creates `n` partitions, processes each one in its
/// own child task of a task group, and prints the elapsed time for the group.
func main() async {
    let scales: [Int] = [5000, 10000]
    let taskCounts: [Int] = [1, 2, 4, 8]

    for scale in scales {
        for n in taskCounts {
            // Partitions are created before the first timestamp is taken,
            // so their construction is not part of the measured interval.
            let partitions = (0 ..< n).map { _ in Partition.create(n: n, scale: scale) }

            let began: SuspendingClock.Instant = .now
            // The group does not return until every added child task has finished.
            await withTaskGroup(of: Void.self) { group in
                for partition in partitions {
                    group.addTask { partition.process() }
                }
            }
            let finished: SuspendingClock.Instant = .now

            print("n = \(n), elapsed time = \(finished - began)")
            print()
        }
        print("---")
    }
}

// Script entry point.
await main()
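By running the benchmark at 1x and 2x “scales”, we get an idea of how long the computation should “inherently” take: doubling the scale roughly doubles every timing, yet at a fixed scale each task takes about the same time to process its share no matter how small that share is. In other words, task-local throughput seems to fall (roughly linearly) with the number of other threads that are active.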
$ swiftc -O b.swift -o b && echo "build complete" && ./b
build complete
processed 201600000 elements in 0.149900215 seconds
n = 1, elapsed time = 0.207860123 seconds
processed 100800000 elements in 0.103600706 seconds
processed 100800000 elements in 0.110130889 seconds
n = 2, elapsed time = 0.171554005 seconds
processed 50400000 elements in 0.104014164 seconds
processed 50400000 elements in 0.104690326 seconds
processed 50400000 elements in 0.106976494 seconds
processed 50400000 elements in 0.111549674 seconds
n = 4, elapsed time = 0.174183825 seconds
processed 25200000 elements in 0.098610127 seconds
processed 25200000 elements in 0.098924595 seconds
processed 25200000 elements in 0.099103121 seconds
processed 25200000 elements in 0.099506595 seconds
processed 25200000 elements in 0.110996157 seconds
processed 25200000 elements in 0.111128111 seconds
processed 25200000 elements in 0.111447141 seconds
processed 25200000 elements in 0.111726082 seconds
n = 8, elapsed time = 0.171806161 seconds
processed 403200000 elements in 0.301231452 seconds
n = 1, elapsed time = 0.4195478 seconds
processed 201600000 elements in 0.209494795 seconds
processed 201600000 elements in 0.211085142 seconds
n = 2, elapsed time = 0.329079313 seconds
processed 100800000 elements in 0.211962049 seconds
processed 100800000 elements in 0.212068609 seconds
processed 100800000 elements in 0.211920862 seconds
processed 100800000 elements in 0.213087498 seconds
n = 4, elapsed time = 0.333433957 seconds
processed 50400000 elements in 0.198199067 seconds
processed 50400000 elements in 0.199327949 seconds
processed 50400000 elements in 0.200085101 seconds
processed 50400000 elements in 0.200602445 seconds
processed 50400000 elements in 0.218438551 seconds
processed 50400000 elements in 0.218666651 seconds
processed 50400000 elements in 0.219319901 seconds
processed 50400000 elements in 0.219773213 seconds
n = 8, elapsed time = 0.338723114 seconds
this doesn’t seem to be an artifact of OS-level CPU prioritization, because a similar test program that uses no internal Array buffer does benefit from the expected parallelism.
/// Buffer-free benchmark workload: an inner-loop length and a starting value.
struct Partition: Sendable {
    /// Number of iterations of the inner loop in `process()`.
    let count: Int
    /// Starting value of the accumulator in `process()`.
    let seed: Int

    /// Makes a partition with `count = scale * 40320 / n` (evaluated left to
    /// right with integer division) and a random seed drawn from `0 ..< 100`.
    static func create(n: Int, scale: Int) -> Partition {
        let iterations = scale * 40320 / n
        let seed = Int.random(in: 0 ..< 100)
        return Partition(count: iterations, seed: seed)
    }

    /// Starting from `seed`, applies `s = 2 &* s &+ 1` (wrapping arithmetic)
    /// `100 * count` times and returns the final value.
    @inline(never)
    func process() -> Int {
        var accumulator = seed
        for _ in 0 ..< 100 {
            for _ in 0 ..< count {
                accumulator = accumulator &* 2 &+ 1
            }
        }
        return accumulator
    }
}
/// Runs the buffer-free benchmark: for each scale (5000, then 10000) and each
/// task count `n` (1, 2, 4, 8), creates `n` partitions, runs `process()` on
/// each in its own child task, and prints the elapsed time for the group.
func main() async {
    for scale in [5000, 10000] {
        for n in [1, 2, 4, 8] {
            let workloads: [Partition] = (0 ..< n).map { _ in
                Partition.create(n: n, scale: scale)
            }

            let began: SuspendingClock.Instant = .now
            // The group does not return until every added child task has finished.
            await withTaskGroup(of: Void.self) { group in
                for workload in workloads {
                    group.addTask {
                        // The computed value is intentionally discarded.
                        _ = workload.process()
                    }
                }
            }
            let finished: SuspendingClock.Instant = .now

            print("n = \(n), elapsed time = \(finished - began)")
            print()
        }
        print("---")
    }
}

// Script entry point.
await main()
$ swiftc -O c.swift -o c && echo "build complete" && ./c
build complete
n = 1, elapsed time = 1.209820018 seconds
n = 2, elapsed time = 0.603347224 seconds
n = 4, elapsed time = 0.308958449 seconds
n = 8, elapsed time = 0.156760841 seconds
---
n = 1, elapsed time = 2.392856643 seconds
n = 2, elapsed time = 1.202244252 seconds
n = 4, elapsed time = 0.614643686 seconds
n = 8, elapsed time = 0.312989128 seconds
what could be happening here?