Partitioning work across parallel tasks doesn’t seem to benefit from parallelism

The following test program does not seem to benefit from `TaskGroup` parallelism at all (beyond a marginal improvement when going from n = 1 to n = 2 tasks), despite running on an 8-core machine.

Although I have marked `Partition` as `@unchecked Sendable` in this example, each partition is assigned to, and mutated by, exactly one child task.

/// A flat buffer of integers that is transformed in place by `process()`.
struct PartitionValues
{
    /// Backing storage; reachable only through this type's own members.
    private
    var buffer:[Int]

    /// Wraps an already-built buffer.
    private
    init(buffer:[Int])
    {
        self.buffer = buffer
    }

    /// Builds a buffer of `count * scale` integers.
    ///
    /// One run of `scale` random values, each drawn from `0 ..< 100`, is
    /// generated once and then laid out `count` times back to back.
    ///
    /// - Parameters:
    ///   - count: How many copies of the random run to concatenate.
    ///   - scale: Length of the random run.
    init(count:Int, scale:Int)
    {
        let template:[Int] = (0 ..< scale).map { _ in .random(in: 0 ..< 100) }
        let tiled:FlattenSequence<Repeated<[Int]>> = repeatElement(template, count: count).joined()
        self.init(buffer: Array(tiled))
    }

    /// Replaces every element `x` with `2 * x + 1`, then prints how many
    /// elements were visited and how long the loop took.
    ///
    /// The clock is read immediately before and after the loop, so the
    /// reported duration excludes the `print` call itself.
    mutating
    func process()
    {
        let began:SuspendingClock.Instant = .now
        for index:Int in self.buffer.indices
        {
            self.buffer[index] = 2 * self.buffer[index] + 1
        }
        let finished:SuspendingClock.Instant = .now

        let elapsed:Duration = finished - began
        print("processed \(self.buffer.count) elements in \(elapsed)")
    }
}

/// Reference-type box around one `PartitionValues`, so the values can be
/// mutated in place through a shared reference.
///
/// The conformance is `@unchecked Sendable`: the compiler does not verify
/// thread-safety, and nothing in this class synchronizes access to `values`.
/// Callers are responsible for ensuring an instance is only touched by one
/// task at a time.
final
class Partition:@unchecked Sendable
{
    /// The boxed values; readable from outside, writable only from within.
    private(set)
    var values:PartitionValues

    private
    init(values:PartitionValues)
    {
        self.values = values
    }

    /// Creates one partition sized for an `n`-way split.
    ///
    /// The partition holds `40320 / n` copies of a `scale`-element random run.
    /// 40320 (= 8!) divides evenly by every `n` from 1 through 8.
    ///
    /// - Parameters:
    ///   - n: Number of partitions the total workload is split into.
    ///   - scale: Length of the random run repeated inside the partition.
    static func create(n:Int, scale:Int) -> Partition
    {
        let copies:Int = 40320 / n
        let values:PartitionValues = .init(count: copies, scale: scale)
        return Partition.init(values: values)
    }

    /// Runs the in-place transformation on the boxed values.
    func process()
    {
        self.values.process()
    }
}

/// Benchmark driver for the buffer-backed `Partition`.
///
/// For each scale, the workload is split into 1, 2, 4 and 8 partitions. Every
/// partition is processed by its own child task inside a task group, and the
/// time from just before the group starts until it returns is printed.
/// Partition construction happens before the clock is read, so it is not
/// part of the reported time.
func main() async
{
    let scales:[Int] = [5000, 10000]
    let splits:[Int] = [1, 2, 4, 8]

    for scale:Int in scales
    {
        for n:Int in splits
        {
            let partitions:[Partition] = (0 ..< n).map
            {
                _ in Partition.create(n: n, scale: scale)
            }

            let began:SuspendingClock.Instant = .now
            await withTaskGroup(of: Void.self)
            {
                group in

                // One child task per partition.
                for partition:Partition in partitions
                {
                    group.addTask { partition.process() }
                }
            }
            let finished:SuspendingClock.Instant = .now

            let elapsed:Duration = finished - began
            print("n = \(n), elapsed time = \(elapsed)")
            print()
        }

        print("---")
    }
}

// Script entry point: run the whole benchmark and wait for it to finish.
await main()

By running the benchmark at 1x and 2x “scales”, we get an idea of how long the computation should “inherently” take. We observe that the throughput of each individual task appears to drop roughly linearly with the number of other tasks running at the same time.

$ swiftc -O b.swift -o b && echo "build complete" && ./b
build complete
processed 201600000 elements in 0.149900215 seconds
n = 1, elapsed time = 0.207860123 seconds

processed 100800000 elements in 0.103600706 seconds
processed 100800000 elements in 0.110130889 seconds
n = 2, elapsed time = 0.171554005 seconds

processed 50400000 elements in 0.104014164 seconds
processed 50400000 elements in 0.104690326 seconds
processed 50400000 elements in 0.106976494 seconds
processed 50400000 elements in 0.111549674 seconds
n = 4, elapsed time = 0.174183825 seconds

processed 25200000 elements in 0.098610127 seconds
processed 25200000 elements in 0.098924595 seconds
processed 25200000 elements in 0.099103121 seconds
processed 25200000 elements in 0.099506595 seconds
processed 25200000 elements in 0.110996157 seconds
processed 25200000 elements in 0.111128111 seconds
processed 25200000 elements in 0.111447141 seconds
processed 25200000 elements in 0.111726082 seconds
n = 8, elapsed time = 0.171806161 seconds
processed 403200000 elements in 0.301231452 seconds
n = 1, elapsed time = 0.4195478 seconds

processed 201600000 elements in 0.209494795 seconds
processed 201600000 elements in 0.211085142 seconds
n = 2, elapsed time = 0.329079313 seconds

processed 100800000 elements in 0.211962049 seconds
processed 100800000 elements in 0.212068609 seconds
processed 100800000 elements in 0.211920862 seconds
processed 100800000 elements in 0.213087498 seconds
n = 4, elapsed time = 0.333433957 seconds

processed 50400000 elements in 0.198199067 seconds
processed 50400000 elements in 0.199327949 seconds
processed 50400000 elements in 0.200085101 seconds
processed 50400000 elements in 0.200602445 seconds
processed 50400000 elements in 0.218438551 seconds
processed 50400000 elements in 0.218666651 seconds
processed 50400000 elements in 0.219319901 seconds
processed 50400000 elements in 0.219773213 seconds
n = 8, elapsed time = 0.338723114 seconds

This doesn’t seem to be an artifact of OS-level CPU prioritization, because a similar test program that uses no internal `Array` buffer does scale with the number of tasks as expected.

/// A buffer-free workload: just an iteration count and a starting value.
struct Partition:Sendable
{
    /// Length of the inner loop in `process()`.
    let count:Int
    /// Starting value of the accumulator in `process()`.
    let seed:Int

    /// Creates one partition sized for an `n`-way split.
    ///
    /// - Parameters:
    ///   - n: Number of partitions the total workload is split into.
    ///   - scale: Multiplier applied to the base iteration count of 40320.
    /// - Returns: A partition with `count == scale * 40320 / n` and a seed
    ///   drawn at random from `0 ..< 100`.
    static func create(n:Int, scale:Int) -> Partition
    {
        let iterations:Int = scale * 40320 / n
        let seed:Int = .random(in: 0 ..< 100)
        return Partition.init(count: iterations, seed: seed)
    }

    /// Applies `s = 2 * s + 1` (with wrapping arithmetic) `100 * count` times,
    /// starting from `seed`, and returns the final value.
    @inline(never)
    func process() -> Int
    {
        let passes:Range<Int> = 0 ..< 100
        let steps:Range<Int> = 0 ..< self.count

        var accumulator:Int = self.seed
        for _:Int in passes
        {
            for _:Int in steps
            {
                // `&*` / `&+` wrap on overflow instead of trapping.
                accumulator = accumulator &* 2 &+ 1
            }
        }
        return accumulator
    }
}

/// Benchmark driver for the buffer-free `Partition`.
///
/// For each scale, the workload is split into 1, 2, 4 and 8 partitions. Every
/// partition is processed by its own child task inside a task group, and the
/// time from just before the group starts until it returns is printed.
/// Partition construction happens before the clock is read, so it is not
/// part of the reported time.
func main() async
{
    let scales:[Int] = [5000, 10000]
    let splits:[Int] = [1, 2, 4, 8]

    for scale:Int in scales
    {
        for n:Int in splits
        {
            let partitions:[Partition] = (0 ..< n).map
            {
                _ in Partition.create(n: n, scale: scale)
            }

            let began:SuspendingClock.Instant = .now
            await withTaskGroup(of: Void.self)
            {
                group in

                // One child task per partition; the computed value is discarded.
                for partition:Partition in partitions
                {
                    group.addTask { _ = partition.process() }
                }
            }
            let finished:SuspendingClock.Instant = .now

            let elapsed:Duration = finished - began
            print("n = \(n), elapsed time = \(elapsed)")
            print()
        }

        print("---")
    }
}

// Script entry point: run the whole benchmark and wait for it to finish.
await main()
$ swiftc -O c.swift -o c && echo "build complete" && ./c
build complete
n = 1, elapsed time = 1.209820018 seconds

n = 2, elapsed time = 0.603347224 seconds

n = 4, elapsed time = 0.308958449 seconds

n = 8, elapsed time = 0.156760841 seconds

---
n = 1, elapsed time = 2.392856643 seconds

n = 2, elapsed time = 1.202244252 seconds

n = 4, elapsed time = 0.614643686 seconds

n = 8, elapsed time = 0.312989128 seconds

what could be happening here?

4 Likes