Lazy versions of Sequence.split


(Daryle Walker) #1

This is a continuation of making a lazy version of Collection.split.

I started with an Iterator that wraps another, and takes in the other parameters of split, then vends each non-separator element paired with an index for the sub-sequence it would go into. There are also marker values for empty sub-sequences (still with the index).

enum LazySplitElement<Element> {
    case empty(rank: Int)
    case element(rank: Int, value: Element)

    var rank: Int {
        switch self {
        case .empty(let r):
            return r
        case .element(let r, _):
            return r
        }
    }
    var value: Element? {
        switch self {
        case .empty:
            return nil
        case .element(_, let v):
            return v
        }
    }
}

enum LazySplitCoreProgress<Element> {
    case untraversed
    case running(last: Element)
    case finished
}

struct LazySplitIndicatingIterator<Base: IteratorProtocol> {
    var base: Base
    var splitCounter: Int
    let omitEmptySubsequences: Bool
    let isSeparator: (Base.Element) -> Bool
    var progress: LazySplitCoreProgress<Base.Element> = .untraversed

    init(_ base: Base, maxSplits: Int = 9, omittingEmptySubsequences: Bool = true, whereSeparator isSeparator: @escaping (Base.Element) -> Bool) {
        precondition(maxSplits >= 0)

        self.base = base
        splitCounter = maxSplits
        omitEmptySubsequences = omittingEmptySubsequences
        self.isSeparator = isSeparator
    }
}

extension LazySplitIndicatingIterator: IteratorProtocol {

    mutating func next() -> LazySplitElement<Base.Element>? {
        while splitCounter > 0 {
            guard let latest = base.next() else {
                defer { progress = .finished }

                switch (omitEmptySubsequences, progress) {
                case (false, .running(let last)) where isSeparator(last):
                    fallthrough
                case (false, .untraversed):
                    defer { splitCounter -= 1 }
                    return .empty(rank: splitCounter)
                default:
                    return nil
                }
            }
            defer { progress = .running(last: latest) }

            switch progress {
            case .untraversed:
                if isSeparator(latest) {
                    if omitEmptySubsequences {
                        continue
                    } else {
                        defer { splitCounter -= 1 }
                        return .empty(rank: splitCounter)
                    }
                } else {
                    return .element(rank: splitCounter, value: latest)
                }
            case .running(let last):
                switch (omitEmptySubsequences, isSeparator(last), isSeparator(latest)) {
                case (_, _, false):
                    return .element(rank: splitCounter, value: latest)
                case (_, false, true):
                    splitCounter -= 1
                    fallthrough
                case (true, true, true):
                    continue
                case (false, true, true):
                    defer { splitCounter -= 1 }
                    return .empty(rank: splitCounter)
                }
            case .finished:
                preconditionFailure("A wrapping iterator should not be finished before its base")
            }
        }
        return base.next().map { .element(rank: splitCounter, value: $0) }
    }

}

Next was an iterator, and wrapping sequence, that lazily visited each split sub-sequence, but generated each sub-sequence eagerly as an Array.

public struct SlightlyLazySplitIterator<Base: IteratorProtocol> {
    var indicators: LazySplitIndicatingIterator<Base>
    var previous: LazySplitElement<Base.Element>? = nil

    init(_ base: Base, maxSplits: Int = Int.max, omittingEmptySubsequences: Bool = true, whereSeparator isSeparator: @escaping (Base.Element) -> Bool) {
        // For some reason this couldn't work without either "<Base>" or ".init" after the type name.
        indicators = LazySplitIndicatingIterator<Base>(base, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences, whereSeparator: isSeparator)
    }
}

extension SlightlyLazySplitIterator: IteratorProtocol {
    mutating public func next() -> [Base.Element]? {
        if previous == nil {
            previous = indicators.next()
        }
        guard let last = previous else { return nil }
        previous = nil

        var elements = [last]
        let elementRank = last.rank
        while let latest = indicators.next() {
            if elementRank == latest.rank {
                elements.append(latest)
            } else {
                previous = latest
                break
            }
        }
        switch elements.first! {
        case .empty:
            precondition(elements.count == 1)
            return []
        case .element:
            return elements.map { $0.value! }
        }
    }
}


public struct SlightlyLazySplitSequence<Base: Sequence> {
    var base: Base
    let maxSplitCount: Int
    let omitEmptySubsequences: Bool
    let isSeparator: (Base.Element) -> Bool

    init(_ base: Base, maxSplits: Int = Int.max, omittingEmptySubsequences: Bool = true, whereSeparator isSeparator: @escaping (Base.Element) -> Bool) {
        precondition(maxSplits >= 0)

        self.base = base
        maxSplitCount = maxSplits
        omitEmptySubsequences = omittingEmptySubsequences
        self.isSeparator = isSeparator
    }
}

extension SlightlyLazySplitSequence: Sequence {
    public func makeIterator() -> SlightlyLazySplitIterator<Base.Iterator> {
        return SlightlyLazySplitIterator(base.makeIterator(), maxSplits: maxSplitCount, omittingEmptySubsequences: omitEmptySubsequences, whereSeparator: isSeparator)
    }
}

extension SlightlyLazySplitSequence: LazySequenceProtocol {}

extension LazySequenceProtocol {
    public func ssplit(maxSplits: Int = Int.max, omittingEmptySubsequences: Bool = true, whereSeparator matches: @escaping (Element) -> Bool) -> SlightlyLazySplitSequence<Elements> {
        return SlightlyLazySplitSequence(elements, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences, whereSeparator: matches)
    }
}

extension LazySequenceProtocol where Element: Equatable {
    public func ssplit(separator: Element, maxSplits: Int = Int.max, omittingEmptySubsequences: Bool = true) -> SlightlyLazySplitSequence<Elements> {
        return ssplit(maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences, whereSeparator: { $0 == separator })
    }
}

Can anyone explain why the compiler can't implicitly figure out the correct version of LazySplitIndicatingIterator in the init of SlightlyLazySplitIterator?

This version of lazy-split works fine, although taking chunks that are gigabytes long is a risk with eager implementations. So this makes a fully-lazy version useful:

public struct ReallyLazySplitSubsequencesIterator<Base: IteratorProtocol> {
    final public class Element {
        let rank: Int
        unowned let source: Core

        init(rank: Int, core: Core) {
            self.rank = rank
            source = core
        }
    }
    final class Core {
        var indicators: LazySplitIndicatingIterator<Base>
        var currentSubsequence: Element?
        var onDeckElement: LazySplitCoreProgress<LazySplitElement<Base.Element>>

        init(_ base: Base, maxSplits: Int, omittingEmptySubsequences: Bool, whereSeparator isSeparator: @escaping (Base.Element) -> Bool) {
            indicators = LazySplitIndicatingIterator(base, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences, whereSeparator: isSeparator)
            currentSubsequence = nil
            onDeckElement = .untraversed
        }
    }

    let core: Core

    init(_ base: Base, maxSplits: Int, omittingEmptySubsequences: Bool, whereSeparator isSeparator: @escaping (Base.Element) -> Bool) {
        core = Core(base, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences, whereSeparator: isSeparator)
    }
}

extension ReallyLazySplitSubsequencesIterator: IteratorProtocol {
    mutating public func next() -> Element? {
        switch core.onDeckElement {
        case .untraversed:
            precondition(core.currentSubsequence == nil)
            if let first = core.indicators.next() {
                core.onDeckElement = .running(last: first)
                core.currentSubsequence = Element(rank: first.rank, core: core)
            } else {
                core.onDeckElement = .finished
            }
        case .running(let current):
            if core.currentSubsequence!.rank == current.rank {
                while case .some = core.currentSubsequence!.next() {
                    // Nothing; however, the calls to core.currentSubsequence!.next() change core.onDeckElement.
                }
            }
            switch core.onDeckElement {
            case .untraversed:
                preconditionFailure("This case should never be re-reached")
            case .running(let upcoming):
                precondition(core.currentSubsequence!.rank == upcoming.rank + 1)

                core.currentSubsequence = Element(rank: upcoming.rank, core: core)
            case .finished:
                core.currentSubsequence = nil
            }
        case .finished:
            core.currentSubsequence = nil
        }
        return core.currentSubsequence
    }
}

extension ReallyLazySplitSubsequencesIterator.Element: Sequence, IteratorProtocol {
    public func next() -> Base.Element? {
        switch source.onDeckElement {
        case .untraversed:
            preconditionFailure("source should have passed this state before this subsequence was vended")
        case .running(let upcoming) where upcoming.rank == rank:
            precondition(source.currentSubsequence === self)
            defer {
                if let nextElement = source.indicators.next() {
                    source.onDeckElement = .running(last: nextElement)
                } else {
                    source.onDeckElement = .finished
                }
            }
            return upcoming.value
        case .finished, .running:
            // For .running, precondition(upcoming.rank < rank), but we're not bothering to check.
            return nil
        }
    }
}

extension ReallyLazySplitSubsequencesIterator.Element: LazySequenceProtocol {}

public struct ReallyLazySplitSequence<Base: Sequence> {
    var base: Base
    let maxSplitCount: Int
    let omitEmptySubsequences: Bool
    let isSeparator: (Base.Element) -> Bool

    init(_ base: Base, maxSplits: Int = Int.max, omittingEmptySubsequences: Bool = true, whereSeparator isSeparator: @escaping (Base.Element) -> Bool) {
        precondition(maxSplits >= 0)

        self.base = base
        maxSplitCount = maxSplits
        omitEmptySubsequences = omittingEmptySubsequences
        self.isSeparator = isSeparator
    }
}

extension ReallyLazySplitSequence: Sequence {
    public func makeIterator() -> ReallyLazySplitSubsequencesIterator<Base.Iterator> {
        return ReallyLazySplitSubsequencesIterator(base.makeIterator(), maxSplits: maxSplitCount, omittingEmptySubsequences: omitEmptySubsequences, whereSeparator: isSeparator)
    }
}

extension ReallyLazySplitSequence: LazySequenceProtocol {}

extension LazySequenceProtocol {
    public func hsplit(maxSplits: Int = Int.max, omittingEmptySubsequences: Bool = true, whereSeparator matches: @escaping (Element) -> Bool) -> ReallyLazySplitSequence<Elements> {
        return ReallyLazySplitSequence(elements, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences, whereSeparator: matches)
    }
}

extension LazySequenceProtocol where Element: Equatable {
    public func hsplit(separator: Element, maxSplits: Int = Int.max, omittingEmptySubsequences: Bool = true) -> ReallyLazySplitSequence<Elements> {
        return hsplit(maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences, whereSeparator: { $0 == separator })
    }
}

This version may need some work. Whenever another sub-sequence needs to be accessed, the previous one needs to be invalidated. For instance:

Array(line.lazy.hsplit(separator: spaceChar, omittingEmptySubsequences: false))

creates an Array of the lazy-proxy sub-sequence type, i.e. an array of dead references! This is proven when

Array(line.lazy.hsplit(separator: spaceChar, omittingEmptySubsequences: false)).map { String($0) }

crashes. However, if I trigger a conversion while doing the lazy traversal:

Array(line.lazy.hsplit(separator: spaceChar, omittingEmptySubsequences: false).map { String($0) })

I get full objects instead of dangling references.

How are lazy sequences of lazy sequences supposed to work; do I have to rework the design to force eagerness in some uses, or should we tell users not to do that?


A lazy version of Collection.split