Find multiple substrings from specified index

Hello forum,

Does swift have String method like this:

syntax:
firstIndex(of: String, at: String.Index) -> String.Index?

usage:
mystring.firstIndex(of: "hello", at: 5)

It allows programmers to find substrings at index.
For example, Ive got text log with some corrupted records like this:
tag xxx tag yyy ta

the first "tag" should be replaced by "tag1", and the second by "tag2" like this:
tag1 xxx tag2 yyy ta

The following is what I will do in C++. How can I do it in Swift?

Thanks

#include <iostream>
#include <string>
using namespace std;

void tag_correct(string &str, string &sub)
{
    string::size_type pos = str.find(sub);
    if (pos != string::npos)
    {
        str.replace(pos, sub.size(), "tag1");
    }

    pos = str.find(sub, pos + 1);
    if (pos != string::npos)
    {
        str.replace(pos, sub.size(), "tag2");
    }
}

int main()
{
    string str = " tag xxx tag yyy ta";
    string sub = "tag";

    cout << "old: " << str << endl;
    tag_correct(str, sub);
    cout << "new: " << str << endl;
}

If you import Foundation, you can use range(of:options:range:locale:), and most of the arguments have reasonable defaults. Thus:

import Foundation

extension String {
    func withFixedTags() -> String {
        guard var found = range(of: "tag") else { return self }
        var answer = String(self[..<found.lowerBound])
        answer += "tag1"

        if let found1 = range(of: "tag", range: found.upperBound ..< endIndex) {
            answer += self[found.upperBound ..< found1.lowerBound]
            answer += "tag2"
            found = found1
        }

        answer += self[found.upperBound...]
        return answer
    }
}

" tag xxx tag yyy ta".withFixedTags()
// Answer: " tag1 xxx tag2 yyy ta"
1 Like

To implement the exact method you asked for, you would do this:

import Foundation
extension String {
    func firstIndex(of: String, at: String.Index) -> String.Index? {
        return self[at...].range(of: of)?.lowerBound
    }
}

But there are many pitfalls in what you want to do with it, because String indices are not integer offsets. This would be a better (and generalized) algorithm to accomplish the same overall task:

extension RangeReplaceableCollection where Element : Equatable {

    func replacingInstanecs<SearchTerm, ReplacementGenerator>(
        of searchTerm: SearchTerm,
        with replacements: ReplacementGenerator) -> Self
        where SearchTerm : Collection, SearchTerm.Element == Self.Element,
        ReplacementGenerator : Sequence, ReplacementGenerator.Element : Sequence,
        ReplacementGenerator.Element.Element == Self.Element {
            
            let searchTermSize = searchTerm.count

            var replacementIterator = replacements.makeIterator()
            var result = Self()

            var cursor = startIndex
            while let found = indices[cursor...].first(where: { searchIndex in
                return self[searchIndex...].prefix(searchTermSize).elementsEqual(searchTerm)
            }) {
                defer { cursor = index(found, offsetBy: searchTermSize) }

                result.append(contentsOf: self[cursor ..< found])
                guard let nextReplacement = replacementIterator.next() else {
                    fatalError("Not enough replacements.")
                }
                result.append(contentsOf: nextReplacement)
            }

            result.append(contentsOf: self[cursor...])
            return result
    }
}

let string = " tag xxx tag yyy ta"
let replacements = sequence(first: 1, next: { $0 + 1 }).lazy.map({ "tag\($0)" })
let transformed = string.replacingInstanecs(of: "tag", with: replacements)
2 Likes

Also note that the range(of:) variants come from Foundation’s NSString and operate on a UTF‐16 level. They do not respect the definition of Character that you are used to from String and may return only pieces of a Character.

let string = "e\u{301}" // e + ◌́
let search = "́" // ◌́

// The UTF‐16 definition using Foundation’s `range(of:)`
// as implemented above.
let utf16 = string.firstIndex(of: search, at: string.startIndex)
// Results in position “1”.

// A `Character` based, built‐in Swift method:
// (Technically generic over any Collection.)
let character = string.firstIndex(of: search)
// Results in “nil”.

Thanks Jeremy

I add a check in your nice extension for it reports error of upperBound < lowerBound for some inputs.

import Foundation
extension String {
    func firstIndex(of: String, at: String.Index) -> String.Index? {
        if at <= endIndex {
            return self[at...].range(of: of)?.lowerBound
        } else {
            return nil
        }
    }
}

var string = ", hello"
let search = "hello"

//string = "hello tom hello jerry"
//search = "hello"

var start = string.startIndex
var end = string.endIndex

print("old: \(string)")

var index = string.firstIndex(of: search, at: string.startIndex)
if index != nil {
    start = index!
    end = string.index(index!, offsetBy: search.count)
    string.replaceSubrange(start..<end, with: "hi")
}

index = string.firstIndex(of: search, at: end)
if index != nil {
    start = index!
    end = string.index(index!, offsetBy: search.count)
    string.replaceSubrange(start..<end, with: "hi again")
}

print("new: \(string)")

Thanks Rob

I come up with something like this

func foo (haystack: inout String, needle: String){
    guard let range = haystack[haystack.startIndex...].range(of: needle) else {return}
    haystack.replaceSubrange(range, with: "tag1")

    guard let range2 = haystack[range.upperBound...].range(of: needle) else {return}
    haystack.replaceSubrange(range2, with: "tag2")
}

var haystack = " tag xxx tag yyy ta.."
let needle = "tag"

print("old: \(haystack)")
foo(haystack: &haystack, needle: needle)
print("new: \(haystack)")

/*
 old:  tag xxx tag yyy ta..
 new:  tag1 xxx tag2 yyy ta..
 */

I believe haystack[range.upperBound...] is invalid after you have mutated haystack (because mutating a collection invalidates existing indexes unless documented otherwise). That’s why I didn’t write it the way you did.

2 Likes

The replacement part sounds like something that could be implemented efficiently with Collection Diffing. If only I know how it works :thinking:.

It would be wise to get in the habit of writing those sorts of constructions with if let:

if let start = index {
    // [...]
}

Not only is it more concise, but as soon as concurrency enters the picture, other parts of the program can change the value in between the two lines:

if x.y.z != nil {
    // What if some concurrent part of the program modifies x, y, or z?
    let a = x.y.z! // ...crash...
    // [...]
}

@mayoff is correct, this is invalid:

guard let range = haystack/*[...]*/.range(of: /*[*/"tag"/*]*/) // [...]
haystack.replaceSubrange(range, with: "tag1")

guard let range2 = haystack[range.upperBound...] // [...]
  • The first line gets a range (start and end indices) that describe the original state of haystack.
  • The second line modifies haystack behind the backs of those indices.
  • The third line then applies the upperBound index to a mismatched string. If it doesn’t crash, you are just lucky.

The problem can be demonstrated with a simple modification to the parameters:

var haystack = "long search term..."
let needle = "long search term"

foo(haystack: &haystack, needle: needle) // error: EXC_BAD_INSTRUCTION

In that case it happens because the range is equivalent to 0–16, the mutation shortens the string from 19 characters down to 7 (“tag1...”) and then the upper bound of 16 is out of bounds when you try to use it.

It may be tempting to just try to compute the difference in length, but that is not actually predictable, since characters can combine ("e" + "́" is 1 + 1 = 1). The only safe way to do replacement is to build a separate String instance by consecutively appending fragments, all without touching the original String, so that its indices remain valid. Then, when the new string is complete you can replace the old with it in a single overwrite.

That is why the extension I provided above does it the way it does; there are no safe shortcuts. (But you could simplify it to a concrete method for String if you want, you do not necessarily need the generics that allow it to also work with Arrays and other Collections.)

1 Like

Collection diffing is not helpful here. It enables the opposite:

let start = " tag xxx tag yyy ta.."
let end = " tag1 xxx tag2 yyy ta.."
// Question: How did I get from `start` to `end`?

// Answer:
let difference = end.difference(from: start)
print(difference)
// 1. Keep characters 0–4 (“ tag”),
// 2. Insert “1”,
// 3. Keep characters 4–12 (“ xxx tag”),
// 4. Insert “2”,
// 5. Keep characters 12–21 (“ yyy ta..”).

// ...which describes approximately what our algorithm did.
// (Though our algorithm also needlessly replaced “tag” with “tag”.)

You can diff the tags, then the shift the changes for each of the locations you find, then apply them at the end.

Oh, I see what you mean now: in order to (possibly) reduce the amount affected by each individual replacement. It would be an interesting optimization to try out if the code were very performance critical, but it would probably be slower for some uses just as it would be faster for others. In any case it is not really necessary to get the job done, and is far more complicated than the code the original poster is trying to translate from C. So I hope we are not causing confusion.

Not natively in the stdlib, but it is high on the list of String Essentials.

2 Likes

Hi Jeremy,

how about this one? another try of mine:)

extension String {
    //return the index of the substring start at the specified position
    func firstIndex(of: String, at: Int) -> Int? {
        let array = Array(self)
        var start = at
        
        if at < 0 {
            return nil
        }
        
        while start + of.count - 1 < array.endIndex {
            if String(array[start..<start+of.count]) == of {
                return start
            }
            start += 1
        }
        return nil
    }
}

//find the second "disk" and place "samsung" before it
var diskinfo = "hhd disk ssd disk xxx disk..."
var start = diskinfo.firstIndex(of: "disk", at: 10)
var index = diskinfo.index(diskinfo.startIndex, offsetBy: start!)

print("old: \(diskinfo)")
diskinfo.insert(contentsOf: "samsung ", at: index)
print("new: \(diskinfo)")

//old: hhd disk ssd disk xxx disk...
//new: hhd disk ssd samsung disk xxx disk...

:+1: Yes, that will do the job accurately and safely. Nice work.

(It is not as fast as it could be. It doesn’t seem like that matters to you at this point, which is perfectly fine. If in the future you find you wish it were faster, you can always come back and ask.)

There is nothing String-specific for this task, so it can be (over-engineered and) generalized:

extension Collection {

    /// Returns the first index where the given collection exists as a
    /// subsequence of this collection, using the given predicate to compare
    /// elements for equivalence.
    public func firstIndex<C: Collection>(of target: C, by areEquivalent: (Element, Element) throws -> Bool) rethrows -> Index? where C.Element == Element {
        guard let firstTarget = target.first else { return startIndex }

        let targetTail = target.dropFirst()
        var tail = self[...]
        while let prospect = try tail.firstIndex(where: { try areEquivalent($0, firstTarget) }) {
            let nextTail = tail.dropFirst()
            if try nextTail.starts(with: targetTail, by: areEquivalent) {
                return prospect
            } else {
                tail = nextTail
            }
        }
        return nil
    }

}

extension Collection where Element: Equatable {

    /// Returns the first index where that trailing subsequence starts with the
    /// given collection.
    public func firstIndex<C: Collection>(of target: C) -> Index? where C.Element == Element {
        return firstIndex(of: target, by: ==)
    }

}

//find the second "disk" and place "samsung" before it
let disk = "disk"
let diskCount = "disk".count
var diskinfo = "hhd disk ssd disk xxx disk..."
print("old: \(diskinfo)")
if let firstDisk = diskinfo.firstIndex(of: disk) {
    let firstDiskEnd = diskinfo.index(firstDisk, offsetBy: diskCount)
    if let secondDisk = diskinfo[firstDiskEnd...].firstIndex(of: disk) {
        diskinfo.insert(contentsOf: "samsung ", at: secondDisk)
    }
}
print("new: \(diskinfo)")

//old: hhd disk ssd disk xxx disk...
//new: hhd disk ssd samsung disk xxx disk...

Note that I didn't not tack on a "at: Int" parameter; the method finds the strictly first match. If you want to search after a prefix, the convention is to find the index past the prefix and work with the suffix sub-sequence.

There are optimizations to be made if a prefix of your needle string appears as a suffix of the same string, but there's a whole branch of Computer Science related to that.

2 Likes

Thanks Daryle,

It works and It is great code.