I am writing a Swift interface to the Apache Arrow glib C interface. This is a common columnar data format for large datasets in data science and machine learning. I am successfully serializing and de-serializing Swift arrays of String
, Int
, Bool
, Float
, and Double
, but the code is pretty gnarly. Lots of duplicated if
statements and force type casts. Here is an example that converts a Swift array of String
s or Double
s into a GArrowArray
:
protocol ArrowSupportedType: Equatable, CustomStringConvertible {
}
extension String: ArrowSupportedType {
}
extension Double: ArrowSupportedType {
}
func arrayToGArray<T: ArrowSupportedType>(values: [T]) throws -> UnsafeMutablePointer<GArrowArray>? {
let valuesType = type(of: values).Element.self
let arrayBuilder: UnsafeMutablePointer<GArrowArrayBuilder>?
if valuesType == Double.self {
arrayBuilder = GARROW_ARRAY_BUILDER(garrow_double_array_builder_new())
} else if valuesType == String.self {
arrayBuilder = GARROW_ARRAY_BUILDER(garrow_string_array_builder_new())
}
// additional if statements for additional types supported omitted for brevity
if let arrayBuilder = arrayBuilder {
var error: UnsafeMutablePointer<GError>?
var result: gboolean
#if canImport(Darwin)
let numValues: Int64 = Int64(values.count)
#else
let numValues: Int = values.count
#endif
if valuesType == Double.self {
var values = values as! [Double] // TODO: Is there any way to do this without force casting?
result = garrow_double_array_builder_append_values(GARROW_DOUBLE_ARRAY_BUILDER(arrayBuilder),
&values,
numValues,
[],
0,
&error)
} else if valuesType == String.self {
let values = values as! [String] // TODO: Is there any way to do this without force casting?
var cValues = values.map { UnsafePointer<Int8>(strdup($0)) }
result = garrow_string_array_builder_append_strings(GARROW_STRING_ARRAY_BUILDER(arrayBuilder),
&cValues,
numValues,
[],
0,
&error)
// This free is from here: https://stackoverflow.com/a/38275792/529743
// TODO: I don't understand when I need to free UnsafeMutablePointers and when I don't. Is there a
// closure that would obviate the need to manually call free?
for ptr in cValues { free(UnsafeMutablePointer(mutating: ptr)) }
}
// additional if statements for additional types supported omitted for brevity
// error handling etc omitted for brevity
The above works end-to-end, but I'd like to clean up the code. You'll notice the as!
force casts, as well as the long list of else if
's that must be present in every function I have that converts Swift<>Arrow types.
I have been following type erasure guides like this and this and this, but I haven't quite gotten my brain around it. I'm getting stuck on two facts that differ from my situation and their examples:
- My underlying types are not types I defined, like
Goat
orElectricCar
, they are built-in types likeString
andDouble
. - My underlying types are arrays of types. I need to act on sequences of
String
andDouble
rather than single objects, so perhaps we'd say this isAnySequence<String>
andAnySequence<Double>
.
I haven't successfully landed on a configuration of protocol
s, associatedtype
s, and struct
s that has compiled. It seems like this is what type erasure is for, but I'm missing it.
This is the closest I have gotten, but it does not compile:
protocol ArrowSequenceProtocol {
associatedtype ArrowType
var elements: [ArrowType] { get }
func toGArrowArray() throws -> UnsafeMutablePointer<GArrowArray>?
}
struct ArrowSequenceString {
typealias ArrowType = String
let elements: [ArrowType]
func toGArrowArray() throws -> UnsafeMutablePointer<GArrowArray>? {
var error: UnsafeMutablePointer<GError>?
var result: gboolean
let arrayBuilder: UnsafeMutablePointer<GArrowStringArrayBuilder>? = garrow_string_array_builder_new()
#if canImport(Darwin)
let numValues: Int64 = Int64(self.elements.count)
#else
let numValues: Int = self.elements.count
#endif
var cValues = self.elements.map { UnsafePointer<Int8>(strdup($0)) }
result = garrow_string_array_builder_append_strings(arrayBuilder,
&cValues,
numValues,
[],
0,
&error)
return try completeGArrayBuilding(result: result, error: error, arrayBuilder: GARROW_ARRAY_BUILDER(arrayBuilder))
}
}
struct ArrowSequenceDouble {
typealias ArrowType = Double
var elements: [ArrowType]
func toGArrowArray() throws -> UnsafeMutablePointer<GArrowArray>? {
var error: UnsafeMutablePointer<GError>?
var result: gboolean
let arrayBuilder: UnsafeMutablePointer<GArrowDoubleArrayBuilder>? = garrow_double_array_builder_new()
#if canImport(Darwin)
let numValues: Int64 = Int64(self.elements.count)
#else
let numValues: Int = self.elements.count
#endif
var elements = self.elements
result = garrow_double_array_builder_append_values(arrayBuilder,
&elements,
numValues,
[],
0,
&error)
return try completeGArrayBuilding(result: result, error: error, arrayBuilder: GARROW_ARRAY_BUILDER(arrayBuilder))
}
}
struct AnyArrowSequence<ArrowType>: ArrowSequenceProtocol {
private let _toGArrowArray: () throws -> UnsafeMutablePointer<GArrowArray>?
var elements: [ArrowType]
init<T: ArrowSequenceProtocol>(_ wrapped: T) where T.ArrowType == ArrowType {
_toGArrowArray = wrapped.toGArrowArray
self.elements = wrapped.elements
}
func toGArrowArray() throws -> UnsafeMutablePointer<GArrowArray>? {
return try _toGArrowArray()
}
}
func completeGArrayBuilding(result: gboolean,
error: UnsafeMutablePointer<GError>?,
arrayBuilder: UnsafeMutablePointer<GArrowArrayBuilder>) throws ->
UnsafeMutablePointer<GArrowArray>? {
// left out for brevity, this part works fine
}
protocol ArrowSupportedType: Equatable, CustomStringConvertible {
}
extension String: ArrowSupportedType {
}
extension Double: ArrowSupportedType {
}
// FIXME: This doesn't compile. The leap between the generic type T which can be a String, Bool,
// Int, Float, or Double to one of the ArrowSequence structs is the issue.
func arrayToGArray<T: ArrowSupportedType>(values: [T]) throws -> UnsafeMutablePointer<GArrowArray>? {
let valuesType = type(of: values).Element.self
if valuesType == Double.self {
let wrappedValues = ArrowSequenceDouble(elements: values)
return try wrappedValues.toGArrowArray()
} else if valuesType == String.self {
let wrappedValues = ArrowSequenceString(elements: values)
return try wrappedValues.toGArrowArray()
} else {
throw ArrowError.unsupportedDataType("Got array with type \(valuesType), which is not supported")
}
}
This leaves me stuck at:
/Users/xander/dev/SwiftArrow/Sources/Arrow/SwiftToGArrow.swift:8:59: error: cannot convert value of type '[T]' to expected argument type '[ArrowSequenceDouble.ArrowType]' (aka 'Array<Double>')
let wrappedValues = ArrowSequenceDouble(elements: values)
^
/Users/xander/dev/SwiftArrow/Sources/Arrow/SwiftToGArrow.swift:8:59: note: arguments to generic parameter 'Element' ('T' and 'ArrowSequenceDouble.ArrowType' (aka 'Double')) are expected to be equal
let wrappedValues = ArrowSequenceDouble(elements: values)
^
/Users/xander/dev/SwiftArrow/Sources/Arrow/SwiftToGArrow.swift:11:59: error: cannot convert value of type '[T]' to expected argument type '[ArrowSequenceString.ArrowType]' (aka 'Array<String>')
let wrappedValues = ArrowSequenceString(elements: values)
^
/Users/xander/dev/SwiftArrow/Sources/Arrow/SwiftToGArrow.swift:11:59: note: arguments to generic parameter 'Element' ('T' and 'ArrowSequenceString.ArrowType' (aka 'String')) are expected to be equal
let wrappedValues = ArrowSequenceString(elements: values)
^
Any pointers on the Swifty-est way to manage these types would be appreciated. Avoiding performance pitfalls is also worth keeping in mind, this library is meant to serialize/de-serialize terabytes of data to/from disk.