Serialization in Swift

Loooop · February 12, 2023, 6:05pm

Hi Tera,
I more or less fine-tuned the identity of value types by defining this protocol in my library:

/// A type whose values can report an optional, hashable identity (`gID`).
///
/// NOTE(review): presumably a `nil` identity means "treat this value as
/// having no identity" — confirm against the library that consumes `gID`.
public protocol GIdentifiable<GID> {
	/// The hashable type used to represent the identity.
	associatedtype GID: Hashable

	/// The identity of this value, if it has one.
	var gID: GID? { get }
}

/// Default identity for types that are already `Identifiable`:
/// `gID` simply forwards the value of `id`.
extension GIdentifiable where Self: Identifiable {
	public var gID: Self.ID? {
		return self.id
	}
}

Here's how the code from your first example can be implemented in GraphCodable:

/// A tree node with a name, an optional integer code and child nodes of the
/// same concrete type, which is also `Identifiable`, `GIdentifiable` and
/// `GCodable`.
protocol Node: Identifiable, GIdentifiable, GCodable {
	/// Display name of the node.
	var name: String { get }

	/// Optional payload attached to the node.
	var code: [Int]? { get }

	/// Child nodes; always of the conforming type itself.
	var children: [Self] { get }

	/// Designated initializer every conforming type must provide.
	init(name: String, code: [Int]?, children: [Self])
}

/// Keys under which the properties of a `Node` are archived.
fileprivate enum Key: String {
	case name
	case code
	case children
}

extension Node {
	/// Creates a node with a code and no children.
	init(name: String, code: [Int]) {
		self.init(name: name, code: code, children: [])
	}

	/// Creates a node with neither code nor children.
	init(name: String) {
		self.init(name: name, code: nil, children: [])
	}

	/// Rebuilds a node by decoding `name`, `code` and `children` (in that
	/// order) from `decoder`.
	///
	/// - Throws: Whatever error `decoder.decode(for:)` throws.
	init(from decoder: GDecoder) throws {
		let decodedName: String = try decoder.decode(for: Key.name)
		let decodedCode: [Int]? = try decoder.decode(for: Key.code)
		let decodedChildren: [Self] = try decoder.decode(for: Key.children)

		self.init(name: decodedName, code: decodedCode, children: decodedChildren)
	}

	/// Archives `name`, `code` and `children` (in that order) into `encoder`.
	///
	/// - Throws: Whatever error `encoder.encode(_:for:)` throws.
	func encode(to encoder: GEncoder) throws {
		try encoder.encode(name, for: Key.name)
		try encoder.encode(code, for: Key.code)
		try encoder.encode(children, for: Key.children)
	}
}

/// Value-type `Node` whose `id` is a `UUID` that is regenerated every time
/// one of its stored properties is about to be assigned.
struct NodeS: Node {
	/// Current identity; readable everywhere, writable only from inside the type.
	private(set) var id: UUID = UUID()

	var name: String {
		willSet { id = UUID() }
	}

	var code: [Int]? {
		willSet { id = UUID() }
	}

	var children: [NodeS] {
		willSet { id = UUID() }
	}

	/// Creates a node from its three stored properties; `id` keeps the
	/// `UUID` produced by its default value.
	init(name: String, code: [Int]?, children: [NodeS]) {
		self.name = name
		self.code = code
		self.children = children
	}
}

/// Reference-type `Node`. It declares no `id` of its own.
final class NodeC: Node {
	var name: String
	var code: [Int]?
	var children: [NodeC]

	/// Creates a node from its three stored properties.
	init(name: String, code: [Int]?, children: [NodeC]) {
		self.name = name
		self.code = code
		self.children = children
	}
}

/// Builds a small graph of four nodes that share children and the same `code`
/// array, archives it with `GraphEncoder`, reads it back with `GraphDecoder`
/// and prints a dump of the decoded root.
///
/// - Parameter nodeType: The concrete `Node` type to run the round trip with.
/// - Throws: Any error thrown while encoding, decoding or dumping.
func nodeTest<NodeType: Node>(_ nodeType: NodeType.Type) throws {
	print(type(of: nodeType))

	// One array instance is handed to every node on purpose.
	let sharedCode = [1, 2, 3]

	let a = NodeType(name: "a", code: sharedCode)
	let b = NodeType(name: "b", code: sharedCode)
	let c = NodeType(name: "c", code: sharedCode, children: [a, b])
	let d = NodeType(name: "d", code: sharedCode, children: [c, a])

	let inRoot: [NodeType] = [a, b, c, d]
	let archive = try GraphEncoder().encode(inRoot)
	let outRoot = try GraphDecoder().decode([NodeType].self, from: archive)

	print("• Decoded Root Dump ••••••••••••")
	let decodedDump = try GraphEncoder().dump(outRoot)
	print(decodedDump)
}

// Run the same round trip once with the struct node and once with the class node.
try nodeTest( NodeS.self )
try nodeTest( NodeC.self )

And this is the output after archiving and dearchiving:

NodeS.Type
• Decoded Root Dump ••••••••••••
== BODY ==========================================================
- VAL1000
	- VAL1001
		+ "name": "a"
		+ "code": VAL1002
			- 1
			- 2
			- 3
		.
		+ "children": VAL1003
		.
	.
	- VAL1004
		+ "name": "b"
		+ "code": PTR1002
		+ "children": PTR1003
	.
	- VAL1005
		+ "name": "c"
		+ "code": PTR1002
		+ "children": VAL1006
			- PTR1001
			- PTR1004
		.
	.
	- VAL1007
		+ "name": "d"
		+ "code": PTR1002
		+ "children": VAL1008
			- PTR1005
			- PTR1001
		.
	.
.
==================================================================

NodeC.Type
• Decoded Root Dump ••••••••••••
== BODY ==========================================================
- VAL1000
	- REF1001 MyGraphCodableApp.NodeC
		+ "name": "a"
		+ "code": VAL1002
			- 1
			- 2
			- 3
		.
		+ "children": VAL1003
		.
	.
	- REF1004 MyGraphCodableApp.NodeC
		+ "name": "b"
		+ "code": PTR1002
		+ "children": PTR1003
	.
	- REF1005 MyGraphCodableApp.NodeC
		+ "name": "c"
		+ "code": PTR1002
		+ "children": VAL1006
			- PTR1001
			- PTR1004
		.
	.
	- REF1007 MyGraphCodableApp.NodeC
		+ "name": "d"
		+ "code": PTR1002
		+ "children": VAL1008
			- PTR1005
			- PTR1001
		.
	.
.
==================================================================

Now, as was already the case for reference types, there is no duplication of data even with value types, provided they implement the GIdentifiable protocol.

tera · February 12, 2023, 10:11pm

Great news.

Combining this with the idea from another thread you may even do all strings (and other long enough value types like arrays / dictionaries) "GIdentifiable":

/// Gives every `String` an automatically generated identity: the bytes that
/// `valueIdentifier` returns for the string value.
extension String: GIdentifiable {
    public var gID: [UInt8]? {
        let identityBytes = valueIdentifier(self)
        return identityBytes
    }
}

where:

/// Returns a copy of the raw in-memory bytes of `value`.
///
/// The result has exactly `MemoryLayout<T>.size` elements, in memory order.
/// Any padding inside `T` is copied as-is, so its content is unspecified.
///
/// - Parameter value: The value whose in-memory representation is copied.
/// - Returns: The bytes of `value`, lowest address first.
func valueIdentifier<T>(_ value: T) -> [UInt8] {
    // Read through the raw buffer instead of rebinding the memory to `UInt8`:
    // the storage is bound to `T`, so `assumingMemoryBound(to: UInt8.self)`
    // would break its precondition. This also avoids the force-unwrap of
    // `baseAddress` and the element-by-element append loop.
    withUnsafeBytes(of: value) { rawBytes in
        Array(rawBytes)
    }
}

/// Returns the bytes produced by `valueIdentifier(value)` as one lowercase
/// hexadecimal string, two digits per byte, in memory order.
func valueHexIdentifier<T>(_ value: T) -> String {
    var hex = ""
    for byte in valueIdentifier(value) {
        hex += String(format: "%02x", byte)
    }
    return hex
}

test:

/// Returns a new `String` with the same contents as `string`, built by
/// round-tripping the text through UTF-8 encoded `Data`.
func makeStringCopy(_ string: String) -> String {
    // Both force-unwraps mirror the test helper's intent: failure here would
    // be a programming error, not a recoverable condition.
    let utf8Data = string.data(using: .utf8)!
    let copy = String(data: utf8Data, encoding: .utf8)!
    return copy
}

// Long string: `b` is a plain copy of `a`, `c` is built independently.
let a = makeStringCopy("Hello New World, Long enough string")
let b = a
let c = makeStringCopy("Hello New World, Long enough string")

// Short string: `e` is a plain copy of `d`, `f` is built independently.
let d = makeStringCopy("short string")
let e = d
let f = makeStringCopy("short string")

let aid = valueHexIdentifier(a) // 23000000000000f09001110200600000
let bid = valueHexIdentifier(b) // 23000000000000f09001110200600000 // same
let cid = valueHexIdentifier(c) // 23000000000000f0e001110200600000 // different

let did = valueHexIdentifier(d) // 73686f727420737472696e67000000ec
let eid = valueHexIdentifier(e) // 73686f727420737472696e67000000ec // same
let fid = valueHexIdentifier(f) // 73686f727420737472696e67000000ec // same

print(aid)
print(bid)
print(cid)
print(did)
print(eid)
print(fid)

precondition(aid == bid, "should be the same")
// `c` was created separately from `a`, so the two identifiers must differ
// (the original check compared `aid` with `bid` a second time).
precondition(aid != cid, "should be different")
precondition(did == eid, "should be the same")
precondition(did == fid, "most likely will be the same due to tagged strings optimisation")

print()

Note that this automatically generated identifier is relatively short (say, 16 bytes). It will be equal for the same values (in the "let b = a" meaning of sameness), but may be different for values that are otherwise equal (in the "a == b" meaning of equal). This ID is not as good as one provided manually, but it is perhaps better than no ID at all, and the good thing about it is that it is generated automatically, e.g.:

/// Gives every `Array` an automatically generated identity: the bytes that
/// `valueIdentifier` returns for the array value.
extension Array: GIdentifiable {
    public var gID: [UInt8]? {
        let identityBytes = valueIdentifier(self)
        return identityBytes
    }
}

Would be interesting to see how this works in real-world tests.

tera · February 12, 2023, 11:28pm

Beware that for value types valueIdentifier's ID is as large as MemoryLayout.size(ofValue: value) – while String / Array / Dictionary / Set are known to be short, this might be a concern for large custom structures. Some examples:

// An integer: the identifier is just the integer's bytes in memory order.
print(valueHexIdentifier(0x123456789ABCDEF0)) // f0debc9a78563412
// same bytes as the literal, printed lowest address first (little endian)

// A two-field struct: the bit patterns of `x` and `y` appear one after the other.
let point = CGPoint(
    x: Double(bitPattern: 0x1111111111111111),
    y: Double(bitPattern: 0x2222222222222222))

print(valueHexIdentifier(point)) // 11111111111111112222222222222222

/// Demo struct whose two fields have different sizes, so a padding byte
/// sits between them in memory.
struct S {
    var x = UInt8(0x33)
    // Padding goes here; its content is not necessarily 0 and could be any garbage.
    var y = UInt16(0x7777)
}

// In the run shown, the padding byte between `x` and `y` happened to be 00.
print(valueHexIdentifier(S())) // 33007777

Ever wanted to see what's inside optionals?

// Int?: in the output shown, the 8 payload bytes are followed by one extra
// byte that is 00 for a value and 01 for nil.
print(valueHexIdentifier(0x4444444444444444 as Int?)) // 444444444444444400
print(valueHexIdentifier(nil as Int?))                // 000000000000000001

// Bool?: still a single byte in the output shown; nil is printed as 02.
print(valueHexIdentifier(false as Bool?))   // 00
print(valueHexIdentifier(true as Bool?))    // 01
print(valueHexIdentifier(nil as Bool?))     // 02

// Optional of a three-case enum: one byte per value; nil is printed as 03.
enum E: String { case x, y, z }
print(valueHexIdentifier(E.x as E?))        // 00
print(valueHexIdentifier(E.y as E?))        // 01
print(valueHexIdentifier(E.z as E?))        // 02
print(valueHexIdentifier(nil as E?))        // 03

And, BTW, hey, we've just reinvented ObjectIdentifier!
If you call valueIdentifier on an object you'll get similar results to ObjectIdentifier's:

// For a class instance, the two printed values below are the same number:
// the hex dump shows it lowest byte first, ObjectIdentifier shows it as an address.
class C {}
let obj = C()
print(valueHexIdentifier(obj)) // b0c2000000600000
print(ObjectIdentifier(obj))   // ObjectIdentifier(0x000060000000c2b0)

Happens to be the same (apart from endian difference in the output). @QuinceyMorris might give an idea under what platforms or circumstances (diagnostic settings?) we could see these two actually giving different results.

Same test as above but now the node is using automatic identifier instead of providing a manual one:

/// Value-type node that derives its `id` automatically from its own
/// in-memory bytes instead of storing a manually assigned identifier.
struct NodeS: Node {
    var name: String
    var children: [NodeS]

    /// Hex dump of this value's bytes, as produced by `valueHexIdentifier`.
    var id: String {
        return valueHexIdentifier(self)
    }
}

Outputs:

NodeS.Type
 640000000000000000000000000000e1e000930100600000: d [
     630000000000000000000000000000e1000f120100600000: c [
         610000000000000000000000000000e1e0709ddb01000000: a 
         620000000000000000000000000000e1e0709ddb01000000: b 
     ]
     610000000000000000000000000000e1e0709ddb01000000: a already seen
     630000000000000000000000000000e1000f120100600000: c already seen
 ]

Note that it correctly marked "a" and "c" as already seen based on identity check.

Loooop · February 13, 2023, 4:35am

You really don't want to pass as the id an array of the bytes in memory representing the value.
And anyway, just in case, it would be better to make the type Hashable and pass itself as the id.

The solution for arrays already exists, because as id you pass the pointer to their internal storage like this:

/// Uses the base address that `withUnsafeBytes` reports for the array as
/// its identity, wrapped in an `OpaquePointer`.
///
/// NOTE(review): the address leaves the closure and is used only as a
/// lookup key, never dereferenced — confirm this is acceptable given the
/// documented lifetime of the pointer.
extension Array: GIdentifiable where Element: GCodable {
	public var gID: OpaquePointer? {
		withUnsafeBytes { storage in
			OpaquePointer(storage.baseAddress)
		}
	}
}

/// Uses the base address that `withUnsafeBytes` reports for the array as
/// its identity, wrapped in an `OpaquePointer`.
///
/// NOTE(review): the address leaves the closure and is used only as a
/// lookup key, never dereferenced — confirm this is acceptable given the
/// documented lifetime of the pointer.
extension ContiguousArray: GIdentifiable where Element: GCodable {
	public var gID: OpaquePointer? {
		withUnsafeBytes { storage in
			OpaquePointer(storage.baseAddress)
		}
	}
}

or like this:

/// Uses the base address that `withUnsafeBytes` reports for the array as
/// its identity, reinterpreted bit-for-bit as an optional `ObjectIdentifier`.
///
/// NOTE(review): the address leaves the closure and is used only as a
/// lookup key, never dereferenced — confirm this is acceptable given the
/// documented lifetime of the pointer.
extension Array: GIdentifiable where Element: GCodable {
	public var gID: ObjectIdentifier? {
		withUnsafeBytes { storage in
			unsafeBitCast(storage.baseAddress, to: ObjectIdentifier?.self)
		}
	}
}

/// Uses the base address that `withUnsafeBytes` reports for the array as
/// its identity, reinterpreted bit-for-bit as an optional `ObjectIdentifier`.
///
/// NOTE(review): the address leaves the closure and is used only as a
/// lookup key, never dereferenced — confirm this is acceptable given the
/// documented lifetime of the pointer.
extension ContiguousArray: GIdentifiable where Element: GCodable {
	public var gID: ObjectIdentifier? {
		withUnsafeBytes { storage in
			unsafeBitCast(storage.baseAddress, to: ObjectIdentifier?.self)
		}
	}
}

The problem is that the internal storage doesn't seem to be accessible in other system library containers like Sets and Dictionaries.

Would it be possible to make system containers Identifiable?

tera · February 13, 2023, 4:53am

Why?

Also note these warnings from the docs regarding the use of Array's withUnsafeBytes:

Although the second warning seems to be relevant (I am making an assumption!) to actually copying bytes from the pointer, not to merely using the pointer itself. The first warning is more worrying.

Loooop · February 13, 2023, 6:15am

Because in general you don't know which fields of the value type must be archived (and therefore constitute its identity during archiving) and which are not.

Note: I'm not accessing the storage pointed to by baseAddress, I'm just using that address as a lookup key. It's just a number. For this I prefer the version that turns that address into an ObjectIdentifier instead of an OpaquePointer.

If the storage exists, the problem doesn't arise.

If it doesn't exist, there are two possibilities:

If during archiving the temporary storage is created at two different memory addresses for the same array, I will not notice that the array is the same and will archive it twice. It doesn't seem to happen, but just in case, I still don't lose the ability to read the archive.
If instead the temporary storage for two different arrays is created at the same address when I archive the first and second array, the two arrays would be considered equal even though they are not, and this is certainly a problem.

NSExceptional · December 19, 2023, 12:43am

I'm not sure what point you're trying to make here. You didn't directly refute anything I was saying. Are you trying to say Codable shouldn't be used to output XML such as property lists or HTML?

ksluder · December 19, 2023, 2:27am

The post you are replying to is from April of 2021, and even the last post in this thread is 10 months old.

NSExceptional · February 1, 2024, 9:30am

And?