This is the implementation, together with the demo program that produced the output above:
/// `Float8`: an experimental 8-bit binary floating-point type.
///
/// By the author's own account this is amateur work that probably does not
/// behave as expected yet. It was assembled with the help of:
/// * https://en.wikipedia.org/wiki/Single-precision_floating-point_format
/// * http://www.cs.jhu.edu/~jorgev/cs333/readings/8-Bit_Floating_Point.pdf
/// * https://raw.githubusercontent.com/apple/swift/master/stdlib/public/core/FloatingPointTypes.swift.gyb
/// and it leans on `Float32` wherever that seemed possible.
///
/// Encoding, from the most to the least significant bit: one sign bit,
/// three exponent bits and four significand bits.
///
/// ```
/// Notes taken while implementing the type:
///
/// exponent bit pattern:    0    1    2    3    4    5    6    7
/// exponent:              sub   -2   -1    0    1    2    3    inf/nan
/// bias 3
///
/// 0_000_0001 = 0x01 = 2**(-2) * (0 +  1/16) = 0.015625 (least nonzero magnitude)
/// 0_000_1111 = 0x0f = 2**(-2) * (0 + 15/16) = 0.234375 (greatest subnormal magnitude)
/// 0_001_0000 = 0x10 = 2**(-2) * (1 +  0/16) = 0.25     (least normal nonzero magnitude)
/// 0_011_0000 = 0x30 = 2**( 0) * (1 +  0/16) = 1.0
/// 0_110_1111 = 0x6f = 2**( 3) * (1 + 15/16) = 15.5     (greatest finite magnitude)
/// ```
/// See: https://forums.swift.org/t/has-anyone-implemented-a-float8-quarter-type/33337/8
struct Float8 {
    /// The raw `s_eee_mmmm` encoding of the value.
    private var bitPattern: UInt8

    /// Creates a value directly from its raw encoding.
    ///
    /// - Parameter bitPattern: The eight bits to store; they are kept as given.
    init(bitPattern: UInt8) {
        self.bitPattern = bitPattern
    }
}
import Darwin
extension Float8 {
    /// This value widened to `Float`.
    ///
    /// The work is delegated to `Float`'s converting initializer. A
    /// hand-written decoding used to live here and is now disabled; it mapped
    /// signaling NaN to `Float.signalingNaN`, any other NaN to `Float.nan`,
    /// infinity to a signed `Float.infinity`, and every other value to
    /// `sign * 2^exp * (lead + fraction)`, where `fraction` is the low four
    /// bits divided by 16 and `exp` is the exponent bit pattern minus the
    /// bias. Normal values used `lead = 1`; subnormals used `lead = 0` and
    /// `exp + 1`.
    var float: Float {
        let widened = Float(self)
        return widened
    }
}
/// File-local integer helper; `Float8` calls it on its significand bit
/// pattern when computing `exponent` and `significand` for subnormals.
private extension BinaryInteger {
/// Returns the zero-based index of the most significant set bit of `self`,
/// i.e. the integer part of log2(self).
///
/// - Precondition: `self` must be strictly greater than zero.
/// - Returns: For example `0` for `1`, `3` for `8...15`.
private func _binaryLogarithm() -> Int {
precondition(self > (0 as Self))
// Split the index of the top bit into whole `UInt`-sized words (`quotient`)
// below the leading word and an offset inside the leading word (`remainder`).
var (quotient, remainder) =
(bitWidth &- 1).quotientAndRemainder(dividingBy: UInt.bitWidth)
// Turn the offset into the number of bits held by the leading word.
remainder = remainder &+ 1
// Shift everything below the leading word away and keep what is left.
var word = UInt(truncatingIfNeeded: self >> (bitWidth &- remainder))
// If, internally, a variable-width binary integer uses digits of greater
// bit width than that of Magnitude.Words.Element (i.e., UInt), then it is
// possible that `word` could be zero. Additionally, a signed variable-width
// binary integer may have a leading word that is zero to store a clear sign
// bit.
while word == 0 {
// Step one word towards the least significant end and look again.
quotient = quotient &- 1
remainder = remainder &+ UInt.bitWidth
word = UInt(truncatingIfNeeded: self >> (bitWidth &- remainder))
}
// Note that the order of operations below is important to guarantee that
// we won't overflow.
// Result = bits in the full words below `word` + index of the top set bit
// inside `word`.
return UInt.bitWidth &* quotient &+
(UInt.bitWidth &- (word.leadingZeroBitCount &+ 1))
}
}
/// `BinaryFloatingPoint` conformance for `Float8`.
///
/// Classification and decomposition operate directly on the raw
/// `s_eee_mmmm` bit pattern (1 sign bit, 3 exponent bits, 4 significand bits,
/// exponent bias 3). Arithmetic, comparison, rounding and remainders are
/// computed by widening the operands to `Float`, doing the work there and
/// converting the result back to `Float8`.
extension Float8 : BinaryFloatingPoint {
    typealias Exponent = Int
    typealias RawSignificand = UInt8
    typealias RawExponent = UInt
    typealias Stride = Self
    typealias Magnitude = Self
    typealias FloatLiteralType = Float
    typealias IntegerLiteralType = Int64

    // MARK: Format constants

    static var exponentBitCount: Int { 3 }
    static var significandBitCount: Int { 4 }
    static var _exponentBias: UInt { 3 }

    /// Quiet NaN: all exponent bits set, top significand bit set.
    static var nan: Float8 { Float8(bitPattern: 0b0_111_1000) }
    /// Signaling NaN: all exponent bits set, top significand bit clear,
    /// non-zero payload.
    static var signalingNaN: Float8 { Float8(bitPattern: 0b0_111_0100) }
    /// Positive infinity: all exponent bits set, significand zero.
    static var infinity: Float8 { Float8(bitPattern: 0b0_111_0000) }
    /// 0.25
    static var leastNormalMagnitude: Float8 {
        Float8(bitPattern: 0b0_001_0000)
    }
    /// 0.015625
    static var leastNonzeroMagnitude: Float8 {
        Float8(bitPattern: 0b0_000_0001)
    }
    /// 15.5
    static var greatestFiniteMagnitude: Float8 {
        Float8(bitPattern: 0b0_110_1111)
    }

    /// Exponent bit pattern shared by infinities and NaNs.
    private static let _infinityExponent: UInt = 0b111
    /// Mask selecting the four significand bits.
    private static let _significandMask: UInt8 = 0b1111
    /// Mask selecting the sign bit.
    private static let _signMask: UInt8 = 0b1_000_0000

    /// The mathematical constant pi approximated by the closest representable
    /// `Float8` value less than pi, which is `3.125`.
    static var pi: Float8 {
        return Float8(bitPattern: 0b0_100_1001)
    }

    // MARK: Raw fields

    /// The three exponent bits, right-aligned.
    var exponentBitPattern: UInt { UInt((bitPattern &>> 4) & 0b111) }
    /// The four significand bits.
    var significandBitPattern: UInt8 { bitPattern & Self._significandMask }
    /// `.minus` when the top bit is set, `.plus` otherwise.
    var sign: FloatingPointSign {
        bitPattern & Self._signMask == Self._signMask ? .minus : .plus
    }

    // MARK: Decomposition

    /// The unbiased exponent; `.max` for infinities and NaNs, `.min` for zero.
    var exponent: Int {
        if !isFinite { return .max }
        if isZero { return .min }
        let provisional = Int(exponentBitPattern) - Int(Self._exponentBias)
        if isNormal { return provisional }
        // Subnormal: account for the leading zeros of the significand.
        let shift = Self.significandBitCount -
            significandBitPattern._binaryLogarithm()
        return provisional + 1 - shift
    }

    /// The significand scaled into `1.0 ..< 2.0` for finite non-zero values.
    /// NaNs are returned unchanged; zero and infinity keep their exponent bits
    /// and get a positive sign.
    var significand: Float8 {
        if isNaN { return self }
        if isNormal {
            return Float8(sign: .plus,
                          exponentBitPattern: Self._exponentBias,
                          significandBitPattern: significandBitPattern)
        }
        if isSubnormal {
            // Shift the leading set bit out of the four-bit field; the
            // initializer masks it away, leaving the implicit leading one.
            let shift = Self.significandBitCount -
                significandBitPattern._binaryLogarithm()
            return Float8(
                sign: .plus,
                exponentBitPattern: Self._exponentBias,
                significandBitPattern: significandBitPattern &<< shift
            )
        }
        // zero or infinity.
        return Float8(
            sign: .plus,
            exponentBitPattern: exponentBitPattern,
            significandBitPattern: 0
        )
    }

    /// The unit in the last place of this value; NaN for non-finite values.
    var ulp: Float8 {
        guard isFinite else { return .nan }
        if isNormal {
            // Keep only the exponent bits (positive, significand 1.0) and
            // scale by 2^-4, the weight of the lowest significand bit.
            let bitPattern_ = bitPattern & Self.infinity.bitPattern
            return Float8(bitPattern: bitPattern_) * 0x1p-4
        }
        // Zero and subnormals share the smallest spacing.
        return .leastNormalMagnitude * 0x1p-4
    }

    /// The value with the same sign and exponent as `self` and a significand
    /// of 1.0; NaN for non-finite values.
    var binade: Float8 {
        guard isFinite else { return .nan }
        if isSubnormal {
            // The following from the FloatingPointTypes.swift.gyb file
            // (and adapted to this type) does not work, only produces inf:
            //   let bitPattern_ = (self * 0x1p4).bitPattern
            //       & (-Self.infinity).bitPattern
            //   return Float8(bitPattern: bitPattern_) * 0x1p-4
            // So the highest set significand bit is isolated instead.
            let shifts = (bitPattern & Self._significandMask).leadingZeroBitCount
            let leadingBit = UInt8(1) &<< (7 &- shifts)
            // Carry the sign over, exactly as the normal branch below does.
            return Float8(bitPattern: (bitPattern & Self._signMask) | leadingBit)
        }
        return Float8(bitPattern: bitPattern & (-Self.infinity).bitPattern)
    }

    /// Number of significand bits needed to represent the value, not counting
    /// the leading bit; `-1` for zero, infinity and NaN.
    var significandWidth: Int {
        let trailingZeroBits = significandBitPattern.trailingZeroBitCount
        if isNormal {
            guard significandBitPattern != 0 else { return 0 }
            return Self.significandBitCount &- trailingZeroBits
        }
        if isSubnormal {
            let leadingZeroBits = significandBitPattern.leadingZeroBitCount
            return Self.RawSignificand.bitWidth &-
                (trailingZeroBits &+ leadingZeroBits &+ 1)
        }
        return -1
    }

    /// The least representable value that compares greater than `self`.
    var nextUp: Float8 {
        // Silence signaling NaNs, map -0 to +0.
        let x = self + 0
        if _fastPath(x < .infinity) {
            // +1 when the sign bit is clear, -1 when it is set: stepping the
            // raw pattern towards positive infinity.
            let increment = Int8(bitPattern: x.bitPattern) &>> 7 | 1
            let bitPattern_ = x.bitPattern &+ UInt8(bitPattern: increment)
            return Float8(bitPattern: bitPattern_)
        }
        return x
    }

    // MARK: Field-wise initializers

    /// Assembles a value from its three raw fields. Bits outside the three
    /// exponent bits and four significand bits are ignored.
    init(sign: FloatingPointSign,
         exponentBitPattern: UInt,
         significandBitPattern: UInt8)
    {
        let signBits: UInt8 = sign == .minus ? Self._signMask : 0
        let exponentBits =
            UInt8(truncatingIfNeeded: exponentBitPattern & 0b111) << 4
        let significandBits = significandBitPattern & Self._significandMask
        self.bitPattern = signBits | exponentBits | significandBits
    }

    /// Creates `significand * 2^exponent`, negated when `sign` is `.minus`.
    ///
    /// Exponents outside the normal range are applied in several
    /// multiplication steps so that every intermediate scale factor is itself
    /// representable.
    init(sign: FloatingPointSign, exponent: Int, significand: Float8) {
        var result = significand
        if sign == .minus { result = -result }
        if significand.isFinite && !significand.isZero {
            var clamped = exponent
            let leastNormalExponent = 1 - Int(Self._exponentBias)
            let greatestFiniteExponent = Int(Self._exponentBias)
            if clamped < leastNormalExponent {
                clamped = max(clamped, 3*leastNormalExponent)
                while clamped < leastNormalExponent {
                    result *= Self.leastNormalMagnitude
                    clamped -= leastNormalExponent
                }
            }
            else if clamped > greatestFiniteExponent {
                clamped = min(clamped, 3*greatestFiniteExponent)
                // Largest power of two that is still finite.
                let step = Float8(sign: .plus,
                                  exponentBitPattern: Self._infinityExponent - 1,
                                  significandBitPattern: 0)
                while clamped > greatestFiniteExponent {
                    result *= step
                    clamped -= greatestFiniteExponent
                }
            }
            let scale = Float8(
                sign: .plus,
                exponentBitPattern: UInt(Int(Self._exponentBias) + clamped),
                significandBitPattern: 0
            )
            result = result * scale
        }
        self = result
    }

    // MARK: Rounding

    /// Rounds to an integral value using `rule`, via `Float`.
    mutating func round(_ rule: FloatingPointRoundingRule) {
        var f = self.float
        f.round(rule)
        self = Float8(f)
    }

    // MARK: Arithmetic (computed in `Float`, then converted back)

    static func - (lhs: Float8, rhs: Float8) -> Float8 {
        // NOTE: Promoting to Float32 here caused an infinite recursion
        // for eg `let a = Float8(-Float(15.9))`.
        // It was solved by implementing the unary minus operator below,
        // instead of letting it use the default implementation.
        return Float8(lhs.float - rhs.float)
    }

    /// Negation by flipping the sign bit; never goes through `Float`.
    static prefix func -(lhs: Float8) -> Float8 {
        return Float8(bitPattern: lhs.bitPattern ^ Self._signMask)
    }

    static func * (lhs: Float8, rhs: Float8) -> Float8 {
        return Float8(lhs.float * rhs.float)
    }

    static func *= (lhs: inout Float8, rhs: Float8) {
        var f = lhs.float
        f *= rhs.float
        lhs = Float8(f)
    }

    static func / (lhs: Float8, rhs: Float8) -> Float8 {
        return Float8(lhs.float / rhs.float)
    }

    static func /= (lhs: inout Float8, rhs: Float8) {
        var f = lhs.float
        f /= rhs.float
        lhs = Float8(f)
    }

    static func += (lhs: inout Float8, rhs: Float8) {
        var f = lhs.float
        f += rhs.float
        // Store the computed sum (the previous code converted `lhs` itself,
        // which left the value unchanged).
        lhs = Float8(f)
    }

    static func + (lhs: Float8, rhs: Float8) -> Float8 {
        return Float8(lhs.float + rhs.float)
    }

    static func -= (lhs: inout Float8, rhs: Float8) {
        var f = lhs.float
        f -= rhs.float
        lhs = Float8(f)
    }

    mutating func formRemainder(dividingBy other: Float8) {
        var f = self.float
        f.formRemainder(dividingBy: other.float)
        self = Float8(f)
    }

    mutating func formTruncatingRemainder(dividingBy other: Float8) {
        var f = self.float
        f.formTruncatingRemainder(dividingBy: other.float)
        self = Float8(f)
    }

    mutating func formSquareRoot() {
        var f = self.float
        f.formSquareRoot()
        self = Float8(f)
    }

    mutating func addProduct(_ lhs: Float8, _ rhs: Float8) {
        var f = self.float
        f.addProduct(lhs.float, rhs.float)
        self = Float8(f)
    }

    // MARK: Comparison and hashing

    func isEqual(to other: Float8) -> Bool {
        return self.float.isEqual(to: other.float)
    }

    func isLess(than other: Float8) -> Bool {
        return self.float.isLess(than: other.float)
    }

    func isLessThanOrEqualTo(_ other: Float8) -> Bool {
        return self.float.isLessThanOrEqualTo(other.float)
    }

    /// Hashes the raw bit pattern, except that both zeros hash alike:
    /// `+0` and `-0` compare equal, so they must feed identical bits to the
    /// hasher.
    func hash(into hasher: inout Hasher) {
        hasher.combine(isZero ? UInt8(0) : bitPattern)
    }

    // MARK: Classification

    var isNormal: Bool {
        return exponentBitPattern > 0 && isFinite
    }

    var isFinite: Bool {
        return exponentBitPattern < Self._infinityExponent
    }

    var isZero: Bool {
        return self.bitPattern & 0b0_111_1111 == 0
    }

    var isSubnormal: Bool {
        return exponentBitPattern == 0 && significandBitPattern != 0
    }

    var isInfinite: Bool {
        return !isFinite && significandBitPattern == 0
    }

    var isNaN: Bool {
        return !isFinite && significandBitPattern != 0
    }

    /// The top significand bit, which distinguishes quiet from signaling NaNs.
    private static var _quietNaNMask: UInt8 {
        return 1 &<< UInt8(significandBitCount - 1)
    }

    var isSignalingNaN: Bool {
        return isNaN && (significandBitPattern & Self._quietNaNMask) == 0
    }

    var isCanonical: Bool { return true }

    // MARK: Strideable and magnitude

    func distance(to other: Float8) -> Float8 {
        return Float8(other.float - self.float)
    }

    func advanced(by n: Float8) -> Float8 {
        return Float8(self.float + n.float)
    }

    var magnitude: Float8 {
        return Float8(self.float.magnitude)
    }

    // MARK: Literals

    /// Creates the value of an integer literal.
    ///
    /// Every integer with magnitude `0...15` is exactly representable and is
    /// encoded directly from its bits. Larger magnitudes lie beyond
    /// `greatestFiniteMagnitude` (15.5) and round to a signed infinity.
    init(integerLiteral value: Int64) {
        let signBit: UInt8 = value < 0 ? Self._signMask : 0
        let magnitude = value.magnitude
        let unsignedPattern: UInt8
        if magnitude == 0 {
            unsignedPattern = 0
        } else if magnitude <= 15 {
            let bits = UInt8(magnitude)
            // Index of the highest set bit == unbiased exponent (0...3).
            let unbiased = 7 - bits.leadingZeroBitCount
            let exponentBits =
                UInt8(unbiased + Int(Self._exponentBias)) << 4
            // Left-align the value so its leading one falls just outside the
            // four-bit significand field, then mask that implicit bit away.
            let significandBits =
                (bits << (Self.significandBitCount - unbiased))
                & Self._significandMask
            unsignedPattern = exponentBits | significandBits
        } else {
            unsignedPattern = Self.infinity.bitPattern
        }
        self.init(bitPattern: signBit | unsignedPattern)
    }

    /// Creates the value of a floating-point literal.
    ///
    /// Zeros are encoded directly, keeping their sign: routing them through
    /// the generic conversion caused an infinite recursion for
    /// `Float8(-Float(0))`. Note that `value == -Float(0)` cannot be used to
    /// detect negative zero, because it is also true for positive zero.
    init(floatLiteral value: Float) {
        if value.isZero {
            self.init(bitPattern: value.sign == .minus ? Self._signMask : 0)
        } else {
            self.init(value)
        }
    }
}
extension Float8 : CustomStringConvertible, LosslessStringConvertible {
    /// Text form of the value, taken from the equivalent `Float`.
    var description: String {
        let widened = Float(self)
        return "\(widened)"
    }

    /// Parses `description` through `Float` and converts the result.
    ///
    /// Fails when `Float` cannot parse the text, or when the converted value
    /// does not print back as exactly the same string.
    init?(_ description: String) {
        guard let parsed = Float(description) else { return nil }
        let candidate = Float8(parsed)
        guard candidate.description == description else { return nil }
        self = candidate
    }
}
//-----------------------------------------------------------------------------
// MARK: - Demo
//-----------------------------------------------------------------------------
extension String {
    /// Returns the string right-aligned in a field of at least `minCount`
    /// characters.
    ///
    /// - Parameters:
    ///   - minCount: Minimum length of the result.
    ///   - char: Character used to fill the left side; a space by default.
    /// - Returns: `self` unchanged when it already has `minCount` or more
    ///   characters, otherwise `self` prefixed with enough copies of `char`.
    func leftPadded(to minCount: Int, with char: Character = " ") -> String {
        let missing = minCount - count
        guard missing > 0 else { return self }
        return String(repeating: char, count: missing) + self
    }
}
extension BinaryFloatingPoint {
    /// The raw encoding as `sign_exponent_significand`, each field in binary
    /// and zero-padded to the width the type declares for it.
    var segmentedBinaryString: String {
        let signField = sign == .plus ? "0" : "1"
        let exponentField = String(exponentBitPattern, radix: 2)
            .leftPadded(to: Self.exponentBitCount, with: "0")
        let significandField = String(significandBitPattern, radix: 2)
            .leftPadded(to: Self.significandBitCount, with: "0")
        return "\(signField)_\(exponentField)_\(significandField)"
    }
}
extension LosslessStringConvertible {
    /// Left-pads this value's `description` to at least `minCount`
    /// characters, filling with `char` (a space by default).
    func leftPadded(to minCount: Int, with char: Character = " ") -> String {
        let text = description
        return text.leftPadded(to: minCount, with: char)
    }
}
extension Float8 {
    /// Prints a table with one row per bit pattern (0 through 255) showing
    /// the value, its segmented bits, exponent, significand, binade and ulp,
    /// followed by the number of finite, infinite and NaN values.
    ///
    /// - Precondition: The three counts must add up to 256.
    static func debugPrintAllValues() {
        var tally = (finite: 0, infinite: 0, nan: 0)
        print(" N Float8 bitPattern exponent significand binade ulp")
        print("---------------------------------------------------------------------------")
        for rawByte in UInt8.min ... UInt8.max {
            let value = Float8(bitPattern: rawByte)

            // The two sentinel exponents are printed by name.
            let exponent = value.exponent
            let exponentText: String
            if exponent == .min {
                exponentText = "Int.min"
            } else if exponent == .max {
                exponentText = "Int.max"
            } else {
                exponentText = exponent.description
            }

            let columns: [String] = [
                rawByte.leftPadded(to: 4),
                value.leftPadded(to: 11),
                value.segmentedBinaryString.leftPadded(to: 12),
                exponentText.leftPadded(to: 9),
                value.significand.leftPadded(to: 11),
                value.binade.leftPadded(to: 11),
                value.ulp.leftPadded(to: 11),
            ]
            print(columns.joined(separator: " "))

            if value.isFinite { tally.finite += 1 }
            if value.isNaN { tally.nan += 1 }
            if value.isInfinite { tally.infinite += 1 }
        }
        print("Number of finite values:", tally.finite)
        print("Number of infinite values:", tally.infinite)
        print("Number of NaNs:", tally.nan)
        precondition(tally.finite + tally.infinite + tally.nan == 256)
    }
}
// Script entry point: print the table covering every `Float8` bit pattern.
Float8.debugPrintAllValues()
Before cleaning it up and relying on it, I'd like to take the opportunity to ask whether anyone more skilled than me would be willing to take a quick look and perhaps spot some obvious mistakes.
Edit: Corrected the code.