C++ Barycentric Function
// Computes the barycentric coordinates of point P with respect to triangle ABC,
// using only the x and y components (indices 0 and 1) of the four points.
//
// With u = cross(s[0], s[1]), the relation u.x*(C-A) + u.y*(B-A) + u.z*(A-P) = 0 holds for
// both x and y, so P = A + (u.x/u.z)*(C-A) + (u.y/u.z)*(B-A).
// NOTE(review): this assumes cross() is the standard 3D cross product — confirm against its definition.
//
// Returns the weights in the order (A, B, C): (1 - (u.x+u.y)/u.z, u.y/u.z, u.x/u.z).
// Returns (-1, 1, 1) when |u.z| is not greater than 1e-2.
Vec3f barycentric(Vec3f A, Vec3f B, Vec3f C, Vec3f P) {
Vec3f s[2];
// The condition `i--` tests the old value and then decrements, so the body runs for i = 1, then i = 0.
for (int i=2; i--; ) {
s[i][0] = C[i]-A[i];
s[i][1] = B[i]-A[i];
s[i][2] = A[i]-P[i];
}
Vec3f u = cross(s[0], s[1]);
if (std::abs(u[2])>1e-2) // u[2] is a float here, not an integer. A magnitude near zero means triangle ABC is degenerate. 1e-2 is a double literal, so the comparison is carried out in double precision
return Vec3f(1.f-(u.x+u.y)/u.z, u.y/u.z, u.x/u.z);
return Vec3f(-1,1,1); // degenerate case: return a negative coordinate so the rasterizer throws the point away
}
C++ Assembly
; Compiler output (AArch64) for the C++ barycentric(Vec3f A, Vec3f B, Vec3f C, Vec3f P).
; NOTE(review): the register/stack-to-argument mapping in the comments below is inferred by
; matching the arithmetic against the C++ source (A in s0-s2, B in s3-s5, C and P on the
; stack) — confirm against the platform ABI.
__Z11barycentric3vecILm3EfES0_S0_S0_: ; @_Z11barycentric3vecILm3EfES0_S0_S0_
.cfi_startproc
; %bb.0:
ldp s5, s2, [sp] ; presumably C.x, C.y
fsub s5, s5, s0 ; s[0][0] = C.x - A.x
fsub s3, s3, s0 ; s[0][1] = B.x - A.x
fsub s6, s2, s1 ; s[1][0] = C.y - A.y
fsub s4, s4, s1 ; s[1][1] = B.y - A.y
; u.z = s[0][0]*s[1][1] - s[0][1]*s[1][0], computed as a negated multiply followed by a fused multiply-add
fnmul s2, s3, s6
fmadd s2, s5, s4, s2
fabs s7, s2 ; |u.z|
fcvt d7, s7 ; widen to double for the comparison against the double literal 1e-2
; Build the 64-bit pattern 0x3F847AE147AE147B (the double 1e-2) 16 bits at a time
mov x8, #5243 ; =0x147b
movk x8, #18350, lsl #16
movk x8, #31457, lsl #32
movk x8, #16260, lsl #48
fmov d16, x8
fcmp d7, d16
b.le LBB1_2 ; |u.z| is not greater than 1e-2: take the degenerate-triangle path
; %bb.1:
ldp s16, s7, [sp, #12] ; presumably P.x, P.y
fsub s1, s1, s7 ; s[1][2] = A.y - P.y
fsub s0, s0, s16 ; s[0][2] = A.x - P.x
; u.y = s[0][2]*s[1][0] - s[0][0]*s[1][2]
fnmul s5, s5, s1
fmadd s5, s0, s6, s5
; u.x = s[0][1]*s[1][2] - s[0][2]*s[1][1]
fnmul s0, s0, s4
fmadd s3, s3, s1, s0
fadd s0, s3, s5 ; u.x + u.y
fdiv s0, s0, s2 ; (u.x + u.y) / u.z
fmov s1, #1.00000000
fsub s0, s1, s0 ; result x = 1 - (u.x + u.y) / u.z
fdiv s1, s5, s2 ; result y = u.y / u.z
fdiv s2, s3, s2 ; result z = u.x / u.z
ret
LBB1_2:
; Degenerate case: return (-1, 1, 1) in s0, s1, s2
fmov s1, #1.00000000
fmov s0, #-1.00000000
fmov s2, #1.00000000
ret
.cfi_endproc
This is compared to this:
Swift Barycentric Function
/// Computes the barycentric coordinates of point `p` with respect to triangle `a`, `b`, `c`,
/// using only the x and y components of the four points.
///
/// - Returns: `(1 - (u.x + u.y) / u.z, u.y / u.z, u.x / u.z)` where `u = s0 ^ s1`,
///   or `(-1, 1, 1)` when `abs(u.z)` is not greater than `1e-2`.
func barycentric(_ a: Vec3r, _ b: Vec3r, _ c: Vec3r, _ p: Vec3r) -> Vec3r {
// s0 collects the x-component differences and s1 the y-component differences.
var s0 = Vec3r()
var s1 = Vec3r()
s1.x = c.y - a.y
s1.y = b.y - a.y
s1.z = a.y - p.y
s0.x = c.x - a.x
s0.y = b.x - a.x
s0.z = a.x - p.x
// NOTE(review): `^` is presumably Vec3r's cross-product operator — confirm against its definition.
let u = s0 ^ s1
// A u.z close to zero is treated as a degenerate triangle; the else branch then returns a
// vector with a negative first component.
if abs(u.z) > 1e-2 {
return Vec3r(x: 1.0 - (u.x + u.y) / u.z, y: u.y / u.z, z: u.x / u.z)
} else {
return Vec3r(x: -1, y: 1, z: 1)
}
}
Swift Assembly
; Compiler output (AArch64) for the Swift barycentric(_:_:_:_:).
; NOTE(review): the meaning of the called symbols and the register-to-argument mapping in the
; comments below are inferred from the mangled names and by matching the arithmetic against
; the Swift source — confirm with `swift demangle` and the platform ABI.
_$s4main11barycentricyAA5Vec3rVAD_A3DtF:
.cfi_startproc
; Prologue: reserve 80 bytes and save d8-d15 plus the frame pointer and link register
stp d15, d14, [sp, #-80]!
stp d13, d12, [sp, #16]
stp d11, d10, [sp, #32]
stp d9, d8, [sp, #48]
stp x29, x30, [sp, #64]
add x29, sp, #64
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
.cfi_offset b8, -24
.cfi_offset b9, -32
.cfi_offset b10, -40
.cfi_offset b11, -48
.cfi_offset b12, -56
.cfi_offset b13, -64
.cfi_offset b14, -72
.cfi_offset b15, -80
; Move the incoming values into the registers saved above so they survive the calls below
fmov s8, s7 ; presumably c.y
fmov s9, s6 ; presumably c.x
fmov s10, s4 ; presumably b.y
fmov s11, s3 ; presumably b.x
fmov s12, s1 ; presumably a.y
fmov s13, s0 ; presumably a.x
ldp s14, s15, [x29, #20] ; presumably p.x, p.y, passed on the stack
; Two out-of-line calls, which look like Vec3r.init() for `var s0 = Vec3r()` and `var s1 = Vec3r()`.
; s0-s5 are overwritten immediately afterwards, so the returned values are not used here.
bl _$s4main5Vec3rVACycfC
bl _$s4main5Vec3rVACycfC
fsub s3, s8, s12 ; s1.x = c.y - a.y
fsub s4, s10, s12 ; s1.y = b.y - a.y
fsub s5, s12, s15 ; s1.z = a.y - p.y
fsub s0, s9, s13 ; s0.x = c.x - a.x
fsub s1, s11, s13 ; s0.y = b.x - a.x
fsub s2, s13, s14 ; s0.z = a.x - p.x
bl _$s4main5Vec3rV1xoiyA2C_ACtFZ ; looks like the `^` operator (u = s0 ^ s1), also an out-of-line call; u comes back in s0, s1, s2
fabs s3, s2 ; |u.z|
; Build the 32-bit pattern 0x3C23D70A (the single-precision value 1e-2)
mov w8, #55050
movk w8, #15395, lsl #16
fmov s4, w8
fcmp s3, s4 ; compared in single precision
b.le LBB0_2 ; |u.z| is not greater than 1e-2: take the else branch
fadd s3, s0, s1 ; u.x + u.y
fdiv s3, s3, s2 ; (u.x + u.y) / u.z
fmov s4, #1.00000000
fsub s3, s4, s3 ; x = 1 - (u.x + u.y) / u.z
fdiv s1, s1, s2 ; y = u.y / u.z
fdiv s2, s0, s2 ; z = u.x / u.z
fmov s0, s3
b LBB0_3
LBB0_2:
; Else branch: x = -1, y = 1, z = 1
fmov s0, #-1.00000000
fmov s1, #1.00000000
fmov s2, #1.00000000
LBB0_3:
bl _$s4main5Vec3rV1x1y1zACSf_S2ftcfC ; looks like Vec3r.init(x:y:z:), called with s0, s1, s2
; Epilogue: restore the saved registers and release the 80-byte frame
ldp x29, x30, [sp, #64]
ldp d9, d8, [sp, #48]
ldp d11, d10, [sp, #32]
ldp d13, d12, [sp, #16]
ldp d15, d14, [sp], #80
ret
.cfi_endproc
The C++ version seems to use fewer instructions, but looking at the non-inlined function is probably not relevant to my case.
At the moment I'm assuming the issue must be in the calling function and in the way barycentric
is inlined there. I'm not good at reading assembly, but this stands out to me. Here's the call site of the inlined barycentric
(I think) in the C++ version:
ldp s1, s2, [x21]
ldp s0, s6, [x21, #12]
ldp s3, s5, [x21, #24]
fsub s4, s3, s1
fsub s3, s0, s1
fsub s5, s5, s2
fsub s6, s6, s2
fnmul s0, s3, s5
fmadd s0, s4, s6, s0
fabs s7, s0
And here's that call site in the Swift version:
+0x468 ldp s0, s15, [x21, #0x20]
+0x46c ldp s1, s2, [x21, #0x2c]
+0x470 ldp s3, s4, [x21, #0x38]
+0x474 fsub s4, s4, s15
+0x478 fsub s2, s2, s15
+0x47c fsub s13, s3, s0
+0x480 fsub s8, s1, s0
+0x484 fsub s0, s0, s10
+0x488 fmul s18, s0, s2
+0x48c fmul s19, s0, s4
+0x490 fmul s0, s2, s13
+0x494 fmul s1, s8, s4
+0x498 fsub s9, s0, s1
+0x49c fabd s14, s0, s1
I'm assuming those fmuls come from the cross product, and that the fabs and fabd instructions come from the call to abs().
The codegen is quite different for what (to me) seems like fairly similar code, but I don't know enough to say whether it's slower.