cc @Erik_Eckstein. It looks like the optimizer is failing to do LICM in either case. With `magnitude`, it's instead turning the addition loop into a multiplication by 1 billion (though it still computes the magnitude):
%152 = tail call double @CACurrentMediaTime()
%153 = load double, double* getelementptr inbounds (%TSd, %TSd* @"$S3foo1aSdvp", i64 0, i32 0), align 8
%154 = tail call double @llvm.fabs.f64(double %153)
%155 = bitcast double %154 to i64
%.promoted = load i64, i64* getelementptr inbounds (%Ts6UInt64V, %Ts6UInt64V* @"$S3foo8checksums6UInt64Vvp", i64 0, i32 0), align 8
%156 = mul i64 %155, 1000000000
%157 = add i64 %.promoted, %156
store i64 %157, i64* getelementptr inbounds (%Ts6UInt64V, %Ts6UInt64V* @"$S3foo8checksums6UInt64Vvp", i64 0, i32 0), align 8
%158 = tail call double @CACurrentMediaTime()
On the other hand, with `nextDown`, it appears to be trying to vectorize the loop for no good reason:
%152 = tail call double @CACurrentMediaTime()
%153 = load double, double* getelementptr inbounds (%TSd, %TSd* @"$S3foo1aSdvp", i64 0, i32 0), align 8
%154 = fsub double 0.000000e+00, %153
%155 = fcmp olt double %154, 0x7FF0000000000000
%156 = bitcast double %154 to i64
%157 = ashr i64 %156, 63
%158 = or i64 %157, 1
%159 = add i64 %158, %156
%160 = bitcast i64 %159 to double
%161 = select i1 %155, double %160, double %154
%162 = fsub double -0.000000e+00, %161
%163 = bitcast double %162 to i64
%.promoted = load i64, i64* getelementptr inbounds (%Ts6UInt64V, %Ts6UInt64V* @"$S3foo8checksums6UInt64Vvp", i64 0, i32 0), align 8
%broadcast.splatinsert136 = insertelement <2 x i64> undef, i64 %163, i32 0
%broadcast.splat137 = shufflevector <2 x i64> %broadcast.splatinsert136, <2 x i64> undef, <2 x i32> zeroinitializer
%broadcast.splatinsert138 = insertelement <2 x i64> undef, i64 %163, i32 0
%broadcast.splat139 = shufflevector <2 x i64> %broadcast.splatinsert138, <2 x i64> undef, <2 x i32> zeroinitializer
%164 = insertelement <2 x i64> <i64 undef, i64 0>, i64 %.promoted, i32 0
br label %vector.body
vector.body: ; preds = %vector.body, %"$SSa13_adoptStorage_5countSayxG_SpyxGts016_ContiguousArrayB0CyxGn_SitFZSS_Tg5Tf4nnd_n.exit"
%index = phi i64 [ 0, %"$SSa13_adoptStorage_5countSayxG_SpyxGts016_ContiguousArrayB0CyxGn_SitFZSS_Tg5Tf4nnd_n.exit" ], [ %index.next.7, %vector.body ]
%vec.phi = phi <2 x i64> [ %164, %"$SSa13_adoptStorage_5countSayxG_SpyxGts016_ContiguousArrayB0CyxGn_SitFZSS_Tg5Tf4nnd_n.exit" ], [ %179, %vector.body ]
%vec.phi134 = phi <2 x i64> [ zeroinitializer, %"$SSa13_adoptStorage_5countSayxG_SpyxGts016_ContiguousArrayB0CyxGn_SitFZSS_Tg5Tf4nnd_n.exit" ], [ %180, %vector.body ]
%165 = add <2 x i64> %vec.phi, %broadcast.splat137
%166 = add <2 x i64> %vec.phi134, %broadcast.splat139
%167 = add <2 x i64> %165, %broadcast.splat137
%168 = add <2 x i64> %166, %broadcast.splat139
%169 = add <2 x i64> %167, %broadcast.splat137
%170 = add <2 x i64> %168, %broadcast.splat139
%171 = add <2 x i64> %169, %broadcast.splat137
%172 = add <2 x i64> %170, %broadcast.splat139
%173 = add <2 x i64> %171, %broadcast.splat137
%174 = add <2 x i64> %172, %broadcast.splat139
%175 = add <2 x i64> %173, %broadcast.splat137
%176 = add <2 x i64> %174, %broadcast.splat139
%177 = add <2 x i64> %175, %broadcast.splat137
%178 = add <2 x i64> %176, %broadcast.splat139
%179 = add <2 x i64> %177, %broadcast.splat137
%180 = add <2 x i64> %178, %broadcast.splat139
%index.next.7 = add nuw nsw i64 %index, 32
%181 = icmp eq i64 %index.next.7, 1000000000
br i1 %181, label %middle.block, label %vector.body, !llvm.loop !45
middle.block: ; preds = %vector.body
%bin.rdx = add <2 x i64> %180, %179
%rdx.shuf = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%bin.rdx140 = add <2 x i64> %bin.rdx, %rdx.shuf
%182 = extractelement <2 x i64> %bin.rdx140, i32 0
store i64 %182, i64* getelementptr inbounds (%Ts6UInt64V, %Ts6UInt64V* @"$S3foo8checksums6UInt64Vvp", i64 0, i32 0), align 8
%183 = tail call double @CACurrentMediaTime()
which is arguably an LLVM bug. There's still a bug on the Swift side: we aren't propagating the constant value of `a`. This appears to be specific to global code. If I factor the loop out into a local function, I get more reasonable-looking IR, in which the constant value of `a` gets propagated and the addition loop gets completely constant folded:
/*
let a = 1.0
func loop(checksum: inout UInt64) {
for _ in 0 ..< 1_000_000_000 {
let v = a.magnitude // <-- Replace with `a.nextDown` and you get the same thing
checksum = checksum &+ v.bitPattern
}
}
*/
define hidden swiftcc void @"$S3foo4loop8checksumys6UInt64Vz_tF"(%Ts6UInt64V* nocapture dereferenceable(8)) local_unnamed_addr #0 {
entry:
%._value = getelementptr inbounds %Ts6UInt64V, %Ts6UInt64V* %0, i64 0, i32 0
%._value.promoted = load i64, i64* %._value, align 8
%1 = add i64 %._value.promoted, 6917529027641081856
store i64 %1, i64* %._value, align 8
ret void
}
Possibly, because we model the variables in top-level code as global variables, the compiler is being unnecessarily conservative with them?