Hi Jens, Joe,
Thanks for your advice.
The program looks like this:
import AVR
// Presumably overrides the default type of untyped integer literals in this file to UInt8 — confirm.
typealias IntegerLiteralType = UInt8
// Handshake flag shared by the timer callback and the main loop below:
// the callback sets it to true after capturing a count, and the main loop
// sets it back to false once it has printed the result.
var shouldCalculateRPMs: Bool = false
// Value returned by currentTimer0Value() at the most recent capture.
var capturedPulseCount: UInt8 = 0
// Setup periodic reading of the timer pulse counter
executeAsync(after: UInt16(100), repeats: true) {
// *** This is an interrupt service routine (ISR), code must be fast ***
// If we are not already calculating RPMs
if !shouldCalculateRPMs {
// Capture current pulse count and reset counter
capturedPulseCount = currentTimer0Value()
timer0CounterReset()
// Let the main loop know that it is time to calculate RPMs
shouldCalculateRPMs = true
}
}
// Main loop: polls the flag and does the slow work outside the callback.
// NOTE(review): both shared variables are ordinary globals; nothing in this
// listing tells the compiler that they can change outside normal program flow.
// On the flag-false path the loop body performs no call or store at all.
while(true) {
if shouldCalculateRPMs {
// Calculate RPMs and print
// `&*` is wrapping multiplication: a count of 219 or more exceeds UInt16.max
// (219 * 300 = 65700) and wraps silently. The origin of the factor 300 is not
// shown in this listing.
let rpm: UInt16 = UInt16(capturedPulseCount) &* (300 as UInt16)
print(unsignedInt: rpm, addNewline: true)
// print(message: RPM)
// While the flag is still true the callback above skips its capture, so no
// new count is taken during this wait.
delay(ms: 500)
// Let ISR know we are done
shouldCalculateRPMs = false
}
}
The code calls various functions in an API consisting of simple Swift shims around underlying C API functions.
For example, there are functions that read values from the hardware (currentTimer0Value, timer0CounterReset) and functions that print debug messages or wait in a loop (print, delay); these are probably not important in this case.
The core function call is executeAsync, which looks something like this (some irrelevant stuff removed)…
/// Registers `callback` by calling `_setupTimer1InterruptCallback`.
///
/// Both arguments are forwarded unchanged to the C function.
///
/// - Parameters:
///   - after: Interval forwarded to the C function; the C prototype quoted in
///     this message names the corresponding parameter `tenthsOfAMillisecond`.
///   - callback: A `@convention(c)` function. Such a function cannot capture
///     context, so it can only reach program state through globals.
///
/// NOTE(review): the call sites quoted in this message also pass `repeats:`,
/// which this trimmed-down listing omits.
public func executeAsync(after: UInt16, callback: @escaping @convention(c) () -> Void) {
_setupTimer1InterruptCallback(after, callback)
}
The function _setupTimer1InterruptCallback is defined in the C API something like this…
/* Callback type: a non-null pointer to a function returning void. */
typedef void (* __nonnull externalInterruptCallback)();
/* Registers `callback` for the timer 1 interrupt. According to the surrounding
 * description, the implementation stores the pointer and calls it at regular
 * intervals from within an interrupt handler. The first parameter's name
 * suggests units of 0.1 ms — confirm against the implementation. */
void _setupTimer1InterruptCallback(uint16_t tenthsOfAMillisecond, externalInterruptCallback __nonnull callback);
The C API function implementation stores the function pointer in an internal variable and calls it at regular intervals from within an interrupt handler.
Looking at the LLVM IR, it compiles down to something like this…
… first part omitted for clarity...
; Single-field packed struct types; presumably the lowered forms of Swift's Bool (%Sb) and UInt8 (%Vs5UInt8).
%Sb = type <{ i1 }>
%Vs5UInt8 = type <{ i8 }>
; Zero-initialised storage for the two module-level variables; the mangled
; names embed `shouldCalculateRPMs` and `capturedPulseCount`.
@_Tv4main19shouldCalculateRPMsSb = hidden local_unnamed_addr global %Sb zeroinitializer, align 1
@_Tv4main18capturedPulseCountVs5UInt8 = hidden local_unnamed_addr global %Vs5UInt8 zeroinitializer, align 1
@__swift_reflection_version = linkonce_odr hidden constant i16 1
@llvm.used = appending global [1 x i8*] [i8* bitcast (i16* @__swift_reflection_version to i8*)], section "llvm.metadata"
; Function Attrs: noreturn
; Entry point for the first Swift listing. The flag is loaded exactly once
; (%.pr) before control reaches the loop; the loop itself (.backedge) contains
; no load, store or call.
define i32 @main(i32, i8** nocapture readnone) local_unnamed_addr #0 {
entry:
; Initialise both globals, then pass the callback thunk @_TToF4mainU_FT_T_ to executeAsync.
store i1 false, i1* getelementptr inbounds (%Sb, %Sb* @_Tv4main19shouldCalculateRPMsSb, i64 0, i32 0), align 1
store i8 0, i8* getelementptr inbounds (%Vs5UInt8, %Vs5UInt8* @_Tv4main18capturedPulseCountVs5UInt8, i64 0, i32 0), align 1
tail call void @_TF3AVR12executeAsyncFT5afterVs6UInt167repeatsSb8callbackcT_T__T_(i16 100, i1 true, i8* bitcast (void ()* @_TToF4mainU_FT_T_ to i8*))
; The only load of the flag in this function; it is a plain (non-volatile) load.
%.pr = load i1, i1* getelementptr inbounds (%Sb, %Sb* @_Tv4main19shouldCalculateRPMsSb, i64 0, i32 0), align 1
br i1 %.pr, label %2, label %.backedge.preheader
; <label>:2: ; preds = %entry
; Flag was true: multiply the captured count by 300, print it, delay 500, and
; clear the flag. This block is reachable only from entry, so it runs at most once.
%3 = load i8, i8* getelementptr inbounds (%Vs5UInt8, %Vs5UInt8* @_Tv4main18capturedPulseCountVs5UInt8, i64 0, i32 0), align 1
%4 = zext i8 %3 to i16
%5 = mul i16 %4, 300
tail call void @_TF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_(i16 %5, i1 true)
tail call void @_TF3AVR5delayFT2msVs6UInt16_T_(i16 500)
store i1 false, i1* getelementptr inbounds (%Sb, %Sb* @_Tv4main19shouldCalculateRPMsSb, i64 0, i32 0), align 1
br label %.backedge.preheader
.backedge.preheader: ; preds = %entry, %2
br label %.backedge
.backedge: ; preds = %.backedge.preheader, %.backedge
; Empty infinite loop: branches to itself without re-reading the flag.
br label %.backedge
}
declare void @_TF3AVR12executeAsyncFT5afterVs6UInt167repeatsSb8callbackcT_T__T_(i16, i1, i8*) local_unnamed_addr #1
; Callback thunk whose address @main passes to executeAsync as the i8* argument.
define linkonce_odr hidden void @_TToF4mainU_FT_T_() #1 {
entry:
; Return immediately if the flag is already set.
%0 = load i1, i1* getelementptr inbounds (%Sb, %Sb* @_Tv4main19shouldCalculateRPMsSb, i64 0, i32 0), align 1
br i1 %0, label %1, label %2
; <label>:1: ; preds = %entry, %2
ret void
; <label>:2: ; preds = %entry
; Store the value returned by currentTimer0Value, call timer0CounterReset, then set the flag.
%3 = tail call i8 @_TF3AVR18currentTimer0ValueFT_Vs5UInt8() #2
store i8 %3, i8* getelementptr inbounds (%Vs5UInt8, %Vs5UInt8* @_Tv4main18capturedPulseCountVs5UInt8, i64 0, i32 0), align 1
tail call void @_TF3AVR18timer0CounterResetFT_T_() #2
store i1 true, i1* getelementptr inbounds (%Sb, %Sb* @_Tv4main19shouldCalculateRPMsSb, i64 0, i32 0), align 1
br label %1
}
declare void @_TF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_(i16, i1) local_unnamed_addr #1
declare void @_TF3AVR5delayFT2msVs6UInt16_T_(i16) local_unnamed_addr #1
declare i8 @_TF3AVR18currentTimer0ValueFT_Vs5UInt8() local_unnamed_addr #1
declare void @_TF3AVR18timer0CounterResetFT_T_() local_unnamed_addr #1
As best I can see, the optimiser looks at the variable shouldCalculateRPMs, decides it can never change inside the while loop, and therefore treats the code inside while(true) {} as loop invariant and moves it before the loop, leaving an infinite loop with no code in it.
Interestingly, if I add something that should make no difference like a simple print statement just before the end of the loop, it seems to “short circuit” the loop invariant optimisation…
// Handshake flag shared by the timer callback and the main loop below:
// the callback sets it to true after capturing a count, and the main loop
// sets it back to false once it has printed the result.
var shouldCalculateRPMs: Bool = false
// Value returned by currentTimer0Value() at the most recent capture.
var capturedPulseCount: UInt8 = 0
// Setup periodic reading of the timer pulse counter
executeAsync(after: UInt16(100), repeats: true) {
// *** This is an interrupt service routine (ISR), code must be fast ***
// If we are not already calculating RPMs
if !shouldCalculateRPMs {
// Capture current pulse count and reset counter
capturedPulseCount = currentTimer0Value()
timer0CounterReset()
// Let the main loop know that it is time to calculate RPMs
shouldCalculateRPMs = true
}
}
// Main loop: same polling logic as the first listing, plus one extra call at
// the end of every iteration.
while(true) {
if shouldCalculateRPMs {
// Calculate RPMs and print
// `&*` is wrapping multiplication: a count of 219 or more exceeds UInt16.max
// (219 * 300 = 65700) and wraps silently.
let rpm: UInt16 = UInt16(capturedPulseCount) &* (300 as UInt16)
print(unsignedInt: rpm, addNewline: true)
// print(message: RPM)
// While the flag is still true the callback above skips its capture, so no
// new count is taken during this wait.
delay(ms: 500)
// Let ISR know we are done
shouldCalculateRPMs = false
}
// The only difference from the first listing: a call that executes even when
// the flag is false.
// NOTE(review): presumably this call is opaque to the optimiser, so it must
// assume the globals may have changed and reload the flag — confirm.
print(unsignedInt: 1, addNewline: true)
}
This produces the “right” LLVM IR…
; Single-field packed struct types; presumably the lowered forms of Swift's Bool (%Sb) and UInt8 (%Vs5UInt8).
%Sb = type <{ i1 }>
%Vs5UInt8 = type <{ i8 }>
; Zero-initialised storage for the two module-level variables; the mangled
; names embed `shouldCalculateRPMs` and `capturedPulseCount`.
@_Tv4main19shouldCalculateRPMsSb = hidden local_unnamed_addr global %Sb zeroinitializer, align 1
@_Tv4main18capturedPulseCountVs5UInt8 = hidden local_unnamed_addr global %Vs5UInt8 zeroinitializer, align 1
@__swift_reflection_version = linkonce_odr hidden constant i16 1
@llvm.used = appending global [1 x i8*] [i8* bitcast (i16* @__swift_reflection_version to i8*)], section "llvm.metadata"
; Function Attrs: noreturn
; Entry point for the second Swift listing (the one with the extra print).
; Here the flag load (%3) sits inside the loop at label %2, so the flag is
; re-read on every iteration.
define i32 @main(i32, i8** nocapture readnone) local_unnamed_addr #0 {
entry:
; Initialise both globals, then pass the callback thunk @_TToF4mainU_FT_T_ to executeAsync.
store i1 false, i1* getelementptr inbounds (%Sb, %Sb* @_Tv4main19shouldCalculateRPMsSb, i64 0, i32 0), align 1
store i8 0, i8* getelementptr inbounds (%Vs5UInt8, %Vs5UInt8* @_Tv4main18capturedPulseCountVs5UInt8, i64 0, i32 0), align 1
tail call void @_TF3AVR12executeAsyncFT5afterVs6UInt167repeatsSb8callbackcT_T__T_(i16 100, i1 true, i8* bitcast (void ()* @_TToF4mainU_FT_T_ to i8*))
br label %2
; <label>:2: ; preds = %8, %entry
; Loop header: load the flag afresh on each pass.
%3 = load i1, i1* getelementptr inbounds (%Sb, %Sb* @_Tv4main19shouldCalculateRPMsSb, i64 0, i32 0), align 1
br i1 %3, label %4, label %8
; <label>:4: ; preds = %2
; Flag was true: multiply the captured count by 300, print it, delay 500, and clear the flag.
%5 = load i8, i8* getelementptr inbounds (%Vs5UInt8, %Vs5UInt8* @_Tv4main18capturedPulseCountVs5UInt8, i64 0, i32 0), align 1
%6 = zext i8 %5 to i16
%7 = mul i16 %6, 300
tail call void @_TF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_(i16 %7, i1 true)
tail call void @_TF3AVR5delayFT2msVs6UInt16_T_(i16 500)
store i1 false, i1* getelementptr inbounds (%Sb, %Sb* @_Tv4main19shouldCalculateRPMsSb, i64 0, i32 0), align 1
br label %8
; <label>:8: ; preds = %2, %4
; The extra print call, executed on both paths, followed by the back edge to the loop header.
tail call void @_TF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_(i16 1, i1 true)
br label %2
}
So now it’s checking @_Tv4main19shouldCalculateRPMsSb every time, and it all works!
Furthermore, I can comment out the call to executeAsync… it makes no difference.
The only thing that makes the compiler not do the loop invariant code optimisation is adding some “grist” like the print statement or a delay.
My best guess is that the closure being passed to executeAsync is passed with @convention(c), meaning it does not capture any variables (the variables are global value types so they shouldn’t need capture)… so the compiler has “no way of knowing” that the variable might be changed by the callback passed to executeAsync. That’s why the loop invariant optimisation is being done too aggressively.
Thoughts?
Carl