@@ -23,92 +23,95 @@
 #ifndef _MATH_AVR_H_
 #define _MATH_AVR_H_
 
-#define A(CODE) " " CODE "\n\t"
-
 /**
  * Optimized math functions for AVR
  */
 
 // intRes = longIn1 * longIn2 >> 24
 // uses:
-// r26 to store 0
-// r27 to store bits 16-23 of the 48bit result. The top bit is used to round the two byte result.
+// A[tmp] to store 0
+// B[tmp] to store bits 16-23 of the 48bit result. The top bit is used to round the two byte result.
 // note that the lower two bytes and the upper byte of the 48bit result are not calculated.
 // this can cause the result to be out by one as the lower bytes may cause carries into the upper ones.
-// B0 A0 are bits 24-39 and are the returned value
-// C1 B1 A1 is longIn1
-// D2 C2 B2 A2 is longIn2
+// B A are bits 24-39 and are the returned value
+// C B A is longIn1
+// D C B A is longIn2
 //
-#define MultiU24X32toH16(intRes, longIn1, longIn2) \
-  asm volatile ( \
-    A("clr r26") \
-    A("mul %A1, %B2") \
-    A("mov r27, r1") \
-    A("mul %B1, %C2") \
-    A("movw %A0, r0") \
-    A("mul %C1, %C2") \
-    A("add %B0, r0") \
-    A("mul %C1, %B2") \
-    A("add %A0, r0") \
-    A("adc %B0, r1") \
-    A("mul %A1, %C2") \
-    A("add r27, r0") \
-    A("adc %A0, r1") \
-    A("adc %B0, r26") \
-    A("mul %B1, %B2") \
-    A("add r27, r0") \
-    A("adc %A0, r1") \
-    A("adc %B0, r26") \
-    A("mul %C1, %A2") \
-    A("add r27, r0") \
-    A("adc %A0, r1") \
-    A("adc %B0, r26") \
-    A("mul %B1, %A2") \
-    A("add r27, r1") \
-    A("adc %A0, r26") \
-    A("adc %B0, r26") \
-    A("lsr r27") \
-    A("adc %A0, r26") \
-    A("adc %B0, r26") \
-    A("mul %D2, %A1") \
-    A("add %A0, r0") \
-    A("adc %B0, r1") \
-    A("mul %D2, %B1") \
-    A("add %B0, r0") \
-    A("clr r1") \
-    : \
-    "=&r" (intRes) \
-    : \
-    "d" (longIn1), \
-    "d" (longIn2) \
-    : \
-    "r26" , "r27" \
-  )
+static FORCE_INLINE uint16_t MultiU24X32toH16(uint32_t longIn1, uint32_t longIn2) {
+  register uint8_t tmp1;
+  register uint8_t tmp2;
+  register uint16_t intRes;
+  __asm__ __volatile__(
+    A("clr %[tmp1]")
+    A("mul %A[longIn1], %B[longIn2]")
+    A("mov %[tmp2], r1")
+    A("mul %B[longIn1], %C[longIn2]")
+    A("movw %A[intRes], r0")
+    A("mul %C[longIn1], %C[longIn2]")
+    A("add %B[intRes], r0")
+    A("mul %C[longIn1], %B[longIn2]")
+    A("add %A[intRes], r0")
+    A("adc %B[intRes], r1")
+    A("mul %A[longIn1], %C[longIn2]")
+    A("add %[tmp2], r0")
+    A("adc %A[intRes], r1")
+    A("adc %B[intRes], %[tmp1]")
+    A("mul %B[longIn1], %B[longIn2]")
+    A("add %[tmp2], r0")
+    A("adc %A[intRes], r1")
+    A("adc %B[intRes], %[tmp1]")
+    A("mul %C[longIn1], %A[longIn2]")
+    A("add %[tmp2], r0")
+    A("adc %A[intRes], r1")
+    A("adc %B[intRes], %[tmp1]")
+    A("mul %B[longIn1], %A[longIn2]")
+    A("add %[tmp2], r1")
+    A("adc %A[intRes], %[tmp1]")
+    A("adc %B[intRes], %[tmp1]")
+    A("lsr %[tmp2]")
+    A("adc %A[intRes], %[tmp1]")
+    A("adc %B[intRes], %[tmp1]")
+    A("mul %D[longIn2], %A[longIn1]")
+    A("add %A[intRes], r0")
+    A("adc %B[intRes], r1")
+    A("mul %D[longIn2], %B[longIn1]")
+    A("add %B[intRes], r0")
+    A("clr r1")
+    : [intRes] "=&r" (intRes),
+      [tmp1] "=&r" (tmp1),
+      [tmp2] "=&r" (tmp2)
+    : [longIn1] "d" (longIn1),
+      [longIn2] "d" (longIn2)
+    : "cc"
+  );
+  return intRes;
+}
 
 // intRes = intIn1 * intIn2 >> 16
 // uses:
 // r26 to store 0
 // r27 to store the byte 1 of the 24 bit result
-#define MultiU16X8toH16(intRes, charIn1, intIn2) \
-  asm volatile ( \
-    A("clr r26") \
-    A("mul %A1, %B2") \
-    A("movw %A0, r0") \
-    A("mul %A1, %A2") \
-    A("add %A0, r1") \
-    A("adc %B0, r26") \
-    A("lsr r0") \
-    A("adc %A0, r26") \
-    A("adc %B0, r26") \
-    A("clr r1") \
-    : \
-    "=&r" (intRes) \
-    : \
-    "d" (charIn1), \
-    "d" (intIn2) \
-    : \
-    "r26" \
-  )
-
+static FORCE_INLINE uint16_t MultiU16X8toH16(uint8_t charIn1, uint16_t intIn2) {
+  register uint8_t tmp;
+  register uint16_t intRes;
+  __asm__ __volatile__ (
+    A("clr %[tmp]")
+    A("mul %[charIn1], %B[intIn2]")
+    A("movw %A[intRes], r0")
+    A("mul %[charIn1], %A[intIn2]")
+    A("add %A[intRes], r1")
+    A("adc %B[intRes], %[tmp]")
+    A("lsr r0")
+    A("adc %A[intRes], %[tmp]")
+    A("adc %B[intRes], %[tmp]")
+    A("clr r1")
+    : [intRes] "=&r" (intRes),
+      [tmp] "=&r" (tmp)
+    : [charIn1] "d" (charIn1),
+      [intIn2] "d" (intIn2)
+    : "cc"
+  );
+  return intRes;
+}
 
 #endif // _MATH_AVR_H_
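One practical note on the conversion: the old macros wrote their result into an `intRes` out-parameter, while the new inline functions return it, so call sites change from `MultiU24X32toH16(intRes, a, b);` to `intRes = MultiU24X32toH16(a, b);`. The `A()` stringizing macro (its local definition is removed here) and `FORCE_INLINE` are assumed to be provided by another header included before this one.

For checking the routines on a host machine, the following is a minimal portable-C sketch of what the two functions compute. It is not part of this change; the `_ref` names and the test values are hypothetical. These models truncate exactly, while the asm applies a one-bit rounding step and skips the low-order partial products, so the asm result may differ from them by one, as the comments in the header warn.

#include <assert.h>
#include <stdint.h>

/* Hypothetical reference model: bits 24..39 of the product of a
 * 24-bit longIn1 ("C B A") and a 32-bit longIn2 ("D C B A"),
 * i.e. longIn1 * longIn2 >> 24, truncated to 16 bits. The mask
 * models the asm reading only the low three bytes of longIn1. */
static uint16_t MultiU24X32toH16_ref(uint32_t longIn1, uint32_t longIn2) {
  return (uint16_t)(((uint64_t)(longIn1 & 0xFFFFFFUL) * longIn2) >> 24);
}

/* Hypothetical reference model: charIn1 acts as the high byte of a
 * 16-bit operand, so "intIn1 * intIn2 >> 16" reduces to
 * (charIn1 * intIn2) >> 8; the 24-bit product always fits in
 * 16 bits after the shift. */
static uint16_t MultiU16X8toH16_ref(uint8_t charIn1, uint16_t intIn2) {
  return (uint16_t)(((uint32_t)charIn1 * intIn2) >> 8);
}

int main(void) {
  /* 512 * 2^28 = 2^37; >> 24 gives 2^13 = 8192. */
  assert(MultiU24X32toH16_ref(0x000200UL, 0x10000000UL) == 8192);
  /* 128 * 512 = 65536; >> 8 gives 256. */
  assert(MultiU16X8toH16_ref(0x80, 0x0200) == 256);
  return 0;
}

The hand-written asm keeps only the partial products that can reach the returned 16 bits, which is what makes it cheaper than a full-width multiply on the AVR and also why, as noted above, the result can be out by one.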