123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208 |
- /**
- * Marlin 3D Printer Firmware
- * Copyright (c) 2020 MarlinFirmware [https://github.com/MarlinFirmware/Marlin]
- *
- * Based on Sprinter and grbl.
- * Copyright (c) 2011 Camiel Gubbels / Erik van der Zalm
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- *
- */
- #pragma once
-
- #include "../../inc/MarlinConfigPre.h"
-
- /**
- * Busy wait delay cycles routines:
- *
- * DELAY_CYCLES(count): Delay execution in cycles
- * DELAY_NS(count): Delay execution in nanoseconds
- * DELAY_US(count): Delay execution in microseconds
- */
-
- #include "../../core/macros.h"
-
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably calibrates the runtime cycle-delay loop - confirm against the implementation.
void calibrate_delay_loop();
-
- #if defined(__arm__) || defined(__thumb__)
-
// Threshold (in cycles) used when picking a delay strategy for compile-time counts.
// The original author measured about 36 cycles to call the cycle-waiting routine on their system;
// 40 leaves a little margin and only consumes a bit more flash.
#define TRIP_POINT_FOR_CALLING_FUNCTION 40

// The cycle delay should have the lowest possible overhead, so it is reached through a
// function pointer that is adjusted at runtime to use the best feature of the current CPU.
typedef void (*DelayImpl)(uint32_t);
extern DelayImpl DelayCycleFnc;
-
- // A simple recursive template class that output exactly one 'nop' of code per recursion
- template <int N> struct NopWriter {
- FORCE_INLINE static void build() {
- __asm__ __volatile__("nop");
- NopWriter<N-1>::build();
- }
- };
- // End the loop
- template <> struct NopWriter<0> { FORCE_INLINE static void build() {} };
-
- namespace Private {
- // Split recursing template in 2 different class so we don't reach the maximum template instantiation depth limit
- template <bool belowTP, int N> struct Helper {
- FORCE_INLINE static void build() {
- DelayCycleFnc(N - 2); // Approximative cost of calling the function (might be off by one or 2 cycles)
- }
- };
-
- template <int N> struct Helper<true, N> {
- FORCE_INLINE static void build() {
- NopWriter<N - 1>::build();
- }
- };
-
- template <> struct Helper<true, 0> {
- FORCE_INLINE static void build() {}
- };
-
- }
- // Select a behavior based on the constexpr'ness of the parameter
- // If called with a compile-time parameter, then write as many NOP as required to reach the asked cycle count
- // (there is some tripping point here to start looping when it's more profitable than gruntly executing NOPs)
- // If not called from a compile-time parameter, fallback to a runtime loop counting version instead
- template <bool compileTime, int Cycles>
- struct SmartDelay {
- FORCE_INLINE SmartDelay(int) {
- if (Cycles == 0) return;
- Private::Helper<Cycles < TRIP_POINT_FOR_CALLING_FUNCTION, Cycles>::build();
- }
- };
- // Runtime version below. There is no way this would run under less than ~TRIP_POINT_FOR_CALLING_FUNCTION cycles
- template <int T>
- struct SmartDelay<false, T> {
- FORCE_INLINE SmartDelay(int v) { DelayCycleFnc(v); }
- };
-
// Delay execution for X cycles.
// SmartDelay is instantiated with IS_CONSTEXPR(X) as its first argument and, when that is true,
// with X itself as the cycle count (0 otherwise); X is also passed to the constructor for the runtime case.
// X is parenthesized inside the template argument list so that an argument containing '>' or '>>'
// (e.g. DELAY_CYCLES(n >> 1)) cannot terminate the list early.
// Note that X appears several times in the expansion.
#define DELAY_CYCLES(X) do { SmartDelay<IS_CONSTEXPR(X), IS_CONSTEXPR(X) ? (X) : 0> _smrtdly_X(X); } while(0)

// For a delay in microseconds no smart delay selection is required: call the delay function directly.
// The Teensy compiler is too old and does not handle the compile-time / run-time smart delay selection correctly.
// The cycle count is x times F_CPU expressed in whole MHz (integer division).
#define DELAY_US(x) DelayCycleFnc((x) * ((F_CPU) / 1000000UL))
-
- #elif defined(__AVR__)
- FORCE_INLINE static void __delay_up_to_3c(uint8_t cycles) {
- switch (cycles) {
- case 3:
- __asm__ __volatile__(A("RJMP .+0") A("NOP"));
- break;
- case 2:
- __asm__ __volatile__(A("RJMP .+0"));
- break;
- case 1:
- __asm__ __volatile__(A("NOP"));
- break;
- }
- }
-
- // Delay in cycles
- FORCE_INLINE static void DELAY_CYCLES(uint16_t cycles) {
- if (__builtin_constant_p(cycles)) {
- if (cycles <= 3) {
- __delay_up_to_3c(cycles);
- }
- else if (cycles == 4) {
- __delay_up_to_3c(2);
- __delay_up_to_3c(2);
- }
- else {
- cycles -= 1 + 4; // Compensate for the first LDI (1) and the first round (4)
- __delay_up_to_3c(cycles % 4);
-
- cycles /= 4;
- // The following code burns [1 + 4 * (rounds+1)] cycles
- uint16_t dummy;
- __asm__ __volatile__(
- // "manually" load counter from constants, otherwise the compiler may optimize this part away
- A("LDI %A[rounds], %[l]") // 1c
- A("LDI %B[rounds], %[h]") // 1c (compensating the non branching BRCC)
- L("1")
- A("SBIW %[rounds], 1") // 2c
- A("BRCC 1b") // 2c when branching, else 1c (end of loop)
- : // Outputs ...
- [rounds] "=w" (dummy) // Restrict to a wo (=) 16 bit register pair (w)
- : // Inputs ...
- [l] "M" (cycles%256), // Restrict to 0..255 constant (M)
- [h] "M" (cycles/256) // Restrict to 0..255 constant (M)
- :// Clobbers ...
- "cc" // Indicate we are modifying flags like Carry (cc)
- );
- }
- }
- else {
- __asm__ __volatile__(
- L("1")
- A("SBIW %[cycles], 4") // 2c
- A("BRCC 1b") // 2c when branching, else 1c (end of loop)
- : [cycles] "+w" (cycles) // output: Restrict to a rw (+) 16 bit register pair (w)
- : // input: -
- : "cc" // clobbers: We are modifying flags like Carry (cc)
- );
- }
- }
-
// Delay in microseconds: x is multiplied by F_CPU expressed in whole MHz (integer division)
// and the resulting cycle count is handed to DELAY_CYCLES.
// DELAY_CYCLES takes a 16-bit count here, so the product must fit in 16 bits to avoid truncation.
#define DELAY_US(x) DELAY_CYCLES((x) * ((F_CPU) / 1000000UL))
-
- #elif defined(ESP32) || defined(__PLAT_LINUX__) || defined(__PLAT_NATIVE_SIM__)
-
// DELAY_CYCLES is not defined in this branch; it is specified inside the platform code.

// Delay in microseconds: x is multiplied by F_CPU expressed in whole MHz (integer division)
// and the resulting cycle count is handed to the platform's DELAY_CYCLES.
#define DELAY_US(x) DELAY_CYCLES((x) * ((F_CPU) / 1000000UL))
- #else
-
- #error "Unsupported MCU architecture"
-
- #endif
-
- /**************************************************************
- * Delay in nanoseconds. Requires the F_CPU macro.
- * These macros follow avr-libc delay conventions.
- *
- * For AVR there are three possible operation modes, due to its
- * slower clock speeds and thus coarser delay resolution. For
- * example, when F_CPU = 16000000 the resolution is 62.5ns.
- *
- * Round up (default)
- * Round up the delay according to the CPU clock resolution.
- * e.g., 100 will give a delay of 2 cycles (125ns).
- *
- * Round down (DELAY_NS_ROUND_DOWN)
- * Round down the delay according to the CPU clock resolution.
- * e.g., 100 will be rounded down to 1 cycle (62.5ns).
- *
- * Nearest (DELAY_NS_ROUND_CLOSEST)
- * Round the delay to the nearest number of clock cycles.
- * e.g., 165 will be rounded up to 3 cycles (187.5ns) because
- * it's closer to the requested delay than 2 cycle (125ns).
- */
-
- #ifndef __AVR__
- #undef DELAY_NS_ROUND_DOWN
- #undef DELAY_NS_ROUND_CLOSEST
- #endif
-
- #if ENABLED(DELAY_NS_ROUND_DOWN)
- #define DELAY_NS(x) DELAY_CYCLES((x) * ((F_CPU) / 1000000UL) / 1000UL) // floor
- #elif ENABLED(DELAY_NS_ROUND_CLOSEST)
- #define DELAY_NS(x) DELAY_CYCLES(((x) * ((F_CPU) / 1000000UL) + 500) / 1000UL) // round
- #else
- #define DELAY_NS(x) DELAY_CYCLES(((x) * ((F_CPU) / 1000000UL) + 999) / 1000UL) // "ceil"
- #endif
|