|
@@ -407,76 +407,76 @@ void Planner::init() {
|
407
|
407
|
|
408
|
408
|
__asm__ __volatile__(
|
409
|
409
|
// %8:%7:%6 = interval
|
410
|
|
- // r31:r30: MUST be those registers, and they must point to the inv_tab
|
|
410
|
+ // r31:r30: MUST be those registers, and they must point to the inv_tab
|
411
|
411
|
|
412
|
|
- " clr %13" "\n\t" // %13 = 0
|
|
412
|
+ " clr %13" "\n\t" // %13 = 0
|
413
|
413
|
|
414
|
|
- // Now we must compute
|
415
|
|
- // result = 0xFFFFFF / d
|
|
414
|
+ // Now we must compute
|
|
415
|
+ // result = 0xFFFFFF / d
|
416
|
416
|
// %8:%7:%6 = interval
|
417
|
|
- // %16:%15:%14 = nr
|
|
417
|
+ // %16:%15:%14 = nr
|
418
|
418
|
// %13 = 0
|
419
|
419
|
|
420
|
|
- // A plain division of 24x24 bits should take 388 cycles to complete. We will
|
|
420
|
+ // A plain division of 24x24 bits should take 388 cycles to complete. We will
|
421
|
421
|
// use Newton-Raphson for the calculation, and will strive to get way less cycles
|
422
|
422
|
// for the same result - Using C division, it takes 500 cycles to complete.
|
423
|
423
|
|
424
|
|
- " clr %3" "\n\t" // idx = 0
|
|
424
|
+ " clr %3" "\n\t" // idx = 0
|
425
|
425
|
" mov %14,%6" "\n\t"
|
426
|
426
|
" mov %15,%7" "\n\t"
|
427
|
|
- " mov %16,%8" "\n\t" // nr = interval
|
428
|
|
- " tst %16" "\n\t" // nr & 0xFF0000 == 0 ?
|
429
|
|
- " brne 2f" "\n\t" // No, skip this
|
|
427
|
+ " mov %16,%8" "\n\t" // nr = interval
|
|
428
|
+ " tst %16" "\n\t" // nr & 0xFF0000 == 0 ?
|
|
429
|
+ " brne 2f" "\n\t" // No, skip this
|
430
|
430
|
" mov %16,%15" "\n\t"
|
431
|
|
- " mov %15,%14" "\n\t" // nr <<= 8, %14 not needed
|
432
|
|
- " subi %3,-8" "\n\t" // idx += 8
|
433
|
|
- " tst %16" "\n\t" // nr & 0xFF0000 == 0 ?
|
434
|
|
- " brne 2f" "\n\t" // No, skip this
|
435
|
|
- " mov %16,%15" "\n\t" // nr <<= 8, %14 not needed
|
436
|
|
- " clr %15" "\n\t" // We clear %14
|
437
|
|
- " subi %3,-8" "\n\t" // idx += 8
|
438
|
|
-
|
439
|
|
- // here %16 != 0 and %16:%15 contains at least 9 MSBits, or both %16:%15 are 0
|
|
431
|
+ " mov %15,%14" "\n\t" // nr <<= 8, %14 not needed
|
|
432
|
+ " subi %3,-8" "\n\t" // idx += 8
|
|
433
|
+ " tst %16" "\n\t" // nr & 0xFF0000 == 0 ?
|
|
434
|
+ " brne 2f" "\n\t" // No, skip this
|
|
435
|
+ " mov %16,%15" "\n\t" // nr <<= 8, %14 not needed
|
|
436
|
+ " clr %15" "\n\t" // Clear %15 (its previous value was just moved up into %16)
|
|
437
|
+ " subi %3,-8" "\n\t" // idx += 8
|
|
438
|
+
|
|
439
|
+ // here %16 != 0 and %16:%15 contains at least 9 MSBits, or both %16:%15 are 0
|
440
|
440
|
"2:" "\n\t"
|
441
|
|
- " cpi %16,0x10" "\n\t" // (nr & 0xf00000) == 0 ?
|
442
|
|
- " brcc 3f" "\n\t" // No, skip this
|
443
|
|
- " swap %15" "\n\t" // Swap nibbles
|
444
|
|
- " swap %16" "\n\t" // Swap nibbles. Low nibble is 0
|
|
441
|
+ " cpi %16,0x10" "\n\t" // (nr & 0xf00000) == 0 ?
|
|
442
|
+ " brcc 3f" "\n\t" // No, skip this
|
|
443
|
+ " swap %15" "\n\t" // Swap nibbles
|
|
444
|
+ " swap %16" "\n\t" // Swap nibbles. Low nibble is 0
|
445
|
445
|
" mov %14, %15" "\n\t"
|
446
|
|
- " andi %14,0x0f" "\n\t" // Isolate low nibble
|
447
|
|
- " andi %15,0xf0" "\n\t" // Keep proper nibble in %15
|
448
|
|
- " or %16, %14" "\n\t" // %16:%15 <<= 4
|
449
|
|
- " subi %3,-4" "\n\t" // idx += 4
|
|
446
|
+ " andi %14,0x0f" "\n\t" // Isolate low nibble
|
|
447
|
+ " andi %15,0xf0" "\n\t" // Keep proper nibble in %15
|
|
448
|
+ " or %16, %14" "\n\t" // %16:%15 <<= 4
|
|
449
|
+ " subi %3,-4" "\n\t" // idx += 4
|
450
|
450
|
|
451
|
451
|
"3:" "\n\t"
|
452
|
|
- " cpi %16,0x40" "\n\t" // (nr & 0xc00000) == 0 ?
|
|
452
|
+ " cpi %16,0x40" "\n\t" // (nr & 0xc00000) == 0 ?
|
453
|
453
|
" brcc 4f" "\n\t" // No, skip this
|
454
|
454
|
" add %15,%15" "\n\t"
|
455
|
455
|
" adc %16,%16" "\n\t"
|
456
|
456
|
" add %15,%15" "\n\t"
|
457
|
|
- " adc %16,%16" "\n\t" // %16:%15 <<= 2
|
458
|
|
- " subi %3,-2" "\n\t" // idx += 2
|
|
457
|
+ " adc %16,%16" "\n\t" // %16:%15 <<= 2
|
|
458
|
+ " subi %3,-2" "\n\t" // idx += 2
|
459
|
459
|
|
460
|
460
|
"4:" "\n\t"
|
461
|
|
- " cpi %16,0x80" "\n\t" // (nr & 0x800000) == 0 ?
|
462
|
|
- " brcc 5f" "\n\t" // No, skip this
|
|
461
|
+ " cpi %16,0x80" "\n\t" // (nr & 0x800000) == 0 ?
|
|
462
|
+ " brcc 5f" "\n\t" // No, skip this
|
463
|
463
|
" add %15,%15" "\n\t"
|
464
|
|
- " adc %16,%16" "\n\t" // %16:%15 <<= 1
|
465
|
|
- " inc %3" "\n\t" // idx += 1
|
|
464
|
+ " adc %16,%16" "\n\t" // %16:%15 <<= 1
|
|
465
|
+ " inc %3" "\n\t" // idx += 1
|
466
|
466
|
|
467
|
467
|
// Now %16:%15 contains its MSBit set to 1, or %16:%15 is == 0. We are now absolutely sure
|
468
|
468
|
// we have at least 9 MSBits available to enter the initial estimation table
|
469
|
469
|
"5:" "\n\t"
|
470
|
470
|
" add %15,%15" "\n\t"
|
471
|
471
|
" adc %16,%16" "\n\t" // %16:%15 = tidx = (nr <<= 1), we lose the top MSBit (always set to 1, %16 is the index into the inverse table)
|
472
|
|
- " add r30,%16" "\n\t" // Only use top 8 bits
|
473
|
|
- " adc r31,%13" "\n\t" // r31:r30 = inv_tab + (tidx)
|
474
|
|
- " lpm %14, Z" "\n\t" // %14 = inv_tab[tidx]
|
475
|
|
- " ldi %15, 1" "\n\t" // %15 = 1 %15:%14 = inv_tab[tidx] + 256
|
|
472
|
+ " add r30,%16" "\n\t" // Only use top 8 bits
|
|
473
|
+ " adc r31,%13" "\n\t" // r31:r30 = inv_tab + (tidx)
|
|
474
|
+ " lpm %14, Z" "\n\t" // %14 = inv_tab[tidx]
|
|
475
|
+ " ldi %15, 1" "\n\t" // %15 = 1 %15:%14 = inv_tab[tidx] + 256
|
476
|
476
|
|
477
|
477
|
// We must scale the approximation to the proper place
|
478
|
|
- " clr %16" "\n\t" // %16 will always be 0 here
|
479
|
|
- " subi %3,8" "\n\t" // idx == 8 ?
|
|
478
|
+ " clr %16" "\n\t" // %16 will always be 0 here
|
|
479
|
+ " subi %3,8" "\n\t" // idx == 8 ?
|
480
|
480
|
" breq 6f" "\n\t" // yes, no need to scale
|
481
|
481
|
" brcs 7f" "\n\t" // If C=1, means idx < 8, result was negative!
|
482
|
482
|
|
|
@@ -503,13 +503,13 @@ void Planner::init() {
|
503
|
503
|
" or %15,%12" "\n\t" // %15:%16 <<= 4
|
504
|
504
|
"16:" "\n\t"
|
505
|
505
|
" sbrs %3,3" "\n\t" // shift by 8bits position?
|
506
|
|
- " rjmp 6f" "\n\t" // No, we are done
|
|
506
|
+ " rjmp 6f" "\n\t" // No, we are done
|
507
|
507
|
" mov %16,%15" "\n\t"
|
508
|
508
|
" mov %15,%14" "\n\t"
|
509
|
509
|
" clr %14" "\n\t"
|
510
|
510
|
" jmp 6f" "\n\t"
|
511
|
511
|
|
512
|
|
- // idx < 8, now %3 = idx - 8. Get the count of bits
|
|
512
|
+ // idx < 8, now %3 = idx - 8. Get the count of bits
|
513
|
513
|
"7:" "\n\t"
|
514
|
514
|
" neg %3" "\n\t" // %3 = -idx = count of bits to move right. idx range:[1...8]
|
515
|
515
|
" sbrs %3,0" "\n\t" // shift by 1 bit position ?
|
|
@@ -541,7 +541,7 @@ void Planner::init() {
|
541
|
541
|
// Now, we must refine the estimation present on %16:%15:%14 using 1 iteration
|
542
|
542
|
// of Newton-Raphson. As it has a quadratic convergence, 1 iteration is enough
|
543
|
543
|
// to get more than 18bits of precision (the initial table lookup gives 9 bits of
|
544
|
|
- // precision to start from). 18bits of precision is all what is needed here for result
|
|
544
|
+ // precision to start from). 18 bits of precision is all that is needed here for the result
|
545
|
545
|
|
546
|
546
|
// %8:%7:%6 = d = interval
|
547
|
547
|
// %16:%15:%14 = x = initial estimation of 0x1000000 / d
|
|
@@ -585,7 +585,7 @@ void Planner::init() {
|
585
|
585
|
|
586
|
586
|
// %16:%15:%14 = x = initial estimation of 0x1000000 / d
|
587
|
587
|
// %3:%2:%1:%0 = (1<<25) - x*d = acc
|
588
|
|
- // %13 = 0
|
|
588
|
+ // %13 = 0
|
589
|
589
|
|
590
|
590
|
// result = %11:%10:%9:%5:%4
|
591
|
591
|
" mul %14,%0" "\n\t" // r1:r0 = LO(x) * LO(acc)
|
|
@@ -599,7 +599,7 @@ void Planner::init() {
|
599
|
599
|
" adc %5,r1" "\n\t"
|
600
|
600
|
" adc %9,%13" "\n\t"
|
601
|
601
|
" adc %10,%13" "\n\t"
|
602
|
|
- " adc %11,%13" "\n\t" // %11:%10:%9:%5:%4 += MI(x) * LO(acc)
|
|
602
|
+ " adc %11,%13" "\n\t" // %11:%10:%9:%5:%4 += MI(x) * LO(acc)
|
603
|
603
|
" mul %16,%0" "\n\t" // r1:r0 = HI(x) * LO(acc)
|
604
|
604
|
" add %5,r0" "\n\t"
|
605
|
605
|
" adc %9,r1" "\n\t"
|
|
@@ -645,12 +645,12 @@ void Planner::init() {
|
645
|
645
|
" mul %16,%3" "\n\t" // r1:r0 = HI(x) * HI(acc)
|
646
|
646
|
" add %11,r0" "\n\t" // %11:%10:%9:%5:%4 += HI(x) * HI(acc) << 40
|
647
|
647
|
|
648
|
|
- // At this point, %11:%10:%9 contains the new estimation of x.
|
|
648
|
+ // At this point, %11:%10:%9 contains the new estimation of x.
|
649
|
649
|
|
650
|
650
|
// Finally, we must correct the result. Estimate remainder as
|
651
|
|
- // (1<<24) - x*d
|
652
|
|
- // %11:%10:%9 = x
|
653
|
|
- // %8:%7:%6 = d = interval" "\n\t"
|
|
651
|
+ // (1<<24) - x*d
|
|
652
|
+ // %11:%10:%9 = x
|
|
653
|
+ // %8:%7:%6 = d = interval
|
654
|
654
|
" ldi %3,1" "\n\t"
|
655
|
655
|
" clr %2" "\n\t"
|
656
|
656
|
" clr %1" "\n\t"
|
|
@@ -682,23 +682,23 @@ void Planner::init() {
|
682
|
682
|
" mul %7,%11" "\n\t" // r1:r0 = MI(d) * HI(x)
|
683
|
683
|
" sub %3,r0" "\n\t" // %3:%2:%1:%0 -= MI(d) * HI(x) << 24
|
684
|
684
|
// %3:%2:%1:%0 = r = (1<<24) - x*d
|
685
|
|
- // %8:%7:%6 = d = interval
|
|
685
|
+ // %8:%7:%6 = d = interval
|
686
|
686
|
|
687
|
687
|
// Perform the final correction
|
688
|
688
|
" sub %0,%6" "\n\t"
|
689
|
689
|
" sbc %1,%7" "\n\t"
|
690
|
690
|
" sbc %2,%8" "\n\t" // r -= d
|
691
|
|
- " brcs 14f" "\n\t" // if ( r >= d)
|
|
691
|
+ " brcs 14f" "\n\t" // if ( r >= d)
|
692
|
692
|
|
693
|
|
- // %11:%10:%9 = x
|
|
693
|
+ // %11:%10:%9 = x
|
694
|
694
|
" ldi %3,1" "\n\t"
|
695
|
695
|
" add %9,%3" "\n\t"
|
696
|
696
|
" adc %10,%13" "\n\t"
|
697
|
697
|
" adc %11,%13" "\n\t" // x++
|
698
|
698
|
"14:" "\n\t"
|
699
|
699
|
|
700
|
|
- // Estimation is done. %11:%10:%9 = x
|
701
|
|
- " clr __zero_reg__" "\n\t" // Make C runtime happy
|
|
700
|
+ // Estimation is done. %11:%10:%9 = x
|
|
701
|
+ " clr __zero_reg__" "\n\t" // Make C runtime happy
|
702
|
702
|
// [211 cycles total]
|
703
|
703
|
: "=r" (r2),
|
704
|
704
|
"=r" (r3),
|