b/libgcc/config/avr/libf7/libf7-asm.sx
@@ -1287,6 +1287,189 @@ DEFUN div
#endif /* F7MOD_div_ */
+#ifdef F7MOD_sqrt_approx_
+;; ReMainder
+#define MX 16
+#define M0 17
+#define M1 26
+#define M2 27
+#define M3 14
+#define M4 15
+#define M5 TMP
+#define M6 r29
+
+#define AA ZERO
+#define One r13
+#define Bits r28
+#define Bytes M6
+
+;; Extend C[] by 0b01 at the low end.
+#define CX (0b01 << 6)
+
+;;; Compute square-root of const f7_t *R22 for a positive number.
+DEFUN sqrt_approx
+ ;; 7 = Y, R17...R13
+ do_prologue_saves 7
+
+ wmov ZL, r22 ; Input const f7_t*
+ wmov YL, r24 ; Output f7_t*
+ F7call load_mant
+ ldi CA, 0xff
+
+ ;; The paper-pencil method for the mantissa consumes bits in pairs and
+ ;; expects the input as Q-format 2.*, but mant is in 1.*. This means
+ ;; we have to shift one to the right. If expo is odd, then we shift
+ ;; one to the left and subtract one from expo in order to compensate
+ ;; and to get an even exponent.
+
+ ;; Divide expo by 2 because we are doing sqrt.
+ ldd XH, Z+Expo+1
+ ldd XL, Z+Expo+0
+ asr XH
+ ror XL
+ ;; Store expo to result.
+ wmov ZL, YL
+ std Z+Expo+0, XL
+ std Z+Expo+1, XH
+
+ brcs 1f
+ ;; Expo was even. Do >>=1 in order to get Q2.* as explained above.
+ LSR C6 $ ror C5 $ ror C4 $ ror C3
+ ror C2 $ ror C1 $ ror C0 $ ror CA
+1:
+ ;; For odd expo, >>=1 to get Q2.* and <<=1 to get an even expo
cancel out.
+ ;; And the right-shift of the exponent implicitly subtracted 1 from it
+ ;; as needed.
+ F7call store_mant.with_flags
+
+ ;; Let Z point one past the mantissa's MSB.
+ adiw ZL, Off + F7_MANT_BYTES
+
+ ;; Clear the result mantissa.
+ .global __clr_8
+ XCALL __clr_8
+ ;; Clear ReMainder. M6 === Bytes will be zero when Bytes is down to 0.
+ clr M5
+ wmov M3, C3
+ wmov M1, C1
+ wmov MX, CA
+
+ clr One
+ inc One
+
+ ;; "+1" because .flags extends the mantissa at the low end.
+ ldi Bytes, 1 + F7_MANT_BYTES
+.Loop_bytes:
+ ld AA, -Z
+ ldi Bits, 8
+.Loop_bits:
+ ;; Shift top 2 bits of MX into M[].
+ LSL MX $ rol M0 $ rol M1 $ rol M2 $ rol M3
+ LSL MX $ rol M0 $ rol M1 $ rol M2 $ rol M3
+
+ ;; "Take down" 2 bits from A[] to MX.7 and MX.6
+ mov MX, AA
+ andi MX, 0xc0
+ lsl AA
+ lsl AA
+
+ ;; Compare remainder against current result extended by 0b01.
+ CPI MX, CX
+ cpc M0, C0
+ cpc M1, C1
+ cpc M2, C2
+ cpc M3, C3
+ brcs 1f
+ ;; If the extended result fits, subtract it from M and set the
+ ;; next result bit to 1.
+ SUBI MX, CX
+ sbc M0, C0
+ sbc M1, C1
+ sbc M2, C2
+ sbc M3, C3
+1:
+ ;; If it doesn't fit, set the next result bit to 0 (and don't
subtract).
+ rol C0
+ eor C0, One
+ rol C1
+ rol C2
+ rol C3
+
+ subi Bits, 2
+ brne .Loop_bits
+ ;; AA (=== ZERO) is zero again.
+
+ dec Bytes
+ brne .Loop_bytes
+ ;; B6 (=== Bytes) is zero now.
+
+ ;; Now we consumed all the 64 bits of the extended mantissa, but we
+ ;; only expanded 64 / 2 = 32 bits of the result, which is currently
+ ;; held in C3 ... C0. Do the same like above, but on all bytes.
+ ;; Shift in 00's because the mantissa is exhausted.
+
+ ;; "-1" because flags is part of the mantissa, which is already
consumed.
+ ldi Bits, 8 * (F7_MANT_BYTES - 1)
+.Loop2_bits:
+ ;; Shift top 2 bits of MX into M[].
+.Ltwice:
+ LSL MX
+ rol M0
+ rol M1
+ rol M2
+ rol M3
+ rol M4
+ rol M5
+ rol M6
+ subi Bits, 0x80
+ brmi .Ltwice
+
+ ;; "Take down" two 0's to MX.7 and MX.6
+ ; clr MX ;; MX is already zero.
+
+ ;; Compare remainder against current result extended by 0b01.
+ CPI MX, CX
+ cpc M0, C0
+ cpc M1, C1
+ cpc M2, C2
+ cpc M3, C3
+ cpc M4, C4
+ cpc M5, C5
+ cpc M6, C6
+ brcs 1f
+ ;; If the extended result fits, subtract it from M and set the
+ ;; next result bit to 1.
+ SUBI MX, CX
+ sbc M0, C0
+ sbc M1, C1
+ sbc M2, C2
+ sbc M3, C3
+ sbc M4, C4
+ sbc M5, C5
+ sbc M6, C6
+1:
+ ;; If it doesn't fit, set the next result bit to 0 (and don't
subtract).
+ rol C0
+ eor C0, One
+ rol C1
+ rol C2
+ rol C3
+ rol C4
+ rol C5
+ rol C6
+
+ subi Bits, 2
+ brne .Loop2_bits
+
+ ;; Set flags.
+ st Z, ZERO
+ F7call store_mant
+
+ do_epilogue_restores 7
+ENDF sqrt_approx
+#endif /* F7MOD_sqrt_approx_ */
+
+
#if defined (F7MOD_sqrt16_) && defined (__AVR_HAVE_MUL__)
#define Mask C6
@@ -1348,74 +1531,6 @@ DEFUN sqrt16_round
#undef Q1
#endif /* F7MOD_sqrt16_ && MUL */
-#ifdef F7MOD_sqrt_approx_
-DEFUN sqrt_approx
- push r17
- push r16
- wmov XL, r24
- wmov ZL, r22
-
- ;; C[] = 0.
- .global __clr_8
- XCALL __clr_8
-
- ldd C5, Z+5+Off
- ldd C6, Z+6+Off
-
- ldd Carry, Z+0+Expo
- ldd TMP, Z+1+Expo
- wmov ZL, XL
-
- st Z, ZERO
-
- asr TMP
- ror Carry
- std Z+1+Expo, TMP
- std Z+0+Expo, Carry
-
- ;; Re-interpreting our Q-format 1.xx mantissa as Q2.yy, we have to
shift
- ;; the mantissa to the right by 1. As we need an even exponent,
multiply
- ;; the mantissa by 2 for odd exponents, i.e. only right-shift if .expo
- ;; is even.
-
- brcs 1f
- lsr C6
- ror C5
-
-1:
- F7call sqrt16_round
-
- ;; sqrt16_round() returns: C = 0: error in [0, -1/2 LSB).
- ;; C = 1: error in [1/2 LSB, 0)
-
- brcc 2f
- ;; Undo the round-up from sqrt16_round(); this will transform to
- ;; error in [-1/2 LSB, -1 LSB).
- sbiw C5, 1
- ;; Together with the correct bit C4.7, the error is in [0, -1/2 LSB).
- ori C4, 1 << 7
-
-2: ;; Setting C4.6 adds 1/4 LSB and the error is now in [1/4 LSB, -1/4
LSB)
- ;; in either case.
- ori C4, 1 << 6
-
- ;; ????????????
- ;; sqrt16_round() runs on integers which means that it computes the
- ;; square root of mant * 2^14 if we regard mant as Q-format 2.yy,
- ;; i.e. 2 integral bits. The result is sqrt(mant) * 2^7,
- ;; and in order to get the same scaling like the input, .expo has to
- ;; be adjusted by 7. ???????????????
-
- ldi Carry, 8
- F7call normalize.store_with_flags
-
- pop r16
- pop r17
- ret
-
-ENDF sqrt_approx
-#endif /* F7MOD_sqrt_approx_ */
-
#undef CA
#undef C0
b/libgcc/config/avr/libf7/libf7.c
@@ -1188,40 +1188,6 @@ f7_t* f7_ldexp (f7_t *cc, const f7_t *aa, int delta)
#ifdef F7MOD_sqrt_
-static void sqrt_worker (f7_t *cc, const f7_t *rr)
-{
- f7_t tmp7, *tmp = &tmp7;
- f7_t aa7, *aa = &aa7;
-
- // aa in [1/2, 2) => aa->expo in { -1, 0 }.
- int16_t a_expo = -(rr->expo & 1);
- int16_t c_expo = (rr->expo - a_expo) >> 1; // FIXME: r_expo = INT_MAX???
-
- __asm ("" : "+r" (aa));
-
- f7_copy (aa, rr);
- aa->expo = a_expo;
-
- // No use of rr or *cc past this point: We may use cc as temporary.
- // Approximate square-root of A by X <-- (X + A / X) / 2.
-
- f7_sqrt_approx_asm (cc, aa);
-
- // Iterate X <-- (X + A / X) / 2.
- // 3 Iterations with 16, 32, 58 bits of precision for the quotient.
-
- for (uint8_t prec = 16; (prec & 0x80) == 0; prec <<= 1)
- {
- f7_divx (tmp, aa, cc, (prec & 64) ? 2 + F7_MANT_BITS : prec);
- f7_Iadd (cc, tmp);
- // This will never underflow because |c_expo| is small.
- cc->expo--;
- }
-
- // Similar: |c_expo| is small, hence no ldexp needed.
- cc->expo += c_expo;
-}
-
F7_WEAK
void f7_sqrt (f7_t *cc, const f7_t *aa)
{
@@ -1236,7 +1202,7 @@ void f7_sqrt (f7_t *cc, const f7_t *aa)
if (f7_class_zero (a_class))
return f7_clr (cc);
- sqrt_worker (cc, aa);
+ f7_sqrt_approx_asm (cc, aa);
}
#endif // F7MOD_sqrt_