LibF7: Use paper-pencil method for sqrt instead of Newton-Raphson iteration.

libgcc/config/avr/libf7/
	* libf7-asm.sx (sqrt_approx): Rewrite.
	* libf7.c (f7_sqrt): Use it instead of sqrt_worker.
	(sqrt_worker): Remove.
This commit is contained in:
Georg-Johann Lay
2023-11-12 15:55:40 +01:00
parent e0787da263
commit 3a5a30792f
2 changed files with 184 additions and 103 deletions

View File

@@ -1287,6 +1287,189 @@ ENDF div
#endif /* F7MOD_div_ */
#ifdef F7MOD_sqrt_approx_
;; ReMainder
#define MX 16
#define M0 17
#define M1 26
#define M2 27
#define M3 14
#define M4 15
#define M5 TMP
#define M6 r29
#define AA ZERO
#define One r13
#define Bits r28
#define Bytes M6
;; Extend C[] by 0b01 at the low end.
#define CX (0b01 << 6)
;;; Compute square-root of const f7_t *R22 for a positive number.
DEFUN sqrt_approx
;; 7 = Y, R17...R13
do_prologue_saves 7
wmov ZL, r22 ; Input const f7_t*
wmov YL, r24 ; Output f7_t*
F7call load_mant
ldi CA, 0xff
;; The paper-pencil method for the mantissa consumes bits in pairs and
;; expects the input as Q-format 2.*, but mant is in 1.*. This means
;; we have to shift one to the right. If expo is odd, then we shift
;; one to the left and subtract one from expo in order to compensate
;; and to get an even exponent.
;; Divide expo by 2 because we are doing sqrt.
ldd XH, Z+Expo+1
ldd XL, Z+Expo+0
asr XH
ror XL
;; Store expo to result.
wmov ZL, YL
std Z+Expo+0, XL
std Z+Expo+1, XH
brcs 1f
;; Expo was even. Do >>=1 in order to get Q2.* as explained above.
LSR C6 $ ror C5 $ ror C4 $ ror C3
ror C2 $ ror C1 $ ror C0 $ ror CA
1:
;; For odd expo, >>=1 to get Q2.* and <<=1 to get an even expo cancel out.
;; And the right-shift of the exponent implicitly subtracted 1 from it
;; as needed.
F7call store_mant.with_flags
;; Let Z point one past the mantissa's MSB.
adiw ZL, Off + F7_MANT_BYTES
;; Clear the result mantissa.
.global __clr_8
XCALL __clr_8
;; Clear ReMainder. M6 === Bytes will be zero when Bytes is down to 0.
clr M5
wmov M3, C3
wmov M1, C1
wmov MX, CA
clr One
inc One
;; "+1" because .flags extends the mantissa at the low end.
ldi Bytes, 1 + F7_MANT_BYTES
.Loop_bytes:
ld AA, -Z
ldi Bits, 8
.Loop_bits:
;; Shift top 2 bits of MX into M[].
LSL MX $ rol M0 $ rol M1 $ rol M2 $ rol M3
LSL MX $ rol M0 $ rol M1 $ rol M2 $ rol M3
;; "Take down" 2 bits from A[] to MX.7 and MX.6
mov MX, AA
andi MX, 0xc0
lsl AA
lsl AA
;; Compare remainder against current result extended by 0b01.
CPI MX, CX
cpc M0, C0
cpc M1, C1
cpc M2, C2
cpc M3, C3
brcs 1f
;; If the extended result fits, subtract it from M and set the
;; next result bit to 1.
SUBI MX, CX
sbc M0, C0
sbc M1, C1
sbc M2, C2
sbc M3, C3
1:
;; If it doesn't fit, set the next result bit to 0 (and don't subtract).
rol C0
eor C0, One
rol C1
rol C2
rol C3
subi Bits, 2
brne .Loop_bits
;; AA (=== ZERO) is zero again.
dec Bytes
brne .Loop_bytes
;; B6 (=== Bytes) is zero now.
;; Now we consumed all the 64 bits of the extended mantissa, but we
;; only expanded 64 / 2 = 32 bits of the result, which is currently
;; held in C3 ... C0. Do the same like above, but on all bytes.
;; Shift in 00's because the mantissa is exhausted.
;; "-1" because flags is part of the mantissa, which is already consumed.
ldi Bits, 8 * (F7_MANT_BYTES - 1)
.Loop2_bits:
;; Shift top 2 bits of MX into M[].
.Ltwice:
LSL MX
rol M0
rol M1
rol M2
rol M3
rol M4
rol M5
rol M6
subi Bits, 0x80
brmi .Ltwice
;; "Take down" two 0's to MX.7 and MX.6
; clr MX ;; MX is already zero.
;; Compare remainder against current result extended by 0b01.
CPI MX, CX
cpc M0, C0
cpc M1, C1
cpc M2, C2
cpc M3, C3
cpc M4, C4
cpc M5, C5
cpc M6, C6
brcs 1f
;; If the extended result fits, subtract it from M and set the
;; next result bit to 1.
SUBI MX, CX
sbc M0, C0
sbc M1, C1
sbc M2, C2
sbc M3, C3
sbc M4, C4
sbc M5, C5
sbc M6, C6
1:
;; If it doesn't fit, set the next result bit to 0 (and don't subtract).
rol C0
eor C0, One
rol C1
rol C2
rol C3
rol C4
rol C5
rol C6
subi Bits, 2
brne .Loop2_bits
;; Set flags.
st Z, ZERO
F7call store_mant
do_epilogue_restores 7
ENDF sqrt_approx
#endif /* F7MOD_sqrt_approx_ */
#if defined (F7MOD_sqrt16_) && defined (__AVR_HAVE_MUL__)
#define Mask C6
@@ -1348,74 +1531,6 @@ ENDF sqrt16_round
#undef Q1
#endif /* F7MOD_sqrt16_ && MUL */
#ifdef F7MOD_sqrt_approx_
DEFUN sqrt_approx
push r17
push r16
wmov XL, r24
wmov ZL, r22
;; C[] = 0.
.global __clr_8
XCALL __clr_8
ldd C5, Z+5+Off
ldd C6, Z+6+Off
ldd Carry, Z+0+Expo
ldd TMP, Z+1+Expo
wmov ZL, XL
st Z, ZERO
asr TMP
ror Carry
std Z+1+Expo, TMP
std Z+0+Expo, Carry
;; Re-interpreting our Q-format 1.xx mantissa as Q2.yy, we have to shift
;; the mantissa to the right by 1. As we need an even exponent, multiply
;; the mantissa by 2 for odd exponents, i.e. only right-shift if .expo
;; is even.
brcs 1f
lsr C6
ror C5
1:
F7call sqrt16_round
;; sqrt16_round() returns: C = 0: error in [0, -1/2 LSB).
;; C = 1: error in [1/2 LSB, 0)
brcc 2f
;; Undo the round-up from sqrt16_round(); this will transform to
;; error in [-1/2 LSB, -1 LSB).
sbiw C5, 1
;; Together with the correct bit C4.7, the error is in [0, -1/2 LSB).
ori C4, 1 << 7
2: ;; Setting C4.6 adds 1/4 LSB and the error is now in [1/4 LSB, -1/4 LSB)
;; in either case.
ori C4, 1 << 6
;; ????????????
;; sqrt16_round() runs on integers which means that it computes the
;; square root of mant * 2^14 if we regard mant as Q-format 2.yy,
;; i.e. 2 integral bits. The result is sqrt(mant) * 2^7,
;; and in order to get the same scaling like the input, .expo has to
;; be adjusted by 7. ???????????????
ldi Carry, 8
F7call normalize.store_with_flags
pop r16
pop r17
ret
ENDF sqrt_approx
#endif /* F7MOD_sqrt_approx_ */
#undef CA
#undef C0

View File

@@ -1188,40 +1188,6 @@ f7_t* f7_ldexp (f7_t *cc, const f7_t *aa, int delta)
#ifdef F7MOD_sqrt_
static void sqrt_worker (f7_t *cc, const f7_t *rr)
{
f7_t tmp7, *tmp = &tmp7;
f7_t aa7, *aa = &aa7;
// aa in [1/2, 2) => aa->expo in { -1, 0 }.
int16_t a_expo = -(rr->expo & 1);
int16_t c_expo = (rr->expo - a_expo) >> 1; // FIXME: r_expo = INT_MAX???
__asm ("" : "+r" (aa));
f7_copy (aa, rr);
aa->expo = a_expo;
// No use of rr or *cc past this point: We may use cc as temporary.
// Approximate square-root of A by X <-- (X + A / X) / 2.
f7_sqrt_approx_asm (cc, aa);
// Iterate X <-- (X + A / X) / 2.
// 3 Iterations with 16, 32, 58 bits of precision for the quotient.
for (uint8_t prec = 16; (prec & 0x80) == 0; prec <<= 1)
{
f7_divx (tmp, aa, cc, (prec & 64) ? 2 + F7_MANT_BITS : prec);
f7_Iadd (cc, tmp);
// This will never underflow because |c_expo| is small.
cc->expo--;
}
// Similar: |c_expo| is small, hence no ldexp needed.
cc->expo += c_expo;
}
F7_WEAK
void f7_sqrt (f7_t *cc, const f7_t *aa)
{
@@ -1236,7 +1202,7 @@ void f7_sqrt (f7_t *cc, const f7_t *aa)
if (f7_class_zero (a_class))
return f7_clr (cc);
sqrt_worker (cc, aa);
f7_sqrt_approx_asm (cc, aa);
}
#endif // F7MOD_sqrt_