mirror of
https://github.com/gcc-mirror/gcc.git
synced 2026-05-06 14:59:39 +02:00
LibF7: Use paper-pencil method for sqrt instead of Newton-Raphson iteration.
libgcc/config/avr/libf7/ * libf7-asm.sx (sqrt_approx): Rewrite. * libf7.c (f7_sqrt): Use it instead of sqrt_worker. (sqrt_worker): Remove.
This commit is contained in:
@@ -1287,6 +1287,189 @@ ENDF div
|
||||
#endif /* F7MOD_div_ */
|
||||
|
||||
|
||||
#ifdef F7MOD_sqrt_approx_
|
||||
;; ReMainder
|
||||
#define MX 16
|
||||
#define M0 17
|
||||
#define M1 26
|
||||
#define M2 27
|
||||
#define M3 14
|
||||
#define M4 15
|
||||
#define M5 TMP
|
||||
#define M6 r29
|
||||
|
||||
#define AA ZERO
|
||||
#define One r13
|
||||
#define Bits r28
|
||||
#define Bytes M6
|
||||
|
||||
;; Extend C[] by 0b01 at the low end.
|
||||
#define CX (0b01 << 6)
|
||||
|
||||
;;; Compute square-root of const f7_t *R22 for a positive number.
|
||||
DEFUN sqrt_approx
|
||||
;; 7 = Y, R17...R13
|
||||
do_prologue_saves 7
|
||||
|
||||
wmov ZL, r22 ; Input const f7_t*
|
||||
wmov YL, r24 ; Output f7_t*
|
||||
F7call load_mant
|
||||
ldi CA, 0xff
|
||||
|
||||
;; The paper-pencil method for the mantissa consumes bits in pairs and
|
||||
;; expects the input as Q-format 2.*, but mant is in 1.*. This means
|
||||
;; we have to shift one to the right. If expo is odd, then we shift
|
||||
;; one to the left and subtract one from expo in order to compensate
|
||||
;; and to get an even exponent.
|
||||
|
||||
;; Divide expo by 2 because we are doing sqrt.
|
||||
ldd XH, Z+Expo+1
|
||||
ldd XL, Z+Expo+0
|
||||
asr XH
|
||||
ror XL
|
||||
;; Store expo to result.
|
||||
wmov ZL, YL
|
||||
std Z+Expo+0, XL
|
||||
std Z+Expo+1, XH
|
||||
|
||||
brcs 1f
|
||||
;; Expo was even. Do >>=1 in order to get Q2.* as explained above.
|
||||
LSR C6 $ ror C5 $ ror C4 $ ror C3
|
||||
ror C2 $ ror C1 $ ror C0 $ ror CA
|
||||
1:
|
||||
;; For odd expo, >>=1 to get Q2.* and <<=1 to get an even expo cancel out.
|
||||
;; And the right-shift of the exponent implicitly subtracted 1 from it
|
||||
;; as needed.
|
||||
F7call store_mant.with_flags
|
||||
|
||||
;; Let Z point one past the mantissa's MSB.
|
||||
adiw ZL, Off + F7_MANT_BYTES
|
||||
|
||||
;; Clear the result mantissa.
|
||||
.global __clr_8
|
||||
XCALL __clr_8
|
||||
;; Clear ReMainder. M6 === Bytes will be zero when Bytes is down to 0.
|
||||
clr M5
|
||||
wmov M3, C3
|
||||
wmov M1, C1
|
||||
wmov MX, CA
|
||||
|
||||
clr One
|
||||
inc One
|
||||
|
||||
;; "+1" because .flags extends the mantissa at the low end.
|
||||
ldi Bytes, 1 + F7_MANT_BYTES
|
||||
.Loop_bytes:
|
||||
ld AA, -Z
|
||||
ldi Bits, 8
|
||||
.Loop_bits:
|
||||
;; Shift top 2 bits of MX into M[].
|
||||
LSL MX $ rol M0 $ rol M1 $ rol M2 $ rol M3
|
||||
LSL MX $ rol M0 $ rol M1 $ rol M2 $ rol M3
|
||||
|
||||
;; "Take down" 2 bits from A[] to MX.7 and MX.6
|
||||
mov MX, AA
|
||||
andi MX, 0xc0
|
||||
lsl AA
|
||||
lsl AA
|
||||
|
||||
;; Compare remainder against current result extended by 0b01.
|
||||
CPI MX, CX
|
||||
cpc M0, C0
|
||||
cpc M1, C1
|
||||
cpc M2, C2
|
||||
cpc M3, C3
|
||||
brcs 1f
|
||||
;; If the extended result fits, subtract it from M and set the
|
||||
;; next result bit to 1.
|
||||
SUBI MX, CX
|
||||
sbc M0, C0
|
||||
sbc M1, C1
|
||||
sbc M2, C2
|
||||
sbc M3, C3
|
||||
1:
|
||||
;; If it doesn't fit, set the next result bit to 0 (and don't subtract).
|
||||
rol C0
|
||||
eor C0, One
|
||||
rol C1
|
||||
rol C2
|
||||
rol C3
|
||||
|
||||
subi Bits, 2
|
||||
brne .Loop_bits
|
||||
;; AA (=== ZERO) is zero again.
|
||||
|
||||
dec Bytes
|
||||
brne .Loop_bytes
|
||||
;; B6 (=== Bytes) is zero now.
|
||||
|
||||
;; Now we consumed all the 64 bits of the extended mantissa, but we
|
||||
;; only expanded 64 / 2 = 32 bits of the result, which is currently
|
||||
;; held in C3 ... C0. Do the same like above, but on all bytes.
|
||||
;; Shift in 00's because the mantissa is exhausted.
|
||||
|
||||
;; "-1" because flags is part of the mantissa, which is already consumed.
|
||||
ldi Bits, 8 * (F7_MANT_BYTES - 1)
|
||||
.Loop2_bits:
|
||||
;; Shift top 2 bits of MX into M[].
|
||||
.Ltwice:
|
||||
LSL MX
|
||||
rol M0
|
||||
rol M1
|
||||
rol M2
|
||||
rol M3
|
||||
rol M4
|
||||
rol M5
|
||||
rol M6
|
||||
subi Bits, 0x80
|
||||
brmi .Ltwice
|
||||
|
||||
;; "Take down" two 0's to MX.7 and MX.6
|
||||
; clr MX ;; MX is already zero.
|
||||
|
||||
;; Compare remainder against current result extended by 0b01.
|
||||
CPI MX, CX
|
||||
cpc M0, C0
|
||||
cpc M1, C1
|
||||
cpc M2, C2
|
||||
cpc M3, C3
|
||||
cpc M4, C4
|
||||
cpc M5, C5
|
||||
cpc M6, C6
|
||||
brcs 1f
|
||||
;; If the extended result fits, subtract it from M and set the
|
||||
;; next result bit to 1.
|
||||
SUBI MX, CX
|
||||
sbc M0, C0
|
||||
sbc M1, C1
|
||||
sbc M2, C2
|
||||
sbc M3, C3
|
||||
sbc M4, C4
|
||||
sbc M5, C5
|
||||
sbc M6, C6
|
||||
1:
|
||||
;; If it doesn't fit, set the next result bit to 0 (and don't subtract).
|
||||
rol C0
|
||||
eor C0, One
|
||||
rol C1
|
||||
rol C2
|
||||
rol C3
|
||||
rol C4
|
||||
rol C5
|
||||
rol C6
|
||||
|
||||
subi Bits, 2
|
||||
brne .Loop2_bits
|
||||
|
||||
;; Set flags.
|
||||
st Z, ZERO
|
||||
F7call store_mant
|
||||
|
||||
do_epilogue_restores 7
|
||||
ENDF sqrt_approx
|
||||
#endif /* F7MOD_sqrt_approx_ */
|
||||
|
||||
|
||||
#if defined (F7MOD_sqrt16_) && defined (__AVR_HAVE_MUL__)
|
||||
|
||||
#define Mask C6
|
||||
@@ -1348,74 +1531,6 @@ ENDF sqrt16_round
|
||||
#undef Q1
|
||||
#endif /* F7MOD_sqrt16_ && MUL */
|
||||
|
||||
#ifdef F7MOD_sqrt_approx_
|
||||
DEFUN sqrt_approx
|
||||
push r17
|
||||
push r16
|
||||
wmov XL, r24
|
||||
wmov ZL, r22
|
||||
|
||||
;; C[] = 0.
|
||||
.global __clr_8
|
||||
XCALL __clr_8
|
||||
|
||||
ldd C5, Z+5+Off
|
||||
ldd C6, Z+6+Off
|
||||
|
||||
ldd Carry, Z+0+Expo
|
||||
ldd TMP, Z+1+Expo
|
||||
wmov ZL, XL
|
||||
|
||||
st Z, ZERO
|
||||
|
||||
asr TMP
|
||||
ror Carry
|
||||
std Z+1+Expo, TMP
|
||||
std Z+0+Expo, Carry
|
||||
|
||||
;; Re-interpreting our Q-format 1.xx mantissa as Q2.yy, we have to shift
|
||||
;; the mantissa to the right by 1. As we need an even exponent, multiply
|
||||
;; the mantissa by 2 for odd exponents, i.e. only right-shift if .expo
|
||||
;; is even.
|
||||
|
||||
brcs 1f
|
||||
lsr C6
|
||||
ror C5
|
||||
|
||||
1:
|
||||
F7call sqrt16_round
|
||||
|
||||
;; sqrt16_round() returns: C = 0: error in [0, -1/2 LSB).
|
||||
;; C = 1: error in [1/2 LSB, 0)
|
||||
|
||||
brcc 2f
|
||||
;; Undo the round-up from sqrt16_round(); this will transform to
|
||||
;; error in [-1/2 LSB, -1 LSB).
|
||||
sbiw C5, 1
|
||||
;; Together with the correct bit C4.7, the error is in [0, -1/2 LSB).
|
||||
ori C4, 1 << 7
|
||||
|
||||
2: ;; Setting C4.6 adds 1/4 LSB and the error is now in [1/4 LSB, -1/4 LSB)
|
||||
;; in either case.
|
||||
ori C4, 1 << 6
|
||||
|
||||
;; ????????????
|
||||
;; sqrt16_round() runs on integers which means that it computes the
|
||||
;; square root of mant * 2^14 if we regard mant as Q-format 2.yy,
|
||||
;; i.e. 2 integral bits. The result is sqrt(mant) * 2^7,
|
||||
;; and in order to get the same scaling like the input, .expo has to
|
||||
;; be adjusted by 7. ???????????????
|
||||
|
||||
ldi Carry, 8
|
||||
F7call normalize.store_with_flags
|
||||
|
||||
pop r16
|
||||
pop r17
|
||||
ret
|
||||
|
||||
ENDF sqrt_approx
|
||||
#endif /* F7MOD_sqrt_approx_ */
|
||||
|
||||
|
||||
#undef CA
|
||||
#undef C0
|
||||
|
||||
@@ -1188,40 +1188,6 @@ f7_t* f7_ldexp (f7_t *cc, const f7_t *aa, int delta)
|
||||
|
||||
|
||||
#ifdef F7MOD_sqrt_
|
||||
static void sqrt_worker (f7_t *cc, const f7_t *rr)
|
||||
{
|
||||
f7_t tmp7, *tmp = &tmp7;
|
||||
f7_t aa7, *aa = &aa7;
|
||||
|
||||
// aa in [1/2, 2) => aa->expo in { -1, 0 }.
|
||||
int16_t a_expo = -(rr->expo & 1);
|
||||
int16_t c_expo = (rr->expo - a_expo) >> 1; // FIXME: r_expo = INT_MAX???
|
||||
|
||||
__asm ("" : "+r" (aa));
|
||||
|
||||
f7_copy (aa, rr);
|
||||
aa->expo = a_expo;
|
||||
|
||||
// No use of rr or *cc past this point: We may use cc as temporary.
|
||||
// Approximate square-root of A by X <-- (X + A / X) / 2.
|
||||
|
||||
f7_sqrt_approx_asm (cc, aa);
|
||||
|
||||
// Iterate X <-- (X + A / X) / 2.
|
||||
// 3 Iterations with 16, 32, 58 bits of precision for the quotient.
|
||||
|
||||
for (uint8_t prec = 16; (prec & 0x80) == 0; prec <<= 1)
|
||||
{
|
||||
f7_divx (tmp, aa, cc, (prec & 64) ? 2 + F7_MANT_BITS : prec);
|
||||
f7_Iadd (cc, tmp);
|
||||
// This will never underflow because |c_expo| is small.
|
||||
cc->expo--;
|
||||
}
|
||||
|
||||
// Similar: |c_expo| is small, hence no ldexp needed.
|
||||
cc->expo += c_expo;
|
||||
}
|
||||
|
||||
F7_WEAK
|
||||
void f7_sqrt (f7_t *cc, const f7_t *aa)
|
||||
{
|
||||
@@ -1236,7 +1202,7 @@ void f7_sqrt (f7_t *cc, const f7_t *aa)
|
||||
if (f7_class_zero (a_class))
|
||||
return f7_clr (cc);
|
||||
|
||||
sqrt_worker (cc, aa);
|
||||
f7_sqrt_approx_asm (cc, aa);
|
||||
}
|
||||
#endif // F7MOD_sqrt_
|
||||
|
||||
|
||||
Reference in New Issue
Block a user