qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using softf


From: Catalin Patulea
Subject: [Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using softfloat only
Date: Wed, 27 Jun 2012 19:00:36 -0400

Hey guys,

I've been hacking up the FPREM and FPREM1 i386 instructions (without KVM) to be 
implemented using SoftFloat only. This was motivated by some correctness issues
that we noticed with the current implementation which follows the AMD 
datasheet.I believe the datasheet explains the operation as if intermediate 
results had "infinite" precision, but in this case intermediate rounding causes 
a loss of precision at the final output.

My reference was TestFloat [1], specifically the floatx80_rem suite, which 
tests the FPREM instruction quite thoroughly (FPREM1 is technically untested 
but very similar). The current code fails about half the suite; with the patch, 
all tests pass.

There are still lots of dark corners - the code originates from Bochs' copy of 
SoftFloat and I tried to port the Bochs exception handling logic as much as I 
could. Many details still elude me though (see comments in the code). TestFloat 
does test some of the exception logic but not as thoroughly as I would have 
liked. If anyone can offer some guidance here, I am happy to fix up the patch.

That's about it.. let me know what you think.

Catalin

[1] http://www.jhauser.us/arithmetic/TestFloat.html

---
 fpu/softfloat.c         |  182 +++++++++++++++++++++++++++++++++++++++++++++++
 fpu/softfloat.h         |    3 +
 target-i386/op_helper.c |  163 +++++++++++++++++-------------------------
 3 files changed, 249 insertions(+), 99 deletions(-)

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index b29256a..9c6e4a3 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -5234,6 +5234,16 @@ int floatx80_unordered_quiet( floatx80 a, floatx80 b 
STATUS_PARAM )
 }
 
 /*----------------------------------------------------------------------------
+| Returns 1 if the extended double-precision floating-point value `a' is an
+| unsupported; otherwise returns 0.
+*----------------------------------------------------------------------------*/
+int floatx80_is_unsupported(floatx80 a)
+{
+    return (extractFloatx80Exp(a) &&
+            !(extractFloatx80Frac(a) & LIT64(0x8000000000000000)));
+}
+
+/*----------------------------------------------------------------------------
 | Returns the result of converting the quadruple-precision floating-point
 | value `a' to the 32-bit two's complement integer format.  The conversion
 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
@@ -6828,6 +6838,178 @@ floatx80 floatx80_scalbn( floatx80 a, int n 
STATUS_PARAM )
                                           aSign, aExp, aSig, 0 STATUS_VAR );
 }
 
+/*
+ * BEGIN from Bochs rev 11224 dated 2012-06-21 10:33:37 -0700
+ *
+ * Converted to use Qemu type aliases, use C features only, etc.
+ */
+
+/* executes single exponent reduction cycle */
+static uint64 remainder_kernel(uint64 aSig0, uint64 bSig, int expDiff, uint64 
*zSig0, uint64 *zSig1)
+{
+    uint64 term0, term1;
+    uint64 aSig1 = 0;
+
+    shortShift128Left(aSig1, aSig0, expDiff, &aSig1, &aSig0);
+    uint64 q = estimateDiv128To64(aSig1, aSig0, bSig);
+    mul64To128(bSig, q, &term0, &term1);
+    sub128(aSig1, aSig0, term0, term1, zSig1, zSig0);
+    while ((int64)(*zSig1) < 0) {
+        --q;
+        add128(*zSig1, *zSig0, 0, bSig, zSig1, zSig0);
+    }
+    return q;
+}
+
+static int do_fprem(floatx80 a, floatx80 b, floatx80 *r, uint64 *q, int 
rounding_mode STATUS_PARAM )
+{
+    int32 aExp, bExp, zExp, expDiff;
+    uint64 aSig0, aSig1, bSig;
+    int aSign;
+    *q = 0;
+
+    // handle unsupported extended double-precision floating encodings
+    if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
+    {
+        float_raise(float_flag_invalid, status);
+        *r = floatx80_default_nan;
+        return -1;
+    }
+
+    aSig0 = extractFloatx80Frac(a);
+    aExp = extractFloatx80Exp(a);
+    aSign = extractFloatx80Sign(a);
+    bSig = extractFloatx80Frac(b);
+    bExp = extractFloatx80Exp(b);
+
+    if (aExp == 0x7FFF) {
+        if ((uint64) (aSig0<<1) || ((bExp == 0x7FFF) && (uint64) (bSig<<1))) {
+            *r = propagateFloatx80NaN(a, b, status);
+            return -1;
+        }
+        float_raise(float_flag_invalid, status);
+        *r = floatx80_default_nan;
+        return -1;
+    }
+    if (bExp == 0x7FFF) {
+        if ((uint64) (bSig<<1)) {
+            *r = propagateFloatx80NaN(a, b, status);
+            return -1;
+        }
+        if (aExp == 0 && aSig0) {
+            float_raise(float_flag_input_denormal, status);
+            normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
+            *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ?
+                    packFloatx80(aSign, aExp, aSig0) : a;
+            return 0;
+        }
+        *r = a;
+        return 0;
+
+    }
+    if (bExp == 0) {
+        if (bSig == 0) {
+            float_raise(float_flag_invalid, status);
+            *r = floatx80_default_nan;
+            return -1;
+        }
+        float_raise(float_flag_input_denormal, status);
+        normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
+    }
+    if (aExp == 0) {
+        if (aSig0 == 0) {
+            *r = a;
+            return 0;
+        }
+        float_raise(float_flag_input_denormal, status);
+        normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
+    }
+    expDiff = aExp - bExp;
+    aSig1 = 0;
+
+    uint32 overflow = 0;
+
+    if (expDiff >= 64) {
+        int n = (expDiff & 0x1f) | 0x20;
+        remainder_kernel(aSig0, bSig, n, &aSig0, &aSig1);
+        zExp = aExp - n;
+        overflow = 1;
+    }
+    else {
+        zExp = bExp;
+
+        if (expDiff < 0) {
+            if (expDiff < -1) {
+               *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ?
+                    packFloatx80(aSign, aExp, aSig0) : a;
+               return 0;
+            }
+            shift128Right(aSig0, 0, 1, &aSig0, &aSig1);
+            expDiff = 0;
+        }
+
+        if (expDiff > 0) {
+            *q = remainder_kernel(aSig0, bSig, expDiff, &aSig0, &aSig1);
+        }
+        else {
+            if (bSig <= aSig0) {
+               aSig0 -= bSig;
+               *q = 1;
+            }
+        }
+
+        if (rounding_mode == float_round_nearest_even)
+        {
+            uint64 term0, term1;
+            shift128Right(bSig, 0, 1, &term0, &term1);
+
+            if (! lt128(aSig0, aSig1, term0, term1))
+            {
+               int lt = lt128(term0, term1, aSig0, aSig1);
+               int eq = eq128(aSig0, aSig1, term0, term1);
+
+               if ((eq && (*q & 1)) || lt) {
+                  aSign = !aSign;
+                  ++*q;
+               }
+               if (lt) sub128(bSig, 0, aSig0, aSig1, &aSig0, &aSig1);
+            }
+        }
+    }
+
+    *r = normalizeRoundAndPackFloatx80(80, aSign, zExp, aSig0, aSig1, status);
+    return overflow;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the remainder of the extended double-precision floating-point value
+| `a' with respect to the corresponding value `b'.  The operation is performed
+| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+int floatx80_ieee754_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64 *q 
STATUS_PARAM )
+{
+    return do_fprem(a, b, r, q, float_round_nearest_even STATUS_VAR );
+}
+
+/*----------------------------------------------------------------------------
+| Returns the remainder of the extended double-precision floating-point value
+| `a' with  respect to  the corresponding value `b'. Unlike previous function
+| the  function  does not compute  the remainder  specified  in  the IEC/IEEE
+| Standard  for Binary  Floating-Point  Arithmetic.  This  function  operates
+| differently  from the  previous  function in  the way  that it  rounds  the
+| quotient of 'a' divided by 'b' to an integer.
+*----------------------------------------------------------------------------*/
+
+int floatx80_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64 *q 
STATUS_PARAM )
+{
+    return do_fprem(a, b, r, q, float_round_to_zero STATUS_VAR );
+}
+
+/*
+ * END from Bochs rev 11224 dated 2012-06-21 10:33:37 -0700
+ */
+
 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
 {
     flag aSign;
diff --git a/fpu/softfloat.h b/fpu/softfloat.h
index feec3a1..53d7827 100644
--- a/fpu/softfloat.h
+++ b/fpu/softfloat.h
@@ -497,10 +497,13 @@ int floatx80_lt_quiet( floatx80, floatx80 STATUS_PARAM );
 int floatx80_unordered_quiet( floatx80, floatx80 STATUS_PARAM );
 int floatx80_compare( floatx80, floatx80 STATUS_PARAM );
 int floatx80_compare_quiet( floatx80, floatx80 STATUS_PARAM );
+int floatx80_is_unsupported( floatx80 );
 int floatx80_is_quiet_nan( floatx80 );
 int floatx80_is_signaling_nan( floatx80 );
 floatx80 floatx80_maybe_silence_nan( floatx80 );
 floatx80 floatx80_scalbn( floatx80, int STATUS_PARAM );
+int floatx80_ieee754_remainder( floatx80, floatx80, floatx80 *, uint64 * 
STATUS_PARAM );
+int floatx80_remainder( floatx80, floatx80, floatx80 *, uint64 * STATUS_PARAM 
);
 
 INLINE floatx80 floatx80_abs(floatx80 a)
 {
diff --git a/target-i386/op_helper.c b/target-i386/op_helper.c
index 2862ea4..ae0c877 100644
--- a/target-i386/op_helper.c
+++ b/target-i386/op_helper.c
@@ -3620,6 +3620,23 @@ static void fpu_set_exception(int mask)
         env->fpus |= FPUS_SE | FPUS_B;
 }
 
+static int fpu_exception_from_fp_status(void)
+{
+    int mask;
+
+    const int float_flag_denormals = float_flag_input_denormal |
+                                     float_flag_output_denormal;
+
+    // Most flags have the same value in the enum and 387 status word, except
+    // the denormal flags.
+    mask = env->fp_status.float_exception_flags & ~float_flag_denormals;
+    if (env->fp_status.float_exception_flags & float_flag_denormals) {
+        mask |= FPUS_DE;
+    }
+
+    return mask;
+}
+
 static inline floatx80 helper_fdiv(floatx80 a, floatx80 b)
 {
     if (floatx80_is_zero(b)) {
@@ -4210,119 +4227,67 @@ void helper_fxtract(void)
 
 void helper_fprem1(void)
 {
-    double st0, st1, dblq, fpsrcop, fptemp;
-    CPU_LDoubleU fpsrcop1, fptemp1;
-    int expdif;
-    signed long long int q;
-
-    st0 = floatx80_to_double(ST0);
-    st1 = floatx80_to_double(ST1);
-
-    if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
-        ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-        return;
-    }
-
-    fpsrcop = st0;
-    fptemp = st1;
-    fpsrcop1.d = ST0;
-    fptemp1.d = ST1;
-    expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
-
-    if (expdif < 0) {
-        /* optimisation? taken from the AMD docs */
+    floatx80 result;
+    uint64_t quotient;
+
+    /* TODO(catalinp):
+     *
+     * - What is the defined purpose of fp_status in Qemu? Seems many things
+     *   write into it, but few if any translate into env->fpus.
+     *
+     * - Bochs' FPU_pre_exception_handling resets a few more fields in 
fp_status
+     *   before doing the operation. What is the purpose of that and is this
+     *   necessary here?
+     */
+    env->fp_status.float_exception_flags = 0;
+    int flags = floatx80_ieee754_remainder(ST0, ST1, &result, &quotient,
+                                           &env->fp_status);
+
+    /* TODO(catalinp): Bochs also checks for unmasked exceptions before storing
+     * these flags. Should we also do this?
+     */
+    if (flags >= 0) {
         env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-        /* ST0 is unchanged */
-        return;
+        if (flags) {
+            env->fpus |= (1 << 10);
+        } else {            /* (C0,C3,C1) <-- (q2,q1,q0) */
+            if (quotient & 1) env->fpus |= (1 << 9);
+            if (quotient & 2) env->fpus |= (1 << 14);
+            if (quotient & 4) env->fpus |= (1 << 8);
+        }
     }
 
-    if (expdif < 53) {
-        dblq = fpsrcop / fptemp;
-        /* round dblq towards nearest integer */
-        dblq = rint(dblq);
-        st0 = fpsrcop - fptemp * dblq;
-
-        /* convert dblq to q by truncating towards zero */
-        if (dblq < 0.0)
-           q = (signed long long int)(-dblq);
-        else
-           q = (signed long long int)dblq;
+    ST0 = result;
 
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-                                /* (C0,C3,C1) <-- (q2,q1,q0) */
-        env->fpus |= (q & 0x4) << (8 - 2);  /* (C0) <-- q2 */
-        env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
-        env->fpus |= (q & 0x1) << (9 - 0);  /* (C1) <-- q0 */
-    } else {
-        env->fpus |= 0x400;  /* C2 <-- 1 */
-        fptemp = pow(2.0, expdif - 50);
-        fpsrcop = (st0 / st1) / fptemp;
-        /* fpsrcop = integer obtained by chopping */
-        fpsrcop = (fpsrcop < 0.0) ?
-                  -(floor(fabs(fpsrcop))) : floor(fpsrcop);
-        st0 -= (st1 * fpsrcop * fptemp);
-    }
-    ST0 = double_to_floatx80(st0);
+    /* TODO(catalinp): Set only unmasked exceptions? */
+    fpu_set_exception(fpu_exception_from_fp_status());
 }
 
 void helper_fprem(void)
 {
-    double st0, st1, dblq, fpsrcop, fptemp;
-    CPU_LDoubleU fpsrcop1, fptemp1;
-    int expdif;
-    signed long long int q;
-
-    st0 = floatx80_to_double(ST0);
-    st1 = floatx80_to_double(ST1);
+    floatx80 result;
+    uint64_t quotient;
 
-    if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
-       ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
-       env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-       return;
-    }
-
-    fpsrcop = st0;
-    fptemp = st1;
-    fpsrcop1.d = ST0;
-    fptemp1.d = ST1;
-    expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
+    /* TODO(catalinp): See comments in helper_fprem1() about lots of potential
+     * semantic ambiguities/differences between this and Bochs' implementation.
+     */
+    env->fp_status.float_exception_flags = 0;
+    int flags = floatx80_remainder(ST0, ST1, &result, &quotient, 
&env->fp_status);
 
-    if (expdif < 0) {
-        /* optimisation? taken from the AMD docs */
+    if (flags >= 0) {
         env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-        /* ST0 is unchanged */
-        return;
+        if (flags) {
+            env->fpus |= (1 << 10);
+        } else {            /* (C0,C3,C1) <-- (q2,q1,q0) */
+            if (quotient & 1) env->fpus |= (1 << 9);
+            if (quotient & 2) env->fpus |= (1 << 14);
+            if (quotient & 4) env->fpus |= (1 << 8);
+        }
     }
 
-    if ( expdif < 53 ) {
-        dblq = fpsrcop/*ST0*/ / fptemp/*ST1*/;
-        /* round dblq towards zero */
-        dblq = (dblq < 0.0) ? ceil(dblq) : floor(dblq);
-        st0 = fpsrcop/*ST0*/ - fptemp * dblq;
+    ST0 = result;
 
-        /* convert dblq to q by truncating towards zero */
-        if (dblq < 0.0)
-           q = (signed long long int)(-dblq);
-        else
-           q = (signed long long int)dblq;
-
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-                                /* (C0,C3,C1) <-- (q2,q1,q0) */
-        env->fpus |= (q & 0x4) << (8 - 2);  /* (C0) <-- q2 */
-        env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
-        env->fpus |= (q & 0x1) << (9 - 0);  /* (C1) <-- q0 */
-    } else {
-        int N = 32 + (expdif % 32); /* as per AMD docs */
-        env->fpus |= 0x400;  /* C2 <-- 1 */
-        fptemp = pow(2.0, (double)(expdif - N));
-        fpsrcop = (st0 / st1) / fptemp;
-        /* fpsrcop = integer obtained by chopping */
-        fpsrcop = (fpsrcop < 0.0) ?
-                  -(floor(fabs(fpsrcop))) : floor(fpsrcop);
-        st0 -= (st1 * fpsrcop * fptemp);
-    }
-    ST0 = double_to_floatx80(st0);
+    fpu_set_exception(fpu_exception_from_fp_status());
 }
 
 void helper_fyl2xp1(void)
-- 
1.7.7.3




reply via email to

[Prev in Thread] Current Thread [Next in Thread]