[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using softf
From: |
Catalin Patulea |
Subject: |
[Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using softfloat only |
Date: |
Mon, 2 Jul 2012 11:25:35 -0400 |
FPREM1 now passes the TestFloat floatx80_rem suite (and FPREM is implemented
very
similarly).
The code (the bulk of which is remainder_kernel and do_fprem) is derived from
Bochs SVN revision 11224 dated 2012-06-21 10:33:37 -0700, with conversions to
Qemu type aliases, C features only, etc. as needed.
Signed-off-by: Catalin Patulea <address@hidden>
---
fpu/softfloat.c | 195 +++++++++++++++++++++++++++++++++++++++++++++++
fpu/softfloat.h | 4 +
target-i386/op_helper.c | 166 ++++++++++++++++------------------------
3 files changed, 266 insertions(+), 99 deletions(-)
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index b29256a..bd1879d 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -5234,6 +5234,16 @@ int floatx80_unordered_quiet( floatx80 a, floatx80 b
STATUS_PARAM )
}
/*----------------------------------------------------------------------------
+| Returns 1 if the extended double-precision floating-point value `a' is an
+| unsupported value; otherwise returns 0.
+*----------------------------------------------------------------------------*/
+int floatx80_is_unsupported(floatx80 a)
+{
+ return extractFloatx80Exp(a) &&
+ !(extractFloatx80Frac(a) & LIT64(0x8000000000000000));
+}
+
+/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 32-bit two's complement integer format. The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
@@ -6828,6 +6838,191 @@ floatx80 floatx80_scalbn( floatx80 a, int n
STATUS_PARAM )
aSign, aExp, aSig, 0 STATUS_VAR );
}
+/* executes single exponent reduction cycle */
+static uint64_t remainder_kernel(uint64_t aSig0, uint64_t bSig, int expDiff,
+ uint64_t *zSig0, uint64_t *zSig1)
+{
+ uint64_t term0, term1;
+ uint64_t aSig1 = 0;
+
+ shortShift128Left(aSig1, aSig0, expDiff, &aSig1, &aSig0);
+ uint64_t q = estimateDiv128To64(aSig1, aSig0, bSig);
+ mul64To128(bSig, q, &term0, &term1);
+ sub128(aSig1, aSig0, term0, term1, zSig1, zSig0);
+ while ((int64)(*zSig1) < 0) {
+ --q;
+ add128(*zSig1, *zSig0, 0, bSig, zSig1, zSig0);
+ }
+ return q;
+}
+
+static int do_fprem(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q,
+ int rounding_mode STATUS_PARAM)
+{
+ int32 aExp, bExp, zExp, expDiff;
+ uint64_t aSig0, aSig1, bSig;
+ flag aSign;
+ *q = 0;
+
+ /* handle unsupported extended double-precision floating encodings */
+ if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) {
+ float_raise(float_flag_invalid, status);
+ *r = floatx80_default_nan;
+ return -1;
+ }
+
+ aSig0 = extractFloatx80Frac(a);
+ aExp = extractFloatx80Exp(a);
+ aSign = extractFloatx80Sign(a);
+ bSig = extractFloatx80Frac(b);
+ bExp = extractFloatx80Exp(b);
+
+ if (aExp == 0x7FFF) {
+ if ((uint64_t) (aSig0<<1) || ((bExp == 0x7FFF) &&
+ (uint64_t) (bSig<<1))) {
+ *r = propagateFloatx80NaN(a, b, status);
+ return -1;
+ }
+ float_raise(float_flag_invalid, status);
+ *r = floatx80_default_nan;
+ return -1;
+ }
+ if (bExp == 0x7FFF) {
+ if ((uint64_t) (bSig<<1)) {
+ *r = propagateFloatx80NaN(a, b, status);
+ return -1;
+ }
+ if (aExp == 0 && aSig0) {
+ float_raise(float_flag_input_denormal, status);
+ normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
+ *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ?
+ packFloatx80(aSign, aExp, aSig0) : a;
+ return 0;
+ }
+ *r = a;
+ return 0;
+
+ }
+ if (bExp == 0) {
+ if (bSig == 0) {
+ float_raise(float_flag_invalid, status);
+ *r = floatx80_default_nan;
+ return -1;
+ }
+ float_raise(float_flag_input_denormal, status);
+ normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
+ }
+ if (aExp == 0) {
+ if (aSig0 == 0) {
+ *r = a;
+ return 0;
+ }
+ float_raise(float_flag_input_denormal, status);
+ normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
+ }
+ expDiff = aExp - bExp;
+ aSig1 = 0;
+
+ int overflow = 0;
+
+ if (expDiff >= 64) {
+ int n = (expDiff & 0x1f) | 0x20;
+ remainder_kernel(aSig0, bSig, n, &aSig0, &aSig1);
+ zExp = aExp - n;
+ overflow = 1;
+ } else {
+ zExp = bExp;
+
+ if (expDiff < 0) {
+ if (expDiff < -1) {
+ *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ?
+ packFloatx80(aSign, aExp, aSig0) : a;
+ return 0;
+ }
+ shift128Right(aSig0, 0, 1, &aSig0, &aSig1);
+ expDiff = 0;
+ }
+
+ if (expDiff > 0) {
+ *q = remainder_kernel(aSig0, bSig, expDiff, &aSig0, &aSig1);
+ } else {
+ if (bSig <= aSig0) {
+ aSig0 -= bSig;
+ *q = 1;
+ }
+ }
+
+ if (rounding_mode == float_round_nearest_even) {
+ uint64_t term0, term1;
+ shift128Right(bSig, 0, 1, &term0, &term1);
+
+ if (!lt128(aSig0, aSig1, term0, term1)) {
+ int lt = lt128(term0, term1, aSig0, aSig1);
+ int eq = eq128(aSig0, aSig1, term0, term1);
+
+ if ((eq && (*q & 1)) || lt) {
+ aSign = !aSign;
+ ++*q;
+ }
+ if (lt) {
+ sub128(bSig, 0, aSig0, aSig1, &aSig0, &aSig1);
+ }
+ }
+ }
+ }
+
+ *r = normalizeRoundAndPackFloatx80(80, aSign, zExp, aSig0, aSig1, status);
+ return overflow;
+}
+
+/*----------------------------------------------------------------------------
+| Computes the remainder of the extended double-precision floating-point value
+| `a' with respect to the corresponding value `b'. Stores the remainder in
+| `*r'.
+|
+| Returns one of the following:
+| -1 The operands are invalid. Exceptions were raised and no remainder was
+| computed. `*r' was set to a NaN value.
+| 0 The remainder was computed successfully and completely.
+| 1 Only a partial remainder was computed and stored in `*r'. Call this
+| function again with `a' set to the returned partial remainder and the
+| same `b' to continue calculating the remainder.
+|
+| The operation is performed according to the IEC/IEEE Standard for Binary
+| Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+int floatx80_ieee754_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q
+ STATUS_PARAM)
+{
+ return do_fprem(a, b, r, q, float_round_nearest_even STATUS_VAR);
+}
+
+/*----------------------------------------------------------------------------
+| Computes the remainder of the extended double-precision floating-point value
+| `a' with respect to the corresponding value `b'. Stores the remainder in
+| `*r'.
+|
+| Returns one of the following:
+| -1 The operands are invalid. Exceptions were raised and no remainder was
+| computed. `*r' was set to a NaN value.
+| 0 The remainder was computed successfully and completely.
+| 1 Only a partial remainder was computed and stored in `*r'. Call this
+| function again with `a' set to the returned partial remainder and the
+| same `b' to continue calculating the remainder.
+|
+| Unlike previous function the function does not compute the remainder
+| specified in the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+| This function operates differently from the previous function in the
way
+| that it rounds the quotient of 'a' divided by 'b' to an integer.
+*----------------------------------------------------------------------------*/
+
+int floatx80_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q
+ STATUS_PARAM)
+{
+ return do_fprem(a, b, r, q, float_round_to_zero STATUS_VAR);
+}
+
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
{
flag aSign;
diff --git a/fpu/softfloat.h b/fpu/softfloat.h
index feec3a1..e6f5b87 100644
--- a/fpu/softfloat.h
+++ b/fpu/softfloat.h
@@ -497,10 +497,14 @@ int floatx80_lt_quiet( floatx80, floatx80 STATUS_PARAM );
int floatx80_unordered_quiet( floatx80, floatx80 STATUS_PARAM );
int floatx80_compare( floatx80, floatx80 STATUS_PARAM );
int floatx80_compare_quiet( floatx80, floatx80 STATUS_PARAM );
+int floatx80_is_unsupported(floatx80);
int floatx80_is_quiet_nan( floatx80 );
int floatx80_is_signaling_nan( floatx80 );
floatx80 floatx80_maybe_silence_nan( floatx80 );
floatx80 floatx80_scalbn( floatx80, int STATUS_PARAM );
+int floatx80_ieee754_remainder(floatx80, floatx80, floatx80 *, uint64_t *
+ STATUS_PARAM);
+int floatx80_remainder(floatx80, floatx80, floatx80 *, uint64_t *
STATUS_PARAM);
INLINE floatx80 floatx80_abs(floatx80 a)
{
diff --git a/target-i386/op_helper.c b/target-i386/op_helper.c
index 2862ea4..53f46fd 100644
--- a/target-i386/op_helper.c
+++ b/target-i386/op_helper.c
@@ -3620,6 +3620,23 @@ static void fpu_set_exception(int mask)
env->fpus |= FPUS_SE | FPUS_B;
}
+static int fpu_exception_from_fp_status(const float_status *fp_status)
+{
+ int mask;
+
+ const int float_flag_denormals = float_flag_input_denormal |
+ float_flag_output_denormal;
+
+ /* Most flags have the same value in the enum and 387 status word, except
+ * the denormal flags. */
+ mask = fp_status->float_exception_flags & ~float_flag_denormals;
+ if (fp_status->float_exception_flags & float_flag_denormals) {
+ mask |= FPUS_DE;
+ }
+
+ return mask;
+}
+
static inline floatx80 helper_fdiv(floatx80 a, floatx80 b)
{
if (floatx80_is_zero(b)) {
@@ -4208,121 +4225,72 @@ void helper_fxtract(void)
}
}
+static void fprem_maybe_update_flags(int overflow, uint64_t quotient)
+{
+ if (overflow >= 0) { /* overflow < 0 means error */
+ env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
+ if (overflow) {
+ env->fpus |= 1 << 10;
+ } else { /* (C0,C3,C1) <-- (q2,q1,q0) */
+ if (quotient & 1) {
+ env->fpus |= 1 << 9;
+ }
+ if (quotient & 2) {
+ env->fpus |= 1 << 14;
+ }
+ if (quotient & 4) {
+ env->fpus |= 1 << 8;
+ }
+ }
+ }
+}
+
void helper_fprem1(void)
{
- double st0, st1, dblq, fpsrcop, fptemp;
- CPU_LDoubleU fpsrcop1, fptemp1;
- int expdif;
- signed long long int q;
+ floatx80 result;
+ uint64_t quotient;
- st0 = floatx80_to_double(ST0);
- st1 = floatx80_to_double(ST1);
+ /* Use a copy of the current fp_status so that we can detect new exceptions
+ * and pass them to fpu_set_exception. TODO(catalinp): Remove this once
+ * exceptions are reported directly into env->fp_status. */
+ float_status local_status = env->fp_status;
- if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
- ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
- env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
- return;
- }
-
- fpsrcop = st0;
- fptemp = st1;
- fpsrcop1.d = ST0;
- fptemp1.d = ST1;
- expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
+ /* TODO(catalinp): Bochs' FPU_pre_exception_handling resets a few more
+ * fields in fp_status before doing the operation. What is the purpose of
+ * that and is it necessary here? */
+ local_status.float_exception_flags = 0;
- if (expdif < 0) {
- /* optimisation? taken from the AMD docs */
- env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
- /* ST0 is unchanged */
- return;
- }
+ int overflow = floatx80_ieee754_remainder(ST0, ST1, &result, "ient,
+ &local_status);
- if (expdif < 53) {
- dblq = fpsrcop / fptemp;
- /* round dblq towards nearest integer */
- dblq = rint(dblq);
- st0 = fpsrcop - fptemp * dblq;
+ /* TODO(catalinp): Bochs also checks for unmasked exceptions before storing
+ * these flags. Should we also do this?
+ */
+ fprem_maybe_update_flags(overflow, quotient);
- /* convert dblq to q by truncating towards zero */
- if (dblq < 0.0)
- q = (signed long long int)(-dblq);
- else
- q = (signed long long int)dblq;
+ ST0 = result;
- env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
- /* (C0,C3,C1) <-- (q2,q1,q0) */
- env->fpus |= (q & 0x4) << (8 - 2); /* (C0) <-- q2 */
- env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
- env->fpus |= (q & 0x1) << (9 - 0); /* (C1) <-- q0 */
- } else {
- env->fpus |= 0x400; /* C2 <-- 1 */
- fptemp = pow(2.0, expdif - 50);
- fpsrcop = (st0 / st1) / fptemp;
- /* fpsrcop = integer obtained by chopping */
- fpsrcop = (fpsrcop < 0.0) ?
- -(floor(fabs(fpsrcop))) : floor(fpsrcop);
- st0 -= (st1 * fpsrcop * fptemp);
- }
- ST0 = double_to_floatx80(st0);
+ fpu_set_exception(fpu_exception_from_fp_status(&local_status));
}
void helper_fprem(void)
{
- double st0, st1, dblq, fpsrcop, fptemp;
- CPU_LDoubleU fpsrcop1, fptemp1;
- int expdif;
- signed long long int q;
+ floatx80 result;
+ uint64_t quotient;
- st0 = floatx80_to_double(ST0);
- st1 = floatx80_to_double(ST1);
+ /* TODO(catalinp): See comments in helper_fprem1() about lots of potential
+ * semantic ambiguities/differences between this and Bochs' implementation.
+ */
+ float_status local_status = env->fp_status;
+ local_status.float_exception_flags = 0;
+ int overflow = floatx80_remainder(ST0, ST1, &result, "ient,
+ &local_status);
- if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
- ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
- env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
- return;
- }
-
- fpsrcop = st0;
- fptemp = st1;
- fpsrcop1.d = ST0;
- fptemp1.d = ST1;
- expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
-
- if (expdif < 0) {
- /* optimisation? taken from the AMD docs */
- env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
- /* ST0 is unchanged */
- return;
- }
-
- if ( expdif < 53 ) {
- dblq = fpsrcop/*ST0*/ / fptemp/*ST1*/;
- /* round dblq towards zero */
- dblq = (dblq < 0.0) ? ceil(dblq) : floor(dblq);
- st0 = fpsrcop/*ST0*/ - fptemp * dblq;
+ fprem_maybe_update_flags(overflow, quotient);
- /* convert dblq to q by truncating towards zero */
- if (dblq < 0.0)
- q = (signed long long int)(-dblq);
- else
- q = (signed long long int)dblq;
+ ST0 = result;
- env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
- /* (C0,C3,C1) <-- (q2,q1,q0) */
- env->fpus |= (q & 0x4) << (8 - 2); /* (C0) <-- q2 */
- env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
- env->fpus |= (q & 0x1) << (9 - 0); /* (C1) <-- q0 */
- } else {
- int N = 32 + (expdif % 32); /* as per AMD docs */
- env->fpus |= 0x400; /* C2 <-- 1 */
- fptemp = pow(2.0, (double)(expdif - N));
- fpsrcop = (st0 / st1) / fptemp;
- /* fpsrcop = integer obtained by chopping */
- fpsrcop = (fpsrcop < 0.0) ?
- -(floor(fabs(fpsrcop))) : floor(fpsrcop);
- st0 -= (st1 * fpsrcop * fptemp);
- }
- ST0 = double_to_floatx80(st0);
+ fpu_set_exception(fpu_exception_from_fp_status(&local_status));
}
void helper_fyl2xp1(void)
--
1.7.7.3