qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH 03/11] softfloat: Introduce float_flag_inorm_denormal


From: Alex Bennée
Subject: Re: [PATCH 03/11] softfloat: Introduce float_flag_inorm_denormal
Date: Mon, 07 Jun 2021 16:35:43 +0100
User-agent: mu4e 1.5.13; emacs 28.0.50

Richard Henderson <richard.henderson@linaro.org> writes:

> Create a new exception flag for reporting input denormals that are not
> flushed to zero, they are normalized and treated as normal numbers.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  include/fpu/softfloat-types.h | 15 ++++---
>  fpu/softfloat.c               | 84 +++++++++++------------------------
>  fpu/softfloat-parts.c.inc     |  1 +
>  3 files changed, 36 insertions(+), 64 deletions(-)
>
> diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h
> index e2d70ff556..174100e50e 100644
> --- a/include/fpu/softfloat-types.h
> +++ b/include/fpu/softfloat-types.h
> @@ -143,13 +143,14 @@ typedef enum __attribute__((__packed__)) {
>   */
>  
>  enum {
> -    float_flag_invalid   =  1,
> -    float_flag_divbyzero =  4,
> -    float_flag_overflow  =  8,
> -    float_flag_underflow = 16,
> -    float_flag_inexact   = 32,
> -    float_flag_iflush_denormal = 64,
> -    float_flag_oflush_denormal = 128
> +    float_flag_invalid         = 0x0001,
> +    float_flag_divbyzero       = 0x0002,
> +    float_flag_overflow        = 0x0004,
> +    float_flag_underflow       = 0x0008,
> +    float_flag_inexact         = 0x0010,
> +    float_flag_inorm_denormal  = 0x0020,  /* denormal input, normalized */
> +    float_flag_iflush_denormal = 0x0040,  /* denormal input, flushed to zero 
> */
> +    float_flag_oflush_denormal = 0x0080,  /* denormal result, flushed to 
> zero */
>  };
>  
>  /*
> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> index cb077cf111..e54cdb274d 100644
> --- a/fpu/softfloat.c
> +++ b/fpu/softfloat.c
> @@ -126,61 +126,23 @@ this code that are retained.
>   * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 
> result
>   * and the result is < the minimum normal.
>   */
> -#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
> +
> +#define GEN_INPUT_FLUSH(name, soft_t)                                   \
>      static inline void name(soft_t *a, float_status *s)                 \
>      {                                                                   \
>          if (unlikely(soft_t ## _is_denormal(*a))) {                     \
> -            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
> -                                     soft_t ## _is_neg(*a));            \
> -            float_raise(float_flag_iflush_denormal, s);                  \
> +            if (s->flush_inputs_to_zero) {                              \
> +                *a = soft_t ## _set_sign(0, soft_t ## _is_neg(*a));     \
> +                float_raise(float_flag_iflush_denormal, s);             \
> +            } else {                                                    \
> +                float_raise(float_flag_inorm_denormal, s);              \
> +            }                                                           \
>          }                                                               \
>      }

So I'm guessing Emilio had the original flush code split was to avoid
multiple checks against s->flush_inputs_to_zero in the code. The was
possibly a good reason, comparing the before/after of float32_mul:

  Dump of assembler code for function float32_mul:
     0x0000000000934240 <+0>:   movzbl 0x1(%rdx),%eax
     0x0000000000934244 <+4>:   test   $0x20,%al
     0x0000000000934246 <+6>:   je     0x9342b0 <float32_mul+112>
     0x0000000000934248 <+8>:   cmpb   $0x0,(%rdx)
     0x000000000093424b <+11>:  jne    0x9342b0 <float32_mul+112>
     0x000000000093424d <+13>:  cmpb   $0x0,0x5(%rdx)
     0x0000000000934251 <+17>:  jne    0x9342d0 <float32_mul+144>
     0x0000000000934253 <+19>:  mov    %edi,%eax
     0x0000000000934255 <+21>:  shr    $0x17,%eax
     0x0000000000934258 <+24>:  add    $0x1,%eax
     0x000000000093425b <+27>:  test   $0xfe,%al
     0x000000000093425d <+29>:  je     0x9342a8 <float32_mul+104>
     0x000000000093425f <+31>:  mov    %esi,%eax
     0x0000000000934261 <+33>:  shr    $0x17,%eax
     0x0000000000934264 <+36>:  add    $0x1,%eax
     0x0000000000934267 <+39>:  test   $0xfe,%al
     0x0000000000934269 <+41>:  jne    0x934273 <float32_mul+51>
     0x000000000093426b <+43>:  test   $0x7fffffff,%esi
     0x0000000000934271 <+49>:  jne    0x9342b0 <float32_mul+112>
     0x0000000000934273 <+51>:  mov    %esi,-0xc(%rsp)
     0x0000000000934277 <+55>:  movss  -0xc(%rsp),%xmm0
     0x000000000093427d <+61>:  mov    %edi,-0xc(%rsp)
     0x0000000000934281 <+65>:  movss  -0xc(%rsp),%xmm2
     0x0000000000934287 <+71>:  mulss  %xmm2,%xmm0
     0x000000000093428b <+75>:  movd   %xmm0,%eax
     0x000000000093428f <+79>:  andps  0x3b805a(%rip),%xmm0        # 0xcec2f0
     0x0000000000934296 <+86>:  ucomiss 0x3b8047(%rip),%xmm0        # 0xcec2e4
     0x000000000093429d <+93>:  jbe    0x9342b8 <float32_mul+120>
     0x000000000093429f <+95>:  orb    $0x8,0x1(%rdx)
     0x00000000009342a3 <+99>:  retq   
     0x00000000009342a4 <+100>: nopl   0x0(%rax)
     0x00000000009342a8 <+104>: test   $0x7fffffff,%edi
     0x00000000009342ae <+110>: je     0x93425f <float32_mul+31>
     0x00000000009342b0 <+112>: jmpq   0x9290d0 <soft_f32_mul>
     0x00000000009342b5 <+117>: nopl   (%rax)
     0x00000000009342b8 <+120>: movss  0x3b8020(%rip),%xmm1        # 0xcec2e0
     0x00000000009342c0 <+128>: comiss %xmm0,%xmm1
     0x00000000009342c3 <+131>: jae    0x934320 <float32_mul+224>
     0x00000000009342c5 <+133>: retq   
     0x00000000009342c6 <+134>: nopw   %cs:0x0(%rax,%rax,1)
     0x00000000009342d0 <+144>: test   $0x7f800000,%edi
     0x00000000009342d6 <+150>: jne    0x9342f0 <float32_mul+176>
     0x00000000009342d8 <+152>: test   $0x7fffffff,%edi
     0x00000000009342de <+158>: je     0x9342f0 <float32_mul+176>
     0x00000000009342e0 <+160>: or     $0x40,%eax
     0x00000000009342e3 <+163>: and    $0x80000000,%edi
     0x00000000009342e9 <+169>: mov    %al,0x1(%rdx)
     0x00000000009342ec <+172>: nopl   0x0(%rax)
     0x00000000009342f0 <+176>: test   $0x7f800000,%esi
     0x00000000009342f6 <+182>: jne    0x934253 <float32_mul+19>
     0x00000000009342fc <+188>: test   $0x7fffffff,%esi
     0x0000000000934302 <+194>: je     0x934253 <float32_mul+19>
     0x0000000000934308 <+200>: and    $0x80000000,%esi
     0x000000000093430e <+206>: orb    $0x40,0x1(%rdx)
     0x0000000000934312 <+210>: jmpq   0x934253 <float32_mul+19>
     0x0000000000934317 <+215>: nopw   0x0(%rax,%rax,1)
     0x0000000000934320 <+224>: mov    %edi,%ecx
     0x0000000000934322 <+226>: or     %esi,%ecx
     0x0000000000934324 <+228>: and    $0x7fffffff,%ecx
     0x000000000093432a <+234>: jne    0x9342b0 <float32_mul+112>
     0x000000000093432c <+236>: jmp    0x9342c5 <float32_mul+133>
  End of assembler dump.

And after this change:

  Dump of assembler code for function float32_mul:
     0x0000000000895d60 <+0>:   movzbl 0x1(%rdx),%eax
     0x0000000000895d64 <+4>:   test   $0x10,%al
     0x0000000000895d66 <+6>:   je     0x895e30 <float32_mul+208>
     0x0000000000895d6c <+12>:  cmpb   $0x0,(%rdx)
     0x0000000000895d6f <+15>:  jne    0x895e30 <float32_mul+208>
     0x0000000000895d75 <+21>:  test   $0x7f800000,%edi
     0x0000000000895d7b <+27>:  jne    0x895da0 <float32_mul+64>
     0x0000000000895d7d <+29>:  test   $0x7fffffff,%edi
     0x0000000000895d83 <+35>:  je     0x895da0 <float32_mul+64>
     0x0000000000895d85 <+37>:  cmpb   $0x0,0x5(%rdx)
     0x0000000000895d89 <+41>:  je     0x895e60 <float32_mul+256>
     0x0000000000895d8f <+47>:  or     $0x40,%eax
     0x0000000000895d92 <+50>:  and    $0x80000000,%edi
     0x0000000000895d98 <+56>:  mov    %al,0x1(%rdx)
     0x0000000000895d9b <+59>:  nopl   0x0(%rax,%rax,1)
     0x0000000000895da0 <+64>:  test   $0x7f800000,%esi
     0x0000000000895da6 <+70>:  jne    0x895dd0 <float32_mul+112>
     0x0000000000895da8 <+72>:  test   $0x7fffffff,%esi
     0x0000000000895dae <+78>:  je     0x895dd0 <float32_mul+112>
     0x0000000000895db0 <+80>:  cmpb   $0x0,0x5(%rdx)
     0x0000000000895db4 <+84>:  movzbl 0x1(%rdx),%eax
     0x0000000000895db8 <+88>:  je     0x895e50 <float32_mul+240>
     0x0000000000895dbe <+94>:  or     $0x40,%eax
     0x0000000000895dc1 <+97>:  and    $0x80000000,%esi
     0x0000000000895dc7 <+103>: mov    %al,0x1(%rdx)
     0x0000000000895dca <+106>: nopw   0x0(%rax,%rax,1)
     0x0000000000895dd0 <+112>: mov    %edi,%eax
     0x0000000000895dd2 <+114>: shr    $0x17,%eax
     0x0000000000895dd5 <+117>: add    $0x1,%eax
     0x0000000000895dd8 <+120>: test   $0xfe,%al
     0x0000000000895dda <+122>: je     0x895e28 <float32_mul+200>
     0x0000000000895ddc <+124>: mov    %esi,%eax
     0x0000000000895dde <+126>: shr    $0x17,%eax
     0x0000000000895de1 <+129>: add    $0x1,%eax
     0x0000000000895de4 <+132>: test   $0xfe,%al
     0x0000000000895de6 <+134>: jne    0x895df0 <float32_mul+144>
     0x0000000000895de8 <+136>: test   $0x7fffffff,%esi
     0x0000000000895dee <+142>: jne    0x895e30 <float32_mul+208>
     0x0000000000895df0 <+144>: mov    %esi,-0xc(%rsp)
     0x0000000000895df4 <+148>: movss  -0xc(%rsp),%xmm0
     0x0000000000895dfa <+154>: mov    %edi,-0xc(%rsp)
     0x0000000000895dfe <+158>: movss  -0xc(%rsp),%xmm2
     0x0000000000895e04 <+164>: mulss  %xmm2,%xmm0
     0x0000000000895e08 <+168>: movd   %xmm0,%eax
     0x0000000000895e0c <+172>: andps  0x46bb5d(%rip),%xmm0        # 0xd01970
     0x0000000000895e13 <+179>: ucomiss 0x46bb4a(%rip),%xmm0        # 0xd01964
     0x0000000000895e1a <+186>: jbe    0x895e38 <float32_mul+216>
     0x0000000000895e1c <+188>: orb    $0x4,0x1(%rdx)
     0x0000000000895e20 <+192>: retq   
     0x0000000000895e21 <+193>: nopl   0x0(%rax)
     0x0000000000895e28 <+200>: test   $0x7fffffff,%edi
     0x0000000000895e2e <+206>: je     0x895ddc <float32_mul+124>
     0x0000000000895e30 <+208>: jmpq   0x88a8c0 <soft_f32_mul>
     0x0000000000895e35 <+213>: nopl   (%rax)
     0x0000000000895e38 <+216>: movss  0x46bb20(%rip),%xmm1        # 0xd01960
     0x0000000000895e40 <+224>: comiss %xmm0,%xmm1
     0x0000000000895e43 <+227>: jae    0x895e70 <float32_mul+272>
     0x0000000000895e45 <+229>: retq   
     0x0000000000895e46 <+230>: nopw   %cs:0x0(%rax,%rax,1)
     0x0000000000895e50 <+240>: or     $0x20,%eax
     0x0000000000895e53 <+243>: mov    %al,0x1(%rdx)
     0x0000000000895e56 <+246>: jmpq   0x895dd0 <float32_mul+112>
     0x0000000000895e5b <+251>: nopl   0x0(%rax,%rax,1)
     0x0000000000895e60 <+256>: or     $0x20,%eax
     0x0000000000895e63 <+259>: mov    %al,0x1(%rdx)
     0x0000000000895e66 <+262>: jmpq   0x895da0 <float32_mul+64>
     0x0000000000895e6b <+267>: nopl   0x0(%rax,%rax,1)
     0x0000000000895e70 <+272>: mov    %esi,%ecx
     0x0000000000895e72 <+274>: or     %edi,%ecx
     0x0000000000895e74 <+276>: and    $0x7fffffff,%ecx
     0x0000000000895e7a <+282>: jne    0x895e30 <float32_mul+208>
     0x0000000000895e7c <+284>: jmp    0x895e45 <float32_mul+229>
  End of assembler dump.

However I'm not sure how much of that increase is down to the change of
macro expansion and how much is due to the extra leg for the flushing.

Anyway other than that observation seems OK to me:

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée



reply via email to

[Prev in Thread] Current Thread [Next in Thread]