Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <367A4018B58A4E308E2A95404362CBFB@H270>
Date: Sun, 15 Aug 2021 09:04:55 +0200
From: "Stefan Kanthak" <stefan.kanthak@...go.de>
To: "Szabolcs Nagy" <nsz@...t70.net>
Cc: <musl@...ts.openwall.com>
Subject: Re: [PATCH #2] Properly simplified nextafter()

Szabolcs Nagy <nsz@...t70.net> wrote:

>* Stefan Kanthak <stefan.kanthak@...go.de> [2021-08-13 14:04:51 +0200]:
>> Szabolcs Nagy <nsz@...t70.net> wrote on 2021-08-10 at 23:34:

>> > (the i386 machine where i originally tested this preferred int
>> > cmp and float cmp was very slow in the subnormal range
>>
>> This also and still holds for i386 FPU fadd/fmul as well as SSE
>> addsd/addss/mulss/mulsd additions/multiplies!
>
> they are avoided in the common case, and only used to create
> fenv side-effects.

Unfortunately but for hard & SOFT-float, where no fenv exists, as
Rich wrote.

>> --- -/src/math/nextafter.c
>> +++ +/src/math/nextafter.c
>> @@ -10,13 +10,13 @@
>>                 return x + y;
>>         if (ux.i == uy.i)
>>                 return y;
>> -       ax = ux.i & -1ULL/2;
>> -       ay = uy.i & -1ULL/2;
>> +       ax = ux.i << 2;
>> +       ay = uy.i << 2;
>
> the << 2 looks wrong, the top bit of the exponent is lost.

It IS wrong, but only in the post, not in the code I tested.

>>         if (ax == 0) {
>>                 if (ay == 0)
>>                         return y;
>>                 ux.i = (uy.i & 1ULL<<63) | 1;
>> -       } else if (ax > ay || ((ux.i ^ uy.i) & 1ULL<<63))
>> +       } else if ((ax < ay) == ((int64_t) ux.i < 0))
>>                 ux.i--;
>>         else
>>                 ux.i++;
> ...
>> How do you compare these 60 instructions/252 bytes to the code I posted
>> (23 instructions/72 bytes)?
>
> you should benchmark, but the second best is to look
> at the longest dependency chain in the hot path and
> add up the instruction latencies.

1 billion calls to nextafter(), with random from, and to either 0 or +INF:
run 1 against glibc,                         8.58 ns/call
run 2 against musl original,                 3.59
run 3 against musl patched,                  0.52
run 4 the pure floating-point variant from   0.72
      my initial post in this thread,
run 5 the assembly variant I posted.         0.28 ns/call

Now hurry up and patch your slowmotion code!

Stefan

PS: I cheated a very tiny little bit: the isnan() macro of musl patched is

#ifdef PATCH
#define isnan(x) ( \
sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) << 1) > 0xff00000U : \
sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) << 1) > 0xffe0000000000000ULL : \
__fpclassifyl(x) == FP_NAN)
#else
#define isnan(x) ( \
sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) & 0x7fffffff) > 0x7f800000 : \
sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) & -1ULL>>1) > 0x7ffULL<<52 : \
__fpclassifyl(x) == FP_NAN)
#endif // PATCH

PPS: and of course the log from the benchmarks...

[stefan@...e ~]$ lscpu
Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                16
On-line CPU(s) list:   0-15
Thread(s) per core:    2
Core(s) per socket:    8
Socket(s):             1
NUMA node(s):          1
Vendor ID:             AuthenticAMD
CPU family:            23
Model:                 49
Model name:            AMD EPYC 7262 8-Core Processor
Stepping:              0
CPU MHz:               3194.323
BogoMIPS:              6388.64
Virtualization:        AMD-V
L1d cache:             32K
L1i cache:             32K
L2 cache:              512K
L3 cache:              16384K
...
[stefan@...e ~]$ gcc --version
gcc (GCC) 8.3.1 20190311 (Red Hat 8.3.1-3)
Copyright (C) 2018 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

[stefan@...e ~]$ cat patch.c
#include <math.h>

static __inline unsigned __FLOAT_BITS(float __f)
{
 union {float __f; unsigned __i;} __u;
 __u.__f = __f;
 return __u.__i;
}

static __inline unsigned long long __DOUBLE_BITS(double __f)
{
 union {double __f; unsigned long long __i;} __u;
 __u.__f = __f;
 return __u.__i;
}

#ifdef PATCH
#define isnan(x) ( \
sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) << 1) > 0xff00000U : \
sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) << 1) > 0xffe0000000000000ULL : \
__fpclassifyl(x) == FP_NAN)
#else
#define isnan(x) ( \
sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) & 0x7fffffff) > 0x7f800000 : \
sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) & -1ULL>>1) > 0x7ffULL<<52 : \
__fpclassifyl(x) == FP_NAN)
#endif // PATCH

__attribute__((noinline))
double nextafter(double x, double y)
{
 union {double f; unsigned long long i;} ux={x}, uy={y};
 unsigned long long ax, ay;
 int e;

 if (isnan(x) || isnan(y))
  return x + y;
 if (ux.i == uy.i)
  return y;
#ifdef PATCH
 ax = ux.i << 1;
 ay = uy.i << 1;
#else
 ax = ux.i & -1ULL/2;
 ay = uy.i & -1ULL/2;
#endif
 if (ax == 0) {
  if (ay == 0)
   return y;
  ux.i = (uy.i & 1ULL<<63) | 1;
#ifdef PATCH
 } else if (ax < ay == (long long) ux.i < 0)
#else
 } else if (ax > ay || ((ux.i ^ uy.i) & 1ULL<<63))
#endif
  ux.i--;
 else
  ux.i++;
 e = ux.i >> 52 & 0x7ff;
 /* raise overflow if ux.f is infinite and x is finite */
 if (e == 0x7ff)
  FORCE_EVAL(x + x);
 /* raise underflow if ux.f is subnormal or zero */
 if (e == 0)
  FORCE_EVAL(x*x + ux.f*ux.f);
 return ux.f;
}
[stefan@...e ~]$ cat bench.c
// Copyright © 2005-2021, Stefan Kanthak <stefan.kanthak@...go.de>

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

__attribute__((noinline))
double nop(double foo, double bar)
{
    return foo + bar;
}

inline static
double lfsr64(void)
{
    // 64-bit linear feedback shift register (Galois form) using
    //  primitive polynomial 0xAD93D23594C935A9 (CRC-64 "Jones"),
    //   initialised with 2**64 / golden ratio

    static uint64_t lfsr = 0x9E3779B97F4A7C15;
    const  uint64_t sign = (int64_t) lfsr >> 63;

    lfsr = (lfsr << 1) ^ (0xAD93D23594C935A9 & sign);

    return *(double *) &lfsr;
}

inline static
double random64(void)
{
    static uint64_t seed = 0x0123456789ABCDEF;

    seed = seed * 6364136223846793005 + 1442695040888963407;

    return *(double *) &seed;
}

int main(void)
{
    clock_t t0, t1, t2, tt;
    uint32_t n;
    volatile double result;

    t0 = clock();

    for (n = 500000000u; n > 0u; n--) {
        result = nop(lfsr64(), 0.0);
        result = nop(random64(), 1.0 / 0.0);
    }

    t1 = clock();

    for (n = 500000000u; n > 0u; n--) {
        result = nextafter(lfsr64(), 0.0);
        result = nextafter(random64(), 1.0 / 0.0);
    }

    t2 = clock();
    tt = t2 - t0; t2 -= t1; t1 -= t0; t0 = t2 - t1;

    printf("\n"
           "nop()         %4lu.%06lu       0\n"
           "nextafter()   %4lu.%06lu    %4lu.%06lu\n"
           "              %4lu.%06lu nano-seconds\n",
           t1 / CLOCKS_PER_SEC, (t1 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t2 / CLOCKS_PER_SEC, (t2 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t0 / CLOCKS_PER_SEC, (t0 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           tt / CLOCKS_PER_SEC, (tt % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC);
}
[stefan@...e ~]$ gcc -O3 -lm bench.c
[stefan@...e ~]$ perf stat ./a.out

nop()            1.480000       0
nextafter()     10.060000       8.580000
                11.540000 nano-seconds

 Performance counter stats for './a.out':

         11,548.78 msec task-clock:u              #    1.000 CPUs utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
               145      page-faults:u             #    0.013 K/sec
    38,917,213,536      cycles:u                  #    3.370 GHz                      (83.33%)
    15,647,656,615      stalled-cycles-frontend:u #   40.21% frontend cycles idle     (83.33%)
    10,746,044,422      stalled-cycles-backend:u  #   27.61% backend cycles idle      (83.33%)
    69,739,403,870      instructions:u            #    1.79  insn per cycle
                                                  #    0.22  stalled cycles per insn  (83.33%)
    16,495,748,110      branches:u                # 1428.354 M/sec                    (83.33%)
       500,701,246      branch-misses:u           #    3.04% of all branches          (83.33%)

      11.550414029 seconds time elapsed

      11.548265000 seconds user
       0.000999000 seconds sys


[stefan@...e ~]$ gcc -O3 bench.c patch.c
patch.c:23: warning: "isnan" redefined
 #define isnan(x) ( \

In file included from patch.c:1:
/usr/include/math.h:254: note: this is the location of the previous definition
 #  define isnan(x) \

[stefan@...e ~]$ perf stat ./a.out

nop()            1.480000       0
nextafter()      5.070000       3.590000
                 6.550000 nano-seconds

 Performance counter stats for './a.out':

          6,558.44 msec task-clock:u              #    1.000 CPUs utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
               122      page-faults:u             #    0.019 K/sec
    22,038,054,931      cycles:u                  #    3.360 GHz                      (83.33%)
           204,849      stalled-cycles-frontend:u #    0.00% frontend cycles idle     (83.34%)
     5,497,340,276      stalled-cycles-backend:u  #   24.94% backend cycles idle      (83.34%)
    39,751,746,482      instructions:u            #    1.80  insn per cycle
                                                  #    0.14  stalled cycles per insn  (83.34%)
     9,747,428,086      branches:u                # 1486.242 M/sec                    (83.34%)
       250,407,409      branch-misses:u           #    2.57% of all branches          (83.33%)

       6.559550924 seconds time elapsed

       6.558918000 seconds user
       0.000000000 seconds sys


[stefan@...e ~]$ gcc -O3 bench.c -DPATCH patch.c
patch.c:18: warning: "isnan" redefined
 #define isnan(x) ( \

In file included from patch.c:1:
/usr/include/math.h:254: note: this is the location of the previous definition
 #  define isnan(x) \

[stefan@...e ~]$ perf stat ./a.out

nop()            1.480000       0
nextafter()      2.000000       0.520000
                 3.480000 nano-seconds

 Performance counter stats for './a.out':

          3,489.59 msec task-clock:u              #    1.000 CPUs utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
               123      page-faults:u             #    0.035 K/sec
    11,721,075,764      cycles:u                  #    3.359 GHz                      (83.32%)
           132,680      stalled-cycles-frontend:u #    0.00% frontend cycles idle     (83.32%)
     4,500,051,993      stalled-cycles-backend:u  #   38.39% backend cycles idle      (83.32%)
    40,501,721,908      instructions:u            #    3.46  insn per cycle
                                                  #    0.11  stalled cycles per insn  (83.33%)
     8,494,571,027      branches:u                # 2434.258 M/sec                    (83.35%)
           497,230      branch-misses:u           #    0.01% of all branches          (83.34%)

       3.490437579 seconds time elapsed

       3.490053000 seconds user
       0.000000000 seconds sys


[stefan@...e ~]$ gcc -O3 patch.c nextafter-fp.c
[stefan@...e ~]$ perf stat ./a.out

nop()            1.490000       0
nextafter()      2.210000       0.720000
                 3.700000 nano-seconds

 Performance counter stats for './a.out':

          3,702.89 msec task-clock:u              #    1.000 CPUs utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
               122      page-faults:u             #    0.033 K/sec
    12,407,345,183      cycles:u                  #    3.351 GHz                      (83.32%)
           135,817      stalled-cycles-frontend:u #    0.00% frontend cycles idle     (83.34%)
     5,498,895,906      stalled-cycles-backend:u  #   44.32% backend cycles idle      (83.34%)
    38,002,430,460      instructions:u            #    3.06  insn per cycle
                                                  #    0.14  stalled cycles per insn  (83.34%)
     7,497,381,393      branches:u                # 2024.735 M/sec                    (83.34%)
           497,462      branch-misses:u           #    0.01% of all branches          (83.32%)

       3.703648875 seconds time elapsed

       3.703294000 seconds user
       0.000000000 seconds sys


[stefan@...e ~]$ gcc -O3 bench.c nextafter.s
[stefan@...e ~]$ perf stat ./a.out

nop()            1.630000       0
nextafter()      1.910000       0.280000
                 3.540000 nano-seconds

 Performance counter stats for './a.out':

          3,547.12 msec task-clock:u              #    1.000 CPUs utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
               123      page-faults:u             #    0.035 K/sec
    11,949,840,797      cycles:u                  #    3.369 GHz                      (83.32%)
           129,627      stalled-cycles-frontend:u #    0.00% frontend cycles idle     (83.34%)
     3,998,960,716      stalled-cycles-backend:u  #   33.46% backend cycles idle      (83.34%)
    37,493,523,285      instructions:u            #    3.14  insn per cycle
                                                  #    0.11  stalled cycles per insn  (83.34%)
     7,998,559,192      branches:u                # 2254.945 M/sec                    (83.34%)
           497,565      branch-misses:u           #    0.01% of all branches          (83.32%)

       3.547999008 seconds time elapsed

       3.546671000 seconds user
       0.000999000 seconds sys


[stefan@...e ~]$

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.