|
Message-ID: <367A4018B58A4E308E2A95404362CBFB@H270> Date: Sun, 15 Aug 2021 09:04:55 +0200 From: "Stefan Kanthak" <stefan.kanthak@...go.de> To: "Szabolcs Nagy" <nsz@...t70.net> Cc: <musl@...ts.openwall.com> Subject: Re: [PATCH #2] Properly simplified nextafter() Szabolcs Nagy <nsz@...t70.net> wrote: >* Stefan Kanthak <stefan.kanthak@...go.de> [2021-08-13 14:04:51 +0200]: >> Szabolcs Nagy <nsz@...t70.net> wrote on 2021-08-10 at 23:34: >> > (the i386 machine where i originally tested this preferred int >> > cmp and float cmp was very slow in the subnormal range >> >> This also and still holds for i386 FPU fadd/fmul as well as SSE >> addsd/addss/mulss/mulsd additions/multiplies! > > they are avoided in the common case, and only used to create > fenv side-effects. Unfortunately but for hard & SOFT-float, where no fenv exists, as Rich wrote. >> --- -/src/math/nextafter.c >> +++ +/src/math/nextafter.c >> @@ -10,13 +10,13 @@ >> return x + y; >> if (ux.i == uy.i) >> return y; >> - ax = ux.i & -1ULL/2; >> - ay = uy.i & -1ULL/2; >> + ax = ux.i << 2; >> + ay = uy.i << 2; > > the << 2 looks wrong, the top bit of the exponent is lost. It IS wrong, but only in the post, not in the code I tested. >> if (ax == 0) { >> if (ay == 0) >> return y; >> ux.i = (uy.i & 1ULL<<63) | 1; >> - } else if (ax > ay || ((ux.i ^ uy.i) & 1ULL<<63)) >> + } else if ((ax < ay) == ((int64_t) ux.i < 0)) >> ux.i--; >> else >> ux.i++; > ... >> How do you compare these 60 instructions/252 bytes to the code I posted >> (23 instructions/72 bytes)? > > you should benchmark, but the second best is to look > at the longest dependency chain in the hot path and > add up the instruction latencies. 1 billion calls to nextafter(), with random from, and to either 0 or +INF: run 1 against glibc, 8.58 ns/call run 2 against musl original, 3.59 run 3 against musl patched, 0.52 run 4 the pure floating-point variant from 0.72 my initial post in this thread, run 5 the assembly variant I posted. 0.28 ns/call Now hurry up and patch your slowmotion code! Stefan PS: I cheated a very tiny little bit: the isnan() macro of musl patched is #ifdef PATCH #define isnan(x) ( \ sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) << 1) > 0xff00000U : \ sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) << 1) > 0xffe0000000000000ULL : \ __fpclassifyl(x) == FP_NAN) #else #define isnan(x) ( \ sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) & 0x7fffffff) > 0x7f800000 : \ sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) & -1ULL>>1) > 0x7ffULL<<52 : \ __fpclassifyl(x) == FP_NAN) #endif // PATCH PPS: and of course the log from the benchmarks... [stefan@...e ~]$ lscpu Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 16 On-line CPU(s) list: 0-15 Thread(s) per core: 2 Core(s) per socket: 8 Socket(s): 1 NUMA node(s): 1 Vendor ID: AuthenticAMD CPU family: 23 Model: 49 Model name: AMD EPYC 7262 8-Core Processor Stepping: 0 CPU MHz: 3194.323 BogoMIPS: 6388.64 Virtualization: AMD-V L1d cache: 32K L1i cache: 32K L2 cache: 512K L3 cache: 16384K ... [stefan@...e ~]$ gcc --version gcc (GCC) 8.3.1 20190311 (Red Hat 8.3.1-3) Copyright (C) 2018 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. [stefan@...e ~]$ cat patch.c #include <math.h> static __inline unsigned __FLOAT_BITS(float __f) { union {float __f; unsigned __i;} __u; __u.__f = __f; return __u.__i; } static __inline unsigned long long __DOUBLE_BITS(double __f) { union {double __f; unsigned long long __i;} __u; __u.__f = __f; return __u.__i; } #ifdef PATCH #define isnan(x) ( \ sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) << 1) > 0xff00000U : \ sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) << 1) > 0xffe0000000000000ULL : \ __fpclassifyl(x) == FP_NAN) #else #define isnan(x) ( \ sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) & 0x7fffffff) > 0x7f800000 : \ sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) & -1ULL>>1) > 0x7ffULL<<52 : \ __fpclassifyl(x) == FP_NAN) #endif // PATCH __attribute__((noinline)) double nextafter(double x, double y) { union {double f; unsigned long long i;} ux={x}, uy={y}; unsigned long long ax, ay; int e; if (isnan(x) || isnan(y)) return x + y; if (ux.i == uy.i) return y; #ifdef PATCH ax = ux.i << 1; ay = uy.i << 1; #else ax = ux.i & -1ULL/2; ay = uy.i & -1ULL/2; #endif if (ax == 0) { if (ay == 0) return y; ux.i = (uy.i & 1ULL<<63) | 1; #ifdef PATCH } else if (ax < ay == (long long) ux.i < 0) #else } else if (ax > ay || ((ux.i ^ uy.i) & 1ULL<<63)) #endif ux.i--; else ux.i++; e = ux.i >> 52 & 0x7ff; /* raise overflow if ux.f is infinite and x is finite */ if (e == 0x7ff) FORCE_EVAL(x + x); /* raise underflow if ux.f is subnormal or zero */ if (e == 0) FORCE_EVAL(x*x + ux.f*ux.f); return ux.f; } [stefan@...e ~]$ cat bench.c // Copyright © 2005-2021, Stefan Kanthak <stefan.kanthak@...go.de> #include <math.h> #include <stdint.h> #include <stdio.h> #include <time.h> __attribute__((noinline)) double nop(double foo, double bar) { return foo + bar; } inline static double lfsr64(void) { // 64-bit linear feedback shift register (Galois form) using // primitive polynomial 0xAD93D23594C935A9 (CRC-64 "Jones"), // initialised with 2**64 / golden ratio static uint64_t lfsr = 0x9E3779B97F4A7C15; const uint64_t sign = (int64_t) lfsr >> 63; lfsr = (lfsr << 1) ^ (0xAD93D23594C935A9 & sign); return *(double *) &lfsr; } inline static double random64(void) { static uint64_t seed = 0x0123456789ABCDEF; seed = seed * 6364136223846793005 + 1442695040888963407; return *(double *) &seed; } int main(void) { clock_t t0, t1, t2, tt; uint32_t n; volatile double result; t0 = clock(); for (n = 500000000u; n > 0u; n--) { result = nop(lfsr64(), 0.0); result = nop(random64(), 1.0 / 0.0); } t1 = clock(); for (n = 500000000u; n > 0u; n--) { result = nextafter(lfsr64(), 0.0); result = nextafter(random64(), 1.0 / 0.0); } t2 = clock(); tt = t2 - t0; t2 -= t1; t1 -= t0; t0 = t2 - t1; printf("\n" "nop() %4lu.%06lu 0\n" "nextafter() %4lu.%06lu %4lu.%06lu\n" " %4lu.%06lu nano-seconds\n", t1 / CLOCKS_PER_SEC, (t1 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC, t2 / CLOCKS_PER_SEC, (t2 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC, t0 / CLOCKS_PER_SEC, (t0 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC, tt / CLOCKS_PER_SEC, (tt % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC); } [stefan@...e ~]$ gcc -O3 -lm bench.c [stefan@...e ~]$ perf stat ./a.out nop() 1.480000 0 nextafter() 10.060000 8.580000 11.540000 nano-seconds Performance counter stats for './a.out': 11,548.78 msec task-clock:u # 1.000 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 145 page-faults:u # 0.013 K/sec 38,917,213,536 cycles:u # 3.370 GHz (83.33%) 15,647,656,615 stalled-cycles-frontend:u # 40.21% frontend cycles idle (83.33%) 10,746,044,422 stalled-cycles-backend:u # 27.61% backend cycles idle (83.33%) 69,739,403,870 instructions:u # 1.79 insn per cycle # 0.22 stalled cycles per insn (83.33%) 16,495,748,110 branches:u # 1428.354 M/sec (83.33%) 500,701,246 branch-misses:u # 3.04% of all branches (83.33%) 11.550414029 seconds time elapsed 11.548265000 seconds user 0.000999000 seconds sys [stefan@...e ~]$ gcc -O3 bench.c patch.c patch.c:23: warning: "isnan" redefined #define isnan(x) ( \ In file included from patch.c:1: /usr/include/math.h:254: note: this is the location of the previous definition # define isnan(x) \ [stefan@...e ~]$ perf stat ./a.out nop() 1.480000 0 nextafter() 5.070000 3.590000 6.550000 nano-seconds Performance counter stats for './a.out': 6,558.44 msec task-clock:u # 1.000 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 122 page-faults:u # 0.019 K/sec 22,038,054,931 cycles:u # 3.360 GHz (83.33%) 204,849 stalled-cycles-frontend:u # 0.00% frontend cycles idle (83.34%) 5,497,340,276 stalled-cycles-backend:u # 24.94% backend cycles idle (83.34%) 39,751,746,482 instructions:u # 1.80 insn per cycle # 0.14 stalled cycles per insn (83.34%) 9,747,428,086 branches:u # 1486.242 M/sec (83.34%) 250,407,409 branch-misses:u # 2.57% of all branches (83.33%) 6.559550924 seconds time elapsed 6.558918000 seconds user 0.000000000 seconds sys [stefan@...e ~]$ gcc -O3 bench.c -DPATCH patch.c patch.c:18: warning: "isnan" redefined #define isnan(x) ( \ In file included from patch.c:1: /usr/include/math.h:254: note: this is the location of the previous definition # define isnan(x) \ [stefan@...e ~]$ perf stat ./a.out nop() 1.480000 0 nextafter() 2.000000 0.520000 3.480000 nano-seconds Performance counter stats for './a.out': 3,489.59 msec task-clock:u # 1.000 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 123 page-faults:u # 0.035 K/sec 11,721,075,764 cycles:u # 3.359 GHz (83.32%) 132,680 stalled-cycles-frontend:u # 0.00% frontend cycles idle (83.32%) 4,500,051,993 stalled-cycles-backend:u # 38.39% backend cycles idle (83.32%) 40,501,721,908 instructions:u # 3.46 insn per cycle # 0.11 stalled cycles per insn (83.33%) 8,494,571,027 branches:u # 2434.258 M/sec (83.35%) 497,230 branch-misses:u # 0.01% of all branches (83.34%) 3.490437579 seconds time elapsed 3.490053000 seconds user 0.000000000 seconds sys [stefan@...e ~]$ gcc -O3 patch.c nextafter-fp.c [stefan@...e ~]$ perf stat ./a.out nop() 1.490000 0 nextafter() 2.210000 0.720000 3.700000 nano-seconds Performance counter stats for './a.out': 3,702.89 msec task-clock:u # 1.000 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 122 page-faults:u # 0.033 K/sec 12,407,345,183 cycles:u # 3.351 GHz (83.32%) 135,817 stalled-cycles-frontend:u # 0.00% frontend cycles idle (83.34%) 5,498,895,906 stalled-cycles-backend:u # 44.32% backend cycles idle (83.34%) 38,002,430,460 instructions:u # 3.06 insn per cycle # 0.14 stalled cycles per insn (83.34%) 7,497,381,393 branches:u # 2024.735 M/sec (83.34%) 497,462 branch-misses:u # 0.01% of all branches (83.32%) 3.703648875 seconds time elapsed 3.703294000 seconds user 0.000000000 seconds sys [stefan@...e ~]$ gcc -O3 bench.c nextafter.s [stefan@...e ~]$ perf stat ./a.out nop() 1.630000 0 nextafter() 1.910000 0.280000 3.540000 nano-seconds Performance counter stats for './a.out': 3,547.12 msec task-clock:u # 1.000 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 123 page-faults:u # 0.035 K/sec 11,949,840,797 cycles:u # 3.369 GHz (83.32%) 129,627 stalled-cycles-frontend:u # 0.00% frontend cycles idle (83.34%) 3,998,960,716 stalled-cycles-backend:u # 33.46% backend cycles idle (83.34%) 37,493,523,285 instructions:u # 3.14 insn per cycle # 0.11 stalled cycles per insn (83.34%) 7,998,559,192 branches:u # 2254.945 M/sec (83.34%) 497,565 branch-misses:u # 0.01% of all branches (83.32%) 3.547999008 seconds time elapsed 3.546671000 seconds user 0.000999000 seconds sys [stefan@...e ~]$
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.