From c11ba119569609cfe3f910891405452306bdf303 Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Sun, 19 Mar 2017 03:56:01 +0000 Subject: [PATCH 5/5] define FP_FAST_FMA and FP_FAST_FMAF when fma and fmaf can be inlined FP_FAST_FMA can be defined if "the fma function generally executes about as fast as, or faster than, a multiply and an add of double operands", which can only be true if the fma call is inlined as an instruction. gcc sets __FP_FAST_FMA if __builtin_fma is inlined as an instruction, but that does not mean an fma call will be inlined (e.g. it is defined with -fno-builtin-fma), other compilers (clang) don't even have such a macro, so there is no reliable way to tell when fma is inlined. one approach is to define FP_FAST_FMA based on the libc implementation: when it has a single instruction implementation, then the compiler should also be able to do the inlining and in case that fails at least the libc code is still fast (there is just an extern call overhead). on aarch64, powerpc, powerpc64, s390x we can give this guarantee, but on arm, x32 and x86_64 runtime checks would be needed to do the same. for now arm, x32 and x86_64 set FP_FAST_FMA when the compiler should be able to inline fma, but if that fails the libc code will be slow (unless musl is built for an isa baseline that includes an fma instruction). 
--- arch/aarch64/bits/math.h | 2 ++ arch/arm/bits/math.h | 4 ++++ arch/generic/bits/math.h | 0 arch/powerpc/bits/math.h | 4 ++++ arch/powerpc64/bits/math.h | 2 ++ arch/s390x/bits/math.h | 2 ++ arch/x32/bits/math.h | 4 ++++ arch/x86_64/bits/math.h | 4 ++++ include/math.h | 2 ++ 9 files changed, 24 insertions(+) create mode 100644 arch/aarch64/bits/math.h create mode 100644 arch/arm/bits/math.h create mode 100644 arch/generic/bits/math.h create mode 100644 arch/powerpc/bits/math.h create mode 100644 arch/powerpc64/bits/math.h create mode 100644 arch/s390x/bits/math.h create mode 100644 arch/x32/bits/math.h create mode 100644 arch/x86_64/bits/math.h diff --git a/arch/aarch64/bits/math.h b/arch/aarch64/bits/math.h new file mode 100644 index 00000000..c7ec28c5 --- /dev/null +++ b/arch/aarch64/bits/math.h @@ -0,0 +1,2 @@ +#define FP_FAST_FMA 1 +#define FP_FAST_FMAF 1 diff --git a/arch/arm/bits/math.h b/arch/arm/bits/math.h new file mode 100644 index 00000000..2fbf371c --- /dev/null +++ b/arch/arm/bits/math.h @@ -0,0 +1,4 @@ +#if __ARM_FEATURE_FMA && (__ARM_FP&12) == 12 +#define FP_FAST_FMA 1 +#define FP_FAST_FMAF 1 +#endif diff --git a/arch/generic/bits/math.h b/arch/generic/bits/math.h new file mode 100644 index 00000000..e69de29b diff --git a/arch/powerpc/bits/math.h b/arch/powerpc/bits/math.h new file mode 100644 index 00000000..3913b15e --- /dev/null +++ b/arch/powerpc/bits/math.h @@ -0,0 +1,4 @@ +#ifndef _SOFT_FLOAT +#define FP_FAST_FMA 1 +#define FP_FAST_FMAF 1 +#endif diff --git a/arch/powerpc64/bits/math.h b/arch/powerpc64/bits/math.h new file mode 100644 index 00000000..c7ec28c5 --- /dev/null +++ b/arch/powerpc64/bits/math.h @@ -0,0 +1,2 @@ +#define FP_FAST_FMA 1 +#define FP_FAST_FMAF 1 diff --git a/arch/s390x/bits/math.h b/arch/s390x/bits/math.h new file mode 100644 index 00000000..c7ec28c5 --- /dev/null +++ b/arch/s390x/bits/math.h @@ -0,0 +1,2 @@ +#define FP_FAST_FMA 1 +#define FP_FAST_FMAF 1 diff --git a/arch/x32/bits/math.h b/arch/x32/bits/math.h new file 
mode 100644 index 00000000..c7569d6c --- /dev/null +++ b/arch/x32/bits/math.h @@ -0,0 +1,4 @@ +#if __FMA__ || __FMA4__ +#define FP_FAST_FMA 1 +#define FP_FAST_FMAF 1 +#endif diff --git a/arch/x86_64/bits/math.h b/arch/x86_64/bits/math.h new file mode 100644 index 00000000..c7569d6c --- /dev/null +++ b/arch/x86_64/bits/math.h @@ -0,0 +1,4 @@ +#if __FMA__ || __FMA4__ +#define FP_FAST_FMA 1 +#define FP_FAST_FMAF 1 +#endif diff --git a/include/math.h b/include/math.h index fea34686..58da26c2 100644 --- a/include/math.h +++ b/include/math.h @@ -11,6 +11,8 @@ extern "C" { #define __NEED_double_t #include <bits/alltypes.h> +#include <bits/math.h> + #if 100*__GNUC__+__GNUC_MINOR__ >= 303 #define NAN __builtin_nanf("") #define INFINITY __builtin_inff() -- 2.18.0