From c11ba119569609cfe3f910891405452306bdf303 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <nsz@port70.net>
Date: Sun, 19 Mar 2017 03:56:01 +0000
Subject: [PATCH 5/5] define FP_FAST_FMA and FP_FAST_FMAF when fma and fmaf can
 be inlined

FP_FAST_FMA can be defined if "the fma function generally executes about
as fast as, or faster than, a multiply and an add of double operands",
which can only be true if the fma call is inlined as an instruction.

gcc sets __FP_FAST_FMA if __builtin_fma is inlined as an instruction,
but that does not mean an fma call will be inlined (e.g. the macro is
still defined with -fno-builtin-fma); other compilers (clang) don't even
have such a macro, so there is no reliable way to tell when fma is inlined.

one approach is to define FP_FAST_FMA based on the libc implementation:
when it has a single-instruction implementation, the compiler should
also be able to do the inlining, and in case that fails at least the libc
code is still fast (there is just an extern call overhead).

on aarch64, powerpc (hard-float), powerpc64 and s390x we can give this
guarantee, but on arm, x32 and x86_64 runtime checks would be needed to
do the same.

for now arm, x32 and x86_64 set FP_FAST_FMA when the compiler should be
able to inline fma, but if that fails the libc code will be slow (unless
musl is built for an isa baseline that includes an fma instruction).
---
 arch/aarch64/bits/math.h   | 2 ++
 arch/arm/bits/math.h       | 4 ++++
 arch/generic/bits/math.h   | 0
 arch/powerpc/bits/math.h   | 4 ++++
 arch/powerpc64/bits/math.h | 2 ++
 arch/s390x/bits/math.h     | 2 ++
 arch/x32/bits/math.h       | 4 ++++
 arch/x86_64/bits/math.h    | 4 ++++
 include/math.h             | 2 ++
 9 files changed, 24 insertions(+)
 create mode 100644 arch/aarch64/bits/math.h
 create mode 100644 arch/arm/bits/math.h
 create mode 100644 arch/generic/bits/math.h
 create mode 100644 arch/powerpc/bits/math.h
 create mode 100644 arch/powerpc64/bits/math.h
 create mode 100644 arch/s390x/bits/math.h
 create mode 100644 arch/x32/bits/math.h
 create mode 100644 arch/x86_64/bits/math.h

diff --git a/arch/aarch64/bits/math.h b/arch/aarch64/bits/math.h
new file mode 100644
index 00000000..c7ec28c5
--- /dev/null
+++ b/arch/aarch64/bits/math.h
@@ -0,0 +1,2 @@
+#define FP_FAST_FMA 1
+#define FP_FAST_FMAF 1
diff --git a/arch/arm/bits/math.h b/arch/arm/bits/math.h
new file mode 100644
index 00000000..2fbf371c
--- /dev/null
+++ b/arch/arm/bits/math.h
@@ -0,0 +1,4 @@
+#if __ARM_FEATURE_FMA && (__ARM_FP&12) == 12
+#define FP_FAST_FMA 1
+#define FP_FAST_FMAF 1
+#endif
diff --git a/arch/generic/bits/math.h b/arch/generic/bits/math.h
new file mode 100644
index 00000000..e69de29b
diff --git a/arch/powerpc/bits/math.h b/arch/powerpc/bits/math.h
new file mode 100644
index 00000000..3913b15e
--- /dev/null
+++ b/arch/powerpc/bits/math.h
@@ -0,0 +1,4 @@
+#ifndef _SOFT_FLOAT
+#define FP_FAST_FMA 1
+#define FP_FAST_FMAF 1
+#endif
diff --git a/arch/powerpc64/bits/math.h b/arch/powerpc64/bits/math.h
new file mode 100644
index 00000000..c7ec28c5
--- /dev/null
+++ b/arch/powerpc64/bits/math.h
@@ -0,0 +1,2 @@
+#define FP_FAST_FMA 1
+#define FP_FAST_FMAF 1
diff --git a/arch/s390x/bits/math.h b/arch/s390x/bits/math.h
new file mode 100644
index 00000000..c7ec28c5
--- /dev/null
+++ b/arch/s390x/bits/math.h
@@ -0,0 +1,2 @@
+#define FP_FAST_FMA 1
+#define FP_FAST_FMAF 1
diff --git a/arch/x32/bits/math.h b/arch/x32/bits/math.h
new file mode 100644
index 00000000..c7569d6c
--- /dev/null
+++ b/arch/x32/bits/math.h
@@ -0,0 +1,4 @@
+#if __FMA__ || __FMA4__
+#define FP_FAST_FMA 1
+#define FP_FAST_FMAF 1
+#endif
diff --git a/arch/x86_64/bits/math.h b/arch/x86_64/bits/math.h
new file mode 100644
index 00000000..c7569d6c
--- /dev/null
+++ b/arch/x86_64/bits/math.h
@@ -0,0 +1,4 @@
+#if __FMA__ || __FMA4__
+#define FP_FAST_FMA 1
+#define FP_FAST_FMAF 1
+#endif
diff --git a/include/math.h b/include/math.h
index fea34686..58da26c2 100644
--- a/include/math.h
+++ b/include/math.h
@@ -11,6 +11,8 @@ extern "C" {
 #define __NEED_double_t
 #include <bits/alltypes.h>
 
+#include <bits/math.h>
+
 #if 100*__GNUC__+__GNUC_MINOR__ >= 303
 #define NAN       __builtin_nanf("")
 #define INFINITY  __builtin_inff()
-- 
2.18.0