>From 75ad4e8ec4abc6ce1d801017679c9e9e50fdfcf5 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy
Date: Wed, 24 Apr 2019 23:29:05 +0000
Subject: [PATCH 2/3] x86: optimize fp_arch.h

Use inline asm constraints instead of volatile store: fp_barrier does
not need to drop excess precision when x87 fpu is used, fp_force_eval
uses memory constraint to drop excess precision, when sse2 math is
available xmm register constraint is used.

This saves 416 and 322 bytes in .text on x86_64 and i386 respectively.
---
Notes (not part of the commit message):
    x86 inline asm constraint letters used below: "x" is any SSE (xmm)
    register, "t" is the top of the x87 register stack, "m" is a memory
    operand. The asm templates are empty; only the operand constraints
    matter.

 arch/i386/fp_arch.h   | 48 +++++++++++++++++++++++++++++++++++++++++++
 arch/x32/fp_arch.h    | 40 ++++++++++++++++++++++++++++++++++++
 arch/x86_64/fp_arch.h | 40 ++++++++++++++++++++++++++++++++++++
 3 files changed, 128 insertions(+)
 create mode 100644 arch/i386/fp_arch.h
 create mode 100644 arch/x32/fp_arch.h
 create mode 100644 arch/x86_64/fp_arch.h

diff --git a/arch/i386/fp_arch.h b/arch/i386/fp_arch.h
new file mode 100644
index 00000000..33ac222d
--- /dev/null
+++ b/arch/i386/fp_arch.h
@@ -0,0 +1,48 @@
+#include <math.h>
+
+#ifdef __SSE2_MATH__
+#define FP_BARRIER(x) __asm__ __volatile__ ("" : "+x"(x))
+#define FP_EVAL(x) __asm__ __volatile__ ("" : "+x"(x))
+#else
+#define FP_BARRIER(x) __asm__ __volatile__ ("" : "+t"(x))
+#define FP_EVAL(x) __asm__ __volatile__ ("" : "+m"(x))
+#endif
+
+#define fp_barrierf fp_barrierf
+static inline float_t fp_barrierf(float_t x)
+{
+	FP_BARRIER(x);
+	return x;
+}
+
+#define fp_barrier fp_barrier
+static inline double_t fp_barrier(double_t x)
+{
+	FP_BARRIER(x);
+	return x;
+}
+
+#define fp_barrierl fp_barrierl
+static inline long double fp_barrierl(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+	return x;
+}
+
+#define fp_force_evalf fp_force_evalf
+static inline void fp_force_evalf(float x)
+{
+	FP_EVAL(x);
+}
+
+#define fp_force_eval fp_force_eval
+static inline void fp_force_eval(double x)
+{
+	FP_EVAL(x);
+}
+
+#define fp_force_evall fp_force_evall
+static inline void fp_force_evall(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+}
diff --git a/arch/x32/fp_arch.h b/arch/x32/fp_arch.h
new file mode 100644
index 00000000..af4309d9
--- /dev/null
+++ b/arch/x32/fp_arch.h
@@ -0,0 +1,40 @@
+#include <math.h>
+
+#define fp_barrierf fp_barrierf
+static inline float_t fp_barrierf(float_t x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+	return x;
+}
+
+#define fp_barrier fp_barrier
+static inline double_t fp_barrier(double_t x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+	return x;
+}
+
+#define fp_barrierl fp_barrierl
+static inline long double fp_barrierl(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+	return x;
+}
+
+#define fp_force_evalf fp_force_evalf
+static inline void fp_force_evalf(float x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_eval fp_force_eval
+static inline void fp_force_eval(double x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_evall fp_force_evall
+static inline void fp_force_evall(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+}
diff --git a/arch/x86_64/fp_arch.h b/arch/x86_64/fp_arch.h
new file mode 100644
index 00000000..af4309d9
--- /dev/null
+++ b/arch/x86_64/fp_arch.h
@@ -0,0 +1,40 @@
+#include <math.h>
+
+#define fp_barrierf fp_barrierf
+static inline float_t fp_barrierf(float_t x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+	return x;
+}
+
+#define fp_barrier fp_barrier
+static inline double_t fp_barrier(double_t x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+	return x;
+}
+
+#define fp_barrierl fp_barrierl
+static inline long double fp_barrierl(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+	return x;
+}
+
+#define fp_force_evalf fp_force_evalf
+static inline void fp_force_evalf(float x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_eval fp_force_eval
+static inline void fp_force_eval(double x)
+{
+	__asm__ __volatile__ ("" : "+x"(x));
+}
+
+#define fp_force_evall fp_force_evall
+static inline void fp_force_evall(long double x)
+{
+	__asm__ __volatile__ ("" : "+t"(x));
+}
-- 
2.21.0