|
Message-Id: <1423761423-30050-2-git-send-email-vda.linux@googlemail.com>
Date: Thu, 12 Feb 2015 18:17:03 +0100
From: Denys Vlasenko <vda.linux@...glemail.com>
To: musl@...ts.openwall.com, Rich Felker <dalias@...c.org>
Cc: Denys Vlasenko <vda.linux@...glemail.com>
Subject: [PATCH 2/2] x86_64/memset: align destination to 8 byte boundary

8-byte alignment gives ~25% speedup on "rep stosq" memsets
to L1 cache, compared to intentionally misaligned ones.
It is a smaller win of ~15% on larger memsets to L2 too.
Measured on Intel Sandy Bridge CPU (i7-2620M, 2.70GHz)

Signed-off-by: Denys Vlasenko <vda.linux@...glemail.com>
---
 src/string/x86_64/memset.s | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s
index 523caa0..5c9e333 100644
--- a/src/string/x86_64/memset.s
+++ b/src/string/x86_64/memset.s
@@ -4,16 +4,23 @@ memset:
 	movzbq %sil,%rax
 	cmp $16,%rdx
 	jb .Less_than_16
+
 	test %esi,%esi
 	jnz .L_widen_rax # unlikely
 .L_widened:
 
-	lea -1(%rdx),%rcx
 	mov %rdi,%r8
+
+	test $7,%dil
+	jnz .L_align # unlikely
+.L_aligned:
+
+	lea -1(%rdx),%rcx
 	shr $3,%rcx
 	mov %rax,-8(%rdi,%rdx)
 	rep
 	stosq
+
 	mov %r8,%rax
 	ret
 
@@ -23,6 +30,19 @@ memset:
 	imul %rsi,%rax
 	jmp .L_widened
 
+# 8-byte alignment gives ~25% speedup on "rep stosq" memsets
+# to L1 cache, compared to intentionally misaligned ones.
+# It is a smaller win of ~15% on larger memsets to L2 too.
+# Measured on Intel Sandy Bridge CPU (i7-2620M, 2.70GHz)
+.L_align:
+	mov %rax,(%rdi)
+1:	inc %rdi
+	dec %rdx
+	test $7,%dil
+	jnz 1b
+	jmp .L_aligned
+
+
 .Less_than_16:
 	test %edx,%edx
 	jz .L_ret
-- 
1.8.1.4
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.