|
Message-Id: <1423845589-5920-1-git-send-email-vda.linux@googlemail.com>
Date: Fri, 13 Feb 2015 17:39:49 +0100
From: Denys Vlasenko <vda.linux@...glemail.com>
To: musl@...ts.openwall.com, Rich Felker <dalias@...c.org>
Cc: Denys Vlasenko <vda.linux@...glemail.com>
Subject: [PATCH] x86_64/memset: use "small block" code for blocks up to 30 bytes long

Before this change, we were using it only for 15-byte blocks and smaller.
Measurements on a Sandy Bridge CPU show that "rep stosq" setup time is high
enough to dominate speed of fills well above that size:

31 byte block: 3.279282 bytes/ns
30 byte block: 3.173499 bytes/ns
..
20 byte block: 2.116552 bytes/ns
..
16 byte block: 1.799337 bytes/ns
15 byte block: 5.074332 bytes/ns
14 byte block: 4.736135 bytes/ns
13 byte block: 4.398852 bytes/ns
12 byte block: 4.060479 bytes/ns
11 byte block: 3.723065 bytes/ns
10 byte block: 3.384556 bytes/ns
9 byte block: 2.867677 bytes/ns
8 byte block: 2.257382 bytes/ns
7 byte block: 1.975605 bytes/ns
6 byte block: 1.693388 bytes/ns
5 byte block: 1.411434 bytes/ns
4 byte block: 1.129147 bytes/ns
3 byte block: 0.847030 bytes/ns
2 byte block: 0.616008 bytes/ns
1 byte block: 0.308069 bytes/ns

The patch does not increase the number of branches, but is able to handle
blocks up to 30 bytes.
After the patch, timings are:

32 byte block: 3.384681 bytes/ns
31 byte block: 3.279118 bytes/ns
30 byte block: 10.128968 bytes/ns
29 byte block: 9.793798 bytes/ns
28 byte block: 9.456081 bytes/ns
27 byte block: 9.120555 bytes/ns
26 byte block: 8.782757 bytes/ns
25 byte block: 8.446654 bytes/ns
24 byte block: 8.109310 bytes/ns
23 byte block: 7.773063 bytes/ns
22 byte block: 7.434663 bytes/ns
21 byte block: 7.098760 bytes/ns
20 byte block: 6.760724 bytes/ns
19 byte block: 6.424286 bytes/ns
18 byte block: 6.086166 bytes/ns
17 byte block: 5.749441 bytes/ns
16 byte block: 5.411120 bytes/ns
15 byte block: 5.074234 bytes/ns
14 byte block: 3.947913 bytes/ns
13 byte block: 3.666643 bytes/ns
12 byte block: 3.384641 bytes/ns
11 byte block: 3.103178 bytes/ns
10 byte block: 2.821105 bytes/ns
9 byte block: 2.539481 bytes/ns
8 byte block: 2.257338 bytes/ns
7 byte block: 1.975530 bytes/ns
6 byte block: 1.693337 bytes/ns
5 byte block: 1.411388 bytes/ns
4 byte block: 1.129111 bytes/ns
3 byte block: 0.846994 bytes/ns
2 byte block: 0.615982 bytes/ns
1 byte block: 0.308056 bytes/ns

Signed-off-by: Denys Vlasenko <vda.linux@...glemail.com>
---
 src/string/x86_64/memset.s | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s
index ea61687..81adbb2 100644
--- a/src/string/x86_64/memset.s
+++ b/src/string/x86_64/memset.s
@@ -2,13 +2,13 @@
 .type memset,@function
 memset:
 	movzbq %sil,%rax
-	cmp $16,%rdx
-	jb .Less_than_16
-
 	test %esi,%esi
 	jnz .L_widen_rax # unlikely
 .L_widened:
 
+	cmp $31,%rdx
+	jb .Less_than_31
+
 	mov %rdi,%r8
 
 	test $7,%dil
@@ -43,7 +43,7 @@ memset:
 	jmp .L_aligned
 
 
-.Less_than_16:
+.Less_than_31:
 	test %edx,%edx
 	jz .L_ret
 
@@ -52,20 +52,18 @@ memset:
 	cmp $2,%edx
 	jbe .L_ret
 
-	mov %al,1(%rdi)
-	mov %al,-2(%rdi,%rdx)
-	# 32-bit imul has 3-4 cycles latency
-	imul $0x1010101,%eax
-	cmp $4,%edx
+	mov %ax,1(%rdi)
+	mov %ax,(-1-2)(%rdi,%rdx)
+	cmp $6,%edx
 	jbe .L_ret
 
-	mov %eax,(%rdi)
-	mov %eax,-4(%rdi,%rdx)
-	cmp $8,%edx
+	mov %eax,(1+2)(%rdi)
+	mov %eax,(-1-2-4)(%rdi,%rdx)
+	cmp $14,%edx
 	jbe .L_ret
 
-	mov %eax,4(%rdi)
-	mov %eax,-8(%rdi,%rdx)
+	mov %rax,(1+2+4)(%rdi)
+	mov %rax,(-1-2-4-8)(%rdi,%rdx)
 .L_ret:
 	mov %rdi,%rax
 	ret
--
1.8.1.4
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.