.global memset
.type memset,@function
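# void *memset(void *dest, int c, size_t n)
# SysV AMD64 arguments: dest in %rdi, c in %esi, n in %rdx.
# Returns dest in %rax.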
memset:
	movzbq %sil,%rax  # rax = (unsigned char)c, zero-extended to 64 bits
	test %esi,%esi    # c == 0? then zero-extended rax is already the pattern
	jnz .L_widen_rax  # unlikely
.L_widened:

	mov %rdi,%r8      # save dest for the return value ("rep stosq" advances %rdi)

	cmp $16,%rdx
	jbe .Less_than_or_equal_16  # small fills bypass "rep stosq" entirely

	test $7,%dil      # is dest already 8-byte aligned?
	jnz .L_align  # unlikely
.L_aligned:

	lea -1(%rdx),%rcx
	shr $3,%rcx               # rcx = (n-1)/8 qwords to store with "rep stosq"
	mov %rax,-8(%rdi,%rdx)    # fill the last 8 bytes now; overlapping the
	                          # stosq area makes any tail length work
	rep stosq                 # fill rcx qwords at (%rdi), advancing %rdi
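	# Example: n = 21 with dest aligned: rcx = (21-1)/8 = 2, so stosq
	# fills bytes 0-15 and the tail store already filled bytes 13-20.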

	mov %r8,%rax      # return the original dest
	ret

.L_widen_rax:
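	# Multiplying by 0x0101010101010101 replicates the low byte into
	# every byte of rax: e.g. 0x2A * 0x0101010101010101 = 0x2A2A2A2A2A2A2A2A.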
	# 64-bit imul has 3-7 cycles latency
	mov $0x101010101010101,%rsi  # %rsi (c) is no longer needed, safe to clobber
	imul %rsi,%rax
	jmp .L_widened

# 8-byte alignment gives ~25% speedup on "rep stosq" memsets
# to L1 cache, compared to intentionally misaligned ones.
# The win is smaller, ~15%, on larger memsets that go to L2.
# Measured on Intel Sandy Bridge CPU (i7-2620M, 2.70GHz)
.L_align:
	mov %rax,(%rdi)   # one 8-byte store covers the up-to-7 unaligned bytes
	                  # we skip below (safe: n > 16 here)
1:	inc %rdi          # advance dest and shrink n until dest is 8-byte aligned
	dec %rdx
	test $7,%dil
	jnz 1b
	jmp .L_aligned


.Less_than_or_equal_16:
	jb 1f             # flags survive from "cmp $16,%rdx"; taken if n < 16
	# fill 8-16 bytes:
0:	mov %rax,(%rdi)          # bytes 0..7
	mov %rax,-8(%rdi,%rdx)   # bytes n-8..n-1 (may overlap the store above)
	mov %r8,%rax
	ret
1:	test $8,%dl       # n in 8..15? reuse the two overlapping stores above
	jnz 0b
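	# Example: n = 13 has bit 3 set, so the two stores above cover
	# bytes 0-7 and 5-12, i.e. all 13 bytes.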

	test $4,%dl       # n in 4..7?
	jz 1f
	# fill 4-7 bytes:
	mov %eax,(%rdi)          # bytes 0..3
	mov %eax,-4(%rdi,%rdx)   # bytes n-4..n-1
	mov %r8,%rax
	ret

	# fill 0-3 bytes:
1:	test $2,%dl       # n in 2..3?
	jz 1f
	mov %ax,(%rdi)    # bytes 0..1
	add $2,%rdi
1:	test $1,%dl       # odd n: one byte left
	jz 1f
	mov %al,(%rdi)
1:	mov %r8,%rax      # return the original dest
	ret
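# Record the function's size in the ELF symbol table (pairs with the
# .type directive above); purely informational for tools like objdump.
.size memset, .-memset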
