Message-Id: <20230607100710.4286-2-zhang_fei_0403@163.com>
Date: Wed, 7 Jun 2023 18:07:08 +0800
From: zhangfei <zhang_fei_0403@....com>
To: dalias@...c.org, musl@...ts.openwall.com
Cc: zhangfei <zhangfei@...iscas.ac.cn>
Subject: [PATCH 1/3] RISC-V: Optimize memset

From: zhangfei <zhangfei@...iscas.ac.cn>

This code is based on linux/arch/riscv/lib/memset.S. The macro
definitions have been removed and the code modified to support RISC-V 64
only. When the fill size is less than 16 bytes, or for the tail left
over after the unrolled store loop, bytewise stores are used. That byte
path follows musl/src/string/memset.c, modified to fill the head and
tail with minimal branching.

Signed-off-by: Zhang Fei <zhangfei@...iscas.ac.cn>
---
 src/string/riscv64/memset.S | 136 ++++++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 src/string/riscv64/memset.S

diff --git a/src/string/riscv64/memset.S b/src/string/riscv64/memset.S
new file mode 100644
index 0000000..f8663d7
--- /dev/null
+++ b/src/string/riscv64/memset.S
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 Regents of the University of California
+ */
+
+#define SZREG 8
+#define REG_S sd
+
+.global memset
+.type memset,@function
+memset:
+	move t0, a0	/* Preserve return value */
+
+	/* Defer to byte-oriented fill for small sizes */
+	sltiu a3, a2, 16
+	bnez a3, 4f
+
+	/*
+	 * Round to nearest XLEN-aligned address
+	 * greater than or equal to the start address
+	 */
+	addi a3, t0, SZREG-1
+	andi a3, a3, ~(SZREG-1)
+	beq a3, t0, 2f	/* Skip if already aligned */
+	/* Handle initial misalignment */
+	sub a4, a3, t0
+1:
+	sb a1, 0(t0)
+	addi t0, t0, 1
+	bltu t0, a3, 1b
+	sub a2, a2, a4	/* Update count */
+
+2:	/* Broadcast value into all bytes of a1 */
+	andi a1, a1, 0xff
+	slli a3, a1, 8
+	or a1, a3, a1
+	slli a3, a1, 16
+	or a1, a3, a1
+	slli a3, a1, 32
+	or a1, a3, a1
+
+	/* Calculate end address */
+	andi a4, a2, ~(SZREG-1)
+	add a3, t0, a4
+
+	andi a4, a4, 31*SZREG	/* Calculate remainder */
+	beqz a4, 3f		/* Shortcut if no remainder */
+	neg a4, a4
+	addi a4, a4, 32*SZREG	/* Calculate initial offset */
+
+	/* Adjust start address with offset */
+	sub t0, t0, a4
+
+	/* Jump into loop body */
+	/* Assumes 32-bit instruction lengths */
+	la a5, 3f
+	srli a4, a4, 1
+	add a5, a5, a4
+	jr a5
+3:
+	REG_S a1, 0(t0)
+	REG_S a1, SZREG(t0)
+	REG_S a1, 2*SZREG(t0)
+	REG_S a1, 3*SZREG(t0)
+	REG_S a1, 4*SZREG(t0)
+	REG_S a1, 5*SZREG(t0)
+	REG_S a1, 6*SZREG(t0)
+	REG_S a1, 7*SZREG(t0)
+	REG_S a1, 8*SZREG(t0)
+	REG_S a1, 9*SZREG(t0)
+	REG_S a1, 10*SZREG(t0)
+	REG_S a1, 11*SZREG(t0)
+	REG_S a1, 12*SZREG(t0)
+	REG_S a1, 13*SZREG(t0)
+	REG_S a1, 14*SZREG(t0)
+	REG_S a1, 15*SZREG(t0)
+	REG_S a1, 16*SZREG(t0)
+	REG_S a1, 17*SZREG(t0)
+	REG_S a1, 18*SZREG(t0)
+	REG_S a1, 19*SZREG(t0)
+	REG_S a1, 20*SZREG(t0)
+	REG_S a1, 21*SZREG(t0)
+	REG_S a1, 22*SZREG(t0)
+	REG_S a1, 23*SZREG(t0)
+	REG_S a1, 24*SZREG(t0)
+	REG_S a1, 25*SZREG(t0)
+	REG_S a1, 26*SZREG(t0)
+	REG_S a1, 27*SZREG(t0)
+	REG_S a1, 28*SZREG(t0)
+	REG_S a1, 29*SZREG(t0)
+	REG_S a1, 30*SZREG(t0)
+	REG_S a1, 31*SZREG(t0)
+	addi t0, t0, 32*SZREG
+	bltu t0, a3, 3b
+	andi a2, a2, SZREG-1	/* Update count */
+
+4:
+	/* Handle trailing misalignment */
+	beqz a2, 6f
+	add a3, t0, a2
+5:
+	/* Fill head and tail with minimal branching. Each
+	 * conditional ensures that all the subsequently used
+	 * offsets are well-defined and in the dest region.
+	 */
+	sb a1, 0(t0)
+	sb a1, -1(a3)
+	li a4, 2
+	bgeu a4, a2, 6f
+
+	sb a1, 1(t0)
+	sb a1, 2(t0)
+	sb a1, -2(a3)
+	sb a1, -3(a3)
+	li a4, 6
+	bgeu a4, a2, 6f
+
+	sb a1, 3(t0)
+	sb a1, -4(a3)
+	li a4, 8
+	bgeu a4, a2, 6f
+
+	sb a1, 4(t0)
+	sb a1, 5(t0)
+	sb a1, -5(a3)
+	li a4, 11
+	bgeu a4, a2, 6f
+
+	sb a1, 6(t0)
+	sb a1, -6(a3)
+	sb a1, -7(a3)
+	li a4, 14
+	bgeu a4, a2, 6f
+
+	sb a1, 7(t0)
+6:
+	ret
-- 
2.34.1
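
A note for readers following the assembly: the slli/or sequence at label
2 broadcasts the fill byte into all eight bytes of a1. A minimal C
sketch of the same doubling pattern (the function name is illustrative,
not part of the patch):

#include <stdint.h>

/* Replicate the low byte into all 8 bytes of a 64-bit word, mirroring
 * the andi/slli/or sequence at label 2. */
static uint64_t splat_byte(unsigned char c)
{
	uint64_t w = c;   /* andi a1, a1, 0xff */
	w |= w << 8;      /* low 2 bytes set   */
	w |= w << 16;     /* low 4 bytes set   */
	w |= w << 32;     /* all 8 bytes set   */
	return w;
}

For example, splat_byte(0xcc) yields 0xcccccccccccccccc, which REG_S
then stores eight bytes at a time.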
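The computed jump before label 3 enters the 32-way unrolled block
partway through, so the first pass stores exactly the remainder words.
It relies on each REG_S being a 4-byte (32-bit) instruction that stores
SZREG = 8 data bytes, so the skip in instruction bytes is half the skip
in data bytes; compressed (RVC) stores would break this. A sketch of
the offset arithmetic, with a hypothetical helper name:

#include <stddef.h>

#define SZREG 8  /* bytes stored per REG_S (sd) */

/* How many instruction bytes past label 3 execution should start,
 * given `count` bytes to fill (a2 after the alignment prologue). */
static size_t jump_offset(size_t count)
{
	size_t word_bytes = count & ~(size_t)(SZREG - 1); /* andi a4, a2, ~(SZREG-1) */
	size_t rem = word_bytes & (31 * SZREG);           /* andi a4, a4, 31*SZREG */
	if (rem == 0)
		return 0;                                 /* beqz a4, 3f */
	size_t skip = 32 * SZREG - rem;                   /* neg + addi: data bytes skipped */
	return skip / 2;                                  /* srli a4, a4, 1 */
}

The assembly also rewinds t0 by the same number of data bytes
(sub t0, t0, a4) so the fixed offsets in the entered block land on the
intended addresses.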
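The tail fill after label 5 transcribes the minimal-branching scheme of
musl/src/string/memset.c, extended to cover the up-to-15-byte sizes this
path can see. In C form (hypothetical function name; the thresholds 2,
6, 8, 11, and 14 match the assembly):

#include <stddef.h>

/* Fill s[0..n-1] (n <= 15) with c using overlapping stores from both
 * ends; each guard ensures the offsets that follow are in range. */
static void fill_head_tail(unsigned char *s, unsigned char c, size_t n)
{
	if (n == 0) return;                              /* beqz a2, 6f */
	s[0] = c; s[n-1] = c;
	if (n <= 2) return;
	s[1] = c; s[2] = c; s[n-2] = c; s[n-3] = c;
	if (n <= 6) return;
	s[3] = c; s[n-4] = c;
	if (n <= 8) return;
	s[4] = c; s[5] = c; s[n-5] = c;
	if (n <= 11) return;
	s[6] = c; s[n-6] = c; s[n-7] = c;
	if (n <= 14) return;
	s[7] = c;                                        /* n == 15 */
}

Because the stores may overlap, every length from 0 to 15 is covered
with at most five branches instead of a per-byte loop.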