From 8abc8b8f53d61cfd31305c864d9bbef6ff207abe Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Fri, 2 Aug 2024 11:42:09 +0800 Subject: [PATCH 1/7] [LoongArch] support relaxation of pcalau12i/ld.d to pcalau12i/addi.d --- elf/arch-loongarch.cc | 97 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 2 deletions(-) diff --git a/elf/arch-loongarch.cc b/elf/arch-loongarch.cc index b121d44a75..bf6eb2fdd7 100644 --- a/elf/arch-loongarch.cc +++ b/elf/arch-loongarch.cc @@ -364,10 +364,70 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { write_k12(loc, highest12(S + A, P)); break; case R_LARCH_GOT_PC_LO12: - write_k12(loc, GOT + G + A); + if (i >= 2 && get_r_delta(i-1) - get_r_delta(i-2) == 4) { + // pcalau12i/ld.d has been relaxed to pcalau12i/addi.d, and + // then the pair has been relaxed to pcaddi. + // loc stores 'ld.d', rewrite ld.d with pcaddi + *(ul32 *)loc = 0x1800'0000 | get_rd(*(ul32 *)loc); + write_j20(loc, (S + A - P) >> 2); + } else { + if (i >= 2 && + i + 1 < rels.size() && + sym.is_local(ctx) && + !sym.is_absolute() && + !sym.is_ifunc() && + ctx.arg.relax && + rels[i-1].r_type == R_LARCH_RELAX && + rels[i+1].r_type == R_LARCH_RELAX && + rels[i-2].r_type == R_LARCH_GOT_PC_HI20 && + rels[i-2].r_offset == rel.r_offset - 4) { + u32 insn1 = *(ul32 *)(contents.data() + rels[i-2].r_offset); + u32 insn2 = *(ul32 *)(contents.data() + rel.r_offset); + u32 rd = get_rd(insn1); + + if (rd == get_rd(insn2)) { + // pcalau12i/ld.d has been relaxed to pcalau12i/addi.d + // rewrite the ld.d with addi.d + *(ul32 *)loc = 0x02c00000 | rd | (rd << 5); + write_k12(loc, S + A); + break; + } + } + + // relax not applied. 
+ write_k12(loc, GOT + G + A); + } break; case R_LARCH_GOT_PC_HI20: - write_j20(loc, hi20(GOT + G + A, P)); + if (removed_bytes != 0) { + // The first instruction of pcalau12i/ld.d has been removed + assert(removed_bytes == 4); + break; + } else { + if (i + 3 < rels.size() && + sym.is_local(ctx) && + !sym.is_absolute() && + !sym.is_ifunc() && + ctx.arg.relax && + rels[i+1].r_type == R_LARCH_RELAX && + rels[i+3].r_type == R_LARCH_RELAX && + rels[i+2].r_type == R_LARCH_GOT_PC_LO12 && + rels[i+2].r_offset == rel.r_offset + 4) { + u32 insn1 = *(ul32 *)(contents.data() + rel.r_offset); + u32 insn2 = *(ul32 *)(contents.data() + rels[i+2].r_offset); + u32 rd = get_rd(insn1); + + if (rd == get_rd(insn2)) { + // pcalau12i/ld.d has been relaxed to pcalau12i/addi.d + // reloc the pcalau12i as R_LARCH_PLACA_HI20 + write_j20(loc, hi20(S + A, P)); + break; + } + } + + // relax not applied. + write_j20(loc, hi20(GOT + G + A, P)); + } break; case R_LARCH_GOT64_PC_LO20: write_j20(loc, higher20(GOT + G + A, P)); @@ -816,6 +876,39 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { get_rd(jirl) == 0 || get_rd(jirl) == 1) delta += 4; break; + case R_LARCH_GOT_PC_HI20: + // The following two instructions are used to load a + // symbol value from the GOT + // + // pcalau12i $t0, 0 # R_LARCH_GOT_PC_HI20 + // ld.d $t0, $t0, 0 # R_LARCH_GOT_PC_LO12 + // + // If the symbol is defined in the file current relocation belongs to, + // we can relax them to the following instructions and avoid memory load. 
+ // + // pcalau12i $t0, 0 + // addi.d $t0, $t0, 0 + if (sym.is_local(ctx) && + !sym.is_absolute() && + !sym.is_ifunc() && + ctx.arg.relax && + i + 3 < rels.size() && + rels[i + 2].r_type == R_LARCH_GOT_PC_LO12 && + rels[i + 2].r_offset == rels[i].r_offset + 4 && + rels[i + 3].r_type == R_LARCH_RELAX) { + u32 insn1 = *(ul32 *)(isec.contents.data() + rels[i].r_offset); + u32 insn2 = *(ul32 *)(isec.contents.data() + rels[i].r_offset + 4); + + // relax pcalau12i/ld.d to pcalau12i/addi.d + if (get_rd(insn1) != get_rd(insn2)) + continue; + + i64 dist = compute_distance(ctx, sym, isec, r); + // the second phase: relax pcalau12i/addi.d to pcaddi + if (dist % 4 == 0 && -(1 << 21) < dist && dist < (1 << 21)) + delta += 4; + } + break; } } From daedbf74030ee7a029a307372dc5ac15fd909622 Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Sat, 3 Aug 2024 10:16:51 +0800 Subject: [PATCH 2/7] LoongArch: add test for relaxation of pcalau12i/ld.d to pcalau12i/addi.d or pcaddi --- test/elf/loongarch64_relax-got.sh | 64 +++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100755 test/elf/loongarch64_relax-got.sh diff --git a/test/elf/loongarch64_relax-got.sh b/test/elf/loongarch64_relax-got.sh new file mode 100755 index 0000000000..86b0be383e --- /dev/null +++ b/test/elf/loongarch64_relax-got.sh @@ -0,0 +1,64 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat <<'EOF' | $CC -g -o $t/a.o -c -xassembler - +.globl get_sym1, get_sym2, get_sym3, get_sym4, get_sym5 +get_sym1: + la.global $a0, sym1 + ld.w $a0, $a0, 0 + ret +get_sym2: + la.global $a0, sym2 + ld.w $a0, $a0, 0 + ret +get_sym3: + la.global $a0, sym3 + ld.w $a0, $a0, 0 + ret +get_sym4: + la.global $a0, sym4 + ld.w $a0, $a0, 0 + ret +get_sym5: + la.global $a0, sym5 + ld.w $a0, $a0, 0 + ret +EOF + +cat < + +int get_sym1(); +int get_sym2(); +int get_sym3(); +int get_sym4(); +int get_sym5(); + +int main() { + printf("%x %x %x %x %x\n", + get_sym1(), get_sym2(), get_sym3(), get_sym4(), get_sym5()); +} +EOF + +$CC -B. 
-g -o $t/exe1 $t/a.o $t/b.o $t/c.o -Wl,--no-relax +$QEMU $t/exe1 | grep -Eq '^0 ba beef 11beef deadbeef$' + +$CC -B. -g -o $t/exe2 $t/a.o $t/b.o $t/c.o +$QEMU $t/exe2 | grep -Eq '^0 ba beef 11beef deadbeef$' + +$OBJDUMP -d $t/exe2 | grep -A2 ':' | grep -Eq $'pcaddi' From d8b7c48928bffcb17459c27ae049dcb481662528 Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Sat, 3 Aug 2024 10:34:53 +0800 Subject: [PATCH 3/7] LoongArch refactor --- elf/arch-loongarch.cc | 56 ++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/elf/arch-loongarch.cc b/elf/arch-loongarch.cc index bf6eb2fdd7..31ccaddb1a 100644 --- a/elf/arch-loongarch.cc +++ b/elf/arch-loongarch.cc @@ -399,36 +399,38 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { } break; case R_LARCH_GOT_PC_HI20: - if (removed_bytes != 0) { - // The first instruction of pcalau12i/ld.d has been removed - assert(removed_bytes == 4); - break; - } else { - if (i + 3 < rels.size() && - sym.is_local(ctx) && - !sym.is_absolute() && - !sym.is_ifunc() && - ctx.arg.relax && - rels[i+1].r_type == R_LARCH_RELAX && - rels[i+3].r_type == R_LARCH_RELAX && - rels[i+2].r_type == R_LARCH_GOT_PC_LO12 && - rels[i+2].r_offset == rel.r_offset + 4) { - u32 insn1 = *(ul32 *)(contents.data() + rel.r_offset); - u32 insn2 = *(ul32 *)(contents.data() + rels[i+2].r_offset); - u32 rd = get_rd(insn1); - - if (rd == get_rd(insn2)) { - // pcalau12i/ld.d has been relaxed to pcalau12i/addi.d - // reloc the pcalau12i as R_LARCH_PLACA_HI20 - write_j20(loc, hi20(S + A, P)); - break; + switch (removed_bytes) { + // pcalau12i/ld.d has been relaxed to pcaddi, the first insn has been removed. 
+ case 4: + break; + case 0: + if (i + 3 < rels.size() && + sym.is_local(ctx) && + !sym.is_absolute() && + !sym.is_ifunc() && + ctx.arg.relax && + rels[i+1].r_type == R_LARCH_RELAX && + rels[i+3].r_type == R_LARCH_RELAX && + rels[i+2].r_type == R_LARCH_GOT_PC_LO12 && + rels[i+2].r_offset == rel.r_offset + 4) { + u32 insn1 = *(ul32 *)(contents.data() + rel.r_offset); + u32 insn2 = *(ul32 *)(contents.data() + rels[i+2].r_offset); + u32 rd = get_rd(insn1); + + if (rd == get_rd(insn2)) { + // pcalau12i/ld.d has been relaxed to pcalau12i/addi.d + // reloc the pcalau12i as R_LARCH_PLACA_HI20 + write_j20(loc, hi20(S + A, P)); + break; + } } - } - // relax not applied. - write_j20(loc, hi20(GOT + G + A, P)); + // relax not applied. + write_j20(loc, hi20(GOT + G + A, P)); + break; + default: + unreachable(); } - break; case R_LARCH_GOT64_PC_LO20: write_j20(loc, higher20(GOT + G + A, P)); break; From 9cb023774dfa7ecf458e4fbda63302b91ee3fde5 Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Sat, 3 Aug 2024 11:42:31 +0800 Subject: [PATCH 4/7] LoongArch: disable debug sections for test --- test/elf/loongarch64_relax-got.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/elf/loongarch64_relax-got.sh b/test/elf/loongarch64_relax-got.sh index 86b0be383e..365610eb24 100755 --- a/test/elf/loongarch64_relax-got.sh +++ b/test/elf/loongarch64_relax-got.sh @@ -1,7 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -cat <<'EOF' | $CC -g -o $t/a.o -c -xassembler - +cat <<'EOF' | $CC -o $t/a.o -c -xassembler - .globl get_sym1, get_sym2, get_sym3, get_sym4, get_sym5 get_sym1: la.global $a0, sym1 @@ -25,7 +25,7 @@ get_sym5: ret EOF -cat < int get_sym1(); @@ -55,10 +55,10 @@ int main() { } EOF -$CC -B. -g -o $t/exe1 $t/a.o $t/b.o $t/c.o -Wl,--no-relax +$CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -Wl,--no-relax $QEMU $t/exe1 | grep -Eq '^0 ba beef 11beef deadbeef$' -$CC -B. -g -o $t/exe2 $t/a.o $t/b.o $t/c.o +$CC -B. 
-o $t/exe2 $t/a.o $t/b.o $t/c.o $QEMU $t/exe2 | grep -Eq '^0 ba beef 11beef deadbeef$' $OBJDUMP -d $t/exe2 | grep -A2 ':' | grep -Eq $'pcaddi' From 2d622660a0a16c80e65dee2cabcceadabcb02a7e Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Sat, 3 Aug 2024 15:29:40 +0800 Subject: [PATCH 5/7] refactor relaxation of pcalau12i/ld.d and adjust test --- elf/arch-loongarch.cc | 76 +++++++++++-------------------- test/elf/loongarch64_relax-got.sh | 49 ++++---------------- 2 files changed, 36 insertions(+), 89 deletions(-) diff --git a/elf/arch-loongarch.cc b/elf/arch-loongarch.cc index 31ccaddb1a..d43d59cbd8 100644 --- a/elf/arch-loongarch.cc +++ b/elf/arch-loongarch.cc @@ -118,6 +118,10 @@ static u32 get_rd(u32 insn) { return insn & 0x1f; } +static u32 get_rj(u32 insn) { + return (insn >> 5) & 0x1f; +} + static void set_rj(u8 *loc, u32 rj) { assert(rj < 32); *(ul32 *)loc &= 0b111111'1111111111111111'00000'11111; @@ -364,63 +368,36 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { write_k12(loc, highest12(S + A, P)); break; case R_LARCH_GOT_PC_LO12: - if (i >= 2 && get_r_delta(i-1) - get_r_delta(i-2) == 4) { - // pcalau12i/ld.d has been relaxed to pcalau12i/addi.d, and - // then the pair has been relaxed to pcaddi. 
- // loc stores 'ld.d', rewrite ld.d with pcaddi - *(ul32 *)loc = 0x1800'0000 | get_rd(*(ul32 *)loc); - write_j20(loc, (S + A - P) >> 2); - } else { - if (i >= 2 && - i + 1 < rels.size() && - sym.is_local(ctx) && - !sym.is_absolute() && - !sym.is_ifunc() && - ctx.arg.relax && - rels[i-1].r_type == R_LARCH_RELAX && - rels[i+1].r_type == R_LARCH_RELAX && - rels[i-2].r_type == R_LARCH_GOT_PC_HI20 && - rels[i-2].r_offset == rel.r_offset - 4) { - u32 insn1 = *(ul32 *)(contents.data() + rels[i-2].r_offset); - u32 insn2 = *(ul32 *)(contents.data() + rel.r_offset); - u32 rd = get_rd(insn1); - - if (rd == get_rd(insn2)) { - // pcalau12i/ld.d has been relaxed to pcalau12i/addi.d - // rewrite the ld.d with addi.d - *(ul32 *)loc = 0x02c00000 | rd | (rd << 5); - write_k12(loc, S + A); - break; - } - } - - // relax not applied. - write_k12(loc, GOT + G + A); - } + write_k12(loc, GOT + G + A); break; case R_LARCH_GOT_PC_HI20: switch (removed_bytes) { // pcalau12i/ld.d has been relaxed to pcaddi, the first insn has been removed. 
case 4: + // loc stores 'ld.d', rewrite ld.d with pcaddi + *(ul32 *)(loc) = 0x1800'0000 | get_rd(*(ul32 *)loc); + write_j20(loc, (S + A - P) >> 2); break; case 0: if (i + 3 < rels.size() && - sym.is_local(ctx) && - !sym.is_absolute() && - !sym.is_ifunc() && + sym.is_pcrel_linktime_const(ctx); ctx.arg.relax && - rels[i+1].r_type == R_LARCH_RELAX && - rels[i+3].r_type == R_LARCH_RELAX && - rels[i+2].r_type == R_LARCH_GOT_PC_LO12 && - rels[i+2].r_offset == rel.r_offset + 4) { + rels[i + 1].r_type == R_LARCH_RELAX && + rels[i + 3].r_type == R_LARCH_RELAX && + rels[i + 2].r_type == R_LARCH_GOT_PC_LO12 && + rels[i + 2].r_offset == rel.r_offset + 4) { u32 insn1 = *(ul32 *)(contents.data() + rel.r_offset); - u32 insn2 = *(ul32 *)(contents.data() + rels[i+2].r_offset); + u32 insn2 = *(ul32 *)(contents.data() + rels[i + 2].r_offset); u32 rd = get_rd(insn1); - if (rd == get_rd(insn2)) { - // pcalau12i/ld.d has been relaxed to pcalau12i/addi.d + if (rd == get_rd(insn2) && rd == get_rj(insn2)) { + // relax pcalau12i/ld.d to pcalau12i/addi.d // reloc the pcalau12i as R_LARCH_PLACA_HI20 write_j20(loc, hi20(S + A, P)); + + // rewrite the ld.d insn with addi.d insn + *(ul32 *)(loc + 4) = 0x02c00000 | rd | (rd << 5); + write_k12(loc + 4, S + rels[i + 2].r_addend); break; } } @@ -859,7 +836,8 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { bool is_addi_d = (insn2 & 0xffc0'0000) == 0x02c0'0000; if (dist % 4 == 0 && -(1 << 21) <= dist && dist < (1 << 21) && - is_addi_d && get_rd(insn1) == get_rd(insn2)) + is_addi_d && get_rd(insn1) == get_rd(insn2) && + get_rd(insn2) == get_rj(insn2)) delta += 4; } break; @@ -890,10 +868,8 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { // // pcalau12i $t0, 0 // addi.d $t0, $t0, 0 - if (sym.is_local(ctx) && - !sym.is_absolute() && - !sym.is_ifunc() && - ctx.arg.relax && + if (ctx.arg.relax && + sym.is_pcrel_linktime_const(ctx); i + 3 < rels.size() && rels[i + 2].r_type == R_LARCH_GOT_PC_LO12 && rels[i + 
2].r_offset == rels[i].r_offset + 4 && @@ -902,12 +878,12 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { u32 insn2 = *(ul32 *)(isec.contents.data() + rels[i].r_offset + 4); // relax pcalau12i/ld.d to pcalau12i/addi.d - if (get_rd(insn1) != get_rd(insn2)) + if (get_rd(insn1) != get_rd(insn2) || get_rd(insn2) != get_rj(insn2)) continue; i64 dist = compute_distance(ctx, sym, isec, r); // the second phase: relax pcalau12i/addi.d to pcaddi - if (dist % 4 == 0 && -(1 << 21) < dist && dist < (1 << 21)) + if (dist % 4 == 0 && -(1 << 21) <= dist && dist < (1 << 21)) delta += 4; } break; diff --git a/test/elf/loongarch64_relax-got.sh b/test/elf/loongarch64_relax-got.sh index 365610eb24..4e4df47741 100755 --- a/test/elf/loongarch64_relax-got.sh +++ b/test/elf/loongarch64_relax-got.sh @@ -2,63 +2,34 @@ . $(dirname $0)/common.inc cat <<'EOF' | $CC -o $t/a.o -c -xassembler - -.globl get_sym1, get_sym2, get_sym3, get_sym4, get_sym5 -get_sym1: - la.global $a0, sym1 - ld.w $a0, $a0, 0 - ret -get_sym2: - la.global $a0, sym2 - ld.w $a0, $a0, 0 - ret -get_sym3: - la.global $a0, sym3 - ld.w $a0, $a0, 0 - ret -get_sym4: - la.global $a0, sym4 - ld.w $a0, $a0, 0 - ret -get_sym5: - la.global $a0, sym5 +.globl get_sym +get_sym: + la.global $a0, sym ld.w $a0, $a0, 0 ret EOF cat < -int get_sym1(); -int get_sym2(); -int get_sym3(); -int get_sym4(); -int get_sym5(); +int get_sym(); int main() { - printf("%x %x %x %x %x\n", - get_sym1(), get_sym2(), get_sym3(), get_sym4(), get_sym5()); + printf("%x\n", get_sym()); } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -Wl,--no-relax -$QEMU $t/exe1 | grep -Eq '^0 ba beef 11beef deadbeef$' +$QEMU $t/exe1 | grep -Eq '^beef$' $CC -B. 
-o $t/exe2 $t/a.o $t/b.o $t/c.o -$QEMU $t/exe2 | grep -Eq '^0 ba beef 11beef deadbeef$' +$QEMU $t/exe2 | grep -Eq '^beef$' -$OBJDUMP -d $t/exe2 | grep -A2 ':' | grep -Eq $'pcaddi' +$OBJDUMP -d $t/exe2 | grep -A2 ':' | grep -Eq $'pcaddi' From 7863cf0d62d50bf3ab0589816e436a3471ac5ab3 Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Sat, 3 Aug 2024 15:31:32 +0800 Subject: [PATCH 6/7] Skip unused relocations during relaxation --- elf/arch-loongarch.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/elf/arch-loongarch.cc b/elf/arch-loongarch.cc index d43d59cbd8..ecd2570a25 100644 --- a/elf/arch-loongarch.cc +++ b/elf/arch-loongarch.cc @@ -398,6 +398,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // rewrite the ld.d insn with addi.d insn *(ul32 *)(loc + 4) = 0x02c00000 | rd | (rd << 5); write_k12(loc + 4, S + rels[i + 2].r_addend); + i += 3; break; } } From 3094d7d33ddf1a87d52c40e21580427771d98f21 Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Mon, 5 Aug 2024 10:46:46 +0800 Subject: [PATCH 7/7] Skip the unused relocations --- elf/arch-loongarch.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/elf/arch-loongarch.cc b/elf/arch-loongarch.cc index ecd2570a25..8e59d660f9 100644 --- a/elf/arch-loongarch.cc +++ b/elf/arch-loongarch.cc @@ -377,11 +377,12 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // loc stores 'ld.d', rewrite ld.d with pcaddi *(ul32 *)(loc) = 0x1800'0000 | get_rd(*(ul32 *)loc); write_j20(loc, (S + A - P) >> 2); + i += 3; break; case 0: - if (i + 3 < rels.size() && - sym.is_pcrel_linktime_const(ctx); - ctx.arg.relax && + if (ctx.arg.relax && + sym.is_pcrel_linktime_const(ctx) && + i + 3 < rels.size() && rels[i + 1].r_type == R_LARCH_RELAX && rels[i + 3].r_type == R_LARCH_RELAX && rels[i + 2].r_type == R_LARCH_GOT_PC_LO12 && @@ -409,6 +410,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { default: unreachable(); } + break; case R_LARCH_GOT64_PC_LO20: write_j20(loc, higher20(GOT + 
G + A, P)); break; @@ -870,7 +872,7 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { // pcalau12i $t0, 0 // addi.d $t0, $t0, 0 if (ctx.arg.relax && - sym.is_pcrel_linktime_const(ctx); + sym.is_pcrel_linktime_const(ctx) && i + 3 < rels.size() && rels[i + 2].r_type == R_LARCH_GOT_PC_LO12 && rels[i + 2].r_offset == rels[i].r_offset + 4 &&