-
Notifications
You must be signed in to change notification settings - Fork 42
Debugging miscompilations with bugpoint (using cross compilation and a remote host)
This is a short writeup of what I did in order to debug https://github.com/CTSRD-CHERI/llvm-project/issues/385 Most of it should apply to any miscompilation bug, but later steps are specific to this particular miscompilation.
First you will have to generate a reproducer (this does not need to be minimal since bugpoint will take care of that). The reproducer should be a simple program with a main function that prints some given output on success and different output on failure (miscompilation).
For the remainder of this page we will assume that the success output is "SUCCESSFUL" and the failure is either "FAILED" or "received signal 34" (i.e. a CHERI trap). Exiting with a non-zero exit code (or crashing) is also acceptable for the failure case.
For example:
/* Placeholder for the function that is miscompiled; fill in the real body. */
int miscompiled_function(int arg) { ... }
/*
 * Test driver: prints the success marker when the function returns the
 * expected value and traps otherwise, so success and failure are
 * distinguishable from the program's output and exit status.
 */
int main(void) {
if (miscompiled_function(1) == 1) {
printf("SUCCESSFUL");
} else {
__builtin_trap();
}
}
Ideally, you then create a pre-processed reproducer from that (or use the Makefile listed below to achieve that result).
For creating the reproducer I use the following Makefile to verify that the reproducer works.
# Default goal: build both the miscompiled and the known-good reproducer binary.
# `all` is a command name rather than a file, so declare it phony; otherwise a
# stray file called "all" would silently turn `make` into a no-op.
.PHONY: all
all: broken-reproducer.exe good-reproducer.exe

# Tunables (each can be overridden from the environment or the make command line)
CHERI_SDK_ROOT?=${HOME}/cheri/output/sdk
CHERI_SYSROOT?=$(CHERI_SDK_ROOT)/sysroot128
CHERI_CHERIBSD?=${HOME}/cheri/cheribsd
# Host alias passed to scp/ssh by the run-* targets
SSH_HOSTNAME?=cheribsd
# Optimization flags for the build that works and for the build that miscompiles
GOOD_OPTFLAGS=-O1
BROKEN_OPTFLAGS=-O2

# Compiler and compile flags
CC=$(CHERI_SDK_ROOT)/bin/clang
OPT_BIN=$(CHERI_SDK_ROOT)/bin/opt
# Flags passed to every $(CC) invocation in this Makefile.
# NOTE(review): the -MF.depend.getaddrinfo.o/-MTgetaddrinfo.o options and the
# libc include paths suggest this list was copied from the CheriBSD libc build
# of getaddrinfo.c; confirm and replace it with the flags of your own reproducer.
# The list contains a plain -O; every rule places its own optimization flag
# after $(CFLAGS) on the command line.
CFLAGS= -target cheri-unknown-freebsd13.0 -integrated-as -fcolor-diagnostics -mcpu=beri -fuse-ld=lld -Qunused-arguments -target cheri-unknown-freebsd13.0 --sysroot=$(CHERI_SYSROOT) -B$(CHERI_SYSROOT)/usr/bin -ftls-model=local-exec -ftls-model=initial-exec -ftls-model=initial-exec -O -pipe -G0 -mcpu=beri -EB -mabi=purecap -integrated-as -fpic -cheri-cap-table-abi=pcrel -Wno-deprecated-declarations -cheri=128 -mstack-alignment=16 -D__LP64__=1 -Qunused-arguments -Werror=cheri-bitwise-operations -msoft-float -DNO__SCCSID -DNO__RCSID -I$(CHERI_CHERIBSD)/lib/libc/include -I$(CHERI_CHERIBSD)/include -I$(CHERI_CHERIBSD)/lib/libc/mips -DNLS -D__DBINTERFACE_PRIVATE -I$(CHERI_CHERIBSD)/contrib/gdtoa -DNO_COMPAT7 -I$(CHERI_CHERIBSD)/contrib/libc-vis -DINET6 -I$(CHERI_CHERIBSD)/lib/libc/resolv -D_ACL_PRIVATE -DPOSIX_MISTAKE -I$(CHERI_CHERIBSD)/lib/libmd -I$(CHERI_CHERIBSD)/contrib/jemalloc/include -DMALLOC_PRODUCTION -I$(CHERI_CHERIBSD)/contrib/tzcode/stdtime -I$(CHERI_CHERIBSD)/lib/libc/stdtime -I$(CHERI_CHERIBSD)/lib/libc/locale -DBROKEN_DES -DPORTMAP -DDES_BUILTIN -I$(CHERI_CHERIBSD)/lib/libc/rpc -I$(CHERI_CHERIBSD)/lib/libc/mips/softfloat -I$(CHERI_CHERIBSD)/lib/libc/softfloat -DSOFTFLOAT_FOR_GCC -DYP -DSYMBOL_VERSIONING -g -MD -MF.depend.getaddrinfo.o -MTgetaddrinfo.o -std=gnu99 -Wno-format-zero-length -nobuiltininc -Wsystem-headers -Werror -Wall -Wno-format-y2k -Wno-uninitialized -Wno-pointer-sign -Wno-error=pass-failed -Wno-error=misleading-indentation -Wno-empty-body -Wno-string-plus-int -Wno-unused-const-variable -Wno-tautological-compare -Wno-unused-value -Wno-parentheses-equality -Wno-unused-function -Wno-enum-conversion -Wno-unused-local-typedef -Wno-address-of-packed-member -Wno-switch -Wno-switch-enum -Wno-knr-promoted-parameter -Qunused-arguments -I$(CHERI_CHERIBSD)/lib/libutil -I$(CHERI_CHERIBSD)/lib/msun/mips -I$(CHERI_CHERIBSD)/lib/msun/src -I$(CHERI_CHERIBSD)/lib/libc/net
# Appended after the list above: adds -Wno-unused-variable and turns
# -Wunused-function back on (the list contains -Wno-unused-function).
CFLAGS+=-Wno-unused-variable -Wunused-function
# Passed to every link step: link against libc++
LDFLAGS=-lc++
# Print (do not run) the command line that compiles the IR file $(INPUT_FILE)
# into the executable $(OUTPUT_FILE) with the known-good optimization flags.
# Fails when either variable is missing.
.PHONY: echo-ir-compile-command
echo-ir-compile-command:
	@if test -n "$(INPUT_FILE)" && test -n "$(OUTPUT_FILE)"; then :; else echo "Must set INPUT_FILE and OUTPUT_FILE"; exit 1; fi
	@echo "$(CC) $(CFLAGS) $(GOOD_OPTFLAGS) $(LDFLAGS) -o $(OUTPUT_FILE) -x ir $(INPUT_FILE)"
# Pre-processed, self-contained copy of the reproducer.
# The original recipe had a stray "-" after -E, which makes the compiler treat
# stdin as an additional input file next to reproducer.c; only the source file
# itself is meant to be preprocessed.
reproducer.preprocessed.c: reproducer.c Makefile
	$(CC) $(CFLAGS) -E -o $@ $<
# Emit -O0 LLVM IR with -disable-O0-optnone.
# The generated file is deleted and the rule fails if any "Function Attrs"
# line still mentions optnone.
reproducer-O0.ll: reproducer.c Makefile
	$(CC) $(CFLAGS) -O0 -Xclang -disable-O0-optnone -emit-llvm -S -o $@ $<
	if grep "Function Attrs" "$@" | grep optnone; then \
		echo "Found optnone attribute in -O0 IR?"; \
		rm -f "$@"; \
		exit 1; \
	fi
# LLVM IR as produced by the frontend at -O1/-O2/-O3, with -disable-llvm-optzns.
# Static pattern rule: the stem ($*) is the optimization level (O1, O2 or O3)
# and is passed to the compiler as -$*.
reproducer-O1.ll reproducer-O2.ll reproducer-O3.ll: reproducer-%.ll: reproducer.c Makefile
	$(CC) $(CFLAGS) -$* -disable-llvm-optzns -emit-llvm -S -o $@ $<
# Build the reproducer straight from C, once with the flags that expose the
# miscompilation and once with the flags that are known to work.
broken-reproducer.exe: reproducer.c Makefile
	$(CC) $(CFLAGS) $(BROKEN_OPTFLAGS) $(LDFLAGS) -o $@ $<

good-reproducer.exe: reproducer.c Makefile
	$(CC) $(CFLAGS) $(GOOD_OPTFLAGS) $(LDFLAGS) -o $@ $<
# Build the reproducer from a previously generated IR file. The stem ($*)
# selects which reproducer-<stem>.ll is used as input (e.g. O0, O1, ...).
broken-reproducer-from-%-ir.exe: reproducer-%.ll Makefile
	$(CC) $(CFLAGS) $(BROKEN_OPTFLAGS) $(LDFLAGS) -o $@ -x ir reproducer-$*.ll

good-reproducer-from-%-ir.exe: reproducer-%.ll Makefile
	$(CC) $(CFLAGS) $(GOOD_OPTFLAGS) $(LDFLAGS) -o $@ -x ir reproducer-$*.ll
# Passes handed to the standalone opt tool (overridable).
LLVM_OPTFLAGS?=-O3
# Run opt on reproducer-<stem>.ll first, then compile its output with the
# known-good flags. The optimized IR is written next to the input as
# reproducer-<stem>.ll.opt3.ll, whatever LLVM_OPTFLAGS is set to.
good-reproducer-%-with-opt.exe: reproducer-%.ll Makefile
	$(OPT_BIN) -S $(LLVM_OPTFLAGS) -o reproducer-$*.ll.opt3.ll reproducer-$*.ll
	$(CC) $(CFLAGS) $(GOOD_OPTFLAGS) $(LDFLAGS) -o $@ -x ir reproducer-$*.ll.opt3.ll
# run-<name> uploads <name>.exe to /tmp on $(SSH_HOSTNAME) and executes it there.
RUN_TARGETS = run-good-reproducer run-broken-reproducer \
  $(foreach lvl,O0 O1 O2 O3,run-good-reproducer-from-$(lvl)-ir run-broken-reproducer-from-$(lvl)-ir) \
  run-good-reproducer-O1-with-opt run-good-reproducer-O0-with-opt
.PHONY: $(RUN_TARGETS)
$(RUN_TARGETS): run-%: %.exe
	scp -q "$<" "$(SSH_HOSTNAME):/tmp/$<"
	# Force TTY allocation with -tt to report the right exit code
	ssh -tt $(SSH_HOSTNAME) -- "/tmp/$<"; echo "Exit code was $$?"
Example SSH config to allow fast upload (ControlMaster setting) to a CheriBSD instance. Adjust host and port if you want to run e.g. on an FPGA instead of localhost (e.g. started by cheribuild).
# Alias used as SSH_HOSTNAME in the Makefile
Host cheribsd
User root
# Adjust HostName/Port when the target is not a local instance
HostName localhost
Port 12374
StrictHostKeyChecking no
# Connection sharing: later ssh/scp invocations reuse one master connection.
# NOTE(review): the ~/.ssh/controlmasters directory probably has to exist
# already; confirm before relying on it.
ControlPath ~/.ssh/controlmasters/%r@%h:%p
ControlMaster auto
ControlPersist 5m
Using the makefile above you can generate LLVM IR and check reproducers by running make run-good-reproducer
and make run-broken-reproducer
The run-good-reproducer
target should produce output similar to the following (the exit code should be zero):
scp -q "good-reproducer.exe" "cheribsd:/tmp/good-reproducer.exe"
ssh -tt cheribsd -- "/tmp/good-reproducer.exe"; echo "Exit code was $?"
TEST SUCCESSFUL!
Shared connection to localhost closed.
Exit code was 0
The run-broken-reproducer
should produce output similar to the following (the exit code should be non-zero):
ssh -tt cheribsd -- "/tmp/broken-reproducer.exe"; echo "Exit code was $?"
TEST FAILED!
Shared connection to localhost closed.
Exit code was 1
Using the makefile above you can generate LLVM IR and check reproducers by running make run-{good,broken}-reproducer-from-{O0,O1,O2,O3}-ir
Ideally you should get the success output when running make run-good-reproducer-from-O0-ir
and the failure output when running make run-broken-reproducer-from-O0-ir
.
However, it is possible that the bug is not exposed by the -O0 IR since e.g. clang lifetime markers will not be emitted. In that case you can try if make run-{good,broken}-reproducer-from-{O1,O2,O3}-ir
works as expected.
Once you have found the right IR target, the LLVM IR input will be in the same directory as reproducer-<optlevel>.ll
(e.g. for O0, it will be reproducer-O0.ll).
Now that you have the LLVM IR input (we will assume it's called reproducer-O1.ll here), you can start using the LLVM bugpoint tool (more docs can be found here).
The command line options of bugpoint are quite difficult to use (it took me many hours to initially figure out how to do this miscompilation debugging), so I've written some scripts to make this easier.
We make use of the --run-custom flag to pass a custom script for running the compiled binary remotely.
To do this we need two helper scripts.
The first script is run_bugpoint.sh (make sure to set LLVM_BINDIR and adjust the -opt-args flags).
Since we are compiling for CHERI-MIPS in this example, we need to pass -mtriple=mips64c128-unknown-freebsd13-purecap -mcpu=cheri128 -mattr=+cheri128
to opt
in order to compile for the right target architecture.
#!/bin/sh
# Run bugpoint in miscompilation-debugging mode on one LLVM IR file.
#
# Usage: LLVM_BINDIR=/path/to/llvm/bin sh ./run_bugpoint.sh <input.ll>
#
# Compiling, uploading and running each candidate is delegated to the
# run_remote.py script in the current directory (passed via -exec-command).
set -xe

# An unset/empty LLVM_BINDIR must be rejected explicitly: otherwise the checks
# below would test "/opt" and "/bugpoint", and /opt exists on many systems.
if test -z "$LLVM_BINDIR"; then
  echo 'FATAL: $LLVM_BINDIR is not set. Please set $LLVM_BINDIR'
  exit 1
fi
# bugpoint searches for tools in $PATH
if ! test -e "$LLVM_BINDIR/opt"; then
  # Single quotes: print the variable *name*, not its value
  echo 'FATAL: cannot find opt command. Please set $LLVM_BINDIR'
  exit 1
fi
if ! test -e "$LLVM_BINDIR/bugpoint"; then
  echo 'FATAL: cannot find bugpoint command. Please set $LLVM_BINDIR'
  exit 1
fi

input_file=$1
if test -z "$input_file"; then
  echo "usage: $0 <input.ll>"
  exit 1
fi
export PATH="$LLVM_BINDIR:$PATH"

# Select the set of passes to debug for miscompilation (default to -O3). This can be any valid argument to the opt tool
# passes=-O2
# passes="-simplifycfg -memcpyopt"
passes=-O3

# $passes is intentionally unquoted so that a list of several passes is split
# into separate arguments. --opt-args stays last on the command line.
bugpoint -verbose-errors $passes "$input_file" -compile-command /usr/bin/false --safe-run-custom --run-custom -exec-command "$(pwd)/run_remote.py" --opt-args -mcpu=cheri128 -mattr=+cheri128
TODO: integrate this with the makefile
We tell bugpoint to not attempt to compile the .ll file (we do that as part of the -run-custom script).
TODO: I can't remember why I had to do that
The run-custom script takes care of compiling, uploading and running the test binary based on the (partially) optimized input that bugpoint passes to it.
It is important that we treat cases where we fail to compile the input (e.g. due to linker errors when building the executable) as a "success", i.e., returning zero since otherwise bugpoint
will assume that this invalid input is actually an interesting test case and reduce it to an empty IR file.
The script can be a lot faster if you can avoid the SCP overhead by using a NFS/SMB network share. This is significant for CHERI since the SSH authentication overhead is significant on a slow QEMU VM. Using a network file system allows us to avoid one authentication (the scp step) and only execute the binary.
If the run_remote.py script is not working as expected, bugpoint will not produce useful output. Try executing python3 ./run_remote.py reproducer-O1.ll; echo $?
. If this prints a non-zero exit status it is likely that some of the flags are wrong.
One potential error that happened to me was using a network file system but forgetting to mount it after rebooting the guest VM. The command output will show something like /nfsroot//tmp/bugpoint-test.exe: Command not found.
Another useful step for sanity checking (if the -O3 LLVM IR is broken) is to run python3 ./run_remote.py reproducer-O1.ll; echo $?
.
Run LLVM_BINDIR=/path/to/llvm sh ./run_bugpoint.sh reproducer-O1.ll
and wait for bugpoint to complete. It should print the pass that causes the problem and try to produce a minimal IR file by optimizing individual functions and basic blocks with the broken pass until a minimal output has been produced.
The output should look like the following:
Generating reference output from raw program:
Reference output is: bugpoint.reference.out-48bc76b
*** Checking the code generator...
*** Output matches: Debugging miscompilation!
Checking to see if '' compiles correctly: yup.
Checking to see if '-ee-instrument -tbaa -scoped-noalias -simplifycfg -sroa -early-cse -lower-expect -forceattrs -tbaa -scoped-noalias -inferattrs -callsite-splitting -ipsccp -called-value-propagation -attributor -globalopt -mem2reg -deadargelim -instcombine -simplifycfg -globals-aa -prune-eh -inline -functionattrs -argpromotion -sroa -early-cse-memssa -speculative-execution -jump-threading -correlated-propagation -simplifycfg -aggressive-instcombine -instcombine -libcalls-shrinkwrap -pgo-memop-opt -tailcallelim -simplifycfg -reassociate -loop-rotate -licm -loop-unswitch -simplifycfg -instcombine -indvars -loop-idiom -loop-deletion -loop-unroll -mldst-motion -gvn -memcpyopt -sccp -bdce -instcombine -jump-threading -correlated-propagation -dse -licm -adce -simplifycfg -instcombine -barrier -elim-avail-extern -rpo-functionattrs -globalopt -globaldce -globals-aa -float2int -lower-constant-intrinsics -loop-rotate -loop-distribute -loop-vectorize -loop-load-elim -instcombine -simplifycfg -instcombine -loop-unroll -instcombine -licm -transform-warning -alignment-from-assumptions -strip-dead-prototypes -globaldce -constmerge -loop-sink -instsimplify -div-rem-pairs -simplifycfg' compiles correctly: nope.
Checking to see if '-indvars -loop-idiom -loop-deletion -loop-unroll -mldst-motion -gvn -memcpyopt -sccp -bdce -instcombine -jump-threading -correlated-propagation -dse -licm -adce -simplifycfg -instcombine -barrier -elim-avail-extern -rpo-functionattrs -globalopt -globaldce -globals-aa -float2int -lower-constant-intrinsics -loop-rotate -loop-distribute -loop-vectorize -loop-load-elim -instcombine -simplifycfg -instcombine -loop-unroll -instcombine -licm -transform-warning -alignment-from-assumptions -strip-dead-prototypes -globaldce -constmerge -loop-sink -instsimplify -div-rem-pairs -simplifycfg' compiles correctly: yup.
Checking to see if '-ee-instrument -tbaa -scoped-noalias -simplifycfg -sroa -early-cse -lower-expect -forceattrs -tbaa -scoped-noalias -inferattrs -callsite-splitting -ipsccp -called-value-propagation -attributor -globalopt -mem2reg -deadargelim -instcombine -simplifycfg -globals-aa -prune-eh -inline -functionattrs -argpromotion -sroa -early-cse-memssa -speculative-execution -jump-threading -correlated-propagation -simplifycfg -aggressive-instcombine -instcombine -libcalls-shrinkwrap -pgo-memop-opt -tailcallelim -simplifycfg -reassociate -loop-rotate -licm -loop-unswitch -simplifycfg -instcombine' compiles correctly: yup.
If bugpoint starts debugging a code generator crash instead, see the next section for some common problems.
One problem that often happens is that bugpoint cannot detect that you are debugging a miscompilation, prints bugpoint can't help you with your problem!
and starts trying to debug a code generator crash.
*** Checking the code generator...
*** Output matches: Debugging miscompilation!
*** Optimized program matches reference output! No problem detected...
bugpoint can't help you with your problem!
In that case it makes sense to check if the reference output is as expected. For example, in one case I was getting a -Werror failure for the reference program, so the broken program produced the same output as the reference.
error: overriding the module target triple with mips64c128-unknown-freebsd13.0-purecap [-Werror,-Woverride-module]
1 error generated.
Compiling test binary...
FAILED TO COMPILE
#!/usr/bin/env python3
import subprocess
import sys
import shlex
import datetime
import os
from pathlib import Path
# Adjust these variables as required
# Directory on the machine running this script; main() copies the test binary
# into its tmp/ subdirectory.
NFS_DIR_ON_HOST = Path.home() / "cheri/output/rootfs128/"
# Path prefix used for the same binary in the command executed over ssh.
NFS_DIR_IN_GUEST = "/nfsroot"
# Host name (or ssh_config alias) passed to ssh.
SSH_HOST = "cheribsd"
# Log file for this invocation, created next to this script; the name contains
# the current timestamp and the process id.
LOG = Path(__file__).parent / ("log-" + str(datetime.datetime.now()) + "-pid" + str(os.getpid()) + ".txt")
def main():
    """Compile the IR file given as argv[1], run it remotely and exit with its status.

    Steps:
      1. Ask the Makefile next to this script (target ``echo-ir-compile-command``)
         for the compile command and run it with ``-Wno-error`` appended.
      2. If compilation fails, exit with 0 so that the input is not treated
         as an interesting test case.
      3. Copy the binary into the shared directory and execute it via ssh.
      4. Print the program output, write it to LOG and exit with the ssh
         return code.

    Exceptions raised here (e.g. a failing ``cp``) propagate to the caller.
    """
    input_ll = sys.argv[1]
    exe_name = "bugpoint-test.exe"
    output_exe = Path(exe_name).absolute()
    exepath_in_shared_dir_rel = "tmp/" + exe_name
    host_nfs_exepath = NFS_DIR_ON_HOST / exepath_in_shared_dir_rel
    guest_nfs_exepath = Path(NFS_DIR_IN_GUEST, exepath_in_shared_dir_rel)

    print("Compiling test binary...")
    # Remove any stale binary so a failed compile cannot reuse an old executable
    subprocess.check_call(["rm", "-f", str(output_exe)])
    compile_command = subprocess.check_output(["make", "-C", str(Path(__file__).parent),
                                               "INPUT_FILE=" + str(input_ll),
                                               "OUTPUT_FILE=" + str(output_exe),
                                               "echo-ir-compile-command"]).decode().strip()
    compile_command += " -Wno-error"  # Avoid random compilation failures
    # print("Running", compile_command)
    if subprocess.call(compile_command, shell=True) != 0:
        print("FAILED TO COMPILE")
        sys.exit(0)  # Failed to compile -> not an interesting test case

    print("Running test binary...")
    # TODO: handle the non-NFS case where we have to upload first
    subprocess.check_call(["cp", "-f", str(output_exe), str(host_nfs_exepath)])
    result = subprocess.run(["ssh", "-tt", SSH_HOST, "--", str(guest_nfs_exepath)],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # TODO: could check stuff here
    # A miscompiled binary may print arbitrary bytes. Decode leniently: a
    # UnicodeDecodeError here would be reported by the caller as exit code 0
    # ("not interesting"), hiding exactly the failure we are looking for.
    print(result.stdout.decode(errors="replace").strip())  # Print the expected output
    print("EXIT CODE =", result.returncode)
    # Keep stderr (e.g. ssh connection errors) in the log instead of dropping it
    LOG.write_bytes(result.stdout + b"\n--- stderr ---\n" + result.stderr)
    sys.exit(result.returncode)
if __name__ == "__main__":
    # Script entry point. sys.exit() inside main() raises SystemExit, which is
    # not an Exception subclass, so regular exit codes are not intercepted here.
    try:
        main()
    except Exception as exc:
        failure_text = str(exc)
        print("GOT EXCEPTION!!", exc)
        LOG.write_text(failure_text)
        # not interesting test case, but should be debugged
        sys.exit(0)
In the example miscompilation (https://github.com/CTSRD-CHERI/llvm-project/issues/385), bugpoint reported that the pass causing the miscompilation is GVN.
Many LLVM passes have internal options (grep for cl::opt<
) that can be used to change the behaviour of the optimization pass.
Looking at GVN.cpp, I saw there are various boolean flags that are true by default and some integer recursion depth flags.
As a first step I therefore extended the Makefile to pass -mllvm
flags that disable those parts of GVN:
# Forward LLVM-internal options (-mllvm) to switch off individual parts of GVN:
# the boolean options are set to false and the depth/count limits to 0.
CFLAGS+=-mllvm -enable-pre=false
CFLAGS+=-mllvm -enable-load-pre=false
CFLAGS+=-mllvm -enable-gvn-memdep=false
CFLAGS+=-mllvm -gvn-max-recurse-depth=0
CFLAGS+=-mllvm -gvn-max-num-deps=0
None of these had an effect on the resulting miscompilation, so that means I had to modify GVN for finer-grained control.
To see what transforms are being performed by GVN I added -mllvm -stats
. This flag dumps the value of all LLVM statistics counters on exit (I believe it's only available with assertions enabled builds of LLVM).
To see which statistics are available you can search for STATISTIC(
in LLVM.
These statistics can either be dumped on stderr, or to a file in JSON format (-Xclang -stats-file=/path/to/file
)
GVN defines various statistics and the following was printed by the broken reproducer:
2 gvn - Number of blocks merged
6 gvn - Number of equalities propagated
74 gvn - Number of instructions deleted
4 gvn - Number of loads deleted
58 gvn - Number of instructions simplified
I then checked the GVN pass to see where those statistics are being incremented and added some new flags to GVN to skip those parts of GVN:
-mllvm -gvn-propagate-equality=false
to skip GVN::propagateEquality(...)
-mllvm -gvn-process-loads=false
to skip GVN::processLoad(LoadInst *L)
After adding these flags, it turned out that -O2
(with the GVN flags listed above) and -mllvm -gvn-process-loads=false
was succeeding, but -mllvm -gvn-process-loads=true
produced the wrong output.
Therefore, we now know that the miscompilation is in GVN::processLoad(LoadInst *L)
(or at least that a transformation is happening there that exposes a bug in a later pass).
Now that we have narrowed down the culprit to a rather small difference, we can try to compile the (good and broken) IR after that pass with -O0 to see if it still reproduces.
I added the Makefile targets run-reproducer-optnone-after-good-pass
and run-reproducer-optnone-after-broken-pass
to validate this:
First, I generate LLVM IR for both the good and the bad case, by telling clang to stop after the GVN pass. This can be done by adding -mllvm -stop-after=gvn -emit-llvm
flag.
I then compile the resulting two IR files with clang -O0 to see if the bug is in GVN or if it exposes a bug in a later pass.
If the resulting IR compiled at O0 crashes in the -mllvm -gvn-process-loads=true
case and succeeds with -mllvm -gvn-process-loads=false
, this almost certainly means that the transform made by GVN is incorrect. If not, it means the transform exposes a later optimization opportunity that may be incorrect.
# IR as it looks right after the GVN pass, for the broken and the good flags.
broken-reproducer-after-gvn.ll: reproducer.c Makefile
	$(CC) $(CFLAGS) $(BROKEN_OPTFLAGS) -S -fverbose-asm -mllvm -stop-after=gvn -emit-llvm -o $@ $<

good-reproducer-after-gvn.ll: reproducer.c Makefile
	$(CC) $(CFLAGS) $(GOOD_OPTFLAGS) -S -fverbose-asm -mllvm -stop-after=gvn -emit-llvm -o $@ $<

# Compile the post-GVN IR without further optimization (-O0), so that any
# remaining difference in behaviour comes from what ran up to and including GVN.
reproducer-optnone-after-good-pass.exe: good-reproducer-after-gvn.ll
	$(CC) $(CFLAGS) -O0 -o $@ -x ir $<

reproducer-optnone-after-broken-pass.exe: broken-reproducer-after-gvn.ll
	$(CC) $(CFLAGS) -O0 -o $@ -x ir $<

# run-* wrappers for the two binaries above. They get their own static pattern
# rule because the existing run-% rule only applies to the targets listed in
# RUN_TARGETS at the point where that rule is defined.
OPTNONE_RUN_TARGETS=run-reproducer-optnone-after-good-pass run-reproducer-optnone-after-broken-pass
.PHONY: $(OPTNONE_RUN_TARGETS)
$(OPTNONE_RUN_TARGETS): run-%: %.exe
	scp -q "$<" "$(SSH_HOSTNAME):/tmp/$<"
	ssh -tt $(SSH_HOSTNAME) -- "/tmp/$<"; echo "Exit code was $$?"
In my case, compiling and running broken-reproducer-after-gvn.ll
with -O0 compilation was broken and the good reproducer passed. Therefore, we now know that the bug is indeed in GVN.
Time to stare at some LLVM IR in a diff viewer (I use CLion, which I find works a lot better than a simple diff -u
).
Unfortunately the diff was still huge and I couldn't see anything obviously wrong. Time to go back to step 4b to further narrow down the miscompilation.
Earlier, bugpoint produced a LLVM bitcode reproducer that contained a smaller test case.
As this file is in binary format, I use llvm-dis -o foo.ll
to turn it into textual IR, that I then use for a LLVM LIT (TODO: add link) testcase.
Note: we can't just run GVN with opt since it will be missing the memory dependency analysis and alias analysis results that are needed for many transformations.
Therefore we run opt with opt -S -memdep -basicaa -gvn
(and the various flags to disable parts of GVN)
; RUN: opt -S -memdep -basicaa -gvn -o %s.gvn.noloads %s.instnamed -debug -enable-pre=false -enable-load-pre=false -gvn-max-recurse-depth=2 -gvn-max-num-deps=2 -gvn-propagate-equality=false -gvn-process-loads=false
; RUN: opt -S -memdep -basicaa -gvn -o %s.gvn.withloads %s.instnamed -debug -enable-pre=false -enable-load-pre=false -gvn-max-recurse-depth=2 -gvn-max-num-deps=2 -gvn-propagate-equality=false -gvn-process-loads=true
; Bugpoint-generated IR here:
; ....
Comparing the two outputs was interesting, since the -gvn-process-loads=true run produced a ptrtoint
LLVM instruction. %4 = ptrtoint i8 addrspace(200)* %tmp776 to i64
.
Based on prior experience with CHERI this can cause problems in various passes that assume integers and pointers are effectively the same thing. SUSPICIOUS. Maybe we have narrowed down the bug now?
Even more suspiciously, the IR now also contains many undef
values:
br i1 undef, label %bb677, label %bb665
; ...
%tmp763 = phi %2 addrspace(200)* [ %tmp658, %bb665 ], [ undef, %bb671.bb762_crit_edge ], [ %tmp658, %bb683 ], [ %tmp658, %bb689 ], [ %tmp658, %bb701 ], [ %tmp658, %bb707 ], [ %tmp658, %bb729 ], [ %tmp658, %bb733 ], [ %tmp658, %bb738 ], [ null, %bb752 ], [ %tmp753, %bb756 ]
%tmp764 = phi i32 [ %tmp657, %bb665 ], [ undef, %bb671.bb762_crit_edge ], [ %tmp657, %bb683 ], [ %tmp657, %bb689 ], [ %tmp657, %bb701 ], [ %tmp657, %bb707 ], [ %tmp657, %bb729 ], [ %tmp657, %bb733 ], [ %tmp657, %bb738 ], [ %tmp748, %bb752 ], [ %tmp748, %bb756 ]
; ...
switch i32 undef, label %bb588 [
i32 28, label %bb589
i32 2, label %bb581
i32 0, label %bb582
]
In the debug output we also see GVN removed: %tmp401 = tail call i64 @llvm.cheri.cap.diff.i64(i8 addrspace(200)* %tmp332, i8 addrspace(200)* bitcast ([4 x { i32, i32, i32, i32, i8 addrspace(200)*, i8 addrspace(200)*, i32 }] addrspace(200)* @global.6 to i8 addrspace(200)*))
The @llvm.cheri.cap.diff.i64
intrinsic is used for pointer subtraction, so removing that could also be the source of the miscompilation. However, often this is perfectly reasonable to remove (e.g. all users are dead) so it's not the most likely problem.
To see where the ptrtoint is being created, I set a breakpoint in the two constructors of PtrToIntInst
Command to run: -mtriple=cheri-unknown-freebsd -mcpu=cheri128 -mattr=+cheri128 -S -memdep -basicaa -gvn -o /Users/alex/cheri/llvm-project/llvm/test/CodeGen/Mips/gvn-miscompile.ll.gvn.withloads /Users/alex/cheri/llvm-project/llvm/test/CodeGen/Mips/gvn-miscompile.ll -debug -enable-pre=false -enable-load-pre=false -gvn-max-recurse-depth=2 -gvn-max-num-deps=2 -gvn-propagate-equality=false -gvn-process-loads=true
Turns out this is caused by getStoreValueForLoadHelper:
llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>::CreatePtrToInt(llvm::Value*, llvm::Type*, llvm::Twine const&) IRBuilder.h:2106
llvm::Value* llvm::VNCoercion::getStoreValueForLoadHelper<llvm::Value, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter> >(llvm::Value*, unsigned int, llvm::Type*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&, llvm::DataLayout const&) VNCoercion.cpp:489
llvm::VNCoercion::getStoreValueForLoad(llvm::Value*, unsigned int, llvm::Type*, llvm::Instruction*, llvm::DataLayout const&) VNCoercion.cpp:517
llvm::gvn::AvailableValue::MaterializeAdjustedValue(llvm::LoadInst*, llvm::Instruction*, llvm::GVN&) const GVN.cpp:794
llvm::GVN::processLoad(llvm::LoadInst*) GVN.cpp:1639
llvm::GVN::processInstruction(llvm::Instruction*) GVN.cpp:2051
llvm::GVN::processBlock(llvm::BasicBlock*) GVN.cpp:2238
llvm::GVN::iterateOnFunction(llvm::Function&) GVN.cpp:2563
llvm::GVN::runImpl(llvm::Function&, llvm::AssumptionCache&, llvm::DominatorTree&, llvm::TargetLibraryInfo const&, llvm::AAResults&, llvm::MemoryDependenceResults*, llvm::LoopInfo*, llvm::OptimizationRemarkEmitter*) GVN.cpp:2193
llvm::gvn::GVNLegacyPass::runOnFunction(llvm::Function&) GVN.cpp:2743
llvm::FPPassManager::runOnFunction(llvm::Function&) LegacyPassManager.cpp:1483
llvm::FPPassManager::runOnModule(llvm::Module&) LegacyPassManager.cpp:1519
(anonymous namespace)::MPPassManager::runOnModule(llvm::Module&) LegacyPassManager.cpp:1584
llvm::legacy::PassManagerImpl::run(llvm::Module&) LegacyPassManager.cpp:1696
llvm::legacy::PassManager::run(llvm::Module&) LegacyPassManager.cpp:1727
main opt.cpp:937
start 0x00007fff6afd23d5
start 0x00007fff6afd23d5
We also see the debug output contains various statements like this:
GVN COERCED NONLOCAL VAL:
Offset: 4 %tmp34 = load i8 addrspace(200)*, i8 addrspace(200)* addrspace(200)* %tmp33, align 16
%2 = trunc i64 %1 to i32
My guess now is that it assumes a store of a pointer value can be forwarded to a load of an integer value. This seems like the code is either using unions or storing to buffers using capabilities and loading back integers.
Let's have a look at what %tmp33 is:
%tmp33 = bitcast %2 addrspace(200)* %ai to i8 addrspace(200)* addrspace(200)*
and %2 is %2 = type { i32, i32, i32, i32, i32, i8 addrspace(200)*, %3 addrspace(200)*, %2 addrspace(200)* }
.
So it does indeed seem like we are loading multiple integers from somewhere that a capability was stored to.
Let's rename the type to make it more obvious in the IR.
Inside AvailableValue::MaterializeAdjustedValue() the load instruction is %tmp58 = load i32, i32 addrspace(200)* %ai_family, align 4, !tbaa !14
, i.e. loading the second field from our suspicious type (a socket union?)
%ai_flags = getelementptr inbounds %suspicious_type, %suspicious_type addrspace(200)* %ai, i64 0, i32 0
%ai_family = getelementptr inbounds %suspicious_type, %suspicious_type addrspace(200)* %ai, i64 0, i32 1
%ai_socktype = getelementptr inbounds %suspicious_type, %suspicious_type addrspace(200)* %ai, i64 0, i32 2
%ai_protocol = getelementptr inbounds %suspicious_type, %suspicious_type addrspace(200)* %ai, i64 0, i32 3
%ai_canonname = getelementptr inbounds %suspicious_type, %suspicious_type addrspace(200)* %ai, i64 0, i32 5
Looking at the C code, this is a
struct addrinfo {
int ai_flags; /* AI_PASSIVE, AI_CANONNAME, AI_NUMERICHOST */
int ai_family; /* AF_xxx */
int ai_socktype; /* SOCK_xxx */
int ai_protocol; /* 0 or IPPROTO_xxx for IPv4 and IPv6 */
socklen_t ai_addrlen; /* length of ai_addr */
char *ai_canonname; /* canonical name for hostname */
struct sockaddr *ai_addr; /* binary address */
struct addrinfo *ai_next; /* next structure in linked list */
};
which matches the LLVM IR type %2 = type { i32, i32, i32, i32, i32, i8 addrspace(200)*, %3 addrspace(200)*, %2 addrspace(200)* }
By looking at the generated IR we can also see that the new GVN-created sequence is incorrect:
%ai = alloca %suspicious_type, align 16, addrspace(200)
%tmp33 = bitcast %2 addrspace(200)* %ai to i8 addrspace(200)* addrspace(200)*
%tmp34 = load i8 addrspace(200)*, i8 addrspace(200)* addrspace(200)* %tmp33, align 16
%0 = ptrtoint i8 addrspace(200)* %tmp34 to i64 ; INCORRECT transformation (does not transfer all bits)
%1 = lshr i64 %0, 64 ; Shift right by 64 to get field #2
%2 = trunc i64 %1 to i32 ; truncate to drop the high bits
This shows that GVN is attempting to forward the bits of the previous capability-size load to the individual sub-fields. However, LLVM is converting the 128-bit capability to an i64 using ptrtoint. For CHERI-MIPS this yields the address field which is bits 64-127 of the capability (CHERI-MIPS is big-endian).
This is clearly incorrect and we have to fix this optimization.
Now that I have found the bug, I could create a minimal test case by hand, but there is an easier solution:
Insert an assertion when the incorrect value is being generated and then use my testcase reduction script [creduce_crash_testcase.py](https://github.com/CTSRD-CHERI/llvm-project/blob/master/clang/utils/creduce_crash_testcase.py)
on the bugpoint-generated input.
Note: Don't confuse this script with creduce-clang-crash.py which is a similar script that upstream added (taking some of my script code but not adding all the features).
I therefore added
// Check that this did not change the size of the value:
assert(DL.getTypeSizeInBits(SrcVal->getType()) == StoreSize &&
"Size of stored value should not change");
to getStoreValueForLoadHelper() in VNCoercion.cpp and as expected the input now crashes.
In order to get a minimal test case, I can now run
/Users/alex/cheri/llvm-project/cmake-build-debug/bin/creduce_crash_testcase.py /Users/alex/cheri/llvm-project/llvm/test/CodeGen/Mips/gvn-miscompile.ll --reduce-tool llvm-reduce
Note: --reduce-tool=bugpoint or --reduce-tool=creduce would also work.
After a few minutes llvm-reduce gives me the following test case:
; RUN: %cheri_opt -mcpu=cheri128 -mattr=+cheri128 -S -memdep -basicaa -gvn -o %s.gvn.withloads -debug -enable-pre=false -enable-load-pre=false -gvn-max-recurse-depth=2 -gvn-max-num-deps=2 -gvn-propagate-equality=false -gvn-process-loads=true %s
; ModuleID = '/Users/alex/cheri/llvm-project/llvm/test/CodeGen/Mips/gvn-miscompile-reduce.ll'
source_filename = "reproducer.c"
target datalayout = "E-m:e-pf200:128:128:128:64-i8:8:32-i16:16:32-i64:64-n32:64-S128-A200-P200-G200"
target triple = "cheri-unknown-freebsd"
%0 = type { i32, i32, i32, i32 }
%1 = type { i8 addrspace(200)*, i32 (i8 addrspace(200)*, i8 addrspace(200)*, i8 addrspace(200)*) addrspace(200)*, i8 addrspace(200)* }
%2 = type { i8, i8, [14 x i8] }
%suspicious_type = type { i32, i32, i32, i32, i32, i8 addrspace(200)*, %2 addrspace(200)*, %suspicious_type addrspace(200)* }
; External constant globals, all in address space 200. They are declarations
; only (no initializers), and the function visible below (@my_getaddrinfo)
; does not reference any of them.
@explore = external dso_local unnamed_addr addrspace(200) constant [16 x %0], align 4
@.str = external dso_local unnamed_addr addrspace(200) constant [3 x i8], align 1
@in6_addrany = external dso_local addrspace(200) constant [16 x i8], align 1
@in6_loopback = external dso_local addrspace(200) constant [16 x i8], align 1
@in_addrany = external dso_local addrspace(200) constant [4 x i8], align 1
@in_loopback = external dso_local addrspace(200) constant [4 x i8], align 1
@afdl = external dso_local addrspace(200) constant [4 x { i32, i32, i32, i32, i8 addrspace(200)*, i8 addrspace(200)*, i32 }], align 16
@.str.4 = external dso_local unnamed_addr addrspace(200) constant [4 x i8], align 1
@.str.5 = external dso_local unnamed_addr addrspace(200) constant [4 x i8], align 1
@.str.6 = external dso_local unnamed_addr addrspace(200) constant [5 x i8], align 1
@.str.7 = external dso_local unnamed_addr addrspace(200) constant [8 x i8], align 1
@explore_fqdn.dtab = external dso_local addrspace(200) constant [4 x %1], align 16
@.str.8 = external dso_local unnamed_addr addrspace(200) constant [6 x i8], align 1
@.str.9 = external dso_local unnamed_addr addrspace(200) constant [4 x i8], align 1
@.str.10 = external dso_local unnamed_addr addrspace(200) constant [4 x i8], align 1
@.str.11 = external dso_local unnamed_addr addrspace(200) constant [6 x i8], align 1
@.str.12 = external dso_local unnamed_addr addrspace(200) constant [12 x i8], align 1
@.str.13 = external dso_local unnamed_addr addrspace(200) constant [11 x i8], align 1
@.str.14 = external dso_local unnamed_addr addrspace(200) constant [3 x i8], align 1
@.str.15 = external dso_local unnamed_addr addrspace(200) constant [3 x i8], align 1
@.str.16 = external dso_local unnamed_addr addrspace(200) constant [3 x i8], align 1
@.str.17 = external dso_local unnamed_addr addrspace(200) constant [2 x i8], align 1
@.str.19 = external dso_local unnamed_addr addrspace(200) constant [6 x i8], align 1
@.str.20 = external dso_local unnamed_addr addrspace(200) constant [15 x i8], align 1
@.str.21 = external dso_local unnamed_addr addrspace(200) constant [13 x i8], align 1
@default_dns_files = external dso_local addrspace(200) constant [3 x { i8 addrspace(200)*, i32 }], align 16
@str = external dso_local unnamed_addr addrspace(200) constant [23 x i8], align 1
@str.23 = external dso_local unnamed_addr addrspace(200) constant [23 x i8], align 1
@str.24 = external dso_local unnamed_addr addrspace(200) constant [35 x i8], align 1
; Partially reduced reproducer. What is left is one stack object, a
; pointer-sized store to its start, an i32 load from its second field, and
; control flow that has been reduced to unconditional jumps. None of the four
; parameters is used in the body, and there is no `ret`: the function ends in
; an endless loop.
; Function Attrs: nounwind uwtable
define signext i32 @my_getaddrinfo(i8 addrspace(200)* %hostname, i8 addrspace(200)* %servname, %suspicious_type addrspace(200)* readonly %hints, %suspicious_type addrspace(200)* addrspace(200)* nocapture %res) local_unnamed_addr addrspace(200) #0 {
entry:
; 16-byte-aligned stack slot holding one %suspicious_type.
%ai = alloca %suspicious_type, align 16, addrspace(200)
; Address of field index 1 of %ai (the second i32, byte offset 4).
%ai_family = getelementptr inbounds %suspicious_type, %suspicious_type addrspace(200)* %ai, i64 0, i32 1
br label %if.end
if.end: ; preds = %entry
br label %if.end83
if.end83: ; preds = %if.end
; A switch with an empty case list: control always goes to the default label.
switch i32 undef, label %sw.epilog96thread-pre-split [
]
sw.epilog96thread-pre-split: ; preds = %if.end83
br label %sw.epilog96
sw.epilog96: ; preds = %sw.epilog96thread-pre-split
; Again no cases, so this is effectively an unconditional branch.
switch i32 undef, label %if.end117 [
]
if.end117: ; preds = %sw.epilog96
; View the start of %ai as a slot that holds an i8 addrspace(200)* value.
%tmp37 = bitcast %suspicious_type addrspace(200)* %ai to i8 addrspace(200)* addrspace(200)*
br label %for.body120.us
for.body120.us: ; preds = %for.inc152.us, %if.end117
; Pointer-typed store at offset 0 of %ai. The datalayout gives addrspace(200)
; pointers a size of 128 bits, so the stored bytes include the i32 read below.
store i8 addrspace(200)* undef, i8 addrspace(200)* addrspace(200)* %tmp37, align 16
; i32 load of field 1, tagged with TBAA node !0; it reads bytes written by the
; store above.
; NOTE(review): presumably this overlapping store/load pair is what the
; miscompiling pass mishandles - confirm against the linked issue.
%tmp41 = load i32, i32 addrspace(200)* %ai_family, align 4, !tbaa !0
; The comparison result is never used; the branch below is unconditional.
%cmp123.us = icmp eq i32 %tmp41, undef
br label %for.inc152.us
for.inc152.us: ; preds = %for.body120.us
; Back edge: the loop has no exit.
br label %for.body120.us
}
; Attribute group referenced as #0 by @my_getaddrinfo.
attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="beri" "target-features"="+beri,+cheri128,+chericap,+soft-float,-noabicalls,+cheri128,+cheri128,+cheri128,+cheri128,+cheri128,+cheri128" "unsafe-fp-math"="false" "use-soft-float"="true" }
; TBAA metadata. !0 is the access tag attached to the i32 load: base type !1
; ("addrinfo"), access type !2 ("int"), byte offset 4.
!0 = !{!1, !2, i64 4}
; Struct type node: five "int" members at offsets 0-16, then three
; "any pointer" members at offsets 32, 48 and 64.
!1 = !{!"addrinfo", !2, i64 0, !2, i64 4, !2, i64 8, !2, i64 12, !2, i64 16, !5, i64 32, !5, i64 48, !5, i64 64}
!2 = !{!"int", !3, i64 0}
!3 = !{!"omnipotent char", !4, i64 0}
; Root of the TBAA type tree.
!4 = !{!"Simple C/C++ TBAA"}
!5 = !{!"any pointer", !3, i64 0}
This can be reduced further using bugpoint and creduce, since they also remove other things, such as the unused globals, metadata and attributes that are still present above. creduce performs a purely textual reduction, so despite its name (C-Reduce) it also works well for other textual inputs such as LLVM IR (passing the --not-c flag speeds things up a bit).
After bugpoint:
; RUN: opt -mcpu=cheri128 -mattr=+cheri128 -S -memdep -basicaa -gvn -o %s.gvn.withloads -debug -enable-pre=false -enable-load-pre=false -gvn-max-recurse-depth=2 -gvn-max-num-deps=2 -gvn-propagate-equality=false -gvn-process-loads=true %s
; Minimal input for the GVN invocation above: a single basic block containing
; a pointer-typed store to the start of a stack object, followed by an i32
; load from field 1 of the same object.
source_filename = "reproducer.c"
target datalayout = "E-m:e-pf200:128:128:128:64-i8:8:32-i16:16:32-i64:64-n32:64-S128-A200-P200-G200"
target triple = "cheri-unknown-freebsd"
%0 = type { i8, i8, [14 x i8] }
; Five i32 fields followed by three addrspace(200) pointers, the last of which
; points to the same struct type.
%struct.eggs = type { i32, i32, i32, i32, i32, i8 addrspace(200)*, %0 addrspace(200)*, %struct.eggs addrspace(200)* }
define void @baz() local_unnamed_addr addrspace(200) #0 {
bb:
; 16-byte-aligned stack slot holding one %struct.eggs.
%tmp = alloca %struct.eggs, align 16, addrspace(200)
; Address of field index 1 (the second i32).
%tmp1 = getelementptr inbounds %struct.eggs, %struct.eggs addrspace(200)* %tmp, i64 0, i32 1
; View the start of the object as a slot holding an i8 addrspace(200)* value.
%tmp2 = bitcast %struct.eggs addrspace(200)* %tmp to i8 addrspace(200)* addrspace(200)*
; Pointer-typed store at offset 0. The datalayout gives addrspace(200) pointers
; a size of 128 bits, so the stored bytes include field 1.
store i8 addrspace(200)* undef, i8 addrspace(200)* addrspace(200)* %tmp2, align 16
; i32 load from field 1; it reads bytes written by the store above.
%tmp3 = load i32, i32 addrspace(200)* %tmp1, align 4
; The comparison result is unused; the block ends in `unreachable`.
%tmp4 = icmp eq i32 %tmp3, undef
unreachable
}
attributes #0 = { "use-soft-float"="true" }