From a6daa76bc63891de8bb1bcc8f3402ecc36b50194 Mon Sep 17 00:00:00 2001 From: Nick Bray Date: Wed, 28 Aug 2024 10:17:02 -0700 Subject: [PATCH] hashtest: core functions for a standalone hashtest runner PiperOrigin-RevId: 668505831 --- fuzzer/hashtest/BUILD | 7 + fuzzer/hashtest/hashtest_runner.cc | 234 +++++++++++++++++++++++- fuzzer/hashtest/hashtest_runner.h | 160 +++++++++++++++- fuzzer/hashtest/hashtest_runner_test.cc | 42 +++++ 4 files changed, 441 insertions(+), 2 deletions(-) diff --git a/fuzzer/hashtest/BUILD b/fuzzer/hashtest/BUILD index 1754876f..3507190c 100644 --- a/fuzzer/hashtest/BUILD +++ b/fuzzer/hashtest/BUILD @@ -125,7 +125,12 @@ cc_library( ], deps = [ ":hashtest_generator_lib", + "@silifuzz//instruction:xed_util", + "@silifuzz//util:page_util", + "@cityhash", "@com_google_absl//absl/log:check", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -135,9 +140,11 @@ cc_test( "hashtest_runner_test.cc", ], deps = [ + ":hashtest_generator_lib", ":hashtest_runner_lib", "@silifuzz//instruction:xed_util", "@silifuzz//util:platform", + "@com_google_absl//absl/types:span", "@com_google_googletest//:gtest_main", ], ) diff --git a/fuzzer/hashtest/hashtest_runner.cc b/fuzzer/hashtest/hashtest_runner.cc index 042c9dd1..24d83e85 100644 --- a/fuzzer/hashtest/hashtest_runner.cc +++ b/fuzzer/hashtest/hashtest_runner.cc @@ -14,22 +14,143 @@ #include "./fuzzer/hashtest/hashtest_runner.h" +#include + #include +#include #include -#include +#include +#include +#include #include +#include +#include +#include #include "absl/log/check.h" +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" +#include "third_party/cityhash/city.h" #include "./fuzzer/hashtest/hashtest_runner_widgits.h" +#include "./fuzzer/hashtest/instruction_pool.h" #include "./fuzzer/hashtest/synthesize_base.h" +#include "./fuzzer/hashtest/synthesize_test.h" +#include "./instruction/xed_util.h" +#include "./util/page_util.h" namespace silifuzz { +std::string 
FormatSeed(uint64_t seed) { + return absl::StrCat(absl::Hex(seed, absl::kZeroPad16)); +} + void RandomizeEntropyBuffer(uint64_t seed, EntropyBuffer& buffer) { std::independent_bits_engine engine(seed); std::generate(std::begin(buffer.bytes), std::end(buffer.bytes), engine); } +MemoryMapping::~MemoryMapping() { + if (ptr_ != nullptr) { + CHECK_EQ(munmap(ptr_, allocated_size_), 0); + } +} + +void DumpTest(uint64_t start_address, InstructionBlock& body) { + xed_decoded_inst_t xed_insn; + char formatted_insn_buf[96]; + + size_t offset = 0; + + while (offset < body.bytes.size()) { + xed_decoded_inst_zero(&xed_insn); + xed_decoded_inst_set_mode(&xed_insn, XED_MACHINE_MODE_LONG_64, + XED_ADDRESS_WIDTH_64b); + + xed_error_enum_t xed_error = xed_decode( + &xed_insn, body.bytes.data() + offset, body.bytes.size() - offset); + CHECK(xed_error == XED_ERROR_NONE); + CHECK(xed_decoded_inst_valid(&xed_insn)); + CHECK(FormatInstruction(xed_insn, start_address + offset, + formatted_insn_buf, sizeof(formatted_insn_buf))); + std::cout << std::hex << std::setfill('0') << std::setw(16) + << start_address + offset << ": " << formatted_insn_buf << "\n"; + offset += xed_decoded_inst_get_length(&xed_insn); + } + std::cout << std::dec; +} + +void SynthesizeTest(uint64_t seed, xed_chip_enum_t chip, + const InstructionPool& ipool, InstructionBlock& body) { + Rng rng(seed); + RegisterPool rpool{}; + InitRegisterLayout(chip, rpool); + + SynthesizeLoopBody(rng, ipool, rpool, body); + + // Decrement the loop counter at the end of the loop body. + SynthesizeGPRegDec(kLoopIndex, body); + + // Using JNLE so that the loop will abort if an SDC causes us to miss zero + // or jump to a negative index. 
+ SynthesizeJnle(-(int32_t)body.bytes.size(), body); + + SynthesizeReturn(body); + size_t padding = (16 - (body.bytes.size() % 16)) % 16; + SynthesizeBreakpointTraps(16 + padding, body); +} + +Corpus SynthesizeCorpus(Rng& rng, xed_chip_enum_t chip, + const InstructionPool& ipool, size_t num_tests, + bool verbose) { + constexpr size_t kMaxTestBytes = 1024; + size_t mapping_size = RoundUpToPageAlignment(kMaxTestBytes * num_tests); + void* ptr = mmap(0, mapping_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + // TODO check result. + + std::vector tests(num_tests); + + size_t offset = 0; + for (size_t i = 0; i < num_tests; ++i) { + InstructionBlock body{}; + uint64_t seed = GetSeed(rng); + SynthesizeTest(seed, chip, ipool, body); + + // Copy the test into the mapping. + void* test_addr = reinterpret_cast(ptr) + offset; + size_t test_size = body.bytes.size(); + CHECK_LE(test_size, kMaxTestBytes); + memcpy(test_addr, body.bytes.data(), test_size); + offset += test_size; + + if (verbose) { + DumpTest(reinterpret_cast(test_addr), body); + } + + tests[i] = { + .seed = seed, + .code = test_addr, + }; + } + + // Make read-only executable. 
+ mprotect(ptr, mapping_size, PROT_READ | PROT_EXEC); + + return Corpus{ + .tests = std::move(tests), + .mapping = + MemoryMapping(ptr, mapping_size, RoundUpToPageAlignment(offset)), + }; +} + +void ResultReporter::ReportHit(size_t test_index, const Test& test, + size_t input_index, const Input& input) { + hits.emplace_back(test_index, test.seed, input_index, input.seed); + + std::cout << "Hit " << FormatSeed(test.seed) << " / " + << FormatSeed(input.seed) << "\n"; +} + void RunHashTest(void* test, const TestConfig& config, const EntropyBuffer& input, EntropyBuffer& output) { if (config.vector_width == 512) { @@ -39,5 +160,116 @@ void RunHashTest(void* test, const TestConfig& config, } else { CHECK(false) << "Unsupported vector width: " << config.vector_width; } +#if defined(MEMORY_SANITIZER) + __msan_unpoison(output.bytes, output.NumBytes(config.vector_width)); +#endif +} + +uint64_t EntropyBufferHash(const EntropyBuffer& buffer, size_t vector_width) { + return CityHash64(reinterpret_cast(&buffer.bytes), + buffer.NumBytes(vector_width)); +} + +void ComputeEndStates(absl::Span tests, const TestConfig& config, + absl::Span inputs, + absl::Span end_states) { + CHECK_EQ(tests.size() * inputs.size(), end_states.size()); + for (size_t t = 0; t < tests.size(); ++t) { + for (size_t i = 0; i < inputs.size(); i++) { + EntropyBuffer output; + RunHashTest(tests[t].code, config, inputs[i].entropy, output); + end_states[t * inputs.size() + i].hash = + EntropyBufferHash(output, config.vector_width); + } + } +} + +// Given three end states, select the one that occurs at least twice and return +// true. If all the end states are different, return false. 
+bool ReconcileEndState(EndState& end_state, const EndState& other1, + const EndState& other2) { + if (end_state.hash == other1.hash) { + return true; + } + if (end_state.hash == other2.hash) { + return true; + } + if (other1.hash == other2.hash) { + end_state.hash = other1.hash; + return true; + } + + // No two of the end states match. + end_state.SetCouldNotBeComputed(); + return false; +} + +size_t ReconcileEndStates(absl::Span end_state, + absl::Span other1, + absl::Span other2) { + CHECK_EQ(end_state.size(), other1.size()); + CHECK_EQ(end_state.size(), other2.size()); + size_t fail_count = 0; + for (size_t i = 0; i < end_state.size(); ++i) { + if (!ReconcileEndState(end_state[i], other1[i], other2[i])) { + fail_count++; + } + } + return fail_count; +} + +void RunTest(size_t test_index, const Test& test, const TestConfig& config, + size_t input_index, const Input& input, const EndState& expected, + ResultReporter& result) { + // Run the test. + EntropyBuffer actual; + RunHashTest(test.code, config, input.entropy, actual); + + // Compare the end state. + bool ok = expected.hash == EntropyBufferHash(actual, config.vector_width); + + if (!ok) { + result.ReportHit(test_index, test, input_index, input); + } } + +void RunBatch(absl::Span tests, absl::Span inputs, + absl::Span end_states, const RunConfig& config, + size_t test_offset, ResultReporter& result) { + // Repeat the batch. + for (size_t r = 0; r < config.num_repeat; ++r) { + // Sweep through each input. + for (size_t i = 0; i < inputs.size(); i++) { + const Input& input = inputs[i]; + // Sweep through each test in the batch. + // The point of having a batch size > 1 is that the same test will not be + // run multiple times in a row. 
+ for (size_t t = 0; t < tests.size(); ++t) { + const Test& test = tests[t]; + const EndState& expected = end_states[t * inputs.size() + i]; + if (expected.CouldNotBeComputed()) { + continue; + } + size_t test_index = test_offset + t; + if (r == 0 && i == 0 && test_index % 1000 == 0) { + std::cout << "Test " << test_index << " / " << FormatSeed(test.seed) + << "\n"; + } + RunTest(test_index, test, config.test, i, input, expected, result); + } + } + } +} + +void RunTests(absl::Span tests, absl::Span inputs, + absl::Span end_states, const RunConfig& config, + size_t test_offset, ResultReporter& result) { + for (size_t g = 0; g < tests.size(); g += config.batch_size) { + size_t batch_size = std::min(config.batch_size, tests.size() - g); + RunBatch(tests.subspan(g, batch_size), inputs, + end_states.subspan(g * inputs.size(), batch_size * inputs.size()), + config, test_offset + g, result); + } +} + } // namespace silifuzz diff --git a/fuzzer/hashtest/hashtest_runner.h b/fuzzer/hashtest/hashtest_runner.h index 0667d88b..9860608c 100644 --- a/fuzzer/hashtest/hashtest_runner.h +++ b/fuzzer/hashtest/hashtest_runner.h @@ -17,9 +17,26 @@ #include #include +#include +#include +#include + +#include "absl/types/span.h" +#include "./fuzzer/hashtest/instruction_pool.h" +#include "./fuzzer/hashtest/synthesize_base.h" namespace silifuzz { +// Extract 64-bits worth of entropy from an arbitrary RNG. +template +inline uint64_t GetSeed(R& rng) { + std::uniform_int_distribution dis; + return dis(rng); +} + +// Format a seed for printing in a consistent, zero-padded way. +std::string FormatSeed(uint64_t seed); + // TODO(ncbray): should there be 8 GP entropy registers? The loop counter was // carved out of the entropy pool, resulting in 7 registers. // TODO(ncbray): should rbp be reserved as a frame pointer? @@ -50,12 +67,153 @@ struct EntropyBuffer { // Fill the buffer with random bytes. void RandomizeEntropyBuffer(uint64_t seed, EntropyBuffer& buffer); +// Initial state for a test. 
+struct Input { + uint64_t seed; + EntropyBuffer entropy; +}; + +// Machine instructions for a test. +struct Test { + // The seed that was used to generate the test. + // Provides a semi-stable name for the test (the test generation algorithm may + // be improved from time to time). + uint64_t seed; + + // The entry point of the test. Jump here to run the test. + // This is a borrowed reference to memory owned by the Corpus struct. + void* code; +}; + +class MemoryMapping { + public: + MemoryMapping(void* ptr, size_t allocated_size, size_t used_size) + : ptr_(ptr), allocated_size_(allocated_size), used_size_(used_size) {} + + ~MemoryMapping(); + + // No copy. + MemoryMapping(const MemoryMapping&) = delete; + MemoryMapping& operator=(const MemoryMapping&) = delete; + + // Move allowed. + MemoryMapping(const MemoryMapping&&) = default; + MemoryMapping& operator=(const MemoryMapping&&) = default; + + size_t MemoryUse() { return used_size_; } + + private: + void* ptr_; + size_t allocated_size_; + size_t used_size_; +}; + +// A collection of tests. +struct Corpus { + std::vector tests; + MemoryMapping mapping; + + size_t MemoryUse() { + return sizeof(Corpus) + tests.size() * sizeof(Test) + mapping.MemoryUse(); + } +}; + +// Synthesize a corpus of random tests. +Corpus SynthesizeCorpus(Rng& rng, xed_chip_enum_t chip, + const InstructionPool& ipool, size_t num_tests, + bool verbose); + +// The configuration for running a single test. struct TestConfig { size_t vector_width; size_t num_iterations; }; -// Exported for testing. +// The expected end state of a test + input. +struct EndState { + // This field contains the hash of the entropy pool when the test exits. + // For an individual test, it would be faster to store the entire end state + // entropy struct and memcmp it. This uses 79x the memory of a hash, however, + // which can quickly become an issue as the number of tests and inputs + // increases. 
Memory bandwidth also becomes more important for + // multi-threaded testing. The cost of hashing can easily pay for itself. + uint64_t hash; + + // It's astronomically unlikely for the hash to be zero, so use this value to + // mark an end state that could not be computed. We could also store this as a + // separate bool, but that would double the memory usage. + void SetCouldNotBeComputed() { hash = 0; } + + bool CouldNotBeComputed() const { return hash == 0; } +}; + +// For each test and input, compute the end state. +// end_states.size() should be tests.size() * inputs.size(). +// For test "t" and input "i", the end state will be stored at index: +// t * inputs.size() + i. +void ComputeEndStates(absl::Span tests, const TestConfig& config, + absl::Span inputs, + absl::Span end_states); + +// Given three lists of independently computed end states, determine which end +// state we believe is correct and copy it to `end_state`. If it is unclear +// which end state is correct, mark the entry in `end_state` as bad, and skip +// running that test in the future. +// Returns the number of end states that could not be reconciled. +size_t ReconcileEndStates(absl::Span end_state, + absl::Span other1, + absl::Span other2); + +// All the information we want to remember about each hit. +struct Hit { + // A unique identifier in the range [0, num_tests_generated) where + // num_tests_generated is the total number of tests generated during this + // invocation of the runner. (Each test has a unique index.) + size_t test_index; + // A unique identifier that should be stable between runs, but is not densely + // packed like test_index. + uint64_t test_seed; + // A unique identifier in the range [0, num_inputs_generated) where + // num_inputs_generated is the total number of inputs generated during this + // invocation of the runner. (Each input has a unique index.) + size_t input_index; + // A unique identifier that should be stable between runs. 
+ uint64_t input_seed; +}; + +// An interface for reporting the results of test execution. +struct ResultReporter { + void ReportHit(size_t test_index, const Test& test, size_t input_index, + const Input& input); + + // It's usually much more compact to collect each hit rather than keep + // per-test statistics. We can always recreate those statistics later from the + // hits. + std::vector hits; +}; + +// The configuration for running multiple tests. +struct RunConfig { + // How should the test be run? + TestConfig test; + + // How many tests should you alternate between? + size_t batch_size; + + // How many times should you run each test + input? + size_t num_repeat; +}; + +// Run each test with each input, and check the end state. +// For test "t" and input "i", the end state will be at index: +// t * inputs.size() + i. +// Tests will be executed in an interleaved order and repeated according to the +// `config`. +void RunTests(absl::Span tests, absl::Span inputs, + absl::Span end_states, const RunConfig& config, + size_t test_offset, ResultReporter& result); + +// Internal function, exported for testing. 
void RunHashTest(void* test, const TestConfig& config, const EntropyBuffer& input, EntropyBuffer& output); diff --git a/fuzzer/hashtest/hashtest_runner_test.cc b/fuzzer/hashtest/hashtest_runner_test.cc index efa565c7..3516e9d7 100644 --- a/fuzzer/hashtest/hashtest_runner_test.cc +++ b/fuzzer/hashtest/hashtest_runner_test.cc @@ -16,9 +16,13 @@ #include #include +#include #include "gtest/gtest.h" +#include "absl/types/span.h" #include "./fuzzer/hashtest/hashtest_runner_widgits.h" +#include "./fuzzer/hashtest/instruction_pool.h" +#include "./fuzzer/hashtest/synthesize_base.h" #include "./instruction/xed_util.h" #include "./util/platform.h" @@ -105,6 +109,44 @@ TEST(Runner, Run256) { SmokeTest(3, kVectorWidth); } +TEST(Runner, EndToEnd) { + InitXedIfNeeded(); + xed_chip_enum_t chip = PlatformIdToChip(CurrentPlatformId()); + if (chip == XED_CHIP_INVALID) { + GTEST_SKIP() << "Unsupported chip."; + } + + Rng rng(0); + + const RunConfig config = { + .test = + { + .vector_width = ChipVectorRegisterWidth(chip), + .num_iterations = 1, + }, + .batch_size = 1, + .num_repeat = 1, + }; + + InstructionPool ipool{}; + GenerateInstructionPool(rng, chip, ipool, false); + Corpus corpus = SynthesizeCorpus(rng, chip, ipool, 1, false); + + std::vector inputs; + inputs.resize(1); + RandomizeEntropyBuffer(GetSeed(rng), inputs[0].entropy); + + std::vector end_states; + end_states.resize(corpus.tests.size() * inputs.size()); + ComputeEndStates(corpus.tests, config.test, inputs, + absl::MakeSpan(end_states)); + + ResultReporter result; + RunTests(corpus.tests, inputs, end_states, config, 0, result); + + EXPECT_EQ(result.hits.size(), 0); +} + } // namespace } // namespace silifuzz