From d6fd43cf382dd221268bf60abb7262874d63ef6c Mon Sep 17 00:00:00 2001 From: Marc Kirchner Date: Fri, 15 Dec 2023 09:16:07 +0400 Subject: [PATCH 1/5] Add universal hash --- Makefile | 4 ++- include/uh.h | 9 ++++++ src/hamt.c | 2 +- src/uh.c | 23 ++++++++++++++ test/test_hamt.c | 81 ++++++++++++++++++++++++++++++------------------ 5 files changed, 87 insertions(+), 32 deletions(-) create mode 100644 include/uh.h create mode 100644 src/uh.c diff --git a/Makefile b/Makefile index 5caed60..7351062 100644 --- a/Makefile +++ b/Makefile @@ -5,13 +5,15 @@ INC_FLAGS := $(addprefix -I,$(INC_DIRS)) LIB_SRCS := \ src/hamt.c \ - src/murmur3.c + src/murmur3.c \ + src/uh.c LIB_OBJS := $(LIB_SRCS:%=$(BUILD_DIR)/%.o) LIB_DEPS := $(LIB_OBJS:.o=.d) TEST_HAMT_SRCS := \ src/murmur3.c \ + src/uh.c \ test/test_hamt.c \ test/utils.c \ test/words.c diff --git a/include/uh.h b/include/uh.h new file mode 100644 index 0000000..2e0f6ec --- /dev/null +++ b/include/uh.h @@ -0,0 +1,9 @@ +#ifndef UNIVERSAL_HASH_H +#define UNIVERSAL_HASH_H + +#include +#include + +uint32_t sedgewick_universal_hash(const char *str, uint32_t M); + +#endif diff --git a/src/hamt.c b/src/hamt.c index c00020a..083be4d 100644 --- a/src/hamt.c +++ b/src/hamt.c @@ -98,7 +98,7 @@ static inline hash_state *hash_next(hash_state *h) h->depth += 1; h->shift += 5; if (h->shift > 25) { - h->hash = h->hash_fn(h->key, h->depth / 5); + h->hash = h->hash_fn(h->key, h->depth); h->shift = 0; } return h; diff --git a/src/uh.c b/src/uh.c new file mode 100644 index 0000000..55aad2f --- /dev/null +++ b/src/uh.c @@ -0,0 +1,23 @@ +#include "uh.h" + +/* Sedgewick universal hash from Sedgewick R, "Algorithms in C" Third + * Edition, 1998, p. 579. Works on null-terminated C strings. + * + * Best hash function for 32-ary trees according to Bagwell P, "Ideal + * Hash Trees" (in comparison with Elf and PJW hash). + * + * And indeed, comparative experiments w/ murmur3 show more consistent and + * smaller max tree depths. For use in HAMT, it is importatnt to choose M + * large enough since the probability of two nonequal keys to collide is + * approximately 1/M. + */ + +uint32_t sedgewick_universal_hash(const char *str, uint32_t M) +{ + uint32_t h; + uint32_t a = 31415, b = 27183; + for (h = 0; *str != '\0'; ++str, a = a * b % (M - 1)) + h = (a * h + *str) % M; + return h; +} + diff --git a/test/test_hamt.c b/test/test_hamt.c index 31bc761..1cf9004 100644 --- a/test/test_hamt.c +++ b/test/test_hamt.c @@ -1,10 +1,13 @@ +#include "hamt.h" #include "minunit.h" #include +#include #include #include #include #include "murmur3.h" +#include "uh.h" #include "utils.h" #include "words.h" @@ -775,9 +778,16 @@ MU_TEST_CASE(test_persistent_remove_aspell_dict_en) return 0; } + +static uint32_t my_keyhash_universal(const void *key, const size_t gen) +{ + return sedgewick_universal_hash((const char *) key, gen << 8); +} + + MU_TEST_CASE(test_tree_depth) { - printf(". testing tree depth log32 assumptions"); + printf(". testing tree depth log32 assumptions\n"); size_t n_items = 1e6; char **words = NULL; @@ -785,40 +795,51 @@ MU_TEST_CASE(test_tree_depth) words_load_numbers(&words, 0, n_items); - t = hamt_create(my_keyhash_string, my_keycmp_string, - &hamt_allocator_default); - for (size_t i = 0; i < n_items; i++) { - hamt_set(t, words[i], words[i]); - } + hamt_key_hash_fn hash_fns[2] = { my_keyhash_string, my_keyhash_universal }; + char *hash_names[2] = { "murmur3", "sedgewick_universal" }; - /* Calculate the avg tree depth across all items */ - double avg_depth = 0.0; - size_t max_depth = 0; - for (size_t i = 0; i < n_items; i++) { - hash_state *hash = &(hash_state){.key = words[i], - .hash_fn = my_keyhash_string, - .hash = my_keyhash_string(words[i], 0), - .depth = 0, - .shift = 0}; - search_result sr = - search_recursive(t, t->root, hash, t->key_cmp, words[i], NULL); - if (sr.status != SEARCH_SUCCESS) { - printf("tree search failed for: %s\n", words[i]); - continue; + for (size_t k = 0; k < 2; ++k) { + + t = hamt_create(hash_fns[k], my_keycmp_string, &hamt_allocator_default); + for (size_t i = 0; i < n_items; i++) { + hamt_set(t, words[i], words[i]); } - // in order to calculate depth, item must exist - MU_ASSERT(sr.status == SEARCH_SUCCESS, "tree depth search failure"); - avg_depth = (avg_depth * i + sr.hash->depth) / (i + 1); - if (sr.hash->depth > max_depth) { - max_depth = sr.hash->depth; + /* Calculate the avg tree depth across all items */ + double avg_depth = 0.0; + size_t max_depth = 0; + for (size_t i = 0; i < n_items; i++) { + hash_state *hash = &(hash_state){.key = words[i], + .hash_fn = hash_fns[k], + .hash = hash_fns[k](words[i], 0), + .depth = 0, + .shift = 0}; + search_result sr = + search_recursive(t, t->root, hash, t->key_cmp, words[i], NULL); + if (sr.status != SEARCH_SUCCESS) { + printf("tree search failed for: %s\n", words[i]); + continue; + } + // in order to calculate depth, item must exist + MU_ASSERT(sr.status == SEARCH_SUCCESS, "tree depth search failure"); + avg_depth = (avg_depth * i + sr.hash->depth) / (i + 1); + if (sr.hash->depth > max_depth) { + max_depth = sr.hash->depth; + // printf("New max depth %lu for %s\n", max_depth, words[i]); + } + /* + else + if (sr.hash->depth == max_depth) { + printf("Equal max depth %lu for %s\n", max_depth, words[i]); + } + */ } - } - hamt_delete(t); + hamt_delete(t); + printf(" %s (avg depth for %lu items: %0.3f, expected %0.3f, max: %lu)\n", + hash_names[k], n_items, avg_depth, log2(n_items) / 5.0, + max_depth); /* log_32(n_items) */ + } words_free(words, n_items); - printf(" (avg tree depth w/ %lu items: %f, expected %f, max: %lu)\n", - n_items, avg_depth, log2(n_items) / 5.0, - max_depth); /* log_32(n_items) */ return 0; } int mu_tests_run = 0; From e5b64f5abd2544e044c454544dc029075b385c90 Mon Sep 17 00:00:00 2001 From: mkirchner Date: Fri, 15 Dec 2023 13:33:35 +0400 Subject: [PATCH 2/5] Update c-cpp.yml Update macOS version --- .github/workflows/c-cpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index d66d645..23f6f17 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -9,7 +9,7 @@ on: jobs: build: - runs-on: macos-11 + runs-on: macos-12 steps: - uses: actions/checkout@v2 From 0ba554d90f4992f2432ddbca863020161e0df060 Mon Sep 17 00:00:00 2001 From: Marc Kirchner Date: Fri, 15 Dec 2023 13:38:23 +0400 Subject: [PATCH 3/5] bug hunt --- Makefile | 2 +- test/test_hamt.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 7351062..eeeaf72 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ TEST_MURMUR_SRCS := test/test_murmur.c TEST_MURMUR_OBJS := $(TEST_MURMUR_SRCS:%=$(BUILD_DIR)/%.o) TEST_MURMUR_DEPS := $(TEST_MURMUR_OBJS:.o=.d) -CPPFLAGS ?= $(INC_FLAGS) -MMD -MP -g -O0 +CPPFLAGS ?= $(INC_FLAGS) -MMD -MP -O3 # -g -O0 lib: $(BUILD_DIR)/src/libhamt.dylib diff --git a/test/test_hamt.c b/test/test_hamt.c index 1cf9004..8953495 100644 --- a/test/test_hamt.c +++ b/test/test_hamt.c @@ -866,7 +866,7 @@ MU_TEST_SUITE(test_suite) MU_RUN_TEST(test_persistent_set); MU_RUN_TEST(test_persistent_aspell_dict_en); MU_RUN_TEST(test_persistent_remove_aspell_dict_en); - MU_RUN_TEST(test_tree_depth); + // MU_RUN_TEST(test_tree_depth); return 0; } From f0569d63479a13d80c84d860bd95e107032ee150 Mon Sep 17 00:00:00 2001 From: Marc Kirchner Date: Fri, 15 Dec 2023 13:47:21 +0400 Subject: [PATCH 4/5] Split makefile steps --- Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index eeeaf72..e696f94 100644 --- a/Makefile +++ b/Makefile @@ -33,7 +33,9 @@ lib: $(BUILD_DIR)/src/libhamt.dylib $(BUILD_DIR)/src/libhamt.dylib: $(LIB_OBJS) $(CC) $(LIB_OBJS) -dynamiclib -o $@ -test: $(BUILD_DIR)/test/test_hamt $(BUILD_DIR)/test/test_murmur +build_test: $(BUILD_DIR)/test/test_hamt $(BUILD_DIR)/test/test_murmur + +test: build_test $(BUILD_DIR)/test/test_murmur $(BUILD_DIR)/test/test_hamt From 4e77744b2a6393428d13b4e220cafb51c64b4196 Mon Sep 17 00:00:00 2001 From: mkirchner Date: Fri, 15 Dec 2023 13:48:49 +0400 Subject: [PATCH 5/5] Update c-cpp.yml Split make steps --- .github/workflows/c-cpp.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 23f6f17..bc83033 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -8,10 +8,14 @@ on: jobs: build: - runs-on: macos-12 steps: - uses: actions/checkout@v2 - - name: make + - name: Build libraries + run: make + - name: Build tests + run: make build_test + - name: Run tests run: make test +