Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Sedgewick's universal hash #40

Merged
merged 5 commits into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions .github/workflows/c-cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@ on:

jobs:
build:

runs-on: macos-11
runs-on: macos-12

steps:
- uses: actions/checkout@v2
- name: make
- name: Build libraries
run: make
- name: Build tests
run: make build_test
- name: Run tests
run: make test

10 changes: 7 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@ INC_FLAGS := $(addprefix -I,$(INC_DIRS))

LIB_SRCS := \
src/hamt.c \
src/murmur3.c
src/murmur3.c \
src/uh.c

LIB_OBJS := $(LIB_SRCS:%=$(BUILD_DIR)/%.o)
LIB_DEPS := $(LIB_OBJS:.o=.d)

TEST_HAMT_SRCS := \
src/murmur3.c \
src/uh.c \
test/test_hamt.c \
test/utils.c \
test/words.c
Expand All @@ -24,14 +26,16 @@ TEST_MURMUR_SRCS := test/test_murmur.c
TEST_MURMUR_OBJS := $(TEST_MURMUR_SRCS:%=$(BUILD_DIR)/%.o)
TEST_MURMUR_DEPS := $(TEST_MURMUR_OBJS:.o=.d)

CPPFLAGS ?= $(INC_FLAGS) -MMD -MP -g -O0
CPPFLAGS ?= $(INC_FLAGS) -MMD -MP -O3 # -g -O0

lib: $(BUILD_DIR)/src/libhamt.dylib

$(BUILD_DIR)/src/libhamt.dylib: $(LIB_OBJS)
$(CC) $(LIB_OBJS) -dynamiclib -o $@

test: $(BUILD_DIR)/test/test_hamt $(BUILD_DIR)/test/test_murmur
build_test: $(BUILD_DIR)/test/test_hamt $(BUILD_DIR)/test/test_murmur

test: build_test
$(BUILD_DIR)/test/test_murmur
$(BUILD_DIR)/test/test_hamt

Expand Down
9 changes: 9 additions & 0 deletions include/uh.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/* Public interface for Sedgewick's universal string hash. */
#ifndef UNIVERSAL_HASH_H
#define UNIVERSAL_HASH_H

#include <stdint.h>
#include <stdlib.h>

/*
 * Hash a NUL-terminated C string.
 *
 * str: the string to hash; read up to (not including) the terminating '\0'.
 * M:   modulus applied to the running hash after every character, so the
 *      result lies in [0, M). Choose M large: the probability that two
 *      distinct keys collide is roughly 1/M. Values below 2 are not
 *      meaningful moduli.
 *
 * Returns the hash value; the empty string hashes to 0.
 */
uint32_t sedgewick_universal_hash(const char *str, uint32_t M);

#endif /* UNIVERSAL_HASH_H */
2 changes: 1 addition & 1 deletion src/hamt.c
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ static inline hash_state *hash_next(hash_state *h)
h->depth += 1;
h->shift += 5;
if (h->shift > 25) {
h->hash = h->hash_fn(h->key, h->depth / 5);
h->hash = h->hash_fn(h->key, h->depth);
h->shift = 0;
}
return h;
Expand Down
23 changes: 23 additions & 0 deletions src/uh.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#include "uh.h"

/* Sedgewick universal hash from Sedgewick R, "Algorithms in C" Third
 * Edition, 1998, p. 579. Works on null-terminated C strings.
 *
 * Best hash function for 32-ary trees according to Bagwell P, "Ideal
 * Hash Trees" (in comparison with Elf and PJW hash).
 *
 * And indeed, comparative experiments w/ murmur3 show more consistent and
 * smaller max tree depths. For use in HAMT, it is important to choose M
 * large enough, since the probability that two distinct keys collide is
 * approximately 1/M.
 *
 * Parameters:
 *   str  NUL-terminated string to hash (must not be NULL).
 *   M    modulus; for M >= 2 the result lies in [0, M).
 *
 * Degenerate moduli are handled explicitly instead of dividing by zero:
 *   M == 0  no reduction is applied to the hash, i.e. it is taken modulo
 *           2^32 by ordinary uint32_t wrap-around.
 *   M == 1  every value reduces to 0, so 0 is returned.
 *
 * Returns the hash of str; the empty string hashes to 0.
 */
uint32_t sedgewick_universal_hash(const char *str, uint32_t M)
{
    uint32_t h = 0;
    uint32_t a = 31415;
    const uint32_t b = 27183;

    /* M == 1 would make the multiplier update below compute `% 0`. */
    if (M == 1) {
        return 0;
    }

    for (; *str != '\0'; ++str) {
        /* Read the byte as unsigned char so characters >= 0x80 hash the
         * same regardless of the platform's plain-char signedness. */
        h = a * h + (unsigned char)*str;
        if (M != 0) {
            h %= M;
        }
        /* Pseudo-random multiplier sequence; for M == 0 the divisor is
         * UINT32_MAX (unsigned wrap-around), which is well defined. */
        a = a * b % (M - 1);
    }
    return h;
}

83 changes: 52 additions & 31 deletions test/test_hamt.c
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
#include "hamt.h"
#include "minunit.h"
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "murmur3.h"
#include "uh.h"
#include "utils.h"
#include "words.h"

Expand Down Expand Up @@ -775,50 +778,68 @@ MU_TEST_CASE(test_persistent_remove_aspell_dict_en)
return 0;
}


/*
 * HAMT key-hash adapter around sedgewick_universal_hash().
 *
 * key: pointer to a NUL-terminated C string.
 * gen: hash generation; each generation selects a different modulus so
 *      that re-hashing the same key yields a different hash value.
 *
 * The modulus starts at a large base and is decreased by 256 per
 * generation. Using `gen << 8` directly as the modulus would pass M == 0
 * for generation 0 (a modulo by zero inside the hash) and only tiny moduli
 * afterwards, which defeats the "choose M large" requirement of the hash.
 * Assumes gen stays far below 2^23 so the subtraction cannot wrap.
 */
static uint32_t my_keyhash_universal(const void *key, const size_t gen)
{
    const uint32_t base_modulus = 0x8fffffffu;
    const uint32_t modulus = base_modulus - ((uint32_t)gen << 8);

    return sedgewick_universal_hash((const char *)key, modulus);
}


MU_TEST_CASE(test_tree_depth)
{
printf(". testing tree depth log32 assumptions");
printf(". testing tree depth log32 assumptions\n");

size_t n_items = 1e6;
char **words = NULL;
struct hamt *t;

words_load_numbers(&words, 0, n_items);

t = hamt_create(my_keyhash_string, my_keycmp_string,
&hamt_allocator_default);
for (size_t i = 0; i < n_items; i++) {
hamt_set(t, words[i], words[i]);
}
hamt_key_hash_fn hash_fns[2] = { my_keyhash_string, my_keyhash_universal };
char *hash_names[2] = { "murmur3", "sedgewick_universal" };

/* Calculate the avg tree depth across all items */
double avg_depth = 0.0;
size_t max_depth = 0;
for (size_t i = 0; i < n_items; i++) {
hash_state *hash = &(hash_state){.key = words[i],
.hash_fn = my_keyhash_string,
.hash = my_keyhash_string(words[i], 0),
.depth = 0,
.shift = 0};
search_result sr =
search_recursive(t, t->root, hash, t->key_cmp, words[i], NULL);
if (sr.status != SEARCH_SUCCESS) {
printf("tree search failed for: %s\n", words[i]);
continue;
for (size_t k = 0; k < 2; ++k) {

t = hamt_create(hash_fns[k], my_keycmp_string, &hamt_allocator_default);
for (size_t i = 0; i < n_items; i++) {
hamt_set(t, words[i], words[i]);
}
// in order to calculate depth, item must exist
MU_ASSERT(sr.status == SEARCH_SUCCESS, "tree depth search failure");
avg_depth = (avg_depth * i + sr.hash->depth) / (i + 1);
if (sr.hash->depth > max_depth) {
max_depth = sr.hash->depth;
/* Calculate the avg tree depth across all items */
double avg_depth = 0.0;
size_t max_depth = 0;
for (size_t i = 0; i < n_items; i++) {
hash_state *hash = &(hash_state){.key = words[i],
.hash_fn = hash_fns[k],
.hash = hash_fns[k](words[i], 0),
.depth = 0,
.shift = 0};
search_result sr =
search_recursive(t, t->root, hash, t->key_cmp, words[i], NULL);
if (sr.status != SEARCH_SUCCESS) {
printf("tree search failed for: %s\n", words[i]);
continue;
}
// in order to calculate depth, item must exist
MU_ASSERT(sr.status == SEARCH_SUCCESS, "tree depth search failure");
avg_depth = (avg_depth * i + sr.hash->depth) / (i + 1);
if (sr.hash->depth > max_depth) {
max_depth = sr.hash->depth;
// printf("New max depth %lu for %s\n", max_depth, words[i]);
}
/*
else
if (sr.hash->depth == max_depth) {
printf("Equal max depth %lu for %s\n", max_depth, words[i]);
}
*/
}
}

hamt_delete(t);
hamt_delete(t);
printf(" %s (avg depth for %lu items: %0.3f, expected %0.3f, max: %lu)\n",
hash_names[k], n_items, avg_depth, log2(n_items) / 5.0,
max_depth); /* log_32(n_items) */
}
words_free(words, n_items);
printf(" (avg tree depth w/ %lu items: %f, expected %f, max: %lu)\n",
n_items, avg_depth, log2(n_items) / 5.0,
max_depth); /* log_32(n_items) */
return 0;
}
int mu_tests_run = 0;
Expand All @@ -845,7 +866,7 @@ MU_TEST_SUITE(test_suite)
MU_RUN_TEST(test_persistent_set);
MU_RUN_TEST(test_persistent_aspell_dict_en);
MU_RUN_TEST(test_persistent_remove_aspell_dict_en);
MU_RUN_TEST(test_tree_depth);
// MU_RUN_TEST(test_tree_depth);
return 0;
}

Expand Down
Loading