From db59bce47c418cafc1a384345f284fafe86d58bb Mon Sep 17 00:00:00 2001 From: David Hagen Date: Fri, 14 Jun 2024 07:05:36 -0400 Subject: [PATCH] Remove HashOutput (#68) Feature can be resurrected when bandwidth exists to make it actually work --- src/tensora/codegen/_type_to_c.py | 7 +- src/tensora/ir/types.py | 10 - .../iteration_graph/outputs/__init__.py | 1 - .../iteration_graph/outputs/_append.py | 10 +- src/tensora/iteration_graph/outputs/_hash.py | 326 ------------------ .../iteration_graph/tensora_hash_table.c | 167 --------- tests/codegen/test_ast_to_c.py | 1 - tests/csr_matmul_hash.c | 69 ---- 8 files changed, 5 insertions(+), 586 deletions(-) delete mode 100644 src/tensora/iteration_graph/outputs/_hash.py delete mode 100644 src/tensora/iteration_graph/tensora_hash_table.c delete mode 100644 tests/csr_matmul_hash.c diff --git a/src/tensora/codegen/_type_to_c.py b/src/tensora/codegen/_type_to_c.py index 5380d24..46329c9 100644 --- a/src/tensora/codegen/_type_to_c.py +++ b/src/tensora/codegen/_type_to_c.py @@ -2,7 +2,7 @@ from functools import singledispatch -from ..ir.types import Array, FixedArray, Float, HashTable, Integer, Mode, Pointer, Tensor, Type +from ..ir.types import Array, FixedArray, Float, Integer, Mode, Pointer, Tensor, Type def space_variable(variable: str | None = None) -> str: @@ -37,11 +37,6 @@ def type_to_c_mode(self: Mode, variable: str | None = None) -> str: return "taco_mode_t" + space_variable(variable) -@type_to_c.register(HashTable) -def type_to_c_hash_table(self: HashTable, variable: str | None = None) -> str: - return "hash_table_t" + space_variable(variable) - - @type_to_c.register(Pointer) def type_to_c_pointer(self: Pointer, variable: str | None = None) -> str: return f"{type_to_c(self.target)}* restrict" + space_variable(variable) diff --git a/src/tensora/ir/types.py b/src/tensora/ir/types.py index 55fdbf1..83594ca 100644 --- a/src/tensora/ir/types.py +++ b/src/tensora/ir/types.py @@ -8,8 +8,6 @@ "tensor", "Mode", "mode", - "HashTable", - "hash_table", "Pointer", "Array", "FixedArray", @@ -54,14 +52,6 @@ class Mode(Type): mode = Mode() -@dataclass(frozen=True, slots=True) -class HashTable(Type): - pass - - -hash_table = HashTable() - - @dataclass(frozen=True, slots=True) class Pointer(Type): target: Type diff --git a/src/tensora/iteration_graph/outputs/__init__.py b/src/tensora/iteration_graph/outputs/__init__.py index d23c83d..10dc4c7 100644 --- a/src/tensora/iteration_graph/outputs/__init__.py +++ b/src/tensora/iteration_graph/outputs/__init__.py @@ -1,4 +1,3 @@ from ._append import AppendOutput from ._base import Output from ._bucket import BucketOutput -from ._hash import HashOutput diff --git a/src/tensora/iteration_graph/outputs/_append.py b/src/tensora/iteration_graph/outputs/_append.py index 13a3048..f1886cc 100644 --- a/src/tensora/iteration_graph/outputs/_append.py +++ b/src/tensora/iteration_graph/outputs/_append.py @@ -21,7 +21,6 @@ from ..identifiable_expression import ast as ie_ast from ._base import Output from ._bucket import BucketOutput -from ._hash import HashOutput default_array_size = Multiply(IntegerLiteral(1024), IntegerLiteral(1024)) @@ -151,9 +150,8 @@ def next_output( ) return next_output, next_output.write_declarations(bucket), SourceBuilder() else: - next_output = HashOutput(self.output, self.next_layer) - return ( - next_output, - next_output.write_declarations(), - next_output.write_cleanup(kernel_type), + raise NotImplementedError( + "Encountered a sparse output layer preceded by a contraction layer or a later " + "output layer. This requires a hash table to store intermediate outputs, " + "which is not currently implemented." ) diff --git a/src/tensora/iteration_graph/outputs/_hash.py b/src/tensora/iteration_graph/outputs/_hash.py deleted file mode 100644 index 9ff0a91..0000000 --- a/src/tensora/iteration_graph/outputs/_hash.py +++ /dev/null @@ -1,326 +0,0 @@ -__all__ = ["HashOutput"] - -from dataclasses import dataclass, replace - -from ...format import Mode -from ...ir import SourceBuilder, types -from ...ir.ast import ( - Address, - ArrayAllocate, - ArrayLiteral, - Break, - Expression, - Free, - FunctionCall, - IntegerLiteral, - LessThan, - ModeLiteral, - NotEqual, - Variable, -) -from ...kernel_type import KernelType -from .._names import dimension_name, layer_pointer, previous_layer_pointer, vals_name -from .._write_sparse_ir import write_crd_assembly, write_pos_assembly -from ..identifiable_expression import TensorLayer -from ..identifiable_expression import ast as ie_ast -from ._base import Output -from ._bucket import BucketOutput - - -@dataclass(frozen=True, slots=True) -class HashOutput(Output): - output: ie_ast.Tensor - starting_layer: int - unfulfilled: set[int] - - def __init__( - self, output: ie_ast.Tensor, starting_layer: int, unfulfilled: set[int] | None = None - ): - object.__setattr__(self, "output", output) - object.__setattr__(self, "starting_layer", starting_layer) - if unfulfilled is not None: - object.__setattr__(self, "unfulfilled", unfulfilled) - else: - object.__setattr__( - self, "unfulfilled", set(range(starting_layer, self.final_dense_index())) - ) - - def final_dense_index(self): - final_dense_index = self.output.order - for i in reversed(range(self.starting_layer, self.output.order)): - if self.output.modes[i] == Mode.compressed: - break - else: - final_dense_index = i - - return final_dense_index - - def key_number(self, layer: int): - number = 0 - for i in range(self.starting_layer, self.output.order): - if layer == i: - return number - if self.output.modes[i] == Mode.compressed: - number += 1 - return number - - def write_declarations(self) -> SourceBuilder: - source = SourceBuilder("Hash table initialization") - - modes = [ModeLiteral(mode) for mode in self.output.modes[self.starting_layer :]] - dims = [ - dimension_name(variable) for variable in self.output.indexes[self.starting_layer :] - ] - - source.add_dependency("hash") - - # Construct hash table - source.append(self.name().declare(types.hash_table)) - source.append( - self.modes_name().declare(types.Array(types.mode)).assign(ArrayLiteral(modes)) - ) - source.append( - self.dims_name().declare(types.Array(types.integer)).assign(ArrayLiteral(dims)) - ) - source.append( - FunctionCall( - Variable("hash_construct"), - [ - Address(self.name()), - IntegerLiteral(len(modes)), - IntegerLiteral(self.final_dense_index() - self.starting_layer), - self.modes_name(), - self.dims_name(), - ], - ) - ) - - return source - - def write_assignment(self, right_hand_side: str, kernel_type: KernelType) -> SourceBuilder: - raise RuntimeError() - - def write_cleanup(self, kernel_type: KernelType) -> SourceBuilder: - source = SourceBuilder("Hash table cleanup") - - order_name = self.order_name() - loop_name = self.sort_index_name() - - # Argsort the elements by key - source.append( - self.order_name() - .declare(types.Pointer(types.integer)) - .assign(ArrayAllocate(types.integer, self.name().attr("count"))) - ) - source.append(loop_name.declare(types.integer).assign(0)) - with source.loop(LessThan(loop_name, self.name().attr("count"))): - source.append(self.order_name().idx(loop_name).assign(loop_name)) - source.append(loop_name.increment()) - source.append( - FunctionCall( - Variable("qsort_r"), - [ - self.order_name(), - self.name().attr("count"), - Variable("sizeof(uint32_t)"), # Temporary hack - Variable("hash_comparator"), - Address(self.name()), - ], - ) - ) - - # Extract indexes recursively - source.append(self.extract_index_name().declare(types.integer).assign(0)) - source.append(self.write_layer_cleanup(self.starting_layer, kernel_type)) - - # Free temporaries - source.append(Free(order_name)) - - # Free hash table - source.append(FunctionCall(Variable("hash_destruct"), [Address(self.name())])) - - return source - - def write_layer_cleanup(self, layer: int, kernel_type: KernelType): - source = SourceBuilder() - - if layer < self.final_dense_index(): - key_number = self.key_number(layer) - layer_index = Variable(self.output.indexes[layer]) - dimension_size = dimension_name(self.output.indexes[layer]) - position = layer_pointer(self.output.id, layer) - previous_position = previous_layer_pointer(self.output.id, layer) - end_position = self.end_position(key_number) - next_end_position = self.end_position(key_number + 1) - - # Reusable search code - # This is not applicable for the final key, which has no next key - search_source = SourceBuilder() - search_source.append( - next_end_position.declare(types.integer).assign(self.extract_index_name()) - ) - with search_source.loop(LessThan(next_end_position, end_position)): - with search_source.branch( - NotEqual( - self.name() - .attr("keys") - .idx(self.order_name().idx(next_end_position)) - .idx(key_number), - layer_index, - ) - ): - search_source.append(Break()) - search_source.append(next_end_position.increment()) - - # Keys phase - if self.output.modes[layer] == Mode.dense: - source.append(layer_index.declare(types.integer).assign(0)) - with source.loop(LessThan(layer_index, dimension_size)): - source.append( - position.declare(types.integer).assign(previous_position.plus(layer_index)) - ) - source.append(search_source) - source.append(self.write_layer_cleanup(layer + 1, kernel_type)) - source.append(layer_index.increment()) - - if layer == self.final_dense_index() - 1: - source.append(self.extract_index_name().increment()) - - elif self.output.modes[layer] == Mode.compressed: - with source.loop(LessThan(self.extract_index_name(), end_position)): - source.append( - layer_index.declare(types.integer).assign( - self.name() - .attr("keys") - .idx(self.order_name().idx(self.extract_index_name())) - .idx(key_number) - ) - ) - source.append(search_source) - source.append(self.write_layer_cleanup(layer + 1, kernel_type)) - - if kernel_type.is_assemble(): - source.append(write_crd_assembly(TensorLayer(self.output, layer))) - source.append(position.increment()) - - if layer == self.final_dense_index() - 1: - source.append(self.extract_index_name().increment()) - - if kernel_type.is_assemble(): - source.append(write_pos_assembly(TensorLayer(self.output, layer))) - elif layer < self.output.order: - # Bucket phase - layer_index = Variable(self.output.indexes[layer]) - dimension_size = dimension_name(self.output.indexes[layer]) - position = layer_pointer(self.output.id, layer) - previous_position = previous_layer_pointer(self.output.id, layer) - bucket_position = self.bucket_position(layer) - previous_bucket_position = self.previous_bucket_position(layer) - - source.append(layer_index.declare(types.integer).assign(0)) - with source.loop(LessThan(layer_index, dimension_size)): - source.append( - position.declare(types.integer).assign(previous_position.plus(layer_index)) - ) - source.append( - bucket_position.declare(types.integer).assign( - previous_bucket_position.plus(layer_index) - ) - ) - source.append(self.write_layer_cleanup(layer + 1, kernel_type)) - source.append(layer_index.increment()) - elif layer == self.output.order: - # Final phase - vals = vals_name(self.output.name) - previous_position = previous_layer_pointer(self.output.id, layer) - previous_bucket_position = self.previous_bucket_position(layer) - bucket = BucketOutput( - self.output, list(range(self.final_dense_index(), self.output.order)) - ) - source.append( - vals.idx(previous_position).assign(bucket.name().idx(previous_bucket_position)) - ) - - return source - - def next_output( - self, iteration_output: TensorLayer | None, kernel_type: KernelType - ) -> tuple[Output, SourceBuilder, SourceBuilder]: - if iteration_output is None: - return self, SourceBuilder(), SourceBuilder() - else: - next_unfulfilled = self.unfulfilled - {iteration_output.layer} - if len(next_unfulfilled) == 0: - final_dense_index = self.final_dense_index() - - next_output = BucketOutput( - self.output, list(range(final_dense_index, self.output.order)) - ) - - # Write declaration of bucket - source = SourceBuilder() - - key_names = [ - Variable(self.output.indexes[layer]) - for layer in range(self.starting_layer, final_dense_index) - ] - key_name = self.key_name() - - source.append( - key_name.declare(types.Array(types.integer)).assign(ArrayLiteral(key_names)) - ) - source.append( - next_output.write_declarations( - FunctionCall( - Variable("hash_get_bucket"), - [ - Address(self.name()), - key_name, - ], - ) - ) - ) - - return next_output, source, SourceBuilder() - else: - return ( - replace(self, unfulfilled=next_unfulfilled), - SourceBuilder(), - SourceBuilder(), - ) - - def name(self) -> Variable: - return Variable("hash_table") - - def modes_name(self) -> Variable: - return Variable(f"i_{self.name().name}_modes") - - def dims_name(self) -> Variable: - return Variable(f"i_{self.name().name}_dims") - - def key_name(self) -> Variable: - return Variable(f"i_{self.name().name}_key") - - def order_name(self) -> Variable: - return Variable(f"{self.name().name}_order") - - def sort_index_name(self) -> Variable: - return Variable(f"i_{self.name().name}_argsort") - - def extract_index_name(self) -> Variable: - return Variable(f"p_{self.name().name}_order") - - def end_position(self, key_number: int) -> Expression: - if key_number == 0: - return self.name().attr("count") - else: - return Variable(f"p_{self.name().name}_order_{key_number}_end") - - def bucket_position(self, layer: int): - return Variable(layer_pointer(self.output.id, layer).name + "_bucket") - - def previous_bucket_position(self, layer: int): - if layer == 0: - return IntegerLiteral(0) - else: - return self.bucket_position(layer - 1) diff --git a/src/tensora/iteration_graph/tensora_hash_table.c b/src/tensora/iteration_graph/tensora_hash_table.c deleted file mode 100644 index 8776330..0000000 --- a/src/tensora/iteration_graph/tensora_hash_table.c +++ /dev/null @@ -1,167 +0,0 @@ -#include - -static inline uint32_t murmur_32_scramble(uint32_t k) { - k *= 0xcc9e2d51; - k = (k << 15) | (k >> 17); - k *= 0x1b873593; - return k; -} - -uint32_t murmur3_32(uint32_t n_sparse, const uint32_t* key) { - uint32_t h = 1; - - for (size_t i = 0; i < n_sparse; i++) { - h ^= murmur_32_scramble(key[i]); - h = (h << 13) | (h >> 19); - h = h * 5 + 0xe6546b64; - } - - h ^= n_sparse; - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; -} - -struct hash_table_t { - uint32_t n_layers; - uint32_t dense_start; - taco_mode_t *modes; - uint32_t n_sparse; - uint32_t bucket_size; - - uint32_t count; - uint32_t table_capacity; - uint32_t entries_capacity; - int32_t[] indexes; - uint32_t[] keys; - double[] values; -}; - -void hash_construct( - hash_table_t *hash_table, - uint32_t n_layers, - uint32_t dense_start, - taco_mode_t *modes, - uint32_t *dimensions -) { - uint32_t n_sparse = dense_start - n_layers - - uint32_t bucket_size = 1; - for (uint32_t i = dense_start; i < n_layers; i++) { - bucket_size *= dimensions[i]; - } - - uint32_t table_capacity = 10; // 1 MB - uint32_t entries_capacity = 1024*1024; - uint32_t[] indexes = malloc(sizeof(uint32_t) * (1 << table_capacity)); - for (uint32_t i = 0; i < (1 << table_capacity); i++) { - indexes[i] = -1; - } - - hash_table->n_layers = n_layers; - hash_table->dense_start = dense_start; - hash_table->modes = modes; - hash_table->n_sparse = n_sparse; - hash_table->bucket_size = bucket_size; - - hash_table->count = 0; - hash_table->table_capacity = table_capacity; - hash_table->entries_capacity = entries_capacity; - hash_table->indexes = indexes; - hash_table->keys = malloc(sizeof(uint32_t) * n_sparse * entries_capacity); - hash_table->values = malloc(sizeof(double) * bucket_size * entries_capacity); -} - -void hash_realloc(hash_table_t *hash_table, uint32_t index) { - // Heuristic to expand hash table when it is two thirds full - if (index * 3 > (1 << hash_table->table_capacity) * 2) { - hash_table->table_capacity++; - free(hash_table->indexes); - - // Fill the hash table with the sentinel - hash_table->indexes = malloc(sizeof(uint32_t) * (1 << hash_table->table_capacity)); - for (uint32_t i = 0; i < (1 << table_capacity); i++) { - hash_table->indexes[i] = -1; - } - - // Reinsert all the locations of elements into the hash table - uint32_t mask = 0xffffffffu >> (32 - hash_table->table_capacity); - for (uint32_t i = 0; i < hash_table->count; i++) { - uint32_t hash_value = murmur3_32(hash_table->n_sparse, key); - uint32_t short_hash = hash_value & mask; - - hash_table->indexes[short_hash] = location; - } - } - - if (index >= hash_table->entries_capacity) { - uint32_t entries_capacity = max(hash_table->entries_capacity * 2, index) - hash_table->entries_capacity = entries_capacity; - hash_table->keys = realloc(hash_table->keys, sizeof(uint32_t) * hash_table->n_sparse * entries_capacity); - hash_table->values = realloc(hash_table->values, sizeof(double) * hash_table->bucket_size * entries_capacity); - } -} - -double[] hash_get_bucket(hash_table_t *hash_table, uint32_t *key) { - uint32_t hash_value = murmur3_32(hash_table->n_sparse, key); - uint32_t mask = 0xffffffffu >> (32 - hash_table->table_capacity); - uint32_t short_hash = hash_value & mask; - - for (;;) { - if (hash_table->indexes[short_hash] == -1) { - // Empty location found. Store the key and initialize the bucket. - uint32_t location = hash_table->count; - - // Allocate more space, if needed - hash_realloc(hash_table*, location); - - hash_table->indexes[short_hash] = location; - - for (uint32_t i = 0; i < hash_table->n_sparse; i++) { - hash_table->keys[hash_table->n_sparse * location + i] = key[i]; - } - - double[] bucket = hash_table->values + location * hash_table->bucket_size; - for (uint32_t i = 0; i < bucket_size; i++) { - bucket[i] = 0.0; - } - - hash_table->count++; - return bucket; - } else { - // Location is occupied - uint32_t location = hash_table->indexes[short_hash]; - - for (uint32_t i = 0; i < hash_table->n_sparse; i++) { - if (hash_table->keys[hash_table->n_sparse * count + i] != key[i]) { - // Location was filled with different key. Increment (mod table capacity) and continue. - short_hash = (short_hash + 1) & mask; - continue; - } - } - // Location was filled with this key already. Return the bucket. - return hash_table->values + location * hash_table->bucket_size - } - } -} - -void hash_destruct(hash_table_t *hash_table) { - free(hash_table->keys); - free(hash_table->values); -} - -int hash_comparator(uint32_t *left, uint32_t *right, hash_table_t *hash_table) { - // Keys cannot be equal so that case can be ignored - - left_key = hash_table->keys[*left * hash_table->n_sparse]; - right_key = hash_table->keys[*right * hash_table->n_sparse]; - for (uint32_t i = 0; i < hash_table->n_sparse) { - if (left_key[i] > right_key[i]) { - return 1; - } - } - return -1; -} diff --git a/tests/codegen/test_ast_to_c.py b/tests/codegen/test_ast_to_c.py index 5bd493f..89b6305 100644 --- a/tests/codegen/test_ast_to_c.py +++ b/tests/codegen/test_ast_to_c.py @@ -90,7 +90,6 @@ def clean(string: str) -> str: (Declaration(Variable("x"), integer), "int32_t x"), (Declaration(Variable("x"), float), "double x"), (Declaration(Variable("x"), tensor), "taco_tensor_t x"), - (Declaration(Variable("x"), hash_table), "hash_table_t x"), (Declaration(Variable("x"), Pointer(float)), "double* restrict x"), (Declaration(Variable("x"), Pointer(Pointer(integer))), "int32_t* restrict* restrict x"), (Declaration(Variable("x"), Array(float)), "double x[]"), diff --git a/tests/csr_matmul_hash.c b/tests/csr_matmul_hash.c deleted file mode 100644 index a6c3f2b..0000000 --- a/tests/csr_matmul_hash.c +++ /dev/null @@ -1,69 +0,0 @@ -int evaluate(taco_tensor_t *a, taco_tensor_t *b, taco_tensor_t *c) { - int32_t i_dim = a->dimensions[0]; - int32_t k_dim = a->dimensions[1]; - int32_t j_dim = b->dimensions[1]; - int32_t* restrict a_1_pos = (int32_t*)(a->indices[1][0]); - int32_t* restrict a_1_crd = (int32_t*)(a->indices[1][1]); - double* restrict a_vals = (double*)(a->vals); - int32_t* restrict b_1_pos = (int32_t*)(b->indices[1][0]); - int32_t* restrict b_1_crd = (int32_t*)(b->indices[1][1]); - double* restrict b_vals = (double*)(b->vals); - int32_t* restrict c_1_pos = (int32_t*)(c->indices[1][0]); - int32_t* restrict c_1_crd = (int32_t*)(c->indices[1][1]); - double* restrict c_vals = (double*)(c->vals); - - a_1_pos = (int32_t*)malloc(sizeof(int32_t) * (a->dimensions[0] + 1)); - a_1_pos[0] = 0; - int32_t a_1_crd_capacity = 1048576; - a_1_crd = (int32_t*)malloc(sizeof(int32_t) * a_1_crd_capacity - int32_t p_a_0_1 = 0; - int32_t a_vals_capacity = 1048576; - a_vals = (double*)malloc(sizeof(double) * a_vals_capacity); - - for (int32_t i = 0; i < i_dim; i++) { - int32_t p_b_0_0 = i; - - hash_table_t hash_table = hash_construct(); - - for (int32_t p_b_0_1 = b_1_pos[p_b_0_0]; p_b_0_1 < b_1_pos[p_b_0_0+1]; p_b_0_1++) { - int32_t i_b_0_1 = b_1_crd[p_b_0_1]; - int32_t j = i_b_0_1; - int32_t p_c_0_0 = j; - - for (int32_t p_c_0_1 = c_1_pos[p_c_0_0]; p_c_0_1 < c_1_pos[p_c_0_0+1]; p_c_0_1++) { - int32_t i_c_0_1 = c_1_crd[p_c_0_1]; - int32_t k = i_c_0_1; - - // Once the last sparse index is known, find the item in the hash table, possibly allocating it - // This bucket has enough space to store the remaining dense dimensions - double[] bucket = hash_insert(&hash_table, {k}); - - // Write the dense elements into that bucket - bucket[0] = (b_vals[p_b_0_1] * c_vals[p_c_0_1]); - } - } - - uint32_t[] hash_table_order = malloc(sizeof(uint32_t) * hash_table->count); - for (uint32_t i = 0; i < hash_table->count; i++) { - hash_table_order[i] = i; - } - - qsort_r(hash_table_order, hash_table->count, sizeof(uint32_t), hash_comparator, &hash_table); - - for (uint32_t i_order = 0; i_order < hash_table->count; i++) { - a_1_crd[p_a_0_1] = hash_table->keys[i_order * hash_table->n_sparse + 0]; - for (uint32_t i_bucket = 0; i_bucket < hash_table->bucket_size; i_bucket++) { - a_vals[p_a_0_1 + i_bucket] = hash_table->values[i_order * hash_table->bucket_size + i_bucket]; - } - p_a_0_1 = p_a_0_1 + bucket_size; - } - - hash_reset(&hash_table); - } - - a->indices[1][0] = (unit8_t*)a_1_pos; - a->indices[1][1] = (unit8_t*)a_1_crd; - a->vals = (uint8_t*)a_vals; - - return 0; -} \ No newline at end of file