
Commit

gxhash and idset
Lips7 committed Jul 18, 2024
1 parent fba7f0b commit debe14e
Showing 7 changed files with 45 additions and 125 deletions.
100 changes: 11 additions & 89 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion matcher_rs/Cargo.toml
@@ -23,12 +23,12 @@ name = "matcher_rs"
crate-type = ["rlib"]

[dependencies]
ahash = { version = "0.8.11", features = ["serde", "compile-time-rng"] }
# aho-corasick = "1.1.3"
aho-corasick-unsafe = { version = "0.0.4", git = "https://github.com/Lips7/aho-corasick" }
bitflags = { version = "2.6.0", features = ["serde"] }
daachorse = "1.0.0"
fancy-regex = "0.13.0"
gxhash = "3.4.1"
id-set = "0.2.2"
lazy_static = "1.5.0"
nohash-hasher = "0.2.0"
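
Aside (editorial note, not part of the diff): this commit swaps ahash-based maps for gxhash and integer sets for id-set. Below is a minimal sketch of the two new crates' APIs as the commit uses them — gxhash's std-HashMap alias plus its HashMapExt constructor trait (gxhash leans on hardware AES/SSE2 or NEON intrinsics for speed), and id-set's IdSet, a set of usize IDs. The keys and values are made up for illustration.

// Sketch only; illustrative keys/values, not data from the repository.
use gxhash::{HashMap as GxHashMap, HashMapExt};
use id_set::IdSet;

fn demo() {
    // A std::collections::HashMap parameterized with the gxhash hasher;
    // HashMapExt supplies the `new`/`with_capacity` constructors for the alias.
    let mut process_dict: GxHashMap<&str, &str> = GxHashMap::new();
    process_dict.insert("source", "replacement");
    assert_eq!(process_dict.get("source"), Some(&"replacement"));

    // IdSet stores usize IDs; insert returns true only when the ID is new,
    // and contains checks membership.
    let mut seen = IdSet::default();
    assert!(seen.insert(42));
    assert!(!seen.insert(42));
    assert!(seen.contains(42));
}
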
10 changes: 5 additions & 5 deletions matcher_rs/src/matcher.rs
@@ -2,7 +2,7 @@ use std::borrow::Cow;
use std::collections::HashMap;

use id_set::IdSet;
use nohash_hasher::{IntMap, IntSet};
use nohash_hasher::IntMap;
use sonic_rs::{to_string, Deserialize, Serialize};

use crate::process::process_matcher::{
@@ -613,7 +613,7 @@ impl Matcher {
processed_text_process_type_set: &[(Cow<'a, str>, IdSet)],
) -> HashMap<u32, Vec<MatchResult>> {
let mut match_result_dict = HashMap::new();
let mut failed_match_table_id_set = IntSet::default();
let mut failed_match_table_id_set = IdSet::default();

if let Some(regex_matcher) = &self.regex_matcher {
for regex_result in regex_matcher
@@ -651,10 +651,10 @@
.get_unchecked(simple_result.word_id as usize),
)
};
let match_table_id =
((word_table_conf.match_id as u64) << 32) | (word_table_conf.table_id as u64);
let match_table_id = ((word_table_conf.match_id as usize) << 32)
| (word_table_conf.table_id as usize);

if failed_match_table_id_set.contains(&match_table_id) {
if failed_match_table_id_set.contains(match_table_id) {
continue;
}

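
Aside (not part of the diff): with IdSet keys being usize rather than u64, the combined identifier above is now built with usize arithmetic. A standalone sketch of that bit layout, assuming a 64-bit target so the upper half of usize can hold match_id; the function names are illustrative.

// Sketch of the packed match_id/table_id key used in matcher.rs above.
// Assumes usize is 64 bits wide.
fn pack(match_id: u32, table_id: u32) -> usize {
    ((match_id as usize) << 32) | (table_id as usize)
}

fn unpack(key: usize) -> (u32, u32) {
    ((key >> 32) as u32, (key & 0xFFFF_FFFF) as u32)
}

fn main() {
    let key = pack(3, 7);
    assert_eq!(unpack(key), (3, 7));
}
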
10 changes: 5 additions & 5 deletions matcher_rs/src/process/process_matcher.rs
@@ -2,9 +2,6 @@ use std::borrow::Cow;
use std::fmt::Display;
use std::sync::Arc;

#[cfg(any(feature = "runtime_build", feature = "dfa"))]
use ahash::AHashMap;
use ahash::HashMapExt;
use aho_corasick_unsafe::AhoCorasick;
#[cfg(any(feature = "runtime_build", feature = "dfa"))]
use aho_corasick_unsafe::{AhoCorasickBuilder, AhoCorasickKind, MatchKind as AhoCorasickMatchKind};
@@ -16,6 +13,9 @@ use daachorse::{
CharwiseDoubleArrayAhoCorasick, CharwiseDoubleArrayAhoCorasickBuilder,
MatchKind as DoubleArrayAhoCorasickMatchKind,
};
#[cfg(any(feature = "runtime_build", feature = "dfa"))]
use gxhash::HashMap as GxHashMap;
use gxhash::HashMapExt;
use id_set::IdSet;
use lazy_static::lazy_static;
use nohash_hasher::{IntMap, IsEnabled};
@@ -405,7 +405,7 @@ pub fn get_process_matcher(

#[cfg(feature = "runtime_build")]
{
let mut process_dict = AHashMap::default();
let mut process_dict = GxHashMap::default();

match process_type_bit {
ProcessType::None => {}
Expand Down Expand Up @@ -531,7 +531,7 @@ pub fn get_process_matcher(
ProcessType::Delete => {
#[cfg(feature = "dfa")]
{
let mut process_dict = AHashMap::default();
let mut process_dict = GxHashMap::default();
process_dict.extend(TEXT_DELETE.trim().lines().map(|pair_str| (pair_str, "")));
process_dict.extend(WHITE_SPACE.iter().map(|&c| (c, "")));
process_dict.retain(|&key, &mut value| key != value);
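
Aside (not part of the diff): the runtime_build and dfa branches above now build their replacement dictionaries in a GxHashMap instead of an AHashMap. A sketch of that pattern; DELETE_TABLE below is an assumed stand-in for the crate's bundled TEXT_DELETE / WHITE_SPACE data, which is not reproduced here.

use gxhash::{HashMap as GxHashMap, HashMapExt};

// Illustrative stand-in: a newline-separated list of strings that should be
// deleted during preprocessing.
const DELETE_TABLE: &str = "\u{00AD}\n\u{200B}";

fn build_delete_dict() -> GxHashMap<&'static str, &'static str> {
    let mut process_dict = GxHashMap::new();
    // Every entry in the delete table maps to the empty string.
    process_dict.extend(DELETE_TABLE.trim().lines().map(|pair_str| (pair_str, "")));
    // Drop no-op mappings whose key already equals its replacement.
    process_dict.retain(|&key, &mut value| key != value);
    process_dict
}
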
9 changes: 4 additions & 5 deletions matcher_rs/src/regex_matcher.rs
@@ -2,7 +2,6 @@ use std::borrow::Cow;

use fancy_regex::{escape, Regex};
use id_set::IdSet;
use nohash_hasher::IntSet;
use regex::RegexSet;
use sonic_rs::{Deserialize, Serialize};

@@ -448,7 +447,7 @@ impl<'a> TextMatcherTrait<'a, RegexResult<'a>> for RegexMatcher {
processed_text_process_type_set: &[(Cow<'a, str>, IdSet)],
) -> Vec<RegexResult<'a>> {
let mut result_list = Vec::new();
let mut table_id_index_set = IntSet::default();
let mut table_id_index_set = IdSet::default();

for (processed_text, process_type_set) in processed_text_process_type_set {
for regex_pattern_table in &self.regex_pattern_table_list {
@@ -457,7 +456,7 @@ impl<'a> TextMatcherTrait<'a, RegexResult<'a>> for RegexMatcher {
}
match &regex_pattern_table.regex_type {
RegexType::Standard { regex } => {
if table_id_index_set.insert(regex_pattern_table.table_id as u64) {
if table_id_index_set.insert(regex_pattern_table.table_id as usize) {
for caps in regex.captures_iter(processed_text).flatten() {
result_list.push(RegexResult {
match_id: regex_pattern_table.match_id,
@@ -479,7 +478,7 @@
} => {
for (index, regex) in regex_list.iter().enumerate() {
let table_id_index =
((regex_pattern_table.table_id as u64) << 32) | (index as u64);
((regex_pattern_table.table_id as usize) << 32) | (index as usize);

if table_id_index_set.insert(table_id_index) {
if let Ok(is_match) = regex.is_match(processed_text) {
@@ -501,7 +500,7 @@
} => {
for index in regex_set.matches(processed_text) {
let table_id_index =
((regex_pattern_table.table_id as u64) << 32) | (index as u64);
((regex_pattern_table.table_id as usize) << 32) | (index as usize);

if table_id_index_set.insert(table_id_index) {
result_list.push(RegexResult {
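
Aside (not part of the diff): the dedup in the regex matcher above relies on IdSet::insert returning true only for IDs not yet in the set, with (table_id, pattern index) packed into one usize key. A compact sketch of that pattern with made-up IDs, kept small so the demo stays cheap.

use id_set::IdSet;

fn demo() {
    let mut table_id_index_set = IdSet::default();
    // (table_id, pattern index) pairs; the third repeats the first.
    let pairs = [(0u32, 1usize), (0, 2), (0, 1)];
    for (table_id, index) in pairs {
        let table_id_index = ((table_id as usize) << 32) | index;
        // insert returns false for the repeated pair, so it is skipped.
        if table_id_index_set.insert(table_id_index) {
            println!("match table {table_id}, pattern {index}");
        }
    }
}
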
7 changes: 3 additions & 4 deletions matcher_rs/src/sim_matcher.rs
@@ -1,7 +1,6 @@
use std::borrow::Cow;

use id_set::IdSet;
use nohash_hasher::IntSet;
use rapidfuzz::distance;
use sonic_rs::{Deserialize, Serialize};

@@ -341,13 +340,13 @@ impl<'a> TextMatcherTrait<'a, SimResult<'a>> for SimMatcher {
/// - `similarity`: The similarity score of the match.
///
/// The function ensures that only unique matches are included in the result list by maintaining
/// an [IntSet] to track already processed table ID and word index combinations.
/// an [IdSet] to track already processed table ID and word index combinations.
fn _process_with_processed_text_process_type_set(
&'a self,
processed_text_process_type_set: &[(Cow<'a, str>, IdSet)],
) -> Vec<SimResult<'a>> {
let mut result_list = Vec::new();
let mut table_id_index_set = IntSet::default();
let mut table_id_index_set = IdSet::default();

for (processed_text, process_type_set) in processed_text_process_type_set {
for sim_processed_table in &self.sim_processed_table_list {
@@ -358,7 +357,7 @@ impl<'a> TextMatcherTrait<'a, SimResult<'a>> for SimMatcher {
SimMatchType::Levenshtein => {
for (index, text) in sim_processed_table.word_list.iter().enumerate() {
let table_id_index =
((sim_processed_table.table_id as u64) << 32) | (index as u64);
((sim_processed_table.table_id as usize) << 32) | (index as usize);

if table_id_index_set.insert(table_id_index) {
if let Some(similarity) =
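
Aside (not part of the diff): the u64-to-usize change in the packed keys above assumes a 64-bit usize; the shift by 32 only makes sense when usize has room for two 32-bit fields. A compile-time guard one could add (not present in this commit) to document that assumption:

// Not in this commit: a compile-time assertion making the 64-bit assumption
// behind `(id as usize) << 32` explicit.
const _: () = assert!(
    usize::BITS >= 64,
    "packed table_id/index keys require a 64-bit usize"
);
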
(Diff for the remaining changed file is not rendered on this page.)
