
Commit

gxhash and idset
Lips7 committed Jul 18, 2024
1 parent fba7f0b commit debe14e
Showing 7 changed files with 45 additions and 125 deletions.
100 changes: 11 additions & 89 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion matcher_rs/Cargo.toml
@@ -23,12 +23,12 @@ name = "matcher_rs"
crate-type = ["rlib"]

[dependencies]
ahash = { version = "0.8.11", features = ["serde", "compile-time-rng"] }
# aho-corasick = "1.1.3"
aho-corasick-unsafe = { version = "0.0.4", git = "https://github.com/Lips7/aho-corasick" }
bitflags = { version = "2.6.0", features = ["serde"] }
daachorse = "1.0.0"
fancy-regex = "0.13.0"
gxhash = "3.4.1"
id-set = "0.2.2"
lazy_static = "1.5.0"
nohash-hasher = "0.2.0"
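
Aside (editorial note, not part of the diff): this commit swaps ahash-based maps for gxhash and integer sets for id-set. Below is a minimal sketch of the two new crates' APIs as the commit uses them — gxhash's std-HashMap alias plus its HashMapExt constructor trait (gxhash leans on hardware AES/SSE2 or NEON intrinsics for speed), and id-set's IdSet, a set of usize IDs. The keys and values are made up for illustration.

// Sketch only; illustrative keys/values, not data from the repository.
use gxhash::{HashMap as GxHashMap, HashMapExt};
use id_set::IdSet;

fn demo() {
    // A std::collections::HashMap parameterized with the gxhash hasher;
    // HashMapExt supplies the `new`/`with_capacity` constructors for the alias.
    let mut process_dict: GxHashMap<&str, &str> = GxHashMap::new();
    process_dict.insert("source", "replacement");
    assert_eq!(process_dict.get("source"), Some(&"replacement"));

    // IdSet stores usize IDs; insert returns true only when the ID is new,
    // and contains checks membership.
    let mut seen = IdSet::default();
    assert!(seen.insert(42));
    assert!(!seen.insert(42));
    assert!(seen.contains(42));
}
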
10 changes: 5 additions & 5 deletions matcher_rs/src/matcher.rs
@@ -2,7 +2,7 @@ use std::borrow::Cow;
use std::collections::HashMap;

use id_set::IdSet;
use nohash_hasher::{IntMap, IntSet};
use nohash_hasher::IntMap;
use sonic_rs::{to_string, Deserialize, Serialize};

use crate::process::process_matcher::{
@@ -613,7 +613,7 @@ impl Matcher {
processed_text_process_type_set: &[(Cow<'a, str>, IdSet)],
) -> HashMap<u32, Vec<MatchResult>> {
let mut match_result_dict = HashMap::new();
let mut failed_match_table_id_set = IntSet::default();
let mut failed_match_table_id_set = IdSet::default();

if let Some(regex_matcher) = &self.regex_matcher {
for regex_result in regex_matcher
@@ -651,10 +651,10 @@
.get_unchecked(simple_result.word_id as usize),
)
};
let match_table_id =
((word_table_conf.match_id as u64) << 32) | (word_table_conf.table_id as u64);
let match_table_id = ((word_table_conf.match_id as usize) << 32)
| (word_table_conf.table_id as usize);

if failed_match_table_id_set.contains(&match_table_id) {
if failed_match_table_id_set.contains(match_table_id) {
continue;
}

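
Aside (not part of the diff): with IdSet keys being usize rather than u64, the combined identifier above is now built with usize arithmetic. A standalone sketch of that bit layout, assuming a 64-bit target so the upper half of usize can hold match_id; the function names are illustrative.

// Sketch of the packed match_id/table_id key used in matcher.rs above.
// Assumes usize is 64 bits wide.
fn pack(match_id: u32, table_id: u32) -> usize {
    ((match_id as usize) << 32) | (table_id as usize)
}

fn unpack(key: usize) -> (u32, u32) {
    ((key >> 32) as u32, (key & 0xFFFF_FFFF) as u32)
}

fn main() {
    let key = pack(3, 7);
    assert_eq!(unpack(key), (3, 7));
}
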
10 changes: 5 additions & 5 deletions matcher_rs/src/process/process_matcher.rs
@@ -2,9 +2,6 @@ use std::borrow::Cow;
use std::fmt::Display;
use std::sync::Arc;

#[cfg(any(feature = "runtime_build", feature = "dfa"))]
use ahash::AHashMap;
use ahash::HashMapExt;
use aho_corasick_unsafe::AhoCorasick;
#[cfg(any(feature = "runtime_build", feature = "dfa"))]
use aho_corasick_unsafe::{AhoCorasickBuilder, AhoCorasickKind, MatchKind as AhoCorasickMatchKind};
@@ -16,6 +13,9 @@ use daachorse::{
CharwiseDoubleArrayAhoCorasick, CharwiseDoubleArrayAhoCorasickBuilder,
MatchKind as DoubleArrayAhoCorasickMatchKind,
};
#[cfg(any(feature = "runtime_build", feature = "dfa"))]
use gxhash::HashMap as GxHashMap;
use gxhash::HashMapExt;
use id_set::IdSet;
use lazy_static::lazy_static;
use nohash_hasher::{IntMap, IsEnabled};
@@ -405,7 +405,7 @@ pub fn get_process_matcher(

#[cfg(feature = "runtime_build")]
{
let mut process_dict = AHashMap::default();
let mut process_dict = GxHashMap::default();

match process_type_bit {
ProcessType::None => {}
Expand Down Expand Up @@ -531,7 +531,7 @@ pub fn get_process_matcher(
ProcessType::Delete => {
#[cfg(feature = "dfa")]
{
let mut process_dict = AHashMap::default();
let mut process_dict = GxHashMap::default();
process_dict.extend(TEXT_DELETE.trim().lines().map(|pair_str| (pair_str, "")));
process_dict.extend(WHITE_SPACE.iter().map(|&c| (c, "")));
process_dict.retain(|&key, &mut value| key != value);
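
Aside (not part of the diff): the runtime_build and dfa branches above now build their replacement dictionaries in a GxHashMap instead of an AHashMap. A sketch of that pattern; DELETE_TABLE below is an assumed stand-in for the crate's bundled TEXT_DELETE / WHITE_SPACE data, which is not reproduced here.

use gxhash::{HashMap as GxHashMap, HashMapExt};

// Illustrative stand-in: a newline-separated list of strings that should be
// deleted during preprocessing.
const DELETE_TABLE: &str = "\u{00AD}\n\u{200B}";

fn build_delete_dict() -> GxHashMap<&'static str, &'static str> {
    let mut process_dict = GxHashMap::new();
    // Every entry in the delete table maps to the empty string.
    process_dict.extend(DELETE_TABLE.trim().lines().map(|pair_str| (pair_str, "")));
    // Drop no-op mappings whose key already equals its replacement.
    process_dict.retain(|&key, &mut value| key != value);
    process_dict
}
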
9 changes: 4 additions & 5 deletions matcher_rs/src/regex_matcher.rs
@@ -2,7 +2,6 @@ use std::borrow::Cow;

use fancy_regex::{escape, Regex};
use id_set::IdSet;
use nohash_hasher::IntSet;
use regex::RegexSet;
use sonic_rs::{Deserialize, Serialize};

@@ -448,7 +447,7 @@ impl<'a> TextMatcherTrait<'a, RegexResult<'a>> for RegexMatcher {
processed_text_process_type_set: &[(Cow<'a, str>, IdSet)],
) -> Vec<RegexResult<'a>> {
let mut result_list = Vec::new();
let mut table_id_index_set = IntSet::default();
let mut table_id_index_set = IdSet::default();

for (processed_text, process_type_set) in processed_text_process_type_set {
for regex_pattern_table in &self.regex_pattern_table_list {
@@ -457,7 +456,7 @@ impl<'a> TextMatcherTrait<'a, RegexResult<'a>> for RegexMatcher {
}
match &regex_pattern_table.regex_type {
RegexType::Standard { regex } => {
if table_id_index_set.insert(regex_pattern_table.table_id as u64) {
if table_id_index_set.insert(regex_pattern_table.table_id as usize) {
for caps in regex.captures_iter(processed_text).flatten() {
result_list.push(RegexResult {
match_id: regex_pattern_table.match_id,
@@ -479,7 +478,7 @@
} => {
for (index, regex) in regex_list.iter().enumerate() {
let table_id_index =
((regex_pattern_table.table_id as u64) << 32) | (index as u64);
((regex_pattern_table.table_id as usize) << 32) | (index as usize);

if table_id_index_set.insert(table_id_index) {
if let Ok(is_match) = regex.is_match(processed_text) {
@@ -501,7 +500,7 @@
} => {
for index in regex_set.matches(processed_text) {
let table_id_index =
((regex_pattern_table.table_id as u64) << 32) | (index as u64);
((regex_pattern_table.table_id as usize) << 32) | (index as usize);

if table_id_index_set.insert(table_id_index) {
result_list.push(RegexResult {
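
Aside (not part of the diff): the dedup in the regex matcher above relies on IdSet::insert returning true only for IDs not yet in the set, with (table_id, pattern index) packed into one usize key. A compact sketch of that pattern with made-up IDs, kept small so the demo stays cheap.

use id_set::IdSet;

fn demo() {
    let mut table_id_index_set = IdSet::default();
    // (table_id, pattern index) pairs; the third repeats the first.
    let pairs = [(0u32, 1usize), (0, 2), (0, 1)];
    for (table_id, index) in pairs {
        let table_id_index = ((table_id as usize) << 32) | index;
        // insert returns false for the repeated pair, so it is skipped.
        if table_id_index_set.insert(table_id_index) {
            println!("match table {table_id}, pattern {index}");
        }
    }
}
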
7 changes: 3 additions & 4 deletions matcher_rs/src/sim_matcher.rs
@@ -1,7 +1,6 @@
use std::borrow::Cow;

use id_set::IdSet;
use nohash_hasher::IntSet;
use rapidfuzz::distance;
use sonic_rs::{Deserialize, Serialize};

@@ -341,13 +340,13 @@ impl<'a> TextMatcherTrait<'a, SimResult<'a>> for SimMatcher {
/// - `similarity`: The similarity score of the match.
///
/// The function ensures that only unique matches are included in the result list by maintaining
/// an [IntSet] to track already processed table ID and word index combinations.
/// an [IdSet] to track already processed table ID and word index combinations.
fn _process_with_processed_text_process_type_set(
&'a self,
processed_text_process_type_set: &[(Cow<'a, str>, IdSet)],
) -> Vec<SimResult<'a>> {
let mut result_list = Vec::new();
let mut table_id_index_set = IntSet::default();
let mut table_id_index_set = IdSet::default();

for (processed_text, process_type_set) in processed_text_process_type_set {
for sim_processed_table in &self.sim_processed_table_list {
@@ -358,7 +357,7 @@ impl<'a> TextMatcherTrait<'a, SimResult<'a>> for SimMatcher {
SimMatchType::Levenshtein => {
for (index, text) in sim_processed_table.word_list.iter().enumerate() {
let table_id_index =
((sim_processed_table.table_id as u64) << 32) | (index as u64);
((sim_processed_table.table_id as usize) << 32) | (index as usize);

if table_id_index_set.insert(table_id_index) {
if let Some(similarity) =
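
Aside (not part of the diff): the u64-to-usize change in the packed keys above assumes a 64-bit usize; the shift by 32 only makes sense when usize has room for two 32-bit fields. A compile-time guard one could add (not present in this commit) to document that assumption:

// Not in this commit: a compile-time assertion making the 64-bit assumption
// behind `(id as usize) << 32` explicit.
const _: () = assert!(
    usize::BITS >= 64,
    "packed table_id/index keys require a 64-bit usize"
);
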
(Diff for the remaining changed file is not rendered on this page.)
