Skip to content

Commit

Permalink
Use SIMD to accelerate simple process
Browse files Browse the repository at this point in the history
  • Loading branch information
Lips7 committed Jun 7, 2024
1 parent 96d5c86 commit e09dea1
Show file tree
Hide file tree
Showing 13 changed files with 95 additions and 193 deletions.
2 changes: 1 addition & 1 deletion .cargo/config.toml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[build]
rustflags = ["-C", "target-cpu=native"]
rustflags = ["-C", "target-cpu=native", "-Z", "threads=8"]
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 0 additions & 3 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@

cargo clean
cargo build --release --target=aarch64-apple-darwin
[ -e ./matcher_py/matcher_py/matcher_py.so ] && rm ./matcher_py/matcher_py/matcher_py.so
cp ./target/aarch64-apple-darwin/release/libmatcher_py.dylib ./matcher_py/matcher_py/matcher_py.so
[ -e ./matcher_c/matcher_c.so ] && rm ./matcher_c/matcher_c.so
cp ./target/aarch64-apple-darwin/release/libmatcher_c.dylib ./matcher_c/matcher_c.so
[ -e ./matcher_java/src/main/resources/matcher_c.so ] && rm ./matcher_java/src/main/resources/matcher_c.so
cp ./target/aarch64-apple-darwin/release/libmatcher_c.dylib ./matcher_java/src/main/resources/matcher_c.so
Binary file modified matcher_c/matcher_c.so
Binary file not shown.
Binary file modified matcher_java/src/main/resources/matcher_c.so
Binary file not shown.
Binary file modified matcher_py/matcher_py/matcher_py.so
Binary file not shown.
176 changes: 25 additions & 151 deletions matcher_py/matcher_py/test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -47,123 +47,55 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"matcher.is_match(r\"\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'test': '[{\"table_id\":1,\"word\":\"你好\"}]'}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"matcher.word_match(r\"你,好\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'{\"test\":\"[{\\\\\"table_id\\\\\":1,\\\\\"word\\\\\":\\\\\"你好\\\\\"}]\"}'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"matcher.word_match_as_string(\"你好\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['{\"test\":\"[{\\\\\"table_id\\\\\":1,\\\\\"word\\\\\":\\\\\"你好\\\\\"}]\"}',\n",
" '{\"test\":\"[{\\\\\"table_id\\\\\":1,\\\\\"word\\\\\":\\\\\"你好\\\\\"}]\"}',\n",
" '{}']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"matcher.batch_word_match_as_string([\"你好\", \"你好\", \"你真棒\"])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['{}', '{}', '{}', ..., '{}', '{}', '{}'], dtype=object)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"text_array = np.array([\"房东巴萨风景嘎哈快睡吧ndsac\"] * 10000, dtype=np.dtype(\"object\"))\n",
"matcher.numpy_word_match_as_string(text_array)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['{}', '{}', '{}', ..., '{}', '{}', '{}'], dtype=object)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"text_array = np.array([\"房东巴萨风景嘎哈快睡吧ndsac\"] * 10000, dtype=np.dtype(\"object\"))\n",
"matcher.numpy_word_match_as_string(text_array, inplace=True)\n",
Expand All @@ -172,7 +104,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -184,6 +116,7 @@
" SimpleWord(2, \"xxx\"),\n",
" SimpleWord(3, \"你好\"),\n",
" SimpleWord(6, r\"It's /\\/\\y duty\"),\n",
" SimpleWord(4, \"xxx,yyy\")\n",
" ],\n",
" SimpleMatchType.MatchFanjian: [SimpleWord(4, \"xxx,yyy\")],\n",
" SimpleMatchType.MatchNone: [SimpleWord(5, \"xxxxx,xxxxyyyyxxxxx\")],\n",
Expand All @@ -194,105 +127,46 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"simple_matcher.is_match(\"xxx\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'word_id': 6, 'word': \"It's /\\\\/\\\\y duty\"}]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"simple_matcher.simple_process(r\"It's /\\/\\y duty\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[[{'word_id': 6, 'word': \"It's /\\\\/\\\\y duty\"}],\n",
" [{'word_id': 3, 'word': '你好'}],\n",
" [{'word_id': 2, 'word': 'xxx'}]]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"simple_matcher.batch_simple_process([r\"It's /\\/\\y duty\", \"你好\", \"xxxxxxx\"])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([list([]), list([]), list([]), ..., list([]), list([]), list([])],\n",
" dtype=object)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"text_array = np.array([\"房东巴萨风景嘎哈快睡吧ndsac\"] * 10000, dtype=np.dtype(\"object\"))\n",
"simple_matcher.numpy_simple_process(text_array)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([list([]), list([]), list([]), ..., list([]), list([]), list([])],\n",
" dtype=object)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"text_array = np.array([\"房东巴萨风景嘎哈快睡吧ndsac\"] * 10000, dtype=np.dtype(\"object\"))\n",
"simple_matcher.numpy_simple_process(text_array, inplace=True)\n",
Expand Down
5 changes: 4 additions & 1 deletion matcher_rs/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Matcher Rust Implementation
## Usage
Many usages u can find in [test.rs](./tests/test.rs).
Many usage examples can be found in [test.rs](./tests/test.rs).

## Limitations
Matchers can only handle words that contain no more than 32 combined words and no more than 8 repeated words.
2 changes: 1 addition & 1 deletion matcher_rs/benches/bench.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use gxhash::HashMap as GxHashMap;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use gxhash::HashMap as GxHashMap;
use zerovec::VarZeroVec;

use matcher_rs::*;
Expand Down
2 changes: 2 additions & 0 deletions matcher_rs/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#![allow(internal_features)]
#![feature(core_intrinsics)]
#![feature(portable_simd)]
#![feature(iter_repeat_n)]

#[global_allocator]
static GLOBAL: mimalloc_rust::GlobalMiMalloc = mimalloc_rust::GlobalMiMalloc;
Expand Down
7 changes: 4 additions & 3 deletions matcher_rs/src/matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ use std::borrow::Cow;
use std::collections::HashMap;
use std::rc::Rc;

use gxhash::HashMap as GxHashMap;
use bitflags::bitflags;
use serde::{Serializer, Deserializer};
use gxhash::HashMap as GxHashMap;
use serde::{Deserializer, Serializer};
use sonic_rs::{to_string, Deserialize, Serialize};
use zerovec::VarZeroVec;

Expand Down Expand Up @@ -108,7 +108,8 @@ impl Matcher {
let mut word_id: u64 = 0;
let mut word_table_list: Vec<Rc<WordTableConf>> = Vec::new();

let mut simple_wordlist_dict: GxHashMap<SimpleMatchType, Vec<SimpleWord>> = GxHashMap::default();
let mut simple_wordlist_dict: GxHashMap<SimpleMatchType, Vec<SimpleWord>> =
GxHashMap::default();

let mut regex_table_list: Vec<RegexTable> = Vec::new();
let mut sim_table_list: Vec<SimTable> = Vec::new();
Expand Down
Loading

0 comments on commit e09dea1

Please sign in to comment.