From 47eee45b098d6ce95eb2a627904bbbecd97eb3e2 Mon Sep 17 00:00:00 2001 From: LippsApple Date: Thu, 18 Jul 2024 11:52:51 +0800 Subject: [PATCH] breaking changes --- Cargo.lock | 110 +- DESIGN.md | 50 +- README.md | 124 +- ci/build.sh | 1 - ci/test.sh | 11 +- matcher_c/README.md | 38 +- matcher_c/extension_types.py | 152 +- matcher_c/matcher_c.h | 8 +- matcher_c/src/lib.rs | 522 +------ matcher_java/README.md | 65 +- .../java/com/matcher_java/MatcherJava.java | 8 +- .../com/matcher_java/MatcherJavaExample.java | 27 +- matcher_py/Cargo.toml | 5 +- matcher_py/README.md | 80 +- matcher_py/pyproject.toml | 8 +- .../python/matcher_py/extension_types.py | 152 +- matcher_py/python/matcher_py/matcher_py.pyi | 134 +- matcher_py/requirements-dev.lock | 6 +- matcher_py/requirements.lock | 4 - matcher_py/src/lib.rs | 1300 +---------------- matcher_py/test/test_matcher.py | 85 +- matcher_py/test/test_simple_matcher.py | 48 +- matcher_rs/Cargo.toml | 6 +- matcher_rs/README.md | 236 ++- matcher_rs/benches/bench.rs | 206 +-- matcher_rs/benches/bench_test.rs | 20 +- matcher_rs/build.rs | 111 +- .../{str_conv => process_map}/FANJIAN.txt | 0 matcher_rs/{str_conv => process_map}/NORM.txt | 0 .../{str_conv => process_map}/NUM-NORM.txt | 0 .../{str_conv => process_map}/PINYIN.txt | 0 .../{str_conv => process_map}/TEXT-DELETE.txt | 0 matcher_rs/src/lib.rs | 11 +- matcher_rs/src/matcher.rs | 620 ++++---- matcher_rs/src/process/constants.rs | 120 +- matcher_rs/src/process/process_matcher.rs | 868 +++-------- matcher_rs/src/regex_matcher.rs | 476 ++---- matcher_rs/src/sim_matcher.rs | 523 ++----- matcher_rs/src/simple_matcher.rs | 406 +---- matcher_rs/tests/test.rs | 109 +- 40 files changed, 1631 insertions(+), 5019 deletions(-) rename matcher_rs/{str_conv => process_map}/FANJIAN.txt (100%) rename matcher_rs/{str_conv => process_map}/NORM.txt (100%) rename matcher_rs/{str_conv => process_map}/NUM-NORM.txt (100%) rename matcher_rs/{str_conv => process_map}/PINYIN.txt (100%) rename 
matcher_rs/{str_conv => process_map}/TEXT-DELETE.txt (100%) diff --git a/Cargo.lock b/Cargo.lock index 0a14469..fb6c73e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -29,7 +29,7 @@ dependencies = [ [[package]] name = "aho-corasick-unsafe" version = "0.0.4" -source = "git+https://github.com/Lips7/aho-corasick#408fb70204b00a5d6bcbea2f012a96e9e3fb43d2" +source = "git+https://github.com/Lips7/aho-corasick#66356d8b00f779c4e2b458b6857efb6aa0438a2e" dependencies = [ "memchr", "serde", @@ -247,9 +247,9 @@ dependencies = [ [[package]] name = "heck" -version = "0.4.1" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "id-set" @@ -321,7 +321,6 @@ name = "matcher_py" version = "0.4.6" dependencies = [ "matcher_rs", - "numpy", "pyo3", "pyo3-build-config", "rmp-serde", @@ -351,16 +350,6 @@ dependencies = [ "tinyvec", ] -[[package]] -name = "matrixmultiply" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7574c1cf36da4798ab73da5b215bbf444f50718207754cb522201d78d1cd0ff2" -dependencies = [ - "autocfg", - "rawpointer", -] - [[package]] name = "memchr" version = "2.7.4" @@ -385,43 +374,12 @@ dependencies = [ "libmimalloc-sys", ] -[[package]] -name = "ndarray" -version = "0.15.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" -dependencies = [ - "matrixmultiply", - "num-complex", - "num-integer", - "num-traits", - "rawpointer", -] - [[package]] name = "nohash-hasher" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" -[[package]] -name = "num-complex" -version = "0.4.6" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-integer" -version = "0.1.46" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" -dependencies = [ - "num-traits", -] - [[package]] name = "num-traits" version = "0.2.19" @@ -431,21 +389,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "numpy" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec170733ca37175f5d75a5bea5911d6ff45d2cd52849ce98b685394e4f2f37f4" -dependencies = [ - "libc", - "ndarray", - "num-complex", - "num-integer", - "num-traits", - "pyo3", - "rustc-hash", -] - [[package]] name = "once_cell" version = "1.19.0" @@ -508,28 +451,27 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.21.2" +version = "0.22.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5e00b96a521718e08e03b1a622f01c8a8deb50719335de3f60b3b3950f069d8" +checksum = "831e8e819a138c36e212f3af3fd9eeffed6bf1510a805af35b0edee5ffa59433" dependencies = [ "cfg-if", "indoc", "libc", "memoffset", - "parking_lot", + "once_cell", "portable-atomic", "pyo3-build-config", "pyo3-ffi", "pyo3-macros", - "serde", "unindent", ] [[package]] name = "pyo3-build-config" -version = "0.21.2" +version = "0.22.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7883df5835fafdad87c0d888b266c8ec0f4c9ca48a5bed6bbb592e8dedee1b50" +checksum = "1e8730e591b14492a8945cdff32f089250b05f5accecf74aeddf9e8272ce1fa8" dependencies = [ "once_cell", "target-lexicon", @@ -537,9 +479,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.21.2" +version = "0.22.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01be5843dc60b916ab4dad1dca6d20b9b4e6ddc8e15f50c47fe6d85f1fb97403" 
+checksum = "5e97e919d2df92eb88ca80a037969f44e5e70356559654962cbb3316d00300c6" dependencies = [ "libc", "pyo3-build-config", @@ -547,9 +489,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.21.2" +version = "0.22.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77b34069fc0682e11b31dbd10321cbf94808394c56fd996796ce45217dfac53c" +checksum = "eb57983022ad41f9e683a599f2fd13c3664d7063a3ac5714cae4b7bee7d3f206" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -559,9 +501,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.21.2" +version = "0.22.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08260721f32db5e1a5beae69a55553f56b99bd0e1c3e6e0a5e8851a9d0f5a85c" +checksum = "ec480c0c51ddec81019531705acac51bcdbeae563557c982aa8263bb96880372" dependencies = [ "heck", "proc-macro2", @@ -585,17 +527,11 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "270e04e5ea61d40841942bb15e451c29ee1618637bcf97fc7ede5dd4a9b1601b" -[[package]] -name = "rawpointer" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" - [[package]] name = "redox_syscall" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" +checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4" dependencies = [ "bitflags", ] @@ -657,12 +593,6 @@ dependencies = [ "serde", ] -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - [[package]] name = "rustix" version = "0.38.34" @@ -770,18 +700,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.62" +version 
= "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2675633b1499176c2dff06b0856a27976a8f9d436737b4cf4f312d4d91d8bbb" +checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.62" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d20468752b09f49e909e55a5d338caa8bedf615594e9d80bc4c565d30faf798c" +checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", diff --git a/DESIGN.md b/DESIGN.md index 6752167..a2a22d9 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -2,12 +2,12 @@ ## Transformation -* `FANJIAN`: build from [Unihan_Variants.txt](./data/str_conv/Unihan_Variants.txt) and [EquivalentUnifiedIdeograph.txt](./data/str_conv/EquivalentUnifiedIdeograph.txt). -* `NUM-NORM`: build from [DerivedNumericValues.txt](./data/str_conv/DerivedNumericValues.txt). -* `TEXT-DELETE` and `SYMBOL-NORM`: build from [DerivedGeneralCategory.txt](./data/str_conv/DerivedGeneralCategory.txt). -* `WHITE-SPACE`: build from [PropList.txt](./data/str_conv/PropList.txt). -* `PINYIN` and `PINYIN-CHAR`: build from [Unihan_Readings.txt](./data/str_conv/Unihan_Readings.txt). -* `NORM`: build from [NormalizationTest.txt](./data/str_conv/NormalizationTest.txt). +* `FANJIAN`: build from [Unihan_Variants.txt](./data/process_map/Unihan_Variants.txt) and [EquivalentUnifiedIdeograph.txt](./data/process_map/EquivalentUnifiedIdeograph.txt). +* `NUM-NORM`: build from [DerivedNumericValues.txt](./data/process_map/DerivedNumericValues.txt). +* `TEXT-DELETE` and `SYMBOL-NORM`: build from [DerivedGeneralCategory.txt](./data/process_map/DerivedGeneralCategory.txt). +* `WHITE-SPACE`: build from [PropList.txt](./data/process_map/PropList.txt). +* `PINYIN` and `PINYIN-CHAR`: build from [Unihan_Readings.txt](./data/process_map/Unihan_Readings.txt). 
+* `NORM`: build from [NormalizationTest.txt](./data/process_map/NormalizationTest.txt). ## Matcher @@ -29,9 +29,9 @@ The `Matcher` utilizes a JSON structure to define matches and tables. Below is a "777": [ { "table_id": 45, - "match_table_type": {"simple_match_type": "MatchNone"}, + "match_table_type": {"process_type": "MatchNone"}, "word_list": ["hello", "world"], - "exemption_simple_match_type": "MatchNone", + "exemption_process_type": "MatchNone", "exemption_word_list": [] } // other tables @@ -65,18 +65,18 @@ Input: "1": [ { "table_id": 1, - "match_table_type": {"simple_match_type": "MatchNone"}, + "match_table_type": {"process_type": "MatchNone"}, "word_list": ["hello", "world"], - "exemption_simple_match_type": "MatchNone", + "exemption_process_type": "MatchNone", "exemption_word_list": [] } ], "2": [ { "table_id": 2, - "match_table_type": {"simple_match_type": "MatchNone"}, + "match_table_type": {"process_type": "MatchNone"}, "word_list": ["你", "好"], - "exemption_simple_match_type": "MatchNone", + "exemption_process_type": "MatchNone", "exemption_word_list": [] } ], @@ -92,16 +92,16 @@ Input: "1": [ { "table_id": 1, - "match_table_type": {"simple_match_type": "MatchNone"}, + "match_table_type": {"process_type": "MatchNone"}, "word_list": ["hello", "world"], - "exemption_simple_match_type": "MatchNone", + "exemption_process_type": "MatchNone", "exemption_word_list": [] }, { "table_id": 2, - "match_table_type": {"simple_match_type": "MatchNone"}, + "match_table_type": {"process_type": "MatchNone"}, "word_list": ["你", "好"], - "exemption_simple_match_type": "MatchNone", + "exemption_process_type": "MatchNone", "exemption_word_list": [] } ] @@ -117,18 +117,18 @@ Input: "1": [ { "table_id": 1, - "match_table_type": {"simple_match_type": "MatchNone"}, + "match_table_type": {"process_type": "MatchNone"}, "word_list": ["hello", "world"], - "exemption_simple_match_type": "MatchNone", + "exemption_process_type": "MatchNone", "exemption_word_list": [] } ], "2": [ { 
"table_id": 2, - "match_table_type": {"simple_match_type": "MatchNone"}, + "match_table_type": {"process_type": "MatchNone"}, "word_list": ["你", "好"], - "exemption_simple_match_type": "MatchNone", + "exemption_process_type": "MatchNone", "exemption_word_list": [] } ], @@ -153,7 +153,7 @@ The `SimpleMatcher` uses a mapping structure to define words and their IDs based ```json { - "SimpleMatchType.None": { + "ProcessType.None": { "1": "hello&world", "2": "你好" // other words @@ -170,7 +170,7 @@ In real-world scenarios, `word_id` is used to uniquely identify a word in the da ### Logical Operations -- **OR Logic (between different `simple_match_type` and words in the same `simple_match_type`)**: The `simple_matcher` is considered matched if any word in the map is matched. +- **OR Logic (between different `process_type` and words in the same `process_type`)**: The `simple_matcher` is considered matched if any word in the map is matched. - **AND Logic (between words separated by `&` within a `WordID`)**: All words separated by `&` must be matched for the word to be considered as matched. - **NOT Logic (between words separated by `~` within a `WordID`)**: All words separated by `~` must not be matched for the word to be considered as matched. @@ -180,7 +180,7 @@ In real-world scenarios, `word_id` is used to uniquely identify a word in the da ```json Input: { - "SimpleMatchType.None": { + "ProcessType.None": { "1": "word1&word2" } } @@ -192,7 +192,7 @@ Output: Check if `word_id` 1 is matched. ```json Input: { - "SimpleMatchType.None": { + "ProcessType.None": { "1": "word1", "2": "word2" } @@ -205,7 +205,7 @@ Output: Check if `word_id` 1 or 2 is matched. 
```json Input: { - "SimpleMatchType.None": { + "ProcessType.None": { "1": "word1~word2", } } diff --git a/README.md b/README.md index 4109c36..b1b9037 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,7 @@ ![PyPI - Version](https://img.shields.io/pypi/v/matcher_py)![PyPI - Python Version](https://img.shields.io/pypi/pyversions/matcher_py)![PyPI - Downloads](https://img.shields.io/pypi/dm/matcher_py) -A high-performance matcher for massive amounts of sensitive words. - -Designed to solve **AND OR NOT** and **TEXT VARIATIONS** problems in word/word_list matching. For detailed implementation, see the [Design Document](./DESIGN.md). +A high-performance matcher designed to solve **AND OR NOT** and **TEXT VARIATIONS** problems in word/word_list matching. It's helpful for - **Content Filtering**: Detecting and filtering out offensive or sensitive words. @@ -21,11 +19,13 @@ It's helpful for ## Features +For detailed implementation, see the [Design Document](./DESIGN.md). + - **Multiple Matching Methods**: - Simple Word Matching - Regex-Based Matching - Similarity-Based Matching -- **Text Normalization**: +- **Text Transformation**: - **Fanjian**: Simplify traditional Chinese characters to simplified ones. Example: `蟲艸` -> `虫草` - **Delete**: Remove specific characters. @@ -46,7 +46,7 @@ It's helpful for ## Usage -Non-Rust users must use **msgpack** for serializing matcher configurations to bytes. **Why msgpack?** It handles backslashes better and is faster than JSON. +Non-Rust users must use **msgpack** for serializing matcher configurations to bytes. **Msgpack** handles backslashes better and is faster than JSON. - Example issue with JSON: `It's /\/\y duty` is processed incorrectly. 
### Rust Users @@ -77,116 +77,12 @@ Visit the [release page](https://github.com/Lips7/Matcher/releases) to download ## Benchmarks -Bench against pairs ([CN_WORD_LIST_100000](./data/word_list/cn/cn_words_100000.txt), [CN_HAYSTACK](./data/text/cn/西游记.txt)) and ([EN_WORD_LIST_100000](./data/word_list/en/en_words_100000.txt), [EN_HAYSTACK](./data/text/en/sherlock.txt)). Word selection is totally random. - -The `matcher_rs` library includes benchmarks to measure the performance of the matcher. You can find the benchmarks in the [bench.rs](./benches/bench.rs) file. To run the benchmarks, use the following command: - -```shell -cargo bench -``` - -``` -Current default simple match type: SimpleMatchType(None) -Current default simple word map size: 1000 -Current default combined times: 2 -Timer precision: 41 ns -bench fastest │ slowest │ median │ mean │ samples │ iters -├─ build_cn │ │ │ │ │ -│ ├─ build_cn_by_combined_times │ │ │ │ │ -│ │ ├─ 1 2.468 ms │ 3.355 ms │ 2.506 ms │ 2.536 ms │ 100 │ 100 -│ │ ├─ 2 5.303 ms │ 5.765 ms │ 5.402 ms │ 5.41 ms │ 100 │ 100 -│ │ ├─ 3 7.912 ms │ 10.16 ms │ 7.986 ms │ 8.081 ms │ 100 │ 100 -│ │ ├─ 4 10.59 ms │ 11.31 ms │ 10.73 ms │ 10.75 ms │ 100 │ 100 -│ │ ╰─ 5 13.03 ms │ 14.1 ms │ 13.13 ms │ 13.21 ms │ 100 │ 100 -│ ├─ build_cn_by_multiple_simple_match_type 26.63 ms │ 40.81 ms │ 26.99 ms │ 27.23 ms │ 100 │ 100 -│ ├─ build_cn_by_simple_match_type │ │ │ │ │ -│ │ ├─ "fanjian" 5.296 ms │ 6.12 ms │ 5.348 ms │ 5.398 ms │ 100 │ 100 -│ │ ├─ "fanjian_worddelete_textdelete_normalize" 5.43 ms │ 5.937 ms │ 5.47 ms │ 5.491 ms │ 100 │ 100 -│ │ ├─ "none" 5.268 ms │ 5.667 ms │ 5.375 ms │ 5.379 ms │ 100 │ 100 -│ │ ├─ "normalize" 5.373 ms │ 5.827 ms │ 5.423 ms │ 5.437 ms │ 100 │ 100 -│ │ ├─ "pinyin" 16.02 ms │ 24.52 ms │ 16.15 ms │ 16.34 ms │ 100 │ 100 -│ │ ├─ "pinyinchar" 15.81 ms │ 41.81 ms │ 16.29 ms │ 16.99 ms │ 100 │ 100 -│ │ ├─ "worddelete_textdelete" 5.291 ms │ 6.192 ms │ 5.409 ms │ 5.556 ms │ 100 │ 100 -│ │ ╰─ "worddelete_textdelete_normalize" 5.38 ms 
│ 6.311 ms │ 5.897 ms │ 5.866 ms │ 100 │ 100 -│ ╰─ build_cn_by_simple_word_map_size │ │ │ │ │ -│ ├─ 100 501.2 µs │ 838.9 µs │ 545.2 µs │ 559.5 µs │ 100 │ 100 -│ ├─ 1000 5.383 ms │ 18.63 ms │ 5.669 ms │ 5.88 ms │ 100 │ 100 -│ ├─ 10000 49.97 ms │ 99.73 ms │ 53.03 ms │ 54.13 ms │ 93 │ 93 -│ ╰─ 50000 194.1 ms │ 366.2 ms │ 204.9 ms │ 212.6 ms │ 24 │ 24 -├─ build_en │ │ │ │ │ -│ ├─ build_en_by_combined_times │ │ │ │ │ -│ │ ├─ 1 5.43 ms │ 6.427 ms │ 5.84 ms │ 5.907 ms │ 100 │ 100 -│ │ ├─ 2 12.9 ms │ 21.5 ms │ 13.6 ms │ 13.83 ms │ 100 │ 100 -│ │ ├─ 3 21.99 ms │ 24.19 ms │ 22.89 ms │ 22.8 ms │ 100 │ 100 -│ │ ├─ 4 29.3 ms │ 50.2 ms │ 30.84 ms │ 31.27 ms │ 100 │ 100 -│ │ ╰─ 5 38.12 ms │ 40.88 ms │ 38.44 ms │ 38.58 ms │ 100 │ 100 -│ ├─ build_en_by_multiple_simple_match_type 16.43 ms │ 19 ms │ 16.79 ms │ 16.95 ms │ 100 │ 100 -│ ├─ build_en_by_simple_match_type │ │ │ │ │ -│ │ ├─ "none" 13.97 ms │ 15.1 ms │ 14.56 ms │ 14.58 ms │ 100 │ 100 -│ │ ├─ "normalize" 12.35 ms │ 17.97 ms │ 13.05 ms │ 13.13 ms │ 100 │ 100 -│ │ ├─ "worddelete_textdelete" 13.5 ms │ 14.87 ms │ 13.96 ms │ 13.97 ms │ 100 │ 100 -│ │ ╰─ "worddelete_textdelete_normalize" 11.83 ms │ 13.31 ms │ 12.46 ms │ 12.54 ms │ 100 │ 100 -│ ╰─ build_en_by_simple_word_map_size │ │ │ │ │ -│ ├─ 100 848.1 µs │ 1.286 ms │ 925.4 µs │ 929 µs │ 100 │ 100 -│ ├─ 1000 12.57 ms │ 16.46 ms │ 13.38 ms │ 13.38 ms │ 100 │ 100 -│ ├─ 10000 178.1 ms │ 192.3 ms │ 182.2 ms │ 183.7 ms │ 28 │ 28 -│ ╰─ 50000 743.3 ms │ 884.1 ms │ 752.2 ms │ 776.2 ms │ 7 │ 7 -├─ search_cn │ │ │ │ │ -│ ├─ search_cn_baseline │ │ │ │ │ -│ │ ├─ 100 2.907 ms │ 11.87 ms │ 3.068 ms │ 3.359 ms │ 100 │ 100 -│ │ ├─ 1000 2.99 ms │ 3.422 ms │ 3.006 ms │ 3.033 ms │ 100 │ 100 -│ │ ├─ 10000 5.197 ms │ 5.801 ms │ 5.269 ms │ 5.294 ms │ 100 │ 100 -│ │ ╰─ 50000 12.44 ms │ 16.52 ms │ 14.2 ms │ 13.89 ms │ 100 │ 100 -│ ├─ search_cn_by_combined_times │ │ │ │ │ -│ │ ├─ 1 3.702 ms │ 4.091 ms │ 3.728 ms │ 3.749 ms │ 100 │ 100 -│ │ ├─ 2 4.442 ms │ 4.826 ms │ 4.458 ms │ 4.467 ms │ 100 │ 100 -│ │ 
├─ 3 5.054 ms │ 5.595 ms │ 5.078 ms │ 5.093 ms │ 100 │ 100 -│ │ ├─ 4 6.136 ms │ 6.777 ms │ 6.159 ms │ 6.177 ms │ 100 │ 100 -│ │ ╰─ 5 6.235 ms │ 11.38 ms │ 6.396 ms │ 6.51 ms │ 100 │ 100 -│ ├─ search_cn_by_multiple_simple_match_type 64.81 ms │ 80.83 ms │ 66.49 ms │ 66.75 ms │ 100 │ 100 -│ ├─ search_cn_by_simple_match_type │ │ │ │ │ -│ │ ├─ "fanjian" 6.781 ms │ 7.486 ms │ 6.841 ms │ 6.927 ms │ 100 │ 100 -│ │ ├─ "fanjian_worddelete_textdelete_normalize" 21.47 ms │ 45.61 ms │ 21.82 ms │ 22.33 ms │ 100 │ 100 -│ │ ├─ "none" 4.684 ms │ 5.198 ms │ 4.705 ms │ 4.731 ms │ 100 │ 100 -│ │ ├─ "normalize" 14.62 ms │ 15.81 ms │ 15.5 ms │ 15.28 ms │ 100 │ 100 -│ │ ├─ "pinyin" 57.98 ms │ 63.66 ms │ 60.31 ms │ 59.92 ms │ 84 │ 84 -│ │ ├─ "pinyinchar" 63.8 ms │ 74.02 ms │ 65.47 ms │ 66.22 ms │ 76 │ 76 -│ │ ├─ "worddelete_textdelete" 13.2 ms │ 14.62 ms │ 13.43 ms │ 13.65 ms │ 100 │ 100 -│ │ ╰─ "worddelete_textdelete_normalize" 18.97 ms │ 21.06 ms │ 19.73 ms │ 19.83 ms │ 100 │ 100 -│ ╰─ search_cn_by_simple_word_map_size │ │ │ │ │ -│ ├─ 100 3.031 ms │ 3.491 ms │ 3.082 ms │ 3.104 ms │ 100 │ 100 -│ ├─ 1000 4.793 ms │ 5.205 ms │ 4.997 ms │ 5.001 ms │ 100 │ 100 -│ ├─ 10000 10.12 ms │ 12.74 ms │ 10.7 ms │ 10.66 ms │ 100 │ 100 -│ ╰─ 50000 21.12 ms │ 27.96 ms │ 21.77 ms │ 23.13 ms │ 100 │ 100 -╰─ search_en │ │ │ │ │ - ├─ search_en_baseline │ │ │ │ │ - │ ├─ 100 328.3 µs │ 1.576 ms │ 343.1 µs │ 364.5 µs │ 100 │ 100 - │ ├─ 1000 343.6 µs │ 472.4 µs │ 369.9 µs │ 369.1 µs │ 100 │ 100 - │ ├─ 10000 1.169 ms │ 1.248 ms │ 1.197 ms │ 1.199 ms │ 100 │ 100 - │ ╰─ 50000 1.193 ms │ 1.304 ms │ 1.199 ms │ 1.205 ms │ 100 │ 100 - ├─ search_en_by_combined_times │ │ │ │ │ - │ ├─ 1 1.682 ms │ 4.053 ms │ 1.692 ms │ 1.727 ms │ 100 │ 100 - │ ├─ 2 2.481 ms │ 2.682 ms │ 2.502 ms │ 2.506 ms │ 100 │ 100 - │ ├─ 3 2.585 ms │ 2.979 ms │ 2.678 ms │ 2.69 ms │ 100 │ 100 - │ ├─ 4 2.654 ms │ 3.265 ms │ 2.761 ms │ 2.764 ms │ 100 │ 100 - │ ╰─ 5 2.74 ms │ 3.242 ms │ 2.752 ms │ 2.761 ms │ 100 │ 100 - ├─ 
search_en_by_multiple_simple_match_type 9.173 ms │ 10.27 ms │ 9.351 ms │ 9.481 ms │ 100 │ 100 - ├─ search_en_by_simple_match_type │ │ │ │ │ - │ ├─ "none" 1.99 ms │ 2.286 ms │ 2.006 ms │ 2.049 ms │ 100 │ 100 - │ ├─ "normalize" 3.992 ms │ 4.064 ms │ 4.009 ms │ 4.012 ms │ 100 │ 100 - │ ├─ "worddelete_textdelete" 6.198 ms │ 7.005 ms │ 6.225 ms │ 6.253 ms │ 100 │ 100 - │ ╰─ "worddelete_textdelete_normalize" 10.51 ms │ 32.63 ms │ 11.1 ms │ 11.41 ms │ 100 │ 100 - ╰─ search_en_by_simple_word_map_size │ │ │ │ │ - ├─ 100 1.384 ms │ 1.616 ms │ 1.458 ms │ 1.471 ms │ 100 │ 100 - ├─ 1000 2.395 ms │ 2.587 ms │ 2.427 ms │ 2.432 ms │ 100 │ 100 - ├─ 10000 3.091 ms │ 4.291 ms │ 3.113 ms │ 3.127 ms │ 100 │ 100 - ╰─ 50000 3.668 ms │ 5.738 ms │ 3.831 ms │ 3.853 ms │ 100 │ 100 -``` +Please refer to [benchmarks](./matcher_rs/README.md#benchmarks) for details. ## Roadmap ### Performance -- [x] ~~Cache middle results during different SimpleMatchType reduce_process_text function calling. (failed, too slow)~~ +- [x] ~~Cache middle results during different ProcessType reduce_process_text function calling. (failed, too slow)~~ - [x] Try more aho-corasick library to improve performance and reduce memory usage. - [x] ~~https://github.com/daac-tools/crawdad (produce char-wise index, not byte-wise index, it's not acceptable)~~ - [x] https://github.com/daac-tools/daachorse (use it when Fanjian, PinYin or PinYinChar transformation is performed) @@ -195,8 +91,8 @@ bench fastest │ slowest - [x] See https://github.com/Lips7/aho-corasick. - [ ] Optimize NOT logic word-wise. - [x] Optimize `RegexMatcher` using `RegexSet`. -- [x] Optimize `SimpleMatcher` when multiple `SimpleMatchType` are used. - 1. Consider if there are multiple `SimpleMatchType` +- [x] Optimize `SimpleMatcher` when multiple `ProcessType` are used. + 1. Consider if there are multiple `ProcessType` * None * Fanjian * FanjianDelete @@ -210,7 +106,7 @@ bench fastest │ slowest 1. 
Consider we have to perform FanjianDeleteNormalize, we need to perform Fanjian first, then Delete, then Normalize, 3 kinds of Process Matcher are needed to perform replacement or delete, the text has to be scanned 3 times. 2. What if we only construct only 1 Process Matcher which's patterns contains all the Fanjian, Delete and Normalize 3 kinds of patterns? We could scan the text only once to get all the positions that should be perform replacement or delete. 3. We need to take care of the byte index will change after replacement or delete, so we need to take the offset changes into account. -- [x] Merge multiple aho-corasick matcher into one when multiple `SimpleMatchType` are used. +- [x] Merge multiple aho-corasick matcher into one when multiple `ProcessType` are used. - [x] When `dfa` feature is disabled, use daachorse to perform text processing. - [x] Do not use it for simple process function, too slow to build. diff --git a/ci/build.sh b/ci/build.sh index 96298eb..1a91cb7 100644 --- a/ci/build.sh +++ b/ci/build.sh @@ -1,5 +1,4 @@ cargo update cargo build --release -cp ./target/release/libmatcher_py.dylib ./matcher_py/python/matcher_py/matcher_py.so cp ./target/release/libmatcher_c.dylib ./matcher_c/matcher_c.so cp ./target/release/libmatcher_c.dylib ./matcher_java/src/main/resources/matcher_c.so \ No newline at end of file diff --git a/ci/test.sh b/ci/test.sh index a2a401a..8b56dbc 100644 --- a/ci/test.sh +++ b/ci/test.sh @@ -1,13 +1,18 @@ cargo fmt cargo clippy --all-targets -- -D warnings -cargo test -cargo test --features "serde" cargo doc +cd matcher_rs +cargo test --no-default-features +cargo test --no-default-features --features "dfa" +cargo test --no-default-features --features "runtime_build" +cargo test --no-default-features --features "runtime_build,dfa" +cargo test --no-default-features --features "dfa,serde" +cd .. + cd matcher_py unset CONDA_PREFIX maturin develop ruff format . pytest - cd .. 
\ No newline at end of file diff --git a/matcher_c/README.md b/matcher_c/README.md index 2529502..a82b85f 100644 --- a/matcher_c/README.md +++ b/matcher_c/README.md @@ -27,7 +27,7 @@ import msgspec from cffi import FFI -from extension_types import MatchTableType, SimpleMatchType, MatchTable +from extension_types import MatchTableType, ProcessType, MatchTable ## define ffi ffi = FFI() @@ -40,9 +40,11 @@ matcher = lib.init_matcher( 1: [ MatchTable( table_id=1, - match_table_type=MatchTableType.Simple(simple_match_type=SimpleMatchType.MatchNone), - word_list=["hello&world", "hello", "world"], - exemption_simple_match_type=SimpleMatchType.MatchNone, + match_table_type=MatchTableType.Simple( + process_type=ProcessType.MatchNone + ), + word_list=["hello,world", "hello", "world"], + exemption_process_type=ProcessType.MatchNone, exemption_word_list=[], ) ] @@ -50,11 +52,18 @@ matcher = lib.init_matcher( ) # check is match -lib.matcher_is_match(matcher, "hello".encode("utf-8")) # True +lib.matcher_is_match(matcher, "hello".encode("utf-8")) # True -# match word, output json string -res = lib.matcher_word_match(matcher, "hello,world".encode("utf-8")) # {1:[{"match_id":1,"table_id":1,"word":"hello"},{"match_id":1,"table_id":1,"word":"hello&world"},{"match_id":1,"table_id":1,"word":"world"}]"} -print(ffi.string(res).decode("utf-8")) # +# match as list +res = lib.matcher_process_as_string(matcher, "hello,world".encode("utf-8")) +print(ffi.string(res).decode("utf-8")) +# [{"match_id":1,"table_id":1,"word_id":0,"word":"hello,world","similarity":1.0},{"match_id":1,"table_id":1,"word_id":1,"word":"hello","similarity":1.0},{"match_id":1,"table_id":1,"word_id":2,"word":"world","similarity":1.0}] +lib.drop_string(res) + +# match as dict +res = lib.matcher_word_match_as_string(matcher, "hello,world".encode("utf-8")) +print(ffi.string(res).decode("utf-8")) +# 
{"1":[{"match_id":1,"table_id":1,"word_id":0,"word":"hello,world","similarity":1.0},{"match_id":1,"table_id":1,"word_id":1,"word":"hello","similarity":1.0},{"match_id":1,"table_id":1,"word_id":2,"word":"world","similarity":1.0}]} lib.drop_string(res) # drop matcher @@ -63,19 +72,22 @@ lib.drop_matcher(matcher) # init simple matcher simple_matcher = lib.init_simple_matcher( msgspec.msgpack.encode(({ - SimpleMatchType.MatchFanjianDeleteNormalize | SimpleMatchType.MatchPinYinChar: { + ProcessType.MatchFanjianDeleteNormalize | ProcessType.MatchPinYinChar: { 1: "妳好&世界", - 2: "hello" + 2: "hello", } })) ) # check is match -lib.simple_matcher_is_match(simple_matcher, "你好世界".encode("utf-8")) # True +lib.simple_matcher_is_match(simple_matcher, "你好世界".encode("utf-8")) # True -# match word, output json string -res = lib.simple_matcher_process(simple_matcher, "nihaoshijie!hello!world!".encode("utf-8")) # [{"word_id":1,"word":"妳好,世界"},{"word_id":2,"word":"hello"}] +# match as list +res = lib.simple_matcher_process_as_string( + simple_matcher, "nihaoshijie!hello!world!".encode("utf-8") +) print(ffi.string(res).decode("utf-8")) +# [{"word_id":1,"word":"妳好&世界"},{"word_id":2,"word":"hello"}] lib.drop_string(res) # drop simple matcher diff --git a/matcher_c/extension_types.py b/matcher_c/extension_types.py index 412c932..641c290 100644 --- a/matcher_c/extension_types.py +++ b/matcher_c/extension_types.py @@ -1,155 +1,85 @@ from enum import Enum, IntFlag -from typing import Dict, List +from typing import Dict, List, TypedDict -import msgspec - - -class SimpleMatchType(IntFlag): - """ - IntFlag representing different simple match types. - - Attributes: - MatchNone (int): A match type indicating no specific match criteria (0b00000001). - MatchFanjian (int): A match type for matching between traditional and simplified Chinese characters (0b00000010). - MatchWordDelete (int): A match type where words are deleted for matching purposes (0b00000100). 
- MatchTextDelete (int): A match type where text is deleted for matching purposes (0b00001000). - MatchDelete (int): A combined match type where both word and text deletions are applied (0b00001100). - MatchNormalize (int): A match type where text normalization is applied (0b00010000). - MatchDeleteNormalize (int): A combined match type where deletion and normalization are both applied (0b00011100). - MatchFanjianDeleteNormalize (int): A combined match type that includes Fanjian matching, deletion, and normalization (0b00011110). - MatchPinYin (int): A match type using Pinyin for matching Chinese characters (0b00100000). - MatchPinYinChar (int): A match type using individual Pinyin characters for a finer granularity match (0b01000000). - """ +class ProcessType(IntFlag): MatchNone = 0b00000001 MatchFanjian = 0b00000010 - MatchWordDelete = 0b00000100 - MatchTextDelete = 0b00001000 - MatchDelete = 0b00001100 - MatchNormalize = 0b00010000 - MatchDeleteNormalize = 0b00011100 - MatchFanjianDeleteNormalize = 0b00011110 - MatchPinYin = 0b00100000 - MatchPinYinChar = 0b01000000 + MatchDelete = 0b00000100 + MatchNormalize = 0b00001000 + MatchDeleteNormalize = 0b00001100 + MatchFanjianDeleteNormalize = 0b00001110 + MatchPinYin = 0b00010000 + MatchPinYinChar = 0b00100000 class RegexMatchType(Enum): - """ - Enum representing different regex match types. - - Attributes: - MatchSimilarChar (str): A match type that finds characters similar to a given set ("similar_char"). - MatchAcrostic (str): A match type that looks for acrostic patterns ("acrostic"). - MatchRegex (str): A match type that uses regular expressions for matching ("regex"). - """ - MatchSimilarChar = "similar_char" MatchAcrostic = "acrostic" MatchRegex = "regex" class SimMatchType(Enum): - """ - Enum representing different similarity match types. - - Attributes: - MatchLevenshtein (str): A match type using the Levenshtein distance algorithm for measuring the difference between two sequences ("levenshtein"). 
- MatchDamrauLevenshtein (str): A match type using the Damerau-Levenshtein distance algorithm, an extension of Levenshtein with transpositions allowed ("damrau_levenshtein"). - MatchIndel (str): A match type that uses insertion and deletion operations for matching purposes ("indel"). - MatchJaro (str): A match type using the Jaro distance algorithm to compare the similarity between two strings ("jaro"). - MatchJaroWinkler (str): A match type using the Jaro-Winkler distance algorithm, an extension of Jaro with added weight for matching starting characters ("jaro_winkler"). - """ - MatchLevenshtein = "levenshtein" - MatchDamrauLevenshtein = "damrau_levenshtein" - MatchIndel = "indel" - MatchJaro = "jaro" - MatchJaroWinkler = "jaro_winkler" - - -class Simple(msgspec.Struct): - """ - Represents a simple match configuration. - - Attributes: - simple_match_type (SimpleMatchType): The type of simple match to be used, as defined in SimpleMatchType. - """ - - simple_match_type: SimpleMatchType -class Regex(msgspec.Struct): - """ - Represents a regular expression match configuration. +class Simple(TypedDict): + process_type: ProcessType - Attributes: - regex_match_type (RegexMatchType): The type of regular expression match to be used, as defined in RegexMatchType. - """ +class Regex(TypedDict): + process_type: ProcessType regex_match_type: RegexMatchType -class Similar(msgspec.Struct): - """ - Represents a similarity match configuration. - - Attributes: - sim_match_type (SimMatchType): The type of similarity match to be used, as defined in SimMatchType. - threshold (float): The threshold value for the similarity match. This value determines the minimum similarity score required for a match to be considered successful. - """ - +class Similar(TypedDict): + process_type: ProcessType sim_match_type: SimMatchType threshold: float class MatchTableType: - """ - A class representing different types of match tables. 
- - Attributes: - Simple (Simple): Represents a simple match configuration. - Regex (Regex): Represents a regular expression match configuration. - Similar (Similar): Represents a similarity match configuration. - """ - - Simple = Simple - Regex = Regex - Similar = Similar - - -class MatchTable(msgspec.Struct): - """ - Represents a match table configuration with various match types. - - Attributes: - table_id (int): The unique identifier for the match table. - match_table_type (MatchTableType): The type of match table, can be one of Simple, Regex, or Similar as defined in MatchTableType. - word_list (List[str]): A list of words to be used for matching. - exemption_simple_match_type (SimpleMatchType): Specifies which simple match type(s) to exempt from the match operation. - exemption_word_list (List[str]): A list of words to exempt from the match operation. - """ - + def Simple(process_type: ProcessType) -> Dict[str, Simple]: + return {"simple": Simple(process_type=process_type)} + + def Regex( + process_type: ProcessType, regex_match_type: RegexMatchType + ) -> Dict[str, Regex]: + return { + "regex": Regex(process_type=process_type, regex_match_type=regex_match_type) + } + + def Similar( + process_type: ProcessType, sim_match_type: SimMatchType, threshold: float + ) -> Dict[str, Similar]: + return { + "similar": Similar( + process_type=process_type, + sim_match_type=sim_match_type, + threshold=threshold, + ) + } + + +class MatchTable(TypedDict): table_id: int match_table_type: MatchTableType word_list: List[str] - exemption_simple_match_type: SimpleMatchType + exemption_process_type: ProcessType exemption_word_list: List[str] MatchTableMap = Dict[int, List[MatchTable]] -class MatchResult(msgspec.Struct): +class MatchResult(TypedDict): table_id: int word: str -MatcherMatchResult = Dict[str, List[MatchResult]] +SimpleTable = Dict[ProcessType, Dict[int, str]] -class SimpleResult(msgspec.Struct): +class SimpleResult(TypedDict): word_id: int word: str - - 
-SimpleMatchTypeWordMap = Dict[SimpleMatchType, Dict[int, str]] diff --git a/matcher_c/matcher_c.h b/matcher_c/matcher_c.h index dd90543..f778e70 100644 --- a/matcher_c/matcher_c.h +++ b/matcher_c/matcher_c.h @@ -1,12 +1,12 @@ void* init_matcher(char* match_table_map_bytes); bool matcher_is_match(void* matcher, char* text); -char* matcher_process(void* matcher, char* text); -char* matcher_word_match(void* matcher, char* text); +char* matcher_process_as_string(void* matcher, char* text); +char* matcher_word_match_as_string(void* matcher, char* text); void drop_matcher(void* matcher); -void* init_simple_matcher(char* smt_word_map_bytes); +void* init_simple_matcher(char* simple_table_bytes); bool simple_matcher_is_match(void* simple_matcher, char* text); -char* simple_matcher_process(void* simple_matcher, char* text); +char* simple_matcher_process_as_string(void* simple_matcher, char* text); void drop_simple_matcher(void* simple_matcher); void drop_string(char* ptr); \ No newline at end of file diff --git a/matcher_c/src/lib.rs b/matcher_c/src/lib.rs index e147209..5c579f5 100644 --- a/matcher_c/src/lib.rs +++ b/matcher_c/src/lib.rs @@ -3,54 +3,8 @@ use std::{ str, }; -use matcher_rs::{MatchTableMap, Matcher, SimpleMatchTypeWordMap, SimpleMatcher, TextMatcherTrait}; +use matcher_rs::{MatchTableMap, Matcher, SimpleMatcher, SimpleTable, TextMatcherTrait}; -/// # Safety -/// This function is unsafe because it assumes that the provided pointer is valid and points to a null-terminated -/// byte string that can be deserialized into a [MatchTableMap]. -/// -/// # Arguments -/// * `match_table_map_bytes` - A pointer to a null-terminated byte string that represents a serialized [MatchTableMap]. -/// -/// # Returns -/// * A raw pointer to a new [Matcher] instance that is created using the deserialized [MatchTableMap]. -/// -/// # Panics -/// This function will panic if the deserialization of `match_table_map_bytes` fails. 
-/// -/// # Description -/// This function initializes a [Matcher] instance from the provided serialized [MatchTableMap] byte string. -/// It performs deserialization of the byte string, transforms it into a [MatchTableMap], and then uses it to -/// create a new [Matcher]. The newly created [Matcher] instance is then wrapped in a [Box] and converted -/// into a raw pointer before being returned. -/// -/// # Example -/// -/// ``` -/// use std::collections::HashMap; -/// use std::ffi::CString; -/// -/// use matcher_c::*; -/// use matcher_rs::{MatchTable, MatchTableType, SimpleMatchType}; -/// -/// let mut match_table_map = HashMap::new(); -/// match_table_map.insert( -/// 1, -/// vec![ -/// MatchTable { -/// table_id: 1, -/// match_table_type: MatchTableType::Simple { simple_match_type: SimpleMatchType::None }, -/// word_list: vec!["hello", "world"], -/// exemption_simple_match_type: SimpleMatchType::None, -/// exemption_word_list: vec![], -/// } -/// ] -/// ); -/// let match_table_map_bytes = CString::new(rmp_serde::to_vec_named(&match_table_map).unwrap()).unwrap(); -/// -/// let matcher_ptr = unsafe {init_matcher(match_table_map_bytes.as_ptr())}; -/// unsafe {drop_matcher(matcher_ptr)}; -/// ``` #[no_mangle] pub unsafe extern "C" fn init_matcher(match_table_map_bytes: *const c_char) -> *mut Matcher { unsafe { @@ -67,60 +21,6 @@ pub unsafe extern "C" fn init_matcher(match_table_map_bytes: *const c_char) -> * } } -/// # Safety -/// This function is unsafe because it assumes that the provided `matcher` and `text` pointers are valid. -/// The `matcher` pointer should point to a valid [Matcher] instance, and the `text` pointer should point to a null-terminated byte string. -/// -/// # Arguments -/// * `matcher` - A raw pointer to a [Matcher] instance. -/// * `text` - A pointer to a null-terminated byte string that represents the text to be matched. 
-/// -/// # Returns -/// * A boolean value indicating whether the text matches the pattern defined by the [Matcher] instance. -/// -/// # Panics -/// This function will panic if the `matcher` pointer is null. -/// -/// # Description -/// This function calls the [is_match](matcher_rs::Matcher::is_match) method on a [Matcher] instance. It converts the raw pointers to their -/// respective Rust types, performs the [is_match](matcher_rs::Matcher::is_match) operation, and returns a boolean indicating the match result. -/// The conversion assumes that the `text` pointer points to a valid UTF-8 encoded, null-terminated C string, and -/// that the `matcher` pointer is valid and non-null. -/// -/// # Example -/// -/// ``` -/// use std::collections::HashMap; -/// use std::ffi::CString; -/// -/// use matcher_c::*; -/// use matcher_rs::{MatchTable, MatchTableType, SimpleMatchType}; -/// -/// let mut match_table_map = HashMap::new(); -/// match_table_map.insert( -/// 1, -/// vec![ -/// MatchTable { -/// table_id: 1, -/// match_table_type: MatchTableType::Simple { simple_match_type: SimpleMatchType::None }, -/// word_list: vec!["hello", "world"], -/// exemption_simple_match_type: SimpleMatchType::None, -/// exemption_word_list: vec![], -/// } -/// ] -/// ); -/// let match_table_map_bytes = CString::new(rmp_serde::to_vec_named(&match_table_map).unwrap()).unwrap(); -/// -/// let matcher_ptr = unsafe {init_matcher(match_table_map_bytes.as_ptr())}; -/// -/// let match_text_bytes = CString::new("hello world!").unwrap(); -/// let not_match_text_bytes = CString::new("test").unwrap(); -/// -/// assert!(unsafe {matcher_is_match(matcher_ptr, match_text_bytes.as_ptr())}); -/// assert!(!unsafe {matcher_is_match(matcher_ptr, not_match_text_bytes.as_ptr())}); -/// -/// unsafe {drop_matcher(matcher_ptr)}; -/// ``` #[no_mangle] pub unsafe extern "C" fn matcher_is_match(matcher: *mut Matcher, text: *const c_char) -> bool { unsafe { @@ -131,73 +31,8 @@ pub unsafe extern "C" fn 
matcher_is_match(matcher: *mut Matcher, text: *const c_ } } -/// # Safety -/// This function is unsafe because it assumes that the provided `matcher` and `text` pointers are valid. -/// The `matcher` pointer should point to a valid [Matcher] instance, and the `text` pointer should point to a -/// null-terminated byte string. -/// -/// # Arguments -/// * `matcher` - A raw pointer to a [Matcher] instance. -/// * `text` - A pointer to a null-terminated byte string that represents the text to be processed. -/// -/// # Returns -/// * A raw pointer to an [c_char] holding the result of the [process](matcher_rs::Matcher::process) function called on the [Matcher] instance, serialized using [sonic_rs::to_vec]. -/// -/// # Panics -/// This function will panic if the `matcher` pointer is null. -/// -/// # Description -/// This function calls the [process](matcher_rs::Matcher::process) method on a [Matcher] instance, serializes the result using [sonic_rs::to_vec], -/// and then converts this serialized result to a C-compatible CString. -/// The resulting CString is then returned as a raw pointer before being returned. 
-/// -/// # Example -/// -/// ``` -/// use std::collections::HashMap; -/// use std::ffi::{CStr, CString}; -/// use std::str; -/// -/// use matcher_c::*; -/// use matcher_rs::{MatchTable, MatchTableType, SimpleMatchType}; -/// -/// let mut match_table_map = HashMap::new(); -/// match_table_map.insert( -/// 1, -/// vec![ -/// MatchTable { -/// table_id: 1, -/// match_table_type: MatchTableType::Simple { simple_match_type: SimpleMatchType::None }, -/// word_list: vec!["hello", "world"], -/// exemption_simple_match_type: SimpleMatchType::None, -/// exemption_word_list: vec![], -/// } -/// ] -/// ); -/// let match_table_map_bytes = CString::new(rmp_serde::to_vec_named(&match_table_map).unwrap()).unwrap(); -/// -/// let matcher_ptr = unsafe {init_matcher(match_table_map_bytes.as_ptr())}; -/// -/// let match_text_bytes = CString::new("hello world!").unwrap(); -/// -/// assert_eq!( -/// unsafe { -/// str::from_utf8_unchecked( -/// CStr::from_ptr( -/// matcher_process( -/// matcher_ptr, -/// match_text_bytes.as_ptr() -/// ) -/// ).to_bytes() -/// ) -/// }, -/// r#"[{"match_id":1,"table_id":1,"word":"hello"},{"match_id":1,"table_id":1,"word":"world"}]"# -/// ); -/// -/// unsafe {drop_matcher(matcher_ptr)}; -/// ``` #[no_mangle] -pub unsafe extern "C" fn matcher_process( +pub unsafe extern "C" fn matcher_process_as_string( matcher: *mut Matcher, text: *const c_char, ) -> *mut c_char { @@ -217,88 +52,8 @@ pub unsafe extern "C" fn matcher_process( res.into_raw() } -/// # Safety -/// This function is unsafe because it assumes that the provided `matcher` and `text` pointers are valid. -/// The `matcher` pointer should point to a valid [Matcher] instance, and the `text` pointer should point to a -/// null-terminated byte string. -/// -/// # Arguments -/// * `matcher` - A raw pointer to a [Matcher] instance. -/// * `text` - A pointer to a null-terminated byte string that represents the text to be matched. 
-/// -/// # Returns -/// * A raw pointer to an [c_char] holding the result of the [word_match_as_string](matcher_rs::Matcher::word_match_as_string) function called on the [Matcher] instance. -/// -/// # Panics -/// This function will panic if the `matcher` pointer is null. -/// -/// # Description -/// This function calls the [word_match_as_string](matcher_rs::Matcher::word_match_as_string) method on a [Matcher] instance, converting the result to a JSON string. -/// It converts the raw pointers to their respective Rust types, performs the [word_match](matcher_rs::Matcher::word_match_as_string) operation, -/// serializes the result as a JSON string, and then converts this string to a C-compatible CString. -/// The resulting CString is then returned as a raw pointer before being returned. -/// -/// # Example -/// -/// ``` -/// use std::collections::HashMap; -/// use std::ffi::{CStr, CString}; -/// use std::str; -/// -/// use matcher_c::*; -/// use matcher_rs::{MatchTable, MatchTableType, SimpleMatchType}; -/// -/// let mut match_table_map = HashMap::new(); -/// match_table_map.insert( -/// 1, -/// vec![ -/// MatchTable { -/// table_id: 1, -/// match_table_type: MatchTableType::Simple { simple_match_type: SimpleMatchType::None }, -/// word_list: vec!["hello", "world"], -/// exemption_simple_match_type: SimpleMatchType::None, -/// exemption_word_list: vec![], -/// } -/// ] -/// ); -/// let match_table_map_bytes = CString::new(rmp_serde::to_vec_named(&match_table_map).unwrap()).unwrap(); -/// -/// let matcher_ptr = unsafe {init_matcher(match_table_map_bytes.as_ptr())}; -/// -/// let match_text_bytes = CString::new("hello world!").unwrap(); -/// let not_match_text_bytes = CString::new("test").unwrap(); -/// -/// assert_eq!( -/// unsafe { -/// str::from_utf8_unchecked( -/// CStr::from_ptr( -/// matcher_word_match( -/// matcher_ptr, -/// match_text_bytes.as_ptr() -/// ) -/// ).to_bytes() -/// ) -/// }, -/// 
r#"{"1":[{"match_id":1,"table_id":1,"word":"hello"},{"match_id":1,"table_id":1,"word":"world"}]}"# -/// ); -/// assert_eq!( -/// unsafe { -/// str::from_utf8_unchecked( -/// CStr::from_ptr( -/// matcher_word_match( -/// matcher_ptr, -/// not_match_text_bytes.as_ptr() -/// ) -/// ).to_bytes() -/// ) -/// }, -/// r#"{}"# -/// ); -/// -/// unsafe {drop_matcher(matcher_ptr)}; -/// ``` #[no_mangle] -pub unsafe extern "C" fn matcher_word_match( +pub unsafe extern "C" fn matcher_word_match_as_string( matcher: *mut Matcher, text: *const c_char, ) -> *mut c_char { @@ -315,158 +70,31 @@ pub unsafe extern "C" fn matcher_word_match( res.into_raw() } -/// # Safety -/// This function is unsafe because it assumes that the provided `matcher` pointer is valid and was previously allocated using [Box::into_raw]. -/// It also assumes that the lifetime of the `matcher` pointer is over and it is safe to drop the data. -/// -/// # Arguments -/// * `matcher` - A raw pointer to a [Matcher] instance that needs to be freed. -/// -/// # Panics -/// This function will panic if the `matcher` pointer is null. -/// It is the caller's responsibility to ensure that the pointer is valid and that no other references to the [Matcher] instance exist. -/// -/// # Description -/// This function converts the raw pointer back into a [Box] and then drops it, effectively freeing the memory that the [Matcher] instance occupied. -/// After calling this function, the `matcher` pointer must not be used again. 
-/// -/// # Example -/// -/// ``` -/// use std::collections::HashMap; -/// use std::ffi::CString; -/// -/// use matcher_c::*; -/// use matcher_rs::{MatchTable, MatchTableType, SimpleMatchType}; -/// -/// let mut match_table_map = HashMap::new(); -/// match_table_map.insert( -/// 1, -/// vec![ -/// MatchTable { -/// table_id: 1, -/// match_table_type: MatchTableType::Simple { -/// simple_match_type: SimpleMatchType::None, -/// }, -/// word_list: vec!["hello", "world"], -/// exemption_simple_match_type: SimpleMatchType::None, -/// exemption_word_list: vec![], -/// } -/// ] -/// ); -/// let match_table_map_bytes = CString::new(rmp_serde::to_vec_named(&match_table_map).unwrap()).unwrap(); -/// -/// let matcher_ptr = unsafe {init_matcher(match_table_map_bytes.as_ptr())}; -/// unsafe {drop_matcher(matcher_ptr)}; -/// ``` #[no_mangle] pub unsafe extern "C" fn drop_matcher(matcher: *mut Matcher) { unsafe { drop(Box::from_raw(matcher)) } } -/// # Safety -/// This function is unsafe because it assumes that the provided pointer is valid and points to a null-terminated -/// byte string that can be deserialized into a [SimpleMatchTypeWordMap]. -/// -/// # Arguments -/// * `smt_word_map_bytes` - A pointer to a null-terminated byte string that represents a serialized [SimpleMatchTypeWordMap]. -/// -/// # Returns -/// * A raw pointer to a new [SimpleMatcher] instance that is created using the deserialized [SimpleMatchTypeWordMap]. -/// -/// # Panics -/// This function will panic if the deserialization of `smt_word_map_bytes` fails. -/// -/// # Description -/// This function initializes a [SimpleMatcher] instance from the provided serialized [SimpleMatchTypeWordMap] byte string. -/// It performs deserialization of the byte string, transforms it into a [SimpleMatchTypeWordMap], and then uses it to -/// create a new [SimpleMatcher]. The newly created [SimpleMatcher] instance is then wrapped in a [Box] and converted -/// into a raw pointer before being returned. 
-/// -/// # Example -/// -/// ``` -/// use std::collections::HashMap; -/// use std::ffi::CString; -/// -/// use matcher_c::*; -/// use matcher_rs::{SimpleMatcher, SimpleMatchType}; -/// -/// let mut smt_word_map = HashMap::new(); -/// let mut word_map = HashMap::new(); -/// word_map.insert(1, "hello&world"); -/// smt_word_map.insert(SimpleMatchType::None, word_map); -/// let smt_word_map_bytes = CString::new(rmp_serde::to_vec_named(&smt_word_map).unwrap()).unwrap(); -/// -/// let simple_matcher_ptr = unsafe {init_simple_matcher(smt_word_map_bytes.as_ptr())}; -/// unsafe {drop_simple_matcher(simple_matcher_ptr)}; -/// ``` #[no_mangle] pub unsafe extern "C" fn init_simple_matcher( - smt_word_map_bytes: *const c_char, + simple_table_bytes: *const c_char, ) -> *mut SimpleMatcher { unsafe { - let smt_word_map: SimpleMatchTypeWordMap = - match rmp_serde::from_slice(CStr::from_ptr(smt_word_map_bytes).to_bytes()) { - Ok(smt_word_map) => smt_word_map, + let simple_table: SimpleTable = + match rmp_serde::from_slice(CStr::from_ptr(simple_table_bytes).to_bytes()) { + Ok(simple_table) => simple_table, Err(e) => { panic!( - "Deserialize smt_word_map_bytes failed, Please check the input data.\nErr: {}", + "Deserialize simple_table_bytes failed, Please check the input data.\nErr: {}", e, ) } }; - Box::into_raw(Box::new(SimpleMatcher::new(&smt_word_map))) + Box::into_raw(Box::new(SimpleMatcher::new(&simple_table))) } } -/// # Safety -/// This function is unsafe because it assumes that the provided `simple_matcher` and `text` pointers are valid. -/// The `simple_matcher` pointer should point to a valid [SimpleMatcher] instance, and the `text` pointer should -/// point to a null-terminated byte string that represents the text to be processed. -/// -/// # Arguments -/// * `simple_matcher` - A raw pointer to a [SimpleMatcher] instance. -/// * `text` - A pointer to a null-terminated byte string that represents the text to be matched. 
-/// -/// # Returns -/// * A boolean value indicating whether the text matches the pattern defined by the [SimpleMatcher] instance. -/// -/// # Panics -/// This function will panic if the `simple_matcher` pointer is null. -/// -/// # Description -/// This function calls the [is_match](matcher_rs::SimpleMatcher::is_match) method on a [SimpleMatcher] instance. It converts the raw pointers -/// to their respective Rust types, performs the [is_match](matcher_rs::SimpleMatcher::is_match) operation, and returns a boolean indicating the match result. -/// the match result. The conversion assumes that the `text` pointer points to a valid UTF-8 encoded, -/// null-terminated C string, and that the `simple_matcher` pointer is valid and non-null. -/// -/// # Example -/// -/// ``` -/// use std::collections::HashMap; -/// use std::ffi::CString; -/// -/// use matcher_c::*; -/// use matcher_rs::{SimpleMatcher, SimpleMatchType}; -/// -/// let mut smt_word_map = HashMap::new(); -/// let mut word_map = HashMap::new(); -/// word_map.insert(1, "hello&world"); -/// smt_word_map.insert(SimpleMatchType::None, word_map); -/// let smt_word_map_bytes = CString::new(rmp_serde::to_vec_named(&smt_word_map).unwrap()).unwrap(); -/// -/// let simple_matcher_ptr = unsafe {init_simple_matcher(smt_word_map_bytes.as_ptr())}; -/// -/// let match_text_bytes = CString::new("hello world!").unwrap(); -/// let not_match_text_bytes = CString::new("test").unwrap(); -/// -/// assert!(unsafe {simple_matcher_is_match(simple_matcher_ptr, match_text_bytes.as_ptr())}); -/// assert!(!unsafe{simple_matcher_is_match(simple_matcher_ptr, not_match_text_bytes.as_ptr())}); -/// -/// unsafe {drop_simple_matcher(simple_matcher_ptr)}; -/// ``` #[no_mangle] pub unsafe extern "C" fn simple_matcher_is_match( simple_matcher: *mut SimpleMatcher, @@ -480,77 +108,8 @@ pub unsafe extern "C" fn simple_matcher_is_match( } } -/// # Safety -/// This function is unsafe because it assumes that the provided `simple_matcher` and `text` 
pointers are valid. -/// The `simple_matcher` pointer should point to a valid [SimpleMatcher] instance, and the `text` pointer should point to a null-terminated byte string. -/// -/// # Arguments -/// * `simple_matcher` - A raw pointer to a [SimpleMatcher] instance. -/// * `text` - A pointer to a null-terminated byte string that represents the text to be processed. -/// -/// # Returns -/// * A raw pointer to a [c_char] holding the result of the [process](matcher_rs::SimpleMatcher::process) function called on the [SimpleMatcher] instance. The result is serialized to a JSON string. -/// -/// # Panics -/// This function will panic if the `simple_matcher` pointer is null. -/// -/// # Description -/// This function calls the [process](matcher_rs::SimpleMatcher::process) method on a [SimpleMatcher] instance. It converts the raw pointers -/// to their respective Rust types, performs the [process](matcher_rs::SimpleMatcher::process) operation, serializes the result as a JSON string, -/// and then converts this string to a C-compatible CString. The resulting CString is then returned as a raw pointer before being returned. 
-/// -/// # Example -/// -/// ``` -/// use std::collections::HashMap; -/// use std::ffi::{CStr, CString}; -/// use std::str; -/// -/// use matcher_c::*; -/// use matcher_rs::{SimpleMatcher, SimpleMatchType}; -/// -/// let mut smt_word_map = HashMap::new(); -/// let mut word_map = HashMap::new(); -/// word_map.insert(1, "hello&world"); -/// smt_word_map.insert(SimpleMatchType::None, word_map); -/// let smt_word_map_bytes = CString::new(rmp_serde::to_vec_named(&smt_word_map).unwrap()).unwrap(); -/// -/// let simple_matcher_ptr = unsafe {init_simple_matcher(smt_word_map_bytes.as_ptr())}; -/// -/// let match_text_bytes = CString::new("hello world!").unwrap(); -/// let non_match_text_bytes = CString::new("test").unwrap(); -/// -/// assert_eq!( -/// unsafe { -/// str::from_utf8_unchecked( -/// CStr::from_ptr( -/// simple_matcher_process( -/// simple_matcher_ptr, -/// match_text_bytes.as_ptr() -/// ) -/// ).to_bytes() -/// ) -/// }, -/// r#"[{"word_id":1,"word":"hello&world"}]"# -/// ); -/// assert_eq!( -/// unsafe { -/// str::from_utf8_unchecked( -/// CStr::from_ptr( -/// simple_matcher_process( -/// simple_matcher_ptr, -/// non_match_text_bytes.as_ptr() -/// ) -/// ).to_bytes() -/// ) -/// }, -/// r#"[]"# -/// ); -/// -/// unsafe {drop_simple_matcher(simple_matcher_ptr)}; -/// ``` #[no_mangle] -pub unsafe extern "C" fn simple_matcher_process( +pub unsafe extern "C" fn simple_matcher_process_as_string( simple_matcher: *mut SimpleMatcher, text: *const c_char, ) -> *mut c_char { @@ -570,72 +129,11 @@ pub unsafe extern "C" fn simple_matcher_process( res.into_raw() } -/// # Safety -/// This function is unsafe because it assumes that the provided `simple_matcher` pointer is valid and was previously allocated using [Box::into_raw]. -/// It also assumes that the lifetime of the `simple_matcher` pointer is over and it is safe to drop the data. -/// -/// # Arguments -/// * `simple_matcher` - A raw pointer to a [SimpleMatcher] instance that needs to be freed. 
-/// -/// # Panics -/// This function will panic if the `simple_matcher` pointer is null. -/// It is the caller's responsibility to ensure that the pointer is valid and that no other references to the [SimpleMatcher] instance exist. -/// -/// # Description -/// This function converts the raw pointer back into a [Box] and then drops it, effectively freeing the memory that the [SimpleMatcher] instance occupied. -/// After calling this function, the `simple_matcher` pointer must not be used again. -/// -/// # Example -/// -/// ``` -/// use std::collections::HashMap; -/// use std::ffi::CString; -/// -/// use matcher_c::*; -/// use matcher_rs::{SimpleMatcher, SimpleMatchType}; -/// -/// let mut smt_word_map = HashMap::new(); -/// let mut word_map = HashMap::new(); -/// word_map.insert(1, "hello&world"); -/// smt_word_map.insert(SimpleMatchType::None, word_map); -/// let smt_word_map_bytes = CString::new(rmp_serde::to_vec_named(&smt_word_map).unwrap()).unwrap(); -/// -/// let simple_matcher_ptr = unsafe {init_simple_matcher(smt_word_map_bytes.as_ptr())}; -/// unsafe {drop_simple_matcher(simple_matcher_ptr)}; -/// ``` #[no_mangle] pub unsafe extern "C" fn drop_simple_matcher(simple_matcher: *mut SimpleMatcher) { unsafe { drop(Box::from_raw(simple_matcher)) } } -/// # Safety -/// This function is unsafe because it assumes that the provided pointer is a valid and previously allocated -/// CString that needs to be freed. The function will take ownership of the pointer, which implies that no other -/// part of the code should attempt to use or free this pointer after this function is called. -/// -/// # Arguments -/// * `ptr` - A raw pointer to a [c_char] that represents a CString to be freed. -/// -/// # Panics -/// This function will panic if the `ptr` is null. It is the caller's responsibility to ensure that the pointer is -/// valid and that no other references to the CString exist. 
-/// -/// # Description -/// This function takes a raw pointer to a [c_char], converts it back into a CString, and then drops it, effectively -/// freeing the memory that the CString occupied. After calling this function, the `ptr` must not be used again. -/// -/// # Example -/// -/// ``` -/// use std::ffi::CString; -/// -/// use matcher_c::*; -/// -/// let c_string = CString::new("hello world!").unwrap(); -/// let c_string_ptr = c_string.into_raw(); -/// -/// unsafe {drop_string(c_string_ptr)}; -/// ``` #[no_mangle] pub unsafe extern "C" fn drop_string(ptr: *mut c_char) { unsafe { drop(CString::from_raw(ptr)) } diff --git a/matcher_java/README.md b/matcher_java/README.md index 8c87cd1..b0380a8 100644 --- a/matcher_java/README.md +++ b/matcher_java/README.md @@ -29,40 +29,12 @@ Copy the code below or refer to [MatcherJavaExample.java](./src/test/java/com/ma ```java package com.matcher_java; -import org.msgpack.core.MessageBufferPacker; -import org.msgpack.core.MessagePack; - -import com.sun.jna.Library; -import com.sun.jna.Native; import com.sun.jna.Pointer; - import java.io.IOException; +import org.msgpack.core.MessageBufferPacker; +import org.msgpack.core.MessagePack; -interface Matcher extends Library { - Matcher INSTANCE = (Matcher) Native.load( - Matcher.class.getResource("/matcher_c.so").getPath(), - Matcher.class); - - Pointer init_matcher(byte[] match_table_map_bytes); - - boolean matcher_is_match(Pointer matcher, byte[] text_bytes); - - Pointer matcher_word_match(Pointer matcher, byte[] text_bytes); - - void drop_matcher(Pointer matcher); - - Pointer init_simple_matcher(byte[] smt_word_map_bytes); - - boolean simple_matcher_is_match(Pointer simple_matcher, byte[] text_bytes); - - Pointer simple_matcher_process(Pointer simple_matcher, byte[] text_bytes); - - void drop_simple_matcher(Pointer simple_matcher); - - void drop_string(Pointer ptr); -} - -public class Demo { +public class MatcherJavaExample { public static void main(String[] args) throws 
IOException { System.out.println("Simple Matcher Test"); simple_matcher_process_demo(); @@ -76,17 +48,17 @@ public class Demo { public static void simple_matcher_process_demo() throws IOException { MessageBufferPacker packer = MessagePack.newDefaultBufferPacker(); packer.packMapHeader(1); - packer.packInt(30); // 30 = FanjianDeleteNormalize + packer.packInt(1); // 1 = None packer.packMapHeader(1); packer.packInt(1); packer.packString("hello&world"); packer.close(); - byte[] smt_word_map_bytes = packer.toByteArray(); + byte[] simple_table_bytes = packer.toByteArray(); - Matcher instance = Matcher.INSTANCE; + MatcherJava instance = MatcherJava.INSTANCE; - Pointer simple_matcher = instance.init_simple_matcher(smt_word_map_bytes); + Pointer simple_matcher = instance.init_simple_matcher(simple_table_bytes); byte[] str_bytes = "hello,world".getBytes("utf-8"); byte[] c_str_bytes = new byte[str_bytes.length + 1]; @@ -95,7 +67,7 @@ public class Demo { boolean is_match = instance.simple_matcher_is_match(simple_matcher, c_str_bytes); System.out.printf("is_match: %s\n", is_match); - Pointer match_res_ptr = instance.simple_matcher_process(simple_matcher, c_str_bytes); + Pointer match_res_ptr = instance.simple_matcher_process_as_string(simple_matcher, c_str_bytes); String match_res = match_res_ptr.getString(0, "utf-8"); System.out.printf("match_res: %s\n", match_res); instance.drop_string(match_res_ptr); @@ -114,19 +86,19 @@ public class Demo { packer.packInt(1); packer.packString("match_table_type"); packer.packMapHeader(1); - packer.packString("simple_match_type"); - packer.packInt(30); // 30 = FanjianDeleteNormalize + packer.packString("process_type"); + packer.packInt(1); // 1 = None packer.packString("word_list"); packer.packArrayHeader(1); packer.packString("hello&world"); - packer.packString("exemption_simple_match_type"); + packer.packString("exemption_process_type"); packer.packInt(1); // 1 = None packer.packString("exemption_word_list"); packer.packArrayHeader(0); 
byte[] match_table_map_dict_bytes = packer.toByteArray(); packer.close(); - Matcher instance = Matcher.INSTANCE; + MatcherJava instance = MatcherJava.INSTANCE; Pointer matcher = instance.init_matcher(match_table_map_dict_bytes); @@ -137,10 +109,15 @@ public class Demo { boolean is_match = instance.matcher_is_match(matcher, c_str_bytes); System.out.printf("is_match: %s\n", is_match); - Pointer match_res_ptr = instance.matcher_word_match(matcher, c_str_bytes); - String match_res = match_res_ptr.getString(0, "utf-8"); - System.out.printf("match_res: %s\n", match_res); - instance.drop_string(match_res_ptr); + Pointer match_res_ptr_1 = instance.matcher_process_as_string(matcher, c_str_bytes); + String match_res_1 = match_res_ptr_1.getString(0, "utf-8"); + System.out.printf("match_res: %s\n", match_res_1); + instance.drop_string(match_res_ptr_1); + + Pointer match_res_ptr_2 = instance.matcher_word_match_as_string(matcher, c_str_bytes); + String match_res_2 = match_res_ptr_2.getString(0, "utf-8"); + System.out.printf("match_res: %s\n", match_res_2); + instance.drop_string(match_res_ptr_2); instance.drop_matcher(matcher); } diff --git a/matcher_java/src/main/java/com/matcher_java/MatcherJava.java b/matcher_java/src/main/java/com/matcher_java/MatcherJava.java index 60c260e..6922ee4 100644 --- a/matcher_java/src/main/java/com/matcher_java/MatcherJava.java +++ b/matcher_java/src/main/java/com/matcher_java/MatcherJava.java @@ -13,17 +13,17 @@ interface MatcherJava extends Library { boolean matcher_is_match(Pointer matcher, byte[] text_bytes); - Pointer matcher_process(Pointer matcher, byte[] text_bytes); + Pointer matcher_process_as_string(Pointer matcher, byte[] text_bytes); - Pointer matcher_word_match(Pointer matcher, byte[] text_bytes); + Pointer matcher_word_match_as_string(Pointer matcher, byte[] text_bytes); void drop_matcher(Pointer matcher); - Pointer init_simple_matcher(byte[] smt_word_map_bytes); + Pointer init_simple_matcher(byte[] simple_table_bytes); boolean 
simple_matcher_is_match(Pointer simple_matcher, byte[] text_bytes); - Pointer simple_matcher_process(Pointer simple_matcher, byte[] text_bytes); + Pointer simple_matcher_process_as_string(Pointer simple_matcher, byte[] text_bytes); void drop_simple_matcher(Pointer simple_matcher); diff --git a/matcher_java/src/test/java/com/matcher_java/MatcherJavaExample.java b/matcher_java/src/test/java/com/matcher_java/MatcherJavaExample.java index 7c54645..4d5ab1a 100644 --- a/matcher_java/src/test/java/com/matcher_java/MatcherJavaExample.java +++ b/matcher_java/src/test/java/com/matcher_java/MatcherJavaExample.java @@ -19,17 +19,17 @@ public static void main(String[] args) throws IOException { public static void simple_matcher_process_demo() throws IOException { MessageBufferPacker packer = MessagePack.newDefaultBufferPacker(); packer.packMapHeader(1); - packer.packInt(30); // 30 = FanjianDeleteNormalize + packer.packInt(1); // 1 = None packer.packMapHeader(1); packer.packInt(1); packer.packString("hello&world"); packer.close(); - byte[] smt_word_map_bytes = packer.toByteArray(); + byte[] simple_table_bytes = packer.toByteArray(); MatcherJava instance = MatcherJava.INSTANCE; - Pointer simple_matcher = instance.init_simple_matcher(smt_word_map_bytes); + Pointer simple_matcher = instance.init_simple_matcher(simple_table_bytes); byte[] str_bytes = "hello,world".getBytes("utf-8"); byte[] c_str_bytes = new byte[str_bytes.length + 1]; @@ -38,7 +38,7 @@ public static void simple_matcher_process_demo() throws IOException { boolean is_match = instance.simple_matcher_is_match(simple_matcher, c_str_bytes); System.out.printf("is_match: %s\n", is_match); - Pointer match_res_ptr = instance.simple_matcher_process(simple_matcher, c_str_bytes); + Pointer match_res_ptr = instance.simple_matcher_process_as_string(simple_matcher, c_str_bytes); String match_res = match_res_ptr.getString(0, "utf-8"); System.out.printf("match_res: %s\n", match_res); instance.drop_string(match_res_ptr); @@ -57,12 
+57,12 @@ public static void matcher_process_demo() throws IOException { packer.packInt(1); packer.packString("match_table_type"); packer.packMapHeader(1); - packer.packString("simple_match_type"); - packer.packInt(30); // 30 = FanjianDeleteNormalize + packer.packString("process_type"); + packer.packInt(1); // 1 = None packer.packString("word_list"); packer.packArrayHeader(1); packer.packString("hello&world"); - packer.packString("exemption_simple_match_type"); + packer.packString("exemption_process_type"); packer.packInt(1); // 1 = None packer.packString("exemption_word_list"); packer.packArrayHeader(0); @@ -80,10 +80,15 @@ public static void matcher_process_demo() throws IOException { boolean is_match = instance.matcher_is_match(matcher, c_str_bytes); System.out.printf("is_match: %s\n", is_match); - Pointer match_res_ptr = instance.matcher_word_match(matcher, c_str_bytes); - String match_res = match_res_ptr.getString(0, "utf-8"); - System.out.printf("match_res: %s\n", match_res); - instance.drop_string(match_res_ptr); + Pointer match_res_ptr_1 = instance.matcher_process_as_string(matcher, c_str_bytes); + String match_res_1 = match_res_ptr_1.getString(0, "utf-8"); + System.out.printf("match_res: %s\n", match_res_1); + instance.drop_string(match_res_ptr_1); + + Pointer match_res_ptr_2 = instance.matcher_word_match_as_string(matcher, c_str_bytes); + String match_res_2 = match_res_ptr_2.getString(0, "utf-8"); + System.out.printf("match_res: %s\n", match_res_2); + instance.drop_string(match_res_ptr_2); instance.drop_matcher(matcher); } diff --git a/matcher_py/Cargo.toml b/matcher_py/Cargo.toml index 6c64510..6d0ccbf 100644 --- a/matcher_py/Cargo.toml +++ b/matcher_py/Cargo.toml @@ -20,9 +20,8 @@ crate-type = ["cdylib"] [dependencies] matcher_rs = { path = "../matcher_rs", version = "0.4.6" } -numpy = "0.21.0" -pyo3 = { version = "0.21.2", features = ["extension-module", "serde"] } +pyo3 = { version = "0.22.2", features = ["extension-module"] } rmp-serde = "1.3.0" 
[build-dependencies] -pyo3-build-config = "0.21.2" +pyo3-build-config = "0.22.2" diff --git a/matcher_py/README.md b/matcher_py/README.md index 324c0c8..c9817dd 100644 --- a/matcher_py/README.md +++ b/matcher_py/README.md @@ -48,23 +48,22 @@ The `msgspec` library is recommended for serializing the matcher configuration d ### Explanation of the configuration * `Matcher`'s configuration is defined by the `MatchTableMap = Dict[int, List[MatchTable]]` type, the key of `MatchTableMap` is called `match_id`, **for each `match_id`, the `table_id` inside is required to be unique**. -* `SimpleMatcher`'s configuration is defined by the `SimpleMatchTableMap = Dict[SimpleMatchType, Dict[int, str]]` type, the value `Dict[int, str]`'s key is called `word_id`, **`word_id` is required to be globally unique**. +* `SimpleMatcher`'s configuration is defined by the `SimpleTable = Dict[ProcessType, Dict[int, str]]` type, the value `Dict[int, str]`'s key is called `word_id`, **`word_id` is required to be globally unique**. #### MatchTable * `table_id`: The unique ID of the match table. * `match_table_type`: The type of the match table. * `word_list`: The word list of the match table. -* `exemption_simple_match_type`: The type of the exemption simple match. +* `exemption_process_type`: The type of the exemption simple match. * `exemption_word_list`: The exemption word list of the match table. For each match table, word matching is performed over the `word_list`, and exemption word matching is performed over the `exemption_word_list`. If the exemption word matching result is True, the word matching result will be False. #### MatchTableType -* `Simple`: Supports simple multiple patterns matching with text normalization defined by `simple_match_type`. - * We offer transformation methods for text normalization, including `Fanjian`, `Normalize`, `PinYin` ···. 
- * It can handle combination patterns and repeated times sensitive matching, delimited by `&`, such as `hello&world&hello` will match `hellohelloworld` and `worldhellohello`, but not `helloworld` due to the repeated times of `hello`. +* `Simple`: Supports simple multiple patterns matching with text normalization defined by `process_type`. + * It can handle combination patterns and repeated times sensitive matching, delimited by `&` and `~`, such as `hello&world&hello` will match `hellohelloworld` and `worldhellohello`, but not `helloworld` due to the repeated times of `hello`. * `Regex`: Supports regex patterns matching. * `SimilarChar`: Supports similar character matching using regex. * `["hello,hallo,hollo,hi", "word,world,wrd,🌍", "!,?,~"]` will match `helloworld!`, `hollowrd?`, `hi🌍~` ··· any combinations of the words split by `,` in the list. @@ -74,27 +73,23 @@ For each match table, word matching is performed over the `word_list`, and exemp * `["h[aeiou]llo", "w[aeiou]rd"]` will match `hello`, `world`, `hillo`, `wurld` ··· any text that matches the regex in the list. * `Similar`: Supports similar text matching based on distance and threshold. * `Levenshtein`: Supports similar text matching based on Levenshtein distance. - * `DamerauLevenshtein`: Supports similar text matching based on Damerau-Levenshtein distance. - * `Indel`: Supports similar text matching based on Indel distance. - * `Jaro`: Supports similar text matching based on Jaro distance. - * `JaroWinkler`: Supports similar text matching based on Jaro-Winkler distance. -#### SimpleMatchType +#### ProcessType * `None`: No transformation. -* `Fanjian`: Traditional Chinese to simplified Chinese transformation. Based on [FANJIAN](../matcher_rs/str_conv/FANJIAN.txt). +* `Fanjian`: Traditional Chinese to simplified Chinese transformation. Based on [FANJIAN](./process_map/FANJIAN.txt). * `妳好` -> `你好` * `現⾝` -> `现身` -* `Delete`: Delete all punctuation, special characters and white spaces. 
+* `Delete`: Delete all punctuation, special characters and white spaces. Based on [TEXT_DELETE](./process_map/TEXT-DELETE.txt) and `WHITE_SPACE`. * `hello, world!` -> `helloworld` * `《你∷好》` -> `你好` -* `Normalize`: Normalize all English character variations and number variations to basic characters. Based on [SYMBOL_NORM](../matcher_rs/str_conv/SYMBOL-NORM.txt), [NORM](../matcher_rs/str_conv/NORM.txt) and [NUM_NORM](../matcher_rs/str_conv/NUM-NORM.txt). +* `Normalize`: Normalize all English character variations and number variations to basic characters. Based on [SYMBOL_NORM](./process_map/SYMBOL-NORM.txt), [NORM](./process_map/NORM.txt) and [NUM_NORM](./process_map/NUM-NORM.txt). * `ℋЀ⒈㈠Õ` -> `he11o` * `⒈Ƨ㊂` -> `123` -* `PinYin`: Convert all unicode Chinese characters to pinyin with boundaries. Based on [PINYIN](../matcher_rs/str_conv/PINYIN.txt). +* `PinYin`: Convert all unicode Chinese characters to pinyin with boundaries. Based on [PINYIN](./process_map/PINYIN.txt). * `你好` -> ` ni hao ` * `西安` -> ` xi an ` -* `PinYinChar`: Convert all unicode Chinese characters to pinyin without boundaries. Based on [PINYIN](../matcher_rs/str_conv/PINYIN.txt). +* `PinYinChar`: Convert all unicode Chinese characters to pinyin without boundaries. Based on [PINYIN](./process_map/PINYIN.txt). * `你好` -> `nihao` * `西安` -> `xian` @@ -102,20 +97,16 @@ You can combine these transformations as needed. Pre-defined combinations like ` Avoid combining `PinYin` and `PinYinChar` due to that `PinYin` is a more limited version of `PinYinChar`, in some cases like `xian`, can be treat as two words `xi` and `an`, or only one word `xian`. -`Delete` is technologically a combination of `TextDelete` and `WordDelete`, we implement different delete methods for text and word. 'Cause we believe special characters are parts of the word, users put them in words deliberately, but not for text. For `text_process` and `reduce_text_process` functions, users should use `TextDelete` instead of `WordDelete`. 
-* `WordDelete`: Delete all patterns in `WHITE_SPACE`. -* `TextDelete`: Delete all patterns in [TEXT_DELETE](../matcher_rs/str_conv/TEXT-DELETE.txt). - ### Text Process Usage Here’s an example of how to use the `reduce_text_process` and `text_process` functions: ```python from matcher_py import reduce_text_process, text_process -from matcher_py.extension_types import SimpleMatchType +from matcher_py.extension_types import ProcessType -print(reduce_text_process(SimpleMatchType.MatchTextDelete | SimpleMatchType.MatchNormalize, "hello, world!")) -print(text_process(SimpleMatchType.MatchTextDelete, "hello, world!")) +print(reduce_text_process(ProcessType.MatchTextDelete | ProcessType.MatchNormalize, "hello, world!")) +print(text_process(ProcessType.MatchTextDelete, "hello, world!")) ``` ### Matcher Basic Usage Here’s an example of how to use the `Matcher`: ```python import msgspec -import numpy as np + from matcher_py import Matcher -from matcher_py.extension_types import MatchTable, MatchTableType, SimpleMatchType +from matcher_py.extension_types import MatchTable, MatchTableType, ProcessType msgpack_encoder = msgspec.msgpack.Encoder() matcher = Matcher( @@ -134,9 +125,9 @@ matcher = Matcher( 1: [ MatchTable( table_id=1, - match_table_type=MatchTableType.Simple(simple_match_type = SimpleMatchType.MatchFanjianDeleteNormalize), + match_table_type=MatchTableType.Simple(process_type = ProcessType.MatchFanjianDeleteNormalize), word_list=["hello", "world"], - exemption_simple_match_type=SimpleMatchType.MatchNone, + exemption_process_type=ProcessType.MatchNone, exemption_word_list=["word"], ) ] @@ -145,27 +136,14 @@ matcher = Matcher( # Check if a text matches assert matcher.is_match("hello") assert not matcher.is_match("hello, word") +# Perform process as a list +result = matcher.process("hello") +assert result == "[{\"match_id\":1,\"table_id\":1,\"word_id\":1,\"word\":\"hello\",\"similarity\":1}]" # Perform word matching as a dict assert 
matcher.word_match(r"hello, world")[1] # Perform word matching as a string result = matcher.word_match_as_string("hello") -assert result == """{1:[{\"match_id\":1,\"table_id\":1,\"word\":\"hello\"}]"}""" -# Perform batch processing as a dict using a list -text_list = ["hello", "world", "hello,word"] -batch_results = matcher.batch_word_match(text_list) -print(batch_results) -# Perform batch processing as a string using a list -text_list = ["hello", "world", "hello,word"] -batch_results = matcher.batch_word_match_as_string(text_list) -print(batch_results) -# Perform batch processing as a dict using a numpy array -text_array = np.array(["hello", "world", "hello,word"], dtype=np.dtype("object")) -numpy_results = matcher.numpy_word_match(text_array) -print(numpy_results) -# Perform batch processing as a string using a numpy array -text_array = np.array(["hello", "world", "hello,word"], dtype=np.dtype("object")) -numpy_results = matcher.numpy_word_match_as_string(text_array) -print(numpy_results) +assert result == """{1:[{\"match_id\":1,\"table_id\":1,\"word_id\":1,\"word\":\"hello\",\"similarity\":1}]}""" ``` ### Simple Matcher Basic Usage @@ -176,25 +154,17 @@ Here’s an example of how to use the `SimpleMatcher`: import msgspec import numpy as np from matcher_py import SimpleMatcher -from matcher_py.extension_types import SimpleMatchType +from matcher_py.extension_types import ProcessType msgpack_encoder = msgspec.msgpack.Encoder() simple_matcher = SimpleMatcher( - msgpack_encoder.encode({SimpleMatchType.MatchNone: {1: "example"}}) + msgpack_encoder.encode({ProcessType.MatchNone: {1: "example"}}) ) # Check if a text matches assert simple_matcher.is_match("example") # Perform simple processing -results = simple_matcher.simple_process("example") -print(results) -# Perform batch processing using a list -text_list = ["example", "test", "example test"] -batch_results = simple_matcher.batch_simple_process(text_list) -print(batch_results) -# Perform batch processing using a 
NumPy array -text_array = np.array(["example", "test", "example test"], dtype=np.dtype("object")) -numpy_results = simple_matcher.numpy_simple_process(text_array) -print(numpy_results) +result = simple_matcher.process("example") +assert result == "[{\"word_id\":1,\"word\":\"example\"}]" ``` ## Contributing diff --git a/matcher_py/pyproject.toml b/matcher_py/pyproject.toml index 760a7d5..de729d6 100644 --- a/matcher_py/pyproject.toml +++ b/matcher_py/pyproject.toml @@ -5,7 +5,6 @@ version = "0.4.6" readme = "README.md" requires-python = ">=3.8" authors = [{ name = 'Fuji Guo', email = "f975793771@gmail.com" }] -dependencies = ["numpy>=1.16", "msgspec"] classifiers = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", @@ -36,8 +35,11 @@ build-backend = "maturin" [tool.rye] managed = true dev-dependencies = [ - "pytest>=8.2.2", - "pip>=24.1.2", + "cffi", + "ipykernel", + "msgspec", + "pytest", + "pip", ] [tool.maturin] diff --git a/matcher_py/python/matcher_py/extension_types.py b/matcher_py/python/matcher_py/extension_types.py index 412c932..641c290 100644 --- a/matcher_py/python/matcher_py/extension_types.py +++ b/matcher_py/python/matcher_py/extension_types.py @@ -1,155 +1,85 @@ from enum import Enum, IntFlag -from typing import Dict, List +from typing import Dict, List, TypedDict -import msgspec - - -class SimpleMatchType(IntFlag): - """ - IntFlag representing different simple match types. - - Attributes: - MatchNone (int): A match type indicating no specific match criteria (0b00000001). - MatchFanjian (int): A match type for matching between traditional and simplified Chinese characters (0b00000010). - MatchWordDelete (int): A match type where words are deleted for matching purposes (0b00000100). - MatchTextDelete (int): A match type where text is deleted for matching purposes (0b00001000). - MatchDelete (int): A combined match type where both word and text deletions are applied (0b00001100). 
- MatchNormalize (int): A match type where text normalization is applied (0b00010000). - MatchDeleteNormalize (int): A combined match type where deletion and normalization are both applied (0b00011100). - MatchFanjianDeleteNormalize (int): A combined match type that includes Fanjian matching, deletion, and normalization (0b00011110). - MatchPinYin (int): A match type using Pinyin for matching Chinese characters (0b00100000). - MatchPinYinChar (int): A match type using individual Pinyin characters for a finer granularity match (0b01000000). - """ +class ProcessType(IntFlag): MatchNone = 0b00000001 MatchFanjian = 0b00000010 - MatchWordDelete = 0b00000100 - MatchTextDelete = 0b00001000 - MatchDelete = 0b00001100 - MatchNormalize = 0b00010000 - MatchDeleteNormalize = 0b00011100 - MatchFanjianDeleteNormalize = 0b00011110 - MatchPinYin = 0b00100000 - MatchPinYinChar = 0b01000000 + MatchDelete = 0b00000100 + MatchNormalize = 0b00001000 + MatchDeleteNormalize = 0b00001100 + MatchFanjianDeleteNormalize = 0b00001110 + MatchPinYin = 0b00010000 + MatchPinYinChar = 0b00100000 class RegexMatchType(Enum): - """ - Enum representing different regex match types. - - Attributes: - MatchSimilarChar (str): A match type that finds characters similar to a given set ("similar_char"). - MatchAcrostic (str): A match type that looks for acrostic patterns ("acrostic"). - MatchRegex (str): A match type that uses regular expressions for matching ("regex"). - """ - MatchSimilarChar = "similar_char" MatchAcrostic = "acrostic" MatchRegex = "regex" class SimMatchType(Enum): - """ - Enum representing different similarity match types. - - Attributes: - MatchLevenshtein (str): A match type using the Levenshtein distance algorithm for measuring the difference between two sequences ("levenshtein"). - MatchDamrauLevenshtein (str): A match type using the Damerau-Levenshtein distance algorithm, an extension of Levenshtein with transpositions allowed ("damrau_levenshtein"). 
- MatchIndel (str): A match type that uses insertion and deletion operations for matching purposes ("indel"). - MatchJaro (str): A match type using the Jaro distance algorithm to compare the similarity between two strings ("jaro"). - MatchJaroWinkler (str): A match type using the Jaro-Winkler distance algorithm, an extension of Jaro with added weight for matching starting characters ("jaro_winkler"). - """ - MatchLevenshtein = "levenshtein" - MatchDamrauLevenshtein = "damrau_levenshtein" - MatchIndel = "indel" - MatchJaro = "jaro" - MatchJaroWinkler = "jaro_winkler" - - -class Simple(msgspec.Struct): - """ - Represents a simple match configuration. - - Attributes: - simple_match_type (SimpleMatchType): The type of simple match to be used, as defined in SimpleMatchType. - """ - - simple_match_type: SimpleMatchType -class Regex(msgspec.Struct): - """ - Represents a regular expression match configuration. +class Simple(TypedDict): + process_type: ProcessType - Attributes: - regex_match_type (RegexMatchType): The type of regular expression match to be used, as defined in RegexMatchType. - """ +class Regex(TypedDict): + process_type: ProcessType regex_match_type: RegexMatchType -class Similar(msgspec.Struct): - """ - Represents a similarity match configuration. - - Attributes: - sim_match_type (SimMatchType): The type of similarity match to be used, as defined in SimMatchType. - threshold (float): The threshold value for the similarity match. This value determines the minimum similarity score required for a match to be considered successful. - """ - +class Similar(TypedDict): + process_type: ProcessType sim_match_type: SimMatchType threshold: float class MatchTableType: - """ - A class representing different types of match tables. - - Attributes: - Simple (Simple): Represents a simple match configuration. - Regex (Regex): Represents a regular expression match configuration. - Similar (Similar): Represents a similarity match configuration. 
- """ - - Simple = Simple - Regex = Regex - Similar = Similar - - -class MatchTable(msgspec.Struct): - """ - Represents a match table configuration with various match types. - - Attributes: - table_id (int): The unique identifier for the match table. - match_table_type (MatchTableType): The type of match table, can be one of Simple, Regex, or Similar as defined in MatchTableType. - word_list (List[str]): A list of words to be used for matching. - exemption_simple_match_type (SimpleMatchType): Specifies which simple match type(s) to exempt from the match operation. - exemption_word_list (List[str]): A list of words to exempt from the match operation. - """ - + def Simple(process_type: ProcessType) -> Dict[str, Simple]: + return {"simple": Simple(process_type=process_type)} + + def Regex( + process_type: ProcessType, regex_match_type: RegexMatchType + ) -> Dict[str, Regex]: + return { + "regex": Regex(process_type=process_type, regex_match_type=regex_match_type) + } + + def Similar( + process_type: ProcessType, sim_match_type: SimMatchType, threshold: float + ) -> Dict[str, Similar]: + return { + "similar": Similar( + process_type=process_type, + sim_match_type=sim_match_type, + threshold=threshold, + ) + } + + +class MatchTable(TypedDict): table_id: int match_table_type: MatchTableType word_list: List[str] - exemption_simple_match_type: SimpleMatchType + exemption_process_type: ProcessType exemption_word_list: List[str] MatchTableMap = Dict[int, List[MatchTable]] -class MatchResult(msgspec.Struct): +class MatchResult(TypedDict): table_id: int word: str -MatcherMatchResult = Dict[str, List[MatchResult]] +SimpleTable = Dict[ProcessType, Dict[int, str]] -class SimpleResult(msgspec.Struct): +class SimpleResult(TypedDict): word_id: int word: str - - -SimpleMatchTypeWordMap = Dict[SimpleMatchType, Dict[int, str]] diff --git a/matcher_py/python/matcher_py/matcher_py.pyi b/matcher_py/python/matcher_py/matcher_py.pyi index 072dc39..54d240f 100644 --- 
a/matcher_py/python/matcher_py/matcher_py.pyi +++ b/matcher_py/python/matcher_py/matcher_py.pyi @@ -1,137 +1,23 @@ -from typing import Dict, List, Optional, Tuple, TypedDict +from typing import Dict, List +from .extension_types import SimpleResult, MatchResult -import numpy as np - -class SimpleResult(TypedDict): - word_id: int - word: str - -class MatchResult(TypedDict): - table_id: int - word: str - -def text_process(simple_match_type: int, text: str) -> str: - """ - Processes the provided text based on the specified match type. - - Args: - simple_match_type (int): An integer representing the type of match to perform. - text (str): The input text to be processed. - - Returns: - str: The processed text as a single string. - """ - ... - -def reduce_text_process(simple_match_type: int, text: str) -> List[str]: - """ - Reduces the provided text based on the specified match type. - - Args: - simple_match_type (int): An integer representing the type of match to perform. - text (str): The input text to be processed. - - Returns: - List[str]: A list of strings representing the processed text. - """ - ... +def text_process(process_type: int, text: str) -> str: ... +def reduce_text_process(process_type: int, text: str) -> List[str]: ... class Matcher: - """ - Matcher class is designed to perform text matching operations using a provided match table map - in byte format. It includes functionalities for detecting matches, processing single text inputs, - and batch processing multiple text inputs in list and NumPy array formats. Additional methods - provide results in both dictionary and string formats. - - Methods: - __init__(self, match_table_map_bytes: bytes) -> None: - Initializes the Matcher instance with the provided byte array representing - the match table map. - __getnewargs__(self) -> Tuple[bytes, str, str]: - Retrieves the arguments necessary to create a new instance of Matcher. - __getstate__(self) -> Dict: - Gets the state of the Matcher instance as a dictionary. 
- __setstate__(self, state_dict: Dict): - Sets the state of the Matcher instance from the provided dictionary. - is_match(self, text: str) -> bool: - Checks if the provided text matches any word in the match table map. - process(self, text: str) -> List[MatchResult]: - Processes the provided text and returns a list of MatchResult dictionaries. - word_match(self, text: str) -> Dict[int, List[MatchResult]]: - Processes the provided text, matching words and returning a dictionary where - the keys are word IDs and the values are lists of MatchResult dictionaries. - word_match_as_string(self, text: str) -> str: - Processes the provided text and returns matching words as a formatted string. - batch_word_match(self, text_array: List[str]) -> List[Dict[int, str]]: - Processes a list of texts, matching words in each text and returning a list of - dictionaries where the keys are word IDs and the values are matching words as strings. - batch_word_match_as_string(self, text_array: List[str]) -> List[str]: - Processes a list of texts and returns matching words for each text as formatted strings. - numpy_word_match(self, text_array: np.ndarray, inplace=False) -> Optional[np.ndarray]: - Processes a NumPy array of texts, matching words in each text and returning a NumPy - array of dictionaries where the keys are word IDs and the values are lists of MatchResult - dictionaries. If inplace is True, the operation is performed in-place. - numpy_word_match_as_string(self, text_array: np.ndarray, inplace=False) -> Optional[np.ndarray]: - Processes a NumPy array of texts and returns matching words for each text as formatted - strings in a NumPy array. If inplace is True, the operation is performed in-place. - """ def __init__(self, match_table_map_bytes: bytes) -> None: ... - def __getnewargs__(self) -> Tuple[bytes, str, str]: ... - def __getstate__(self) -> Dict: ... - def __setstate__(self, state_dict: Dict): ... + def __getnewargs__(self) -> bytes: ... 
+ def __getstate__(self) -> bytes: ... + def __setstate__(self, match_table_map_bytes: bytes): ... def is_match(self, text: str) -> bool: ... def process(self, text: str) -> List[MatchResult]: ... def word_match(self, text: str) -> Dict[int, List[MatchResult]]: ... def word_match_as_string(self, text: str) -> str: ... - def batch_word_match(self, text_array: List[str]) -> List[Dict[int, str]]: ... - def batch_word_match_as_string(self, text_array: List[str]) -> List[str]: ... - def numpy_word_match( - self, text_array: np.ndarray, inplace=False - ) -> Optional[np.ndarray]: ... - def numpy_word_match_as_string( - self, text_array: np.ndarray, inplace=False - ) -> Optional[np.ndarray]: ... class SimpleMatcher: - """ - SimpleMatcher class is designed to perform basic text matching and processing - operations using a provided word list dictionary in byte format. It offers functionalities - for detecting matches, processing single text inputs, and batch processing multiple text - inputs both in list and NumPy array formats. - - Methods: - __init__(self, simple_wordlist_dict_bytes: bytes) -> None: - Initializes the SimpleMatcher instance with the provided byte array representing - the simple word list dictionary. - __getnewargs__(self) -> bytes: - Retrieves the arguments necessary to create a new instance of SimpleMatcher. - __getstate__(self) -> bytes: - Gets the state of the SimpleMatcher instance as a byte array. - __setstate__(self, simple_wordlist_dict_bytes: bytes): - Sets the state of the SimpleMatcher instance from the provided byte array - representing the word list dictionary. - is_match(self, text: str) -> bool: - Checks if the provided text matches any word in the simple word list. - simple_process(self, text: str) -> List[SimpleResult]: - Processes the provided text, matching words and returning a list of SimpleResult - dictionaries representing the matches. 
- batch_simple_process(self, text_array: List[str]) -> List[List[SimpleResult]]: - Processes a list of texts, matching words in each text and returning a list of lists - of SimpleResult dictionaries representing the matches. - numpy_simple_process(self, text_array: np.ndarray, inplace=False) -> Optional[np.ndarray]: - Processes a NumPy array of texts, matching words in each text and returning a NumPy - array of lists of SimpleResult dictionaries representing the matches. If inplace is True, - the operation is performed in-place. - """ - def __init__(self, simple_wordlist_dict_bytes: bytes) -> None: ... + def __init__(self, simple_table_bytes: bytes) -> None: ... def __getnewargs__(self) -> bytes: ... def __getstate__(self) -> bytes: ... - def __setstate__(self, simple_wordlist_dict_bytes: bytes): ... + def __setstate__(self, simple_table_bytes: bytes): ... def is_match(self, text: str) -> bool: ... - def simple_process(self, text: str) -> List[SimpleResult]: ... - def batch_simple_process( - self, text_array: List[str] - ) -> List[List[SimpleResult]]: ... - def numpy_simple_process( - self, text_array: np.ndarray, inplace=False - ) -> Optional[np.ndarray]: ... + def process(self, text: str) -> List[SimpleResult]: ... diff --git a/matcher_py/requirements-dev.lock b/matcher_py/requirements-dev.lock index 769bad2..46892db 100644 --- a/matcher_py/requirements-dev.lock +++ b/matcher_py/requirements-dev.lock @@ -10,15 +10,15 @@ # universal: false -e file:. +cffi==1.16.0 iniconfig==2.0.0 # via pytest msgspec==0.18.6 - # via matcher-py -numpy==2.0.0 - # via matcher-py packaging==24.1 # via pytest pip==24.1.2 pluggy==1.5.0 # via pytest +pycparser==2.22 + # via cffi pytest==8.2.2 diff --git a/matcher_py/requirements.lock b/matcher_py/requirements.lock index 8c497ab..505fd45 100644 --- a/matcher_py/requirements.lock +++ b/matcher_py/requirements.lock @@ -10,7 +10,3 @@ # universal: false -e file:. 
-msgspec==0.18.6 - # via matcher-py -numpy==2.0.0 - # via matcher-py diff --git a/matcher_py/src/lib.rs b/matcher_py/src/lib.rs index 10c4225..de87ecf 100644 --- a/matcher_py/src/lib.rs +++ b/matcher_py/src/lib.rs @@ -1,72 +1,23 @@ use std::borrow::Cow; use std::collections::HashMap; -use numpy::{PyArray1, PyArrayMethods}; use pyo3::exceptions::PyValueError; use pyo3::prelude::{ - pyclass, pymethods, pymodule, wrap_pyfunction, Py, PyModule, PyObject, PyResult, Python, -}; -use pyo3::types::{ - PyAnyMethods, PyDict, PyDictMethods, PyList, PyListMethods, PyString, PyStringMethods, + pyclass, pymethods, pymodule, wrap_pyfunction, PyModule, PyObject, PyResult, Python, }; +use pyo3::types::{PyDict, PyDictMethods, PyModuleMethods}; use pyo3::{intern, pyfunction, Bound, IntoPy}; use matcher_rs::{ reduce_text_process as reduce_text_process_rs, text_process as text_process_rs, - MatchResult as MatchResultRs, MatchResultTrait, MatchTableMap as MatchTableMapRs, - Matcher as MatcherRs, SimpleMatchType, SimpleMatchTypeWordMap as SimpleMatchTypeWordMapRs, - SimpleMatcher as SimpleMatcherRs, SimpleResult as SimpleResultRs, TextMatcherTrait, + MatchResult as MatchResultRs, MatchTableMap as MatchTableMapRs, Matcher as MatcherRs, + ProcessType, SimpleMatcher as SimpleMatcherRs, SimpleResult as SimpleResultRs, + SimpleTable as SimpleTableRs, TextMatcherTrait, }; -/// A struct that wraps around the [SimpleResultRs] struct from the [matcher_rs] crate. -/// -/// This struct serves as a bridge between the Rust and Python representations of match results. -/// It encapsulates a [SimpleResultRs] instance and provides necessary implementations -/// for conversion and trait compliance, facilitating its use in Python. -/// -/// # Lifetime Parameters -/// - `'a`: The lifetime parameter that corresponds to the lifetime of the encapsulated -/// [SimpleResultRs] instance. 
-/// -/// # Example -/// ```no_run -/// use std::borrow::Cow; -/// -/// use matcher_py::*; -/// use matcher_rs::SimpleResult as SimpleResultRs; -/// -/// let simple_result_rs = SimpleResultRs { -/// word_id: 1, -/// word: Cow::borrowed("example"), -/// }; -/// let simple_result = SimpleResult(simple_result_rs); -/// -/// assert_eq!(simple_result.0.word_id, 1); -/// assert_eq!(simple_result.0.word, "example"); -/// ``` struct SimpleResult<'a>(SimpleResultRs<'a>); impl<'a> IntoPy for SimpleResult<'a> { - /// Converts a [SimpleResult] instance into a Python dictionary [PyObject]. - /// - /// This implementation of the [IntoPy] trait allows for converting a [SimpleResult] - /// into a Python dictionary containing the match result data, which can be used - /// in Python code. The dictionary includes the following key-value pairs: - /// - /// - `"word_id"`: The unique identifier (u32) for the matched word. - /// - `"word"`: The matched word as a string slice. - /// - /// # Parameters - /// - `self`: The [SimpleResult] instance to be converted. - /// - `py`: The Python interpreter state. - /// - /// # Returns - /// - [PyObject]: A Python dictionary containing the match result data. - /// - /// # Panics - /// Panics if setting a dictionary item fails. Although highly unlikely, - /// failures might occur due to memory issues or internal Python state inconsistencies. - /// ```text fn into_py(self, py: Python<'_>) -> PyObject { let dict = PyDict::new_bound(py); @@ -79,39 +30,9 @@ impl<'a> IntoPy for SimpleResult<'a> { } } -impl MatchResultTrait<'_> for SimpleResult<'_> { - fn word_id(&self) -> u32 { - self.0.word_id() - } - fn word(&self) -> &str { - self.0.word.as_ref() - } -} - struct MatchResult<'a>(MatchResultRs<'a>); impl<'a> IntoPy for MatchResult<'a> { - /// Converts a [MatchResult] instance into a Python dictionary [PyObject]. 
- /// - /// This implementation of the [IntoPy] trait allows for converting a [MatchResult] - /// into a Python dictionary containing the match result data, which can be used - /// in Python code. The dictionary includes the following key-value pairs: - /// - /// - `"match_id"`: The unique identifier (u32) for the match. - /// - `"table_id"`: The unique identifier (u32) for the table. - /// - `"word"`: The matched word as a string slice. - /// - /// # Parameters - /// - `self`: The [MatchResult] instance to be converted. - /// - `py`: The Python interpreter state. - /// - /// # Returns - /// - [PyObject]: A Python dictionary containing the match result data. - /// - /// # Panics - /// Panics if setting a dictionary item fails. Although highly unlikely, - /// failures might occur due to memory issues or internal Python state inconsistencies. - /// ```text fn into_py(self, py: Python<'_>) -> PyObject { let dict = PyDict::new_bound(py); @@ -126,132 +47,25 @@ impl<'a> IntoPy for MatchResult<'a> { } } -/// Processes text using a specified simple match type. -/// -/// This function applies a text processing operation based on the provided simple match type, -/// which is an enumeration representing different types of simple matches. -/// The function is directly linked to the `text_process_rs` function from the -/// `matcher_rs` crate, which performs the actual processing logic. -/// -/// # Parameters -/// - `simple_match_type` (u8): A byte value that corresponds to a specific type of simple match. -/// - `text` (&str): A string slice containing the text to be processed. -/// -/// # Returns -/// - [`PyResult>`]: On success, returns a `Cow` string representing the processed text. -/// On failure, returns a Python exception detailing the error. -/// -/// # Errors -/// This function will return a `PyValueError` if the text processing operation -/// in the `matcher_rs` crate fails, encapsulating the underlying error message. 
#[pyfunction] -#[pyo3(signature=(simple_match_type, text))] -fn text_process(simple_match_type: u8, text: &str) -> PyResult> { - let simple_match_type = - SimpleMatchType::from_bits(simple_match_type).unwrap_or(SimpleMatchType::None); - match text_process_rs(simple_match_type, text) { +#[pyo3(signature=(process_type, text))] +fn text_process(process_type: u8, text: &str) -> PyResult> { + let process_type = ProcessType::from_bits(process_type).unwrap_or(ProcessType::None); + match text_process_rs(process_type, text) { Ok(result) => Ok(result), Err(e) => Err(PyValueError::new_err(e)), } } -/// Reduces text using a specified simple match type. -/// -/// This function applies a text reduction process based on the provided simple match type, -/// which is an enumeration representing different types of simple matches. -/// The function is directly linked to the `reduce_text_process_rs` function from the -/// `matcher_rs` crate, which performs the actual reduction logic. -/// -/// # Parameters -/// - `simple_match_type` (u8): A byte value that corresponds to a specific type of simple match. -/// - `text` (&str): A string slice containing the text to be processed. -/// -/// # Returns -/// - [`Vec>`]: A vector of `Cow` strings representing the reduced text fragments. -/// -/// # Errors -/// This function will default to `SimpleMatchType::None` if the provided byte value does not -/// correspond to a valid `SimpleMatchType`. It will not raise an error in such cases but will -/// produce results based on the `SimpleMatchType::None`. 
#[pyfunction] -#[pyo3(signature=(simple_match_type, text))] -fn reduce_text_process(simple_match_type: u8, text: &str) -> Vec> { - let simple_match_type = - SimpleMatchType::from_bits(simple_match_type).unwrap_or(SimpleMatchType::None); - reduce_text_process_rs(simple_match_type, text) +#[pyo3(signature=(process_type, text))] +fn reduce_text_process(process_type: u8, text: &str) -> Vec> { + let process_type = ProcessType::from_bits(process_type).unwrap_or(ProcessType::None); + reduce_text_process_rs(process_type, text) .into_iter() .collect() } -/// A Python class that wraps the [MatcherRs] struct from the [matcher_rs] crate. -/// -/// This class provides functionality for text matching using a deserialized match table map. -/// It allows for single text matching, batch text processing using Python lists, and batch -/// processing using NumPy arrays. -/// -/// # Fields -/// - `matcher`: An instance of the [MatcherRs] struct that performs the core matching logic. -/// - `match_table_map_bytes`: A serialized byte array representing the match table map, -/// used for reconstructing the [MatcherRs] instance during deserialization. 
-/// -/// # Example -/// -/// ```python -/// import msgspec -/// import numpy as np -/// -/// from matcher_py import Matcher -/// from matcher_py.extension_types import MatchTable, MatchTableType, SimpleMatchType -/// -/// msgpack_encoder = msgspec.msgpack.Encoder() -/// -/// matcher = Matcher( -/// msgpack_encoder.encode( -/// { -/// 1: [ -/// MatchTable( -/// table_id=1, -/// match_table_type=MatchTableType.Simple(simple_match_type=SimpleMatchType.MatchNone), -/// word_list=["hello", "world"], -/// exemption_simple_match_type=SimpleMatchType.MatchNone, -/// exemption_word_list=["word"], -/// ) -/// ] -/// } -/// ) -/// ) -/// -/// # Check if a text matches -/// assert matcher.is_match("hello") -/// assert not matcher.is_match("hello, word") -/// -/// # Perform word matching as a dict -/// assert matcher.word_match(r"hello, world")[1] -/// -/// # Perform word matching as a string -/// result = matcher.word_match_as_string("hello") -/// assert result == """{1:"[{\\"table_id\\":1,\\"word\\":\\"hello\\"}]"}""" -/// -/// # Perform batch processing as a dict using a list -/// text_list = ["hello", "world", "hello,word"] -/// batch_results = matcher.batch_word_match(text_list) -/// print(batch_results) -/// -/// # Perform batch processing as a string using a list -/// text_list = ["hello", "world", "hello,word"] -/// batch_results = matcher.batch_word_match_as_string(text_list) -/// print(batch_results) -/// -/// # Perform batch processing as a dict using a numpy array -/// text_array = np.array(["hello", "world", "hello,word"], dtype=np.dtype("object")) -/// numpy_results = matcher.numpy_word_match(text_array) -/// print(numpy_results) -/// -/// # Perform batch processing as a string using a numpy array -/// text_array = np.array(["hello", "world", "hello,word"], dtype=np.dtype("object")) -/// numpy_results = matcher.numpy_word_match_as_string(text_array) -/// print(numpy_results) -/// ``` #[pyclass(module = "matcher_py")] struct Matcher { matcher: MatcherRs, @@ 
-260,50 +74,6 @@ struct Matcher { #[pymethods] impl Matcher { - /// Creates a new instance of the [Matcher] class from a serialized byte array. - /// - /// This constructor takes a serialized byte array representing a match table map, - /// deserializes it, and uses it to initialize the [Matcher] instance. If the - /// deserialization fails, an error is returned. - /// - /// # Parameters - /// - `match_table_map_bytes`: A reference to a byte slice containing the serialized - /// match table map data. - /// - /// # Returns - /// - [`PyResult`]: A result containing the newly created [Matcher] instance - /// if successful, or a [PyValueError] if deserialization fails. - /// - /// # Errors - /// Returns a [PyValueError] with an error message if the deserialization of the byte array - /// into a [MatchTableMapRs] fails. The error message includes details about the failure. - /// - /// # Example - /// - /// ```python - /// import msgspec - /// - /// from matcher_py import Matcher - /// from matcher_py.extension_types import MatchTable, MatchTableType, SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// - /// match_table_map_bytes = msgpack_encoder.encode( - /// { - /// 1: [ - /// MatchTable( - /// table_id=1, - /// match_table_type=MatchTableType.Simple(simple_match_type=SimpleMatchType.MatchNone), - /// word_list=["hello", "world"], - /// exemption_simple_match_type=SimpleMatchType.MatchNone, - /// exemption_word_list=["word"], - /// ) - /// ] - /// } - /// ) - /// - /// matcher = Matcher(match_table_map_bytes) - /// ``` #[new] #[pyo3(signature=(match_table_map_bytes))] fn new(match_table_map_bytes: &[u8]) -> PyResult { @@ -323,155 +93,14 @@ impl Matcher { }) } - /// Returns the arguments needed to create a new [Matcher] instance during unpickling. - /// - /// This method provides the byte array representing the match table map, which is - /// necessary to reconstruct the [Matcher] object when it is unpickled in Python. 
- /// - /// # Parameters - /// - `self`: The [Matcher] instance. - /// - /// # Returns - /// - `&[u8]`: A reference to the byte array containing the serialized match table map data. - /// - /// # Example - /// - /// ```python - /// import msgspec - /// - /// from matcher_py import Matcher - /// from matcher_py.extension_types import MatchTable, MatchTableType, SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// - /// matcher = Matcher( - /// msgpack_encoder.encode( - /// { - /// 1: [ - /// MatchTable( - /// table_id=1, - /// match_table_type=MatchTableType.Simple(simple_match_type=SimpleMatchType.MatchNone), - /// word_list=["hello", "world"], - /// exemption_simple_match_type=SimpleMatchType.MatchNone, - /// exemption_word_list=["word"], - /// ) - /// ] - /// } - /// ) - /// ) - /// - /// pickle_data = pickle.dumps(matcher) - /// unpickled_matcher = pickle.loads(pickle_data) - /// ``` fn __getnewargs__(&self) -> (&[u8],) { (&self.match_table_map_bytes,) } - /// Serializes the [Matcher] object's state for pickling. - /// - /// This method is called during the pickling process to extract the state of the - /// [Matcher] instance in the form of a byte array. This byte array represents the - /// match table map, which can be used to reconstruct the [Matcher] object during - /// unpickling. - /// - /// # Returns - /// - `&[u8]`: A reference to the byte array containing the serialized match table map data. 
- /// - /// # Example - /// - /// ```python - /// import pickle - /// import msgspec - /// - /// from matcher_py import Matcher - /// from matcher_py.extension_types import MatchTable, MatchTableType, SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// - /// matcher = Matcher( - /// msgpack_encoder.encode( - /// { - /// 1: [ - /// MatchTable( - /// table_id=1, - /// match_table_type=MatchTableType.Simple(simple_match_type=SimpleMatchType.MatchNone), - /// word_list=["hello", "world"], - /// exemption_simple_match_type=SimpleMatchType.MatchNone, - /// exemption_word_list=["word"], - /// ) - /// ] - /// } - /// ) - /// ) - /// - /// # Serialize the object to a byte array - /// pickle_data = pickle.dumps(matcher) - /// - /// # Deserialize the object from a byte array - /// unpickled_matcher = pickle.loads(pickle_data) - /// ``` fn __getstate__(&self) -> &[u8] { &self.match_table_map_bytes } - /// Restores the state of the [Matcher] object from a serialized byte array. - /// - /// This method is called during the unpickling process to restore the state of the - /// [Matcher] instance using the provided byte array. The byte array should represent - /// a serialized [MatchTableMapRs]. The method deserializes this byte array to reconstruct - /// the match table map and updates the `matcher` attribute accordingly. - /// - /// # Parameters - /// - `self`: The [Matcher] instance. - /// - `match_table_map_bytes`: A reference to the byte array containing the serialized - /// match table map data. 
- /// - /// # Example - /// - /// ```python - /// import pickle - /// import msgspec - /// - /// from matcher_py import Matcher - /// from matcher_py.extension_types import MatchTable, MatchTableType, SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// - /// matcher = Matcher( - /// msgpack_encoder.encode( - /// { - /// 1: [ - /// MatchTable( - /// table_id=1, - /// match_table_type=MatchTableType.Simple(simple_match_type=SimpleMatchType.MatchNone), - /// word_list=["hello", "world"], - /// exemption_simple_match_type=SimpleMatchType.MatchNone, - /// exemption_word_list=["word"], - /// ) - /// ] - /// } - /// ) - /// ) - /// - /// # Serialize the object to a byte array - /// pickle_data = pickle.dumps(matcher) - /// - /// # Deserialize the object from a byte array and restore its state - /// unpickled_matcher = pickle.loads(pickle_data) - /// unpickled_matcher.__setstate__(msgpack_encoder.encode( - /// { - /// 1: [ - /// MatchTable( - /// table_id=1, - /// match_table_type=MatchTableType.Simple(simple_match_type=SimpleMatchType.MatchNone), - /// word_list=["hello", "world"], - /// exemption_simple_match_type=SimpleMatchType.MatchNone, - /// exemption_word_list=["word"], - /// ) - /// ] - /// } - /// )) - /// ``` #[pyo3(signature=(match_table_map_bytes))] fn __setstate__(&mut self, match_table_map_bytes: &[u8]) { self.matcher = MatcherRs::new( @@ -479,99 +108,22 @@ impl Matcher { ); } - /// Checks if the given text contains any matches according to the configured match tables. - /// - /// This method uses the `is_match` function of the [MatcherRs] instance to determine whether - /// the input text contains any words that match the criteria defined in the match tables. - /// - /// # Parameters - /// - `self`: The [Matcher] instance. - /// - `text`: A string slice representing the text to be checked for matches. - /// - /// # Returns - /// - `bool`: `true` if the text contains a match, `false` otherwise. 
- /// - /// # Example - /// - /// ```python - /// import msgspec - /// - /// from matcher_py import Matcher - /// from matcher_py.extension_types import MatchTable, MatchTableType, SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// - /// matcher = Matcher( - /// msgpack_encoder.encode( - /// { - /// 1: [ - /// MatchTable( - /// table_id=1, - /// match_table_type=MatchTableType.Simple(simple_match_type=SimpleMatchType.MatchNone), - /// word_list=["hello", "world"], - /// exemption_simple_match_type=SimpleMatchType.MatchNone, - /// exemption_word_list=["word"], - /// ) - /// ] - /// } - /// ) - /// ) - /// - /// # Check if the text contains any matches - /// result = matcher.is_match("hello") - /// print(result) # Output: True - /// ``` #[pyo3(signature=(text))] fn is_match(&self, text: &str) -> bool { self.matcher.is_match(text) } - /// Performs word matching on the given text and returns the results as a dictionary. - /// - /// This method leverages the `word_match` function of the [MatcherRs] instance to identify - /// matches within the provided text. The results are mapped into a `HashMap` where the - /// keys are match IDs and the values are lists of [MatchResult] objects. - /// - /// # Parameters - /// - `self`: The [Matcher] instance. - /// - `text`: A string slice representing the text to be checked for matches. - /// - /// # Returns - /// - [`HashMap>>`]: A dictionary where each key is a match ID (u32), - /// and each value is a list of [MatchResult] objects corresponding to the matches found. 
- /// - /// # Example - /// - /// ```python - /// import msgspec - /// - /// from matcher_py import Matcher - /// from matcher_py.extension_types import MatchTable, MatchTableType, SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// - /// matcher = Matcher( - /// msgpack_encoder.encode( - /// { - /// 1: [ - /// MatchTable( - /// table_id=1, - /// match_table_type=MatchTableType.Simple(simple_match_type=SimpleMatchType.MatchNone), - /// word_list=["hello", "world"], - /// exemption_simple_match_type=SimpleMatchType.MatchNone, - /// exemption_word_list=["word"], - /// ) - /// ] - /// } - /// ) - /// ) - /// - /// # Perform word matching and get the results as a dictionary - /// result = matcher.word_match("hello") - /// print(result) # Output: Dictionary with match IDs as keys and lists of MatchResult objects as values - /// ``` #[pyo3(signature=(text))] - fn word_match(&self, text: &str) -> HashMap>> { + fn process<'a>(&'a self, text: &'a str) -> Vec> { + self.matcher + .process(text) + .into_iter() + .map(MatchResult) + .collect() + } + + #[pyo3(signature=(text))] + fn word_match<'a>(&'a self, text: &'a str) -> HashMap>> { self.matcher .word_match(text) .into_iter() @@ -584,843 +136,67 @@ impl Matcher { .collect() } - /// Processes the given text and returns a list of match results. - /// - /// This method utilizes the `process` function of the [MatcherRs] instance to analyze - /// the input text for matches. The results are collected into a vector of [MatchResult] objects. - /// - /// # Parameters - /// - `self`: The [Matcher] instance. - /// - `text`: A string slice representing the text to be processed. - /// - /// # Returns - /// - [`Vec>`]: A vector containing the match results as [MatchResult] objects. 
- /// - /// # Example - /// - /// ```python - /// import msgspec - /// - /// from matcher_py import Matcher - /// from matcher_py.extension_types import MatchTable, MatchTableType, SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// - /// matcher = Matcher( - /// msgpack_encoder.encode( - /// { - /// 1: [ - /// MatchTable( - /// table_id=1, - /// match_table_type=MatchTableType.Simple(simple_match_type=SimpleMatchType.MatchNone), - /// word_list=["hello", "world"], - /// exemption_simple_match_type=SimpleMatchType.MatchNone, - /// exemption_word_list=["word"], - /// ) - /// ] - /// } - /// ) - /// ) - /// - /// # Process the text and get the match results - /// result = matcher.process("hello") - /// print(result) # Output: List of MatchResult objects - /// ``` - #[pyo3(signature=(text))] - fn process(&self, text: &str) -> Vec> { - self.matcher - .process(text) - .into_iter() - .map(MatchResult) - .collect() - } - - /// Returns the word match results for the given text as a JSON string. - /// - /// This method checks if the input `text` is empty. If it is, the method returns an empty JSON object (`{}`) - /// as a string. Otherwise, it leverages the `word_match_as_string` function of the [MatcherRs] instance to - /// obtain the word match results as a JSON string. - /// - /// # Parameters - /// - `self`: The [Matcher] instance. - /// - `text`: A string slice representing the text to be checked for matches. - /// - /// # Returns - /// - `String`: A JSON string representing the match results. Returns `{}` if the input text is empty. 
- /// - /// # Example - /// - /// ```python - /// import msgspec - /// - /// from matcher_py import Matcher - /// from matcher_py.extension_types import MatchTable, MatchTableType, SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// - /// matcher = Matcher( - /// msgpack_encoder.encode( - /// { - /// 1: [ - /// MatchTable( - /// table_id=1, - /// match_table_type=MatchTableType.Simple(simple_match_type=SimpleMatchType.MatchNone), - /// word_list=["hello", "world"], - /// exemption_simple_match_type=SimpleMatchType.MatchNone, - /// exemption_word_list=["word"], - /// ) - /// ] - /// } - /// ) - /// ) - /// - /// # Get word match results as a JSON string - /// result = matcher.word_match_as_string("hello") - /// print(result) # Output: JSON string with match results - /// ``` #[pyo3(signature=(text))] fn word_match_as_string(&self, text: &str) -> String { - text.is_empty() - .then_some(String::from("{}")) - .unwrap_or_else(|| self.matcher.word_match_as_string(text)) - } - - /// Batch processes a list of texts and performs word matching on each text, - /// returning the results as a list of dictionaries. - /// - /// This method iterates over a [PyList] containing texts, performs word matching - /// on each text using the [word_match](Matcher::word_match) method, and collects - /// the results into a [Vec>>>]. - /// - /// # Parameters - /// - `self`: The [Matcher] instance. - /// - `text_array`: A reference to a [PyList] containing texts to be processed. - /// - /// # Returns - /// - [`PyResult>>>>`]: A result containing a - /// vector of dictionaries. Each dictionary has match IDs (u32) as keys and lists - /// of [MatchResult] objects as values. 
- /// - /// # Example - /// - /// ```python - /// import msgspec - /// - /// from matcher_py import Matcher - /// from matcher_py.extension_types import MatchTable, MatchTableType, SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// - /// matcher = Matcher( - /// msgpack_encoder.encode( - /// { - /// 1: [ - /// MatchTable( - /// table_id=1, - /// match_table_type=MatchTableType.Simple(simple_match_type=SimpleMatchType.MatchNone), - /// word_list=["hello", "world"], - /// exemption_simple_match_type=SimpleMatchType.MatchNone, - /// exemption_word_list=["word"], - /// ) - /// ] - /// } - /// ) - /// ) - /// - /// # Perform word matching for a batch of texts - /// text_array = ["hello", "world", "hello world"] - /// result = matcher.batch_word_match(text_array) - /// print(result) # Output: List of dictionaries with match results for each text - /// ``` - #[pyo3(signature=(text_array))] - fn batch_word_match( - &self, - text_array: &Bound<'_, PyList>, - ) -> PyResult>>>> { - let mut result_list = Vec::with_capacity(text_array.len()); - - for text in text_array.iter() { - let text_py_string = text.downcast::()?; - result_list.push(self.word_match(text_py_string.to_cow().as_ref().unwrap())); - } - - Ok(result_list) - } - - /// Batch processes a list of texts and performs word matching, - /// returning the results as a list of JSON strings. - /// - /// This method iterates over a [PyList] containing texts, performs word matching - /// on each text using the [word_match_as_string](Matcher::word_match_as_string) method, - /// and collects the results into a vector of JSON strings. - /// - /// # Parameters - /// - `self`: The [Matcher] instance. - /// - `text_array`: A reference to a [PyList] containing texts to be processed. - /// - /// # Returns - /// - [`PyResult>`]: A result containing a vector of JSON strings. Each string - /// represents the match results for the corresponding input text. 
- /// - /// # Example - /// - /// ```python - /// import msgspec - /// - /// from matcher_py import Matcher - /// from matcher_py.extension_types import MatchTable, MatchTableType, SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// matcher = Matcher( - /// msgpack_encoder.encode( - /// { - /// 1: [ - /// MatchTable( - /// table_id=1, - /// match_table_type=MatchTableType.Simple(simple_match_type=SimpleMatchType.MatchNone), - /// word_list=["hello", "world"], - /// exemption_simple_match_type=SimpleMatchType.MatchNone, - /// exemption_word_list=["word"], - /// ) - /// ] - /// } - /// ) - /// ) - /// - /// # Perform word matching for a batch of texts and get results as JSON strings - /// text_array = ["hello", "world", "hello world"] - /// result = matcher.batch_word_match_as_string(text_array) - /// print(result) # Output: List of JSON strings with match results for each text - /// ``` - #[pyo3(signature=(text_array))] - fn batch_word_match_as_string(&self, text_array: &Bound<'_, PyList>) -> PyResult> { - let mut result_list = Vec::with_capacity(text_array.len()); - - for text in text_array.iter() { - let text_py_string = text.downcast::()?; - result_list.push(self.word_match_as_string(text_py_string.to_cow().as_ref().unwrap())); - } - - Ok(result_list) - } - - /// Batch processes a NumPy 1-D array of texts and performs word matching - /// on each text, returning the results as Python objects. - /// - /// This function iterates over a NumPy 1-D array of texts, performs word matching - /// on each text, and collects the results into a new NumPy array or modifies the - /// original array in-place based on the `inplace` parameter. If `inplace` is set to `true`, - /// the original array is modified directly. The result for each text is obtained by - /// calling the [word_match](Matcher::word_match) method. - /// - /// # Parameters - /// - `self`: The [Matcher] instance. - /// - `py`: The Python interpreter state. 
- /// - `text_array`: A reference to a [PyArray1] containing texts to be processed. - /// - `inplace`: A boolean flag indicating whether to modify the original array in-place. - /// - /// # Returns - /// - [`Option>>`]: If `inplace` is `false`, a new `PyArray1` containing - /// the word match results for each text as Python objects. If `inplace` is `true`, returns - /// [None] as the original array is modified in-place. - /// - /// # Example - /// - /// ```python - /// import msgspec - /// - /// import numpy as np - /// - /// from matcher_py import Matcher - /// from matcher_py.extension_types import MatchTable, MatchTableType, SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// matcher = Matcher( - /// msgpack_encoder.encode( - /// { - /// 1: [ - /// MatchTable( - /// table_id=1, - /// match_table_type=MatchTableType.Simple(simple_match_type=SimpleMatchType.MatchNone), - /// word_list=["hello", "world"], - /// exemption_simple_match_type=SimpleMatchType.MatchNone, - /// exemption_word_list=["word"], - /// ) - /// ] - /// } - /// ) - /// ) - /// - /// text_array = np.array(["hello", "world", "hello word"], dtype=np.dtype("object")) - /// result = matcher.numpy_word_match(text_array) - /// print(result) # Output: A new NumPy array with word match results as Python objects - /// - /// inplace_result = matcher.numpy_word_match(text_array, inplace=True) - /// print(text_array) # Output: The original NumPy array modified with word match results - /// ``` - #[pyo3(signature=(text_array, inplace = false))] - fn numpy_word_match( - &self, - py: Python, - text_array: &Bound<'_, PyArray1>, - inplace: bool, - ) -> Option>> { - if inplace { - unsafe { text_array.as_array_mut() }.map_inplace(|text| { - *text = text - .downcast_bound::(py) - .map_or(py.None(), |text_py_string| { - self.word_match(text_py_string.to_cow().as_ref().unwrap()) - .into_py(py) - }); - }); - None - } else { - Some( - PyArray1::::from_owned_array_bound( - py, - unsafe { 
text_array.as_array() }.map(|text| { - text.downcast_bound::(py) - .map_or(py.None(), |text_py_string| { - self.word_match(text_py_string.to_cow().as_ref().unwrap()) - .into_py(py) - }) - }), - ) - .into(), - ) - } - } - - /// Batch processes a NumPy 1-D array of texts and performs word matching as strings - /// on each text, returning the results as Python objects. - /// - /// This function iterates over a NumPy 1-D array of texts, performs word matching - /// as strings on each text, and collects the results into a new NumPy array or modifies the - /// original array in-place based on the `inplace` parameter. If `inplace` is set to `true`, - /// the original array is modified directly. The result for each text is obtained by - /// calling the [word_match_as_string](Matcher::word_match_as_string) method. - /// - /// # Parameters - /// - `self`: The [Matcher] instance. - /// - `py`: The Python interpreter state. - /// - `text_array`: A reference to a [PyArray1] containing texts to be processed. - /// - `inplace`: A boolean flag indicating whether to modify the original array in-place. - /// - /// # Returns - /// - [`Option>>`]: If `inplace` is `false`, a new `PyArray1` containing - /// the word match results as strings for each text as Python objects. If `inplace` is `true`, - /// returns [None] as the original array is modified in-place. - /// - /// # Example - /// - /// ```python - /// import numpy as np - /// - /// from matcher_py import Matcher - /// - /// matcher = Matcher(...) 
- /// - /// text_array = np.array(["hello", "world", "hello word"], dtype=np.dtype("object")) - /// result = matcher.numpy_word_match_as_string(text_array) - /// print(result) # Output: A new NumPy array with word match results as Python objects - /// - /// inplace_result = matcher.numpy_word_match_as_string(text_array, inplace=True) - /// print(text_array) # Output: The original NumPy array modified with word match results - /// ``` - #[pyo3(signature=(text_array, inplace = false))] - fn numpy_word_match_as_string( - &self, - py: Python, - text_array: &Bound<'_, PyArray1>, - inplace: bool, - ) -> Option>> { - if inplace { - unsafe { text_array.as_array_mut() }.map_inplace(|text| { - *text = text - .downcast_bound::(py) - .map_or(py.None(), |text_py_string| { - self.word_match_as_string(text_py_string.to_cow().as_ref().unwrap()) - .into_py(py) - }); - }); - None - } else { - Some( - PyArray1::::from_owned_array_bound( - py, - unsafe { text_array.as_array() }.map(|text| { - text.downcast_bound::(py) - .map_or(py.None(), |text_py_string| { - self.word_match_as_string(text_py_string.to_cow().as_ref().unwrap()) - .into_py(py) - }) - }), - ) - .into(), - ) - } + self.matcher.word_match_as_string(text) } } -/// A Python class that wraps the [SimpleMatcherRs] struct from the [matcher_rs] crate. -/// -/// This class provides functionality for simple text matching using a serialized -/// type word map. It enables single text matching and batch text processing -/// using both Python lists and NumPy arrays. -/// -/// # Fields -/// - `simple_matcher`: An instance of the [SimpleMatcherRs] struct which performs -/// the core matching logic. -/// - `smt_word_map_bytes`: A serialized byte array representing the -/// simple match type word map used for initializing the `simple_matcher` field during -/// deserialization. 
-/// -/// # Example -/// ```python -/// import msgspec -/// -/// import numpy as np -/// -/// from matcher_py import SimpleMatcher -/// from matcher_py.extension_types import SimpleMatchType -/// -/// msgpack_encoder = msgspec.msgpack.Encoder() -/// -/// simple_matcher = SimpleMatcher( -/// msgpack_encoder.encode( -/// { -/// SimpleMatchType.MatchNone: { -/// 1: "example" -/// } -/// } -/// ) -/// ) -/// -/// # Check if a text matches -/// assert simple_matcher.is_match("example") -/// -/// # Perform simple processing -/// results = simple_matcher.simple_process("example") -/// print(results) -/// -/// # Perform batch processing using a list -/// text_list = ["example", "test", "example test"] -/// batch_results = simple_matcher.batch_simple_process(text_list) -/// print(batch_results) -/// -/// # Perform batch processing using a NumPy array -/// text_array = np.array(["example", "test", "example test"], dtype=np.dtype("object")) -/// numpy_results = simple_matcher.numpy_simple_process(text_array) -/// print(numpy_results) -/// ``` #[pyclass(module = "matcher_py")] struct SimpleMatcher { simple_matcher: SimpleMatcherRs, - smt_word_map_bytes: Vec, + simple_table_bytes: Vec, } #[pymethods] impl SimpleMatcher { - /// Creates a new instance of [SimpleMatcher]. - /// - /// This constructor initializes a new [SimpleMatcher] by deserializing the provided byte array - /// representing the simple match type word map. The byte array is deserialized using the `rmp_serde` - /// crate to reconstruct the map, which is then used to initialize the underlying `simple_matcher` field. - /// - /// # Parameters - /// - `_py`: The Python interpreter state. - /// - `smt_word_map_bytes`: A byte slice that contains the serialized simple match type word map. - /// - /// # Errors - /// - Returns a [PyValueError] if deserialization of the `smt_word_map_bytes` fails. - /// - /// # Returns - /// - [`PyResult`]: An instance of [SimpleMatcher] if deserialization and initialization are successful. 
- /// - /// # Example - /// ```python - /// import msgspec - /// - /// from matcher_py import SimpleMatcher - /// from matcher_py.extension_types import SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// - /// smt_word_map = msgpack_encoder.encode( - /// { - /// SimpleMatchType.MatchNone: { - /// 1: "example" - /// } - /// } - /// ) - /// - /// simple_matcher = SimpleMatcher(smt_word_map) - /// print(simple_matcher.simple_matcher) - /// ``` #[new] - #[pyo3(signature=(smt_word_map_bytes))] - fn new(_py: Python, smt_word_map_bytes: &[u8]) -> PyResult { - let smt_word_map: SimpleMatchTypeWordMapRs = match rmp_serde::from_slice(smt_word_map_bytes) - { - Ok(smt_word_map) => smt_word_map, + #[pyo3(signature=(simple_table_bytes))] + fn new(_py: Python, simple_table_bytes: &[u8]) -> PyResult { + let simple_table: SimpleTableRs = match rmp_serde::from_slice(simple_table_bytes) { + Ok(simple_table) => simple_table, Err(e) => { return Err(PyValueError::new_err(format!( - "Deserialize smt_word_map_bytes failed, Please check the input data.\n Err: {}", + "Deserialize simple_table_bytes failed, Please check the input data.\n Err: {}", e ))) } }; Ok(SimpleMatcher { - simple_matcher: SimpleMatcherRs::new(&smt_word_map), - smt_word_map_bytes: Vec::from(smt_word_map_bytes), + simple_matcher: SimpleMatcherRs::new(&simple_table), + simple_table_bytes: Vec::from(simple_table_bytes), }) } - /// Provides the arguments necessary to recreate the [SimpleMatcher] object during unpickling. - /// - /// This method is called by the Python pickling process and provides the serialized - /// simple match type word map byte array, which is necessary to reconstruct the [SimpleMatcher] - /// object. - /// - /// # Returns - /// - `&[u8]`: A reference to the byte array containing the serialized simple match type word map data. 
- /// - /// # Example - /// - /// ```python - /// import msgspec - /// - /// from matcher_py import SimpleMatcher - /// from matcher_py.extension_types import SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// - /// smt_word_map = msgpack_encoder.encode( - /// { - /// SimpleMatchType.MatchNone: { - /// 1: "example" - /// } - /// } - /// ) - /// - /// simple_matcher = SimpleMatcher(smt_word_map) - /// - /// # Check the args returned for recreating the object - /// serialized_args = simple_matcher.__getnewargs__() - /// print(serialized_args) - /// ``` fn __getnewargs__(&self) -> (&[u8],) { - (&self.smt_word_map_bytes,) + (&self.simple_table_bytes,) } - /// Serializes the state of the [SimpleMatcher] object for pickling. - /// - /// This method is called during the pickling process to capture the state of the [SimpleMatcher] - /// instance. It returns a reference to the byte array containing the serialized simple match - /// type word map data, which is used to reconstruct the object during unpickling. - /// - /// # Returns - /// - `&[u8]`: A reference to the byte array representing the serialized simple match type word map. - /// - /// # Example - /// - /// ```python - /// import pickle - /// import msgspec - /// - /// from matcher_py import SimpleMatcher - /// from matcher_py.extension_types import SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// - /// smt_word_map = msgpack_encoder.encode( - /// { - /// SimpleMatchType.MatchNone: { - /// 1: "example" - /// } - /// } - /// ) - /// - /// simple_matcher = SimpleMatcher(smt_word_map) - /// - /// # Serialize SimpleMatcher instance to a byte stream using pickle - /// pickled_data = pickle.dumps(simple_matcher) - /// print(pickled_data) - /// ``` fn __getstate__(&self) -> &[u8] { - &self.smt_word_map_bytes + &self.simple_table_bytes } - /// Restores the state of the [SimpleMatcher] object from the provided serialized data. 
- /// - /// This method is called during the unpickling process to reinitialize the `simple_matcher` - /// instance with the given serialized simple match type word map byte array. The byte array - /// is deserialized into a [SimpleMatchTypeWordMapRs] and a new [SimpleMatcherRs] instance is - /// created using the deserialized word map. - /// - /// # Parameters - /// - `self`: The [SimpleMatcher] instance. - /// - `smt_word_map_bytes`: A reference to a byte slice containing the serialized - /// simple match type word map data. - /// - /// # Example - /// - /// ```python - /// import pickle - /// import msgspec - /// - /// from matcher_py import SimpleMatcher - /// from matcher_py.extension_types import SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// - /// smt_word_map = msgpack_encoder.encode( - /// { - /// SimpleMatchType.MatchNone: { - /// 1: "example" - /// } - /// } - /// ) - /// - /// simple_matcher = SimpleMatcher(smt_word_map) - /// - /// # Serialize and deserialize using pickle - /// pickled_data = pickle.dumps(simple_matcher) - /// deserialized_matcher = pickle.loads(pickled_data) - /// - /// # The deserialized object should have the same state - /// assert deserialized_matcher.is_match("example") - /// ``` - #[pyo3(signature=(smt_word_map_bytes))] - fn __setstate__(&mut self, smt_word_map_bytes: &[u8]) { + #[pyo3(signature=(simple_table_bytes))] + fn __setstate__(&mut self, simple_table_bytes: &[u8]) { self.simple_matcher = SimpleMatcherRs::new( - &rmp_serde::from_slice::(smt_word_map_bytes).unwrap(), + &rmp_serde::from_slice::(simple_table_bytes).unwrap(), ); } - /// Checks if the given text matches any of the patterns in the simple matcher. - /// - /// This method takes a string slice as input and invokes the `is_match` method on the internal - /// `simple_matcher` instance. It returns a boolean indicating whether the text matches any of - /// the patterns defined in the `simple_matcher`. 
- /// - /// # Parameters - /// - `self`: The [SimpleMatcher] instance. - /// - `text`: A reference to a string slice that will be checked against the patterns. - /// - /// # Returns - /// - `bool`: `true` if the text matches any pattern; otherwise, `false`. - /// - /// # Example - /// - /// ```python - /// import msgspec - /// - /// from matcher_py import SimpleMatcher - /// from matcher_py.extension_types import SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// - /// simple_matcher = SimpleMatcher( - /// msgpack_encoder.encode( - /// { - /// SimpleMatchType.MatchNone: { - /// 1: "example" - /// } - /// } - /// ) - /// ) - /// - /// # Check if a given text matches any of the patterns - /// is_match = simple_matcher.is_match("example") - /// print(is_match) # Output: True if "example" matches any pattern; otherwise, False - /// ``` #[pyo3(signature=(text))] fn is_match(&self, text: &str) -> bool { self.simple_matcher.is_match(text) } - /// Performs simple processing on the given text and returns the results as a list of [SimpleResult] instances. - /// - /// This method takes a string slice as input, invokes the `process` method on the internal `simple_matcher` - /// instance, and collects the resulting items into a vector of [SimpleResult] instances. - /// - /// # Parameters - /// - `self`: The [SimpleMatcher] instance. - /// - `text`: A reference to a string slice that will be processed. - /// - /// # Returns - /// - [`Vec`]: A vector of [SimpleResult] instances representing the results of the simple processing. 
- /// - /// # Example - /// - /// ```python - /// import msgspec - /// - /// from matcher_py import SimpleMatcher - /// from matcher_py.extension_types import SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// - /// simple_matcher = SimpleMatcher( - /// msgpack_encoder.encode( - /// { - /// SimpleMatchType.MatchNone: { - /// 1: "example" - /// } - /// } - /// ) - /// ) - /// - /// results = simple_matcher.simple_process("example") - /// print(results) # Output: A list of SimpleResult instances - /// ``` #[pyo3(signature=(text))] - fn simple_process(&self, text: &str) -> Vec { + fn process<'a>(&'a self, text: &'a str) -> Vec { self.simple_matcher .process(text) .into_iter() .map(SimpleResult) .collect() } - - /// Batch processes a list of texts and performs simple processing - /// on each text, returning the results as vectors of [SimpleResult] instances. - /// - /// This function iterates over a Python list of texts, performs simple processing - /// on each text, and collects the results into a vector of vectors of [SimpleResult] instances. - /// The result for each text is obtained by calling the [simple_process](SimpleMatcher::simple_process) - /// method, producing a vector of [SimpleResult] instances for each text. - /// - /// # Parameters - /// - `self`: The [SimpleMatcher] instance. - /// - `text_array`: A reference to a [PyList] containing texts to be processed. - /// - /// # Returns - /// - [`PyResult>>`]: A vector of vectors containing the simple processing results - /// for each text as vectors of [SimpleResult] instances. 
- /// - /// # Example - /// - /// ```python - /// import msgspec - /// - /// from matcher_py import SimpleMatcher - /// from matcher_py.extension_types import SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// simple_matcher = SimpleMatcher( - /// msgpack_encoder.encode( - /// { - /// SimpleMatchType.MatchNone: { - /// 1: "example" - /// } - /// } - /// ) - /// ) - /// - /// text_list = ["example", "test", "example test"] - /// result = simple_matcher.batch_simple_process(text_list) - /// print(result) # Output: A list of lists of SimpleResult instances - /// ``` - #[pyo3(signature=(text_array))] - fn batch_simple_process( - &self, - text_array: &Bound<'_, PyList>, - ) -> PyResult>> { - let mut result_list = Vec::with_capacity(text_array.len()); - - for text in text_array.iter() { - let text_py_string = text.downcast::()?; - result_list.push(self.simple_process(text_py_string.to_cow().as_ref().unwrap())); - } - - Ok(result_list) - } - - /// Processes a NumPy array of texts using the simple processing method, - /// with an optional in-place operation. Each element of the input array - /// is expected to be a Python string object. - /// - /// This function can either modify the input NumPy array in-place or return a - /// new NumPy array with the processed results. The processing for each text - /// is performed by the [simple_process](SimpleMatcher::simple_process) method, - /// which returns [SimpleResult] instances. - /// - /// # Parameters - /// - `self`: The [SimpleMatcher] instance. - /// - `py`: The Python interpreter state, managed by the PyO3 library. - /// - `text_array`: A reference to a NumPy array containing Python string objects - /// to be processed. - /// - `inplace`: A boolean flag indicating whether the processing should be done - /// in-place. Defaults to `false`. - /// - /// # Returns - /// - [`Option>>`]: Returns `None` if `inplace` is `true`. - /// Otherwise, returns a new NumPy array with the processed results. 
- /// - /// # Example - /// - /// ```python - /// import msgspec - /// import numpy as np - /// - /// from matcher_py import SimpleMatcher - /// from matcher_py.extension_types import SimpleMatchType - /// - /// msgpack_encoder = msgspec.msgpack.Encoder() - /// simple_matcher = SimpleMatcher( - /// msgpack_encoder.encode( - /// { - /// SimpleMatchType.MatchNone: { - /// 1: "example" - /// } - /// } - /// ) - /// ) - /// - /// text_array = np.array(["example", "test", "example test"]) - /// result = simple_matcher.numpy_simple_process(text_array, inplace=False) - /// print(result) # Output: A NumPy array with lists of SimpleResult instances - /// - /// simple_matcher.numpy_simple_process(text_array, inplace=True) - /// print(text_array) # Output: The original NumPy array modified in-place with lists of SimpleResult instances - /// ``` - #[pyo3(signature=(text_array, inplace = false))] - fn numpy_simple_process( - &self, - py: Python, - text_array: &Bound<'_, PyArray1>, - inplace: bool, - ) -> Option>> { - if inplace { - unsafe { text_array.as_array_mut() }.map_inplace(|text| { - *text = text - .downcast_bound::(py) - .map_or(py.None(), |text_py_string| { - self.simple_process(text_py_string.to_cow().as_ref().unwrap()) - .into_py(py) - }); - }); - None - } else { - Some( - PyArray1::::from_owned_array_bound( - py, - unsafe { text_array.as_array() }.map(|text| { - text.downcast_bound::(py) - .map_or(py.None(), |text_py_string| { - self.simple_process(text_py_string.to_cow().as_ref().unwrap()) - .into_py(py) - }) - }), - ) - .into(), - ) - } - } } #[pymodule] diff --git a/matcher_py/test/test_matcher.py b/matcher_py/test/test_matcher.py index 54ececf..27e278f 100644 --- a/matcher_py/test/test_matcher.py +++ b/matcher_py/test/test_matcher.py @@ -1,10 +1,10 @@ import pytest import msgspec -import numpy as np + from matcher_py.matcher_py import Matcher from matcher_py.extension_types import ( - SimpleMatchType, + ProcessType, MatchTable, MatchTableType, RegexMatchType, @@ 
-43,10 +43,10 @@ def test_init_with_empty_map(): MatchTable( table_id=1, match_table_type=MatchTableType.Simple( - SimpleMatchType.MatchNone + process_type=ProcessType.MatchNone ), word_list=[], - exemption_simple_match_type=SimpleMatchType.MatchNone, + exemption_process_type=ProcessType.MatchNone, exemption_word_list=[], ) ] @@ -70,10 +70,11 @@ def test_regex(): MatchTable( table_id=1, match_table_type=MatchTableType.Regex( - RegexMatchType.MatchRegex + process_type=ProcessType.MatchNone, + regex_match_type=RegexMatchType.MatchRegex, ), word_list=["h[aeiou]llo", "w[aeiou]rd"], - exemption_simple_match_type=SimpleMatchType.MatchNone, + exemption_process_type=ProcessType.MatchNone, exemption_word_list=[], ) ] @@ -94,10 +95,11 @@ def test_similar_char(): MatchTable( table_id=1, match_table_type=MatchTableType.Regex( - RegexMatchType.MatchSimilarChar + process_type=ProcessType.MatchNone, + regex_match_type=RegexMatchType.MatchSimilarChar, ), word_list=["hello,hi,H,你好", "world,word,🌍,世界"], - exemption_simple_match_type=SimpleMatchType.MatchNone, + exemption_process_type=ProcessType.MatchNone, exemption_word_list=[], ) ] @@ -118,10 +120,12 @@ def test_similar_text_levenshtein(): MatchTable( table_id=1, match_table_type=MatchTableType.Similar( - SimMatchType.MatchLevenshtein, 0.8 + process_type=ProcessType.MatchNone, + sim_match_type=SimMatchType.MatchLevenshtein, + threshold=0.8, ), word_list=["helloworld"], - exemption_simple_match_type=SimpleMatchType.MatchNone, + exemption_process_type=ProcessType.MatchNone, exemption_word_list=[], ) ] @@ -144,10 +148,11 @@ def test_acrostic(): MatchTable( table_id=1, match_table_type=MatchTableType.Regex( - RegexMatchType.MatchAcrostic + process_type=ProcessType.MatchNone, + regex_match_type=RegexMatchType.MatchAcrostic, ), word_list=["h,e,l,l,o", "你,好"], - exemption_simple_match_type=SimpleMatchType.MatchNone, + exemption_process_type=ProcessType.MatchNone, exemption_word_list=[], ) ] @@ -175,10 +180,10 @@ def test_exemption(): 
MatchTable( table_id=1, match_table_type=MatchTableType.Simple( - SimpleMatchType.MatchNone + process_type=ProcessType.MatchNone ), word_list=["helloworld"], - exemption_simple_match_type=SimpleMatchType.MatchNone, + exemption_process_type=ProcessType.MatchNone, exemption_word_list=["worldwide"], ) ] @@ -195,19 +200,20 @@ def test_exemption(): MatchTable( table_id=1, match_table_type=MatchTableType.Simple( - SimpleMatchType.MatchNone + process_type=ProcessType.MatchNone ), word_list=["helloworld"], - exemption_simple_match_type=SimpleMatchType.MatchNone, + exemption_process_type=ProcessType.MatchNone, exemption_word_list=["worldwide"], ), MatchTable( - table_id=1, + table_id=2, match_table_type=MatchTableType.Regex( - RegexMatchType.MatchRegex + process_type=ProcessType.MatchNone, + regex_match_type=RegexMatchType.MatchRegex, ), word_list=["hello"], - exemption_simple_match_type=SimpleMatchType.MatchNone, + exemption_process_type=ProcessType.MatchNone, exemption_word_list=["worldwide"], ), ] @@ -216,44 +222,3 @@ def test_exemption(): ) assert matcher.is_match("helloworld") assert not matcher.is_match("helloworldwide") - - -@pytest.fixture(scope="module") -def matcher(): - return Matcher( - msgpack_encoder.encode( - { - 1: [ - MatchTable( - table_id=1, - match_table_type=MatchTableType.Simple( - SimpleMatchType.MatchNone - ), - word_list=["helloworld"], - exemption_simple_match_type=SimpleMatchType.MatchNone, - exemption_word_list=[], - ) - ] - } - ) - ) - - -def test_batch_word_match(matcher): - assert len(matcher.batch_word_match(["helloworld"])) == 1 - - -def test_batch_word_match_as_string(matcher): - assert len(matcher.batch_word_match_as_string(["helloworld"])) == 1 - - -def test_numpy_word_match(matcher): - text_array = np.array(["helloworld"] * 1000, dtype=np.dtype("object")) - matcher.numpy_word_match(text_array) - matcher.numpy_word_match(text_array, inplace=True) - - -def test_numpy_word_match_as_string(matcher): - text_array = np.array(["helloworld"] * 
1000, dtype=np.dtype("object")) - matcher.numpy_word_match_as_string(text_array) - matcher.numpy_word_match_as_string(text_array, inplace=True) diff --git a/matcher_py/test/test_simple_matcher.py b/matcher_py/test/test_simple_matcher.py index 36f5a78..2e4b848 100644 --- a/matcher_py/test/test_simple_matcher.py +++ b/matcher_py/test/test_simple_matcher.py @@ -1,8 +1,9 @@ import pytest + import msgspec -import numpy as np + from matcher_py.matcher_py import SimpleMatcher -from matcher_py.extension_types import SimpleMatchType +from matcher_py.extension_types import ProcessType msgpack_encoder = msgspec.msgpack.Encoder() json_encoder = msgspec.json.Encoder() @@ -40,48 +41,48 @@ def test_init_with_invalid_map(): def test_fanjian(): simple_matcher = SimpleMatcher( - msgpack_encoder.encode({SimpleMatchType.MatchFanjian: {1: "你好"}}) + msgpack_encoder.encode({ProcessType.MatchFanjian: {1: "你好"}}) ) assert simple_matcher.is_match("妳好") - assert simple_matcher.simple_process("你好")[0]["word_id"] == 1 - assert simple_matcher.simple_process("你好")[0]["word"] == "你好" + assert simple_matcher.process("你好")[0]["word_id"] == 1 + assert simple_matcher.process("你好")[0]["word"] == "你好" simple_matcher = SimpleMatcher( - msgpack_encoder.encode({SimpleMatchType.MatchFanjian: {1: "妳好"}}) + msgpack_encoder.encode({ProcessType.MatchFanjian: {1: "妳好"}}) ) assert simple_matcher.is_match("你好") - assert simple_matcher.simple_process("你好")[0]["word_id"] == 1 - assert simple_matcher.simple_process("你好")[0]["word"] == "妳好" + assert simple_matcher.process("你好")[0]["word_id"] == 1 + assert simple_matcher.process("你好")[0]["word"] == "妳好" def test_delete(): simple_matcher = SimpleMatcher( - msgpack_encoder.encode({SimpleMatchType.MatchDelete: {1: "你好"}}) + msgpack_encoder.encode({ProcessType.MatchDelete: {1: "你好"}}) ) assert simple_matcher.is_match("你!好") - assert len(simple_matcher.simple_process("你!好")) == 1 + assert len(simple_matcher.process("你!好")) == 1 def test_normalize(): simple_matcher = 
SimpleMatcher( msgpack_encoder.encode( { - SimpleMatchType.MatchNormalize: { + ProcessType.MatchNormalize: { 1: "he11o", } } ) ) assert simple_matcher.is_match("ℋЀ⒈㈠Õ") - assert simple_matcher.simple_process("ℋЀ⒈㈠Õ")[0]["word_id"] == 1 - assert simple_matcher.simple_process("ℋЀ⒈㈠Õ")[0]["word"] == "he11o" + assert simple_matcher.process("ℋЀ⒈㈠Õ")[0]["word_id"] == 1 + assert simple_matcher.process("ℋЀ⒈㈠Õ")[0]["word"] == "he11o" def test_pinyin(): simple_matcher = SimpleMatcher( msgpack_encoder.encode( { - SimpleMatchType.MatchPinYin: { + ProcessType.MatchPinYin: { 1: "西安", } } @@ -95,7 +96,7 @@ def test_pinyinchar(): simple_matcher = SimpleMatcher( msgpack_encoder.encode( { - SimpleMatchType.MatchPinYinChar: { + ProcessType.MatchPinYinChar: { 1: "西安", } } @@ -104,20 +105,3 @@ def test_pinyinchar(): assert simple_matcher.is_match("洗按") assert simple_matcher.is_match("现") assert simple_matcher.is_match("xian") - - -@pytest.fixture(scope="module") -def simple_matcher(): - return SimpleMatcher( - msgpack_encoder.encode({SimpleMatchType.MatchNone: {1: "helloworld"}}) - ) - - -def test_batch_simple_process(simple_matcher): - assert len(simple_matcher.batch_simple_process(["helloworld"])) == 1 - - -def test_numpy_simple_process(simple_matcher): - text_array = np.array(["helloworld"] * 1000, dtype=np.dtype("object")) - simple_matcher.numpy_simple_process(text_array) - simple_matcher.numpy_simple_process(text_array, inplace=True) diff --git a/matcher_rs/Cargo.toml b/matcher_rs/Cargo.toml index 38bbcf0..18dbf11 100644 --- a/matcher_rs/Cargo.toml +++ b/matcher_rs/Cargo.toml @@ -53,16 +53,14 @@ divan = "0.1.14" fastrand = "2.1.0" [features] -default = ["prebuilt", "dfa"] -# By enable prebuilt feature, we could boost process matcher build time, but with package size increasing. -prebuilt = [] +default = ["dfa"] # By enable runtime_build feature, we could build process matcher at runtime, but with build time increasing. 
runtime_build = [] # By enable serde feature, we could serialize and deserialize matcher and simple_matcher. # With serde feature, AhoCorasick's prefilter is disabled, because I don't know how to serialize it correctly, # which will lead to performance regression when the patterns size is small (say, less than 100). serde = ["aho-corasick-unsafe/serde"] -# By enable dfa feature, we could use dfa to perform simple matching, but with significantly incresaing memory consumption. +# By enable dfa feature, we could use dfa to perform simple matching, but with significantly increasing memory consumption. dfa = [] [[bench]] diff --git a/matcher_rs/README.md b/matcher_rs/README.md index f7142bc..9c84d97 100644 --- a/matcher_rs/README.md +++ b/matcher_rs/README.md @@ -10,7 +10,7 @@ Designed to solve **AND OR NOT** and **TEXT VARIATIONS** problems in word/word_l - Simple Word Matching - Regex-Based Matching - Similarity-Based Matching -- **Text Normalization**: +- **Text Transformation**: - **Fanjian**: Simplify traditional Chinese characters to simplified ones. Example: `蟲艸` -> `虫艹` - **Delete**: Remove specific characters. @@ -42,22 +42,21 @@ cargo add matcher_rs ### Explanation of the configuration * `Matcher`'s configuration is defined by the `MatchTableMap = HashMap>` type, the key of `MatchTableMap` is called `match_id`, **for each `match_id`, the `table_id` inside is required to be unique**. -* `SimpleMatcher`'s configuration is defined by the `SimpleMatchTableMap = HashMap>` type, the value `HashMap`'s key is called `word_id`, **`word_id` is required to be globally unique**. +* `SimpleMatcher`'s configuration is defined by the `SimpleTable = HashMap>` type, the value `HashMap`'s key is called `word_id`, **`word_id` is required to be globally unique**. #### MatchTable * `table_id`: The unique ID of the match table. * `match_table_type`: The type of the match table. * `word_list`: The word list of the match table. 
-* `exemption_simple_match_type`: The type of the exemption simple match. +* `exemption_process_type`: The type of the exemption simple match. * `exemption_word_list`: The exemption word list of the match table. For each match table, word matching is performed over the `word_list`, and exemption word matching is performed over the `exemption_word_list`. If the exemption word matching result is True, the word matching result will be False. #### MatchTableType -* `Simple`: Supports simple multiple patterns matching with text normalization defined by `simple_match_type`. - * We offer transformation methods for text normalization, including `Fanjian`, `Normalize`, `PinYin` ···. +* `Simple`: Supports simple multiple patterns matching with text normalization defined by `process_type`. * It can handle combination patterns and repeated times sensitive matching, delimited by `&` and `~`, such as `hello&world&hello` will match `hellohelloworld` and `worldhellohello`, but not `helloworld` due to the repeated times of `hello`. * `Regex`: Supports regex patterns matching. * `SimilarChar`: Supports similar character matching using regex. @@ -68,27 +67,23 @@ For each match table, word matching is performed over the `word_list`, and exemp * `["h[aeiou]llo", "w[aeiou]rd"]` will match `hello`, `world`, `hillo`, `wurld` ··· any text that matches the regex in the list. * `Similar`: Supports similar text matching based on distance and threshold. * `Levenshtein`: Supports similar text matching based on Levenshtein distance. - * `DamerauLevenshtein`: Supports similar text matching based on Damerau-Levenshtein distance. - * `Indel`: Supports similar text matching based on Indel distance. - * `Jaro`: Supports similar text matching based on Jaro distance. - * `JaroWinkler`: Supports similar text matching based on Jaro-Winkler distance. -#### SimpleMatchType +#### ProcessType * `None`: No transformation. -* `Fanjian`: Traditional Chinese to simplified Chinese transformation. 
Based on [FANJIAN](./str_conv/FANJIAN.txt). +* `Fanjian`: Traditional Chinese to simplified Chinese transformation. Based on [FANJIAN](./process_map/FANJIAN.txt). * `妳好` -> `你好` * `現⾝` -> `现身` -* `Delete`: Delete all punctuation, special characters and white spaces. +* `Delete`: Delete all punctuation, special characters and white spaces. Based on [TEXT_DELETE](./process_map/TEXT-DELETE.txt) and `WHITE_SPACE`. * `hello, world!` -> `helloworld` * `《你∷好》` -> `你好` -* `Normalize`: Normalize all English character variations and number variations to basic characters. Based on [SYMBOL_NORM](./str_conv/SYMBOL-NORM.txt), [NORM](./str_conv/NORM.txt) and [NUM_NORM](./str_conv/NUM-NORM.txt). +* `Normalize`: Normalize all English character variations and number variations to basic characters. Based on [SYMBOL_NORM](./process_map/SYMBOL-NORM.txt), [NORM](./process_map/NORM.txt) and [NUM_NORM](./process_map/NUM-NORM.txt). * `ℋЀ⒈㈠Õ` -> `he11o` * `⒈Ƨ㊂` -> `123` -* `PinYin`: Convert all unicode Chinese characters to pinyin with boundaries. Based on [PINYIN](./str_conv/PINYIN.txt). +* `PinYin`: Convert all unicode Chinese characters to pinyin with boundaries. Based on [PINYIN](./process_map/PINYIN.txt). * `你好` -> ` ni hao ` * `西安` -> ` xi an ` -* `PinYinChar`: Convert all unicode Chinese characters to pinyin without boundaries. Based on [PINYIN](./str_conv/PINYIN.txt). +* `PinYinChar`: Convert all unicode Chinese characters to pinyin without boundaries. Based on [PINYIN](./process_map/PINYIN.txt). * `你好` -> `nihao` * `西安` -> `xian` @@ -96,31 +91,27 @@ You can combine these transformations as needed. Pre-defined combinations like ` Avoid combining `PinYin` and `PinYinChar` due to that `PinYin` is a more limited version of `PinYinChar`, in some cases like `xian`, can be treat as two words `xi` and `an`, or only one word `xian`. -`Delete` is technologically a combination of `TextDelete` and `WordDelete`, we implement different delete methods for text and word. 
'Cause we believe special characters are parts of the word, users put them in words deliberately, but not for text. For `text_process` and `reduce_text_process` functions, users should use `TextDelete` instead of `WordDelete`. -* `WordDelete`: Delete all patterns in `WHITE_SPACE`. -* `TextDelete`: Delete all patterns in [TEXT_DELETE](./str_conv/TEXT-DELETE.txt). - ### Basic Example Here’s a basic example of how to use the `Matcher` struct for text matching: ```rust -use matcher_rs::{text_process, reduce_text_process, SimpleMatchType}; +use matcher_rs::{text_process, reduce_text_process, ProcessType}; -let result = text_process(SimpleMatchType::TextDelete, "你好,世界!"); -let result = reduce_text_process(SimpleMatchType::FanjianDeleteNormalize, "你好,世界!"); +let result = text_process(ProcessType::Delete, "你好,世界!"); +let result = reduce_text_process(ProcessType::FanjianDeleteNormalize, "你好,世界!"); ``` ```rust use std::collections::HashMap; -use matcher_rs::{Matcher, MatchTableMap, MatchTable, MatchTableType, SimpleMatchType}; +use matcher_rs::{Matcher, MatchTableMap, MatchTable, MatchTableType, ProcessType}; let match_table_map: MatchTableMap = HashMap::from_iter(vec![ (1, vec![MatchTable { table_id: 1, - match_table_type: MatchTableType::Simple { simple_match_type: SimpleMatchType::FanjianDeleteNormalize}, + match_table_type: MatchTableType::Simple { process_type: ProcessType::FanjianDeleteNormalize}, word_list: vec!["example", "test"], - exemption_simple_match_type: SimpleMatchType::FanjianDeleteNormalize, + exemption_process_type: ProcessType::FanjianDeleteNormalize, exemption_word_list: vec![], }]), ]); @@ -131,17 +122,17 @@ let results = matcher.word_match(text); ```rust use std::collections::HashMap; -use matcher_rs::{SimpleMatchType, SimpleMatcher}; +use matcher_rs::{ProcessType, SimpleMatcher}; -let mut smt_word_map = HashMap::new(); +let mut simple_table = HashMap::new(); let mut simple_word_map = HashMap::new(); simple_word_map.insert(1, "你好"); 
simple_word_map.insert(2, "世界"); -smt_word_map.insert(SimpleMatchType::Fanjian, simple_word_map); +simple_table.insert(ProcessType::Fanjian, simple_word_map); -let matcher = SimpleMatcher::new(&smt_word_map); +let matcher = SimpleMatcher::new(&simple_table); let text = "你好,世界!"; let results = matcher.process(text); ``` @@ -149,10 +140,9 @@ let results = matcher.process(text); For more detailed usage examples, please refer to the [test.rs](./tests/test.rs) file. ## Feature Flags -* `prebuilt`: By enable prebuilt feature, we could boost process matcher build time, but with package size increasing. * `runtime_build`: By enable runtime_build feature, we could build process matcher at runtime, but with build time increasing. * `serde`: By enable serde feature, we could serialize and deserialize matcher and simple_matcher. With serde feature, AhoCorasick's prefilter is disabled, because I don't know how to serialize it correctly, which will lead to performance regression when the patterns size is small (say, less than 100). -* `dfa`: By enable dfa feature, we could use dfa to perform simple matching, but with significantly incresaing memory consumption. +* `dfa`: By enable dfa feature, we could use dfa to perform simple matching, but with significantly increasing memory consumption. Default feature is `prebuilt` and `dfa`, `prebuilt` and `runtime_build` can't be enabled at same time. If you want to make `Matcher` and `SimpleMatcher` serializable, you should enable `serde` feature. 
@@ -167,101 +157,101 @@ cargo bench ``` ``` -Current default simple match type: SimpleMatchType(None) +Current default simple match type: ProcessType(None) Current default simple word map size: 1000 Current default combined times: 2 Timer precision: 41 ns -bench fastest │ slowest │ median │ mean │ samples │ iters -├─ build_cn │ │ │ │ │ -│ ├─ build_cn_by_combined_times │ │ │ │ │ -│ │ ├─ 1 2.468 ms │ 3.355 ms │ 2.506 ms │ 2.536 ms │ 100 │ 100 -│ │ ├─ 2 5.303 ms │ 5.765 ms │ 5.402 ms │ 5.41 ms │ 100 │ 100 -│ │ ├─ 3 7.912 ms │ 10.16 ms │ 7.986 ms │ 8.081 ms │ 100 │ 100 -│ │ ├─ 4 10.59 ms │ 11.31 ms │ 10.73 ms │ 10.75 ms │ 100 │ 100 -│ │ ╰─ 5 13.03 ms │ 14.1 ms │ 13.13 ms │ 13.21 ms │ 100 │ 100 -│ ├─ build_cn_by_multiple_simple_match_type 26.63 ms │ 40.81 ms │ 26.99 ms │ 27.23 ms │ 100 │ 100 -│ ├─ build_cn_by_simple_match_type │ │ │ │ │ -│ │ ├─ "fanjian" 5.296 ms │ 6.12 ms │ 5.348 ms │ 5.398 ms │ 100 │ 100 -│ │ ├─ "fanjian_worddelete_textdelete_normalize" 5.43 ms │ 5.937 ms │ 5.47 ms │ 5.491 ms │ 100 │ 100 -│ │ ├─ "none" 5.268 ms │ 5.667 ms │ 5.375 ms │ 5.379 ms │ 100 │ 100 -│ │ ├─ "normalize" 5.373 ms │ 5.827 ms │ 5.423 ms │ 5.437 ms │ 100 │ 100 -│ │ ├─ "pinyin" 16.02 ms │ 24.52 ms │ 16.15 ms │ 16.34 ms │ 100 │ 100 -│ │ ├─ "pinyinchar" 15.81 ms │ 41.81 ms │ 16.29 ms │ 16.99 ms │ 100 │ 100 -│ │ ├─ "worddelete_textdelete" 5.291 ms │ 6.192 ms │ 5.409 ms │ 5.556 ms │ 100 │ 100 -│ │ ╰─ "worddelete_textdelete_normalize" 5.38 ms │ 6.311 ms │ 5.897 ms │ 5.866 ms │ 100 │ 100 -│ ╰─ build_cn_by_simple_word_map_size │ │ │ │ │ -│ ├─ 100 501.2 µs │ 838.9 µs │ 545.2 µs │ 559.5 µs │ 100 │ 100 -│ ├─ 1000 5.383 ms │ 18.63 ms │ 5.669 ms │ 5.88 ms │ 100 │ 100 -│ ├─ 10000 49.97 ms │ 99.73 ms │ 53.03 ms │ 54.13 ms │ 93 │ 93 -│ ╰─ 50000 194.1 ms │ 366.2 ms │ 204.9 ms │ 212.6 ms │ 24 │ 24 -├─ build_en │ │ │ │ │ -│ ├─ build_en_by_combined_times │ │ │ │ │ -│ │ ├─ 1 5.43 ms │ 6.427 ms │ 5.84 ms │ 5.907 ms │ 100 │ 100 -│ │ ├─ 2 12.9 ms │ 21.5 ms │ 13.6 ms │ 13.83 ms │ 100 │ 100 -│ │ ├─ 3 21.99 ms 
│ 24.19 ms │ 22.89 ms │ 22.8 ms │ 100 │ 100 -│ │ ├─ 4 29.3 ms │ 50.2 ms │ 30.84 ms │ 31.27 ms │ 100 │ 100 -│ │ ╰─ 5 38.12 ms │ 40.88 ms │ 38.44 ms │ 38.58 ms │ 100 │ 100 -│ ├─ build_en_by_multiple_simple_match_type 16.43 ms │ 19 ms │ 16.79 ms │ 16.95 ms │ 100 │ 100 -│ ├─ build_en_by_simple_match_type │ │ │ │ │ -│ │ ├─ "none" 13.97 ms │ 15.1 ms │ 14.56 ms │ 14.58 ms │ 100 │ 100 -│ │ ├─ "normalize" 12.35 ms │ 17.97 ms │ 13.05 ms │ 13.13 ms │ 100 │ 100 -│ │ ├─ "worddelete_textdelete" 13.5 ms │ 14.87 ms │ 13.96 ms │ 13.97 ms │ 100 │ 100 -│ │ ╰─ "worddelete_textdelete_normalize" 11.83 ms │ 13.31 ms │ 12.46 ms │ 12.54 ms │ 100 │ 100 -│ ╰─ build_en_by_simple_word_map_size │ │ │ │ │ -│ ├─ 100 848.1 µs │ 1.286 ms │ 925.4 µs │ 929 µs │ 100 │ 100 -│ ├─ 1000 12.57 ms │ 16.46 ms │ 13.38 ms │ 13.38 ms │ 100 │ 100 -│ ├─ 10000 178.1 ms │ 192.3 ms │ 182.2 ms │ 183.7 ms │ 28 │ 28 -│ ╰─ 50000 743.3 ms │ 884.1 ms │ 752.2 ms │ 776.2 ms │ 7 │ 7 -├─ search_cn │ │ │ │ │ -│ ├─ search_cn_baseline │ │ │ │ │ -│ │ ├─ 100 2.907 ms │ 11.87 ms │ 3.068 ms │ 3.359 ms │ 100 │ 100 -│ │ ├─ 1000 2.99 ms │ 3.422 ms │ 3.006 ms │ 3.033 ms │ 100 │ 100 -│ │ ├─ 10000 5.197 ms │ 5.801 ms │ 5.269 ms │ 5.294 ms │ 100 │ 100 -│ │ ╰─ 50000 12.44 ms │ 16.52 ms │ 14.2 ms │ 13.89 ms │ 100 │ 100 -│ ├─ search_cn_by_combined_times │ │ │ │ │ -│ │ ├─ 1 3.702 ms │ 4.091 ms │ 3.728 ms │ 3.749 ms │ 100 │ 100 -│ │ ├─ 2 4.442 ms │ 4.826 ms │ 4.458 ms │ 4.467 ms │ 100 │ 100 -│ │ ├─ 3 5.054 ms │ 5.595 ms │ 5.078 ms │ 5.093 ms │ 100 │ 100 -│ │ ├─ 4 6.136 ms │ 6.777 ms │ 6.159 ms │ 6.177 ms │ 100 │ 100 -│ │ ╰─ 5 6.235 ms │ 11.38 ms │ 6.396 ms │ 6.51 ms │ 100 │ 100 -│ ├─ search_cn_by_multiple_simple_match_type 64.81 ms │ 80.83 ms │ 66.49 ms │ 66.75 ms │ 100 │ 100 -│ ├─ search_cn_by_simple_match_type │ │ │ │ │ -│ │ ├─ "fanjian" 6.781 ms │ 7.486 ms │ 6.841 ms │ 6.927 ms │ 100 │ 100 -│ │ ├─ "fanjian_worddelete_textdelete_normalize" 21.47 ms │ 45.61 ms │ 21.82 ms │ 22.33 ms │ 100 │ 100 -│ │ ├─ "none" 4.684 ms │ 5.198 ms │ 4.705 ms │ 
4.731 ms │ 100 │ 100 -│ │ ├─ "normalize" 14.62 ms │ 15.81 ms │ 15.5 ms │ 15.28 ms │ 100 │ 100 -│ │ ├─ "pinyin" 57.98 ms │ 63.66 ms │ 60.31 ms │ 59.92 ms │ 84 │ 84 -│ │ ├─ "pinyinchar" 63.8 ms │ 74.02 ms │ 65.47 ms │ 66.22 ms │ 76 │ 76 -│ │ ├─ "worddelete_textdelete" 13.2 ms │ 14.62 ms │ 13.43 ms │ 13.65 ms │ 100 │ 100 -│ │ ╰─ "worddelete_textdelete_normalize" 18.97 ms │ 21.06 ms │ 19.73 ms │ 19.83 ms │ 100 │ 100 -│ ╰─ search_cn_by_simple_word_map_size │ │ │ │ │ -│ ├─ 100 3.031 ms │ 3.491 ms │ 3.082 ms │ 3.104 ms │ 100 │ 100 -│ ├─ 1000 4.793 ms │ 5.205 ms │ 4.997 ms │ 5.001 ms │ 100 │ 100 -│ ├─ 10000 10.12 ms │ 12.74 ms │ 10.7 ms │ 10.66 ms │ 100 │ 100 -│ ╰─ 50000 21.12 ms │ 27.96 ms │ 21.77 ms │ 23.13 ms │ 100 │ 100 -╰─ search_en │ │ │ │ │ - ├─ search_en_baseline │ │ │ │ │ - │ ├─ 100 328.3 µs │ 1.576 ms │ 343.1 µs │ 364.5 µs │ 100 │ 100 - │ ├─ 1000 343.6 µs │ 472.4 µs │ 369.9 µs │ 369.1 µs │ 100 │ 100 - │ ├─ 10000 1.169 ms │ 1.248 ms │ 1.197 ms │ 1.199 ms │ 100 │ 100 - │ ╰─ 50000 1.193 ms │ 1.304 ms │ 1.199 ms │ 1.205 ms │ 100 │ 100 - ├─ search_en_by_combined_times │ │ │ │ │ - │ ├─ 1 1.682 ms │ 4.053 ms │ 1.692 ms │ 1.727 ms │ 100 │ 100 - │ ├─ 2 2.481 ms │ 2.682 ms │ 2.502 ms │ 2.506 ms │ 100 │ 100 - │ ├─ 3 2.585 ms │ 2.979 ms │ 2.678 ms │ 2.69 ms │ 100 │ 100 - │ ├─ 4 2.654 ms │ 3.265 ms │ 2.761 ms │ 2.764 ms │ 100 │ 100 - │ ╰─ 5 2.74 ms │ 3.242 ms │ 2.752 ms │ 2.761 ms │ 100 │ 100 - ├─ search_en_by_multiple_simple_match_type 9.173 ms │ 10.27 ms │ 9.351 ms │ 9.481 ms │ 100 │ 100 - ├─ search_en_by_simple_match_type │ │ │ │ │ - │ ├─ "none" 1.99 ms │ 2.286 ms │ 2.006 ms │ 2.049 ms │ 100 │ 100 - │ ├─ "normalize" 3.992 ms │ 4.064 ms │ 4.009 ms │ 4.012 ms │ 100 │ 100 - │ ├─ "worddelete_textdelete" 6.198 ms │ 7.005 ms │ 6.225 ms │ 6.253 ms │ 100 │ 100 - │ ╰─ "worddelete_textdelete_normalize" 10.51 ms │ 32.63 ms │ 11.1 ms │ 11.41 ms │ 100 │ 100 - ╰─ search_en_by_simple_word_map_size │ │ │ │ │ - ├─ 100 1.384 ms │ 1.616 ms │ 1.458 ms │ 1.471 ms │ 100 │ 100 - ├─ 1000 2.395 ms 
│ 2.587 ms │ 2.427 ms │ 2.432 ms │ 100 │ 100 - ├─ 10000 3.091 ms │ 4.291 ms │ 3.113 ms │ 3.127 ms │ 100 │ 100 - ╰─ 50000 3.668 ms │ 5.738 ms │ 3.831 ms │ 3.853 ms │ 100 │ 100 +bench fastest │ slowest │ median │ mean │ samples │ iters +├─ build_cn │ │ │ │ │ +│ ├─ build_cn_by_combined_times │ │ │ │ │ +│ │ ├─ 1 2.593 ms │ 5.474 ms │ 2.672 ms │ 2.803 ms │ 100 │ 100 +│ │ ├─ 2 5.259 ms │ 6.592 ms │ 5.438 ms │ 5.537 ms │ 100 │ 100 +│ │ ├─ 3 7.982 ms │ 10.01 ms │ 8.591 ms │ 8.7 ms │ 100 │ 100 +│ │ ├─ 4 10.59 ms │ 65.93 ms │ 11.86 ms │ 12.82 ms │ 100 │ 100 +│ │ ╰─ 5 13.46 ms │ 16.05 ms │ 14.18 ms │ 14.36 ms │ 100 │ 100 +│ ├─ build_cn_by_multiple_process_type 27.6 ms │ 43.1 ms │ 28.34 ms │ 28.83 ms │ 100 │ 100 +│ ├─ build_cn_by_process_type │ │ │ │ │ +│ │ ├─ "delete" 5.332 ms │ 6.308 ms │ 5.525 ms │ 5.597 ms │ 100 │ 100 +│ │ ├─ "delete_normalize" 5.394 ms │ 6.605 ms │ 5.601 ms │ 5.618 ms │ 100 │ 100 +│ │ ├─ "fanjian" 5.33 ms │ 5.739 ms │ 5.428 ms │ 5.467 ms │ 100 │ 100 +│ │ ├─ "fanjian_delete_normalize" 5.485 ms │ 6.35 ms │ 5.724 ms │ 5.791 ms │ 100 │ 100 +│ │ ├─ "none" 5.439 ms │ 6.201 ms │ 5.545 ms │ 5.612 ms │ 100 │ 100 +│ │ ├─ "normalize" 5.351 ms │ 6.041 ms │ 5.662 ms │ 5.662 ms │ 100 │ 100 +│ │ ├─ "pinyin" 6.996 ms │ 9.993 ms │ 7.254 ms │ 7.284 ms │ 100 │ 100 +│ │ ╰─ "pinyinchar" 7.056 ms │ 8.977 ms │ 7.415 ms │ 7.449 ms │ 100 │ 100 +│ ╰─ build_cn_by_simple_word_map_size │ │ │ │ │ +│ ├─ 100 520.4 µs │ 912.2 µs │ 562.9 µs │ 568.7 µs │ 100 │ 100 +│ ├─ 1000 5.184 ms │ 6.008 ms │ 5.369 ms │ 5.415 ms │ 100 │ 100 +│ ├─ 10000 51.18 ms │ 61.37 ms │ 53.76 ms │ 53.82 ms │ 93 │ 93 +│ ╰─ 50000 190.9 ms │ 213.9 ms │ 196.4 ms │ 197.6 ms │ 26 │ 26 +├─ build_en │ │ │ │ │ +│ ├─ build_en_by_combined_times │ │ │ │ │ +│ │ ├─ 1 6.323 ms │ 7.754 ms │ 6.504 ms │ 6.531 ms │ 100 │ 100 +│ │ ├─ 2 13.82 ms │ 15.83 ms │ 14.19 ms │ 14.23 ms │ 100 │ 100 +│ │ ├─ 3 20.42 ms │ 24.58 ms │ 21.29 ms │ 21.38 ms │ 100 │ 100 +│ │ ├─ 4 28.54 ms │ 31.17 ms │ 29.12 ms │ 29.21 ms │ 100 │ 100 +│ │ ╰─ 5 37.47 ms │ 
40.15 ms │ 38.64 ms │ 38.68 ms │ 100 │ 100 +│ ├─ build_en_by_multiple_process_type 16.1 ms │ 17.82 ms │ 16.67 ms │ 16.7 ms │ 100 │ 100 +│ ├─ build_en_by_process_type │ │ │ │ │ +│ │ ├─ "delete" 12.54 ms │ 14.42 ms │ 13.19 ms │ 13.24 ms │ 100 │ 100 +│ │ ├─ "delete_normalize" 11.16 ms │ 12.49 ms │ 11.45 ms │ 11.52 ms │ 100 │ 100 +│ │ ├─ "none" 13.2 ms │ 14.31 ms │ 13.57 ms │ 13.59 ms │ 100 │ 100 +│ │ ╰─ "normalize" 12.02 ms │ 13.74 ms │ 12.52 ms │ 12.54 ms │ 100 │ 100 +│ ╰─ build_en_by_simple_word_map_size │ │ │ │ │ +│ ├─ 100 938.9 µs │ 1.257 ms │ 1.007 ms │ 1.013 ms │ 100 │ 100 +│ ├─ 1000 13.53 ms │ 15.2 ms │ 14.04 ms │ 14.03 ms │ 100 │ 100 +│ ├─ 10000 160.5 ms │ 174.9 ms │ 164.1 ms │ 165.2 ms │ 31 │ 31 +│ ╰─ 50000 689.6 ms │ 817.3 ms │ 719 ms │ 727.6 ms │ 7 │ 7 +├─ search_cn │ │ │ │ │ +│ ├─ search_cn_baseline │ │ │ │ │ +│ │ ├─ 100 2.907 ms │ 4.152 ms │ 2.945 ms │ 3.033 ms │ 100 │ 100 +│ │ ├─ 1000 3.081 ms │ 3.266 ms │ 3.153 ms │ 3.162 ms │ 100 │ 100 +│ │ ├─ 10000 9.386 ms │ 10.59 ms │ 9.733 ms │ 9.708 ms │ 100 │ 100 +│ │ ╰─ 50000 33.38 ms │ 42.97 ms │ 35.56 ms │ 36.28 ms │ 100 │ 100 +│ ├─ search_cn_by_combined_times │ │ │ │ │ +│ │ ├─ 1 4.148 ms │ 4.967 ms │ 4.181 ms │ 4.219 ms │ 100 │ 100 +│ │ ├─ 2 5.601 ms │ 6.266 ms │ 5.751 ms │ 5.773 ms │ 100 │ 100 +│ │ ├─ 3 6.85 ms │ 8.021 ms │ 7.243 ms │ 7.282 ms │ 100 │ 100 +│ │ ├─ 4 7.382 ms │ 8.841 ms │ 7.734 ms │ 7.773 ms │ 100 │ 100 +│ │ ╰─ 5 8.952 ms │ 12.99 ms │ 10.04 ms │ 9.958 ms │ 100 │ 100 +│ ├─ search_cn_by_multiple_process_type 66.7 ms │ 148.4 ms │ 75.71 ms │ 78.7 ms │ 100 │ 100 +│ ├─ search_cn_by_process_type │ │ │ │ │ +│ │ ├─ "delete" 14.13 ms │ 17.09 ms │ 15.15 ms │ 15.17 ms │ 100 │ 100 +│ │ ├─ "delete_normalize" 20.14 ms │ 24.2 ms │ 21.53 ms │ 21.72 ms │ 100 │ 100 +│ │ ├─ "fanjian" 7.07 ms │ 8.242 ms │ 7.478 ms │ 7.474 ms │ 100 │ 100 +│ │ ├─ "fanjian_delete_normalize" 22.36 ms │ 24.46 ms │ 23.33 ms │ 23.32 ms │ 100 │ 100 +│ │ ├─ "none" 5.852 ms │ 6.8 ms │ 6.244 ms │ 6.208 ms │ 100 │ 100 +│ │ ├─ "normalize" 
14.11 ms │ 17.09 ms │ 14.89 ms │ 14.99 ms │ 100 │ 100 +│ │ ├─ "pinyin" 55.21 ms │ 140.7 ms │ 56.74 ms │ 58.11 ms │ 87 │ 87 +│ │ ╰─ "pinyinchar" 57.37 ms │ 151.5 ms │ 61.23 ms │ 65.84 ms │ 76 │ 76 +│ ╰─ search_cn_by_simple_word_map_size │ │ │ │ │ +│ ├─ 100 3.16 ms │ 5.387 ms │ 3.499 ms │ 3.64 ms │ 100 │ 100 +│ ├─ 1000 5.66 ms │ 7.839 ms │ 6.457 ms │ 6.504 ms │ 100 │ 100 +│ ├─ 10000 22.55 ms │ 90.1 ms │ 28.91 ms │ 29.91 ms │ 100 │ 100 +│ ╰─ 50000 75.08 ms │ 122.5 ms │ 87.05 ms │ 90.99 ms │ 55 │ 55 +╰─ search_en │ │ │ │ │ + ├─ search_en_baseline │ │ │ │ │ + │ ├─ 100 343.4 µs │ 593.2 µs │ 380.9 µs │ 389.2 µs │ 100 │ 100 + │ ├─ 1000 355.1 µs │ 472.7 µs │ 389.7 µs │ 393.1 µs │ 100 │ 100 + │ ├─ 10000 1.213 ms │ 1.554 ms │ 1.27 ms │ 1.291 ms │ 100 │ 100 + │ ╰─ 50000 1.194 ms │ 1.342 ms │ 1.201 ms │ 1.209 ms │ 100 │ 100 + ├─ search_en_by_combined_times │ │ │ │ │ + │ ├─ 1 1.698 ms │ 2.499 ms │ 1.883 ms │ 1.914 ms │ 100 │ 100 + │ ├─ 2 2.066 ms │ 3.646 ms │ 2.321 ms │ 2.391 ms │ 100 │ 100 + │ ├─ 3 2.628 ms │ 3.176 ms │ 2.8 ms │ 2.81 ms │ 100 │ 100 + │ ├─ 4 2.879 ms │ 4.266 ms │ 3.153 ms │ 3.259 ms │ 100 │ 100 + │ ╰─ 5 2.748 ms │ 3.31 ms │ 2.785 ms │ 2.812 ms │ 100 │ 100 + ├─ search_en_by_multiple_process_type 9.42 ms │ 12.25 ms │ 9.974 ms │ 10.16 ms │ 100 │ 100 + ├─ search_en_by_process_type │ │ │ │ │ + │ ├─ "delete" 6.613 ms │ 8.215 ms │ 7.027 ms │ 7.208 ms │ 100 │ 100 + │ ├─ "delete_normalize" 7.938 ms │ 9.425 ms │ 8.116 ms │ 8.215 ms │ 100 │ 100 + │ ├─ "none" 2.648 ms │ 16.51 ms │ 2.943 ms │ 3.417 ms │ 100 │ 100 + │ ╰─ "normalize" 4.085 ms │ 5.228 ms │ 4.245 ms │ 4.321 ms │ 100 │ 100 + ╰─ search_en_by_simple_word_map_size │ │ │ │ │ + ├─ 100 1.375 ms │ 1.681 ms │ 1.458 ms │ 1.469 ms │ 100 │ 100 + ├─ 1000 2.393 ms │ 2.699 ms │ 2.447 ms │ 2.46 ms │ 100 │ 100 + ├─ 10000 3.34 ms │ 4.793 ms │ 3.578 ms │ 3.656 ms │ 100 │ 100 + ╰─ 50000 5.516 ms │ 8.122 ms │ 6.252 ms │ 6.428 ms │ 100 │ 100 ``` ## Contributing diff --git a/matcher_rs/benches/bench.rs b/matcher_rs/benches/bench.rs 
index cedd3ad..30c4498 100644 --- a/matcher_rs/benches/bench.rs +++ b/matcher_rs/benches/bench.rs @@ -1,25 +1,25 @@ use divan::Bencher; -use matcher_rs::{SimpleMatchType, SimpleMatcher, TextMatcherTrait}; +use matcher_rs::{ProcessType, SimpleMatcher, TextMatcherTrait}; use nohash_hasher::IntMap; -const CN_SIMPLE_MATCH_TYPE_LIST: &[SimpleMatchType] = &[ - SimpleMatchType::None, - SimpleMatchType::Fanjian, - SimpleMatchType::Delete, - SimpleMatchType::Normalize, - SimpleMatchType::PinYin, - SimpleMatchType::PinYinChar, - SimpleMatchType::DeleteNormalize, - SimpleMatchType::FanjianDeleteNormalize, +const CN_PROCESS_TYPE_LIST: &[ProcessType] = &[ + ProcessType::None, + ProcessType::Fanjian, + ProcessType::Delete, + ProcessType::Normalize, + ProcessType::PinYin, + ProcessType::PinYinChar, + ProcessType::DeleteNormalize, + ProcessType::FanjianDeleteNormalize, ]; const CN_WORD_LIST_100000: &str = include_str!("../../data/word_list/cn/cn_words_100000.txt"); const CN_HAYSTACK: &str = include_str!("../../data/text/cn/西游记.txt"); -const EN_SIMPLE_MATCH_TYPE_LIST: &[SimpleMatchType] = &[ - SimpleMatchType::None, - SimpleMatchType::Delete, - SimpleMatchType::Normalize, - SimpleMatchType::DeleteNormalize, +const EN_PROCESS_TYPE_LIST: &[ProcessType] = &[ + ProcessType::None, + ProcessType::Delete, + ProcessType::Normalize, + ProcessType::DeleteNormalize, ]; const EN_WORD_LIST_100000: &str = include_str!("../../data/word_list/en/en_words_100000.txt"); const EN_HAYSTACK: &str = include_str!("../../data/text/en/sherlock.txt"); @@ -27,7 +27,7 @@ const EN_HAYSTACK: &str = include_str!("../../data/text/en/sherlock.txt"); const SIMPLE_WORD_MAP_SIZE_LIST: &[usize] = &[100, 1000, 10000, 50000]; const COMBINED_TIMES_LIST: &[usize] = &[1, 2, 3, 4, 5]; -const DEFAULT_SIMPLE_MATCH_TYPE: SimpleMatchType = SimpleMatchType::None; +const DEFAULT_PROCESS_TYPE: ProcessType = ProcessType::None; const DEFAULT_SIMPLE_WORD_MAP_SIZE: usize = 1000; const DEFAULT_COMBINED_TIMES: usize = 2; @@ -85,9 +85,9 
@@ fn build_simple_word_map_baseline( mod build_cn { use super::*; - #[divan::bench(args = CN_SIMPLE_MATCH_TYPE_LIST, max_time = 5)] - fn build_cn_by_simple_match_type(bencher: Bencher, simple_match_type: SimpleMatchType) { - let mut smt_word_map = IntMap::default(); + #[divan::bench(args = CN_PROCESS_TYPE_LIST, max_time = 5)] + fn build_cn_by_process_type(bencher: Bencher, process_type: ProcessType) { + let mut simple_table = IntMap::default(); let mut global_word_id = 0; let simple_word_map = build_simple_word_map( "cn", @@ -95,16 +95,16 @@ mod build_cn { DEFAULT_COMBINED_TIMES, &mut global_word_id, ); - smt_word_map.insert(simple_match_type, simple_word_map); + simple_table.insert(process_type, simple_word_map); bencher.bench(|| { - let _ = SimpleMatcher::new(&smt_word_map); + let _ = SimpleMatcher::new(&simple_table); }); } #[divan::bench(args = SIMPLE_WORD_MAP_SIZE_LIST, max_time = 5)] fn build_cn_by_simple_word_map_size(bencher: Bencher, simple_word_map_size: usize) { - let mut smt_word_map = IntMap::default(); + let mut simple_table = IntMap::default(); let mut global_word_id = 0; let simple_word_map = build_simple_word_map( "cn", @@ -112,16 +112,16 @@ mod build_cn { DEFAULT_COMBINED_TIMES, &mut global_word_id, ); - smt_word_map.insert(DEFAULT_SIMPLE_MATCH_TYPE, simple_word_map); + simple_table.insert(DEFAULT_PROCESS_TYPE, simple_word_map); bencher.bench(|| { - let _ = SimpleMatcher::new(&smt_word_map); + let _ = SimpleMatcher::new(&simple_table); }); } #[divan::bench(args = COMBINED_TIMES_LIST, max_time = 5)] fn build_cn_by_combined_times(bencher: Bencher, combined_times: usize) { - let mut smt_word_map = IntMap::default(); + let mut simple_table = IntMap::default(); let mut global_word_id = 0; let simple_word_map = build_simple_word_map( "cn", @@ -129,23 +129,23 @@ mod build_cn { combined_times, &mut global_word_id, ); - smt_word_map.insert(DEFAULT_SIMPLE_MATCH_TYPE, simple_word_map); + simple_table.insert(DEFAULT_PROCESS_TYPE, simple_word_map); 
bencher.bench(|| { - let _ = SimpleMatcher::new(&smt_word_map); + let _ = SimpleMatcher::new(&simple_table); }); } #[divan::bench] - fn build_cn_by_multiple_simple_match_type(bencher: Bencher) { - let mut smt_word_map = IntMap::default(); + fn build_cn_by_multiple_process_type(bencher: Bencher) { + let mut simple_table = IntMap::default(); let mut global_word_id = 0; - for simple_match_type in [ - SimpleMatchType::Fanjian, - SimpleMatchType::DeleteNormalize, - SimpleMatchType::FanjianDeleteNormalize, - SimpleMatchType::Delete, - SimpleMatchType::Normalize, + for process_type in [ + ProcessType::Fanjian, + ProcessType::DeleteNormalize, + ProcessType::FanjianDeleteNormalize, + ProcessType::Delete, + ProcessType::Normalize, ] { let simple_word_map = build_simple_word_map( "cn", @@ -153,11 +153,11 @@ mod build_cn { DEFAULT_COMBINED_TIMES, &mut global_word_id, ); - smt_word_map.insert(simple_match_type, simple_word_map); + simple_table.insert(process_type, simple_word_map); } bencher.bench(|| { - let _ = SimpleMatcher::new(&smt_word_map); + let _ = SimpleMatcher::new(&simple_table); }); } } @@ -165,9 +165,9 @@ mod build_cn { mod build_en { use super::*; - #[divan::bench(args = EN_SIMPLE_MATCH_TYPE_LIST, max_time = 5)] - fn build_en_by_simple_match_type(bencher: Bencher, simple_match_type: SimpleMatchType) { - let mut smt_word_map = IntMap::default(); + #[divan::bench(args = EN_PROCESS_TYPE_LIST, max_time = 5)] + fn build_en_by_process_type(bencher: Bencher, process_type: ProcessType) { + let mut simple_table = IntMap::default(); let mut global_word_id = 0; let simple_word_map = build_simple_word_map( "en", @@ -175,16 +175,16 @@ mod build_en { DEFAULT_COMBINED_TIMES, &mut global_word_id, ); - smt_word_map.insert(simple_match_type, simple_word_map); + simple_table.insert(process_type, simple_word_map); bencher.bench(|| { - let _ = SimpleMatcher::new(&smt_word_map); + let _ = SimpleMatcher::new(&simple_table); }); } #[divan::bench(args = SIMPLE_WORD_MAP_SIZE_LIST, max_time 
= 5)] fn build_en_by_simple_word_map_size(bencher: Bencher, simple_word_map_size: usize) { - let mut smt_word_map = IntMap::default(); + let mut simple_table = IntMap::default(); let mut global_word_id = 0; let simple_word_map = build_simple_word_map( "en", @@ -192,16 +192,16 @@ mod build_en { DEFAULT_COMBINED_TIMES, &mut global_word_id, ); - smt_word_map.insert(DEFAULT_SIMPLE_MATCH_TYPE, simple_word_map); + simple_table.insert(DEFAULT_PROCESS_TYPE, simple_word_map); bencher.bench(|| { - let _ = SimpleMatcher::new(&smt_word_map); + let _ = SimpleMatcher::new(&simple_table); }); } #[divan::bench(args = COMBINED_TIMES_LIST, max_time = 5)] fn build_en_by_combined_times(bencher: Bencher, combined_times: usize) { - let mut smt_word_map = IntMap::default(); + let mut simple_table = IntMap::default(); let mut global_word_id = 0; let simple_word_map = build_simple_word_map( "en", @@ -209,21 +209,21 @@ mod build_en { combined_times, &mut global_word_id, ); - smt_word_map.insert(DEFAULT_SIMPLE_MATCH_TYPE, simple_word_map); + simple_table.insert(DEFAULT_PROCESS_TYPE, simple_word_map); bencher.bench(|| { - let _ = SimpleMatcher::new(&smt_word_map); + let _ = SimpleMatcher::new(&simple_table); }); } #[divan::bench] - fn build_en_by_multiple_simple_match_type(bencher: Bencher) { - let mut smt_word_map = IntMap::default(); + fn build_en_by_multiple_process_type(bencher: Bencher) { + let mut simple_table = IntMap::default(); let mut global_word_id = 0; - for simple_match_type in [ - SimpleMatchType::None, - SimpleMatchType::Delete, - SimpleMatchType::DeleteNormalize, + for process_type in [ + ProcessType::None, + ProcessType::Delete, + ProcessType::DeleteNormalize, ] { let simple_word_map = build_simple_word_map( "cn", @@ -231,11 +231,11 @@ mod build_en { DEFAULT_COMBINED_TIMES, &mut global_word_id, ); - smt_word_map.insert(simple_match_type, simple_word_map); + simple_table.insert(process_type, simple_word_map); } bencher.bench(|| { - let _ = SimpleMatcher::new(&smt_word_map); + 
let _ = SimpleMatcher::new(&simple_table); }); } } @@ -245,10 +245,10 @@ mod search_cn { #[divan::bench(args = SIMPLE_WORD_MAP_SIZE_LIST, max_time = 5)] fn search_cn_baseline(bencher: Bencher, simple_word_map_size: usize) { - let mut smt_word_map = IntMap::default(); + let mut simple_table = IntMap::default(); let simple_word_map = build_simple_word_map_baseline("cn", simple_word_map_size); - smt_word_map.insert(DEFAULT_SIMPLE_MATCH_TYPE, simple_word_map); - let simple_matcher = SimpleMatcher::new(&smt_word_map); + simple_table.insert(DEFAULT_PROCESS_TYPE, simple_word_map); + let simple_matcher = SimpleMatcher::new(&simple_table); bencher.bench(|| { for line in CN_HAYSTACK.lines() { @@ -257,9 +257,9 @@ mod search_cn { }); } - #[divan::bench(args = CN_SIMPLE_MATCH_TYPE_LIST, max_time = 5)] - fn search_cn_by_simple_match_type(bencher: Bencher, simple_match_type: SimpleMatchType) { - let mut smt_word_map = IntMap::default(); + #[divan::bench(args = CN_PROCESS_TYPE_LIST, max_time = 5)] + fn search_cn_by_process_type(bencher: Bencher, process_type: ProcessType) { + let mut simple_table = IntMap::default(); let mut global_word_id = 0; let simple_word_map = build_simple_word_map( "cn", @@ -267,8 +267,8 @@ mod search_cn { DEFAULT_COMBINED_TIMES, &mut global_word_id, ); - smt_word_map.insert(simple_match_type, simple_word_map); - let simple_matcher = SimpleMatcher::new(&smt_word_map); + simple_table.insert(process_type, simple_word_map); + let simple_matcher = SimpleMatcher::new(&simple_table); bencher.bench(|| { for line in CN_HAYSTACK.lines() { @@ -279,7 +279,7 @@ mod search_cn { #[divan::bench(args = SIMPLE_WORD_MAP_SIZE_LIST, max_time = 5)] fn search_cn_by_simple_word_map_size(bencher: Bencher, simple_word_map_size: usize) { - let mut smt_word_map = IntMap::default(); + let mut simple_table = IntMap::default(); let mut global_word_id = 0; let simple_word_map = build_simple_word_map( "cn", @@ -287,8 +287,8 @@ mod search_cn { DEFAULT_COMBINED_TIMES, &mut global_word_id, 
); - smt_word_map.insert(DEFAULT_SIMPLE_MATCH_TYPE, simple_word_map); - let simple_matcher = SimpleMatcher::new(&smt_word_map); + simple_table.insert(DEFAULT_PROCESS_TYPE, simple_word_map); + let simple_matcher = SimpleMatcher::new(&simple_table); bencher.bench(|| { for line in CN_HAYSTACK.lines() { @@ -299,7 +299,7 @@ mod search_cn { #[divan::bench(args = COMBINED_TIMES_LIST, max_time = 5)] fn search_cn_by_combined_times(bencher: Bencher, combined_times: usize) { - let mut smt_word_map = IntMap::default(); + let mut simple_table = IntMap::default(); let mut global_word_id = 0; let simple_word_map = build_simple_word_map( "cn", @@ -307,8 +307,8 @@ mod search_cn { combined_times, &mut global_word_id, ); - smt_word_map.insert(DEFAULT_SIMPLE_MATCH_TYPE, simple_word_map); - let simple_matcher = SimpleMatcher::new(&smt_word_map); + simple_table.insert(DEFAULT_PROCESS_TYPE, simple_word_map); + let simple_matcher = SimpleMatcher::new(&simple_table); bencher.bench(|| { for line in CN_HAYSTACK.lines() { @@ -318,15 +318,15 @@ mod search_cn { } #[divan::bench] - fn search_cn_by_multiple_simple_match_type(bencher: Bencher) { - let mut smt_word_map = IntMap::default(); + fn search_cn_by_multiple_process_type(bencher: Bencher) { + let mut simple_table = IntMap::default(); let mut global_word_id = 0; - for simple_match_type in [ - SimpleMatchType::Fanjian, - SimpleMatchType::DeleteNormalize, - SimpleMatchType::FanjianDeleteNormalize, - SimpleMatchType::Delete, - SimpleMatchType::Normalize, + for process_type in [ + ProcessType::Fanjian, + ProcessType::DeleteNormalize, + ProcessType::FanjianDeleteNormalize, + ProcessType::Delete, + ProcessType::Normalize, ] { let simple_word_map = build_simple_word_map( "cn", @@ -334,9 +334,9 @@ mod search_cn { DEFAULT_COMBINED_TIMES, &mut global_word_id, ); - smt_word_map.insert(simple_match_type, simple_word_map); + simple_table.insert(process_type, simple_word_map); } - let simple_matcher = SimpleMatcher::new(&smt_word_map); + let 
simple_matcher = SimpleMatcher::new(&simple_table); bencher.bench(|| { for line in CN_HAYSTACK.lines() { @@ -351,10 +351,10 @@ mod search_en { #[divan::bench(args = SIMPLE_WORD_MAP_SIZE_LIST, max_time = 5)] fn search_en_baseline(bencher: Bencher, simple_word_map_size: usize) { - let mut smt_word_map = IntMap::default(); + let mut simple_table = IntMap::default(); let simple_word_map = build_simple_word_map_baseline("en", simple_word_map_size); - smt_word_map.insert(DEFAULT_SIMPLE_MATCH_TYPE, simple_word_map); - let simple_matcher = SimpleMatcher::new(&smt_word_map); + simple_table.insert(DEFAULT_PROCESS_TYPE, simple_word_map); + let simple_matcher = SimpleMatcher::new(&simple_table); bencher.bench(|| { for line in CN_HAYSTACK.lines() { @@ -363,9 +363,9 @@ mod search_en { }); } - #[divan::bench(args = EN_SIMPLE_MATCH_TYPE_LIST, max_time = 5)] - fn search_en_by_simple_match_type(bencher: Bencher, simple_match_type: SimpleMatchType) { - let mut smt_word_map = IntMap::default(); + #[divan::bench(args = EN_PROCESS_TYPE_LIST, max_time = 5)] + fn search_en_by_process_type(bencher: Bencher, process_type: ProcessType) { + let mut simple_table = IntMap::default(); let mut global_word_id = 0; let simple_word_map = build_simple_word_map( "en", @@ -373,8 +373,8 @@ mod search_en { DEFAULT_COMBINED_TIMES, &mut global_word_id, ); - smt_word_map.insert(simple_match_type, simple_word_map); - let simple_matcher = SimpleMatcher::new(&smt_word_map); + simple_table.insert(process_type, simple_word_map); + let simple_matcher = SimpleMatcher::new(&simple_table); bencher.bench(|| { for line in EN_HAYSTACK.lines() { @@ -385,7 +385,7 @@ mod search_en { #[divan::bench(args = SIMPLE_WORD_MAP_SIZE_LIST, max_time = 5)] fn search_en_by_simple_word_map_size(bencher: Bencher, simple_word_map_size: usize) { - let mut smt_word_map = IntMap::default(); + let mut simple_table = IntMap::default(); let mut global_word_id = 0; let simple_word_map = build_simple_word_map( "en", @@ -393,8 +393,8 @@ mod 
search_en { DEFAULT_COMBINED_TIMES, &mut global_word_id, ); - smt_word_map.insert(DEFAULT_SIMPLE_MATCH_TYPE, simple_word_map); - let simple_matcher = SimpleMatcher::new(&smt_word_map); + simple_table.insert(DEFAULT_PROCESS_TYPE, simple_word_map); + let simple_matcher = SimpleMatcher::new(&simple_table); bencher.bench(|| { for line in EN_HAYSTACK.lines() { @@ -405,7 +405,7 @@ mod search_en { #[divan::bench(args = COMBINED_TIMES_LIST, max_time = 5)] fn search_en_by_combined_times(bencher: Bencher, combined_times: usize) { - let mut smt_word_map = IntMap::default(); + let mut simple_table = IntMap::default(); let mut global_word_id = 0; let simple_word_map = build_simple_word_map( "en", @@ -413,8 +413,8 @@ mod search_en { combined_times, &mut global_word_id, ); - smt_word_map.insert(DEFAULT_SIMPLE_MATCH_TYPE, simple_word_map); - let simple_matcher = SimpleMatcher::new(&smt_word_map); + simple_table.insert(DEFAULT_PROCESS_TYPE, simple_word_map); + let simple_matcher = SimpleMatcher::new(&simple_table); bencher.bench(|| { for line in EN_HAYSTACK.lines() { @@ -424,13 +424,13 @@ mod search_en { } #[divan::bench] - fn search_en_by_multiple_simple_match_type(bencher: Bencher) { - let mut smt_word_map = IntMap::default(); + fn search_en_by_multiple_process_type(bencher: Bencher) { + let mut simple_table = IntMap::default(); let mut global_word_id = 0; - for simple_match_type in [ - SimpleMatchType::None, - SimpleMatchType::Delete, - SimpleMatchType::DeleteNormalize, + for process_type in [ + ProcessType::None, + ProcessType::Delete, + ProcessType::DeleteNormalize, ] { let simple_word_map = build_simple_word_map( "en", @@ -438,9 +438,9 @@ mod search_en { DEFAULT_COMBINED_TIMES, &mut global_word_id, ); - smt_word_map.insert(simple_match_type, simple_word_map); + simple_table.insert(process_type, simple_word_map); } - let simple_matcher = SimpleMatcher::new(&smt_word_map); + let simple_matcher = SimpleMatcher::new(&simple_table); bencher.bench(|| { for line in 
EN_HAYSTACK.lines() { @@ -451,7 +451,7 @@ mod search_en { } fn main() { - println!("Current default simple match type: {DEFAULT_SIMPLE_MATCH_TYPE:?}"); + println!("Current default simple match type: {DEFAULT_PROCESS_TYPE:?}"); println!("Current default simple word map size: {DEFAULT_SIMPLE_WORD_MAP_SIZE:?}"); println!("Current default combined times: {DEFAULT_COMBINED_TIMES:?}"); diff --git a/matcher_rs/benches/bench_test.rs b/matcher_rs/benches/bench_test.rs index 484162a..da33a5d 100644 --- a/matcher_rs/benches/bench_test.rs +++ b/matcher_rs/benches/bench_test.rs @@ -3,24 +3,24 @@ extern crate test; use test::Bencher; -use matcher_rs::{build_smt_tree, reduce_text_process_with_tree, SimpleMatchType}; +use matcher_rs::{build_process_type_tree, reduce_text_process_with_tree, ProcessType}; #[bench] fn bench_test(b: &mut Bencher) { - let smt_list = [ - SimpleMatchType::Fanjian, - SimpleMatchType::DeleteNormalize - SimpleMatchType::WordDelete, - SimpleMatchType::FanjianDeleteNormalize - SimpleMatchType::WordDelete, - SimpleMatchType::Delete - SimpleMatchType::WordDelete, - SimpleMatchType::Normalize, + let process_type_list = [ + ProcessType::Fanjian, + ProcessType::DeleteNormalize, + ProcessType::FanjianDeleteNormalize, + ProcessType::Delete, + ProcessType::Normalize, ]; - let smt_tree = build_smt_tree(&smt_list); + let process_typetree = build_process_type_tree(&process_type_list); - reduce_text_process_with_tree(&smt_tree, "你好,我是中国人"); + reduce_text_process_with_tree(&process_typetree, "hello world!"); b.iter(|| { for _ in 0..1000 { - reduce_text_process_with_tree(&smt_tree, "你好,我是中国人"); + reduce_text_process_with_tree(&process_typetree, "hello world!"); } }); } diff --git a/matcher_rs/build.rs b/matcher_rs/build.rs index 09457c8..8f831a2 100644 --- a/matcher_rs/build.rs +++ b/matcher_rs/build.rs @@ -1,14 +1,39 @@ use std::io::Result; +/// The `main` function serves as the build script for a Rust project, responsible for +/// generating binary data files used in 
text conversion and matching tasks. +/// Depending on the features enabled, it reads specific conversion mappings from +/// text files, processes them, and writes them to binary files. +/// +/// It comprises several key steps: +/// +/// 1. Print instructions to re-run build script if specific files change. +/// 2. Conditionally process text conversion data only if 'runtime_build' feature is not enabled. +/// 3. Load text content from files in the 'process_map' directory into constants like FANJIAN, NUM_NORM, NORM, and PINYIN. +/// 4. For each mapping type ('fanjian', 'normalize', 'pinyin'): +/// - Aggregate conversion mappings from loaded constants into a HashMap. +/// - Clean the HashMap by removing identity mappings. +/// - Create binary files containing the list of strings to match and the list of corresponding replacements. +/// - For 'pinyin': +/// - Also create a binary file with trimmed replacements. +/// - For specified mappings ('fanjian', 'pinyin'): +/// - Use the `daachorse` crate to build and serialize a CharwiseDoubleArrayAhoCorasick matcher, and write it to a binary file. +/// - For 'normalize', when DFA feature is not enabled: +/// - Similarly, build a matcher with a different match kind and serialize it. +/// 5. Additionally, if 'dfa' feature is not enabled: +/// - Load delete and whitespace character patterns from TEXT_DELETE constant and WHITE_SPACE array respectively. +/// - Aggregate these patterns into a HashSet to remove duplicates. +/// - Write these patterns to a binary file. +/// - Build a matcher for these patterns, serialize it, and write it to a binary file. +/// +/// The function completes by returning `Ok(())` to indicate successful completion of the build script. 
fn main() -> Result<()> { println!("cargo:rerun-if-changed=build.rs"); - println!("cargo:rerun-if-changed=str_conv"); + println!("cargo:rerun-if-changed=process_map"); - #[cfg(feature = "prebuilt")] + #[cfg(not(feature = "runtime_build"))] { use std::collections::HashMap; - #[cfg(not(feature = "dfa"))] - use std::collections::HashSet; use std::env; use std::fs::File; use std::io::Write; @@ -18,27 +43,33 @@ fn main() -> Result<()> { MatchKind as DoubleArrayAhoCorasickMatchKind, }; - const FANJIAN: &str = include_str!("./str_conv/FANJIAN.txt"); - const NUM_NORM: &str = include_str!("./str_conv/NUM-NORM.txt"); - const NORM: &str = include_str!("./str_conv/NORM.txt"); - const PINYIN: &str = include_str!("./str_conv/PINYIN.txt"); - #[cfg(not(feature = "dfa"))] - const TEXT_DELETE: &str = include_str!("./str_conv/TEXT-DELETE.txt"); + /// These constants include the contents of their respective text files + /// from the `process_map` directory. Each constant refers to a specific + /// text conversion mapping used within the project. The text files + /// contain tab-separated values, where each line represents a pair of + /// strings that define a specific conversion. + /// + /// - `FANJIAN` includes simplified and traditional Chinese character mappings. + /// - `NUM_NORM` includes mappings for normalizing numbers. + /// - `NORM` includes mappings for various normalization forms. + /// - `PINYIN` includes mappings for converting characters to Pinyin. 
+ const FANJIAN: &str = include_str!("./process_map/FANJIAN.txt"); + const NUM_NORM: &str = include_str!("./process_map/NUM-NORM.txt"); + const NORM: &str = include_str!("./process_map/NORM.txt"); + const PINYIN: &str = include_str!("./process_map/PINYIN.txt"); let out_dir = env::var("OUT_DIR").unwrap(); - let process_str_conv_map = HashMap::from([ + let process_str_map = HashMap::from([ ("fanjian", vec![FANJIAN]), ("normalize", vec![NORM, NUM_NORM]), ("pinyin", vec![PINYIN]), - #[cfg(not(feature = "dfa"))] - ("text_delete", vec![TEXT_DELETE]), ]); - for smt_bit_str in ["fanjian", "normalize", "pinyin"] { + for process_type_bit_str in ["fanjian", "normalize", "pinyin"] { let mut process_dict = HashMap::new(); - for str_conv_map in process_str_conv_map.get(smt_bit_str).unwrap() { - process_dict.extend(str_conv_map.trim().lines().map(|pair_str| { + for process_map in process_str_map.get(process_type_bit_str).unwrap() { + process_dict.extend(process_map.trim().lines().map(|pair_str| { let mut pair_str_split = pair_str.split('\t'); ( pair_str_split.next().unwrap(), @@ -54,18 +85,19 @@ fn main() -> Result<()> { .collect::>(); let mut process_list_bin = - File::create(format!("{out_dir}/{smt_bit_str}_process_list.bin"))?; + File::create(format!("{out_dir}/{process_type_bit_str}_process_list.bin"))?; process_list_bin.write_all(process_list.join("\n").as_bytes())?; let process_replace_list = process_dict .iter() .map(|(_, &val)| val) .collect::>(); - let mut process_replace_list_bin = - File::create(format!("{out_dir}/{smt_bit_str}_process_replace_list.bin"))?; + let mut process_replace_list_bin = File::create(format!( + "{out_dir}/{process_type_bit_str}_process_replace_list.bin" + ))?; process_replace_list_bin.write_all(process_replace_list.join("\n").as_bytes())?; - if smt_bit_str == "pinyin" { + if process_type_bit_str == "pinyin" { let process_replace_list = process_dict .iter() .map(|(_, &val)| val.trim_matches(' ')) @@ -75,7 +107,7 @@ fn main() -> Result<()> { 
process_replace_list_bin.write_all(process_replace_list.join("\n").as_bytes())?; } - if ["fanjian", "pinyin"].contains(&smt_bit_str) { + if ["fanjian", "pinyin"].contains(&process_type_bit_str) { let matcher: CharwiseDoubleArrayAhoCorasick = CharwiseDoubleArrayAhoCorasickBuilder::new() .match_kind(DoubleArrayAhoCorasickMatchKind::Standard) @@ -83,13 +115,13 @@ fn main() -> Result<()> { .unwrap(); let matcher_bytes = matcher.serialize(); let mut matcher_bin = File::create(format!( - "{out_dir}/{smt_bit_str}_daachorse_charwise_u32_matcher.bin" + "{out_dir}/{process_type_bit_str}_daachorse_charwise_u32_matcher.bin" ))?; matcher_bin.write_all(&matcher_bytes)?; } #[cfg(not(feature = "dfa"))] - if smt_bit_str == "normalize" { + if process_type_bit_str == "normalize" { let matcher: CharwiseDoubleArrayAhoCorasick = CharwiseDoubleArrayAhoCorasickBuilder::new() .match_kind(DoubleArrayAhoCorasickMatchKind::LeftmostLongest) @@ -97,24 +129,43 @@ fn main() -> Result<()> { .unwrap(); let matcher_bytes = matcher.serialize(); let mut matcher_bin = File::create(format!( - "{out_dir}/{smt_bit_str}_daachorse_charwise_u32_matcher.bin" + "{out_dir}/{process_type_bit_str}_daachorse_charwise_u32_matcher.bin" ))?; matcher_bin.write_all(&matcher_bytes)?; } } #[cfg(not(feature = "dfa"))] - for smt_bit_str in ["text_delete"] { + { + use std::collections::HashSet; + + /// These constants define deletion and whitespace character mappings + /// that are used within the project. The `TEXT_DELETE` constant + /// includes contents from the `TEXT-DELETE.txt` file in the `process_map` + /// directory, which contains textual patterns to be deleted. + /// The `WHITE_SPACE` constant includes various Unicode whitespace + /// characters that are treated as whitespace in the project's text + /// processing logic. + /// + /// - `TEXT_DELETE` includes patterns of text identified for deletion. + /// - `WHITE_SPACE` includes numerous Unicode representations of whitespace. 
+ const TEXT_DELETE: &str = include_str!("./process_map/TEXT-DELETE.txt"); + const WHITE_SPACE: &[&str] = &[ + "\u{0009}", "\u{000A}", "\u{000B}", "\u{000C}", "\u{000D}", "\u{0020}", "\u{0085}", + "\u{00A0}", "\u{1680}", "\u{2000}", "\u{2001}", "\u{2002}", "\u{2003}", "\u{2004}", + "\u{2005}", "\u{2006}", "\u{2007}", "\u{2008}", "\u{2009}", "\u{200A}", "\u{200D}", + "\u{200F}", "\u{2028}", "\u{2029}", "\u{202F}", "\u{205F}", "\u{3000}", + ]; + let mut process_set = HashSet::new(); - for str_conv_map in process_str_conv_map.get(smt_bit_str).unwrap() { - process_set.extend(str_conv_map.trim().lines().map(|line| line)); - } + process_set.extend(TEXT_DELETE.trim().lines().map(|line| line)); + process_set.extend(WHITE_SPACE); let process_list = process_set.iter().map(|&s| s).collect::>(); let mut process_list_bin = - File::create(format!("{out_dir}/{smt_bit_str}_process_list.bin"))?; + File::create(format!("{out_dir}/{process_type_bit_str}_process_list.bin"))?; process_list_bin.write_all(process_list.join("\n").as_bytes())?; let matcher: CharwiseDoubleArrayAhoCorasick = @@ -124,7 +175,7 @@ fn main() -> Result<()> { .unwrap(); let matcher_bytes = matcher.serialize(); let mut matcher_bin = File::create(format!( - "{out_dir}/{smt_bit_str}_daachorse_charwise_u32_matcher.bin" + "{out_dir}/{process_type_bit_str}_daachorse_charwise_u32_matcher.bin" ))?; matcher_bin.write_all(&matcher_bytes)?; } diff --git a/matcher_rs/str_conv/FANJIAN.txt b/matcher_rs/process_map/FANJIAN.txt similarity index 100% rename from matcher_rs/str_conv/FANJIAN.txt rename to matcher_rs/process_map/FANJIAN.txt diff --git a/matcher_rs/str_conv/NORM.txt b/matcher_rs/process_map/NORM.txt similarity index 100% rename from matcher_rs/str_conv/NORM.txt rename to matcher_rs/process_map/NORM.txt diff --git a/matcher_rs/str_conv/NUM-NORM.txt b/matcher_rs/process_map/NUM-NORM.txt similarity index 100% rename from matcher_rs/str_conv/NUM-NORM.txt rename to matcher_rs/process_map/NUM-NORM.txt diff --git 
a/matcher_rs/str_conv/PINYIN.txt b/matcher_rs/process_map/PINYIN.txt similarity index 100% rename from matcher_rs/str_conv/PINYIN.txt rename to matcher_rs/process_map/PINYIN.txt diff --git a/matcher_rs/str_conv/TEXT-DELETE.txt b/matcher_rs/process_map/TEXT-DELETE.txt similarity index 100% rename from matcher_rs/str_conv/TEXT-DELETE.txt rename to matcher_rs/process_map/TEXT-DELETE.txt diff --git a/matcher_rs/src/lib.rs b/matcher_rs/src/lib.rs index ebd414f..51615b3 100644 --- a/matcher_rs/src/lib.rs +++ b/matcher_rs/src/lib.rs @@ -1,8 +1,3 @@ -#[cfg(all(feature = "prebuilt", feature = "runtime_build"))] -compile_error!( - "feature \"prebuilt\" and feature \"runtime_build\" cannot be enabled at the same time" -); - #[cfg(all(target_os = "linux", target_arch = "aarch64"))] #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; @@ -16,12 +11,12 @@ pub use util::word::SimpleWord; mod process; pub use process::process_matcher::{ - build_smt_tree, reduce_text_process, reduce_text_process_with_list, - reduce_text_process_with_tree, text_process, + build_process_type_tree, reduce_text_process, reduce_text_process_emit, + reduce_text_process_with_list, reduce_text_process_with_tree, text_process, ProcessType, }; mod simple_matcher; -pub use simple_matcher::{SimpleMatchType, SimpleMatchTypeWordMap, SimpleMatcher, SimpleResult}; +pub use simple_matcher::{SimpleMatcher, SimpleResult, SimpleTable}; mod regex_matcher; pub use regex_matcher::{RegexMatchType, RegexMatcher, RegexTable}; diff --git a/matcher_rs/src/matcher.rs b/matcher_rs/src/matcher.rs index e2def1f..8a41855 100644 --- a/matcher_rs/src/matcher.rs +++ b/matcher_rs/src/matcher.rs @@ -1,171 +1,96 @@ use std::borrow::Cow; use std::collections::HashMap; +use id_set::IdSet; use nohash_hasher::{IntMap, IntSet}; use sonic_rs::{to_string, Deserialize, Serialize}; -use crate::regex_matcher::{RegexMatchType, RegexMatcher, RegexTable}; -use crate::sim_matcher::{SimMatchType, SimMatcher, 
SimTable}; -use crate::simple_matcher::{SimpleMatchType, SimpleMatcher}; +use crate::process::process_matcher::{ + build_process_type_tree, reduce_text_process_with_tree, ProcessType, ProcessTypeBitNode, +}; +use crate::regex_matcher::{RegexMatchType, RegexMatcher, RegexResult, RegexTable}; +use crate::sim_matcher::{SimMatchType, SimMatcher, SimResult, SimTable}; +use crate::simple_matcher::{SimpleMatcher, SimpleTable}; pub trait TextMatcherTrait<'a, T: MatchResultTrait<'a> + 'a> { - fn is_match(&self, text: &str) -> bool; - fn process(&'a self, text: &str) -> Vec; - fn process_iter(&'a self, text: &str) -> Box + 'a> { + fn is_match(&'a self, text: &'a str) -> bool; + fn _is_match_with_processed_text_process_type_set( + &'a self, + processed_text_process_type_set: &[(Cow<'a, str>, IdSet)], + ) -> bool; + fn process(&'a self, text: &'a str) -> Vec; + fn _process_with_processed_text_process_type_set( + &'a self, + processed_text_process_type_set: &[(Cow<'a, str>, IdSet)], + ) -> Vec; + fn process_iter(&'a self, text: &'a str) -> Box + 'a> { Box::new(self.process(text).into_iter()) } - fn batch_process(&'a self, text_array: &[&str]) -> Vec> { + fn batch_process(&'a self, text_array: &[&'a str]) -> Vec> { text_array.iter().map(|&text| self.process(text)).collect() } } pub trait MatchResultTrait<'a> { - fn word_id(&self) -> u32 { - 0 - } - fn table_id(&self) -> u32 { - 0 - } + fn match_id(&self) -> u32; + fn table_id(&self) -> u32; + fn word_id(&self) -> u32; fn word(&self) -> &str; - fn similarity(&self) -> f64 { - 1.0 - } + fn similarity(&self) -> f64; } -/// An enumeration representing the different types of matching strategies available for a match table. -/// -/// This enum defines the various strategies that can be applied when attempting to match text -/// within a table. Each variant encapsulates the specific configuration required for that type of matching. 
-/// -/// # Variants -/// -/// * `Simple { simple_match_type }` - Indicates the use of a simple matching strategy. Contains a `simple_match_type` field of type [SimpleMatchType]. -/// * `Regex { regex_match_type }` - Indicates the use of a regular expression matching strategy. Contains a `regex_match_type` field of type [RegexMatchType]. -/// * `Similar { sim_match_type, threshold }` - Indicates the use of a similarity-based matching strategy. Contains a `sim_match_type` field of type [SimMatchType] and a `threshold` field of type [f64]. -/// -/// # Serde Attributes -/// -/// The `snake_case` renaming strategy is used for serialization and deserialization to ensure -/// that the field names in the serialized output conform to the snake_case convention. -/// -/// # Example -/// -/// ``` -/// use matcher_rs::{MatchTableType, SimpleMatchType, RegexMatchType, SimMatchType}; -/// -/// let simple_match = MatchTableType::Simple { -/// simple_match_type: SimpleMatchType::None, -/// }; -/// -/// let regex_match = MatchTableType::Regex { -/// regex_match_type: RegexMatchType::Regex, -/// }; -/// -/// let similar_match = MatchTableType::Similar { -/// sim_match_type: SimMatchType::Levenshtein, -/// threshold: 0.8, -/// }; -/// ``` #[derive(Serialize, Deserialize, Clone, Copy, Debug, PartialEq)] -#[serde(untagged)] +#[serde(rename_all = "snake_case")] pub enum MatchTableType { Simple { - simple_match_type: SimpleMatchType, + process_type: ProcessType, }, Regex { regex_match_type: RegexMatchType, + process_type: ProcessType, }, Similar { sim_match_type: SimMatchType, threshold: f64, + process_type: ProcessType, }, } -/// A structure representing a match table configuration used for text matching operations. -/// -/// This structure defines the necessary fields and types required for configuring a match -/// table. Each match table has an associated matching strategy, a list of words to be matched, -/// and a list of exemptions. 
The match table configuration is essential for initializing matchers -/// and performing text matching processes. -/// -/// The structure supports serialization and deserialization through the `serde` library, -/// allowing it to be easily converted to and from various data formats like JSON. -/// -/// # Fields -/// -/// * `table_id` - A [u32] that uniquely identifies the match table in the system. -/// * `match_table_type` - A [MatchTableType] enumeration that specifies the matching strategy to be used. -/// * `word_list` - A [`Vec<&'a str>`] containing the list of words for matching. The use of `&'a str` -/// allows the words to be borrowed, which can optimize memory usage. -/// * `exemption_simple_match_type` - A [SimpleMatchType] indicating the matching strategy for the exemption words. -/// * `exemption_word_list` - A [`Vec<&'a str>`] containing the list of words to be exempted from matching. Like `word_list`, -/// this is also a borrowed vector to allow efficient memory use. -/// -/// # Lifetimes -/// -/// * `'a` - The lifetime associated with the `word_list` and `exemption_word_list` fields, ensuring that the data -/// for the words can be borrowed for efficiency. -/// -/// # Serde Attributes -/// -/// The `borrow` attribute on `word_list` and `exemption_word_list` fields ensures that the deserialized -/// data can borrow from the input data, providing better performance by avoiding unnecessary allocations. #[derive(Serialize, Deserialize, Debug, Clone)] pub struct MatchTable<'a> { pub table_id: u32, pub match_table_type: MatchTableType, #[serde(borrow)] pub word_list: Vec<&'a str>, - pub exemption_simple_match_type: SimpleMatchType, + pub exemption_process_type: ProcessType, #[serde(borrow)] pub exemption_word_list: Vec<&'a str>, } -/// A structure representing the configuration of a word table used in text matching. -/// -/// This structure holds the details of a specific word table and its configuration within -/// the text matching system. 
It includes a unique identifier for the match, the table's -/// identifier, and a flag indicating whether the word table represents an exemption. -/// -/// # Fields -/// -/// * `match_id` - A [u32] representing the identifier of the match within the system. -/// * `table_id` - A [u32] representing the identifier of the table within the system. -/// * `is_exemption` - A [bool] flag that indicates whether the word table is an exemption. #[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] struct WordTableConf { match_id: u32, table_id: u32, + offset: u32, is_exemption: bool, } -/// A structure representing the result of a matching operation. -/// -/// This structure contains details about an individual matching result, -/// including the identifier of the matching table and the matched word itself. -/// -/// # Fields -/// -/// * `match_id` - A [u32] that uniquely identifies the match within the system. -/// * `table_id` - A [u32] that uniquely identifies the table in which the match was found. -/// * `word` - A [Cow<'a, str>] that holds the matched word. The [Cow] type allows the word -/// to be either borrowed from the original data or owned, optimizing for efficient memory use. -/// -/// # Lifetimes -/// -/// * `'a` - The lifetime associated with the `word` field, ensuring that the data -/// for the word can be borrowed for efficiency. 
#[derive(Serialize)] pub struct MatchResult<'a> { pub match_id: u32, pub table_id: u32, + pub word_id: u32, pub word: Cow<'a, str>, + pub similarity: f64, } impl MatchResultTrait<'_> for MatchResult<'_> { + fn match_id(&self) -> u32 { + self.match_id + } fn word_id(&self) -> u32 { - 0 + self.word_id } fn table_id(&self) -> u32 { self.table_id @@ -173,113 +98,60 @@ impl MatchResultTrait<'_> for MatchResult<'_> { fn word(&self) -> &str { self.word.as_ref() } + fn similarity(&self) -> f64 { + self.similarity + } +} + +impl<'a, 'b: 'a> From> for MatchResult<'a> { + fn from(sim_result: SimResult<'b>) -> Self { + MatchResult { + match_id: sim_result.match_id, + table_id: sim_result.table_id, + word_id: sim_result.word_id, + word: sim_result.word, + similarity: sim_result.similarity, + } + } +} + +impl<'a, 'b: 'a> From> for MatchResult<'a> { + fn from(regex_result: RegexResult<'b>) -> Self { + MatchResult { + match_id: regex_result.match_id, + table_id: regex_result.table_id, + word_id: regex_result.word_id, + word: regex_result.word, + similarity: 1.0, + } + } } pub type MatchTableMap<'a> = IntMap>>; -/// The [Matcher] struct encapsulates various matching strategies and their configurations used for text processing. -/// -/// This structure holds configurations for simple, regex, and similarity-based matchers. It manages -/// different maps and matchers necessary to perform text matching operations. -/// -/// # Fields -/// -/// * `simple_word_table_conf_map` - An [IntMap] that maps word table configuration IDs to their configurations. -/// * `simple_word_table_conf_id_map` - An [IntMap] that maps word IDs to their corresponding word table configuration IDs. -/// * `simple_matcher` - An [`Option`] that holds the simple matcher if it exists. -/// * `regex_matcher` - An [`Option`] that holds the regex matcher if it exists. -/// * `sim_matcher` - An [`Option`] that holds the similarity matcher if it exists. 
-/// -/// The [Matcher] struct is typically instantiated through the [new](Matcher::new) method, which processes an input map of match tables -/// and initializes the appropriate matchers and data structures. -/// -/// # Example -/// -/// ``` -/// use matcher_rs::{Matcher, MatchTable, MatchTableType, SimpleMatchType}; -/// use std::collections::HashMap; -/// -/// let mut match_table_map = HashMap::new(); -/// match_table_map.insert( -/// 1, -/// vec![MatchTable { -/// table_id: 1, -/// match_table_type: MatchTableType::Simple { simple_match_type: SimpleMatchType::None }, -/// word_list: vec!["apple", "banana"], -/// exemption_simple_match_type: SimpleMatchType::None, -/// exemption_word_list: vec!["orange"], -/// }], -/// ); -/// -/// let matcher = Matcher::new(&match_table_map); -/// ``` #[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct Matcher { - simple_word_table_conf_map: IntMap, - simple_word_table_conf_id_map: IntMap, + process_type_tree: Vec, + simple_word_table_conf_list: Vec, + simple_word_table_conf_index_list: Vec, simple_matcher: Option, regex_matcher: Option, sim_matcher: Option, } impl Matcher { - /// Creates a new [Matcher] instance from the provided match table map. - /// - /// This function processes the input map of match tables to initialize the various - /// components of the [Matcher] including simple, regex, and similarity-based matchers. - /// - /// # Arguments - /// - /// * `match_table_map` - A reference to a [HashMap] where the keys are [u32] identifiers - /// and the values are vectors of [MatchTable] instances representing different types of match tables. - /// - /// # Returns - /// - /// A [Matcher] instance initialized with the configurations derived from the provided match table map. - /// - /// The construction process involves: - /// - /// 1. Iterating through the provided match table map. - /// 2. 
Extracting table configurations and populating the corresponding matcher-specific data structures: - /// - Simple match type word map - /// - Regex table list - /// - Similarity table list - /// 3. Handling exemptions by updating the word table configurations. - /// - /// The word and table identifiers are incremented as new entries are processed and added. - /// - /// # Example - /// - /// ``` - /// use matcher_rs::{Matcher, MatchTable, MatchTableType, SimpleMatchType}; - /// use std::collections::HashMap; - /// - /// let mut match_table_map = HashMap::new(); - /// match_table_map.insert( - /// 1, - /// vec![MatchTable { - /// table_id: 1, - /// match_table_type: MatchTableType::Simple { simple_match_type: SimpleMatchType::None }, - /// word_list: vec!["apple", "banana"], - /// exemption_simple_match_type: SimpleMatchType::None, - /// exemption_word_list: vec!["orange"], - /// }], - /// ); - /// - /// let matcher = Matcher::new(&match_table_map); - /// ``` - pub fn new<'a, S>(match_table_map: &HashMap>, S>) -> Matcher { - let mut word_id: u32 = 0; - let mut word_table_conf_id: u32 = 0; - - let mut simple_word_table_conf_map = IntMap::default(); - let mut simple_word_table_conf_id_map = IntMap::default(); - - let mut smt_word_map: IntMap> = IntMap::default(); - - let mut regex_table_list: Vec = Vec::new(); - let mut sim_table_list: Vec = Vec::new(); + pub fn new(match_table_map: &HashMap>, S>) -> Matcher { + let mut process_type_list = Vec::new(); + + let mut simple_word_id = 0; + let mut simple_word_table_conf_id = 0; + let mut simple_word_table_conf_list = Vec::new(); + let mut simple_word_table_conf_index_list = Vec::new(); + let mut simple_table: SimpleTable = IntMap::default(); + + let mut regex_table_list = Vec::new(); + let mut sim_table_list = Vec::new(); for (&match_id, table_list) in match_table_map { for table in table_list { @@ -290,41 +162,49 @@ impl Matcher { if !word_list.is_empty() { match match_table_type { - MatchTableType::Simple { 
simple_match_type } => { - simple_word_table_conf_map.insert( - word_table_conf_id, - WordTableConf { - match_id, - table_id, - is_exemption: false, - }, - ); - - let simple_word_map = - smt_word_map.entry(simple_match_type).or_default(); + MatchTableType::Simple { process_type } => { + process_type_list.push(process_type); + simple_word_table_conf_list.push(WordTableConf { + match_id, + table_id, + offset: simple_word_id, + is_exemption: false, + }); + + let simple_word_map = simple_table.entry(process_type).or_default(); for word in word_list.iter() { - simple_word_table_conf_id_map.insert(word_id, word_table_conf_id); - simple_word_map.insert(word_id, word); - word_id += 1; + simple_word_table_conf_index_list.push(simple_word_table_conf_id); + simple_word_map.insert(simple_word_id, word); + simple_word_id += 1; } - word_table_conf_id += 1 + simple_word_table_conf_id += 1 } MatchTableType::Similar { + process_type, sim_match_type, threshold, - } => sim_table_list.push(SimTable { - table_id, - match_id, - sim_match_type, - word_list, - threshold, - }), - MatchTableType::Regex { regex_match_type } => { + } => { + process_type_list.push(process_type); + sim_table_list.push(SimTable { + table_id, + match_id, + process_type, + sim_match_type, + word_list, + threshold, + }) + } + MatchTableType::Regex { + process_type, + regex_match_type, + } => { + process_type_list.push(process_type); regex_table_list.push(RegexTable { table_id, match_id, + process_type, regex_match_type, word_list, }) @@ -333,186 +213,184 @@ impl Matcher { } if !exemption_word_list.is_empty() { - simple_word_table_conf_map.insert( - word_table_conf_id, - WordTableConf { - match_id, - table_id, - is_exemption: true, - }, - ); - - let simple_word_map = smt_word_map - .entry(table.exemption_simple_match_type) + process_type_list.push(table.exemption_process_type); + simple_word_table_conf_list.push(WordTableConf { + match_id, + table_id, + offset: simple_word_id, + is_exemption: true, + }); + + let 
simple_word_map = simple_table + .entry(table.exemption_process_type) .or_default(); for exemption_word in exemption_word_list.iter() { - simple_word_table_conf_id_map.insert(word_id, word_table_conf_id); - simple_word_map.insert(word_id, exemption_word); - word_id += 1; + simple_word_table_conf_index_list.push(simple_word_table_conf_id); + simple_word_map.insert(simple_word_id, exemption_word); + simple_word_id += 1; } - word_table_conf_id += 1 + simple_word_table_conf_id += 1 } } } + let process_type_tree = build_process_type_tree(&process_type_list); + Matcher { - simple_word_table_conf_map, - simple_word_table_conf_id_map, - simple_matcher: (!smt_word_map.is_empty()).then(|| SimpleMatcher::new(&smt_word_map)), + process_type_tree, + simple_word_table_conf_list, + simple_word_table_conf_index_list, + simple_matcher: (!simple_table.is_empty()).then(|| SimpleMatcher::new(&simple_table)), regex_matcher: (!regex_table_list.is_empty()) .then(|| RegexMatcher::new(®ex_table_list)), sim_matcher: (!sim_table_list.is_empty()).then(|| SimMatcher::new(&sim_table_list)), } } - /// Matches the provided text and returns the raw results as a [HashMap] with match identifiers and vectors of [MatchResult]s. - /// - /// This function takes a string slice representing the text to be matched and processes it using the available - /// matchers (simple, regex, and similarity matchers). It gathers the matching results into a [HashMap] where - /// the keys are match identifiers and the values are vectors of [MatchResult] instances. - /// - /// The function proceeds through the following steps: - /// - /// 1. **Regex Matching**: If a regex matcher is available, processes the text with it and collects the results. - /// 2. **Similarity Matching**: If a similarity matcher is available, processes the text with it and collects the results. - /// 3. **Simple Matching**: If a simple matcher is available, processes the text with it. 
It also checks for exemptions - /// and updates the match results accordingly. - /// - /// # Arguments - /// - /// * `text` - A string slice representing the text to be matched. - /// - /// # Returns - /// - /// A [`HashMap>`] where the keys are match identifiers and the values are vectors of [MatchResult] - /// instances containing the matching results for each identifier. - /// - /// If the provided text is empty, the function returns an empty [HashMap]. - pub fn word_match(&self, text: &str) -> HashMap> { - if !text.is_empty() { - let mut match_result_dict = HashMap::new(); - let mut failed_match_table_id_set = IntSet::default(); - - if let Some(regex_matcher) = &self.regex_matcher { - for regex_result in regex_matcher.process(text) { - let result_list = match_result_dict - .entry(regex_result.match_id) - .or_insert(Vec::new()); + pub fn word_match<'a>(&'a self, text: &'a str) -> HashMap> { + if text.is_empty() { + return HashMap::default(); + } - result_list.push(MatchResult { - match_id: regex_result.match_id, - table_id: regex_result.table_id, - word: regex_result.word, - }) - } + let processed_text_process_type_set = + reduce_text_process_with_tree(&self.process_type_tree, text); + + self._word_match_with_processed_text_process_type_set(&processed_text_process_type_set) + } + + fn _word_match_with_processed_text_process_type_set<'a>( + &'a self, + processed_text_process_type_set: &[(Cow<'a, str>, IdSet)], + ) -> HashMap> { + let mut match_result_dict = HashMap::new(); + let mut failed_match_table_id_set = IntSet::default(); + + if let Some(regex_matcher) = &self.regex_matcher { + for regex_result in regex_matcher + ._process_with_processed_text_process_type_set(processed_text_process_type_set) + { + let result_list: &mut Vec = match_result_dict + .entry(regex_result.match_id) + .or_insert(Vec::new()); + + result_list.push(regex_result.into()); } + } - if let Some(sim_matcher) = &self.sim_matcher { - for sim_result in sim_matcher.process(text) { - let 
result_list = match_result_dict - .entry(sim_result.match_id) - .or_insert(Vec::new()); + if let Some(sim_matcher) = &self.sim_matcher { + for sim_result in sim_matcher + ._process_with_processed_text_process_type_set(processed_text_process_type_set) + { + let result_list = match_result_dict + .entry(sim_result.match_id) + .or_insert(Vec::new()); - result_list.push(MatchResult { - match_id: sim_result.match_id, - table_id: sim_result.table_id, - word: sim_result.word, - }) - } + result_list.push(sim_result.into()); } + } - if let Some(simple_matcher) = &self.simple_matcher { - for simple_result in simple_matcher.process(text) { - // Guaranteed not failed - let word_table_conf = unsafe { - self.simple_word_table_conf_map - .get( - self.simple_word_table_conf_id_map - .get(&simple_result.word_id) - .unwrap_unchecked(), - ) - .unwrap_unchecked() - }; - let match_table_id = ((word_table_conf.match_id as u64) << 32) - | (word_table_conf.table_id as u64); - - if failed_match_table_id_set.contains(&match_table_id) { - continue; - } + if let Some(simple_matcher) = &self.simple_matcher { + for simple_result in simple_matcher + ._process_with_processed_text_process_type_set(processed_text_process_type_set) + { + // Guaranteed not failed + let word_table_conf = unsafe { + self.simple_word_table_conf_list.get_unchecked( + *self + .simple_word_table_conf_index_list + .get_unchecked(simple_result.word_id as usize), + ) + }; + let match_table_id = + ((word_table_conf.match_id as u64) << 32) | (word_table_conf.table_id as u64); + + if failed_match_table_id_set.contains(&match_table_id) { + continue; + } - let result_list = match_result_dict - .entry(word_table_conf.match_id) - .or_insert(Vec::new()); - if word_table_conf.is_exemption { - failed_match_table_id_set.insert(match_table_id); - result_list.retain(|match_result| { - match_result.table_id != word_table_conf.table_id - }); - } else { - result_list.push(MatchResult { - match_id: word_table_conf.match_id, - table_id: 
word_table_conf.table_id, - word: simple_result.word, - }); - } + let result_list = match_result_dict + .entry(word_table_conf.match_id) + .or_insert(Vec::new()); + if word_table_conf.is_exemption { + failed_match_table_id_set.insert(match_table_id); + result_list + .retain(|match_result| match_result.table_id != word_table_conf.table_id); + } else { + result_list.push(MatchResult { + match_id: word_table_conf.match_id, + table_id: word_table_conf.table_id, + word_id: unsafe { + simple_result.word_id.unchecked_sub(word_table_conf.offset) + }, + word: simple_result.word, + similarity: 1.0, + }); } } - - match_result_dict.retain(|_, match_result_list| !match_result_list.is_empty()); - match_result_dict - } else { - HashMap::new() } + + match_result_dict.retain(|_, match_result_list| !match_result_list.is_empty()); + match_result_dict } - /// Matches the provided text and returns the raw results as a serialized JSON string. - /// - /// This function takes a string slice representing the text to be matched and processes it using the available - /// matchers (simple, regex, and similarity matchers). It gathers the matching results into a [HashMap] where - /// the keys are match identifiers and the values are vectors of [MatchResult] instances. The results are then - /// serialized into a JSON string using the [to_string] function from the [sonic_rs] crate. - /// - /// # Arguments - /// - /// * `text` - A string slice representing the text to be matched. - /// - /// # Returns - /// - /// A [String] containing the serialized JSON representation of the raw matching results. - /// - /// # Safety - /// - /// The function uses an `unsafe` block to call [unwrap_unchecked](Result::unwrap_unchecked) on the [to_string] function, which skips - /// the error checking for performance optimization. It is important to ensure that the serialization process - /// does not fail, as [unwrap_unchecked](Result::unwrap_unchecked) will cause undefined behavior if an error occurs. 
pub fn word_match_as_string(&self, text: &str) -> String { + if text.is_empty() { + return String::from("{}"); + } unsafe { to_string(&self.word_match(text)).unwrap_unchecked() } } } impl<'a> TextMatcherTrait<'a, MatchResult<'a>> for Matcher { fn is_match(&self, text: &str) -> bool { - !self.word_match(text).is_empty() + let processed_text_process_type_set = + reduce_text_process_with_tree(&self.process_type_tree, text); + + self._is_match_with_processed_text_process_type_set(&processed_text_process_type_set) + } + + fn _is_match_with_processed_text_process_type_set( + &'a self, + processed_text_process_type_set: &[(Cow<'a, str>, IdSet)], + ) -> bool { + match &self.simple_matcher { + Some(_) => !self + ._word_match_with_processed_text_process_type_set(processed_text_process_type_set) + .is_empty(), + None => { + if let Some(regex_matcher) = &self.regex_matcher { + if regex_matcher._is_match_with_processed_text_process_type_set( + processed_text_process_type_set, + ) { + return true; + } + } + if let Some(sim_matcher) = &self.sim_matcher { + if sim_matcher._is_match_with_processed_text_process_type_set( + processed_text_process_type_set, + ) { + return true; + } + } + false + } + } + } + + fn process(&'a self, text: &'a str) -> Vec> { + let processed_text_process_type_set = + reduce_text_process_with_tree(&self.process_type_tree, text); + + self._process_with_processed_text_process_type_set(&processed_text_process_type_set) } - /// Processes the provided text and returns a vector of [MatchResult] instances. - /// - /// This function takes a string slice representing the text to be processed and matches it using the available - /// matchers (simple, regex, and similarity matchers). It gathers the matching results and organizes them - /// by their respective match identifiers. The results for each match identifier are then flattened into a single - /// vector of [MatchResult] instances. 
- /// - /// # Arguments - /// - /// * `text` - A string slice representing the text to be processed. - /// - /// # Returns - /// - /// A [Vec] of [MatchResult] instances containing the matching results for all match identifiers. - fn process(&'a self, text: &str) -> Vec> { - self.word_match(text) + fn _process_with_processed_text_process_type_set( + &'a self, + processed_text_process_type_set: &[(Cow<'a, str>, IdSet)], + ) -> Vec> { + self._word_match_with_processed_text_process_type_set(processed_text_process_type_set) .into_iter() .flat_map(|(_, result_list)| result_list) // Flatten the result lists from all match IDs into a single iterator. .collect() diff --git a/matcher_rs/src/process/constants.rs b/matcher_rs/src/process/constants.rs index a7fe784..dc61a04 100644 --- a/matcher_rs/src/process/constants.rs +++ b/matcher_rs/src/process/constants.rs @@ -1,65 +1,63 @@ -#[cfg(feature = "runtime_build")] -pub mod runtime_build_feature { - pub const FANJIAN: &str = include_str!("../../str_conv/FANJIAN.txt"); - pub const TEXT_DELETE: &str = include_str!("../../str_conv/TEXT-DELETE.txt"); - pub const NUM_NORM: &str = include_str!("../../str_conv/NUM-NORM.txt"); - pub const NORM: &str = include_str!("../../str_conv/NORM.txt"); - pub const PINYIN: &str = include_str!("../../str_conv/PINYIN.txt"); - - pub const WHITE_SPACE: &[&str] = &[ - "\u{0009}", "\u{000A}", "\u{000B}", "\u{000C}", "\u{000D}", "\u{0020}", "\u{0085}", - "\u{00A0}", "\u{1680}", "\u{2000}", "\u{2001}", "\u{2002}", "\u{2003}", "\u{2004}", - "\u{2005}", "\u{2006}", "\u{2007}", "\u{2008}", "\u{2009}", "\u{200A}", "\u{200D}", - "\u{200F}", "\u{2028}", "\u{2029}", "\u{202F}", "\u{205F}", "\u{3000}", - ]; -} +pub const WHITE_SPACE: &[&str] = &[ + "\u{0009}", "\u{000A}", "\u{000B}", "\u{000C}", "\u{000D}", "\u{0020}", "\u{0085}", "\u{00A0}", + "\u{1680}", "\u{2000}", "\u{2001}", "\u{2002}", "\u{2003}", "\u{2004}", "\u{2005}", "\u{2006}", + "\u{2007}", "\u{2008}", "\u{2009}", "\u{200A}", "\u{200D}", "\u{200F}", 
"\u{2028}", "\u{2029}", + "\u{202F}", "\u{205F}", "\u{3000}", +]; -#[cfg(feature = "prebuilt")] -pub mod prebuilt_feature { - #[cfg(feature = "dfa")] - pub const NORMALIZE_PROCESS_LIST_STR: &str = - include_str!(concat!(env!("OUT_DIR"), "/normalize_process_list.bin")); - #[cfg(not(feature = "dfa"))] - pub const NORMALIZE_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!( - env!("OUT_DIR"), - "/normalize_daachorse_charwise_u32_matcher.bin" - )); - pub const NORMALIZE_PROCESS_REPLACE_LIST_STR: &str = include_str!(concat!( - env!("OUT_DIR"), - "/normalize_process_replace_list.bin" - )); +#[cfg(feature = "runtime_build")] +pub const FANJIAN: &str = include_str!("../../process_map/FANJIAN.txt"); +#[cfg(feature = "runtime_build")] +pub const TEXT_DELETE: &str = include_str!("../../process_map/TEXT-DELETE.txt"); +#[cfg(feature = "runtime_build")] +pub const NUM_NORM: &str = include_str!("../../process_map/NUM-NORM.txt"); +#[cfg(feature = "runtime_build")] +pub const NORM: &str = include_str!("../../process_map/NORM.txt"); +#[cfg(feature = "runtime_build")] +pub const PINYIN: &str = include_str!("../../process_map/PINYIN.txt"); - pub const FANJIAN_PROCESS_REPLACE_LIST_STR: &str = include_str!(concat!( - env!("OUT_DIR"), - "/fanjian_process_replace_list.bin" - )); - pub const FANJIAN_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!( - env!("OUT_DIR"), - "/fanjian_daachorse_charwise_u32_matcher.bin" - )); - pub const PINYIN_PROCESS_REPLACE_LIST_STR: &str = - include_str!(concat!(env!("OUT_DIR"), "/pinyin_process_replace_list.bin")); - pub const PINYINCHAR_PROCESS_REPLACE_LIST_STR: &str = include_str!(concat!( - env!("OUT_DIR"), - "/pinyinchar_process_replace_list.bin" - )); - pub const PINYIN_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!( - env!("OUT_DIR"), - "/pinyin_daachorse_charwise_u32_matcher.bin" - )); +#[cfg(all(not(feature = "runtime_build"), feature = "dfa"))] +pub const NORMALIZE_PROCESS_LIST_STR: &str = + include_str!(concat!(env!("OUT_DIR"), 
"/normalize_process_list.bin")); +#[cfg(all(not(feature = "runtime_build"), not(feature = "dfa")))] +pub const NORMALIZE_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!( + env!("OUT_DIR"), + "/normalize_daachorse_charwise_u32_matcher.bin" +)); +#[cfg(not(feature = "runtime_build"))] +pub const NORMALIZE_PROCESS_REPLACE_LIST_STR: &str = include_str!(concat!( + env!("OUT_DIR"), + "/normalize_process_replace_list.bin" +)); - #[cfg(feature = "dfa")] - pub const TEXT_DELETE: &str = include_str!("../../str_conv/TEXT-DELETE.txt"); - #[cfg(not(feature = "dfa"))] - pub const TEXT_DELETE_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!( - env!("OUT_DIR"), - "/text_delete_daachorse_charwise_u32_matcher.bin" - )); +#[cfg(not(feature = "runtime_build"))] +pub const FANJIAN_PROCESS_REPLACE_LIST_STR: &str = include_str!(concat!( + env!("OUT_DIR"), + "/fanjian_process_replace_list.bin" +)); +#[cfg(not(feature = "runtime_build"))] +pub const FANJIAN_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!( + env!("OUT_DIR"), + "/fanjian_daachorse_charwise_u32_matcher.bin" +)); +#[cfg(not(feature = "runtime_build"))] +pub const PINYIN_PROCESS_REPLACE_LIST_STR: &str = + include_str!(concat!(env!("OUT_DIR"), "/pinyin_process_replace_list.bin")); +#[cfg(not(feature = "runtime_build"))] +pub const PINYINCHAR_PROCESS_REPLACE_LIST_STR: &str = include_str!(concat!( + env!("OUT_DIR"), + "/pinyinchar_process_replace_list.bin" +)); +#[cfg(not(feature = "runtime_build"))] +pub const PINYIN_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!( + env!("OUT_DIR"), + "/pinyin_daachorse_charwise_u32_matcher.bin" +)); - pub const WHITE_SPACE: &[&str] = &[ - "\u{0009}", "\u{000A}", "\u{000B}", "\u{000C}", "\u{000D}", "\u{0020}", "\u{0085}", - "\u{00A0}", "\u{1680}", "\u{2000}", "\u{2001}", "\u{2002}", "\u{2003}", "\u{2004}", - "\u{2005}", "\u{2006}", "\u{2007}", "\u{2008}", "\u{2009}", "\u{200A}", "\u{200D}", - "\u{200F}", "\u{2028}", "\u{2029}", "\u{202F}", "\u{205F}", "\u{3000}", - ]; -} 
+#[cfg(all(not(feature = "runtime_build"), feature = "dfa"))] +pub const TEXT_DELETE: &str = include_str!("../../process_map/TEXT-DELETE.txt"); +#[cfg(all(not(feature = "runtime_build"), not(feature = "dfa")))] +pub const TEXT_DELETE_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!( + env!("OUT_DIR"), + "/text_delete_daachorse_charwise_u32_matcher.bin" +)); diff --git a/matcher_rs/src/process/process_matcher.rs b/matcher_rs/src/process/process_matcher.rs index 69fb576..9cfcd5b 100644 --- a/matcher_rs/src/process/process_matcher.rs +++ b/matcher_rs/src/process/process_matcher.rs @@ -1,11 +1,15 @@ use std::borrow::Cow; +use std::fmt::Display; use std::sync::Arc; -use ahash::{AHashMap, HashMapExt}; -use aho_corasick_unsafe::{ - AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, MatchKind as AhoCorasickMatchKind, -}; -#[cfg(feature = "prebuilt")] +#[cfg(any(feature = "runtime_build", feature = "dfa"))] +use ahash::AHashMap; +use ahash::HashMapExt; +use aho_corasick_unsafe::AhoCorasick; +#[cfg(any(feature = "runtime_build", feature = "dfa"))] +use aho_corasick_unsafe::{AhoCorasickBuilder, AhoCorasickKind, MatchKind as AhoCorasickMatchKind}; +use bitflags::bitflags; +#[cfg(not(feature = "runtime_build"))] use daachorse::CharwiseDoubleArrayAhoCorasick; #[cfg(feature = "runtime_build")] use daachorse::{ @@ -14,38 +18,66 @@ use daachorse::{ }; use id_set::IdSet; use lazy_static::lazy_static; -use nohash_hasher::IntMap; +use nohash_hasher::{IntMap, IsEnabled}; use parking_lot::RwLock; -#[cfg(feature = "serde")] +use serde::{Deserializer, Serializer}; use sonic_rs::{Deserialize, Serialize}; use tinyvec::ArrayVec; -#[cfg(feature = "prebuilt")] -use crate::process::constants::prebuilt_feature::*; +use crate::process::constants::*; + +bitflags! 
{ + #[derive(Hash, PartialEq, Eq, Clone, Copy, Debug, Default)] + pub struct ProcessType: u8 { + const None = 0b00000001; + const Fanjian = 0b00000010; + const Delete = 0b00000100; + const Normalize = 0b00001000; + const DeleteNormalize = 0b00001100; + const FanjianDeleteNormalize = 0b00001110; + const PinYin = 0b00010000; + const PinYinChar = 0b00100000; + } +} -#[cfg(feature = "runtime_build")] -use crate::process::constants::runtime_build_feature::*; +impl Serialize for ProcessType { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + self.bits().serialize(serializer) + } +} + +impl<'de> Deserialize<'de> for ProcessType { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let bits: u8 = u8::deserialize(deserializer)?; + Ok(ProcessType::from_bits_retain(bits)) + } +} + +impl Display for ProcessType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let display_str_list = self + .iter_names() + .map(|(name, _)| name.to_lowercase()) + .collect::>(); + write!(f, "{:?}", display_str_list.join("_")) + } +} -use crate::SimpleMatchType; +impl IsEnabled for ProcessType {} -type ProcessMatcherCache = - RwLock, ProcessMatcher)>>>; +type ProcessMatcherCache = RwLock, ProcessMatcher)>>>; lazy_static! { pub static ref PROCESS_MATCHER_CACHE: ProcessMatcherCache = RwLock::new(IntMap::with_capacity(8)); } -/// [ProcessMatcher] is an enum designed to differentiate between matching strategies based on the input text type. -/// -/// This enum is used as part of the text processing framework, allowing for specialized handling of Chinese text -/// compared to other types of text. It supports two variants: -/// -/// - [Chinese](ProcessMatcher::Chinese): Utilizes a [`CharwiseDoubleArrayAhoCorasick`] matcher optimized for Chinese characters. -/// - [Others](ProcessMatcher::Others): Uses an [AhoCorasick] matcher for all other types of text. 
-/// -/// By distinguishing between these two categories, [ProcessMatcher] allows for more efficient and accurate pattern -/// matching tailored to the linguistic properties of the text being processed. #[derive(Clone)] pub enum ProcessMatcher { #[cfg(not(feature = "dfa"))] @@ -55,28 +87,6 @@ pub enum ProcessMatcher { } impl ProcessMatcher { - /// Replaces all occurrences of patterns in the input text with corresponding replacements from the provided list. - /// - /// This function performs a find-and-replace operation on the input text. It searches for patterns using the internal matcher - /// (either [`CharwiseDoubleArrayAhoCorasick`] for Chinese text or [AhoCorasick] for other text) and replaces each match - /// with the corresponding replacement string from the given `process_replace_list`. - /// - /// # Parameters - /// - /// * `text`: A reference to the input text where replacements will be made. - /// * `process_replace_list`: A slice of replacement strings. Each match from the internal matcher is replaced with the - /// corresponding string from this list. - /// - /// # Returns - /// - /// * `(bool, Cow<'a, str>)`: A tuple where the first element is a boolean indicating whether any replacements were made, - /// and the second element is a [Cow] string containing the modified text. If no replacements were made, the original text - /// is returned as a [Cow::Borrowed]. - /// - /// # Safety - /// - /// This function uses unsafe code to access slices and indices. This assumes that the match indices and the replacement list - /// indices are always within bounds. #[inline(always)] pub fn replace_all<'a>( &self, @@ -131,25 +141,6 @@ impl ProcessMatcher { } } - /// Deletes all occurrences of patterns in the input text. - /// - /// This function performs a delete operation on the input text. 
It searches for patterns using the internal matcher - /// (either [`CharwiseDoubleArrayAhoCorasick`] for Chinese text or [AhoCorasick] for other text) and removes each match - /// from the input. - /// - /// # Parameters - /// - /// * `text`: A reference to the input text where patterns will be deleted. - /// - /// # Returns - /// - /// * `(bool, Cow<'a, str>)`: A tuple where the first element is a boolean indicating whether any deletions were made, - /// and the second element is a [Cow] string containing the modified text. If no deletions were made, the original text - /// is returned as a [Cow::Borrowed]. - /// - /// # Safety - /// - /// This function uses unsafe code to access slices and indices. This assumes that the match indices are always within bounds. #[inline(always)] pub fn delete_all<'a>(&self, text: &'a str) -> (bool, Cow<'a, str>) { let mut result = String::with_capacity(text.len()); @@ -189,57 +180,24 @@ impl ProcessMatcher { } } -/// Generates a [ProcessMatcher] based on the provided [SimpleMatchType] at runtime. -/// -/// This implementation constructs the matcher and replacement list at runtime based on the specified [SimpleMatchType]. -/// The function generates the matcher data and caches it for future use. -/// -/// # Parameters -/// -/// - `smt_bit`: A variant of [SimpleMatchType] which specifies the type of matching operation to be performed. -/// -/// # Returns -/// -/// - An [`Arc`] containing a tuple: -/// - A vector of replacement patterns ([`Vec<&str>`]). -/// - A [ProcessMatcher] object configured for the specified match type. -/// -/// # Match Types -/// -/// The function supports the following match types: -/// -/// - [SimpleMatchType::None]: Returns an empty matcher. -/// - [SimpleMatchType::Fanjian]: Builds a matcher for Fanjian text normalization using runtime construction. -/// - [SimpleMatchType::WordDelete]: Builds a matcher for deleting whitespace and punctuation. 
-/// - [SimpleMatchType::TextDelete]: Builds a matcher for deleting special text characters and whitespace. -/// - [SimpleMatchType::Normalize]: Builds a matcher for normalizing symbols, text, and numbers. -/// - [SimpleMatchType::PinYin]: Builds a matcher for converting text to PinYin using runtime construction. -/// - [SimpleMatchType::PinYinChar]: Builds a matcher for converting text to PinYin characters using runtime construction. -/// -/// # Notes -/// -/// - The matcher construction utilizes the Aho-Corasick algorithm for efficient pattern matching. -/// - The function retains key-value pairs in the replacement dictionary where the key and value are not identical. -/// - The matcher data is cached to optimize repeated calls with the same match type, improving performance. -/// -/// The function may use either the `Chinese` or `Others` variant of the [ProcessMatcher], depending on the [[SimpleMatchType]]. -#[cfg(feature = "runtime_build")] -pub fn get_process_matcher(smt_bit: SimpleMatchType) -> Arc<(Vec<&'static str>, ProcessMatcher)> { +pub fn get_process_matcher( + process_type_bit: ProcessType, +) -> Arc<(Vec<&'static str>, ProcessMatcher)> { { let process_matcher_cache = PROCESS_MATCHER_CACHE.read(); - if let Some(cached_result) = process_matcher_cache.get(&smt_bit) { + if let Some(cached_result) = process_matcher_cache.get(&process_type_bit) { return Arc::clone(cached_result); } } + #[cfg(feature = "runtime_build")] { let mut process_dict = AHashMap::default(); - match smt_bit { - SimpleMatchType::None => {} - - SimpleMatchType::Fanjian => { + match process_type_bit { + ProcessType::None => {} + ProcessType::Fanjian => { process_dict.extend(FANJIAN.trim().lines().map(|pair_str| { let mut pair_str_split = pair_str.split('\t'); ( @@ -248,18 +206,13 @@ pub fn get_process_matcher(smt_bit: SimpleMatchType) -> Arc<(Vec<&'static str>, ) })); } - - SimpleMatchType::WordDelete => { - process_dict.extend(WHITE_SPACE.iter().map(|&c| (c, ""))); - } - - 
SimpleMatchType::TextDelete => { + ProcessType::Delete => { process_dict.extend(TEXT_DELETE.trim().lines().map(|pair_str| (pair_str, ""))); process_dict.extend(WHITE_SPACE.iter().map(|&c| (c, ""))); } - SimpleMatchType::Normalize => { - for str_conv_map in [NORM, NUM_NORM] { - process_dict.extend(str_conv_map.trim().lines().map(|pair_str| { + ProcessType::Normalize => { + for process_map in [NORM, NUM_NORM] { + process_dict.extend(process_map.trim().lines().map(|pair_str| { let mut pair_str_split = pair_str.split('\t'); ( pair_str_split.next().unwrap(), @@ -268,8 +221,7 @@ pub fn get_process_matcher(smt_bit: SimpleMatchType) -> Arc<(Vec<&'static str>, })); } } - - SimpleMatchType::PinYin => { + ProcessType::PinYin => { process_dict.extend(PINYIN.trim().lines().map(|pair_str| { let mut pair_str_split = pair_str.split('\t'); ( @@ -278,8 +230,7 @@ pub fn get_process_matcher(smt_bit: SimpleMatchType) -> Arc<(Vec<&'static str>, ) })); } - - SimpleMatchType::PinYinChar => { + ProcessType::PinYinChar => { process_dict.extend(PINYIN.trim().lines().map(|pair_str| { let mut pair_str_split = pair_str.split('\t'); ( @@ -293,8 +244,8 @@ pub fn get_process_matcher(smt_bit: SimpleMatchType) -> Arc<(Vec<&'static str>, process_dict.retain(|&key, &mut value| key != value); - let (process_replace_list, process_matcher) = match smt_bit { - SimpleMatchType::Fanjian | SimpleMatchType::PinYin | SimpleMatchType::PinYinChar => ( + let (process_replace_list, process_matcher) = match process_type_bit { + ProcessType::Fanjian | ProcessType::PinYin | ProcessType::PinYinChar => ( process_dict.iter().map(|(_, &val)| val).collect(), ProcessMatcher::Chinese( CharwiseDoubleArrayAhoCorasickBuilder::new() @@ -309,7 +260,7 @@ pub fn get_process_matcher(smt_bit: SimpleMatchType) -> Arc<(Vec<&'static str>, ), ), #[cfg(not(feature = "dfa"))] - SimpleMatchType::TextDelete | SimpleMatchType::Normalize => ( + ProcessType::Delete | ProcessType::Normalize => ( process_dict.iter().map(|(_, &val)| 
val).collect(), ProcessMatcher::LeftMost( CharwiseDoubleArrayAhoCorasickBuilder::new() @@ -339,68 +290,23 @@ pub fn get_process_matcher(smt_bit: SimpleMatchType) -> Arc<(Vec<&'static str>, ), ), }; - let uncached_result = Arc::new((process_replace_list, process_matcher)); let mut process_matcher_cache = PROCESS_MATCHER_CACHE.write(); - process_matcher_cache.insert(smt_bit, Arc::clone(&uncached_result)); - uncached_result + process_matcher_cache.insert(process_type_bit, Arc::clone(&uncached_result)); + return uncached_result; } -} -/// Generates a [ProcessMatcher] based on the provided [SimpleMatchType]. -/// -/// This implementation makes use of prebuilt, serialized data for certain match types to enhance -/// performance by avoiding runtime construction of the matcher and replacement list. The function -/// expects that the relevant data has been compiled with the `prebuilt` feature. -/// -/// # Parameters -/// -/// - `smt_bit`: A variant of [SimpleMatchType] enumerating the various matching strategies. -/// -/// # Returns -/// -/// - A tuple containing: -/// - A vector of replacement patterns ([`Vec<&str>`]). -/// - A [ProcessMatcher] object relevant to the specified match type. -/// -/// # Safety -/// -/// For certain match types like [Fanjian](SimpleMatchType::Fanjian), [PinYin](SimpleMatchType::PinYin), [PinYinChar](SimpleMatchType::PinYinChar), unsafe deserialization is performed -/// using [deserialize_unchecked](CharwiseDoubleArrayAhoCorasick::deserialize_unchecked). This assumes that the prebuilt serialized data is trustworthy and correctly formatted. -/// -/// # Match Types -/// -/// The function supports the following match types: -/// -/// - [SimpleMatchType::None]: Returns an empty matcher. -/// - [SimpleMatchType::Fanjian]: Returns a matcher using prebuilt replacement list and matcher data for Fanjian. -/// - [SimpleMatchType::WordDelete]: Builds a matcher for deleting punctuation and whitespace. 
-/// - [SimpleMatchType::TextDelete]: Builds a matcher for deleting special text characters and whitespace. -/// - [SimpleMatchType::Normalize]: Returns a matcher using prebuilt normalization data. -/// - [SimpleMatchType::PinYin]: Returns a matcher using prebuilt replacement list and matcher data for PinYin. -/// - [SimpleMatchType::PinYinChar]: Returns a matcher using prebuilt replacement list and matcher data for PinYin characters. -/// -/// This function requires the `prebuilt` feature to be enabled. -#[cfg(feature = "prebuilt")] -pub fn get_process_matcher(smt_bit: SimpleMatchType) -> Arc<(Vec<&'static str>, ProcessMatcher)> { + #[cfg(not(feature = "runtime_build"))] { - let process_matcher_cache = PROCESS_MATCHER_CACHE.read(); - - if let Some(cached_result) = process_matcher_cache.get(&smt_bit) { - return Arc::clone(cached_result); - } - } - - { - let (process_replace_list, process_matcher) = match smt_bit { - SimpleMatchType::None => { + let (process_replace_list, process_matcher) = match process_type_bit { + ProcessType::None => { let empty_patterns: Vec<&str> = Vec::new(); ( Vec::new(), ProcessMatcher::Others(AhoCorasick::new(&empty_patterns).unwrap()), ) } - SimpleMatchType::Fanjian => ( + ProcessType::Fanjian => ( FANJIAN_PROCESS_REPLACE_LIST_STR.lines().collect(), // Guaranteed not failed ProcessMatcher::Chinese(unsafe { @@ -410,27 +316,7 @@ pub fn get_process_matcher(smt_bit: SimpleMatchType) -> Arc<(Vec<&'static str>, .0 }), ), - SimpleMatchType::WordDelete => { - let mut process_dict = AHashMap::default(); - process_dict.extend(WHITE_SPACE.iter().map(|&c| (c, ""))); - process_dict.retain(|&key, &mut value| key != value); - let process_list = process_dict - .iter() - .map(|(&key, _)| key) - .collect::>(); - - ( - Vec::new(), - ProcessMatcher::Others( - AhoCorasickBuilder::new() - .kind(Some(AhoCorasickKind::DFA)) - .match_kind(AhoCorasickMatchKind::LeftmostLongest) - .build(&process_list) - .unwrap(), - ), - ) - } - SimpleMatchType::TextDelete => { + 
ProcessType::Delete => { #[cfg(feature = "dfa")] { let mut process_dict = AHashMap::default(); @@ -453,7 +339,6 @@ pub fn get_process_matcher(smt_bit: SimpleMatchType) -> Arc<(Vec<&'static str>, ), ) } - #[cfg(not(feature = "dfa"))] { ( @@ -467,7 +352,7 @@ pub fn get_process_matcher(smt_bit: SimpleMatchType) -> Arc<(Vec<&'static str>, ) } } - SimpleMatchType::Normalize => { + ProcessType::Normalize => { #[cfg(feature = "dfa")] { ( @@ -481,7 +366,6 @@ pub fn get_process_matcher(smt_bit: SimpleMatchType) -> Arc<(Vec<&'static str>, ), ) } - #[cfg(not(feature = "dfa"))] { ( @@ -495,7 +379,7 @@ pub fn get_process_matcher(smt_bit: SimpleMatchType) -> Arc<(Vec<&'static str>, ) } } - SimpleMatchType::PinYin => ( + ProcessType::PinYin => ( PINYIN_PROCESS_REPLACE_LIST_STR.lines().collect(), // Guaranteed not failed ProcessMatcher::Chinese(unsafe { @@ -505,8 +389,7 @@ pub fn get_process_matcher(smt_bit: SimpleMatchType) -> Arc<(Vec<&'static str>, .0 }), ), - - SimpleMatchType::PinYinChar => ( + ProcessType::PinYinChar => ( PINYINCHAR_PROCESS_REPLACE_LIST_STR.lines().collect(), // Guaranteed not failed ProcessMatcher::Chinese(unsafe { @@ -521,69 +404,39 @@ pub fn get_process_matcher(smt_bit: SimpleMatchType) -> Arc<(Vec<&'static str>, let uncached_result = Arc::new((process_replace_list, process_matcher)); let mut process_matcher_cache = PROCESS_MATCHER_CACHE.write(); - process_matcher_cache.insert(smt_bit, Arc::clone(&uncached_result)); + process_matcher_cache.insert(process_type_bit, Arc::clone(&uncached_result)); uncached_result } } -/// Processes the input text according to the specified single-bit [SimpleMatchType]. -/// -/// This function takes a [SimpleMatchType] bit flag and transforms the input text based on the rules -/// associated with that flag. It accepts only a single bit of `simple_match_type` and returns a Result -/// containing the transformed text or an error. 
-/// -/// # Arguments -/// -/// * `smt_bit` - A single bit of [SimpleMatchType] defining a specific text transformation rule. -/// * `text` - A string slice representing the input text to be transformed. -/// -/// # Returns -/// -/// * [`Result, &'static str>`] - The function returns a `Cow` (Copy on Write) string containing -/// the processed text if the transformation is successful or an error message if more than one bit is set. -/// -/// # Errors -/// -/// This function will return an error if the `smt_bit` contains more than one active transformation bit. -/// -/// # Detailed Processing: -/// -/// 1. Checks if more than one bit is set in `smt_bit` and returns an error if true. -/// 2. Retrieves the cached matcher and replacement list for the given bit. -/// 3. Initializes the `result` as a borrowed version of the input `text`. -/// 4. Matches the transformation type and applies the corresponding matcher: -/// a. [SimpleMatchType::None] - Do nothing. -/// b. [SimpleMatchType::Fanjian] - Apply the matcher and replace all occurrences. -/// c. [SimpleMatchType::TextDelete] | [SimpleMatchType::WordDelete] - Apply the matcher and delete all occurrences. -/// d. Other types - Apply the matcher and replace all occurrences. -/// 5. Updates the `result` accordingly and returns it within an `Ok`. 
#[inline(always)] -pub fn text_process(smt_bit: SimpleMatchType, text: &str) -> Result, &'static str> { - if smt_bit.iter().count() > 1 { - return Err("text_process function only accept one bit of simple_match_type"); +pub fn text_process( + process_type_bit: ProcessType, + text: &str, +) -> Result, &'static str> { + if process_type_bit.iter().count() > 1 { + return Err("text_process function only accept one bit of process_type"); } - let cached_result = get_process_matcher(smt_bit); + let cached_result = get_process_matcher(process_type_bit); let (process_replace_list, process_matcher) = cached_result.as_ref(); let mut result = Cow::Borrowed(text); - match (smt_bit, process_matcher) { - (SimpleMatchType::None, _) => {} - (SimpleMatchType::Fanjian, pm) => match pm.replace_all(text, process_replace_list) { + match (process_type_bit, process_matcher) { + (ProcessType::None, _) => {} + (ProcessType::Fanjian, pm) => match pm.replace_all(text, process_replace_list) { (true, Cow::Owned(pt)) => { result = Cow::Owned(pt); } (false, _) => {} (_, _) => unreachable!(), }, - (SimpleMatchType::TextDelete | SimpleMatchType::WordDelete, pm) => { - match pm.delete_all(text) { - (true, Cow::Owned(pt)) => { - result = Cow::Owned(pt); - } - (false, _) => {} - (_, _) => unreachable!(), + (ProcessType::Delete, pm) => match pm.delete_all(text) { + (true, Cow::Owned(pt)) => { + result = Cow::Owned(pt); } - } + (false, _) => {} + (_, _) => unreachable!(), + }, (_, pm) => match pm.replace_all(text, process_replace_list) { (true, Cow::Owned(pt)) => { result = Cow::Owned(pt); @@ -595,59 +448,29 @@ pub fn text_process(smt_bit: SimpleMatchType, text: &str) -> Result Ok(result) } -/// Processes the input text to apply transformations specified by the SimpleMatchType. -/// -/// This function iterates over the bits of a SimpleMatchType to apply various text transformations. 
-/// Depending on the transformation type (e.g., text replace, text delete, etc.), it processes the text -/// and stores the result in an array of [Cow] (Copy on Write) strings. -/// -/// # Arguments -/// -/// * `simple_match_type` - A [SimpleMatchType] bit flags that define specific text transformation rules. -/// * `text` - A string slice representing the input text to be transformed. -/// -/// # Returns -/// -/// * [`ArrayVec<\[Cow<'a, str>; 8\]>`] - A fixed-size vector containing the processed versions of the input text. -/// -/// # Detailed Processing: -/// -/// 1. Initialize an [ArrayVec] to hold up to 8 versions of the processed text. -/// 2. Push the original text into the vector as the first entry. -/// 3. Iterate over each bit in the `simple_match_type`: -/// a. Retrieve the cached matcher and replacement list for the current bit. -/// b. Borrow the last processed text from the vector using an unsafe operation. -/// c. Match the current transformation type and apply the corresponding matcher: -/// i. [SimpleMatchType::None] - Do nothing. -/// iii. [SimpleMatchType::TextDelete] | [SimpleMatchType::WordDelete] - Apply the matcher and delete all occurrences. -/// iv. Other types - Apply the matcher and replace all occurrences. -/// d. Update the current text entry or append new entries to the vector depending on the transformation result. -/// 4. Return the populated [ArrayVec] containing all processed text variations. 
#[inline(always)] pub fn reduce_text_process<'a>( - simple_match_type: SimpleMatchType, + process_type: ProcessType, text: &'a str, ) -> ArrayVec<[Cow<'a, str>; 8]> { let mut processed_text_list: ArrayVec<[Cow<'a, str>; 8]> = ArrayVec::new(); processed_text_list.push(Cow::Borrowed(text)); - for smt_bit in simple_match_type.iter() { - let cached_result = get_process_matcher(smt_bit); + for process_type_bit in process_type.iter() { + let cached_result = get_process_matcher(process_type_bit); let (process_replace_list, process_matcher) = cached_result.as_ref(); // Guaranteed not failed let tmp_processed_text = unsafe { processed_text_list.last_mut().unwrap_unchecked() }; - match (smt_bit, process_matcher) { - (SimpleMatchType::None, _) => {} - (SimpleMatchType::TextDelete | SimpleMatchType::WordDelete, pm) => { - match pm.delete_all(tmp_processed_text.as_ref()) { - (true, Cow::Owned(pt)) => { - processed_text_list.push(Cow::Owned(pt)); - } - (false, _) => {} - (_, _) => unreachable!(), + match (process_type_bit, process_matcher) { + (ProcessType::None, _) => {} + (ProcessType::Delete, pm) => match pm.delete_all(tmp_processed_text.as_ref()) { + (true, Cow::Owned(pt)) => { + processed_text_list.push(Cow::Owned(pt)); } - } + (false, _) => {} + (_, _) => unreachable!(), + }, (_, pm) => match pm.replace_all(tmp_processed_text.as_ref(), process_replace_list) { (true, Cow::Owned(pt)) => { processed_text_list.push(Cow::Owned(pt)); @@ -661,72 +484,32 @@ pub fn reduce_text_process<'a>( processed_text_list } -/// Processes the input text to apply transformations specified by the SimpleMatchType. -/// -/// This function iterates over the bits of a SimpleMatchType to apply various text transformations. -/// Depending on the transformation type (e.g., text replace, text delete, etc.), it processes the text -/// and stores the result in an array of [Cow] (Copy on Write) strings. 
-/// -/// # Arguments -/// -/// * `simple_match_type` - A [SimpleMatchType] bit flags that define specific text transformation rules. -/// * `text` - A string slice representing the input text to be transformed. -/// -/// # Returns -/// -/// * [`ArrayVec<\[Cow<'a, str>; 8\]>`] - A fixed-size vector containing the processed versions of the input text. -/// -/// # Detailed Processing: -/// -/// 1. Initialize an [ArrayVec] to hold up to 8 versions of the processed text. -/// 2. Push the original text into the vector as the first entry. -/// 3. Iterate over each bit in the `simple_match_type`: -/// a. Retrieve the cached matcher and replacement list for the current bit. -/// b. Borrow the last processed text from the vector using an unsafe operation. -/// c. Match the current transformation type and apply the corresponding matcher: -/// i. [SimpleMatchType::None] - Do nothing. -/// ii. [SimpleMatchType::Fanjian] | [SimpleMatchType::Normalize] - Apply the matcher and replace all occurrences. -/// iii. [SimpleMatchType::TextDelete] | [SimpleMatchType::WordDelete] - Apply the matcher and delete all occurrences. -/// iv. Other types - Apply the matcher and replace all occurrences. -/// d. Update the current text entry or append new entries to the vector depending on the transformation result. -/// 4. Return the populated [ArrayVec] containing all processed text variations. 
#[inline(always)] pub fn reduce_text_process_emit<'a>( - simple_match_type: SimpleMatchType, + process_type: ProcessType, text: &'a str, ) -> ArrayVec<[Cow<'a, str>; 8]> { let mut processed_text_list: ArrayVec<[Cow<'a, str>; 8]> = ArrayVec::new(); processed_text_list.push(Cow::Borrowed(text)); - for smt_bit in simple_match_type.iter() { - let cached_result = get_process_matcher(smt_bit); + for process_type_bit in process_type.iter() { + let cached_result = get_process_matcher(process_type_bit); let (process_replace_list, process_matcher) = cached_result.as_ref(); // Guaranteed not failed let tmp_processed_text = unsafe { processed_text_list.last_mut().unwrap_unchecked() }; - match (smt_bit, process_matcher) { - (SimpleMatchType::None, _) => {} - (SimpleMatchType::Fanjian | SimpleMatchType::Normalize, pm) => { - match pm.replace_all(tmp_processed_text.as_ref(), process_replace_list) { - (true, Cow::Owned(pt)) => { - *tmp_processed_text = Cow::Owned(pt); - } - (false, _) => {} - (_, _) => unreachable!(), - } - } - (SimpleMatchType::TextDelete | SimpleMatchType::WordDelete, pm) => { - match pm.delete_all(tmp_processed_text.as_ref()) { - (true, Cow::Owned(pt)) => { - processed_text_list.push(Cow::Owned(pt)); - } - (false, _) => {} - (_, _) => unreachable!(), + match (process_type_bit, process_matcher) { + (ProcessType::None, _) => {} + (ProcessType::Delete, pm) => match pm.delete_all(tmp_processed_text.as_ref()) { + (true, Cow::Owned(pt)) => { + processed_text_list.push(Cow::Owned(pt)); } - } + (false, _) => {} + (_, _) => unreachable!(), + }, (_, pm) => match pm.replace_all(tmp_processed_text.as_ref(), process_replace_list) { (true, Cow::Owned(pt)) => { - processed_text_list.push(Cow::Owned(pt)); + *tmp_processed_text = Cow::Owned(pt); } (false, _) => {} (_, _) => unreachable!(), @@ -737,86 +520,37 @@ pub fn reduce_text_process_emit<'a>( processed_text_list } -/// A node representing a SimpleMatchType in a tree structure. 
-/// -/// This struct is used to build a tree of [SimpleMatchType] transformations, where each node -/// corresponds to a particular bit (transformation type) and holds a list of [SimpleMatchType] -/// values, the index of the processed text, and the indices of its child nodes. -/// -/// # Fields -/// -/// * `smt_list` - An [ArrayVec] holding up to 8 [SimpleMatchType] values that this node represents. -/// * `smt_bit` - A [SimpleMatchType] value representing the bit for this node. -/// * `is_processed` - A boolean value that check the node is processed. -/// * `processed_text_index` - An index pointing to the processed text associated with this node. -/// * `children` - An [ArrayVec] holding up to 8 usize indices pointing to the child nodes in the tree. -/// -/// # Example Usage -/// -/// The [SimpleMatchTypeBitNode] is primarily used within a tree structure to efficiently manage -/// and retrieve the various text transformations specified by different [SimpleMatchType] bit flags. -/// It leverages [ArrayVec] for efficient, fixed-size storage. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct SimpleMatchTypeBitNode { - smt_list: ArrayVec<[SimpleMatchType; 8]>, - smt_bit: SimpleMatchType, +pub struct ProcessTypeBitNode { + process_type_list: ArrayVec<[ProcessType; 8]>, + process_type_bit: ProcessType, is_processed: bool, processed_text_index: usize, children: ArrayVec<[usize; 8]>, } -/// Constructs a tree of `SimpleMatchTypeBitNode`` instances based on the given list of [SimpleMatchType] transformations. -/// -/// This function creates a hierarchy of `SimpleMatchTypeBitNode`` nodes representing different transformation types -/// defined by the provided `smt_list`. Each node in the tree corresponds to a specific bit transformation and may have -/// child nodes representing subsequent transformations. 
-/// -/// # Parameters -/// -/// * `smt_list`: A slice of [SimpleMatchType] representing the match types to be processed and included in the tree. -/// -/// # Returns -/// -/// A [Vec] containing the constructed tree of `SimpleMatchTypeBitNode`'s, where each node represents a different bit -/// transformation as defined in the `smt_list`. -/// -/// # Details -/// -/// The function starts by initializing the root node of the tree with a [SimpleMatchType::None]. -/// It then iterates through each [SimpleMatchType] in the input list and constructs the tree as follows: -/// -/// 1. For each `simple_match_type`, set the starting node as the root node. -/// 2. Iterate over each bit in the `simple_match_type`. -/// - If a child node with the current bit already exists, move to that child node. -/// - If no such child node exists, create a new child node, update the current node's children, and move to the new node. -/// 3. Upon finding or creating a node for the current bit, append the `simple_match_type` to the `smt_list` of that node. -/// -/// # Safety -/// -/// This function does not use any unsafe code, ensuring type safety and memory correctness. 
-/// -pub fn build_smt_tree(smt_list: &[SimpleMatchType]) -> Vec { - let mut smt_tree = Vec::new(); - let root = SimpleMatchTypeBitNode { - smt_list: ArrayVec::new(), - smt_bit: SimpleMatchType::None, +pub fn build_process_type_tree(process_type_list: &[ProcessType]) -> Vec { + let mut process_type_tree = Vec::new(); + let root = ProcessTypeBitNode { + process_type_list: ArrayVec::new(), + process_type_bit: ProcessType::None, is_processed: true, processed_text_index: 0, children: ArrayVec::new(), }; - smt_tree.push(root); - for &simple_match_type in smt_list.iter() { + process_type_tree.push(root); + for &process_type in process_type_list.iter() { let mut current_node_index = 0; - for smt_bit in simple_match_type.iter() { - let current_node = smt_tree[current_node_index]; - if current_node.smt_bit == smt_bit { + for process_type_bit in process_type.iter() { + let current_node = process_type_tree[current_node_index]; + if current_node.process_type_bit == process_type_bit { continue; } let mut is_found = false; for child_node_index in current_node.children { - if smt_bit == smt_tree[child_node_index].smt_bit { + if process_type_bit == process_type_tree[child_node_index].process_type_bit { current_node_index = child_node_index; is_found = true; break; @@ -824,90 +558,55 @@ pub fn build_smt_tree(smt_list: &[SimpleMatchType]) -> Vec, IdSet); 16\]>`]: A collection of tuples, where each tuple -/// contains a transformed version of the text and a set of [SimpleMatchType] transformations that -/// were applied to generate that version of the text. -/// -/// # Details -/// -/// The function begins by copying the input `smt_tree` into a mutable vector. It also initializes -/// an array to store the processed text and their associated match types. For each node in the tree, -/// the function processes the input text according to the transformation rules specified in the node. -/// -/// 1. It retrieves the current node and its processed text index. -/// 2. 
It iterates through each child node of the current node. -/// - If the child node is already processed, it updates the current index with the processed text index of the current node. -/// - If the child node is not processed, it applies the transformation specified by the match type of the child node. -/// - Based on the match type, it either deletes or replaces parts of the text. -/// - The transformed text and its match types are then stored in an array. -/// - The processed text index is updated with the index of the newly transformed text. -/// 3. The child node is marked as processed, and its processed text index is updated. -/// 4. The array of transformed texts and their match types is returned at the end. -/// -/// # Safety -/// -/// This function uses `unsafe` blocks to access elements in the vectors and arrays directly, -/// assuming that all necessary bounds checks and precautions are performed implicitly. Care should be -/// taken when modifying this function to avoid introducing undefined behavior. 
#[inline(always)] pub fn reduce_text_process_with_tree<'a>( - smt_tree: &[SimpleMatchTypeBitNode], + process_type_tree: &[ProcessTypeBitNode], text: &'a str, ) -> ArrayVec<[(Cow<'a, str>, IdSet); 16]> { - let mut smt_tree_copied: Vec = smt_tree.to_vec(); + let mut process_type_tree_copied: Vec = process_type_tree.to_vec(); - let mut processed_text_smt_set: ArrayVec<[(Cow<'a, str>, IdSet); 16]> = ArrayVec::new(); - processed_text_smt_set.push(( + let mut processed_text_process_type_set: ArrayVec<[(Cow<'a, str>, IdSet); 16]> = + ArrayVec::new(); + processed_text_process_type_set.push(( Cow::Borrowed(text), - IdSet::from_iter([SimpleMatchType::None.bits() as usize]), + IdSet::from_iter([ProcessType::None.bits() as usize]), )); - for (current_node_index, current_node) in smt_tree.iter().enumerate() { - let (left_tree, right_tree) = - unsafe { smt_tree_copied.split_at_mut_unchecked(current_node_index.unchecked_add(1)) }; + for (current_node_index, current_node) in process_type_tree.iter().enumerate() { + let (left_tree, right_tree) = unsafe { + process_type_tree_copied.split_at_mut_unchecked(current_node_index.unchecked_add(1)) + }; let current_copied_node = unsafe { left_tree.get_unchecked(current_node_index) }; let mut current_index = current_copied_node.processed_text_index; - let current_text_ptr = unsafe { processed_text_smt_set.get_unchecked(current_index) } - .0 - .as_ref() as *const str; + let current_text_ptr = + unsafe { processed_text_process_type_set.get_unchecked(current_index) } + .0 + .as_ref() as *const str; for child_node_index in current_node.children { let child_node = unsafe { @@ -921,22 +620,26 @@ pub fn reduce_text_process_with_tree<'a>( if child_node.is_processed { current_index = current_copied_node.processed_text_index; } else { - let cached_result = get_process_matcher(child_node.smt_bit); + let cached_result = get_process_matcher(child_node.process_type_bit); let (process_replace_list, process_matcher) = cached_result.as_ref(); - match 
child_node.smt_bit { - SimpleMatchType::None => {} - SimpleMatchType::TextDelete | SimpleMatchType::WordDelete => { + match child_node.process_type_bit { + ProcessType::None => {} + ProcessType::Delete => { match process_matcher.delete_all(unsafe { &*current_text_ptr }) { (true, Cow::Owned(pt)) => { - processed_text_smt_set.push(( + processed_text_process_type_set.push(( Cow::Owned(pt), IdSet::from_iter( - child_node.smt_list.iter().map(|smt| smt.bits() as usize), + child_node + .process_type_list + .iter() + .map(|smt| smt.bits() as usize), ), )); - current_index = - unsafe { processed_text_smt_set.len().unchecked_sub(1) }; + current_index = unsafe { + processed_text_process_type_set.len().unchecked_sub(1) + }; } (false, _) => { current_index = current_copied_node.processed_text_index; @@ -948,9 +651,9 @@ pub fn reduce_text_process_with_tree<'a>( .replace_all(unsafe { &*current_text_ptr }, process_replace_list) { (true, Cow::Owned(pt)) => { - processed_text_smt_set.push((Cow::Owned(pt), IdSet::new())); + processed_text_process_type_set.push((Cow::Owned(pt), IdSet::new())); current_index = - unsafe { processed_text_smt_set.len().unchecked_sub(1) }; + unsafe { processed_text_process_type_set.len().unchecked_sub(1) }; } (false, _) => { current_index = current_copied_node.processed_text_index; @@ -962,112 +665,94 @@ pub fn reduce_text_process_with_tree<'a>( } child_node.processed_text_index = current_index; - let processed_text_smt_tuple = - unsafe { processed_text_smt_set.get_unchecked_mut(current_index) }; - processed_text_smt_tuple - .1 - .extend(child_node.smt_list.iter().map(|smt| smt.bits() as usize)); + let processed_text_process_type_tuple = + unsafe { processed_text_process_type_set.get_unchecked_mut(current_index) }; + processed_text_process_type_tuple.1.extend( + child_node + .process_type_list + .iter() + .map(|smt| smt.bits() as usize), + ); } } - processed_text_smt_set + processed_text_process_type_set } -/// Processes the given text through a list of 
specified [SimpleMatchType] transformations. -/// -/// This function builds a tree structure from the list of [SimpleMatchType] transformations -/// and uses it to apply text transformations. The resulting texts and their associated -/// match types are collected into an [ArrayVec]. -/// -/// # Parameters -/// -/// * `smt_list`: A slice of [SimpleMatchType] enums representing the match types and their associated -/// transformations to be applied to the text. -/// * `text`: A string slice holding the initial text to be transformed. -/// -/// # Returns -/// -/// [`ArrayVec<\[(Cow<'a, str>, IdSet); 16\]>`]: A collection of tuples, where each tuple -/// contains a transformed version of the text and a set of [SimpleMatchType] transformations that -/// were applied to generate that version of the text. -/// -/// # Safety -/// -/// This function employs `unsafe` code to efficiently access and manipulate internal data structures. -/// Care should be taken when modifying this function to avoid introducing undefined behavior. 
#[inline(always)] pub fn reduce_text_process_with_list<'a>( - smt_list: &[SimpleMatchType], + process_type_list: &[ProcessType], text: &'a str, ) -> ArrayVec<[(Cow<'a, str>, IdSet); 16]> { - let mut smt_tree = Vec::with_capacity(8); - let mut root = SimpleMatchTypeBitNode { - smt_list: ArrayVec::new(), - smt_bit: SimpleMatchType::None, + let mut process_type_tree = Vec::with_capacity(8); + let mut root = ProcessTypeBitNode { + process_type_list: ArrayVec::new(), + process_type_bit: ProcessType::None, is_processed: true, processed_text_index: 0, children: ArrayVec::new(), }; - root.smt_list.push(SimpleMatchType::None); - smt_tree.push(root); + root.process_type_list.push(ProcessType::None); + process_type_tree.push(root); - let mut processed_text_smt_set: ArrayVec<[(Cow<'a, str>, IdSet); 16]> = ArrayVec::new(); - processed_text_smt_set.push(( + let mut processed_text_process_type_set: ArrayVec<[(Cow<'a, str>, IdSet); 16]> = + ArrayVec::new(); + processed_text_process_type_set.push(( Cow::Borrowed(text), - IdSet::from_iter([SimpleMatchType::None.bits() as usize]), + IdSet::from_iter([ProcessType::None.bits() as usize]), )); - for &simple_match_type in smt_list.iter() { + for &process_type in process_type_list.iter() { let mut current_text = text; let mut current_index = 0; let mut current_node_index = 0; - for smt_bit in simple_match_type.iter() { - let current_node = unsafe { smt_tree.get_unchecked(current_node_index) }; - if current_node.smt_bit == smt_bit { + for process_type_bit in process_type.iter() { + let current_node = unsafe { process_type_tree.get_unchecked(current_node_index) }; + if current_node.process_type_bit == process_type_bit { continue; } let mut is_found = false; for child_node_index in current_node.children { - if smt_bit == unsafe { smt_tree.get_unchecked(child_node_index) }.smt_bit { + if process_type_bit + == unsafe { process_type_tree.get_unchecked(child_node_index) }.process_type_bit + { current_node_index = child_node_index; is_found = 
true; break; } } - let current_node = unsafe { smt_tree.get_unchecked_mut(current_node_index) }; + let current_node = unsafe { process_type_tree.get_unchecked_mut(current_node_index) }; if !is_found { - let cached_result = get_process_matcher(smt_bit); + let cached_result = get_process_matcher(process_type_bit); let (process_replace_list, process_matcher) = cached_result.as_ref(); - match smt_bit { - SimpleMatchType::None => {} - SimpleMatchType::TextDelete | SimpleMatchType::WordDelete => { - match process_matcher.delete_all(current_text) { - (true, Cow::Owned(pt)) => { - processed_text_smt_set.push((Cow::Owned(pt), IdSet::new())); - current_index = processed_text_smt_set.len() - 1; - - let processed_text_smt_tuple = unsafe { - processed_text_smt_set - .get_unchecked_mut(current_node.processed_text_index) - }; - processed_text_smt_tuple - .1 - .insert(simple_match_type.bits() as usize); - } - (false, _) => { - current_index = current_node.processed_text_index; - } - (_, _) => unreachable!(), + match process_type_bit { + ProcessType::None => {} + ProcessType::Delete => match process_matcher.delete_all(current_text) { + (true, Cow::Owned(pt)) => { + processed_text_process_type_set.push((Cow::Owned(pt), IdSet::new())); + current_index = processed_text_process_type_set.len() - 1; + + let processed_text_process_type_tuple = unsafe { + processed_text_process_type_set + .get_unchecked_mut(current_node.processed_text_index) + }; + processed_text_process_type_tuple + .1 + .insert(process_type.bits() as usize); } - } + (false, _) => { + current_index = current_node.processed_text_index; + } + (_, _) => unreachable!(), + }, _ => match process_matcher.replace_all(current_text, process_replace_list) { (true, Cow::Owned(pt)) => { - processed_text_smt_set.push((Cow::Owned(pt), IdSet::new())); - current_index = processed_text_smt_set.len() - 1; + processed_text_process_type_set.push((Cow::Owned(pt), IdSet::new())); + current_index = processed_text_process_type_set.len() - 1; } 
(false, _) => { current_index = current_node.processed_text_index; @@ -1076,103 +761,36 @@ pub fn reduce_text_process_with_list<'a>( }, } - let mut child = SimpleMatchTypeBitNode { - smt_list: ArrayVec::new(), - smt_bit, + let mut child = ProcessTypeBitNode { + process_type_list: ArrayVec::new(), + process_type_bit, is_processed: true, processed_text_index: current_index, children: ArrayVec::new(), }; - child.smt_list.push(simple_match_type); - smt_tree.push(child); + child.process_type_list.push(process_type); + process_type_tree.push(child); - let new_node_index = smt_tree.len() - 1; - let current_node = unsafe { smt_tree.get_unchecked_mut(current_node_index) }; + let new_node_index = process_type_tree.len() - 1; + let current_node = + unsafe { process_type_tree.get_unchecked_mut(current_node_index) }; current_node.children.push(new_node_index); current_node_index = new_node_index; } else { current_index = current_node.processed_text_index; - current_node.smt_list.push(simple_match_type); + current_node.process_type_list.push(process_type); } - let processed_text_smt_tuple = - unsafe { processed_text_smt_set.get_unchecked_mut(current_index) }; - processed_text_smt_tuple + let processed_text_process_type_tuple = + unsafe { processed_text_process_type_set.get_unchecked_mut(current_index) }; + processed_text_process_type_tuple .1 - .insert(simple_match_type.bits() as usize); - current_text = unsafe { processed_text_smt_set.get_unchecked(current_index) } + .insert(process_type.bits() as usize); + current_text = unsafe { processed_text_process_type_set.get_unchecked(current_index) } .0 .as_ref(); } } - processed_text_smt_set -} - -#[cfg(test)] -mod test_text_process { - use super::*; - - #[test] - fn test_text_process() { - let text = text_process(SimpleMatchType::Fanjian, "躶軆"); - println!("{:?}", text); - } - - #[test] - fn test_reduce_text_process() { - let text = reduce_text_process(SimpleMatchType::FanjianDeleteNormalize, "~ᗩ~躶~𝚩~軆~Ⲉ~"); - println!("{:?}", text); 
- } - - #[test] - fn test_reduce_text_process_emit() { - let text = - reduce_text_process_emit(SimpleMatchType::FanjianDeleteNormalize, "~ᗩ~躶~𝚩~軆~Ⲉ~"); - println!("{:?}", text); - } - - #[test] - fn test_build_smt_tree() { - let smt_list = vec![ - SimpleMatchType::Fanjian, - SimpleMatchType::DeleteNormalize - SimpleMatchType::WordDelete, - SimpleMatchType::FanjianDeleteNormalize - SimpleMatchType::WordDelete, - SimpleMatchType::TextDelete, - SimpleMatchType::Normalize, - ]; - let smt_tree = build_smt_tree(&smt_list); - println!("{:?}", smt_tree); - } - - #[test] - fn test_reduce_text_process_with_tree() { - let smt_list = vec![ - SimpleMatchType::Fanjian, - SimpleMatchType::DeleteNormalize - SimpleMatchType::WordDelete, - SimpleMatchType::FanjianDeleteNormalize - SimpleMatchType::WordDelete, - SimpleMatchType::TextDelete, - SimpleMatchType::Normalize, - ]; - let smt_tree = build_smt_tree(&smt_list); - let text = "test爽-︻"; - - let processed_text_smt_set = reduce_text_process_with_tree(&smt_tree, text); - println!("{processed_text_smt_set:?}"); - } - - #[test] - fn test_reduce_text_process_with_list() { - let smt_list = vec![ - SimpleMatchType::Fanjian, - SimpleMatchType::DeleteNormalize - SimpleMatchType::WordDelete, - SimpleMatchType::FanjianDeleteNormalize - SimpleMatchType::WordDelete, - SimpleMatchType::TextDelete, - SimpleMatchType::Normalize, - ]; - let text = "test爽-︻"; - - let processed_text_smt_set = reduce_text_process_with_list(&smt_list, text); - println!("{processed_text_smt_set:?}"); - } + processed_text_process_type_set } diff --git a/matcher_rs/src/regex_matcher.rs b/matcher_rs/src/regex_matcher.rs index 3406ac3..1f254cf 100644 --- a/matcher_rs/src/regex_matcher.rs +++ b/matcher_rs/src/regex_matcher.rs @@ -1,27 +1,20 @@ use std::borrow::Cow; use fancy_regex::{escape, Regex}; +use id_set::IdSet; +use nohash_hasher::IntSet; use regex::RegexSet; use sonic_rs::{Deserialize, Serialize}; -use crate::matcher::{MatchResultTrait, TextMatcherTrait}; 
#[cfg(feature = "serde")] use crate::util::serde::{serde_regex, serde_regex_list, serde_regex_set}; +use crate::{ + matcher::{MatchResultTrait, TextMatcherTrait}, + process::process_matcher::{ + build_process_type_tree, reduce_text_process_with_tree, ProcessType, ProcessTypeBitNode, + }, +}; -/// Enumeration representing different types of regex match algorithms used in text matching. -/// -/// The [RegexMatchType] enum provides a way to distinguish between various match algorithms -/// that can be applied during regex pattern matching. Each variant defines a specific matching -/// strategy, allowing for flexible and tailored text matching operations. -/// -/// # Variants -/// -/// * [SimilarChar](RegexMatchType::Regex) - Represents a matching strategy that identifies matches based on character similarity. This type of matching is useful for finding text that is similar in character composition but not necessarily identical. -/// * [Acrostic](RegexMatchType::Acrostic) - Represents a matching strategy that identifies acrostic matches, where the matching portion of the text forms an acrostic pattern. This type of matching is particularly useful for specific types of literary analysis or word games. -/// * [Regex](RegexMatchType::Regex) - Represents a standard regex matching strategy, utilizing regular expressions to identify precise patterns within the text. This type of matching is widely used for its flexibility and power in text processing. -/// -/// This enum is used within various text matching applications to specify the match type to be applied, -/// enabling the application to execute the appropriate algorithm for the desired matching criteria. #[derive(Serialize, Deserialize, Clone, Copy, Debug, PartialEq)] #[serde(rename_all = "snake_case")] pub enum RegexMatchType { @@ -30,71 +23,15 @@ pub enum RegexMatchType { Regex, } -/// Represents a table containing regex patterns and their associated metadata for text matching operations. 
-/// -/// The [RegexTable] struct is designed to encapsulate a collection of regex patterns along with relevant -/// identifiers and match type information. This structure is utilized in regex-based text matching processes -/// to organize and manage various sets of regex patterns efficiently. -/// -/// # Fields -/// -/// * `table_id` - A unique identifier for the regex table. This field is used to distinguish between different regex tables. -/// * `match_id` - An identifier that corresponds to the specific match operation associated with this regex table. This helps in tracking and categorizing match results. -/// * `regex_match_type` - The type of regex match algorithm being used, represented by the [RegexMatchType] enum. This field defines the matching strategy applied by the regex patterns in the table. -/// * `word_list` - A reference to a vector of string slices (&'a [`Vec<&'a str>`]) that represents the list of words or patterns that the regex in this table aims to match against. This collection allows the regex operations to process and match text efficiently. -/// -/// # Example -/// -/// ```rust -/// use matcher_rs::{RegexTable, RegexMatchType}; -/// -/// let word_list = vec!["example", "test", "sample"]; -/// let regex_table = RegexTable { -/// table_id: 1, -/// match_id: 42, -/// regex_match_type: RegexMatchType::Regex, -/// word_list: &word_list, -/// }; -/// -/// println!("{:?}", regex_table); -/// ``` -/// -/// The example above demonstrates how to create a [RegexTable] instance, populate it with a list of words, -/// and print the structure for debugging or logging purposes. -/// -/// This struct is primarily used in advanced text matching applications, where the organization and efficient -/// management of regex patterns are crucial for the performance and accuracy of the matching process. 
#[derive(Debug, Clone)] pub struct RegexTable<'a> { pub table_id: u32, pub match_id: u32, + pub process_type: ProcessType, pub regex_match_type: RegexMatchType, pub word_list: &'a Vec<&'a str>, } -/// Enum representing different types of regex pattern tables used in the [RegexMatcher]. -/// -/// The `RegexType` enum is utilized within `RegexPatternTable` to define the structure and behavior of the regex -/// patterns stored in each table. It supports two types of regex patterns: `Standard` and `List`. -/// -/// # Variants -/// -/// * `Standard` - Represents a table that holds a single compiled regex pattern. -/// - `regex` ([Regex]): The compiled regex pattern used for matching text. -/// -/// * `List` - Represents a table that holds a list of compiled regex patterns and their corresponding words. -/// - `regex_list` ([`Vec`]): A list of compiled regex patterns used for matching text. -/// - `word_list` ([`Vec`]): A list of words corresponding to each regex pattern in `regex_list`. -/// -/// * `Set` - Represents a table that holds a set of compiled regex patterns. -/// - `regex_set` ([RegexSet]): A regex set of compiled regex patterns used for matching text. -/// - `word_list` ([`Vec`]): A list of words corresponding to each regex pattern in `regex_list`. -/// -/// # Usage -/// -/// This enum enables the [RegexMatcher] to distinguish between tables that use a singular regex pattern and those -/// that use multiple patterns stored in a list. The associated data for each variant ensures that the [RegexMatcher] -/// can accurately process match operations and return results based on the specific table type. #[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] enum RegexType { @@ -114,160 +51,56 @@ enum RegexType { }, } -/// A structure representing a table of regex patterns used for text matching. 
-/// -/// The `RegexPatternTable` struct is designed to hold compiled regex patterns and associated metadata, -/// allowing the [RegexMatcher] to efficiently organize and manage different sets of patterns for matching -/// text. Each `RegexPatternTable` instance corresponds to a specific regex table and contains details -/// such as a unique identifier, match identifier, and the type of regex patterns stored. -/// -/// # Fields -/// -/// * `table_id` - A unique identifier for the regex pattern table. This identifier distinguishes the table from other regex tables. -/// * `match_id` - A unique identifier for the match, which corresponds to the `match_id` of the [RegexTable] that contains the regex pattern. -/// * `regex_type` - The type of regex pattern table, represented by the `RegexType` enum. This field determines the structure and behavior of the regex patterns stored in the table. -/// -/// The `RegexPatternTable` struct is utilized internally by the [RegexMatcher] to categorize and execute regex-based text matching operations. #[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] struct RegexPatternTable { table_id: u32, match_id: u32, + process_type: ProcessType, regex_type: RegexType, } -/// Represents a result from a regex matching operation, containing metadata about the match. -/// -/// The `RegexResult` structure is designed to encapsulate information about a particular regex match, -/// including the matched word or pattern, the table identifier from which the match originated, and -/// the match identifier associated with the match. -/// -/// # Fields -/// -/// * `match_id` - A [u32] that serves as an identifier for the match. This identifier -/// is used to differentiate between match results originating from different regex tables, allowing -/// for more detailed and organized match results. -/// -/// * `table_id` - A [u32] representing the unique identifier of the regex table that produced the match result. 
-/// This helps in distinguishing which regex table contributed to the result, facilitating organized processing -/// and categorization of matches. -/// -/// * `word` - A [Cow<'a, str>] that holds the matched word or pattern. This field can either be a -/// borrowed string slice or an owned [String], offering flexibility in how the match result is stored. -/// -/// This structure is primarily utilized in text matching applications where regex patterns are used -/// to identify specific words or patterns within the target text, and the results need to be tracked -/// and processed accordingly. #[derive(Debug, Clone)] pub struct RegexResult<'a> { pub match_id: u32, pub table_id: u32, + pub word_id: u32, pub word: Cow<'a, str>, } impl MatchResultTrait<'_> for RegexResult<'_> { + fn match_id(&self) -> u32 { + self.match_id + } fn table_id(&self) -> u32 { self.table_id } + fn word_id(&self) -> u32 { + self.word_id + } fn word(&self) -> &str { - self.word.as_ref() + &self.word + } + fn similarity(&self) -> f64 { + 1.0 } } -/// A structure responsible for managing and handling regex pattern tables for text matching. -/// -/// The [RegexMatcher] stores a list of `RegexPatternTable` structures, each of which contains -/// regex patterns and associated metadata used for efficient text matching operations. The struct -/// provides methods to create a new instance from a list of [RegexTable] structures, as well as -/// to check for matches and process the text to produce a list of match results. -/// -/// # Fields -/// -/// * `regex_pattern_table_list` - A vector of `RegexPatternTable` structures that hold regex patterns -/// and associated metadata for text matching. -/// -/// # Usage -/// -/// This structure is used within the [RegexMatcher] to efficiently manage multiple regex patterns -/// and their corresponding match tables. It enables the [RegexMatcher] to perform text matching -/// operations and return results based on the provided regex tables. 
-/// -/// # Example -/// -/// ``` -/// use matcher_rs::{RegexMatcher, RegexTable, RegexMatchType, TextMatcherTrait}; -/// -/// let regex_table = RegexTable { -/// table_id: 1, -/// match_id: 1, -/// regex_match_type: RegexMatchType::SimilarChar, -/// word_list: &vec!["1,一", "2,二"], -/// }; -/// -/// let regex_matcher = RegexMatcher::new(&vec![regex_table]); -/// assert!(regex_matcher.is_match("12")); -/// assert!(regex_matcher.is_match("一2")); -/// assert!(regex_matcher.is_match("1二")); -/// ``` #[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct RegexMatcher { + process_type_tree: Vec, regex_pattern_table_list: Vec, } impl RegexMatcher { - /// Creates a new [RegexMatcher] instance from a list of [RegexTable]. - /// - /// This constructor function initializes a [RegexMatcher] with the provided list of [RegexTable] instances. - /// Each [RegexTable] contains regex patterns and other metadata. The function processes these tables and - /// compiles the regex patterns into `RegexPatternTable` structures, which are then stored in the `regex_pattern_table_list`. - /// - /// # Arguments - /// - /// * `regex_table_list` - A slice of [RegexTable] instances to be used for initializing the [RegexMatcher]. - /// - /// # Returns - /// - /// * [RegexMatcher] - A new instance of [RegexMatcher] containing compiled regex patterns and associated metadata. - /// - /// # Processing - /// - /// The function handles different `RegexMatchType` variants within the [RegexTable]: - /// - /// * [SimilarChar](RegexMatchType::SimilarChar) - Constructs a regex pattern where each character in the word list is separated by an optional dot (`.?`). - /// This pattern is then compiled into a single regex and stored in a `RegexPatternTable` with `RegexType::Standard`. - /// - /// * [Acrostic](RegexMatchType::Acrostic) - Creates regex patterns that match words starting from the beginning or after any punctuation or whitespace. 
- /// These patterns are compiled into individual regexes and stored in a `RegexPatternTable` with either `RegexType::List` - /// or `RegexType::Set`, depending on whether a `RegexSet` can be successfully created. - /// - /// * [Regex](RegexMatchType::Regex) - Compiles each word in the word list into individual regexes and stores them in a `RegexPatternTable` with either - /// `RegexType::List` or `RegexType::Set`, similar to the `Acrostic` type. - /// - /// Any invalid regex patterns encountered during the creation process are ignored, and a warning message is printed to the console. - /// - /// # Examples - /// - /// ``` - /// use matcher_rs::{RegexMatcher, RegexTable, RegexMatchType, TextMatcherTrait}; - /// - /// let regex_table = RegexTable { - /// table_id: 1, - /// match_id: 1, - /// regex_match_type: RegexMatchType::SimilarChar, - /// word_list: &vec!["1,一", "2,二"], - /// }; - /// - /// let regex_matcher = RegexMatcher::new(&vec![regex_table]); - /// - /// assert!(regex_matcher.is_match("12")); - /// assert!(regex_matcher.is_match("一2")); - /// assert!(regex_matcher.is_match("1二")); - /// ``` pub fn new(regex_table_list: &[RegexTable]) -> RegexMatcher { + let mut process_type_list = Vec::with_capacity(regex_table_list.len()); let mut regex_pattern_table_list = Vec::with_capacity(regex_table_list.len()); for regex_table in regex_table_list { + process_type_list.push(regex_table.process_type); + let size = regex_table.word_list.len(); match regex_table.regex_match_type { @@ -282,6 +115,7 @@ impl RegexMatcher { regex_pattern_table_list.push(RegexPatternTable { table_id: regex_table.table_id, match_id: regex_table.match_id, + process_type: regex_table.process_type, regex_type: RegexType::Standard { regex: Regex::new(&pattern).unwrap(), }, @@ -323,6 +157,7 @@ impl RegexMatcher { regex_pattern_table_list.push(RegexPatternTable { table_id: regex_table.table_id, match_id: regex_table.match_id, + process_type: regex_table.process_type, regex_type, }); } @@ -356,183 
+191,134 @@ impl RegexMatcher { regex_pattern_table_list.push(RegexPatternTable { table_id: regex_table.table_id, match_id: regex_table.match_id, + process_type: regex_table.process_type, regex_type, }); } }; } + let process_type_tree = build_process_type_tree(&process_type_list); + RegexMatcher { + process_type_tree, regex_pattern_table_list, } } } impl<'a> TextMatcherTrait<'a, RegexResult<'a>> for RegexMatcher { - /// Determines if the provided text matches any of the regex patterns stored in the match tables. - /// - /// This function iterates through all the `RegexPatternTable` instances in `regex_pattern_table_list` - /// and checks if the provided text matches any of the regex patterns based on the `RegexType` of each table. - /// - /// # Arguments - /// - /// * `self` - A reference to the [RegexMatcher] instance. - /// * `text` - A string slice (`&str`) containing the text to be checked for matches against the regex patterns. - /// - /// # Returns - /// - /// * `bool` - Returns `true` if the text matches any of the regex patterns, otherwise returns `false`. - /// - /// # Match Checking - /// - /// The function handles different `RegexType` variants within the `RegexPatternTable`: - /// - /// * `Standard` - Checks if the text matches the single compiled regex pattern stored in the table. - /// If a match is found, the function returns `true`. - /// - /// * `List` - Iterates through the list of compiled regex patterns and checks if the text matches - /// any of them. If a match is found, the function returns `true`. - /// - /// * `Set` - Checks if the text matches the single compiled regex pattern stored in the table. - /// If a match is found, the function returns `true`. - /// - /// If no matches are found after checking all regex patterns in all tables, the function returns `false`. 
- /// - /// # Examples - /// - /// ``` - /// use matcher_rs::{RegexMatcher, RegexTable, RegexMatchType, TextMatcherTrait}; - /// - /// let regex_table = RegexTable { - /// table_id: 1, - /// match_id: 1, - /// regex_match_type: RegexMatchType::SimilarChar, - /// word_list: &vec!["1,一", "2,二"], - /// }; - /// - /// let regex_matcher = RegexMatcher::new(&vec![regex_table]); - /// - /// assert!(regex_matcher.is_match("12")); - /// assert!(regex_matcher.is_match("一2")); - /// assert!(regex_matcher.is_match("1二")); - /// ``` - fn is_match(&self, text: &str) -> bool { - for regex_table in &self.regex_pattern_table_list { - match ®ex_table.regex_type { - RegexType::Standard { regex } => { - if regex.is_match(text).unwrap() { - return true; - } - } - RegexType::List { regex_list, .. } => { - if regex_list.iter().any(|regex| regex.is_match(text).unwrap()) { - return true; - } + fn is_match(&'a self, text: &'a str) -> bool { + let processed_text_process_type_set = + reduce_text_process_with_tree(&self.process_type_tree, text); + + self._is_match_with_processed_text_process_type_set(&processed_text_process_type_set) + } + + fn _is_match_with_processed_text_process_type_set( + &'a self, + processed_text_process_type_set: &[(Cow<'a, str>, IdSet)], + ) -> bool { + for (processed_text, process_type_set) in processed_text_process_type_set { + for regex_pattern_table in &self.regex_pattern_table_list { + if !process_type_set.contains(regex_pattern_table.process_type.bits() as usize) { + continue; } - RegexType::Set { regex_set, .. } => { - if regex_set.is_match(text) { - return true; - } + + let is_match = match ®ex_pattern_table.regex_type { + RegexType::Standard { regex } => regex.is_match(processed_text).unwrap(), + RegexType::List { regex_list, .. } => regex_list + .iter() + .any(|regex| regex.is_match(processed_text).unwrap()), + RegexType::Set { regex_set, .. 
} => regex_set.is_match(processed_text), + }; + + if is_match { + return true; } } } - false } - /// Processes the provided text and returns a list of regex match results. - /// - /// This function iterates through all the `RegexPatternTable` instances in `regex_pattern_table_list` - /// and searches for matches within the provided text based on the `RegexType` of each table. - /// - /// # Arguments - /// - /// * `&'a self` - A reference to the [RegexMatcher] instance with a defined lifetime `'a`. - /// * `text` - A string slice (`&str`) containing the text to be checked for regex matches. - /// - /// # Returns - /// - /// * [`Vec>`] - A vector containing the results of regex matches. Each result includes - /// the matched word, table ID, and match ID. - /// - /// # Match Processing - /// - /// The function handles different `RegexType` variants within the `RegexPatternTable`: - /// - /// * `Standard` - Iterates through the captures of the regex for the given text. For each capture - /// group (excluding the entire match), it collects the matched substrings, concatenates them, and - /// stores the result. - /// - /// * `List` - Iterates through the list of compiled regex patterns. If the text matches any regex, - /// it pushes the associated word from `word_list` and the table/match IDs to the result list. - /// - /// * `Set` - Retrieves the patterns from the regex set. For each matched pattern index, it pushes - /// the corresponding pattern and the table/match IDs to the result list. 
- /// - /// # Examples - /// - /// ``` - /// use matcher_rs::{RegexMatcher, RegexTable, RegexMatchType, TextMatcherTrait}; - /// use std::borrow::Cow; - /// - /// let regex_table = RegexTable { - /// table_id: 1, - /// match_id: 1, - /// regex_match_type: RegexMatchType::SimilarChar, - /// word_list: &vec!["1,一", "2,二"], - /// }; - /// - /// let regex_matcher = RegexMatcher::new(&vec![regex_table]); - /// - /// let results = regex_matcher.process("12"); - /// for result in results { - /// println!("Matched word: {}", result.word); - /// println!("Table ID: {}", result.table_id); - /// println!("Match ID: {}", result.match_id); - /// } - /// ``` - fn process(&'a self, text: &str) -> Vec> { + fn process(&'a self, text: &'a str) -> Vec> { + let processed_text_process_type_set = + reduce_text_process_with_tree(&self.process_type_tree, text); + + self._process_with_processed_text_process_type_set(&processed_text_process_type_set) + } + + fn _process_with_processed_text_process_type_set( + &'a self, + processed_text_process_type_set: &[(Cow<'a, str>, IdSet)], + ) -> Vec> { let mut result_list = Vec::new(); + let mut table_id_index_set = IntSet::default(); - for regex_table in &self.regex_pattern_table_list { - match ®ex_table.regex_type { - RegexType::Standard { regex } => { - result_list.extend(regex.captures_iter(text).map(|caps| { - RegexResult { - match_id: regex_table.match_id, - table_id: regex_table.table_id, - word: Cow::Owned( - caps.unwrap() - .iter() - .skip(1) - .filter_map(|m| m.map(|match_char| match_char.as_str())) - .collect::(), - ), - } - })) + for (processed_text, process_type_set) in processed_text_process_type_set { + for regex_pattern_table in &self.regex_pattern_table_list { + if !process_type_set.contains(regex_pattern_table.process_type.bits() as usize) { + continue; } - RegexType::List { - regex_list, - word_list, - } => result_list.extend(regex_list.iter().enumerate().filter_map( - |(index, regex)| { - 
regex.is_match(text).unwrap().then_some(RegexResult { - match_id: regex_table.match_id, - table_id: regex_table.table_id, - word: Cow::Borrowed(&word_list[index]), - }) - }, - )), - RegexType::Set { - regex_set, - word_list, - } => result_list.extend(regex_set.matches(text).into_iter().map(|index| { - RegexResult { - match_id: regex_table.match_id, - table_id: regex_table.table_id, - word: Cow::Borrowed(&word_list[index]), + match ®ex_pattern_table.regex_type { + RegexType::Standard { regex } => { + if table_id_index_set.insert(regex_pattern_table.table_id as u64) { + for caps in regex.captures_iter(processed_text).flatten() { + result_list.push(RegexResult { + match_id: regex_pattern_table.match_id, + table_id: regex_pattern_table.table_id, + word_id: 0, + word: Cow::Owned( + caps.iter() + .skip(1) + .filter_map(|m| m.map(|match_char| match_char.as_str())) + .collect::(), + ), + }); + } + } } - })), + RegexType::List { + regex_list, + word_list, + } => { + for (index, regex) in regex_list.iter().enumerate() { + let table_id_index = + ((regex_pattern_table.table_id as u64) << 32) | (index as u64); + + if table_id_index_set.insert(table_id_index) { + if let Ok(is_match) = regex.is_match(processed_text) { + if is_match { + result_list.push(RegexResult { + match_id: regex_pattern_table.match_id, + table_id: regex_pattern_table.table_id, + word_id: index as u32, + word: Cow::Borrowed(&word_list[index]), + }); + } + } + } + } + } + RegexType::Set { + regex_set, + word_list, + } => { + for index in regex_set.matches(processed_text) { + let table_id_index = + ((regex_pattern_table.table_id as u64) << 32) | (index as u64); + + if table_id_index_set.insert(table_id_index) { + result_list.push(RegexResult { + match_id: regex_pattern_table.match_id, + table_id: regex_pattern_table.table_id, + word_id: index as u32, + word: Cow::Borrowed(&word_list[index]), + }); + } + } + } + } } } diff --git a/matcher_rs/src/sim_matcher.rs b/matcher_rs/src/sim_matcher.rs index 
fd6e215..494d9df 100644 --- a/matcher_rs/src/sim_matcher.rs +++ b/matcher_rs/src/sim_matcher.rs @@ -1,480 +1,191 @@ use std::borrow::Cow; -use fancy_regex::Regex; +use id_set::IdSet; +use nohash_hasher::IntSet; use rapidfuzz::distance; use sonic_rs::{Deserialize, Serialize}; -use crate::matcher::{MatchResultTrait, TextMatcherTrait}; -#[cfg(feature = "serde")] -use crate::util::serde::serde_regex; +use crate::{ + matcher::{MatchResultTrait, TextMatcherTrait}, + process::process_matcher::{ + build_process_type_tree, reduce_text_process_with_tree, ProcessType, ProcessTypeBitNode, + }, +}; -/// An enumeration representing different types of similarity matching algorithms. -/// -/// The [SimMatchType] enum defines several types of algorithms that can be used -/// for similarity matching operations. Each variant corresponds to a specific -/// algorithm, providing flexibility in choosing the appropriate method based on -/// the use case. -/// -/// # Variants -/// -/// - [Levenshtein](SimMatchType::Levenshtein): Represents the Levenshtein distance algorithm, which calculates -/// the number of single-character edits (insertions, deletions, or substitutions) -/// required to change one word into another. -/// - [DamerauLevenshtein](SimMatchType::DamerauLevenshtein): Represents the Damerau-Levenshtein distance algorithm, -/// an extension of Levenshtein that also considers transpositions (swapping of -/// two adjacent characters) as a single edit. -/// - [Indel](SimMatchType::Indel): Represents the Insertion-Deletion distance algorithm, focusing on -/// insertions and deletions as the only operations. -/// - [Jaro](SimMatchType::Jaro): Represents the Jaro distance algorithm, measuring the similarity between -/// two strings based on the number and order of matching characters. -/// - [JaroWinkler](SimMatchType::JaroWinkler): Represents the Jaro-Winkler distance algorithm, a variant of Jaro -/// that gives more favorable ratings to strings that match from the beginning. 
-/// -/// This enum can be serialized and deserialized using Serde, with the variant names -/// automatically converted to snake_case during this process. #[derive(Serialize, Deserialize, Clone, Copy, Debug, PartialEq)] #[serde(rename_all = "snake_case")] pub enum SimMatchType { Levenshtein, - DamerauLevenshtein, - Indel, - Jaro, - JaroWinkler, } -/// A struct representing a similarity table used for matching operations. -/// -/// The [SimTable] struct is used to define a table of words and associated identifiers that -/// will be used in similarity matching. Each table has an ID, a match identifier, a list of words, -/// and a threshold for scoring. -/// -/// The lifetime `'a` ensures that the references to the word list remain valid for as long as -/// the `SimTable` instance exists. -/// -/// # Fields -/// -/// - `table_id` ([u32]): The unique identifier for the similarity table. -/// - `match_id` ([u32]): An ID that serves as an identifier for the match within the table. -/// - `sim_match_type` ([SimMatchType]): The type of similarity matching algorithm to be used -/// with this table. -/// - `word_list` ([&'a Vec<&'a str>]): A reference to a vector of string slices representing -/// the words in this similarity table. These words will be used in the matching process. -/// - `threshold` ([f64]): The threshold value for similarity scoring. This score typically -/// ranges from 0.0 to 1.0, with higher values indicating higher similarity. 
-/// -/// # Example -/// -/// ``` -/// use matcher_rs::{SimTable, SimMatchType}; -/// -/// let words = vec!["example1", "example2"]; -/// -/// let table = SimTable { -/// table_id: 1, -/// match_id: 1, -/// sim_match_type: SimMatchType::Levenshtein, -/// word_list: &words, -/// threshold: 0.8, -/// }; -/// ``` #[derive(Debug, Clone)] pub struct SimTable<'a> { pub table_id: u32, pub match_id: u32, + pub process_type: ProcessType, pub sim_match_type: SimMatchType, pub word_list: &'a Vec<&'a str>, pub threshold: f64, } -/// A struct representing a processed similarity table. -/// -/// The [SimProcessedTable] struct holds the preprocessed data for similarity matching operations. -/// After a [SimTable] has been processed, its data is converted and stored in this struct, which -/// includes all necessary information for performing match operations, such as the unique table ID, -/// match ID, type of similarity matching algorithm used, a list of words, and the threshold for -/// similarity scoring. -/// -/// # Fields -/// -/// - `table_id` ([u32]): The unique identifier for the similarity table. -/// - `match_id` ([u32]): An ID that serves as an identifier for the match within the table. -/// - `sim_match_type` ([SimMatchType]): The type of similarity matching algorithm used for this table. -/// - `word_list` ([`Vec`]): A vector of owned strings representing the words in this similarity table. -/// These words have been preprocessed and are ready for the matching process. -/// - `threshold` ([f64]): The threshold value for similarity scoring. This score ranges from 0.0 to 1.0, -/// with higher values indicating higher similarity. #[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] struct SimProcessedTable { table_id: u32, match_id: u32, + process_type: ProcessType, sim_match_type: SimMatchType, word_list: Vec, threshold: f64, } -/// A struct representing the result of a similarity match. 
-/// -/// The `SimResult` struct captures the details of a word that was found to be similar -/// during the similarity matching process. It includes the matched word, the unique -/// identifier of the table where the word was found, the match identifier of that table, -/// and the similarity score computed for the match. -/// -/// The lifetimes ensure that the references in the `SimResult` struct remain valid -/// for as long as the struct instance exists. -/// -/// # Fields -/// -/// - `match_id` ([u32]): An ID that serves as an identifier for the match. -/// - `table_id` ([u32]): The unique identifier of the table where the word was found. -/// - `word` ([Cow<'a, str>]): The word that was found to be similar. It is stored as a [Cow] -/// (clone-on-write) to allow for both owned and borrowed strings. -/// - `similarity` ([f64]): The similarity score computed for the match. This score typically -/// ranges from 0.0 to 1.0, with higher values indicating greater similarity. #[derive(Debug, Clone)] pub struct SimResult<'a> { pub match_id: u32, pub table_id: u32, + pub word_id: u32, pub word: Cow<'a, str>, pub similarity: f64, } impl MatchResultTrait<'_> for SimResult<'_> { + fn match_id(&self) -> u32 { + self.match_id + } fn table_id(&self) -> u32 { self.table_id } + fn word_id(&self) -> u32 { + 0 + } fn word(&self) -> &str { - self.word.as_ref() + &self.word + } + fn similarity(&self) -> f64 { + self.similarity } } -/// A struct representing a similarity matcher. -/// -/// The [SimMatcher] struct is responsible for managing and processing similarity matching -/// operations on provided textual data using predefined tables. It includes functionality -/// to preprocess text by removing special characters and to search for matches within -/// the preprocessed tables using normalized Levenshtein similarity. 
-/// -/// # Fields -/// -/// - `remove_special_pattern` ([Regex]): A compiled regular expression used for removing -/// special characters from the text before processing. -/// - `sim_processed_table_list` ([`Vec`]): A vector containing preprocessed -/// tables, where each table consists of a list of words and identifiers ready for -/// similarity matching. -/// -/// # Example -/// -/// ``` -/// use matcher_rs::{SimMatcher, SimTable, SimMatchType}; -/// -/// let word_list = vec!["example1", "example2"]; -/// -/// let sim_tables = vec![ -/// SimTable { -/// table_id: 1, -/// match_id: 1, -/// sim_match_type: SimMatchType::Levenshtein, -/// word_list: &word_list, -/// threshold: 0.8, -/// }, -/// // Add more SimTable instances as desired -/// ]; -/// -/// let matcher = SimMatcher::new(&sim_tables); -/// ``` #[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct SimMatcher { - #[cfg_attr(feature = "serde", serde(with = "serde_regex"))] - remove_special_pattern: Regex, + process_type_tree: Vec, sim_processed_table_list: Vec, } impl SimMatcher { - /// Creates a new instance of [SimMatcher] by preprocessing the provided list of [SimTable] instances. - /// - /// This function takes a reference to a list of [SimTable] instances provided by the user and - /// preprocesses each table to create corresponding `SimProcessedTable` instances. The preprocessing - /// involves compiling a regular expression for removing special characters and converting the - /// words and match identifiers to owned [String] types. - /// - /// # Parameters - /// - /// - `sim_table_list` (&[SimTable]): A reference to a slice of [SimTable] instances to be preprocessed. - /// - /// # Returns - /// - /// - [SimMatcher]: A new instance of [SimMatcher] with preprocessed tables ready for similarity matching. 
- /// - /// # Example - /// - /// ``` - /// use matcher_rs::{SimMatcher, SimTable, SimMatchType}; - /// - /// let word_list = vec!["example1", "example2"]; - /// - /// let sim_tables = vec![ - /// SimTable { - /// table_id: 1, - /// match_id: 1, - /// sim_match_type: SimMatchType::Levenshtein, - /// word_list: &word_list, - /// threshold: 0.8, - /// }, - /// // Add more SimTable instances as desired - /// ]; - /// - /// let matcher = SimMatcher::new(&sim_tables); - /// ``` pub fn new(sim_table_list: &[SimTable]) -> SimMatcher { + let mut process_type_list = Vec::with_capacity(sim_table_list.len()); + let mut sim_processed_table_list = Vec::with_capacity(sim_table_list.len()); + + for sim_table in sim_table_list { + process_type_list.push(sim_table.process_type); + sim_processed_table_list.push(SimProcessedTable { + table_id: sim_table.table_id, + match_id: sim_table.match_id, + process_type: sim_table.process_type, + sim_match_type: sim_table.sim_match_type, + word_list: sim_table + .word_list + .iter() + .map(|&word| word.to_owned()) + .collect::>(), + threshold: sim_table.threshold, + }) + } + + let process_type_tree = build_process_type_tree(&process_type_list); + SimMatcher { - remove_special_pattern: Regex::new(r"\W+").unwrap(), - sim_processed_table_list: sim_table_list - .iter() - .map(|sim_table| SimProcessedTable { - table_id: sim_table.table_id, - match_id: sim_table.match_id, - sim_match_type: sim_table.sim_match_type, - word_list: sim_table - .word_list - .iter() - .map(|&word| word.to_owned()) - .collect::>(), - threshold: sim_table.threshold, - }) - .collect(), + process_type_tree, + sim_processed_table_list, } } } impl<'a> TextMatcherTrait<'a, SimResult<'a>> for SimMatcher { - /// Checks if the given text has any similarity match within the preprocessed tables. - /// - /// This function processes the input text by removing special characters and then - /// checks if the processed text has any similarity match within the preprocessed tables. 
- /// Various similarity metrics are used based on the type specified in each table. - /// The function returns `true` if there is any match that meets the threshold specified - /// for similarity, otherwise `false`. - /// - /// # Parameters - /// - /// - `text` (&str): A reference to the text string to be processed and checked - /// against the preprocessed tables for similarity matches. - /// - /// # Returns - /// - /// - (bool): `true` if a similarity match is found that meets the specified threshold, otherwise `false`. - /// - /// # Example - /// - /// ``` - /// use matcher_rs::{SimMatcher, SimTable, TextMatcherTrait, SimMatchType}; - /// - /// let word_list = vec!["example1", "example2"]; - /// - /// let sim_tables = vec![ - /// SimTable { - /// table_id: 1, - /// match_id: 1, - /// word_list: &word_list, - /// sim_match_type: SimMatchType::Levenshtein, - /// threshold: 0.8, - /// }, - /// // Add more SimTable instances as desired - /// ]; - /// - /// let matcher = SimMatcher::new(&sim_tables); - /// - /// let is_matched = matcher.is_match("example3"); - /// - /// if is_matched { - /// println!("The text has a similarity match in the preprocessed tables."); - /// } else { - /// println!("No similarity match found."); - /// } - /// ``` - fn is_match(&self, text: &str) -> bool { - let processed_text = self.remove_special_pattern.replace_all(text, ""); + fn is_match(&'a self, text: &'a str) -> bool { + let processed_text_process_type_set = + reduce_text_process_with_tree(&self.process_type_tree, text); - self.sim_processed_table_list - .iter() - .any(|sim_table| match sim_table.sim_match_type { - SimMatchType::Levenshtein => sim_table.word_list.iter().any(|text| { - distance::levenshtein::normalized_similarity_with_args( - text.chars(), - processed_text.chars(), - &distance::levenshtein::Args::default().score_cutoff(sim_table.threshold), - ) - .is_some() - }), - SimMatchType::DamerauLevenshtein => sim_table.word_list.iter().any(|text| { - 
distance::damerau_levenshtein::normalized_similarity_with_args( - text.chars(), - processed_text.chars(), - &distance::damerau_levenshtein::Args::default() - .score_cutoff(sim_table.threshold), - ) - .is_some() - }), - SimMatchType::Indel => sim_table.word_list.iter().any(|text| { - distance::indel::normalized_similarity_with_args( - text.chars(), - processed_text.chars(), - &distance::indel::Args::default().score_cutoff(sim_table.threshold), - ) - .is_some() - }), - SimMatchType::Jaro => sim_table.word_list.iter().any(|text| { - distance::jaro::normalized_similarity_with_args( - text.chars(), - processed_text.chars(), - &distance::jaro::Args::default().score_cutoff(sim_table.threshold), - ) - .is_some() - }), - SimMatchType::JaroWinkler => sim_table.word_list.iter().any(|text| { - distance::jaro_winkler::normalized_similarity_with_args( - text.chars(), - processed_text.chars(), - &distance::jaro_winkler::Args::default().score_cutoff(sim_table.threshold), - ) - .is_some() - }), - }) + self._is_match_with_processed_text_process_type_set(&processed_text_process_type_set) } - /// Processes the input text and returns a list of similarity results based on the - /// preprocessed tables and their respective similarity match types and thresholds. - /// - /// This function removes special characters from the input text, then iterates through - /// each preprocessed similarity table to calculate the similarity scores between the - /// processed input text and each word in the table's word list. The results are collected - /// into a vector of `SimResult` instances for each word that meets the similarity threshold. - /// - /// # Parameters - /// - /// - `text` (&str): A reference to the text string to be processed and checked against - /// the preprocessed tables for similarity matches. - /// - /// # Returns - /// - /// - [`Vec`]: A vector containing `SimResult` instances for each word that meets - /// the similarity threshold specified in the corresponding similarity table. 
- /// - /// # Example - /// - /// ``` - /// use matcher_rs::{SimMatcher, SimTable, TextMatcherTrait, SimMatchType}; - /// - /// let word_list = vec!["example1", "example2"]; - /// - /// let sim_tables = vec![ - /// SimTable { - /// table_id: 1, - /// match_id: 1, - /// word_list: &word_list, - /// sim_match_type: SimMatchType::Levenshtein, - /// threshold: 0.8, - /// }, - /// // Add more SimTable instances as desired - /// ]; - /// - /// let matcher = SimMatcher::new(&sim_tables); - /// - /// let results = matcher.process("example3"); - /// - /// for result in results { - /// println!( - /// "Found match in table {}: word={}, similarity={}", - /// result.table_id, result.word, result.similarity - /// ); - /// } - /// ``` - fn process(&'a self, text: &str) -> Vec> { - let processed_text = self.remove_special_pattern.replace_all(text, ""); - - let mut result_list = Vec::new(); - - for sim_table in &self.sim_processed_table_list { - match sim_table.sim_match_type { - SimMatchType::Levenshtein => { - result_list.extend(sim_table.word_list.iter().filter_map(|text| { + fn _is_match_with_processed_text_process_type_set( + &'a self, + processed_text_process_type_set: &[(Cow<'a, str>, id_set::IdSet)], + ) -> bool { + for (processed_text, process_type_set) in processed_text_process_type_set { + for sim_processed_table in &self.sim_processed_table_list { + if !process_type_set.contains(sim_processed_table.process_type.bits() as usize) { + continue; + } + let is_match = match sim_processed_table.sim_match_type { + SimMatchType::Levenshtein => sim_processed_table.word_list.iter().any(|text| { distance::levenshtein::normalized_similarity_with_args( text.chars(), processed_text.chars(), &distance::levenshtein::Args::default() - .score_cutoff(sim_table.threshold), - ) - .map(|similarity| SimResult { - match_id: sim_table.match_id, - table_id: sim_table.table_id, - word: Cow::Borrowed(text), - similarity, - }) - })) - } - SimMatchType::DamerauLevenshtein => { - 
result_list.extend(sim_table.word_list.iter().filter_map(|text| { - distance::damerau_levenshtein::normalized_similarity_with_args( - text.chars(), - processed_text.chars(), - &distance::damerau_levenshtein::Args::default() - .score_cutoff(sim_table.threshold), + .score_cutoff(sim_processed_table.threshold), ) - .map(|similarity| SimResult { - match_id: sim_table.match_id, - table_id: sim_table.table_id, - word: Cow::Borrowed(text), - similarity, - }) - })) - } - SimMatchType::Indel => { - result_list.extend(sim_table.word_list.iter().filter_map(|text| { - distance::indel::normalized_similarity_with_args( - text.chars(), - processed_text.chars(), - &distance::indel::Args::default().score_cutoff(sim_table.threshold), - ) - .map(|similarity| SimResult { - match_id: sim_table.match_id, - table_id: sim_table.table_id, - word: Cow::Borrowed(text), - similarity, - }) - })) + .is_some() + }), + }; + + if is_match { + return true; } - SimMatchType::Jaro => { - result_list.extend(sim_table.word_list.iter().filter_map(|text| { - distance::jaro::normalized_similarity_with_args( - text.chars(), - processed_text.chars(), - &distance::jaro::Args::default().score_cutoff(sim_table.threshold), - ) - .map(|similarity| SimResult { - match_id: sim_table.match_id, - table_id: sim_table.table_id, - word: Cow::Borrowed(text), - similarity, - }) - })) + } + } + + false + } + + fn process(&'a self, text: &'a str) -> Vec> { + let processed_text_process_type_set = + reduce_text_process_with_tree(&self.process_type_tree, text); + + self._process_with_processed_text_process_type_set(&processed_text_process_type_set) + } + + fn _process_with_processed_text_process_type_set( + &'a self, + processed_text_process_type_set: &[(Cow<'a, str>, IdSet)], + ) -> Vec> { + let mut result_list = Vec::new(); + let mut table_id_index_set = IntSet::default(); + + for (processed_text, process_type_set) in processed_text_process_type_set { + for sim_processed_table in &self.sim_processed_table_list { + if 
!process_type_set.contains(sim_processed_table.process_type.bits() as usize) { + continue; } - SimMatchType::JaroWinkler => { - result_list.extend(sim_table.word_list.iter().filter_map(|text| { - distance::jaro_winkler::normalized_similarity_with_args( - text.chars(), - processed_text.chars(), - &distance::jaro_winkler::Args::default() - .score_cutoff(sim_table.threshold), - ) - .map(|similarity| SimResult { - match_id: sim_table.match_id, - table_id: sim_table.table_id, - word: Cow::Borrowed(text), - similarity, - }) - })) + match sim_processed_table.sim_match_type { + SimMatchType::Levenshtein => { + for (index, text) in sim_processed_table.word_list.iter().enumerate() { + let table_id_index = + ((sim_processed_table.table_id as u64) << 32) | (index as u64); + + if table_id_index_set.insert(table_id_index) { + if let Some(similarity) = + distance::levenshtein::normalized_similarity_with_args( + text.chars(), + processed_text.chars(), + &distance::levenshtein::Args::default() + .score_cutoff(sim_processed_table.threshold), + ) + { + result_list.push(SimResult { + match_id: sim_processed_table.match_id, + table_id: sim_processed_table.table_id, + word_id: index as u32, + word: Cow::Borrowed(text), + similarity, + }); + } + } + } + } } } } diff --git a/matcher_rs/src/simple_matcher.rs b/matcher_rs/src/simple_matcher.rs index 2bbfbed..1ebb710 100644 --- a/matcher_rs/src/simple_matcher.rs +++ b/matcher_rs/src/simple_matcher.rs @@ -1,98 +1,19 @@ -use std::fmt::Display; use std::iter; use std::{borrow::Cow, collections::HashMap}; use ahash::AHashMap; use aho_corasick_unsafe::{AhoCorasick, AhoCorasickBuilder, AhoCorasickKind}; -use bitflags::bitflags; -use nohash_hasher::{IntMap, IntSet, IsEnabled}; -use serde::{Deserializer, Serializer}; +use nohash_hasher::{IntMap, IntSet}; use sonic_rs::{Deserialize, Serialize}; use crate::matcher::{MatchResultTrait, TextMatcherTrait}; use crate::process::process_matcher::{ - build_smt_tree, reduce_text_process_emit, 
reduce_text_process_with_tree, SimpleMatchTypeBitNode, + build_process_type_tree, reduce_text_process_emit, reduce_text_process_with_tree, ProcessType, + ProcessTypeBitNode, }; -bitflags! { - /// [SimpleMatchType] is a set of flags used to specify various text transformation rules. - /// - /// Each flag represents a specific type of string conversion or deletion operation. - /// The flags can be combined using bitwise operations to create complex transformation rules. - /// - /// # Flags - /// - /// * [None](SimpleMatchType::None) (0b00000001) - No transformation is applied. - /// * [Fanjian](SimpleMatchType::Fanjian) (0b00000010) - Simplifies traditional Chinese characters to simplified ones. - /// * [WordDelete](SimpleMatchType::WordDelete) (0b00000100) - Deletes word-level components based on predefined rules. - /// * [TextDelete](SimpleMatchType::TextDelete) (0b00001000) - Deletes text-level components, including special characters and whitespace. - /// * [Delete](SimpleMatchType::Delete) (0b00001100) - Combines [WordDelete](SimpleMatchType::WordDelete) and [TextDelete](SimpleMatchType::TextDelete) transformations. - /// * [Normalize](SimpleMatchType::Normalize) (0b00010000) - Normalizes the text, including case normalization and removing variations. - /// * [DeleteNormalize](SimpleMatchType::DeleteNormalize) (0b00011100) - Combines [Delete](SimpleMatchType::Delete) and [Normalize](SimpleMatchType::Normalize) transformations. - /// * [FanjianDeleteNormalize](SimpleMatchType::FanjianDeleteNormalize) (0b00011110) - Combines [Fanjian](SimpleMatchType::Fanjian), [Delete](SimpleMatchType::Delete), and [Normalize](SimpleMatchType::Normalize) transformations. - /// * [PinYin](SimpleMatchType::PinYin) (0b00100000) - Converts Chinese characters to their Pinyin representation. - /// * [PinYinChar](SimpleMatchType::PinYinChar) (0b01000000) - Converts individual Chinese characters to their Pinyin representation. 
- #[derive(Hash, PartialEq, Eq, Clone, Copy, Debug, Default)] - pub struct SimpleMatchType: u8 { - const None = 0b00000001; - const Fanjian = 0b00000010; - const WordDelete = 0b00000100; - const TextDelete = 0b00001000; - const Delete = 0b00001100; - const Normalize = 0b00010000; - const DeleteNormalize = 0b00011100; - const FanjianDeleteNormalize = 0b00011110; - const PinYin = 0b00100000; - const PinYinChar = 0b01000000; - } -} - -impl Serialize for SimpleMatchType { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - self.bits().serialize(serializer) - } -} +pub type SimpleTable<'a> = IntMap>; -impl<'de> Deserialize<'de> for SimpleMatchType { - fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { - let bits: u8 = u8::deserialize(deserializer)?; - Ok(SimpleMatchType::from_bits_retain(bits)) - } -} - -impl Display for SimpleMatchType { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let display_str_list = self - .iter_names() - .map(|(name, _)| name.to_lowercase()) - .collect::>(); - write!(f, "{:?}", display_str_list.join("_")) - } -} - -impl IsEnabled for SimpleMatchType {} - -pub type SimpleMatchTypeWordMap<'a> = IntMap>; - -/// `WordConf` represents the configuration and attributes of a specific word, -/// including its textual representation, split bit vector, and a non-indexable position. -/// -/// This structure is essential for configuring words that will be processed by the -/// [SimpleMatcher] for pattern matching and text transformations. The `word` field holds -/// the actual text of the word, `split_bit` contains the vector for split bits, and -/// `not_index` indicates a specific position that should not be indexed during the matching process. -/// -/// # Fields -/// -/// * `word` - A [String] representing the textual content of the word. -/// * `split_bit` - A [`Vec`] representing the vector that holds split bits for the word. 
-/// * `not_index` - A [usize] denoting a position in the word that is exempt from indexing. #[derive(Debug, Clone, Serialize, Deserialize)] struct WordConf { word: String, @@ -100,33 +21,6 @@ struct WordConf { not_index: usize, } -/// [SimpleResult] represents the result of a matching operation. -/// -/// This structure is used to store the outcome of a text matching operation performed -/// by the [SimpleMatcher]. It holds details about the matched word, including its -/// unique identifier (`word_id`) and the matched text (`word`). The [SimpleResult] -/// structure is designed to provide a consistent and accessible interface for retrieving -/// the results of text matching operations. -/// -/// # Fields -/// -/// * `word_id` - A [u32] value representing the unique identifier of the matched word. -/// * `word` - A [Cow<'a, str>] representing the matched text. This allows the text to be -/// either borrowed or owned, providing flexibility in handling the string data. -/// -/// # Example -/// -/// ``` -/// use matcher_rs::{SimpleResult, MatchResultTrait}; -/// use std::borrow::Cow; -/// -/// let result = SimpleResult { -/// word_id: 42, -/// word: Cow::Borrowed("example"), -/// }; -/// -/// assert_eq!(result.word_id(), 42); -/// ``` #[derive(Debug, Serialize)] pub struct SimpleResult<'a> { pub word_id: u32, @@ -134,140 +28,50 @@ pub struct SimpleResult<'a> { } impl MatchResultTrait<'_> for SimpleResult<'_> { + fn match_id(&self) -> u32 { + 0 + } + fn table_id(&self) -> u32 { + 0 + } fn word_id(&self) -> u32 { self.word_id } fn word(&self) -> &str { - self.word.as_ref() + &self.word + } + fn similarity(&self) -> f64 { + 1.0 } } -/// [SimpleMatcher] is a structure designed for efficient pattern matching and text transformations. -/// -/// The [SimpleMatcher] structure encapsulates various configurations, matchers, and nodes needed to -/// perform text matching operations efficiently. 
It uses different matching rules defined by the -/// [SimpleMatchType] and builds necessary data structures for pattern matching, including an Aho-Corasick -/// automaton for fast multi-pattern matching. -/// -/// # Fields -/// -/// * `smt_tree` - A [Vec] of `SimpleMatchTypeBitNode` that represents the match type tree for hierarchical -/// or complex match type relationships. -/// * `smt_matcher` - An [AhoCorasick] matcher that facilitates the multi-pattern matching based on the configured -/// match types and word patterns. -/// * `smt_ac_dedup_word_conf_list` - A [Vec] of lists containing tuples of [SimpleMatchType], word ID [u32], and -/// a size [usize] that helps in deduplication of word configurations for the matcher. -/// * `simple_word_conf_map` - An [IntMap] that maps word IDs [u32] to their corresponding `WordConf` structures, -/// providing configuration details for each word. -/// -/// The [SimpleMatcher] is typically initialized and configured using the provided word maps and match types, -/// and it is used to perform fast and reliable text matching operations in various applications. -/// -/// # Example -/// -/// ``` -/// use std::collections::HashMap; -/// use matcher_rs::{SimpleMatcher, SimpleMatchType, TextMatcherTrait}; -/// -/// // Initialize word maps and SimpleMatchType instances. -/// let word_maps = HashMap::from([ -/// (SimpleMatchType::Fanjian, HashMap::from([(1, "ChineseWord1"), (2, "ChineseWord2")])), -/// (SimpleMatchType::Normalize, HashMap::from([(3, "NormalizationExample1"), (4, "NormalizationExample2")])) -/// ]); -/// -/// // Create a SimpleMatcher instance using the provided word maps. -/// let simple_matcher = SimpleMatcher::new(&word_maps); -/// -/// // Check if a text matches any patterns based on the configured SimpleMatcher. -/// let text = "ExampleText"; -/// let is_match = simple_matcher.is_match(text); -/// -/// // Process the input text and return a list of matching results. 
-/// let results = simple_matcher.process(text); -/// ``` -/// -/// # See also: -/// -/// * [SimpleMatchType] - Enum defining various match types and their respective flags. -/// * [SimpleMatcher::new] - Method to initialize a new `SimpleMatcher` instance. #[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct SimpleMatcher { - smt_tree: Vec, - smt_matcher: AhoCorasick, - smt_ac_dedup_word_conf_list: Vec>, - simple_word_conf_map: IntMap, + process_type_tree: Vec, + ac_matcher: AhoCorasick, + ac_dedup_word_conf_list: Vec>, + word_conf_map: IntMap, } impl SimpleMatcher { - /// Constructs a new `SimpleMatcher` instance from a provided map of `SimpleMatchType` to word maps. - /// - /// This function initializes a `SimpleMatcher` with mappings and configurations needed for efficient - /// text matching based on the provided `SimpleMatchType` rules. It creates the necessary structures for - /// pattern matching, including Aho-Corasick tables and word configuration mappings. - /// - /// # Arguments - /// - /// * `smt_word_map` - A reference to a `HashMap` where keys are `SimpleMatchType` and values - /// are `HashMap` of word IDs to their corresponding words. - /// - /// # Type Parameters - /// - /// * `I` - A type that can be referenced as a string slice. This represents the type of the words in the map. - /// * `S1` - A hasher for the inner `HashMap` keys (word IDs). - /// * `S2` - A hasher for the outer `HashMap` keys (`SimpleMatchType`). - /// - /// # Returns - /// - /// * `SimpleMatcher` - A configured `SimpleMatcher` instance ready for pattern matching. 
- /// - /// # Example - /// - /// ``` - /// use std::collections::HashMap; - /// use matcher_rs::{SimpleMatcher, SimpleMatchType}; - /// - /// let smt_word_map = HashMap::from([ - /// (SimpleMatchType::Fanjian, HashMap::from([(1, "example1"), (2, "example2")])), - /// (SimpleMatchType::Normalize, HashMap::from([(3, "example3"), (4, "example4")])), - /// ]); - /// - /// let simple_matcher = SimpleMatcher::new(&smt_word_map); - /// ``` - /// - /// # Detailed Processing: - /// - /// 1. Collects and copies the keys from `smt_word_map` to create `smt_list`. - /// 2. If the length of `smt_word_map` is 4 or more, builds the `smt_tree` using `build_smt_tree`. - /// 3. Initializes empty vectors and maps for storing configurations and deduplication. - /// 4. Iterates over each `SimpleMatchType` and its corresponding word map: - /// a. For each word, splits it based on '&' and '~' characters to separate the included and excluded parts. - /// b. Processes the split words and updates counters for both included and excluded parts. - /// c. Inserts the word configurations into `simple_word_conf_map`. - /// d. Processes and reduces the text for the Aho-Corasick matcher, updating the deduplication maps. - /// 5. Chooses the Aho-Corasick matcher kind and prefilter settings based on feature flags. - /// 6. Builds the Aho-Corasick matcher using the processed and reduced text words. - /// 7. Returns a new `SimpleMatcher` instance with the initialized structures. 
- /// pub fn new( - smt_word_map: &HashMap, S2>, + process_type_word_map: &HashMap, S2>, ) -> SimpleMatcher where I: AsRef, { - let mut smt_list = Vec::new(); - let mut smt_ac_dedup_word_conf_list = Vec::new(); - let mut simple_word_conf_map = IntMap::default(); + let mut process_type_list = Vec::new(); + let mut ac_dedup_word_conf_list = Vec::new(); + let mut word_conf_map = IntMap::default(); let mut ac_dedup_word_id = 0; let mut ac_dedup_word_list = Vec::new(); let mut ac_dedup_word_id_map = AHashMap::new(); - for (&simple_match_type, simple_word_map) in smt_word_map { - let word_simple_match_type = simple_match_type - SimpleMatchType::TextDelete; - let text_simple_match_type = simple_match_type - SimpleMatchType::WordDelete; - - smt_list.push(text_simple_match_type); + for (&process_type, simple_word_map) in process_type_word_map { + let word_process_type = process_type - ProcessType::Delete; + process_type_list.push(process_type); for (&simple_word_id, simple_word) in simple_word_map { let mut ac_split_word_and_counter = AHashMap::default(); @@ -328,7 +132,7 @@ impl SimpleMatcher { .chain(ac_split_word_not_counter.values().copied()) .collect::>(); - simple_word_conf_map.insert( + word_conf_map.insert( simple_word_id, WordConf { word: simple_word.as_ref().to_owned(), @@ -342,18 +146,18 @@ impl SimpleMatcher { .chain(ac_split_word_not_counter.keys()) .enumerate() { - for ac_word in reduce_text_process_emit(word_simple_match_type, split_word) { + for ac_word in reduce_text_process_emit(word_process_type, split_word) { if let Some(ac_dedup_word_id) = ac_dedup_word_id_map.get(ac_word.as_ref()) { // Guaranteed not failed - let word_conf_list: &mut Vec<(SimpleMatchType, u32, usize)> = unsafe { - smt_ac_dedup_word_conf_list + let word_conf_list: &mut Vec<(ProcessType, u32, usize)> = unsafe { + ac_dedup_word_conf_list .get_unchecked_mut(*ac_dedup_word_id as usize) }; - word_conf_list.push((text_simple_match_type, simple_word_id, offset)); + 
word_conf_list.push((process_type, simple_word_id, offset)); } else { ac_dedup_word_id_map.insert(ac_word.clone(), ac_dedup_word_id); - smt_ac_dedup_word_conf_list.push(vec![( - text_simple_match_type, + ac_dedup_word_conf_list.push(vec![( + process_type, simple_word_id, offset, )]); @@ -365,7 +169,7 @@ impl SimpleMatcher { } } - let smt_tree = build_smt_tree(&smt_list); + let process_type_tree = build_process_type_tree(&process_type_list); #[cfg(feature = "dfa")] let aho_corasick_kind = AhoCorasickKind::DFA; @@ -377,7 +181,7 @@ impl SimpleMatcher { #[cfg(not(feature = "serde"))] let prefilter = true; - let smt_matcher = AhoCorasickBuilder::new() + let ac_matcher = AhoCorasickBuilder::new() .kind(Some(aho_corasick_kind)) .ascii_case_insensitive(true) .prefilter(prefilter) @@ -385,79 +189,58 @@ impl SimpleMatcher { .unwrap(); SimpleMatcher { - smt_tree, - smt_matcher, - smt_ac_dedup_word_conf_list, - simple_word_conf_map, + process_type_tree, + ac_matcher, + ac_dedup_word_conf_list, + word_conf_map, } } } impl<'a> TextMatcherTrait<'a, SimpleResult<'a>> for SimpleMatcher { - /// Checks if the input text contains any matches based on the patterns stored in the matcher. - /// - /// This function returns a boolean indicating whether any patterns are found within the input text. - /// It processes the input text according to transformations defined by each [SimpleMatchType], - /// and utilizes the Aho-Corasick algorithm to find overlapping patterns. If at least one match is found - /// according to the configurations, the function returns `true`. - /// - /// # Arguments - /// - /// * `text` - A string slice representing the input text to be checked for matches. - /// - /// # Returns - /// - /// * `bool` - `true` if at least one match is found, `false` otherwise. - /// - /// # Detailed Processing: - /// - /// 1. If the input text is empty, return `false`. - /// 2. 
Initialize maps and sets to track word configurations during processing, including: - /// * `word_id_split_bit_map`: A map to track the bit matrices for word configurations. - /// * `word_id_set`: A set to track word IDs that have a valid match. - /// * `not_word_id_set`: A set to track word IDs that should be excluded. - /// 3. Process the input text using `reduce_text_process_with_tree` to get transformed versions - /// and corresponding [SimpleMatchType] sets. - /// 4. Iterate through the processed text and corresponding sets: - /// a. Use the Aho-Corasick matcher to find overlapping patterns. - /// b. For each match, update the bit matrices according to the configurations. - /// c. Check if the match should be excluded based on the not set or existing configurations. - /// d. If a valid match is found according to the bit matrices, add it to the `word_id_set`. - /// 5. If `word_id_set` is not empty after processing, return `true`. - /// 6. Return `false` if no valid matches are found. 
- fn is_match(&self, text: &str) -> bool { + fn is_match(&'a self, text: &'a str) -> bool { if text.is_empty() { return false; } + let processed_text_process_type_set = + reduce_text_process_with_tree(&self.process_type_tree, text); + + self._is_match_with_processed_text_process_type_set(&processed_text_process_type_set) + } + + fn _is_match_with_processed_text_process_type_set( + &'a self, + processed_text_process_type_set: &[(Cow<'a, str>, id_set::IdSet)], + ) -> bool { let mut word_id_split_bit_map = IntMap::default(); let mut word_id_set = IntSet::default(); let mut not_word_id_set = IntSet::default(); - let processed_text_smt_list = reduce_text_process_with_tree(&self.smt_tree, text); - let processed_times = processed_text_smt_list.len(); + let processed_times = processed_text_process_type_set.len(); - for (index, (processed_text, smt_set)) in processed_text_smt_list.iter().enumerate() { + for (index, (processed_text, process_type_set)) in + processed_text_process_type_set.iter().enumerate() + { // Guaranteed not failed for ac_dedup_result in unsafe { - self.smt_matcher + self.ac_matcher .try_find_overlapping_iter(processed_text.as_ref()) .unwrap_unchecked() } { // Guaranteed not failed - for &(match_simple_match_type, word_id, offset) in unsafe { - self.smt_ac_dedup_word_conf_list + for &(match_process_type, word_id, offset) in unsafe { + self.ac_dedup_word_conf_list .get_unchecked(ac_dedup_result.pattern().as_usize()) } { - if !smt_set.contains(match_simple_match_type.bits() as usize) + if !process_type_set.contains(match_process_type.bits() as usize) || not_word_id_set.contains(&word_id) { continue; } // Guaranteed not failed - let word_conf = - unsafe { self.simple_word_conf_map.get(&word_id).unwrap_unchecked() }; + let word_conf = unsafe { self.word_conf_map.get(&word_id).unwrap_unchecked() }; let split_bit_matrix = word_id_split_bit_map.entry(word_id).or_insert_with(|| { @@ -498,78 +281,48 @@ impl<'a> TextMatcherTrait<'a, SimpleResult<'a>> for 
SimpleMatcher { false } - /// - /// This function processes the input text and returns a vector of [SimpleResult] containing word matches - /// found within the text. It utilizes transformations defined by each [SimpleMatchType] and utilizes - /// the Aho-Corasick algorithm to identify overlapping patterns. - /// - /// # Arguments - /// - /// * `text` - A string slice representing the input text to be checked for matches. - /// - /// # Returns - /// - /// * [`Vec`] - A vector containing [SimpleResult] objects, each containing a `word_id` and `word` - /// indicating a valid match found within the input text. - /// - /// # Detailed Processing: - /// - /// 1. If the input text is empty, return an empty vector. - /// 2. Initialize maps and sets to track word configurations during processing, including: - /// * `word_id_split_bit_map`: A map to track the bit matrices for word configurations. - /// * `not_word_id_set`: A set to track word IDs that should be excluded. - /// 3. Process the input text using `reduce_text_process_with_tree` to get transformed versions - /// and corresponding [SimpleMatchType] sets. - /// 4. Iterate through the processed text and corresponding sets: - /// a. Use the Aho-Corasick matcher to find overlapping patterns. - /// b. For each match, update the bit matrices according to the configurations. - /// c. Check if the match should be excluded based on the not set or existing configurations. - /// 5. Filter out and collect valid matches into a vector of [SimpleResult]: - /// * A match is considered valid if it satisfies the bit matrix configurations and - /// is not present in the `not_word_id_set`. - /// - /// # Safety - /// - /// The function uses several `unsafe` blocks for performance reasons, assuming that: - /// * The iterator over the processed text will not fail. - /// * The configurations for word ID and bit matrices are valid and properly aligned. 
- /// * Accessing elements in maps and vectors using unchecked indexing will not lead to out-of-bound errors. - /// - /// Use of these `unsafe` blocks is carefully justified to ensure efficient processing and is based - /// on guarantees provided either by the input text and configuration maps or the logical structure - /// of the program. - fn process(&'a self, text: &str) -> Vec> { + fn process(&'a self, text: &'a str) -> Vec> { if text.is_empty() { return Vec::new(); } + let processed_text_process_type_set = + reduce_text_process_with_tree(&self.process_type_tree, text); + + self._process_with_processed_text_process_type_set(&processed_text_process_type_set) + } + + fn _process_with_processed_text_process_type_set( + &'a self, + processed_text_process_type_set: &[(Cow<'a, str>, id_set::IdSet)], + ) -> Vec> { let mut word_id_split_bit_map = IntMap::default(); let mut not_word_id_set = IntSet::default(); - let processed_text_smt_list = reduce_text_process_with_tree(&self.smt_tree, text); - let processed_times = processed_text_smt_list.len(); + let processed_times = processed_text_process_type_set.len(); - for (index, (processed_text, smt_set)) in processed_text_smt_list.iter().enumerate() { + for (index, (processed_text, process_type_set)) in + processed_text_process_type_set.iter().enumerate() + { // Guaranteed not failed for ac_dedup_result in unsafe { - self.smt_matcher + self.ac_matcher .try_find_overlapping_iter(processed_text.as_ref()) .unwrap_unchecked() } { // Guaranteed not failed - for &(match_simple_match_type, word_id, offset) in unsafe { - self.smt_ac_dedup_word_conf_list + for &(match_process_type, word_id, offset) in unsafe { + self.ac_dedup_word_conf_list .get_unchecked(ac_dedup_result.pattern().as_usize()) } { - if !smt_set.contains(match_simple_match_type.bits() as usize) + if !process_type_set.contains(match_process_type.bits() as usize) || not_word_id_set.contains(&word_id) { continue; } // Guaranteed not failed - let word_conf = - unsafe { 
self.simple_word_conf_map.get(&word_id).unwrap_unchecked() }; + let word_conf = unsafe { self.word_conf_map.get(&word_id).unwrap_unchecked() }; let split_bit_matrix = word_id_split_bit_map.entry(word_id).or_insert_with(|| { @@ -607,8 +360,7 @@ impl<'a> TextMatcherTrait<'a, SimpleResult<'a>> for SimpleMatcher { word_id, word: Cow::Borrowed( // Guaranteed not failed - &unsafe { self.simple_word_conf_map.get(&word_id).unwrap_unchecked() } - .word, + &unsafe { self.word_conf_map.get(&word_id).unwrap_unchecked() }.word, ), }) }) diff --git a/matcher_rs/tests/test.rs b/matcher_rs/tests/test.rs index a07aca1..0b3dc5a 100644 --- a/matcher_rs/tests/test.rs +++ b/matcher_rs/tests/test.rs @@ -1,16 +1,16 @@ mod test_simple { use std::collections::HashMap; - use matcher_rs::{SimpleMatchType, SimpleMatcher, SimpleWord, TextMatcherTrait}; + use matcher_rs::{ProcessType, SimpleMatcher, SimpleWord, TextMatcherTrait}; #[test] fn simple_match_init() { let _ = SimpleMatcher::new(&HashMap::from([( - SimpleMatchType::None, + ProcessType::None, HashMap::from([(1, "")]), )])); let _ = SimpleMatcher::new(&HashMap::from([( - SimpleMatchType::None, + ProcessType::None, HashMap::from([(1, "hello"), (2, "world")]), )])); } @@ -18,13 +18,13 @@ mod test_simple { #[test] fn simple_match_fanjian() { let simple_matcher = SimpleMatcher::new(&HashMap::from([( - SimpleMatchType::Fanjian, + ProcessType::Fanjian, HashMap::from([(1, "你好")]), )])); assert!(simple_matcher.is_match("妳好")); let simple_matcher = SimpleMatcher::new(&HashMap::from([( - SimpleMatchType::Fanjian, + ProcessType::Fanjian, HashMap::from([(1, "妳好")]), )])); assert!(simple_matcher.is_match("你好")); @@ -33,7 +33,7 @@ mod test_simple { #[test] fn simple_match_delete() { let simple_matcher = SimpleMatcher::new(&HashMap::from([( - SimpleMatchType::Delete, + ProcessType::Delete, HashMap::from([(1, "你好")]), )])); assert!(simple_matcher.is_match("你!好")); @@ -42,7 +42,7 @@ mod test_simple { #[test] fn simple_match_normalize() { let 
simple_matcher = SimpleMatcher::new(&HashMap::from([( - SimpleMatchType::Normalize, + ProcessType::Normalize, HashMap::from([(1, "he11o")]), )])); assert!(simple_matcher.is_match("ℋЀ⒈㈠Õ")); @@ -51,7 +51,7 @@ mod test_simple { #[test] fn simple_match_pinyin() { let simple_matcher = SimpleMatcher::new(&HashMap::from([( - SimpleMatchType::PinYin, + ProcessType::PinYin, HashMap::from([(1, "西安")]), )])); assert!(simple_matcher.is_match("洗按")); @@ -61,7 +61,7 @@ mod test_simple { #[test] fn simple_match_pinyinchar() { let simple_matcher = SimpleMatcher::new(&HashMap::from([( - SimpleMatchType::PinYinChar, + ProcessType::PinYinChar, HashMap::from([(1, "西安")]), )])); assert!(simple_matcher.is_match("洗按")); @@ -72,7 +72,7 @@ mod test_simple { #[test] fn simple_match_combination() { let simple_matcher = SimpleMatcher::new(&HashMap::from([( - SimpleMatchType::None, + ProcessType::None, HashMap::from([ (1, SimpleWord::from("hello").and("world")), (2, SimpleWord::from("hello").and("world").and("hello")), @@ -95,13 +95,14 @@ mod test_simple { } mod test_regex { - use matcher_rs::{RegexMatchType, RegexMatcher, RegexTable, TextMatcherTrait}; + use matcher_rs::{ProcessType, RegexMatchType, RegexMatcher, RegexTable, TextMatcherTrait}; #[test] fn regex_match_regex() { let regex_matcher = RegexMatcher::new(&[RegexTable { table_id: 1, match_id: 1, + process_type: ProcessType::None, regex_match_type: RegexMatchType::Regex, word_list: &vec!["h[aeiou]llo", "w[aeiou]rd"], }]); @@ -115,6 +116,7 @@ mod test_regex { let regex_matcher = RegexMatcher::new(&[RegexTable { table_id: 1, match_id: 1, + process_type: ProcessType::None, regex_match_type: RegexMatchType::Acrostic, word_list: &vec!["h,e,l,l,o", "你,好"], }]); @@ -129,6 +131,7 @@ mod test_regex { let regex_matcher = RegexMatcher::new(&[RegexTable { table_id: 1, match_id: 1, + process_type: ProcessType::None, regex_match_type: RegexMatchType::SimilarChar, word_list: &vec!["hello,hi,H,你好", "world,word,🌍,世界"], }]); @@ -139,13 +142,14 @@ mod 
test_regex { } mod test_sim { - use matcher_rs::{SimMatchType, SimMatcher, SimTable, TextMatcherTrait}; + use matcher_rs::{ProcessType, SimMatchType, SimMatcher, SimTable, TextMatcherTrait}; #[test] fn sim_match() { let sim_matcher = SimMatcher::new(&[SimTable { table_id: 1, match_id: 1, + process_type: ProcessType::None, sim_match_type: SimMatchType::Levenshtein, word_list: &vec!["helloworld"], threshold: 0.8, @@ -161,7 +165,7 @@ mod test_sim { mod test_matcher { use std::collections::HashMap; - use matcher_rs::{MatchTable, MatchTableType, Matcher, SimpleMatchType, TextMatcherTrait}; + use matcher_rs::{MatchTable, MatchTableType, Matcher, ProcessType, TextMatcherTrait}; #[test] fn matcher_init() { @@ -171,10 +175,10 @@ mod test_matcher { vec![MatchTable { table_id: 1, match_table_type: MatchTableType::Simple { - simple_match_type: SimpleMatchType::None, + process_type: ProcessType::None, }, word_list: vec![], - exemption_simple_match_type: SimpleMatchType::None, + exemption_process_type: ProcessType::None, exemption_word_list: vec![], }], )])); @@ -187,10 +191,10 @@ mod test_matcher { vec![MatchTable { table_id: 1, match_table_type: MatchTableType::Simple { - simple_match_type: SimpleMatchType::None, + process_type: ProcessType::None, }, word_list: vec!["hello"], - exemption_simple_match_type: SimpleMatchType::None, + exemption_process_type: ProcessType::None, exemption_word_list: vec!["world"], }], )])); @@ -198,3 +202,74 @@ mod test_matcher { assert!(!matcher.is_match("hello,world")) } } + +mod test_process { + use matcher_rs::{ + build_process_type_tree, reduce_text_process, reduce_text_process_emit, + reduce_text_process_with_list, reduce_text_process_with_tree, text_process, ProcessType, + }; + + #[test] + fn test_text_process() { + let text = text_process(ProcessType::Fanjian, "~ᗩ~躶~𝚩~軆~Ⲉ~"); + println!("{:?}", text); + } + + #[test] + fn test_reduce_text_process() { + let text = reduce_text_process(ProcessType::FanjianDeleteNormalize, "~ᗩ~躶~𝚩~軆~Ⲉ~"); + 
println!("{:?}", text); + } + + #[test] + fn test_reduce_text_process_emit() { + let text = reduce_text_process_emit(ProcessType::FanjianDeleteNormalize, "~ᗩ~躶~𝚩~軆~Ⲉ~"); + println!("{:?}", text); + } + + #[test] + fn test_build_process_type_tree() { + let process_type_list = vec![ + ProcessType::Fanjian, + ProcessType::DeleteNormalize, + ProcessType::FanjianDeleteNormalize, + ProcessType::Delete, + ProcessType::Normalize, + ]; + let process_type_tree = build_process_type_tree(&process_type_list); + println!("{:?}", process_type_tree); + } + + #[test] + fn test_reduce_text_process_with_tree() { + let process_type_list = vec![ + ProcessType::Fanjian, + ProcessType::DeleteNormalize, + ProcessType::FanjianDeleteNormalize, + ProcessType::Delete, + ProcessType::Normalize, + ]; + let process_type_tree = build_process_type_tree(&process_type_list); + let text = "test爽-︻"; + + let processed_text_process_type_set = + reduce_text_process_with_tree(&process_type_tree, text); + println!("{processed_text_process_type_set:?}"); + } + + #[test] + fn test_reduce_text_process_with_list() { + let process_type_list = vec![ + ProcessType::Fanjian, + ProcessType::DeleteNormalize, + ProcessType::FanjianDeleteNormalize, + ProcessType::Delete, + ProcessType::Normalize, + ]; + let text = "test爽-︻"; + + let processed_text_process_type_set = + reduce_text_process_with_list(&process_type_list, text); + println!("{processed_text_process_type_set:?}"); + } +}