From 9ff4e88fb9fc4066b99eed12496d50332f1107b0 Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Thu, 12 Sep 2024 23:00:01 +0900 Subject: [PATCH 1/3] =?UTF-8?q?update:=20#410:=20=E6=96=87=E5=AD=97?= =?UTF-8?q?=E5=88=97=E3=81=8B=E3=82=89=E3=83=9B=E3=83=AF=E3=82=A4=E3=83=88?= =?UTF-8?q?=E3=82=B9=E3=83=9A=E3=83=BC=E3=82=B9=E3=82=92=E5=8F=96=E3=82=8A?= =?UTF-8?q?=E9=99=A4=E3=81=8F=E3=83=A1=E3=82=BD=E3=83=83=E3=83=89=E3=82=92?= =?UTF-8?q?=E5=AE=9A=E7=BE=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/src/util/extension.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/core/src/util/extension.rs b/core/src/util/extension.rs index f7a81cf3..a8cd5940 100644 --- a/core/src/util/extension.rs +++ b/core/src/util/extension.rs @@ -10,10 +10,15 @@ impl CharExt for char { } pub(crate) trait StrExt { + fn strip_whitespaces(&self) -> String; fn strip_variation_selectors(&self) -> String; } impl StrExt for str { + /// 文字列からホワイトスペースを取り除きます + fn strip_whitespaces(&self) -> String { + self.chars().filter(|c| !c.is_whitespace()).collect() + } /// 文字列から異字体セレクタを取り除きます fn strip_variation_selectors(&self) -> String { self.chars() @@ -59,4 +64,16 @@ mod tests { assert_ne!(normal, variant); assert_eq!(normal, variant.strip_variation_selectors()); } + + #[test] + fn strip_whitespaces() { + assert_eq!("四谷1丁目".strip_whitespaces(), "四谷1丁目"); + assert_eq!("四谷 1丁目".strip_whitespaces(), "四谷1丁目"); + assert_eq!("四谷 1丁目".strip_whitespaces(), "四谷1丁目"); + assert_eq!("四谷 1 丁 目".strip_whitespaces(), "四谷1丁目"); + assert_eq!("神田3丁目".strip_whitespaces(), "神田3丁目"); + assert_eq!("神田 3丁目".strip_whitespaces(), "神田3丁目"); + assert_eq!("神田  3丁目".strip_whitespaces(), "神田3丁目"); + assert_eq!("神田 3 丁目".strip_whitespaces(), "神田3丁目"); + } } From 8aaebcdfb3de363127705b6fcba1f086940aa51b Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Thu, 12 Sep 2024 23:10:27 +0900 Subject: [PATCH 2/3] =?UTF-8?q?update:=20#410:=20=E3=83=95=E3=82=A3?= 
=?UTF-8?q?=E3=83=BC=E3=83=81=E3=83=A3=E3=83=95=E3=83=A9=E3=82=B0`eliminat?= =?UTF-8?q?e-whitespaces`=E3=81=8C=E6=8C=87=E5=AE=9A=E3=81=95=E3=82=8C?= =?UTF-8?q?=E3=81=9F=E5=A0=B4=E5=90=88=E3=80=81=E5=85=A5=E5=8A=9B=E3=81=95?= =?UTF-8?q?=E3=82=8C=E3=81=9F=E6=96=87=E5=AD=97=E5=88=97=E3=81=8B=E3=82=89?= =?UTF-8?q?=E3=83=9B=E3=83=AF=E3=82=A4=E3=83=88=E3=82=B9=E3=83=9A=E3=83=BC?= =?UTF-8?q?=E3=82=B9=E3=82=92=E5=8F=96=E3=82=8A=E9=99=A4=E3=81=8F=E3=82=88?= =?UTF-8?q?=E3=81=86=E3=81=AB=E3=81=97=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/Cargo.toml | 1 + core/src/tokenizer/read_prefecture.rs | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/core/Cargo.toml b/core/Cargo.toml index 2ad307de..0e72ca7d 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -19,6 +19,7 @@ default = ["city-name-correction"] blocking = ["reqwest/blocking"] city-name-correction = [] format-house-number = [] +eliminate-whitespaces = [] [dependencies] itertools = "0.13.0" diff --git a/core/src/tokenizer/read_prefecture.rs b/core/src/tokenizer/read_prefecture.rs index d811f320..e88f639b 100644 --- a/core/src/tokenizer/read_prefecture.rs +++ b/core/src/tokenizer/read_prefecture.rs @@ -60,7 +60,11 @@ impl Tokenizer { prefecture_name: None, city_name: None, town_name: None, - rest: input.strip_variation_selectors(), + rest: if cfg!(feature = "eliminate-whitespaces") { + input.strip_variation_selectors().strip_whitespaces() + } else { + input.strip_variation_selectors() + }, _state: PhantomData, } } @@ -117,6 +121,17 @@ mod tests { assert_eq!(tokenizer.rest, "東京都葛飾区立石5-13-1") } + #[test] + #[cfg(feature = "eliminate-whitespaces")] + fn new_ホワイトスペース除却() { + let tokenizer = Tokenizer::new("東京都 目黒区 下目黒 4‐1‐1"); + assert_eq!(tokenizer.input, "東京都 目黒区 下目黒 4‐1‐1"); + assert_eq!(tokenizer.prefecture_name, None); + assert_eq!(tokenizer.city_name, None); + assert_eq!(tokenizer.town_name, None); + 
assert_eq!(tokenizer.rest, "東京都目黒区下目黒4‐1‐1") + } + #[test] fn read_prefecture_成功() { let tokenizer = Tokenizer::new("東京都港区芝公園4丁目2-8"); From f61e93d81a1f875e267c7782a4c0be0f44227083 Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Thu, 12 Sep 2024 23:12:57 +0900 Subject: [PATCH 3/3] =?UTF-8?q?update:=20#410:=20=E3=83=95=E3=82=A3?= =?UTF-8?q?=E3=83=BC=E3=83=81=E3=83=A3=E3=83=95=E3=83=A9=E3=82=B0`eliminat?= =?UTF-8?q?e-whitespaces`=E3=81=AB=E3=81=A4=E3=81=84=E3=81=A6=E3=83=89?= =?UTF-8?q?=E3=82=AD=E3=83=A5=E3=83=A1=E3=83=B3=E3=83=88=E3=81=AB=E8=A8=98?= =?UTF-8?q?=E8=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/lib.rs b/core/src/lib.rs index aa31c5a9..b942087c 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -4,6 +4,7 @@ //! - `blocking`: Provide method that works synchronously //! - `city-name-correction`*(enabled by default)*: Enable autocorrection if ambiguous city name was typed //! - `format-house-number`: Enable normalization of addresses after town name +//! - `eliminate-whitespaces`*(experimental)*: Enable elimination of whitespace from the given text #[cfg(all(target_family = "wasm", feature = "blocking"))] compile_error! {