From 39766c207347fc893dfcc974e1d8b7a73028b44a Mon Sep 17 00:00:00 2001 From: yuuki toriyama Date: Mon, 12 Aug 2024 12:17:13 +0900 Subject: [PATCH 1/6] =?UTF-8?q?update:=20nom=E3=81=AE=E9=99=A4=E5=8D=B4:?= =?UTF-8?q?=20`VagueExpressionAdapter`=E3=81=A7nom=E3=82=92=E4=BD=BF?= =?UTF-8?q?=E7=94=A8=E3=81=97=E3=81=A6=E3=81=84=E3=82=8B=E7=AE=87=E6=89=80?= =?UTF-8?q?=E3=82=92=E8=87=AA=E5=89=8D=E3=81=AE=E5=AE=9F=E8=A3=85=E3=81=AB?= =?UTF-8?q?=E5=B7=AE=E3=81=97=E6=9B=BF=E3=81=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/src/parser/adapter/vague_expression_adapter.rs | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/core/src/parser/adapter/vague_expression_adapter.rs b/core/src/parser/adapter/vague_expression_adapter.rs index dc400878..ccb4058f 100644 --- a/core/src/parser/adapter/vague_expression_adapter.rs +++ b/core/src/parser/adapter/vague_expression_adapter.rs @@ -1,8 +1,4 @@ use crate::util::sequence_matcher::SequenceMatcher; -use nom::bytes::complete::{is_a, is_not}; -use nom::combinator::rest; -use nom::error::Error; -use nom::sequence::tuple; pub struct VagueExpressionAdapter; @@ -11,13 +7,8 @@ impl VagueExpressionAdapter { if let Ok(highest_match) = SequenceMatcher::get_most_similar_match(input, region_name_list, None) { - let mut parser = tuple(( - is_not::<&str, &str, Error<&str>>("町村"), - is_a::<&str, &str, Error<&str>>("町村"), - rest, - )); - if let Ok((_, (_, _, rest))) = parser(input) { - return Some((rest.to_string(), highest_match)); + if let Some(position) = input.chars().position(|c| c == '町' || c == '村') { + return Some((input.chars().skip(position + 1).collect(), highest_match)); } } None From 36391b2fee3479b06911194d3ead0c53d2cebe0e Mon Sep 17 00:00:00 2001 From: yuuki toriyama Date: Mon, 12 Aug 2024 12:25:11 +0900 Subject: [PATCH 2/6] =?UTF-8?q?update:=20nom=E3=81=AE=E9=99=A4=E5=8D=B4:?= =?UTF-8?q?=20`VagueExpressionAdapter#apply`=E3=81=AE=E8=BF=94=E3=82=8A?= =?UTF-8?q?=E5=80=A4=E3=81=AE=E3=82=BF=E3=83=97=E3=83=AB=E3=81=AE=E4=B8=AD?= =?UTF-8?q?=E8=BA=AB=E3=81=AE=E9=A0=86=E7=95=AA=E3=82=92=E5=85=A5=E3=82=8C?= =?UTF-8?q?=E6=9B=BF=E3=81=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../adapter/vague_expression_adapter.rs | 22 +++++++++---------- core/src/tokenizer/read_city.rs | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/core/src/parser/adapter/vague_expression_adapter.rs b/core/src/parser/adapter/vague_expression_adapter.rs index ccb4058f..7ec6b724 100644 --- a/core/src/parser/adapter/vague_expression_adapter.rs +++ b/core/src/parser/adapter/vague_expression_adapter.rs @@ -8,7 +8,7 @@ impl VagueExpressionAdapter { SequenceMatcher::get_most_similar_match(input, region_name_list, None) { if let Some(position) = input.chars().position(|c| c == '町' || c == '村') { - return Some((input.chars().skip(position + 1).collect(), highest_match)); + return Some((highest_match, input.chars().skip(position + 1).collect())); } } None @@ -23,50 +23,50 @@ mod tests { #[test] fn 郡名が省略されている場合_吉田郡永平寺町() { let fukui = Prefecture::fukui(); - let (rest, city_name) = VagueExpressionAdapter {} + let (city_name, rest) = VagueExpressionAdapter {} .apply("永平寺町志比5-5", &fukui.cities) .unwrap(); - assert_eq!(rest, "志比5-5"); assert_eq!(city_name, "吉田郡永平寺町"); + assert_eq!(rest, "志比5-5"); } #[test] fn 郡名が省略されている場合_今立郡池田町() { let fukui = Prefecture::fukui(); - let (rest, city_name) = VagueExpressionAdapter {} + let (city_name, rest) = VagueExpressionAdapter {} .apply("池田町稲荷28-7", &fukui.cities) .unwrap(); - assert_eq!(rest, "稲荷28-7"); assert_eq!(city_name, "今立郡池田町"); + assert_eq!(rest, "稲荷28-7"); } #[test] fn 郡名が省略されている場合_南条郡南越前町() { let fukui = Prefecture::fukui(); - let (rest, city_name) = VagueExpressionAdapter {} + let (city_name, rest) = VagueExpressionAdapter {} .apply("南越前町今庄74-7-1", &fukui.cities) .unwrap(); - assert_eq!(rest, "今庄74-7-1"); assert_eq!(city_name, "南条郡南越前町"); + assert_eq!(rest, "今庄74-7-1"); } #[test] fn 郡名が省略されている場合_西村山郡河北町() { let yamagata = Prefecture::yamagata(); - let (rest, city_name) = VagueExpressionAdapter {} + let (city_name, rest) = VagueExpressionAdapter {} .apply("河北町大字吉田字馬場261", &yamagata.cities) .unwrap(); - assert_eq!(rest, "大字吉田字馬場261"); assert_eq!(city_name, "西村山郡河北町"); + assert_eq!(rest, "大字吉田字馬場261"); } #[test] fn 郡名と町名が一致している場合_最上郡最上町() { let yamagata = Prefecture::yamagata(); - let (rest, city_name) = VagueExpressionAdapter {} + let (city_name, rest) = VagueExpressionAdapter {} .apply("最上町法田2672-2", &yamagata.cities) .unwrap(); - assert_eq!(rest, "法田2672-2"); assert_eq!(city_name, "最上郡最上町"); + assert_eq!(rest, "法田2672-2"); } } diff --git a/core/src/tokenizer/read_city.rs b/core/src/tokenizer/read_city.rs index 67a11114..4869e2fb 100644 --- a/core/src/tokenizer/read_city.rs +++ b/core/src/tokenizer/read_city.rs @@ -75,9 +75,9 @@ impl Tokenizer { return Ok(Tokenizer { input: self.input.clone(), prefecture_name: self.prefecture_name.clone(), - city_name: Some(result.1), + city_name: Some(result.0), town_name: None, - rest: result.0, + rest: result.1, _state: PhantomData::, }); } From f4996f33970abc1ef2323fe1893c9ebf061bfab6 Mon Sep 17 00:00:00 2001 From: yuuki toriyama Date: Mon, 12 Aug 2024 12:55:10 +0900 Subject: [PATCH 3/6] =?UTF-8?q?update:=20nom=E3=81=AE=E9=99=A4=E5=8D=B4:?= =?UTF-8?q?=20`OrthographicalVariantAdapter`=E3=81=A7nom=E3=82=92=E4=BD=BF?= =?UTF-8?q?=E7=94=A8=E3=81=97=E3=81=A6=E3=81=84=E3=82=8B=E7=AE=87=E6=89=80?= =?UTF-8?q?=E3=82=92=E8=87=AA=E5=89=8D=E3=81=AE=E5=AE=9F=E8=A3=85=E3=81=AB?= =?UTF-8?q?=E5=B7=AE=E3=81=97=E6=9B=BF=E3=81=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../adapter/orthographical_variant_adapter.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/core/src/parser/adapter/orthographical_variant_adapter.rs b/core/src/parser/adapter/orthographical_variant_adapter.rs index ceed8b3f..52d2f4ce 100644 --- a/core/src/parser/adapter/orthographical_variant_adapter.rs +++ b/core/src/parser/adapter/orthographical_variant_adapter.rs @@ -1,7 +1,4 @@ use itertools::Itertools; -use nom::bytes::complete::tag; -use nom::error::VerboseError; -use nom::Parser; pub type Variant = &'static [&'static str]; @@ -86,11 +83,15 @@ impl OrthographicalVariantAdapter { // マッチ候補の中でパターンに引っかかるものがあれば文字を置き換えてマッチを試す if candidate.contains(permutation[0]) { let edited_region_name = candidate.replace(permutation[0], permutation[1]); - if let Ok((rest, _)) = - tag::<&str, &str, VerboseError<&str>>(&edited_region_name).parse(input) - { + if input.starts_with(&edited_region_name) { // マッチすれば早期リターン - return Some((rest.to_string(), region_name.to_string())); + return Some(( + input + .chars() + .skip(edited_region_name.chars().count()) + .collect(), + region_name.to_string(), + )); } else { // マッチしなければsemi_candidatesに置き換え後の文字列をpush semi_candidates.push(edited_region_name.clone()); From 04ea5feb9739d54dc17f01e164da4caeb0c82438 Mon Sep 17 00:00:00 2001 From: yuuki toriyama Date: Mon, 12 Aug 2024 13:01:14 +0900 Subject: [PATCH 4/6] =?UTF-8?q?update:=20nom=E3=81=AE=E9=99=A4=E5=8D=B4:?= =?UTF-8?q?=20`OrthographicalVariantAdapter#apply`=E3=81=AE=E8=BF=94?= =?UTF-8?q?=E3=82=8A=E5=80=A4=E3=81=AE=E3=82=BF=E3=83=97=E3=83=AB=E3=81=AE?= =?UTF-8?q?=E4=B8=AD=E8=BA=AB=E3=81=AE=E9=A0=86=E7=95=AA=E3=82=92=E5=85=A5?= =?UTF-8?q?=E3=82=8C=E6=9B=BF=E3=81=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/src/parser/adapter/orthographical_variant_adapter.rs | 2 +- core/src/tokenizer/read_city.rs | 4 ++-- core/src/tokenizer/read_town.rs | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/parser/adapter/orthographical_variant_adapter.rs b/core/src/parser/adapter/orthographical_variant_adapter.rs index 52d2f4ce..f870c60d 100644 --- a/core/src/parser/adapter/orthographical_variant_adapter.rs +++ b/core/src/parser/adapter/orthographical_variant_adapter.rs @@ -86,11 +86,11 @@ impl OrthographicalVariantAdapter { if input.starts_with(&edited_region_name) { // マッチすれば早期リターン return Some(( + region_name.to_string(), input .chars() .skip(edited_region_name.chars().count()) .collect(), - region_name.to_string(), )); } else { // マッチしなければsemi_candidatesに置き換え後の文字列をpush diff --git a/core/src/tokenizer/read_city.rs b/core/src/tokenizer/read_city.rs index 4869e2fb..a882d618 100644 --- a/core/src/tokenizer/read_city.rs +++ b/core/src/tokenizer/read_city.rs @@ -61,9 +61,9 @@ impl Tokenizer { return Ok(Tokenizer { input: self.input.clone(), prefecture_name: self.prefecture_name.clone(), - city_name: Some(result.1), + city_name: Some(result.0), town_name: None, - rest: result.0, + rest: result.1, _state: PhantomData::, }); } diff --git a/core/src/tokenizer/read_town.rs b/core/src/tokenizer/read_town.rs index 1f216530..eb4ccd4d 100644 --- a/core/src/tokenizer/read_town.rs +++ b/core/src/tokenizer/read_town.rs @@ -95,8 +95,8 @@ fn find_town(input: &str, candidates: &Vec) -> Option<(String, String)> Variant::蛍, ], }; - if let Some(result) = adapter.apply(input, candidate) { - return Some(result); + if let Some((region_name, rest)) = adapter.apply(input, candidate) { + return Some((rest, region_name)); }; } None From 2fa2464f6310b592f0275ff0bc73062842a19681 Mon Sep 17 00:00:00 2001 From: yuuki toriyama Date: Mon, 12 Aug 2024 13:08:51 +0900 Subject: [PATCH 5/6] =?UTF-8?q?update:=20nom=E3=81=AE=E9=99=A4=E5=8D=B4:?= =?UTF-8?q?=20`find=5Ftown()`=E3=81=AE=E8=BF=94=E3=82=8A=E5=80=A4=E3=81=AE?= =?UTF-8?q?=E3=82=BF=E3=83=97=E3=83=AB=E3=81=AE=E4=B8=AD=E8=BA=AB=E3=81=AE?= =?UTF-8?q?=E9=A0=86=E7=95=AA=E3=82=92=E5=85=A5=E3=82=8C=E6=9B=BF=E3=81=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `OrthographicalVariantAdapter#apply`の返り値と同じ順番になるようにした --- core/src/tokenizer/read_town.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/core/src/tokenizer/read_town.rs b/core/src/tokenizer/read_town.rs index eb4ccd4d..da8605ed 100644 --- a/core/src/tokenizer/read_town.rs +++ b/core/src/tokenizer/read_town.rs @@ -18,36 +18,36 @@ impl Tokenizer { if rest.contains("丁目") { rest = NonKanjiBlockNumberFilter {}.apply(rest); } - if let Some(result) = find_town(&rest, &candidates) { + if let Some((town_name, rest)) = find_town(&rest, &candidates) { return Ok(Tokenizer { input: self.input.clone(), prefecture_name: self.prefecture_name.clone(), city_name: self.city_name.clone(), - town_name: Some(result.1), - rest: result.0, + town_name: Some(town_name), + rest, _state: PhantomData::, }); } // 「〇〇町L丁目M番N」ではなく「〇〇町L-M-N」と表記されているような場合 rest = InvalidTownNameFormatFilter {}.apply(rest); - if let Some(result) = find_town(&rest, &candidates) { + if let Some((town_name, rest)) = find_town(&rest, &candidates) { return Ok(Tokenizer { input: self.input.clone(), prefecture_name: self.prefecture_name.clone(), city_name: self.city_name.clone(), - town_name: Some(result.1), - rest: result.0, + town_name: Some(town_name), + rest, _state: PhantomData::, }); } // ここまでで町名の検出に成功しない場合は、「大字」の省略の可能性を検討する - if let Some(result) = find_town(&format!("大字{}", rest), &candidates) { + if let Some((town_name, rest)) = find_town(&format!("大字{}", rest), &candidates) { return Ok(Tokenizer { input: self.input.clone(), prefecture_name: self.prefecture_name.clone(), city_name: self.city_name.clone(), - town_name: Some(result.1), - rest: result.0, + town_name: Some(town_name), + rest, _state: PhantomData::, }); } @@ -66,11 +66,11 @@ fn find_town(input: &str, candidates: &Vec) -> Option<(String, String)> for candidate in candidates { if input.starts_with(candidate) { return Some(( + candidate.to_string(), input .chars() .skip(candidate.chars().count()) .collect::(), - candidate.to_string(), )); } let adapter = OrthographicalVariantAdapter { @@ -95,8 +95,8 @@ fn find_town(input: &str, candidates: &Vec) -> Option<(String, String)> Variant::蛍, ], }; - if let Some((region_name, rest)) = adapter.apply(input, candidate) { - return Some((rest, region_name)); + if let Some(result) = adapter.apply(input, candidate) { + return Some(result); }; } None From d2f02f025fc9023468526b1dc835314a9ff77db1 Mon Sep 17 00:00:00 2001 From: yuuki toriyama Date: Mon, 12 Aug 2024 13:16:23 +0900 Subject: [PATCH 6/6] =?UTF-8?q?remove:=20nom=E3=81=AE=E9=99=A4=E5=8D=B4:?= =?UTF-8?q?=20=E4=BE=9D=E5=AD=98=E9=96=A2=E4=BF=82=E3=81=8B=E3=82=89nom?= =?UTF-8?q?=E3=82=92=E9=99=A4=E5=8D=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/core/Cargo.toml b/core/Cargo.toml index 00e2f54d..d76cd3d2 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -21,7 +21,6 @@ blocking = ["reqwest/blocking"] [dependencies] itertools = "0.13.0" js-sys = "0.3.67" -nom = "7.1.3" rapidfuzz = "0.5.0" regex = "1.10.2" serde.workspace = true