diff --git a/core/Cargo.toml b/core/Cargo.toml index d76cd3d2..d4b35f09 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -15,8 +15,9 @@ rust-version = "1.73.0" crate-type = ["rlib", "cdylib"] [features] -default = [] +default = ["city-name-correction"] blocking = ["reqwest/blocking"] +city-name-correction = [] [dependencies] itertools = "0.13.0" diff --git a/core/src/lib.rs b/core/src/lib.rs index 63fe3cbb..f340ca78 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -1,3 +1,9 @@ +//! A Rust library to parse Japanese addresses. +//! +//! ## Feature flags +//! - `blocking`: Provides methods that work synchronously +//! - `city-name-correction` *(enabled by default)*: Enables autocorrection when an ambiguous city name is entered + #[cfg(all(target_family = "wasm", feature = "blocking"))] compile_error! { "The `blocking` feature is not supported with wasm target." diff --git a/core/src/parser.rs b/core/src/parser.rs index c51eed3b..7245e1fe 100644 --- a/core/src/parser.rs +++ b/core/src/parser.rs @@ -98,11 +98,21 @@ pub async fn parse(api: Arc, input: &str) -> ParseResult { Ok(result) => result, }; // 市町村名を特定 - let Ok(tokenizer) = tokenizer.read_city(prefecture.cities) else { - return ParseResult { - address: Address::from(tokenizer), - error: Some(Error::new_parse_error(ParseErrorKind::City)), - }; + let tokenizer = match tokenizer.read_city(&prefecture.cities) { + Ok(found) => found, + Err(not_found) => { + // 市区町村が特定できない場合かつフィーチャフラグが有効な場合、郡名が抜けている可能性を検討 + match not_found.read_city_with_county_name_completion(&prefecture.cities) { + Ok(found) if cfg!(feature = "city-name-correction") => found, + _ => { + // それでも見つからない場合は終了 + return ParseResult { + address: Address::from(tokenizer), + error: Some(Error::new_parse_error(ParseErrorKind::City)), + }; + } + } + } }; // その市町村の町名リストを取得 let city = match api @@ -254,11 +264,19 @@ pub fn parse_blocking(api: Arc, input: &str) -> ParseResult { } Ok(result) => result, }; - let Ok(tokenizer) = tokenizer.read_city(prefecture.cities) 
else { - return ParseResult { - address: Address::from(tokenizer), - error: Some(Error::new_parse_error(ParseErrorKind::City)), - }; + let tokenizer = match tokenizer.read_city(&prefecture.cities) { + Ok(found) => found, + Err(not_found) => { + match not_found.read_city_with_county_name_completion(&prefecture.cities) { + Ok(found) if cfg!(feature = "city-name-correction") => found, + _ => { + return ParseResult { + address: Address::from(tokenizer), + error: Some(Error::new_parse_error(ParseErrorKind::City)), + }; + } + } + } }; let city = match api.get_city_master( tokenizer.prefecture_name.as_ref().unwrap(), diff --git a/core/src/parser/adapter.rs b/core/src/parser/adapter.rs index d64fc3c0..f606829f 100644 --- a/core/src/parser/adapter.rs +++ b/core/src/parser/adapter.rs @@ -1,2 +1 @@ pub mod orthographical_variant_adapter; -pub mod vague_expression_adapter; diff --git a/core/src/parser/adapter/vague_expression_adapter.rs b/core/src/parser/adapter/vague_expression_adapter.rs deleted file mode 100644 index badec48f..00000000 --- a/core/src/parser/adapter/vague_expression_adapter.rs +++ /dev/null @@ -1,122 +0,0 @@ -use crate::util::sequence_matcher::SequenceMatcher; - -pub struct VagueExpressionAdapter; - -impl VagueExpressionAdapter { - pub fn apply(self, input: &str, region_name_list: &[String]) -> Option<(String, String)> { - if let Ok(highest_match) = - SequenceMatcher::get_most_similar_match(input, region_name_list, None) - { - if let Ok(complemented_address) = complement_county_name(input, &highest_match) { - return Some(( - highest_match.clone(), - complemented_address - .chars() - .skip(highest_match.chars().count()) - .collect(), - )); - } - } - None - } -} - -/// 郡名が抜けている住所に郡名を補う関数 -/// -/// 欠けている郡名を補うだけで、それ以上のことはしない。 -/// 市区町村名に表記揺れがあってもそれを上書きすることはしない。 -fn complement_county_name(vague_address: &str, with: &str) -> Result { - match with.chars().position(|c| c == '郡') { - None => Err("郡名が見つかりませんでした"), - Some(position) => Ok(with.chars().take(position + 
1).collect::() + vague_address), - } -} - -#[cfg(test)] -mod tests { - use crate::domain::geolonia::entity::Prefecture; - use crate::parser::adapter::vague_expression_adapter::{ - complement_county_name, VagueExpressionAdapter, - }; - - #[test] - fn complement_county_name_郡名が省略されている場合() { - assert_eq!( - complement_county_name("大町町大字福母297", "杵島郡大町町").unwrap(), - "杵島郡大町町大字福母297" - ); - assert_eq!( - complement_county_name("村田町大字村田字迫6", "柴田郡村田町").unwrap(), - "柴田郡村田町大字村田字迫6" - ); - assert_eq!( - complement_county_name("玉村町上新田1116", "佐波郡玉村町").unwrap(), - "佐波郡玉村町上新田1116" - ); - // 市区町村名に表記揺れも含む場合 - assert_eq!( - complement_county_name("桧原村上元郷403", "西多摩郡檜原村").unwrap(), - "西多摩郡桧原村上元郷403" - ) - } - - #[test] - fn 郡名が省略されている場合_吉田郡永平寺町() { - let fukui = Prefecture::fukui(); - let (city_name, rest) = VagueExpressionAdapter {} - .apply("永平寺町志比5-5", &fukui.cities) - .unwrap(); - assert_eq!(city_name, "吉田郡永平寺町"); - assert_eq!(rest, "志比5-5"); - } - - #[test] - fn 郡名が省略されている場合_今立郡池田町() { - let fukui = Prefecture::fukui(); - let (city_name, rest) = VagueExpressionAdapter {} - .apply("池田町稲荷28-7", &fukui.cities) - .unwrap(); - assert_eq!(city_name, "今立郡池田町"); - assert_eq!(rest, "稲荷28-7"); - } - - #[test] - fn 郡名が省略されている場合_南条郡南越前町() { - let fukui = Prefecture::fukui(); - let (city_name, rest) = VagueExpressionAdapter {} - .apply("南越前町今庄74-7-1", &fukui.cities) - .unwrap(); - assert_eq!(city_name, "南条郡南越前町"); - assert_eq!(rest, "今庄74-7-1"); - } - - #[test] - fn 郡名が省略されている場合_西村山郡河北町() { - let yamagata = Prefecture::yamagata(); - let (city_name, rest) = VagueExpressionAdapter {} - .apply("河北町大字吉田字馬場261", &yamagata.cities) - .unwrap(); - assert_eq!(city_name, "西村山郡河北町"); - assert_eq!(rest, "大字吉田字馬場261"); - } - - #[test] - fn 郡名が省略されている場合_杵島郡大町町() { - let saga = Prefecture::saga(); - let (city_name, rest) = VagueExpressionAdapter {} - .apply("大町町大字大町5017番地", &saga.cities) - .unwrap(); - assert_eq!(city_name, "杵島郡大町町"); - assert_eq!(rest, "大字大町5017番地"); - } - - #[test] - fn 
郡名と町名が一致している場合_最上郡最上町() { - let yamagata = Prefecture::yamagata(); - let (city_name, rest) = VagueExpressionAdapter {} - .apply("最上町法田2672-2", &yamagata.cities) - .unwrap(); - assert_eq!(city_name, "最上郡最上町"); - assert_eq!(rest, "法田2672-2"); - } -} diff --git a/core/src/tokenizer.rs b/core/src/tokenizer.rs index 0a4c9af7..aa305502 100644 --- a/core/src/tokenizer.rs +++ b/core/src/tokenizer.rs @@ -1,4 +1,5 @@ pub(crate) mod read_city; +pub(crate) mod read_city_with_county_name_completion; pub(crate) mod read_prefecture; pub(crate) mod read_town; @@ -11,6 +12,8 @@ pub(crate) struct PrefectureNameFound; #[derive(Debug)] pub(crate) struct CityNameFound; #[derive(Debug)] +pub(crate) struct CityNameNotFound; +#[derive(Debug)] pub(crate) struct TownNameFound; #[derive(Debug)] pub(crate) struct End; diff --git a/core/src/tokenizer/read_city.rs b/core/src/tokenizer/read_city.rs index a882d618..4c865f14 100644 --- a/core/src/tokenizer/read_city.rs +++ b/core/src/tokenizer/read_city.rs @@ -3,15 +3,14 @@ use std::marker::PhantomData; use crate::parser::adapter::orthographical_variant_adapter::{ OrthographicalVariantAdapter, OrthographicalVariants, Variant, }; -use crate::parser::adapter::vague_expression_adapter::VagueExpressionAdapter; -use crate::tokenizer::{CityNameFound, End, PrefectureNameFound, Tokenizer}; +use crate::tokenizer::{CityNameFound, CityNameNotFound, PrefectureNameFound, Tokenizer}; impl Tokenizer { pub(crate) fn read_city( &self, - candidates: Vec, - ) -> Result, Tokenizer> { - for candidate in &candidates { + candidates: &Vec, + ) -> Result, Tokenizer> { + for candidate in candidates { if self.rest.starts_with(candidate) { return Ok(Tokenizer { input: self.input.clone(), @@ -69,26 +68,13 @@ impl Tokenizer { } } - // ここまでで市町村名の特定ができない場合はVagueExpressionAdapterを使用して市町村名を推測する - let vague_expression_adapter = VagueExpressionAdapter {}; - if let Some(result) = vague_expression_adapter.apply(self.rest.as_str(), &candidates) { - return Ok(Tokenizer { - input: 
self.input.clone(), - prefecture_name: self.prefecture_name.clone(), - city_name: Some(result.0), - town_name: None, - rest: result.1, - _state: PhantomData::, - }); - } - Err(Tokenizer { input: self.input.clone(), prefecture_name: self.prefecture_name.clone(), city_name: None, town_name: None, rest: self.rest.clone(), - _state: PhantomData::, + _state: PhantomData::, }) } } @@ -108,7 +94,7 @@ mod tests { rest: "横浜市保土ケ谷区川辺町2番地9".to_string(), _state: PhantomData::, }; - let result = tokenizer.read_city(vec![ + let result = tokenizer.read_city(&vec![ "横浜市保土ケ谷区".to_string(), "横浜市鶴見区".to_string(), "横浜市西区".to_string(), @@ -132,7 +118,7 @@ mod tests { rest: "横浜市保土ヶ谷区川辺町2番地9".to_string(), _state: PhantomData::, }; - let result = tokenizer.read_city(vec![ + let result = tokenizer.read_city(&vec![ "横浜市保土ケ谷区".to_string(), "横浜市鶴見区".to_string(), "横浜市西区".to_string(), @@ -146,31 +132,6 @@ mod tests { assert_eq!(tokenizer.rest, "川辺町2番地9"); } - #[test] - fn read_city_vague_expression_adapterで成功() { - let tokenizer = Tokenizer { - input: "埼玉県東秩父村大字御堂634番地".to_string(), // 「秩父郡」が省略されている - prefecture_name: Some("埼玉県".to_string()), - city_name: None, - town_name: None, - rest: "東秩父村大字御堂634番地".to_string(), - _state: PhantomData::, - }; - let result = tokenizer.read_city(vec![ - "秩父郡皆野町".to_string(), - "秩父郡長瀞町".to_string(), - "秩父郡小鹿野町".to_string(), - "秩父郡東秩父村".to_string(), - ]); - assert!(result.is_ok()); - let tokenizer = result.unwrap(); - assert_eq!(tokenizer.input, "埼玉県東秩父村大字御堂634番地"); - assert_eq!(tokenizer.prefecture_name, Some("埼玉県".to_string())); - assert_eq!(tokenizer.city_name, Some("秩父郡東秩父村".to_string())); - assert_eq!(tokenizer.town_name, None); - assert_eq!(tokenizer.rest, "大字御堂634番地"); - } - #[test] fn read_city_失敗() { let tokenizer = Tokenizer { @@ -181,7 +142,7 @@ mod tests { rest: "京都市上京区川辺町2番地9".to_string(), _state: PhantomData::, }; - let result = tokenizer.read_city(vec![ + let result = tokenizer.read_city(&vec![ "横浜市保土ケ谷区".to_string(), "横浜市鶴見区".to_string(), 
"横浜市西区".to_string(), diff --git a/core/src/tokenizer/read_city_with_county_name_completion.rs b/core/src/tokenizer/read_city_with_county_name_completion.rs new file mode 100644 index 00000000..b666c82f --- /dev/null +++ b/core/src/tokenizer/read_city_with_county_name_completion.rs @@ -0,0 +1,205 @@ +use crate::tokenizer::{CityNameFound, CityNameNotFound, End, Tokenizer}; +use crate::util::sequence_matcher::SequenceMatcher; +use std::marker::PhantomData; + +impl Tokenizer { + pub(crate) fn read_city_with_county_name_completion( + &self, + candidates: &[String], + ) -> Result, Tokenizer> { + if let Ok(highest_match) = + SequenceMatcher::get_most_similar_match(&self.rest, candidates, None) + { + if let Ok(complemented_address) = complement_county_name(&self.rest, &highest_match) { + return Ok(Tokenizer { + input: self.input.clone(), + prefecture_name: self.prefecture_name.clone(), + city_name: Some(highest_match.clone()), + town_name: None, + rest: complemented_address + .chars() + .skip(highest_match.chars().count()) + .collect(), + _state: PhantomData::, + }); + } + } + Err(Tokenizer { + input: self.input.clone(), + prefecture_name: self.prefecture_name.clone(), + city_name: None, + town_name: None, + rest: self.rest.clone(), + _state: PhantomData::, + }) + } +} + +/// 郡名が抜けている住所に郡名を補う関数 +/// +/// 欠けている郡名を補うだけで、それ以上のことはしない。 +/// 市区町村名に表記揺れがあってもそれを上書きすることはしない。 +fn complement_county_name(vague_address: &str, with: &str) -> Result { + match with.chars().position(|c| c == '郡') { + None => Err("郡名が見つかりませんでした"), + Some(position) => Ok(with.chars().take(position + 1).collect::() + vague_address), + } +} + +#[cfg(test)] +mod tests { + use crate::domain::geolonia::entity::Prefecture; + use crate::tokenizer::read_city_with_county_name_completion::complement_county_name; + use crate::tokenizer::{CityNameNotFound, Tokenizer}; + use std::marker::PhantomData; + + #[test] + fn complement_county_name_郡名が省略されている場合() { + assert_eq!( + complement_county_name("大町町大字福母297", 
"杵島郡大町町").unwrap(), + "杵島郡大町町大字福母297" + ); + assert_eq!( + complement_county_name("村田町大字村田字迫6", "柴田郡村田町").unwrap(), + "柴田郡村田町大字村田字迫6" + ); + assert_eq!( + complement_county_name("玉村町上新田1116", "佐波郡玉村町").unwrap(), + "佐波郡玉村町上新田1116" + ); + // 市区町村名に表記揺れも含む場合 + assert_eq!( + complement_county_name("桧原村上元郷403", "西多摩郡檜原村").unwrap(), + "西多摩郡桧原村上元郷403" + ) + } + + #[test] + fn read_city_with_county_name_completion_秩父郡東秩父村() { + let tokenizer = Tokenizer { + input: "埼玉県東秩父村大字御堂634番地".to_string(), // 「秩父郡」が省略されている + prefecture_name: Some("埼玉県".to_string()), + city_name: None, + town_name: None, + rest: "東秩父村大字御堂634番地".to_string(), + _state: PhantomData::, + }; + let result = tokenizer.read_city_with_county_name_completion(&vec![ + "秩父郡皆野町".to_string(), + "秩父郡長瀞町".to_string(), + "秩父郡小鹿野町".to_string(), + "秩父郡東秩父村".to_string(), + ]); + assert!(result.is_ok()); + let tokenizer = result.unwrap(); + assert_eq!(tokenizer.input, "埼玉県東秩父村大字御堂634番地"); + assert_eq!(tokenizer.prefecture_name, Some("埼玉県".to_string())); + assert_eq!(tokenizer.city_name, Some("秩父郡東秩父村".to_string())); + assert_eq!(tokenizer.town_name, None); + assert_eq!(tokenizer.rest, "大字御堂634番地"); + } + + #[test] + fn read_city_with_county_name_completion_吉田郡永平寺町() { + let tokenizer = Tokenizer { + input: "".to_string(), + prefecture_name: None, + city_name: None, + town_name: None, + rest: "永平寺町志比5-5".to_string(), + _state: PhantomData::, + }; + let result = tokenizer.read_city_with_county_name_completion(&Prefecture::fukui().cities); + assert!(result.is_ok()); + let tokenizer = result.unwrap(); + assert_eq!(tokenizer.city_name, Some("吉田郡永平寺町".to_string())); + assert_eq!(tokenizer.rest, "志比5-5"); + } + + #[test] + fn read_city_with_county_name_completion_今立郡池田町() { + let tokenizer = Tokenizer { + input: "".to_string(), + prefecture_name: None, + city_name: None, + town_name: None, + rest: "池田町稲荷28-7".to_string(), + _state: PhantomData::, + }; + let result = 
tokenizer.read_city_with_county_name_completion(&Prefecture::fukui().cities); + assert!(result.is_ok()); + let tokenizer = result.unwrap(); + assert_eq!(tokenizer.city_name, Some("今立郡池田町".to_string())); + assert_eq!(tokenizer.rest, "稲荷28-7"); + } + + #[test] + fn read_city_with_county_name_completion_南条郡南越前町() { + let tokenizer = Tokenizer { + input: "".to_string(), + prefecture_name: None, + city_name: None, + town_name: None, + rest: "南越前町今庄74-7-1".to_string(), + _state: PhantomData::, + }; + let result = tokenizer.read_city_with_county_name_completion(&Prefecture::fukui().cities); + assert!(result.is_ok()); + let tokenizer = result.unwrap(); + assert_eq!(tokenizer.city_name, Some("南条郡南越前町".to_string())); + assert_eq!(tokenizer.rest, "今庄74-7-1"); + } + + #[test] + fn read_city_with_county_name_completion_西村山郡河北町() { + let tokenizer = Tokenizer { + input: "".to_string(), + prefecture_name: None, + city_name: None, + town_name: None, + rest: "河北町大字吉田字馬場261".to_string(), + _state: PhantomData::, + }; + let result = + tokenizer.read_city_with_county_name_completion(&Prefecture::yamagata().cities); + assert!(result.is_ok()); + let tokenizer = result.unwrap(); + assert_eq!(tokenizer.city_name, Some("西村山郡河北町".to_string())); + assert_eq!(tokenizer.rest, "大字吉田字馬場261"); + } + + #[test] + fn read_city_with_county_name_completion_杵島郡大町町() { + let tokenizer = Tokenizer { + input: "".to_string(), + prefecture_name: None, + city_name: None, + town_name: None, + rest: "大町町大字大町5017番地".to_string(), + _state: PhantomData::, + }; + let result = tokenizer.read_city_with_county_name_completion(&Prefecture::saga().cities); + assert!(result.is_ok()); + let tokenizer = result.unwrap(); + assert_eq!(tokenizer.city_name, Some("杵島郡大町町".to_string())); + assert_eq!(tokenizer.rest, "大字大町5017番地"); + } + + #[test] + fn read_city_with_county_name_completion_最上郡最上町() { + let tokenizer = Tokenizer { + input: "".to_string(), + prefecture_name: None, + city_name: None, + town_name: None, + rest: 
"最上町法田2672-2".to_string(), + _state: PhantomData::, + }; + let result = + tokenizer.read_city_with_county_name_completion(&Prefecture::yamagata().cities); + assert!(result.is_ok()); + let tokenizer = result.unwrap(); + assert_eq!(tokenizer.city_name, Some("最上郡最上町".to_string())); + assert_eq!(tokenizer.rest, "法田2672-2"); + } +}