Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

release/v0.1.13をmainブランチにマージ #404

Merged
merged 10 commits into from
Sep 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ members = [
resolver = "2"

[workspace.package]
version = "0.1.12"
version = "0.1.13"
edition = "2021"
description = "A Rust Library to parse japanese addresses."
repository = "https://github.com/YuukiToriyama/japanese-address-parser"
Expand Down
2 changes: 0 additions & 2 deletions core/src/parser/adapter/orthographical_variant_adapter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ pub trait OrthographicalVariants {
const 恵: Variant;
const 穂: Variant;
const 梼: Variant;
const 葛: Variant;
const 蛍: Variant;
const 與: Variant;
const 瀧: Variant;
Expand Down Expand Up @@ -53,7 +52,6 @@ impl OrthographicalVariants for Variant {
const 恵: Variant = &["恵", "惠"];
const 穂: Variant = &["穂", "穗"];
const 梼: Variant = &["梼", "檮"];
const 葛: Variant = &["葛󠄀", "葛"];
const 蛍: Variant = &["蛍", "螢"];
const 與: Variant = &["與", "与"];
const 瀧: Variant = &["瀧", "滝"];
Expand Down
4 changes: 0 additions & 4 deletions core/src/tokenizer/read_city.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,10 @@ impl Tokenizer<PrefectureNameFound> {
}
"東京都" => {
variant_list.push(Variant::檜);
variant_list.push(Variant::葛);
}
"兵庫県" => {
variant_list.push(Variant::塚);
}
"奈良県" => {
variant_list.push(Variant::葛);
}
"高知県" => {
variant_list.push(Variant::梼);
}
Expand Down
17 changes: 14 additions & 3 deletions core/src/tokenizer/read_prefecture.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::marker::PhantomData;

use crate::tokenizer::{End, Init, PrefectureNameFound, Tokenizer};
use crate::util::extension::StrExt;

const PREFECTURE_NAME_LIST: [&str; 47] = [
"北海道",
Expand Down Expand Up @@ -59,21 +60,21 @@ impl Tokenizer<Init> {
prefecture_name: None,
city_name: None,
town_name: None,
rest: input.to_string(),
rest: input.strip_variation_selectors(),
_state: PhantomData,
}
}

pub(crate) fn read_prefecture(&self) -> Result<Tokenizer<PrefectureNameFound>, Tokenizer<End>> {
for prefecture_name in PREFECTURE_NAME_LIST {
if self.input.starts_with(prefecture_name) {
if self.rest.starts_with(prefecture_name) {
return Ok(Tokenizer {
input: self.input.clone(),
prefecture_name: Some(prefecture_name.to_string()),
city_name: None,
town_name: None,
rest: self
.input
.rest
.chars()
.skip(prefecture_name.chars().count())
.collect::<String>(),
Expand Down Expand Up @@ -106,6 +107,16 @@ mod tests {
assert_eq!(tokenizer.rest, "東京都港区芝公園4丁目2-8");
}

/// Checks that `Tokenizer::new` keeps the caller's text verbatim in `input`
/// while `rest` holds the same text with the variation selector removed.
#[test]
fn new_異字体セレクタ除去() {
    // U+E0100 is placed right after 葛 in the input.
    let tokenizer = Tokenizer::new("東京都葛\u{E0100}飾区立石5-13-1");
    // `input` still carries the selector.
    assert_eq!(tokenizer.input, "東京都葛\u{E0100}飾区立石5-13-1");
    // Nothing has been parsed at construction time.
    assert_eq!(tokenizer.prefecture_name, None);
    assert_eq!(tokenizer.city_name, None);
    assert_eq!(tokenizer.town_name, None);
    // `rest` is expected to be selector-free.
    assert_eq!(tokenizer.rest, "東京都葛飾区立石5-13-1");
}

#[test]
fn read_prefecture_成功() {
let tokenizer = Tokenizer::new("東京都港区芝公園4丁目2-8");
Expand Down
1 change: 1 addition & 0 deletions core/src/util.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub mod converter;
pub(crate) mod extension;
pub mod sequence_matcher;
mod trimmer;
62 changes: 62 additions & 0 deletions core/src/util/extension.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/// Character-level helper for recognising variation selectors.
pub(crate) trait CharExt {
    /// Returns `true` when the value is a variation selector.
    fn is_variation_selector(&self) -> bool;
}

impl CharExt for char {
    /// Reports whether this character is a variation selector.
    ///
    /// Two inclusive code point ranges are treated as selectors:
    /// `U+FE00..=U+FE0F` and `U+E0100..=U+E01EF`.
    fn is_variation_selector(&self) -> bool {
        let code_point = u32::from(*self);
        let in_first_range = (0xFE00..=0xFE0F).contains(&code_point);
        let in_second_range = (0xE0100..=0xE01EF).contains(&code_point);
        in_first_range || in_second_range
    }
}

pub(crate) trait StrExt {
fn strip_variation_selectors(&self) -> String;
}

impl StrExt for str {
/// 文字列から異字体セレクタを取り除きます
fn strip_variation_selectors(&self) -> String {
self.chars()
.filter(|c| !c.is_variation_selector())
.collect()
}
}

#[cfg(test)]
mod tests {
    use crate::util::extension::{CharExt, StrExt};

    /// Checks ordinary characters plus both edges of each selector range.
    #[test]
    fn is_variation_selector() {
        // Plain kana and kanji are not selectors.
        assert!(!'あ'.is_variation_selector());
        assert!(!'亜'.is_variation_selector());

        // Lower edge of U+FE00..=U+FE0F.
        assert!(!'\u{FDFF}'.is_variation_selector());
        assert!('\u{FE00}'.is_variation_selector());

        // Upper edge of U+FE00..=U+FE0F.
        assert!('\u{FE0F}'.is_variation_selector());
        assert!(!'\u{FE10}'.is_variation_selector());

        // Lower edge of U+E0100..=U+E01EF.
        assert!(!'\u{E00FF}'.is_variation_selector());
        assert!('\u{E0100}'.is_variation_selector());

        // Upper edge of U+E0100..=U+E01EF.
        assert!('\u{E01EF}'.is_variation_selector());
        assert!(!'\u{E01F0}'.is_variation_selector());
    }

    /// A selector (U+E0101) between the two characters of 逢坂 is removed.
    #[test]
    fn strip_variation_selectors_逢坂() {
        let normal = "\u{9022}\u{5742}"; // 逢坂
        let variant = "\u{9022}\u{E0101}\u{5742}"; // 逢 + U+E0101 + 坂
        assert_ne!(normal, variant);
        assert_eq!(normal, variant.strip_variation_selectors());
    }

    /// A selector (U+E0100) between the two characters of 茨城 is removed.
    #[test]
    fn strip_variation_selectors_茨城() {
        let normal = "\u{8328}\u{57CE}"; // 茨城
        let variant = "\u{8328}\u{E0100}\u{57CE}"; // 茨 + U+E0100 + 城
        assert_ne!(normal, variant);
        assert_eq!(normal, variant.strip_variation_selectors());
    }
}
5 changes: 5 additions & 0 deletions tests/integration_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,8 @@ async fn 郡が省略されている場合への対応テスト() {
async fn 郡名と町名が一致している場合() {
run_data_driven_tests("./test_data/郡名と町名が一致している場合.csv").await
}

/// Runs the data-driven cases stored in
/// `./test_data/異字体セレクタを含む場合への対応.csv`; the name translates to
/// "handling input that contains variation selectors".
#[tokio::test]
async fn 異字体セレクタを含む場合への対応() {
run_data_driven_tests("./test_data/異字体セレクタを含む場合への対応.csv").await
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,8 @@ address,prefecture,city,town,rest
# 茨城県
茨城県鹿嶋市大字平井1187-1,茨城県,鹿嶋市,大字平井,1187-1
茨城県鹿島市大字平井1187-1,茨城県,鹿嶋市,大字平井,1187-1
# 東京都
東京都葛飾区立石5-13-1,東京都,葛飾区,立石五丁目,13-1
東京都葛󠄀飾区立石5-13-1,東京都,葛飾区,立石五丁目,13-1
# 兵庫県
兵庫県宝塚市売布東の町8-19,兵庫県,宝塚市,売布東の町,8-19
兵庫県宝塚市売布東の町8-19,兵庫県,宝塚市,売布東の町,8-19
兵庫県宝塚市武庫川町1-1,兵庫県,宝塚市,武庫川町,1-1
兵庫県宝塚市武庫川町1-1,兵庫県,宝塚市,武庫川町,1-1
# 奈良県
奈良県葛󠄀城市柿本166番地,奈良県,葛城市,柿本,166番地
奈良県葛城市柿本166番地,奈良県,葛城市,柿本,166番地
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
address,prefecture,city,town,rest
東京都葛飾区立石5-13-1,東京都,葛飾区,立石五丁目,13-1
東京都葛󠄀飾区立石5-13-1,東京都,葛飾区,立石五丁目,13-1
奈良県葛城市柿本166番地,奈良県,葛城市,柿本,166番地
奈良県葛󠄀城市柿本166番地,奈良県,葛城市,柿本,166番地
鹿児島県薩摩川内市上甑町中甑250-1,鹿児島県,薩摩川内市,上甑町中甑,250-1
鹿児島県薩摩川内市上甑󠄀町中甑󠄀250-1,鹿児島県,薩摩川内市,上甑町中甑,250-1
Loading