From 37d6a9ec44ea3c2e4b5504924e7000aa928dad6d Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Fri, 11 Oct 2024 22:59:33 +0900 Subject: [PATCH 1/8] =?UTF-8?q?update:=20#459:=20PR=E4=BD=9C=E6=88=90?= =?UTF-8?q?=E6=99=82=E3=81=AB=E5=AE=9F=E8=A1=8C=E3=81=95=E3=82=8C=E3=82=8B?= =?UTF-8?q?=E3=83=AF=E3=83=BC=E3=82=AF=E3=83=95=E3=83=AD=E3=83=BC=E3=81=AB?= =?UTF-8?q?=E3=81=A4=E3=81=84=E3=81=A6=E5=B7=AE=E5=88=86=E3=81=8CMarkdown?= =?UTF-8?q?=E3=83=95=E3=82=A1=E3=82=A4=E3=83=AB=E3=81=AE=E3=81=BF=E3=81=AE?= =?UTF-8?q?=E5=A0=B4=E5=90=88=E3=81=AF=E5=AE=9F=E8=A1=8C=E3=81=95=E3=82=8C?= =?UTF-8?q?=E3=81=AA=E3=81=84=E3=82=88=E3=81=86=E3=81=AB=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/code-quality-check.yaml | 1 + .github/workflows/python-build-check.yaml | 1 + .github/workflows/run-test.yaml | 1 + 3 files changed, 3 insertions(+) diff --git a/.github/workflows/code-quality-check.yaml b/.github/workflows/code-quality-check.yaml index a1be635e..36fea575 100644 --- a/.github/workflows/code-quality-check.yaml +++ b/.github/workflows/code-quality-check.yaml @@ -5,6 +5,7 @@ on: paths: - '**.rs' - '**/Cargo.toml' + - '!*.md' jobs: build: diff --git a/.github/workflows/python-build-check.yaml b/.github/workflows/python-build-check.yaml index 530e1801..2daf9d81 100644 --- a/.github/workflows/python-build-check.yaml +++ b/.github/workflows/python-build-check.yaml @@ -4,6 +4,7 @@ on: pull_request: push: branches: [ 'main' ] + paths-ignore: [ '*.md' ] permissions: contents: read diff --git a/.github/workflows/run-test.yaml b/.github/workflows/run-test.yaml index 833272fb..9ccbdec7 100644 --- a/.github/workflows/run-test.yaml +++ b/.github/workflows/run-test.yaml @@ -4,6 +4,7 @@ on: pull_request: push: branches: [ 'main' ] + paths-ignore: [ '*.md' ] env: CARGO_TERM_COLOR: always From 6c0701d0519b450ea38133cecbd937054a1cd03a Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Fri, 11 Oct 2024 23:36:16 
+0900 Subject: [PATCH 2/8] =?UTF-8?q?update:=20#459:=20=E6=96=87=E6=B3=95?= =?UTF-8?q?=E7=9A=84=E3=81=AA=E8=AA=A4=E3=82=8A=E3=82=92=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `README.md` --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 412cc8e1..d9e8bebc 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ![Rust Version](https://img.shields.io/badge/rust%20version-%3E%3D1.73.0-orange) [![Unit test & Integration test](https://github.com/YuukiToriyama/japanese-address-parser/actions/workflows/run-test.yaml/badge.svg?branch=main)](https://github.com/YuukiToriyama/japanese-address-parser/actions/workflows/run-test.yaml) -A Rust Library to parse japanese addresses. +A Rust library for parsing Japanese addresses. ## Usage @@ -47,7 +47,7 @@ fn main() { [![npmjs](https://img.shields.io/npm/v/%40toriyama/japanese-address-parser)](https://www.npmjs.com/package/@toriyama/japanese-address-parser) This crate is designed to be buildable for `wasm32-unknown-unknown` with `wasm-pack`. -Pre-compiled wasm module is available npmjs.com +Pre-compiled wasm module is available on npmjs.com ```bash npm install @toriyama/japanese-address-parser @@ -68,28 +68,28 @@ init().then(() => { [![PyPI - Version](https://img.shields.io/pypi/v/japanese-address-parser-py)](https://pypi.org/project/japanese-address-parser-py/) -This library can be called from Python world. For more detail, see [python module's README](python/README.md). +This library can be called from the Python world. For more details, see [python module's README](python/README.md). ## Road to v1 -The goals that this library aims to achieve are below. +The goals of this library are as follows. -- Supports not only wasm target but also various platforms and architectures. +- Supports not only wasm but also multiple platforms and architectures. - Enables more advanced normalization. 
For example, provides more detailed analysis than town level. -- Provides latlng of the given address. -- Enables processing of town names that have ceased to exist as a result of municipal mergers. +- Returns the location of the given address. +- Enables processing of town names that no longer exist due to municipal mergers. ## Support This software is maintained by [YuukiToriyama](https://github.com/yuukitoriyama). -If you have questions, please create an issue. +If you have any questions, please create a new issue. ## Acknowledgements -This software was developed inspired +This software was inspired by [@geolonia/normalize-japanese-addresses](https://github.com/geolonia/normalize-japanese-addresses). -Also, the parsing process uses [Geolonia 住所データ](https://github.com/geolonia/japanese-addresses) provided -by [株式会社Geolonia](https://www.geolonia.com/company/). +In addition, the parsing process uses [Geolonia 住所データ](https://github.com/geolonia/japanese-addresses) which is +provided by [株式会社Geolonia](https://www.geolonia.com/company/). ## License From a399dde5f1a2cc685afe0c0fe75c658fefae5603 Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Fri, 11 Oct 2024 23:36:48 +0900 Subject: [PATCH 3/8] =?UTF-8?q?update:=20#459:=20`Usage`=E3=81=AE=E7=AF=80?= =?UTF-8?q?=E3=82=92=E6=9B=B8=E3=81=8D=E7=9B=B4=E3=81=97=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d9e8bebc..9a28bdf1 100644 --- a/README.md +++ b/README.md @@ -9,12 +9,11 @@ A Rust library for parsing Japanese addresses. ## Usage -Add this to your `Cargo.toml` +Add the following to your `Cargo.toml`. 
-```bash -cargo add japanese-address-parser -# or -cargo add japanese-address-parser -F blocking +```toml +[dependencies] +japanese-address-parser = "0.1" ``` ### Async Version From 31bb51eb56216e8dd2c1a489599666f2b6dcea56 Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Fri, 11 Oct 2024 23:41:55 +0900 Subject: [PATCH 4/8] =?UTF-8?q?update:=20#459:=20=E6=96=87=E6=B3=95?= =?UTF-8?q?=E7=9A=84=E3=81=AA=E8=AA=A4=E3=82=8A=E3=82=92=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `python/README.md` --- python/README.md | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/python/README.md b/python/README.md index 6ed3cc1d..e05e7499 100644 --- a/python/README.md +++ b/python/README.md @@ -1,15 +1,18 @@ # japanese-address-parser-py -A python toolkit for processing japanese addresses + +A Python toolkit for processing Japanese addresses [![PyPI - Version](https://img.shields.io/pypi/v/japanese-address-parser-py)](https://pypi.org/project/japanese-address-parser-py/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/japanese-address-parser-py)](https://pypi.org/project/japanese-address-parser-py/#history) [![Unit test & Integration test](https://github.com/YuukiToriyama/japanese-address-parser/actions/workflows/run-test.yaml/badge.svg?branch=main)](https://github.com/YuukiToriyama/japanese-address-parser/actions/workflows/run-test.yaml) ## What is it? -**japanese-address-parser-py** is a Python package for parsing japanese addresses. -Any address can be processed into structured data. + +**japanese-address-parser-py** is a Python package for parsing Japanese addresses. +Any address can be parsed into structured data. 
## Installation from PyPI + ```bash pip install japanese-address-parser-py ``` @@ -38,7 +41,6 @@ for address in address_list: {'town': '日本大通', 'city': '横浜市中区', 'prefecture': '神奈川県', 'rest': '1'} ``` - ```python from japanese_address_parser_py import Parser @@ -59,8 +61,9 @@ print(parse_result.address["rest"]) ``` ## Development -This library is written in Rust language. You need to set up a Rust development environment to build this library. -Also, you need to install `maturin` because this library uses it in order to generate Python bindings. + +This library is written in Rust. You need to set up a Rust development environment to build this library. +Also, you need to install `maturin` as this library uses it in order to generate Python bindings. ```bash # Install maturin @@ -78,18 +81,19 @@ pip3 install dist/japanese_address_parser_py-[version]-cp37-abi3-[arch].whl ## Support This software is maintained by [YuukiToriyama](https://github.com/yuukitoriyama). -If you have questions, please create an issue. +If you have any questions, please create a new issue. ## Where to get source code + The source code is hosted on GitHub at: https://github.com/YuukiToriyama/japanese-address-parser ## Acknowledgements -This software was developed inspired +This software was inspired by [@geolonia/normalize-japanese-addresses](https://github.com/geolonia/normalize-japanese-addresses). -Also, the parsing process uses [Geolonia 住所データ](https://github.com/geolonia/japanese-addresses) provided -by [株式会社Geolonia](https://www.geolonia.com/company/). +In addition, the parsing process uses [Geolonia 住所データ](https://github.com/geolonia/japanese-addresses) which is +provided by [株式会社Geolonia](https://www.geolonia.com/company/). 
## License From 83b2aaaac878c20c3cce9d56d239517e7b9f676d Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Fri, 11 Oct 2024 23:47:31 +0900 Subject: [PATCH 5/8] =?UTF-8?q?fix:=20#459:=20Markdown=E3=81=97=E3=81=8B?= =?UTF-8?q?=E5=B7=AE=E5=88=86=E3=81=8C=E3=81=AA=E3=81=84=E5=A0=B4=E5=90=88?= =?UTF-8?q?=E3=83=AF=E3=83=BC=E3=82=AF=E3=83=95=E3=83=AD=E3=83=BC=E3=81=AE?= =?UTF-8?q?=E5=AE=9F=E8=A1=8C=E3=82=92=E3=82=B9=E3=82=AD=E3=83=83=E3=83=97?= =?UTF-8?q?=E3=81=99=E3=82=8B=E8=A8=AD=E5=AE=9A=E3=81=8C=E8=AA=A4=E3=81=A3?= =?UTF-8?q?=E3=81=A6=E3=81=84=E3=81=9F=E3=81=AE=E3=81=A7=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/python-build-check.yaml | 2 +- .github/workflows/run-test.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-build-check.yaml b/.github/workflows/python-build-check.yaml index 2daf9d81..df44e5fa 100644 --- a/.github/workflows/python-build-check.yaml +++ b/.github/workflows/python-build-check.yaml @@ -2,9 +2,9 @@ name: Python module build check on: pull_request: + paths-ignore: [ '**.md' ] push: branches: [ 'main' ] - paths-ignore: [ '*.md' ] permissions: contents: read diff --git a/.github/workflows/run-test.yaml b/.github/workflows/run-test.yaml index 9ccbdec7..ebaa277f 100644 --- a/.github/workflows/run-test.yaml +++ b/.github/workflows/run-test.yaml @@ -2,9 +2,9 @@ name: Unit test & Integration test on: pull_request: + paths-ignore: [ '**.md' ] push: branches: [ 'main' ] - paths-ignore: [ '*.md' ] env: CARGO_TERM_COLOR: always From 8386356c644c25c3948b6bf13d4dece112c9a5ea Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Sat, 19 Oct 2024 23:38:57 +0900 Subject: [PATCH 6/8] =?UTF-8?q?update:=20jisx0401=E3=81=AE=E5=B0=8E?= =?UTF-8?q?=E5=85=A5:=20`read=5Fprefecture()`=E3=82=92jisx0401=E3=83=99?= =?UTF-8?q?=E3=83=BC=E3=82=B9=E3=81=AE=E5=AE=9F=E8=A3=85=E3=81=AB=E5=A4=89?= =?UTF-8?q?=E6=9B=B4?= MIME-Version: 1.0 Content-Type:
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/Cargo.toml | 1 + core/src/tokenizer/read_prefecture.rs | 65 +++++---------------------- 2 files changed, 11 insertions(+), 55 deletions(-) diff --git a/core/Cargo.toml b/core/Cargo.toml index 609cec8c..d06c2412 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -33,6 +33,7 @@ regex = { version = "1.10.6", default-features = false, features = ["std", "unic serde.workspace = true reqwest = { version = "0.12.5", default-features = false, features = ["json", "rustls-tls"] } js-sys = "0.3.67" +jisx0401 = "0.1.0-beta.3" [dev-dependencies] criterion = { version = "0.5.1", default-features = false, features = ["html_reports"] } diff --git a/core/src/tokenizer/read_prefecture.rs b/core/src/tokenizer/read_prefecture.rs index acc0c78b..a843e524 100644 --- a/core/src/tokenizer/read_prefecture.rs +++ b/core/src/tokenizer/read_prefecture.rs @@ -3,56 +3,6 @@ use crate::tokenizer::{End, Init, PrefectureNameFound, Tokenizer}; use crate::util::extension::StrExt; use std::marker::PhantomData; -const PREFECTURE_NAME_LIST: [&str; 47] = [ - "北海道", - "青森県", - "岩手県", - "宮城県", - "秋田県", - "山形県", - "福島県", - "茨城県", - "栃木県", - "群馬県", - "埼玉県", - "千葉県", - "東京都", - "神奈川県", - "新潟県", - "富山県", - "石川県", - "福井県", - "山梨県", - "長野県", - "岐阜県", - "静岡県", - "愛知県", - "三重県", - "滋賀県", - "京都府", - "大阪府", - "兵庫県", - "奈良県", - "和歌山県", - "鳥取県", - "島根県", - "岡山県", - "広島県", - "山口県", - "徳島県", - "香川県", - "愛媛県", - "高知県", - "福岡県", - "佐賀県", - "長崎県", - "熊本県", - "大分県", - "宮崎県", - "鹿児島県", - "沖縄県", -]; - impl Tokenizer { pub(crate) fn new(input: &str) -> Self { Self { @@ -69,9 +19,10 @@ impl Tokenizer { pub(crate) fn read_prefecture( &self, ) -> Result<(String, Tokenizer), Tokenizer> { - for prefecture_name in PREFECTURE_NAME_LIST { - if self.rest.starts_with(prefecture_name) { - return Ok(( + match find_prefecture(&self.rest) { + Some(prefecture) => { + let prefecture_name = prefecture.name_ja(); + Ok(( prefecture_name.to_string(), Tokenizer { tokens: 
vec![Token::Prefecture(Prefecture { @@ -85,13 +36,17 @@ impl Tokenizer { .collect::(), _state: PhantomData::, }, - )); + )) } + None => Err(self.finish()), } - Err(self.finish()) } } +fn find_prefecture(input: &str) -> Option<&jisx0401::Prefecture> { + jisx0401::Prefecture::values().find(|&prefecture| input.starts_with(prefecture.name_ja())) +} + #[cfg(test)] mod tests { use crate::domain::common::token::Token; From 7860fd586090b5bf92a755891d9cb3326e1d8a41 Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Sun, 20 Oct 2024 00:01:59 +0900 Subject: [PATCH 7/8] =?UTF-8?q?update:=20jisx0401=E3=81=AE=E5=B0=8E?= =?UTF-8?q?=E5=85=A5:=20`read=5Fprefecture()`=E3=81=AE=E8=BF=94=E3=82=8A?= =?UTF-8?q?=E5=80=A4=E3=82=92=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 都道府県名のStringを返す作りになっていたが、`jisx0401::Prefecture`を返す作りにした --- core/src/parser.rs | 20 ++++++++++---------- core/src/tokenizer/read_prefecture.rs | 9 +++++---- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/core/src/parser.rs b/core/src/parser.rs index 15e35db8..d9229091 100644 --- a/core/src/parser.rs +++ b/core/src/parser.rs @@ -81,7 +81,7 @@ impl Parser { pub async fn parse(api: Arc, input: &str) -> ParseResult { let tokenizer = Tokenizer::new(input); // 都道府県を特定 - let (prefecture_name, tokenizer) = match tokenizer.read_prefecture() { + let (prefecture, tokenizer) = match tokenizer.read_prefecture() { Ok(found) => found, Err(tokenizer) => { return ParseResult { @@ -91,7 +91,7 @@ pub async fn parse(api: Arc, input: &str) -> ParseResult { } }; // その都道府県の市町村名リストを取得 - let prefecture = match api.get_prefecture_master(&prefecture_name).await { + let prefecture_master = match api.get_prefecture_master(prefecture.name_ja()).await { Err(error) => { return ParseResult { address: Address::from(tokenizer.finish()), @@ -101,11 +101,11 @@ pub async fn parse(api: Arc, input: &str) -> ParseResult { Ok(result) => result, }; // 市町村名を特定 - let (city_name, 
tokenizer) = match tokenizer.read_city(&prefecture.cities) { + let (city_name, tokenizer) = match tokenizer.read_city(&prefecture_master.cities) { Ok(found) => found, Err(not_found) => { // 市区町村が特定できない場合かつフィーチャフラグが有効な場合、郡名が抜けている可能性を検討 - match not_found.read_city_with_county_name_completion(&prefecture.cities) { + match not_found.read_city_with_county_name_completion(&prefecture_master.cities) { Ok(found) if cfg!(feature = "city-name-correction") => found, _ => { // それでも見つからない場合は終了 @@ -118,7 +118,7 @@ pub async fn parse(api: Arc, input: &str) -> ParseResult { } }; // その市町村の町名リストを取得 - let city = match api.get_city_master(&prefecture_name, &city_name).await { + let city = match api.get_city_master(prefecture.name_ja(), &city_name).await { Err(error) => { return ParseResult { address: Address::from(tokenizer.finish()), @@ -247,7 +247,7 @@ mod tests { #[cfg(feature = "blocking")] pub fn parse_blocking(api: Arc, input: &str) -> ParseResult { let tokenizer = Tokenizer::new(input); - let (prefecture_name, tokenizer) = match tokenizer.read_prefecture() { + let (prefecture, tokenizer) = match tokenizer.read_prefecture() { Ok(found) => found, Err(tokenizer) => { return ParseResult { @@ -256,7 +256,7 @@ pub fn parse_blocking(api: Arc, input: &str) -> ParseResult { } } }; - let prefecture = match api.get_prefecture_master(&prefecture_name) { + let prefecture_master = match api.get_prefecture_master(prefecture.name_ja()) { Err(error) => { return ParseResult { address: Address::from(tokenizer.finish()), @@ -265,10 +265,10 @@ pub fn parse_blocking(api: Arc, input: &str) -> ParseResult { } Ok(result) => result, }; - let (city_name, tokenizer) = match tokenizer.read_city(&prefecture.cities) { + let (city_name, tokenizer) = match tokenizer.read_city(&prefecture_master.cities) { Ok(found) => found, Err(not_found) => { - match not_found.read_city_with_county_name_completion(&prefecture.cities) { + match not_found.read_city_with_county_name_completion(&prefecture_master.cities) { 
Ok(found) if cfg!(feature = "city-name-correction") => found, _ => { return ParseResult { @@ -279,7 +279,7 @@ pub fn parse_blocking(api: Arc, input: &str) -> ParseResult { } } }; - let city = match api.get_city_master(&prefecture_name, &city_name) { + let city = match api.get_city_master(prefecture.name_ja(), &city_name) { Err(error) => { return ParseResult { address: Address::from(tokenizer.finish()), diff --git a/core/src/tokenizer/read_prefecture.rs b/core/src/tokenizer/read_prefecture.rs index a843e524..65fabfdf 100644 --- a/core/src/tokenizer/read_prefecture.rs +++ b/core/src/tokenizer/read_prefecture.rs @@ -18,12 +18,12 @@ impl Tokenizer { pub(crate) fn read_prefecture( &self, - ) -> Result<(String, Tokenizer), Tokenizer> { + ) -> Result<(jisx0401::Prefecture, Tokenizer), Tokenizer> { match find_prefecture(&self.rest) { Some(prefecture) => { let prefecture_name = prefecture.name_ja(); Ok(( - prefecture_name.to_string(), + prefecture.clone(), Tokenizer { tokens: vec![Token::Prefecture(Prefecture { prefecture_name: prefecture_name.to_string(), @@ -51,6 +51,7 @@ fn find_prefecture(input: &str) -> Option<&jisx0401::Prefecture> { mod tests { use crate::domain::common::token::Token; use crate::tokenizer::Tokenizer; + use jisx0401::Prefecture; #[test] fn new() { @@ -79,8 +80,8 @@ mod tests { let tokenizer = Tokenizer::new("東京都港区芝公園4丁目2-8"); let result = tokenizer.read_prefecture(); assert!(result.is_ok()); - let (prefecture_name, tokenizer) = result.unwrap(); - assert_eq!(prefecture_name, "東京都"); + let (prefecture, tokenizer) = result.unwrap(); + assert_eq!(prefecture, Prefecture::TOKYO); assert_eq!(tokenizer.tokens.len(), 1); assert_eq!(tokenizer.rest, "港区芝公園4丁目2-8"); } From 074f77b8ab39b4f14f95f85d2369ef43e8593c2c Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Sun, 20 Oct 2024 00:05:44 +0900 Subject: [PATCH 8/8] update-version: 0.1.18 -> 0.1.19 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 
4cd8b1fe..4a467768 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.1.18" +version = "0.1.19" edition = "2021" description = "A Rust Library to parse japanese addresses." repository = "https://github.com/YuukiToriyama/japanese-address-parser"