-
-
Notifications
You must be signed in to change notification settings - Fork 77
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use ICU tokenizer to improve some Asian languages support #498
base: master
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,6 +49,16 @@ module.exports.tests.functional = function(test, common){ | |
assertAnalysis( 'place', 'Toys "R" Us!', [ 'toys', 'r', 'us' ]); | ||
assertAnalysis( 'address', '101 mapzen place', [ '101', 'mapzen', 'place' ]); | ||
|
||
// complicated tokenization for some Asian languages | ||
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] ); | ||
assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] ); | ||
assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]); | ||
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', | ||
['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The correct splitting is: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The full Chinese addresses are usually ...省 ...市 ...区 ...路 ...号 building_name ...座 ...楼 ...室 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BTW, if it’s by tokens then they make sense. I feel single characters like '东', '三', '环', '中路' may be too small for search, but it’s not wrong. |
||
assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The correct splitting is: |
||
assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]); | ||
assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]); | ||
|
||
suite.run( t.end ); | ||
}); | ||
}; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,16 @@ module.exports.tests.analyze = function(test, common){ | |
assertAnalysis( 'remove_ordinals', '1st 2nd 3rd 4th 5th', ['1','2','3','4','5'] ); | ||
assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] ); | ||
|
||
// complicated tokenization for some Asian languages | ||
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] ); | ||
assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] ); | ||
assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My Thai mate suggested
apart from different split, it's visible that first letter was changed from บ้ to บ in your case. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @dzviagin ! I will take a look, likely it is a result of removal of diacritic symbols, which existed even before this PR... |
||
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', | ||
['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']); | ||
assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]); | ||
assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]); | ||
assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]); | ||
|
||
suite.run( t.end ); | ||
}); | ||
}; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,16 +22,16 @@ function generate(){ | |
"analysis": { | ||
"tokenizer": { | ||
"peliasTokenizer": { | ||
"type": "pattern", | ||
"pattern": "[\\s,/\\\\-]+" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The previous pattern matched on There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point, I'll try to check if we have relevant tests and if we don't, will write them 👍 |
||
"type": "icu_tokenizer" | ||
} | ||
}, | ||
"analyzer": { | ||
"peliasAdmin": { | ||
"type": "custom", | ||
"tokenizer": "peliasTokenizer", | ||
"char_filter" : ["punctuation", "nfkc_normalizer"], | ||
"char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"], | ||
"filter": [ | ||
"ampersand_replacer", | ||
"lowercase", | ||
"trim", | ||
"synonyms/custom_admin/multiword", | ||
|
@@ -46,8 +46,9 @@ function generate(){ | |
"peliasIndexOneEdgeGram" : { | ||
"type": "custom", | ||
"tokenizer" : "peliasTokenizer", | ||
"char_filter" : ["punctuation", "nfkc_normalizer"], | ||
"char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"], | ||
"filter": [ | ||
"ampersand_replacer", | ||
"lowercase", | ||
"trim", | ||
"synonyms/custom_name/multiword", | ||
|
@@ -66,8 +67,9 @@ function generate(){ | |
"peliasQuery": { | ||
"type": "custom", | ||
"tokenizer": "peliasTokenizer", | ||
"char_filter": ["punctuation", "nfkc_normalizer"], | ||
"char_filter": ["ampersand_mapper", "punctuation", "nfkc_normalizer"], | ||
"filter": [ | ||
"ampersand_replacer", | ||
"lowercase", | ||
"trim", | ||
"icu_folding", | ||
|
@@ -80,8 +82,9 @@ function generate(){ | |
"peliasPhrase": { | ||
"type": "custom", | ||
"tokenizer":"peliasTokenizer", | ||
"char_filter" : ["punctuation", "nfkc_normalizer"], | ||
"char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"], | ||
"filter": [ | ||
"ampersand_replacer", | ||
"lowercase", | ||
"trim", | ||
"remove_duplicate_spaces", | ||
|
@@ -129,8 +132,9 @@ function generate(){ | |
"peliasStreet": { | ||
"type": "custom", | ||
"tokenizer":"peliasTokenizer", | ||
"char_filter" : ["punctuation", "nfkc_normalizer"], | ||
"char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"], | ||
"filter": [ | ||
"ampersand_replacer", | ||
"lowercase", | ||
"trim", | ||
"remove_duplicate_spaces", | ||
|
@@ -147,8 +151,9 @@ function generate(){ | |
"peliasIndexCountryAbbreviation": { | ||
"type": "custom", | ||
"tokenizer": "peliasTokenizer", | ||
"char_filter": ["punctuation", "nfkc_normalizer"], | ||
"char_filter": ["ampersand_mapper", "punctuation", "nfkc_normalizer"], | ||
"filter": [ | ||
"ampersand_replacer", | ||
"lowercase", | ||
"trim", | ||
"icu_folding", | ||
|
@@ -161,8 +166,9 @@ function generate(){ | |
"peliasIndexCountryAbbreviationOneEdgeGram": { | ||
"type": "custom", | ||
"tokenizer": "peliasTokenizer", | ||
"char_filter": ["punctuation", "nfkc_normalizer"], | ||
"char_filter": ["ampersand_mapper", "punctuation", "nfkc_normalizer"], | ||
"filter": [ | ||
"ampersand_replacer", | ||
"lowercase", | ||
"trim", | ||
"icu_folding", | ||
|
@@ -175,6 +181,12 @@ function generate(){ | |
}, | ||
}, | ||
"filter" : { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wish this |
||
// replaces ampersand placeholders back to `&` (see `ampersand_mapper` char_filter) | ||
"ampersand_replacer": { | ||
"type": "pattern_replace", | ||
"pattern": "AMPERSANDPLACEHOLDER", | ||
"replacement": "&" | ||
}, | ||
"street_synonyms_multiplexer": { | ||
"type": "multiplexer", | ||
"preserve_original": false, | ||
|
@@ -248,6 +260,13 @@ function generate(){ | |
// more generated below | ||
}, | ||
"char_filter": { | ||
// icu-tokenizer treats ampersands as a word boundary, so we replace them with a placeholder to avoid it, | ||
// as we want to handle them separately, we replace them back after tokenization (see `ampersand_replacer` filter) | ||
"ampersand_mapper": { | ||
"type": "pattern_replace", | ||
"pattern": "&", | ||
"replacement": " AMPERSANDPLACEHOLDER " | ||
}, | ||
"punctuation" : { | ||
"type" : "mapping", | ||
"mappings" : punctuation.blacklist.map(function(c){ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you explain a little more about the segmentation here?
Are these token groups from research/knowledge or just a copy->paste of what was generated?
The number 1 on the end catches my eye and I'm curious how it appeared from
ซอยเพชรบุรี๑
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, I used ChatGPT to generate them and then tools like Google Translate to validate, that these tokens indeed have separate meanings... Tbh it would be great to have someone native to one of those languages to validate 🤔 I will try to find one 😄
Thai language has its own digits (but they use Arabic ones as well btw) and
๑
is1
. Btw we convert them already even without this tokenizer. Not sure at which step though. But good question is if this number should be a separate token, but IIRC it is how it works right now(if you inputfoo1
it will generate singlefoo1
token), so I decided to limit scope of changes and may be will address it with separate PR. WDYT?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This makes sense, it's not an issue, normalizing to Arabic numerals is fine.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
and yeah, we should get a human to check a few examples, maybe best to generate a few more before asking someone in different classes such as address, venue(s), streets, intersections, localities etc.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this feature will go through a bunch of testing which might take a long time, flagging potential issues now will be much easier than finding them later, I'm fairly confident we can resolve any issues.