From 8c2712b04d42732d2e46012c13f747734a451635 Mon Sep 17 00:00:00 2001 From: Siarhei Fedartsou Date: Tue, 26 Nov 2024 18:55:18 +0100 Subject: [PATCH 1/3] Use ICU tokenizer to improve some Asian languages support --- .../analyzer_peliasIndexOneEdgeGram.js | 5 +++ integration/analyzer_peliasQuery.js | 10 +++++ integration/analyzer_peliasStreet.js | 11 ++++++ settings.js | 37 ++++++++++++++----- test/fixtures/expected.json | 27 +++++++++++++- test/settings.js | 21 +++++++---- 6 files changed, 93 insertions(+), 18 deletions(-) diff --git a/integration/analyzer_peliasIndexOneEdgeGram.js b/integration/analyzer_peliasIndexOneEdgeGram.js index 1db32362..7cd418ae 100644 --- a/integration/analyzer_peliasIndexOneEdgeGram.js +++ b/integration/analyzer_peliasIndexOneEdgeGram.js @@ -85,6 +85,11 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] ); assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] ); + + assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', [ + '0:ซ', '0:ซอ', '0:ซอย', + '1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ', + '2:f', '2:fo', '2:foo'] ); suite.run( t.end ); }); diff --git a/integration/analyzer_peliasQuery.js b/integration/analyzer_peliasQuery.js index f0cf199e..3c940ebc 100644 --- a/integration/analyzer_peliasQuery.js +++ b/integration/analyzer_peliasQuery.js @@ -49,6 +49,16 @@ module.exports.tests.functional = function(test, common){ assertAnalysis( 'place', 'Toys "R" Us!', [ 'toys', 'r', 'us' ]); assertAnalysis( 'address', '101 mapzen place', [ '101', 'mapzen', 'place' ]); + // complicated tokenization for some Asian languages + assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] ); + assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] ); + assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]); + assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', + ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']); + assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]); + assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]); + assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]); + suite.run( t.end ); }); }; diff --git a/integration/analyzer_peliasStreet.js b/integration/analyzer_peliasStreet.js index 2fa0e494..a8f542a9 100644 --- a/integration/analyzer_peliasStreet.js +++ b/integration/analyzer_peliasStreet.js @@ -1,4 +1,5 @@ // validate analyzer is behaving as expected +const { assert } = require('@hapi/joi'); const Suite = require('../test/elastictest/Suite') module.exports.tests = {}; @@ -22,6 +23,16 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'remove_ordinals', '1st 2nd 3rd 4th 5th', ['1','2','3','4','5'] ); assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] ); + // complicated tokenization for some Asian languages + assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] ); + assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] ); + assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]); + assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', + ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']); + assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]); + assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]); + assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]); + suite.run( t.end ); }); }; diff --git a/settings.js b/settings.js index f2dd633a..c8f3437b 100644 --- a/settings.js +++ b/settings.js @@ -22,16 +22,16 @@ function generate(){ "analysis": { "tokenizer": { "peliasTokenizer": { - "type": "pattern", - "pattern": "[\\s,/\\\\-]+" + "type": "icu_tokenizer" } }, "analyzer": { "peliasAdmin": { "type": "custom", "tokenizer": "peliasTokenizer", - "char_filter" : ["punctuation", "nfkc_normalizer"], + "char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"], "filter": [ + "ampersand_replacer", "lowercase", "trim", "synonyms/custom_admin/multiword", @@ -46,8 +46,9 @@ function generate(){ "peliasIndexOneEdgeGram" : { "type": "custom", "tokenizer" : "peliasTokenizer", - "char_filter" : ["punctuation", "nfkc_normalizer"], + "char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"], "filter": [ + "ampersand_replacer", "lowercase", "trim", "synonyms/custom_name/multiword", @@ -66,8 +67,9 @@ function generate(){ "peliasQuery": { "type": "custom", "tokenizer": "peliasTokenizer", - "char_filter": ["punctuation", "nfkc_normalizer"], + "char_filter": ["ampersand_mapper", "punctuation", "nfkc_normalizer"], "filter": [ + "ampersand_replacer", "lowercase", "trim", "icu_folding", @@ -80,8 +82,9 @@ function generate(){ "peliasPhrase": { "type": "custom", "tokenizer":"peliasTokenizer", - "char_filter" : ["punctuation", "nfkc_normalizer"], + "char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"], "filter": [ + "ampersand_replacer", "lowercase", "trim", "remove_duplicate_spaces", @@ -129,8 +132,9 @@ function generate(){ "peliasStreet": { "type": "custom", "tokenizer":"peliasTokenizer", - "char_filter" : ["punctuation", "nfkc_normalizer"], + "char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"], "filter": [ + "ampersand_replacer", "lowercase", "trim", "remove_duplicate_spaces", @@ -147,8 +151,9 @@ function generate(){ "peliasIndexCountryAbbreviation": { "type": "custom", "tokenizer": "peliasTokenizer", - "char_filter": ["punctuation", "nfkc_normalizer"], + "char_filter": ["ampersand_mapper", "punctuation", "nfkc_normalizer"], "filter": [ + "ampersand_replacer", "lowercase", "trim", "icu_folding", @@ -161,8 +166,9 @@ function generate(){ "peliasIndexCountryAbbreviationOneEdgeGram": { "type": "custom", "tokenizer": "peliasTokenizer", - "char_filter": ["punctuation", "nfkc_normalizer"], + "char_filter": ["ampersand_mapper", "punctuation", "nfkc_normalizer"], "filter": [ + "ampersand_replacer", "lowercase", "trim", "icu_folding", @@ -175,6 +181,12 @@ function generate(){ }, }, "filter" : { + // replaces ampersand placeholders back to `&` (see `ampersand_mapper` char_filter) + "ampersand_replacer": { + "type": "pattern_replace", + "pattern": "AMPERSANDPLACEHOLDER", + "replacement": "&" + }, "street_synonyms_multiplexer": { "type": "multiplexer", "preserve_original": false, @@ -248,6 +260,13 @@ function generate(){ // more generated below }, "char_filter": { + // icu-tokenizer treats ampersands as a word boundary, so we replace them with a placeholder to avoid it, + // as we want to handle them separately, we replace them back after tokenization (see `ampersand_replacer` filter) + "ampersand_mapper": { + "type": "pattern_replace", + "pattern": "&", + "replacement": " AMPERSANDPLACEHOLDER " + }, "punctuation" : { "type" : "mapping", "mappings" : punctuation.blacklist.map(function(c){ diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index 8bddef1e..e7439549 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -15,8 +15,7 @@ "analysis": { "tokenizer": { "peliasTokenizer": { - "type": "pattern", - "pattern": "[\\s,/\\\\-]+" + "type": "icu_tokenizer" } }, "analyzer": { @@ -24,10 +23,12 @@ "type": "custom", "tokenizer": "peliasTokenizer", "char_filter": [ + "ampersand_mapper", "punctuation", "nfkc_normalizer" ], "filter": [ + "ampersand_replacer", "lowercase", "trim", "synonyms/custom_admin/multiword", @@ -43,10 +44,12 @@ "type": "custom", "tokenizer": "peliasTokenizer", "char_filter": [ + "ampersand_mapper", "punctuation", "nfkc_normalizer" ], "filter": [ + "ampersand_replacer", "lowercase", "trim", "synonyms/custom_name/multiword", @@ -66,10 +69,12 @@ "type": "custom", "tokenizer": "peliasTokenizer", "char_filter": [ + "ampersand_mapper", "punctuation", "nfkc_normalizer" ], "filter": [ + "ampersand_replacer", "lowercase", "trim", "icu_folding", @@ -83,10 +88,12 @@ "type": "custom", "tokenizer": "peliasTokenizer", "char_filter": [ + "ampersand_mapper", "punctuation", "nfkc_normalizer" ], "filter": [ + "ampersand_replacer", "lowercase", "trim", "remove_duplicate_spaces", @@ -143,10 +150,12 @@ "type": "custom", "tokenizer": "peliasTokenizer", "char_filter": [ + "ampersand_mapper", "punctuation", "nfkc_normalizer" ], "filter": [ + "ampersand_replacer", "lowercase", "trim", "remove_duplicate_spaces", @@ -164,10 +173,12 @@ "type": "custom", "tokenizer": "peliasTokenizer", "char_filter": [ + "ampersand_mapper", "punctuation", "nfkc_normalizer" ], "filter": [ + "ampersand_replacer", "lowercase", "trim", "icu_folding", @@ -181,10 +192,12 @@ "type": "custom", "tokenizer": "peliasTokenizer", "char_filter": [ + "ampersand_mapper", "punctuation", "nfkc_normalizer" ], "filter": [ + "ampersand_replacer", "lowercase", "trim", "icu_folding", @@ -197,6 +210,11 @@ } }, "filter": { + "ampersand_replacer": { + "type": "pattern_replace", + "pattern": "AMPERSANDPLACEHOLDER", + "replacement": "&" + }, "street_synonyms_multiplexer": { "type": "multiplexer", "preserve_original": false, @@ -2271,6 +2289,11 @@ } }, "char_filter": { + "ampersand_mapper": { + "type": "pattern_replace", + "pattern": "&", + "replacement": " AMPERSANDPLACEHOLDER " + }, "punctuation": { "type": "mapping", "mappings": [ diff --git a/test/settings.js b/test/settings.js index 78c6b2ba..70fac7f3 100644 --- a/test/settings.js +++ b/test/settings.js @@ -57,13 +57,14 @@ module.exports.tests.peliasAdminAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasAdmin; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ['punctuation', 'nfkc_normalizer'], 'character filters specified'); + t.deepEqual(analyzer.char_filter, ['ampersand_mapper', 'punctuation', 'nfkc_normalizer'], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasAdmin token filters', function (t) { var analyzer = settings().analysis.analyzer.peliasAdmin; t.deepEqual(analyzer.filter, [ + "ampersand_replacer", "lowercase", "trim", "synonyms/custom_admin/multiword", @@ -85,13 +86,14 @@ module.exports.tests.peliasIndexOneEdgeGramAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasIndexOneEdgeGram; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ["punctuation","nfkc_normalizer"], 'character filters specified'); + t.deepEqual(analyzer.char_filter, ["ampersand_mapper", "punctuation","nfkc_normalizer"], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasIndexOneEdgeGram token filters', function(t) { var analyzer = settings().analysis.analyzer.peliasIndexOneEdgeGram; t.deepEqual( analyzer.filter, [ + "ampersand_replacer", "lowercase", "trim", "synonyms/custom_name/multiword", @@ -117,13 +119,14 @@ module.exports.tests.peliasQueryAnalyzer = function (test, common) { var analyzer = s.analysis.analyzer.peliasQuery; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ['punctuation', 'nfkc_normalizer'], 'character filters specified'); + t.deepEqual(analyzer.char_filter, ['ampersand_mapper', 'punctuation', 'nfkc_normalizer'], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasQuery token filters', function (t) { var analyzer = settings().analysis.analyzer.peliasQuery; t.deepEqual(analyzer.filter, [ + 'ampersand_replacer', 'lowercase', 'trim', 'icu_folding', @@ -143,13 +146,14 @@ module.exports.tests.peliasPhraseAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasPhrase; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ["punctuation","nfkc_normalizer"], 'character filters specified'); + t.deepEqual(analyzer.char_filter, ["ampersand_mapper", "punctuation", "nfkc_normalizer"], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasPhrase token filters', function(t) { var analyzer = settings().analysis.analyzer.peliasPhrase; t.deepEqual( analyzer.filter, [ + "ampersand_replacer", "lowercase", "trim", "remove_duplicate_spaces", @@ -236,13 +240,14 @@ module.exports.tests.peliasStreetAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasStreet; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ['punctuation', 'nfkc_normalizer'], 'character filters specified'); + t.deepEqual(analyzer.char_filter, ['ampersand_mapper', 'punctuation', 'nfkc_normalizer'], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasStreet token filters', function(t) { var analyzer = settings().analysis.analyzer.peliasStreet; t.deepEqual( analyzer.filter, [ + "ampersand_replacer", "lowercase", "trim", "remove_duplicate_spaces", @@ -266,13 +271,14 @@ module.exports.tests.peliasIndexCountryAbbreviation = function (test, common) { var analyzer = s.analysis.analyzer.peliasIndexCountryAbbreviation; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ['punctuation', 'nfkc_normalizer'], 'character filters specified'); + t.deepEqual(analyzer.char_filter, ['ampersand_mapper', 'punctuation', 'nfkc_normalizer'], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasIndexCountryAbbreviation token filters', function (t) { var analyzer = settings().analysis.analyzer.peliasIndexCountryAbbreviation; t.deepEqual(analyzer.filter, [ + "ampersand_replacer", "lowercase", "trim", "icu_folding", @@ -292,13 +298,14 @@ module.exports.tests.peliasIndexCountryAbbreviationOneEdgeGramAnalyzer = functio var analyzer = s.analysis.analyzer.peliasIndexCountryAbbreviationOneEdgeGram; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ["punctuation", "nfkc_normalizer"], 'character filters specified'); + t.deepEqual(analyzer.char_filter, ["ampersand_mapper", "punctuation", "nfkc_normalizer"], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasIndexCountryAbbreviationOneEdgeGram token filters', function (t) { var analyzer = settings().analysis.analyzer.peliasIndexCountryAbbreviationOneEdgeGram; t.deepEqual(analyzer.filter, [ + "ampersand_replacer", "lowercase", "trim", "icu_folding", From 89d7ce85ffc0f34b7eea4b6e8844f051e53d3798 Mon Sep 17 00:00:00 2001 From: Siarhei Fedartsou Date: Wed, 27 Nov 2024 18:08:09 +0100 Subject: [PATCH 2/3] Remove unused import --- integration/analyzer_peliasStreet.js | 1 - 1 file changed, 1 deletion(-) diff --git a/integration/analyzer_peliasStreet.js b/integration/analyzer_peliasStreet.js index a8f542a9..a901f391 100644 --- a/integration/analyzer_peliasStreet.js +++ b/integration/analyzer_peliasStreet.js @@ -1,5 +1,4 @@ // validate analyzer is behaving as expected -const { assert } = require('@hapi/joi'); const Suite = require('../test/elastictest/Suite') module.exports.tests = {}; From 50b080d95defa5e8ea80b2134ce23fe416ed6baf Mon Sep 17 00:00:00 2001 From: Siarhei Fedartsou Date: Fri, 29 Nov 2024 20:48:00 +0100 Subject: [PATCH 3/3] Add more chinese test cases --- integration/analyzer_peliasQuery.js | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/integration/analyzer_peliasQuery.js b/integration/analyzer_peliasQuery.js index 3c940ebc..9f07c7e1 100644 --- a/integration/analyzer_peliasQuery.js +++ b/integration/analyzer_peliasQuery.js @@ -55,9 +55,22 @@ module.exports.tests.functional = function(test, common){ assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]); assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']); - assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]); - assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]); - assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]); + // correct word by word split according to native speaker: 马来西亚 / 霹雳州 / 怡保 / 31400, 怡保花园 / 第5巷 / 45号 + assertAnalysis('chinese_address2', '马来西亚霹雳州怡保31400怡保花园第5巷45号', + ["马来", "西亚", "霹", "雳", "州", "怡", "保", "31400", "怡", "保", "花园", "第", "5", "巷", "45", "号"]); + // correct word by word split: 马来西亚 / 柔佛新山 / 81200 / , / 士古来路 / , / 百万时尚广场 + assertAnalysis('chinese_address3', '马来西亚柔佛新山81200士古来路百万时尚广场', + ["马来", "西亚", "柔", "佛", "新山", "81200", "士", "古来", "路", "百万", "时尚", "广场"]); + // correct word by word split: 马来西亚/ 槟城 / 亚依淡 / 11500 / , / 极乐寺 / , / 回返路 + assertAnalysis('chinese_address4', '马来西亚槟城亚依淡11500极乐寺回返路', + ["马来", "西亚", "槟", "城", "亚", "依", "淡", "11500", "极乐", "寺", "回", "返", "路"]); + // correct word by word split: 马来西亚 / 吉隆坡 / 50000 / , / 茨厂街 / 123号 + assertAnalysis('chinese_address5', '马来西亚吉隆坡50000茨厂街123号', + ["马来", "西亚", "吉隆坡", "50000", "茨", "厂", "街", "123", "号"]); + + assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]); + assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]); + assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]); suite.run( t.end ); });