Korean NLP on Hive

Tokenize Korean sentences on Hive.

tokenize_ko(String line [,
            const array<string> userDict,
            const string mode = "discard",
            const array<string> stopTags,
            boolean outputUnknownUnigrams
           ]) - returns tokenized strings in array<string>

The implementation is based on the Apache Lucene Korean analyzer.

Usage

Build the JAR with Maven, then register the JAR and the UDF in Hive:

mvn clean install

add jar hive-udf-tokenize_ko-0.0.1.jar;
create temporary function tokenize_ko as 'me.takuti.hive.nlp.tokenizer.TokenizeKoUDF';

select tokenize_ko("소설 무궁화꽃이 피었습니다.");
-- ["소설","무궁","화","꽃","피"]

select tokenize_ko("소설 무궁화꽃이 피었습니다.", null, "mixed");
-- ["소설","무궁화","무궁","화","꽃","피"]

select tokenize_ko("소설 무궁화꽃이 피었습니다.", null, "discard", array("E", "VV"));
-- ["소설","무궁","화","꽃","이"]

select tokenize_ko("Hello, world.", null, "none", array(), true);
-- ["h","e","l","l","o","w","o","r","l","d"]

select tokenize_ko("Hello, world.", null, "none", array(), false);
-- ["hello","world"]

select tokenize_ko("나는 C++ 언어를 프로그래밍 언어로 사랑한다.", null, "discard", array());
-- ["나","는","c","언어","를","프로그래밍","언어","로","사랑","하","ᆫ다"]

select tokenize_ko("나는 C++ 언어를 프로그래밍 언어로 사랑한다.", array("C++"), "discard", array());
-- ["나","는","c++","언어","를","프로그래밍","언어","로","사랑","하","ᆫ다"]

Note that tokenizers for other languages (English, Japanese, and Chinese) are similarly provided by Apache Hivemall.

Name		Name	Last commit message	Last commit date
Latest commit History 15 Commits
src		src
.gitignore		.gitignore
LICENSE		LICENSE
README.md		README.md
pom.xml		pom.xml

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

Korean NLP on Hive

Usage

About

Releases

Packages

Languages

License

takuti/hive-udf-tokenize_ko

Folders and files

Latest commit

History

Repository files navigation

Korean NLP on Hive

Usage

About

Topics

Resources

License

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages