From d168b2323ff4eb8ca0ae4785da0133a9c28ae282 Mon Sep 17 00:00:00 2001 From: Kyrylo Malakhov Date: Tue, 2 Feb 2021 11:32:32 +0200 Subject: [PATCH 1/5] add localize config for English --- server/config.models.en.json | 54 ++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 server/config.models.en.json diff --git a/server/config.models.en.json b/server/config.models.en.json new file mode 100644 index 0000000..0cbee5e --- /dev/null +++ b/server/config.models.en.json @@ -0,0 +1,54 @@ +{ + "models": { + "word2vec": [ + { + "description": "Distributional semantic model of word representation «Oles Honchar» is used (using dataset - problems of poetics of Oles Honchar's creative work), word2vec algorithm with dimension 500d. Entity - a word, lemmatized, lowercased. Word2vec hyperparameters: -size 500 -negative 5 -window 5 -threads 24 -min_count 10 -iter 20.", + "name": "honchar.lowercased.lemmatized.word2vec.c.text.format.500d", + "link": "./models/honchar.lowercased.lemmatized.word2vec.c.text.format.500d", + "language": "ua", + "index": 0, + "placeholders": { + "term": "гончар", + "terms": "гончар письменник герой", + "similarity": "письменник" + } + }, + { + "description": "Distributional semantic model of word representation «Fiction» is used (using dataset - fiction literature), word2vec algorithm with dimension 300d. Entity - a word, lemmatized, lowercased. Word2vec hyperparameters: -size 300 -negative 7 -window 4 -threads 6 -min_count 10 -iter 5 -alpha 0.030.", + "name": "fiction.lowercased.lemmatized.word2vec.300d", + "link": "./models/fiction.lowercased.lemmatized.word2vec.300d", + "language": "ua", + "index": 1, + "placeholders": { + "term": "казка", + "terms": "казка легенда байка", + "similarity": "вірш" + } + }, + { + "description": "Distributional semantic model of word representation «Sukhomlinsky» is used (using dataset - the book «I give my heart to children»), word2vec algorithm with dimension 500d. Entity - a word, lemmatized, lowercased. Word2vec hyperparameters: -size 500 -negative 7 -window 4 -min_count 10 -iter 10.", + "name": "suhomlinskyy.lowercased.lemmatized.word2vec.c.text.format.500d", + "link": "./models/suhomlinskyy.lowercased.lemmatized.word2vec.c.text.format.500d", + "language": "ua", + "index": 2, + "placeholders": { + "term": "сухомлинський", + "terms": "сухомлинський василь герой", + "similarity": "комуніст" + } + }, + { + "description": "Distributional semantic model of word representation «WhiteBook» is used (using dataset - the book «The White Book of Physical and Rehabilitation Medicine»). The White Book (WB) of Physical and Rehabilitation Medicine (PRM) in Europe is produced by the 4 European PRM Bodies and constitutes the reference book for PRM physicians in Europe. It has now reached its third edition; the first was published in 1989 and the second in 2006/2007. The WB has multiple purposes, including providing a unifying framework for European countries, to inform decision-makers on European and national level, to offer educational material for PRM trainees and physicians and information about PRM to the medical community, other rehabilitation professionals and the public. The WB states the importance of PRM, a primary medical specialty that is present all over Europe, with a specific corpus disciplinae, a common background and history throughout Europe. PRM is internationally recognised and a partner of major international bodies, including the World Health Organization (WHO). PRM activities are strongly based on the documents of the United Nations (UN) and WHO, such as the Convention of the Rights of Persons with Disabilities (2006), the World Report on Disability (2011), the WHO Global Disability Action Plan 2014-2021 (2014) and the WHO initiative “Rehabilitation 2030: a call for action” (2017). The WB is organized in four sections, 11 chapters and some appendices. The WB starts with basic definitions and concepts of PRM and continues with why rehabilitation is needed by individuals and society. Rehabilitation focuses not only on health conditions but also on functioning. Accordingly, PRM is the medical specialty that strives to improve functioning of people with a health condition or experiencing disability. The fundamentals of PRM, the history of the PRM specialty, and the structure and activities of PRM organizations in Europe are presented, followed by a thorough presentation of the practice of PRM, i.e. knowledge and skills of PRM physicians, the clinical field of competence of PRM, the place of the PRM specialty in the healthcare system and society, education and continuous professional development of PRM physicians, specificities and challenges of science and research in PRM. The WB concludes with the way forward for the specialty: challenges and perspectives for the future of PRM. Word2vec algorithm with dimension 500d. Entity - a word, lemmatized, lowercased. Word2vec hyperparameters: -size 500 -negative 7 -window 4 -min_count 10 -iter 10.", + "name": "whitebook.lowercased.word2vec.c.text.format.500d", + "link": "./models/whitebook.lowercased.word2vec.c.text.format.500d", + "language": "ua", + "index": 3, + "placeholders": { + "term": "реабілітація", + "terms": "фізична реабілітація людини", + "similarity": "людини" + } + } + ] + } +} \ No newline at end of file From b8fd0989f3c13ed041c7813a0dc648c31ff70496 Mon Sep 17 00:00:00 2001 From: Kyrylo Malakhov Date: Tue, 2 Feb 2021 11:39:01 +0200 Subject: [PATCH 2/5] fix name --- server/config.models.simple.en.json | 54 +++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 server/config.models.simple.en.json diff --git a/server/config.models.simple.en.json b/server/config.models.simple.en.json new file mode 100644 index 0000000..0cbee5e --- /dev/null +++ b/server/config.models.simple.en.json @@ -0,0 +1,54 @@ +{ + "models": { + "word2vec": [ + { + "description": "Distributional semantic model of word representation «Oles Honchar» is used (using dataset - problems of poetics of Oles Honchar's creative work), word2vec algorithm with dimension 500d. Entity - a word, lemmatized, lowercased. Word2vec hyperparameters: -size 500 -negative 5 -window 5 -threads 24 -min_count 10 -iter 20.", + "name": "honchar.lowercased.lemmatized.word2vec.c.text.format.500d", + "link": "./models/honchar.lowercased.lemmatized.word2vec.c.text.format.500d", + "language": "ua", + "index": 0, + "placeholders": { + "term": "гончар", + "terms": "гончар письменник герой", + "similarity": "письменник" + } + }, + { + "description": "Distributional semantic model of word representation «Fiction» is used (using dataset - fiction literature), word2vec algorithm with dimension 300d. Entity - a word, lemmatized, lowercased. Word2vec hyperparameters: -size 300 -negative 7 -window 4 -threads 6 -min_count 10 -iter 5 -alpha 0.030.", + "name": "fiction.lowercased.lemmatized.word2vec.300d", + "link": "./models/fiction.lowercased.lemmatized.word2vec.300d", + "language": "ua", + "index": 1, + "placeholders": { + "term": "казка", + "terms": "казка легенда байка", + "similarity": "вірш" + } + }, + { + "description": "Distributional semantic model of word representation «Sukhomlinsky» is used (using dataset - the book «I give my heart to children»), word2vec algorithm with dimension 500d. Entity - a word, lemmatized, lowercased. Word2vec hyperparameters: -size 500 -negative 7 -window 4 -min_count 10 -iter 10.", + "name": "suhomlinskyy.lowercased.lemmatized.word2vec.c.text.format.500d", + "link": "./models/suhomlinskyy.lowercased.lemmatized.word2vec.c.text.format.500d", + "language": "ua", + "index": 2, + "placeholders": { + "term": "сухомлинський", + "terms": "сухомлинський василь герой", + "similarity": "комуніст" + } + }, + { + "description": "Distributional semantic model of word representation «WhiteBook» is used (using dataset - the book «The White Book of Physical and Rehabilitation Medicine»). The White Book (WB) of Physical and Rehabilitation Medicine (PRM) in Europe is produced by the 4 European PRM Bodies and constitutes the reference book for PRM physicians in Europe. It has now reached its third edition; the first was published in 1989 and the second in 2006/2007. The WB has multiple purposes, including providing a unifying framework for European countries, to inform decision-makers on European and national level, to offer educational material for PRM trainees and physicians and information about PRM to the medical community, other rehabilitation professionals and the public. The WB states the importance of PRM, a primary medical specialty that is present all over Europe, with a specific corpus disciplinae, a common background and history throughout Europe. PRM is internationally recognised and a partner of major international bodies, including the World Health Organization (WHO). PRM activities are strongly based on the documents of the United Nations (UN) and WHO, such as the Convention of the Rights of Persons with Disabilities (2006), the World Report on Disability (2011), the WHO Global Disability Action Plan 2014-2021 (2014) and the WHO initiative “Rehabilitation 2030: a call for action” (2017). The WB is organized in four sections, 11 chapters and some appendices. The WB starts with basic definitions and concepts of PRM and continues with why rehabilitation is needed by individuals and society. Rehabilitation focuses not only on health conditions but also on functioning. Accordingly, PRM is the medical specialty that strives to improve functioning of people with a health condition or experiencing disability. The fundamentals of PRM, the history of the PRM specialty, and the structure and activities of PRM organizations in Europe are presented, followed by a thorough presentation of the practice of PRM, i.e. knowledge and skills of PRM physicians, the clinical field of competence of PRM, the place of the PRM specialty in the healthcare system and society, education and continuous professional development of PRM physicians, specificities and challenges of science and research in PRM. The WB concludes with the way forward for the specialty: challenges and perspectives for the future of PRM. Word2vec algorithm with dimension 500d. Entity - a word, lemmatized, lowercased. Word2vec hyperparameters: -size 500 -negative 7 -window 4 -min_count 10 -iter 10.", + "name": "whitebook.lowercased.word2vec.c.text.format.500d", + "link": "./models/whitebook.lowercased.word2vec.c.text.format.500d", + "language": "ua", + "index": 3, + "placeholders": { + "term": "реабілітація", + "terms": "фізична реабілітація людини", + "similarity": "людини" + } + } + ] + } +} \ No newline at end of file From 995c5bad6224b72e2c70533312cdaa1cc844e829 Mon Sep 17 00:00:00 2001 From: Kyrylo Malakhov Date: Tue, 2 Feb 2021 11:39:13 +0200 Subject: [PATCH 3/5] cleanup --- server/config.models.en.json | 54 ------------------------------------ 1 file changed, 54 deletions(-) delete mode 100644 server/config.models.en.json diff --git a/server/config.models.en.json b/server/config.models.en.json deleted file mode 100644 index 0cbee5e..0000000 --- a/server/config.models.en.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "models": { - "word2vec": [ - { - "description": "Distributional semantic model of word representation «Oles Honchar» is used (using dataset - problems of poetics of Oles Honchar's creative work), word2vec algorithm with dimension 500d. Entity - a word, lemmatized, lowercased. Word2vec hyperparameters: -size 500 -negative 5 -window 5 -threads 24 -min_count 10 -iter 20.", - "name": "honchar.lowercased.lemmatized.word2vec.c.text.format.500d", - "link": "./models/honchar.lowercased.lemmatized.word2vec.c.text.format.500d", - "language": "ua", - "index": 0, - "placeholders": { - "term": "гончар", - "terms": "гончар письменник герой", - "similarity": "письменник" - } - }, - { - "description": "Distributional semantic model of word representation «Fiction» is used (using dataset - fiction literature), word2vec algorithm with dimension 300d. Entity - a word, lemmatized, lowercased. Word2vec hyperparameters: -size 300 -negative 7 -window 4 -threads 6 -min_count 10 -iter 5 -alpha 0.030.", - "name": "fiction.lowercased.lemmatized.word2vec.300d", - "link": "./models/fiction.lowercased.lemmatized.word2vec.300d", - "language": "ua", - "index": 1, - "placeholders": { - "term": "казка", - "terms": "казка легенда байка", - "similarity": "вірш" - } - }, - { - "description": "Distributional semantic model of word representation «Sukhomlinsky» is used (using dataset - the book «I give my heart to children»), word2vec algorithm with dimension 500d. Entity - a word, lemmatized, lowercased. Word2vec hyperparameters: -size 500 -negative 7 -window 4 -min_count 10 -iter 10.", - "name": "suhomlinskyy.lowercased.lemmatized.word2vec.c.text.format.500d", - "link": "./models/suhomlinskyy.lowercased.lemmatized.word2vec.c.text.format.500d", - "language": "ua", - "index": 2, - "placeholders": { - "term": "сухомлинський", - "terms": "сухомлинський василь герой", - "similarity": "комуніст" - } - }, - { - "description": "Distributional semantic model of word representation «WhiteBook» is used (using dataset - the book «The White Book of Physical and Rehabilitation Medicine»). The White Book (WB) of Physical and Rehabilitation Medicine (PRM) in Europe is produced by the 4 European PRM Bodies and constitutes the reference book for PRM physicians in Europe. It has now reached its third edition; the first was published in 1989 and the second in 2006/2007. The WB has multiple purposes, including providing a unifying framework for European countries, to inform decision-makers on European and national level, to offer educational material for PRM trainees and physicians and information about PRM to the medical community, other rehabilitation professionals and the public. The WB states the importance of PRM, a primary medical specialty that is present all over Europe, with a specific corpus disciplinae, a common background and history throughout Europe. PRM is internationally recognised and a partner of major international bodies, including the World Health Organization (WHO). PRM activities are strongly based on the documents of the United Nations (UN) and WHO, such as the Convention of the Rights of Persons with Disabilities (2006), the World Report on Disability (2011), the WHO Global Disability Action Plan 2014-2021 (2014) and the WHO initiative “Rehabilitation 2030: a call for action” (2017). The WB is organized in four sections, 11 chapters and some appendices. The WB starts with basic definitions and concepts of PRM and continues with why rehabilitation is needed by individuals and society. Rehabilitation focuses not only on health conditions but also on functioning. Accordingly, PRM is the medical specialty that strives to improve functioning of people with a health condition or experiencing disability. The fundamentals of PRM, the history of the PRM specialty, and the structure and activities of PRM organizations in Europe are presented, followed by a thorough presentation of the practice of PRM, i.e. knowledge and skills of PRM physicians, the clinical field of competence of PRM, the place of the PRM specialty in the healthcare system and society, education and continuous professional development of PRM physicians, specificities and challenges of science and research in PRM. The WB concludes with the way forward for the specialty: challenges and perspectives for the future of PRM. Word2vec algorithm with dimension 500d. Entity - a word, lemmatized, lowercased. Word2vec hyperparameters: -size 500 -negative 7 -window 4 -min_count 10 -iter 10.", - "name": "whitebook.lowercased.word2vec.c.text.format.500d", - "link": "./models/whitebook.lowercased.word2vec.c.text.format.500d", - "language": "ua", - "index": 3, - "placeholders": { - "term": "реабілітація", - "terms": "фізична реабілітація людини", - "similarity": "людини" - } - } - ] - } -} \ No newline at end of file From 8653216e1f0207dbb4475922b710cda28104d012 Mon Sep 17 00:00:00 2001 From: Kyrylo Malakhov Date: Tue, 2 Feb 2021 11:39:48 +0200 Subject: [PATCH 4/5] add models and model_en selection --- server/server.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/server/server.py b/server/server.py index 10de7e4..5085e91 100644 --- a/server/server.py +++ b/server/server.py @@ -45,6 +45,8 @@ # app.secret_key = b'_5#y2L"F4Q8z\n\xec]/' app.secret_key = os.urandom(42) +config_flag = 'ua' + # * Load models from config file to memory # ! Caution: Loading a large number of models requires a significant amount of RAM try: @@ -64,6 +66,13 @@ models_array.append(word_vectors) del word_vectors +# * Load models-en from config file to memory +try: + with open('./config.models.simple.en.json') as config_file_en: + models_en = json.load(config_file_en) +except IOError as e: + logging.error(e, exc_info=True) + """ from gensim.models import Word2Vec as WV_model model = WV_model.load('./models/suhomlinskyy.lowercased.lemmatized.word2vec.500d') @@ -92,8 +101,10 @@ def index(): def fallback(page): print(page) if 'ua' in page: + config_flag = 'ua' return render_template('index-ukr.html') if 'en' in page: + config_flag = 'en' return render_template('index-eng.html') # special file handlers @@ -122,7 +133,11 @@ def send_logos(path): # * models list @app.route('/api/models') def get_models_list(): - return jsonify(models) + if config_flag == 'ua': + return jsonify(models) + if config_flag == 'en': + return jsonify(models_en) + # return jsonify(models) # * computational endpoints @app.route('/api/word2vec/similarity', methods=['POST']) From 92e4cd0d5325a59cf26e1530494f22bd7d9e5f64 Mon Sep 17 00:00:00 2001 From: Kyrylo Malakhov Date: Tue, 2 Feb 2021 12:04:07 +0200 Subject: [PATCH 5/5] fix config selection --- server/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/server.py b/server/server.py index 5085e91..de2c38d 100644 --- a/server/server.py +++ b/server/server.py @@ -99,7 +99,7 @@ def index(): # let's Angular do the routs job @app.route('/') def fallback(page): - print(page) + global config_flag if 'ua' in page: config_flag = 'ua' return render_template('index-ukr.html')