From 115b80c576f4dcfb38bf645d2cf23b886a174ff8 Mon Sep 17 00:00:00 2001 From: Ciaran O'Reilly Date: Sun, 8 Mar 2020 16:38:46 +0100 Subject: [PATCH] =?UTF-8?q?Versi=C3=B3=200.0.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Primera publicació del model amb reconeixement d'entitats --- .gitignore | 2 ++ MANIFEST.in | 1 + README.md | 38 ++++++++++++++++++++++++++++++++ meta.json | 34 +++++++++++++++++++++++++++++ setup.py | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 137 insertions(+) create mode 100644 .gitignore create mode 100644 MANIFEST.in create mode 100644 README.md create mode 100644 meta.json create mode 100644 setup.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..527558b --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +ca_* \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..e5d6d01 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include meta.json diff --git a/README.md b/README.md new file mode 100644 index 0000000..0e9f607 --- /dev/null +++ b/README.md @@ -0,0 +1,38 @@ +# [CA] Model per al processament del llenguatge natural en català per a spaCy + +Model per a [spaCy](https://spacy.io) de la llengua catalana generat a partir de: + +- Vectors de paraules de [fastText](https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md) +- Gramàtica, morfologia i sintaxi fent servir dades del corpus d'[AnCora](https://github.com/UniversalDependencies/UD_Catalan-AnCora) +- Anotacions per a l'extracció d'entitats derivades de la Viquipèdia ([Cross-lingual Name Tagging and Linking for 282 Languages](http://nlp.cs.rpi.edu/paper/282elisa2017.pdf)) + +Degut a la mida final del model (2.5GB) i dels vectors de paraules (1.1GB), aquests no s'inclouen al repositori, però podeu descarregar-vos el model final a la secció Publicacions (Releases). 
+ +## Instal·lació i ús + +Podeu instal·lar el model i fer-lo servir amb spaCy executant les següents ordres a la interfície de línia d'ordres: + +```sh +> pip install https://github.com/ccoreilly/spacy-catala/releases/download/v0.0.2/ca_fasttext_wiki-0.0.2.tar.gz +> python -m spacy link ca_fasttext_wiki ca +``` + +# [EN] spaCy NLP Model for the Catalan language + +spaCy NLP model for the Catalan language generated from: + +- [fastText](https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md) word vectors +- The [AnCora](https://github.com/UniversalDependencies/UD_Catalan-AnCora) corpus for parts of speech, morphological features, and syntactic dependencies +- Wikipedia annotations for named entity extraction ([Cross-lingual Name Tagging and Linking for 282 Languages](http://nlp.cs.rpi.edu/paper/282elisa2017.pdf)) + + +The final model is around 2.5GB and the fastText vectors are over 1GB, which is why they are not included in this repository. You can download the model under the Releases tab. 
+ +## Installing and using the model + +You can install and use the model in spaCy by executing the following commands: + +```sh +> pip install https://github.com/ccoreilly/spacy-catala/releases/download/v0.0.2/ca_fasttext_wiki-0.0.2.tar.gz +> python -m spacy link ca_fasttext_wiki ca +``` diff --git a/meta.json b/meta.json new file mode 100644 index 0000000..b7e8b06 --- /dev/null +++ b/meta.json @@ -0,0 +1,34 @@ +{ + "accuracy": { + "uas":24.6151530788, + "las":23.711622807, + "ents_p":44.912142152, + "ents_r":21.7459467727, + "ents_f":29.3034819462, + "tags_acc":97.6588546924, + "token_acc":100.0 + }, + "author": "Ciaran O'Reilly", + "description": "Catalan Model from fastText vectors and annotations from the catalan Wikipedia", + "email": "ciaran@oreilly.cat", + "lang": "ca", + "license": "MIT", + "name": "fasttext_wiki", + "parent_package": "spacy", + "pipeline": ["tagger", "parser", "ner"], + "sources": ["fastText"], + "spacy_version": ">=2.1.8", + "speed": { + "nwords":326934, + "cpu":8088.3390474066, + "gpu":7692.6920447333 + }, + "url": "https://nlu.cat", + "vectors":{ + "width":300, + "vectors":2000000, + "keys":2000000, + "name":"ca_model.vectors" + }, + "version": "0.0.2" +}
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..6771602
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+# coding: utf8
+"""Packaging script for the Catalan spaCy model described by meta.json."""
+from __future__ import unicode_literals
+
+import io
+import json
+from os import path, walk
+from shutil import copy
+from setuptools import setup
+
+
+def load_meta(fp):
+    """Return the parsed JSON content of the UTF-8 encoded file at ``fp``."""
+    with io.open(fp, encoding='utf8') as f:
+        return json.load(f)
+
+
+def list_files(data_dir):
+    """List the files to ship as package data for the model.
+
+    Walks ``data_dir`` recursively, skips files whose name starts with a
+    dot, and returns the remaining paths relative to the parent directory
+    of ``data_dir``, followed by a final ``'meta.json'`` entry.
+    """
+    base_dir = path.dirname(data_dir)
+    output = [
+        path.relpath(path.join(root, filename), base_dir)
+        for root, _, filenames in walk(data_dir)
+        for filename in filenames
+        if not filename.startswith('.')
+    ]
+    output.append('meta.json')
+    return output
+
+
+def list_requirements(meta):
+    """Build the ``install_requires`` list from the model metadata.
+
+    The first entry is ``meta['parent_package']`` (``'spacy'`` when the key
+    is missing) concatenated with ``meta['spacy_version']``; any entries
+    under ``meta['setup_requires']`` are appended after it.
+    """
+    parent_package = meta.get('parent_package', 'spacy')
+    requirements = [parent_package + meta['spacy_version']]
+    if 'setup_requires' in meta:
+        requirements += meta['setup_requires']
+    return requirements
+
+
+def setup_package():
+    """Read meta.json next to this script and run ``setuptools.setup``.
+
+    The package is named ``<lang>_<name>`` and its data directory is
+    ``<lang>_<name>/<lang>_<name>-<version>``; meta.json is copied to both
+    locations before ``setup`` is called.
+    """
+    root = path.abspath(path.dirname(__file__))
+    meta_path = path.join(root, 'meta.json')
+    meta = load_meta(meta_path)
+    model_name = str(meta['lang'] + '_' + meta['name'])
+    model_dir = path.join(model_name, model_name + '-' + meta['version'])
+
+    # Ship a copy of meta.json at the package root and inside the data dir.
+    copy(meta_path, model_name)
+    copy(meta_path, model_dir)
+
+    setup(
+        name=model_name,
+        description=meta['description'],
+        author=meta['author'],
+        author_email=meta['email'],
+        url=meta['url'],
+        version=meta['version'],
+        license=meta['license'],
+        packages=[model_name],
+        package_data={model_name: list_files(model_dir)},
+        install_requires=list_requirements(meta),
+        zip_safe=False,
+    )
+
+
+if __name__ == '__main__':
+    setup_package()