Skip to content

Commit

Permalink
Align behavior of spaCy pipeline component with standalone Lemmy
Browse files Browse the repository at this point in the history
- Change name of the extension attribute to 'lemmas'
- Change return type from string to list
- Return multiple lemmas when lemmatization was ambiguous
- Update example notebook
- Update README
  • Loading branch information
sorenlind committed Feb 15, 2019
1 parent cf39d5f commit b00f339
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 18 deletions.
4 changes: 2 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ Usage
# add the component to the spaCy pipeline.
nlp.add_pipe(pipe, after='tagger')
# lemmas can now be accessed using the `._.lemma` attribute on the tokens
nlp("akvariernes")[0]._.lemma
# lemmas can now be accessed using the `._.lemmas` attribute on the tokens
nlp("akvariernes")[0]._.lemmas
Training
--------
Expand Down
18 changes: 8 additions & 10 deletions lemmy/pipe/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ class LemmyPipelineComponent(object):
def __init__(self, rules):
"""Initialize a pipeline component instance."""
self._internal = Lemmatizer(rules)
self._lemma = 'lemma'
self._lemmas = 'lemmas'

# Add attributes
Token.set_extension(self._lemma, default=None)
Token.set_extension(self._lemmas, default=None)

def __call__(self, doc):
"""
Expand All @@ -32,20 +32,18 @@ def __call__(self, doc):
"""
for token in doc:
if token.lemma_ == PRON_LEMMA:
lemma = PRON_LEMMA
lemmas = [PRON_LEMMA]
else:
lemma = self._get_lemma(token)
lemmas = self._get_lemmas(token)

if not lemma:
if not lemmas:
continue
token._.set(self._lemma, lemma)
token._.set(self._lemmas, lemmas)
return doc

def _get_lemma(self, token):
def _get_lemmas(self, token):
lemmas = self._internal.lemmatize(token.pos_, token.text)
if len(lemmas) != 1:
return None
return lemmas[0]
return lemmas


def load():
Expand Down
12 changes: 6 additions & 6 deletions notebooks/04 spacy_example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"# Using lemma with spaCy\n",
"This is an example of how to use lemma with spaCy. \n",
"\n",
"**Caution**: The Danish model included with spaCy is not trained for POS tagging. This model can not be used with lemma since the lemma pipeline component for spaCy requires POS tags. You must train your own spaCy model capable of POS tagging."
"**Caution**: The Danish model included with spaCy is not trained for POS tagging. This model can not be used with Lemmy since the Lemmy pipeline component for spaCy requires POS tags. You must train your own spaCy model capable of POS tagging."
]
},
{
Expand Down Expand Up @@ -60,7 +60,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"That's all there is to it. Now we can access the lemma of each token using the `._.lemma` attribute."
"That's all there is to it. Now we can access the lemmas of each token using the `._.lemmas` attribute."
]
},
{
Expand All @@ -71,7 +71,7 @@
{
"data": {
"text/plain": [
"'akvarie'"
"['akvarie']"
]
},
"execution_count": 4,
Expand All @@ -80,7 +80,7 @@
}
],
"source": [
"nlp(\"akvariernes\")[0]._.lemma"
"nlp(\"akvariernes\")[0]._.lemmas"
]
},
{
Expand All @@ -93,7 +93,7 @@
" row_format = \"{token:12}| {pos:12}| {lemma:12}\"\n",
" print(row_format.format(token=\"TOKEN\", pos=\"POS\", lemma=\"LEMMA\"))\n",
" print(\"-\"*36)\n",
" rows = [(t.orth_, t.pos_, t._.lemma if t._.lemma else \"None\") for t in nlp(text)]\n",
" rows = [(t.orth_, t.pos_, \" / \".join(t._.lemmas)) for t in nlp(text)]\n",
" for token, pos, lemma in rows:\n",
" print(row_format.format(token=token, pos=pos, lemma=lemma))"
]
Expand Down Expand Up @@ -144,7 +144,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
"version": "3.6.5"
}
},
"nbformat": 4,
Expand Down

0 comments on commit b00f339

Please sign in to comment.