Skip to content

Commit

Permalink
Align behavior of spaCy pipeline component with standalone Lemmy
Browse files Browse the repository at this point in the history
- Change name of the extension attribute to 'lemmas'
- Change return type from string to list
- Return multiple lemmas when lemmatization was ambiguous
- Update example notebook
- Update README
  • Loading branch information
sorenlind committed Feb 15, 2019
1 parent cf39d5f commit b00f339
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 18 deletions.
4 changes: 2 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ Usage
# add the component to the spaCy pipeline.
nlp.add_pipe(pipe, after='tagger')
# lemmas can now be accessed using the `._.lemma` attribute on the tokens
nlp("akvariernes")[0]._.lemma
# lemmas can now be accessed using the `._.lemmas` attribute on the tokens
nlp("akvariernes")[0]._.lemmas
Training
--------
Expand Down
18 changes: 8 additions & 10 deletions lemmy/pipe/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ class LemmyPipelineComponent(object):
def __init__(self, rules):
"""Initialize a pipeline component instance."""
self._internal = Lemmatizer(rules)
self._lemma = 'lemma'
self._lemmas = 'lemmas'

# Add attributes
Token.set_extension(self._lemma, default=None)
Token.set_extension(self._lemmas, default=None)

def __call__(self, doc):
"""
Expand All @@ -32,20 +32,18 @@ def __call__(self, doc):
"""
for token in doc:
if token.lemma_ == PRON_LEMMA:
lemma = PRON_LEMMA
lemmas = [PRON_LEMMA]
else:
lemma = self._get_lemma(token)
lemmas = self._get_lemmas(token)

if not lemma:
if not lemmas:
continue
token._.set(self._lemma, lemma)
token._.set(self._lemmas, lemmas)
return doc

def _get_lemma(self, token):
def _get_lemmas(self, token):
lemmas = self._internal.lemmatize(token.pos_, token.text)
if len(lemmas) != 1:
return None
return lemmas[0]
return lemmas


def load():
Expand Down
12 changes: 6 additions & 6 deletions notebooks/04 spacy_example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"# Using lemma with spaCy\n",
"This is an example of how to use lemma with spaCy. \n",
"\n",
"**Caution**: The Danish model included with spaCy is not trained for POS tagging. This model can not be used with lemma since the lemma pipeline component for spaCy requires POS tags. You must train your own spaCy model capable of POS tagging."
"**Caution**: The Danish model included with spaCy is not trained for POS tagging. This model can not be used with Lemmy since the Lemmy pipeline component for spaCy requires POS tags. You must train your own spaCy model capable of POS tagging."
]
},
{
Expand Down Expand Up @@ -60,7 +60,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"That's all there is to it. Now we can access the lemma of each token using the `._.lemma` attribute."
"That's all there is to it. Now we can access the lemmas of each token using the `._.lemmas` attribute."
]
},
{
Expand All @@ -71,7 +71,7 @@
{
"data": {
"text/plain": [
"'akvarie'"
"['akvarie']"
]
},
"execution_count": 4,
Expand All @@ -80,7 +80,7 @@
}
],
"source": [
"nlp(\"akvariernes\")[0]._.lemma"
"nlp(\"akvariernes\")[0]._.lemmas"
]
},
{
Expand All @@ -93,7 +93,7 @@
" row_format = \"{token:12}| {pos:12}| {lemma:12}\"\n",
" print(row_format.format(token=\"TOKEN\", pos=\"POS\", lemma=\"LEMMA\"))\n",
" print(\"-\"*36)\n",
" rows = [(t.orth_, t.pos_, t._.lemma if t._.lemma else \"None\") for t in nlp(text)]\n",
" rows = [(t.orth_, t.pos_, \" / \".join(t._.lemmas)) for t in nlp(text)]\n",
" for token, pos, lemma in rows:\n",
" print(row_format.format(token=token, pos=pos, lemma=lemma))"
]
Expand Down Expand Up @@ -144,7 +144,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
"version": "3.6.5"
}
},
"nbformat": 4,
Expand Down

0 comments on commit b00f339

Please sign in to comment.