Skip to content

Commit

Permalink
Access the stemmer of Terrier from PyTerrier (#382)
Browse files Browse the repository at this point in the history
* Update index.py

* added test cases

* added documentation

* fixes and testing other stemmers

---------

Co-authored-by: Sean MacAvaney <sean.macavaney@gmail.com>
  • Loading branch information
cmacdonald and seanmacavaney authored Mar 22, 2023
1 parent 4560df0 commit 5f9b35e
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 1 deletion.
24 changes: 23 additions & 1 deletion pyterrier/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,11 +261,18 @@ class IndexingType(enum.Enum):
SINGLEPASS = 2 #: A single-pass indexing regime, which builds an inverted index directly. No direct index structure is created. Typically is faster than classical indexing.
MEMORY = 3 #: An in-memory index. No persistent index is created.

# Module-level cache of instantiated stemmer objects, keyed by TerrierStemmer
# member. It starts empty and is filled on demand by TerrierStemmer.stem().
_stemmer_cache = {}
class TerrierStemmer(Enum):
"""
This enum provides an API for the stemmers available in Terrier. The stemming configuration is saved in the index
and loaded at retrieval time. `Snowball <https://snowballstem.org/>`_ stemmers for various languages
`are available in Terrier <http://terrier.org/docs/current/javadoc/org/terrier/terms/package-summary.html>`_.
It can also be used to access the stemmer::
stemmer = pt.TerrierStemmer.porter
stemmed_word = stemmer.stem('abandoned')
"""
none = 'none' #: Apply no stemming
porter = 'porter' #: Apply Porter's English stemmer
Expand Down Expand Up @@ -316,7 +323,22 @@ def _to_class(this):

if isinstance(this, str):
return this


def stem(self, tok):
    """
    Stem a single token with the stemmer denoted by this enum member.

    The underlying stemmer object is built on first use and then kept in the
    module-level ``_stemmer_cache``, keyed by the enum member, so later calls
    reuse it.

    :param tok: the token to stem
    :return: the result of the underlying stemmer's ``stem()``; when
        ``_to_class`` yields ``None`` the token is returned unchanged
    """
    # Fast path: the stemmer for this member has already been built.
    if self in _stemmer_cache:
        return _stemmer_cache[self].stem(tok)

    java_class = self._to_class(self)
    if java_class is not None:
        # Unqualified names are resolved inside Terrier's terms package.
        qualified = java_class if '.' in java_class else f'org.terrier.terms.{java_class}'
        # Stemmers are term pipeline objects with chained constructors;
        # passing None selects the appropriate constructor.
        engine = autoclass(qualified)(None)
    else:
        # No stemmer class configured: use a pass-through stand-in.
        class NoOpStem():
            def stem(self, word):
                return word
        engine = NoOpStem()

    _stemmer_cache[self] = engine
    return engine.stem(tok)

class TerrierStopwords(Enum):
"""
Expand Down
29 changes: 29 additions & 0 deletions tests/test_terrier_wrappers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import pandas as pd
import pyterrier as pt
import os
import unittest
from .base import BaseTestCase
import warnings

class TestTerrierWrappers(BaseTestCase):
    """Tests for the Python-side wrappers around Terrier components."""

    def test_stemming(self):
        """Exercise ``TerrierStemmer.stem()`` for the porter, none and portugese members."""
        # Maps an input word to the stem expected from the Porter stemmer.
        porter_expected = {
            "abandoned": "abandon",
            "abandon": "abandon",
            "abergavenny": "abergavenni",
        }

        stemmer = pt.TerrierStemmer.porter
        for word, expected in porter_expected.items():
            self.assertEqual(expected, stemmer.stem(word))

        # The 'none' member must hand every word back unchanged.
        stemmer = pt.TerrierStemmer.none
        for word in porter_expected:
            self.assertEqual(word, stemmer.stem(word))

        # Smoke test of another stemmer: only require a non-empty result.
        # assertGreater reports the offending length if this ever fails.
        # ('portugese' is the enum member name as spelled in the library.)
        stemmer = pt.TerrierStemmer.portugese
        for word in porter_expected:
            self.assertGreater(len(stemmer.stem(word)), 0)

# Run this module's tests via unittest when the file is executed directly.
if __name__ == "__main__":
unittest.main()

0 comments on commit 5f9b35e

Please sign in to comment.