terrier-org · cmacdonald · Aug 16, 2024 · Jul 20, 2024 · Jul 20, 2024 · Jul 21, 2024
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
@@ -6,8 +6,7 @@ name: Continuous Testing
 on:
   push:
     branches: [ master ]
-  pull_request:
-    branches: [ master ]
+  pull_request: {}
 
 jobs:
   build:
@@ -18,11 +17,11 @@ jobs:
         java: [11, 13]
         os: ['ubuntu-latest', 'macos-13', 'windows-latest']
         terrier: ['snapshot'] #'5.3', '5.4-SNAPSHOT', 
-        include:
-            - os: 'macos-latest'
-              python-version: '3.9'
-              java: 11
-              terrier: 'snapshot'
+        # include:
+        #     - os: 'macos-latest'
+        #       python-version: '3.9'
+        #       java: 11
+        #       terrier: 'snapshot'
 
     runs-on: ${{ matrix.os }}
     steps:
@@ -79,13 +78,6 @@ jobs:
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         #flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
 
-    - name: RM3 unit tests
-      env:
-        TERRIER_VERSION: ${{ matrix.terrier }}
-      run: |
-        pytest -p no:faulthandler tests/test_rewrite_rm3.py
-        # Hide underlying Jnius problem by disabling faulthandler: https://github.com/pytest-dev/pytest/issues/7634
-
     - name: Flash unit tests
       env:
         TERRIER_VERSION: ${{ matrix.terrier }}

diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml
@@ -0,0 +1,26 @@
+name: Code Style Checks
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  build:
+    runs-on: 'ubuntu-latest'
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.10'
+
+    - name: Install
+      run: |
+        pip install flake8 ./extras/pyterrier-flake8-ext/
+
+    - name: pt.java.required checks
+      run: |
+        flake8 ./pyterrier --select=PT --show-source --statistics --count
diff --git a/.github/workflows/anserini.yml → .github/workflows/test-anserini.yml b/.github/workflows/anserini.yml → .github/workflows/test-anserini.yml
@@ -15,7 +15,8 @@ jobs:
     strategy:
       matrix:
         python-version: ['3.10']
-        java: [13]
+        anserini-version: ['==0.19.0', '==0.22.0', '==0.36.0', '']
+        java: [21]
         os: ['ubuntu-latest']
         terrier: ['snapshot'] #'5.3', '5.4-SNAPSHOT', 
 
@@ -63,5 +64,5 @@ jobs:
       env:
         TERRIER_VERSION: ${{ matrix.terrier }}
       run: |
-        pip install pyserini==0.22.0 faiss-cpu torch
+        pip install pyserini${{ matrix.anserini-version }} faiss-cpu torch
         pytest --durations=20 -p no:faulthandler  tests/anserini/
diff --git a/.github/workflows/test-parallel.yaml b/.github/workflows/test-parallel.yaml
@@ -0,0 +1,72 @@
+name: Continuous Testing of Parallel Components
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  build:
+
+    strategy:
+      matrix:
+        python-version: ['3.8', '3.11']
+        java: [11, 13]
+        os: ['ubuntu-latest']
+        terrier: ['snapshot']
+
+    runs-on: ${{ matrix.os }}
+    steps:
+
+    - name: Setup dependencies for xgBoost on macOs-latest
+      if: matrix.os == 'macOs-latest'  # NOTE(review): the os matrix in this workflow only lists 'ubuntu-latest', so this step is always skipped
+      run: |
+        brew install libomp
+
+    - uses: actions/checkout@v4
+
+    - name: Set up Python ${{ matrix.python-version }}
+      if: matrix.os != 'self-hosted'
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Setup java
+      if: matrix.os != 'self-hosted'
+      uses: actions/setup-java@v4
+      with:
+        java-version: ${{ matrix.java }}
+        distribution: 'zulu'
+
+    - name: Install Terrier snapshot
+      if: matrix.terrier == '5.4-SNAPSHOT'  # NOTE(review): the terrier matrix only lists 'snapshot', so this condition never matches — confirm intended
+      run: |
+        git clone https://github.com/terrier-org/terrier-core.git
+        cd terrier-core
+        mvn -B -DskipTests install
+
+    # follows https://medium.com/ai2-blog/python-caching-in-github-actions-e9452698e98d
+    - name: Loading Python & dependencies from cache
+      if: matrix.os != 'self-hosted'
+      uses: actions/cache@v4
+      with:
+        path: ${{ env.pythonLocation }}
+        key: ${{ runner.os }}-${{ env.pythonLocation }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-test.txt') }}
+
+    - name: Install Python dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install --upgrade --upgrade-strategy eager -r requirements.txt
+        pip install --upgrade --upgrade-strategy eager -r requirements-test.txt
+        #install this software
+        pip install --timeout=120 .
+        pip install pytest
+
+    - name: All unit tests  # runs only the grid, parallel and pool test modules listed below
+      env:
+        TERRIER_VERSION: ${{ matrix.terrier }}
+        PARALLEL_TESTING: '1'
+      run: |
+        pytest --durations=20 -p no:faulthandler tests/test_grid.py tests/test_parallel.py tests/test_pool.py
+
diff --git a/README.md b/README.md
@@ -36,7 +36,7 @@ See the [indexing documentation](https://pyterrier.readthedocs.io/en/latest/terr
 ```python
 topics = pt.io.read_topics(topicsFile)
 qrels = pt.io.read_qrels(qrelsFile)
-BM25_br = pt.BatchRetrieve(index, wmodel="BM25")
+BM25_br = pt.terrier.Retriever(index, wmodel="BM25")
 res = BM25_br.transform(topics)
 pt.Evaluate(res, qrels, metrics = ['map'])
 ```
@@ -56,7 +56,7 @@ There is a worked example in the [experiment notebook](examples/notebooks/experi
 
 PyTerrier makes it easy to develop complex retrieval pipelines using Python operators such as `>>` to chain different retrieval components. Each retrieval approach is a [transformer](https://pyterrier.readthedocs.io/en/latest/transformer.html), having one key method, `transform()`, which takes a single Pandas dataframe as input, and returns another dataframe. Two examples might encapsulate applying the sequential dependence model, or a query expansion process:
 ```python
-sdm_bm25 = pt.rewrite.SDM() >> pt.BatchRetrieve(indexref, wmodel="BM25")
+sdm_bm25 = pt.rewrite.SDM() >> pt.terrier.Retriever(indexref, wmodel="BM25")
 bo1_qe = BM25_br >> pt.rewrite.Bo1QueryExpansion() >> BM25_br
 ```
 
@@ -83,8 +83,8 @@ You can see examples of how to use these, including notebooks that run on Google
 Complex learning to rank pipelines, including for learning-to-rank, can be constructed using PyTerrier's operator language. For example, to combine two features and make them available for learning, we can use the `**` operator.
 ```python
 two_features = BM25_br >> ( 
-  pt.BatchRetrieve(indexref, wmodel="DirichletLM") ** 
-  pt.BatchRetrieve(indexref, wmodel="PL2") 
+  pt.terrier.Retriever(indexref, wmodel="DirichletLM") ** 
+  pt.terrier.Retriever(indexref, wmodel="PL2") 
 )
 ```
 

diff --git a/docs/anserini.rst b/docs/anserini.rst
@@ -16,15 +16,15 @@ Comparative retrieval from Anserini and Terrier::
     trIndex = "/path/to/data.properties"
     luceneIndex "/path/to/lucene-index-dir"
 
-    BM25_tr = pt.BatchRetrieve(trIndex, wmodel="BM25")
+    BM25_tr = pt.terrier.Retriever(trIndex, wmodel="BM25")
     BM25_ai = pt.anserini.AnseriniBatchRetrieve(luceneIndex, wmodel="BM25")
 
     pt.Experiment([BM25_tr, BM25_ai], topics, qrels, eval_metrics=["map"])
 
 
 AnseriniBatchRetrieve can also be used as a re-ranker::
 
-    BM25_tr = pt.BatchRetrieve(trIndex, wmodel="BM25")
+    BM25_tr = pt.terrier.Retriever(trIndex, wmodel="BM25")
     QLD_ai = pt.anserini.AnseriniBatchRetrieve(luceneIndex, wmodel="QLD")
 
     pipe = BM25_tr >> QLD_ai

diff --git a/docs/apply.rst b/docs/apply.rst
@@ -85,9 +85,9 @@ Its also possible to construct a transformer that makes a new column on a row-wi
 
 For instance, if the column you are creating is called rank_2, it might be created as follows::
 
-    pipe = pt.BatchRetrieve(index) >> pt.apply.rank_2(lambda row: row["rank"] * 2)
+    pipe = pt.terrier.Retriever(index) >> pt.apply.rank_2(lambda row: row["rank"] * 2)
 
 To create a transformer that drops a column, you can instead pass `drop=True` as a kwarg::
 
-    pipe = pt.BatchRetrieve(index, metadata=["docno", "text"] >> pt.text.scorer() >> pt.apply.text(drop=True)
+    pipe = pt.terrier.Retriever(index, metadata=["docno", "text"]) >> pt.text.scorer() >> pt.apply.text(drop=True)
 
diff --git a/docs/conf.py b/docs/conf.py
@@ -21,7 +21,6 @@
 # -- Dataset table listing -----------------------------------------------------
 import pyterrier as pt
 import textwrap
-pt.init()
 
 from extras import generate_includes
 if not "QUICK" in os.environ:

diff --git a/docs/datamodel.md b/docs/datamodel.md
@@ -51,7 +51,7 @@ A dataframe representing which documents are retrieved and scored for a given qu
 
 Note that rank is computed by sorting by qid ascending, then score descending. The first rank for each query is 0. The `pyterrier.model.add_rank()` function is used for adding the rank column. 
 
-Optional columns might support additional transformers, such as text (for the contents of the documents), url or title columns. Their presence can facilitate more advanced transformers, such as BERT-based transformers which operate on the raw text of the documents. For instance, if the Terrier index has additional metadata attributes, these can be included by BatchRetrieve using the `metadata` kwarg, i.e. `pt.BatchRetrieve(index, metadata=["docno", "title", "body"])`. 
+Optional columns might support additional transformers, such as text (for the contents of the documents), url or title columns. Their presence can facilitate more advanced transformers, such as BERT-based transformers which operate on the raw text of the documents. For instance, if the Terrier index has additional metadata attributes, these can be included by the Terrier `Retriever` using the `metadata` kwarg, e.g. `pt.terrier.Retriever(index, metadata=["docno", "title", "body"])`.
 
 Note that the retrieved documents is a subset of the cartesian product of documents and queries; it is important that the query (text) attribute is present for at least ONE document rather than all documents for a given query.
 

diff --git a/docs/datasets.rst b/docs/datasets.rst
@@ -11,11 +11,11 @@ each defined dataset can download and provide easy access to:
  - relevance assessments (aka, labels or qrels), as a dataframe, ready for evaluation
  - ready-made Terrier indices, where appropriate
 
-.. autofunction:: pyterrier.datasets.list_datasets()
+.. autofunction:: pyterrier.datasets.list_datasets
 
-.. autofunction:: pyterrier.datasets.find_datasets()
+.. autofunction:: pyterrier.datasets.find_datasets
 
-.. autofunction:: pyterrier.datasets.get_dataset()
+.. autofunction:: pyterrier.datasets.get_dataset
 
 .. autoclass:: pyterrier.datasets.Dataset
     :members:
@@ -27,8 +27,8 @@ Many of the PyTerrier unit tests are based on the `Vaswani NPL test collection <
 PyTerrier provides a ready-made index on the `Terrier Data Repository <http://data.terrier.org/>`_. This allows experiments to be easily conducted::
 
     dataset = pt.get_dataset("vaswani")
-    bm25 = pt.BatchRetrieve.from_dataset(dataset, "terrier_stemmed", wmodel="BM25")
-    dph = pt.BatchRetrieve.from_dataset(dataset, "terrier_stemmed", wmodel="DPH")
+    bm25 = pt.terrier.Retriever.from_dataset(dataset, "terrier_stemmed", wmodel="BM25")
+    dph = pt.terrier.Retriever.from_dataset(dataset, "terrier_stemmed", wmodel="DPH")
     pt.Experiment(
         [bm25, dph],
         dataset.get_topics(),
@@ -44,8 +44,8 @@ Indexing and then retrieval of documents from the `MSMARCO document corpus <http
     indexref = indexer.index(dataset.get_corpus())
     index = pt.IndexFactory.of(indexref)
 
-    DPH_br = pt.BatchRetrieve(index, wmodel="DPH") % 100
-    BM25_br = pt.BatchRetrieve(index, wmodel="BM25") % 100
+    DPH_br = pt.terrier.Retriever(index, wmodel="DPH") % 100
+    BM25_br = pt.terrier.Retriever(index, wmodel="BM25") % 100
     # this runs an experiment to obtain results on the TREC 2019 Deep Learning track queries and qrels
     pt.Experiment(
         [DPH_br, BM25_br], 
@@ -62,8 +62,8 @@ You can also index datasets that include a corpus using IterDictIndexer and get_
     indexref = indexer.index(dataset.get_corpus_iter(), fields=('title', 'abstract'))
     index = pt.IndexFactory.of(indexref)
 
-    DPH_br = pt.BatchRetrieve(index, wmodel="DPH") % 100
-    BM25_br = pt.BatchRetrieve(index, wmodel="BM25") % 100
+    DPH_br = pt.terrier.Retriever(index, wmodel="DPH") % 100
+    BM25_br = pt.terrier.Retriever(index, wmodel="BM25") % 100
     # this runs an experiment to obtain results on the TREC COVID queries and qrels
     pt.Experiment(
         [DPH_br, BM25_br], 

diff --git a/docs/experiments.rst b/docs/experiments.rst
@@ -27,8 +27,8 @@ Getting average effectiveness over a set of topics::
     # vaswani dataset provides an index, topics and qrels
 
     # lets generate two BRs to compare
-    tfidf = pt.BatchRetrieve(dataset.get_index(), wmodel="TF_IDF")
-    bm25 = pt.BatchRetrieve(dataset.get_index(), wmodel="BM25")
+    tfidf = pt.terrier.Retriever(dataset.get_index(), wmodel="TF_IDF")
+    bm25 = pt.terrier.Retriever(dataset.get_index(), wmodel="BM25")
 
     pt.Experiment(
         [tfidf, bm25],

diff --git a/docs/experiments/Robust04.md b/docs/experiments/Robust04.md
@@ -34,10 +34,10 @@ Here we define and evaluate standard weighting models.
 
 ```python
 
-BM25 = pt.BatchRetrieve(index, wmodel="BM25")
-DPH  = pt.BatchRetrieve(index, wmodel="DPH")
-PL2  = pt.BatchRetrieve(index, wmodel="PL2")
-DLM  = pt.BatchRetrieve(index, wmodel="DirichletLM")
+BM25 = pt.terrier.Retriever(index, wmodel="BM25")
+DPH  = pt.terrier.Retriever(index, wmodel="DPH")
+PL2  = pt.terrier.Retriever(index, wmodel="PL2")
+DLM  = pt.terrier.Retriever(index, wmodel="DirichletLM")
 
 pt.Experiment(
     [BM25, DPH, PL2, DLM],

diff --git a/docs/extras/generate_includes.py b/docs/extras/generate_includes.py
@@ -49,8 +49,8 @@ def experiment_includes():
                 os.path.join(tempfile.gettempdir(), "vaswani_index")
             ).index(pt.get_dataset('vaswani').get_corpus_iter())
 
-    tfidf = pt.BatchRetrieve(indexref, wmodel="TF_IDF")
-    bm25 = pt.BatchRetrieve(indexref, wmodel="BM25")
+    tfidf = pt.terrier.Retriever(indexref, wmodel="TF_IDF")
+    bm25 = pt.terrier.Retriever(indexref, wmodel="BM25")
 
     table = pt.Experiment(
         [tfidf, bm25],