diff --git a/docs/terrier-retrieval.rst b/docs/terrier-retrieval.rst index 68d897bb..bc97a3cc 100644 --- a/docs/terrier-retrieval.rst +++ b/docs/terrier-retrieval.rst @@ -35,14 +35,67 @@ Retriever +Query Formats for Terrier retrievers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default Terrier assumes that queries can be parsed by its `standard query parser `_, +which is a standard search-engine-like query language. Queries provided by Dataset objects are assumed to be in this format, using the +standard `["qid", "query"]` dataframe columns. + +Two alternative query formats are also supported: + + - MatchOp - this is a `lower-level query language `_ supported by Terrier, which is Indri-like in nature, and supports operators like ``#1()`` (exact phrase) and ``#combine()`` (weighting). MatchOp queries are stored in the `"query"` column. + + - pre-tokenised queries - in this format, query terms are provided, with weights, in a dictionary. Query terms are assumed to be already stemmed. This + format is useful for techniques that weight query terms, such as Learned Sparse Retrieval (e.g. see `pyterrier_splade `_). 
+ +The following query dataframes are therefore equivalent: + + - Raw query: + + ===== ============================= + qid query + ===== ============================= + 1 chemical chemical reactions + ===== ============================= + + - Using Terrier's QL to express weights on query terms: + + ===== ============================= + qid query + ===== ============================= + 1 chemical^2 reactions + ===== ============================= + + - Using Terrier's MatchOpQL to express weights on stemmed and tokenised query terms: + + ===== ====================================== + qid query + ===== ====================================== + 1 #combine:0=2:1=1(chemic reaction) + ===== ====================================== + + - Using the query_toks column (the query column is ignored): + + ===== ====================================== ============================= + qid query_toks query + ===== ====================================== ============================= + 1 {'chemic' : 2.0, 'reaction' : 1} chemical chemical reactions + ===== ====================================== ============================= + + + Terrier Configuration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ When using PyTerrier, we have to be aware of the underlying Terrier configuration, -namely *properties* and *controls*. Properties are global configuration and were -traditionally configured by editing a `terrier.properties` file; In contrast, -controls are per-query configuration. In PyTerrier, we specify both when we construct -the Retriever object: +namely *properties* and *controls*. We aim to surface the most common configuration options +through the Python API, but occasionally it's necessary to resort to properties or controls +directly. + +Properties are global configuration and were traditionally configured by editing a +`terrier.properties` file; in contrast, controls are per-query configuration. 
In PyTerrier, +we specify both when we construct the Retriever object: Common controls: - `"wmodel"` - the name of the weighting model. (This can also be specified using the wmodel kwarg). @@ -55,7 +108,8 @@ Common controls: Common properties: - `"termpipelines"` - the default Terrier term pipeline configuration is `"Stopwords,PorterStemmer"`. If you have created an index with a different configuration, you will need to set the `"termpipelines"` - property for *each* Retriever constructed. + property for *each* Retriever constructed. NB: These are now configurable using ``stemming=`` and + ``stopwords=`` kwargs. **Examples**:: @@ -98,7 +152,7 @@ Good Practice:: pl2 = pt.terrier.Retriever(index, wmodel="PL2") # here, we share the index between two instances of Retriever -You can use the IndexFactory to specify that the index data structures to be loaded into memory:: +You can use the IndexFactory to specify that the index data structures should be loaded into memory, which can benefit efficiency:: # load all structures into memory inmemindex = pt.IndexFactory.of("/path/to/data.properties", memory=True) diff --git a/pyterrier/terrier/retriever.py b/pyterrier/terrier/retriever.py index 134f8a4d..59eeb453 100644 --- a/pyterrier/terrier/retriever.py +++ b/pyterrier/terrier/retriever.py @@ -9,6 +9,7 @@ import concurrent from concurrent.futures import ThreadPoolExecutor import pyterrier as pt +from typing import Dict _matchops = ["#combine", "#uw", "#1", "#tag", "#prefix", "#band", "#base64", "#syn"] def _matchop(query): @@ -17,6 +18,18 @@ def _matchop(query): return True return False +def _querytoks2matchop(query_toks: Dict[str,float]) -> str: + def _matchop_tok(t, w): + import base64 + import string + if not all(a in string.ascii_letters + string.digits for a in t): + encoded = base64.b64encode(t.encode('utf-8')).decode("utf-8") + t = f'#base64({encoded})' + if w != 1: + t = f'#combine:0={w:f}({t})' + return t + return ' '.join([ _matchop_tok(t, w) for (t,w) in 
query_toks.items() ]) + @pt.java.required def _function2wmodel(function): from jnius import PythonJavaClass, java_method @@ -301,13 +314,19 @@ def __setstate__(self, d): def _retrieve_one(self, row, input_results=None, docno_provided=False, docid_provided=False, scores_provided=False): rank = FIRST_RANK qid = str(row.qid) - query = row.query - if len(query) == 0: - warn("Skipping empty query for qid %s" % qid) - return [] - srq = self.manager.newSearchRequest(qid, query) - + # row is a namedtuple, whose fields are exposed in _fields + query_toks_present : bool = 'query_toks' in row._fields + if query_toks_present: + query = '' # Clear the query so it doesn't match the "applypipeline:off" or "_matchop" conditions below... The query_toks query is converted below. + srq = self.manager.newSearchRequest(qid) + else: + query = row.query + if len(query) == 0: + warn("Skipping empty query for qid %s" % qid) + return [] + srq = self.manager.newSearchRequest(qid, query) + for control, value in self.controls.items(): srq.setControl(control, str(value)) @@ -326,6 +345,17 @@ def _retrieve_one(self, row, input_results=None, docno_provided=False, docid_pro srq.setControl("parseql", "off") srq.setControl("matchopql", "on") + if query_toks_present: + if len(row.query_toks) == 0: + warn("Skipping empty query_toks for qid %s" % qid) + return [] + srq.setControl("terrierql", "off") + srq.setControl("parsecontrols", "off") + srq.setControl("parseql", "off") + srq.setControl("matchopql", "on") + query = _querytoks2matchop(row.query_toks) + srq.setOriginalQuery(query) + #ask decorate only to grab what we need srq.setControl("decorate", ",".join(self.metadata)) @@ -719,12 +749,18 @@ def transform(self, queries): newscores=[] for row in pt.tqdm(queries.itertuples(), desc=str(self), total=queries.shape[0], unit="q") if self.verbose else queries.itertuples(): qid = str(row.qid) - query = row.query - if len(query) == 0: - warn("Skipping empty query for qid %s" % qid) - continue - - srq = 
self.manager.newSearchRequest(qid, query) + query_toks_present : bool = 'query_toks' in row._fields + if query_toks_present: + # Even though it might look like we should parse the query toks here, we don't want the resulting query to be caught by the conditions + # that come before the "if query_toks_present" check. So we set it to an empty string and handle the parsing below. + query = '' + srq = self.manager.newSearchRequest(qid) + else: + query = row.query + if len(query) == 0: + warn("Skipping empty query for qid %s" % qid) + continue + srq = self.manager.newSearchRequest(qid, query) for control, value in self.controls.items(): srq.setControl(control, str(value)) @@ -741,6 +777,17 @@ def transform(self, queries): srq.setControl("parseql", "off") srq.setControl("matchopql", "on") + if query_toks_present: + if len(row.query_toks) == 0: + warn("Skipping empty query_toks for qid %s" % qid) + continue # skip this row only, as for an empty query; returning here would abort the whole batch + srq.setControl("terrierql", "off") + srq.setControl("parsecontrols", "off") + srq.setControl("parseql", "off") + srq.setControl("matchopql", "on") + query = _querytoks2matchop(row.query_toks) + srq.setOriginalQuery(query) + # this handles the case that a candidate set of documents has been set. 
if docno_provided or docid_provided: # we use RequestContextMatching to make a ResultSet from the diff --git a/tests/test_br.py b/tests/test_br.py index 950e62fb..06e969aa 100644 --- a/tests/test_br.py +++ b/tests/test_br.py @@ -56,6 +56,25 @@ def test_br_cutoff(self): result = retr.transform(input_set) self.assertEqual(10, len(result)) + def test_br_query_toks(self): + indexloc = self.here + "/fixtures/index/data.properties" + + retr = pt.terrier.Retriever(indexloc) + query_terrier = 'applytermpipeline:off chemic^2 reaction^0.5' + result_terrier = retr.search(query_terrier) + + query_matchop = '#combine:0=2:1=0.5(chemic reaction)' + result_matchop = retr.search(query_matchop) + + query_toks = { 'chemic' : 2, 'reaction' : 0.5} + result_toks = retr.transform(pd.DataFrame([['1', query_toks]], columns=['qid', 'query_toks'])) + + self.assertEqual(len(result_terrier), len(result_matchop)) + self.assertEqual(len(result_terrier), len(result_toks)) + from pandas.testing import assert_frame_equal + assert_frame_equal(result_terrier[["qid", "docno", "score", "rank"]], result_matchop[["qid", "docno", "score", "rank"]]) + assert_frame_equal(result_terrier[["qid", "docno", "score", "rank"]], result_toks[["qid", "docno", "score", "rank"]]) + def test_br_cutoff_stability(self): indexloc = self.here + "/fixtures/index/data.properties" input_set = pd.DataFrame([ @@ -197,7 +216,6 @@ def test_num_manual_wmodel(self): except JavaException as ja: print(ja.stacktrace) raise ja - def test_num_python_wmodel(self): indexref = self.here+"/fixtures/index/data.properties" diff --git a/tests/test_fbr.py b/tests/test_fbr.py index f226f279..55811f2a 100644 --- a/tests/test_fbr.py +++ b/tests/test_fbr.py @@ -90,10 +90,6 @@ def test_fbr_reranking2(self): result1F_map = { row.docno : row.feature0 for row in result1.itertuples() } result2_map = { row.docno : row.score for row in result2.itertuples() } - print(result1F_map) - print(result2_map) - - # check features scores # NB: places can go no less 
than 4, as two documents have similar PL2 scores for rank, row in enumerate(result0.itertuples()): @@ -141,6 +137,25 @@ def test_fbr(self): retrBasic = pt.terrier.Retriever(indexref) if "matching" in retrBasic.controls: self.assertNotEqual(retrBasic.controls["matching"], "FatFeaturedScoringMatching,org.terrier.matching.daat.FatFull") + + def test_fbr_query_toks(self): + indexloc = self.here + "/fixtures/index/data.properties" + + retr = pt.terrier.FeaturesRetriever(indexloc, ["WMODEL:PL2"], wmodel="DPH") + query_terrier = 'applytermpipeline:off chemic^2 reaction^0.5' + result_terrier = retr.search(query_terrier) + + query_matchop = '#combine:0=2:1=0.5(chemic reaction)' + result_matchop = retr.search(query_matchop) + + query_toks = { 'chemic' : 2, 'reaction' : 0.5} + result_toks = retr.transform(pd.DataFrame([['1', query_toks]], columns=['qid', 'query_toks'])) + + self.assertEqual(len(result_terrier), len(result_matchop)) + self.assertEqual(len(result_terrier), len(result_toks)) + from pandas.testing import assert_frame_equal + assert_frame_equal(result_terrier[["qid", "docno", "score", "rank", "features"]], result_matchop[["qid", "docno", "score", "rank", "features"]]) + assert_frame_equal(result_terrier[["qid", "docno", "score", "rank", "features"]], result_toks[["qid", "docno", "score", "rank", "features"]]) def test_fbr_example(self): JIR = pt.java.autoclass('org.terrier.querying.IndexRef')