Merge pull request #258 from allenai/neuclir23

neuclir23

seanmacavaney authored Apr 15, 2024
2 parents 68df906 + 8ac78fe commit c4b9b98
Showing 11 changed files with 512 additions and 78 deletions.
1 change: 1 addition & 0 deletions ir_datasets/datasets/__init__.py
@@ -13,6 +13,7 @@
 from . import codec
 from . import cord19
 from . import cranfield
+from . import csl
 from . import disks45
 from . import dpr_w100
 from . import codesearchnet
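Importing the module is what makes the new datasets visible: a minimal sketch of this one-line change's side effect, assuming the rest of the PR (csl.py below) is applied.

import ir_datasets

# `from . import csl` runs csl._init() at import time, which registers
# 'csl' and 'csl/trec-2023' in the global registry; they are then loadable:
dataset = ir_datasets.load('csl/trec-2023')
print(dataset)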
60 changes: 60 additions & 0 deletions ir_datasets/datasets/csl.py
@@ -0,0 +1,60 @@
from typing import List, NamedTuple
from enum import Enum

import ir_datasets
from ir_datasets.util import DownloadConfig, GzipExtract
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats.trec import TrecQrels

from ir_datasets.formats import JsonlDocs, ExctractedCCQueries, ExctractedCCNoReportQuery

from ir_datasets.util.fileio import TarExtract

NAME = 'csl'

class CslDoc(NamedTuple):
    doc_id: str
    title: str
    abstract: str
    keywords: List[str]
    category: str
    category_eng: str
    discipline: str
    discipline_eng: str
    def default_text(self):
        return f'{self.title}\n{self.abstract}'

QREL_DEFS = {
    3: 'Very-valuable. Information in the document would be found in the lead paragraph of a report that is later written on the topic.',
    1: 'Somewhat-valuable. The most valuable information in the document would be found in the remainder of such a report.',
    0: 'Not-valuable. Information in the document might be included in a report footnote, or omitted entirely.',
}


def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    docs = JsonlDocs(GzipExtract(dlc['docs']), doc_cls=CslDoc, namespace=NAME, lang='zh', count_hint=395927)
    base = Dataset(
        docs,
        documentation('_')
    )

    subsets["trec-2023"] = Dataset(
        docs,
        ExctractedCCQueries(dlc['trec-2023/queries'], subset_lang='zh', filter_lwq=False, cls=ExctractedCCNoReportQuery, namespace=NAME),
        TrecQrels(TarExtract(dlc['trec-2023/qrels'], 'tech_final_qrels.txt'), QREL_DEFS),
        documentation('trec-2023'),
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets


base, subsets = _init()
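For context, a sketch of how the new dataset is consumed through the public ir_datasets API. Document fields follow CslDoc above and qrels are graded per QREL_DEFS; this assumes the downloads declared in ir_datasets/etc/downloads.json (below) succeed on first use.

import ir_datasets

csl = ir_datasets.load('csl')
doc = next(iter(csl.docs_iter()))          # CslDoc instances
print(doc.doc_id, doc.title, doc.discipline_eng)
print(doc.default_text())                  # title + abstract, per default_text()

trec23 = ir_datasets.load('csl/trec-2023')
for qrel in trec23.qrels_iter():           # relevance is 0, 1, or 3 (QREL_DEFS)
    print(qrel.query_id, qrel.doc_id, qrel.relevance)
    break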
63 changes: 55 additions & 8 deletions ir_datasets/datasets/neuclir.py
@@ -3,14 +3,14 @@
 from functools import lru_cache
 
 import ir_datasets
-from ir_datasets.util import DownloadConfig
-from ir_datasets.datasets.base import Dataset, YamlDocumentation
+from ir_datasets.util import DownloadConfig, Lazy
+from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries
 from ir_datasets.formats.trec import TrecQrels
 
-from ir_datasets.formats import ExctractedCCDocs, ExctractedCCQueries
+from ir_datasets.formats import ExctractedCCDocs, ExctractedCCQueries, ExctractedCCNoReportQuery, ExctractedCCNoReportNoHtNarQuery, ExctractedCCMultiMtQuery
 
 from ir_datasets.datasets.hc4 import NAME as HC4_NAME
-from ir_datasets.util.fileio import GzipExtract
+from ir_datasets.util.fileio import GzipExtract, TarExtract
 
 NAME = 'neuclir'

@@ -57,6 +57,17 @@ def qrels_iter(self):
             yield qrel
 
 
+class LangFilteredTrecQrels(TrecQrels):
+    def __init__(self, qrels_dlc, qrels_defs, lang, format_3col=False):
+        super().__init__(qrels_dlc, qrels_defs, format_3col)
+        self._lang = lang
+
+    def qrels_iter(self):
+        for qrel in super().qrels_iter():
+            if qrel.iteration == self._lang:
+                yield qrel
+
+
 QREL_DEFS = {
     3: 'Very-valuable. Information in the document would be found in the lead paragraph of a report that is later written on the topic.',
     1: 'Somewhat-valuable. The most valuable information in the document would be found in the remainder of such a report.',
@@ -73,25 +84,61 @@ def _init():
     base = Dataset(documentation('_')) # dummy top level ds
     subsets["1"] = Dataset(documentation('1')) # dummy year level ds
 
+    qrels2022 = dlc['trec-2022/qrels']
+    qrels2023 = TarExtract(dlc['trec-2023/qrels'], 'qrels.final')
+
     # For NeuCLIR Collection 1
     for lang in ['zh', 'fa', 'ru']:
-        lang_docs = ExctractedCCDocs(dlc[f'1/{lang}/docs'], subset_lang=lang, namespace=NAME, count=DOC_COUNTS[lang])
+        lang3 = {'fa': 'fas', 'zh': 'zho', 'ru': 'rus'}[lang]
+        lang_docs = ExctractedCCDocs(GzipExtract(dlc[f'1/{lang}/docs']), subset_lang=lang, namespace=NAME, count=DOC_COUNTS[lang])
         subsets[f"1/{lang}"] = Dataset(
             lang_docs,
             documentation(f"1/{lang}")
         )
+        qrels = LangFilteredTrecQrels(qrels2022, QREL_DEFS, lang3)
+        subsets[f"1/{lang}/trec-2022"] = Dataset(
+            lang_docs,
+            FilteredQueries(ExctractedCCQueries(dlc['trec-2022/queries'], subset_lang=lang, filter_lwq=False, cls=ExctractedCCNoReportQuery, namespace=NAME), _lazy_qids_set(qrels), mode='include'),
+            qrels,
+            documentation(f"1/{lang}/trec-2022"),
+        )
+        qrels = LangFilteredTrecQrels(qrels2023, QREL_DEFS, lang3)
+        subsets[f"1/{lang}/trec-2023"] = Dataset(
+            lang_docs,
+            FilteredQueries(ExctractedCCQueries(dlc['trec-2023/queries'], subset_lang=lang, filter_lwq=False, cls=ExctractedCCNoReportNoHtNarQuery, namespace=NAME), _lazy_qids_set(qrels), mode='include'),
+            qrels,
+            documentation(f"1/{lang}/trec-2023"),
+        )
         include_doc_id_dlc = hc4_dlc[f'{lang}/docs/ids'] if lang != 'ru' else tuple([ hc4_dlc[f'{lang}/docs/ids/{i}'] for i in range(8) ])
         subsets[f"1/{lang}/hc4-filtered"] = Dataset(
-            FilteredExctractedCCDocs(dlc[f'1/{lang}/docs'], subset_lang=lang, namespace=NAME, include_doc_id_dlc=include_doc_id_dlc),
-            ExctractedCCQueries([hc4_dlc[f'dev/topics'], hc4_dlc[f'test/topics']], subset_lang=lang, namespace=NAME),
+            FilteredExctractedCCDocs(GzipExtract(dlc[f'1/{lang}/docs']), subset_lang=lang, namespace=NAME, include_doc_id_dlc=include_doc_id_dlc),
+            ExctractedCCQueries([hc4_dlc['dev/topics'], hc4_dlc['test/topics']], subset_lang=lang, namespace=NAME),
             FilteredTrecQrels([ hc4_dlc[f'{lang}/dev/qrels'], hc4_dlc[f'{lang}/test/qrels'] ], QREL_DEFS, include_doc_id_dlc=include_doc_id_dlc),
             documentation(f"1/{lang}/hc4-filtered")
         )
 
+
+    multi_docs = ExctractedCCDocs([GzipExtract(dlc[f'1/{lang}/docs']) for lang in ['zh', 'fa', 'ru']], namespace=NAME, count=sum(DOC_COUNTS.values()), docstore_path=base_path/'1'/'multi')
+    subsets['1/multi'] = Dataset(
+        multi_docs,
+        documentation("1/multi")
+    )
+
+    subsets['1/multi/trec-2023'] = Dataset(
+        multi_docs,
+        ExctractedCCQueries(dlc['trec-2023/queries'], filter_lwq=False, cls=ExctractedCCMultiMtQuery, namespace=NAME),
+        TrecQrels(qrels2023, QREL_DEFS),
+        documentation("1/multi/trec-2023")
+    )
+
     ir_datasets.registry.register(NAME, base)
     for s in sorted(subsets):
         ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
 
     return base, subsets
 
 
+def _lazy_qids_set(qrels):
+    return Lazy(lambda: {qrel.query_id for qrel in qrels.qrels_iter()})
+
+
 base, subsets = _init()
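A sketch of what the new wiring provides, assuming the documents download successfully: FilteredQueries plus _lazy_qids_set restricts each language's topics to those that have judgments, and LangFilteredTrecQrels keeps only qrels whose iteration column matches the three-letter language code (lang3).

import ir_datasets

ds = ir_datasets.load('neuclir/1/fa/trec-2022')
judged = {qrel.query_id for qrel in ds.qrels_iter()}          # 'fas' rows only
assert all(q.query_id in judged for q in ds.queries_iter())   # mode='include'

multi = ir_datasets.load('neuclir/1/multi')                   # one docstore, three languages
print(multi.docs_count())                                     # == sum(DOC_COUNTS.values())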
14 changes: 14 additions & 0 deletions ir_datasets/docs/csl.yaml
@@ -0,0 +1,14 @@
_:
  pretty_name: 'CSL'
  desc: '
<p>
The CSL dataset, used for the TREC NeuCLIR technical document task.
</p>
'

trec-2023:
  desc: '
<p>
The TREC NeuCLIR 2023 technical document task.
</p>
'
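To see this YAML surface at runtime, a sketch using the documentation() accessor that YamlDocumentation provides to datasets elsewhere in the library (treat the exact return shape as an assumption):

import ir_datasets

meta = ir_datasets.load('csl').documentation()   # served from ir_datasets/docs/csl.yaml
print(meta.get('pretty_name'))                   # 'CSL'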
86 changes: 55 additions & 31 deletions ir_datasets/docs/neuclir.yaml
@@ -21,37 +21,6 @@ Users can conduct experiments on this collection with queries and qrels in HC4 f
 <li><a href="https://neuclir.github.io/">NeuCLIR Track Website</a></li>
 <li><a href="https://github.com/NeuCLIR/download-collection">Collection Repository</a></li>
 </ul>'
-  data_access: '
-<p>
-To access the docuemnts of this dataset, you will need to download the documents from Common Crawl.
-The script for downloading and validating the documents are in <a href="https://github.com/NeuCLIR/download-collection">NeuCLIR/download-collection </a>.
-Please use the following command to download the documents:
-</p>
-<code>
-git clone https://github.com/NeuCLIR/download-collection<br/>
-cd download-collection<br/>
-pip install -r requirements.txt<br/>
-python download_documents.py --storage ~/.ir_datasets/neuclir/1 \ <br/>
-    --zho ./resource/zho/ids.jsonl.gz \ <br/>
-    --fas ./resource/fas/ids.jsonl.gz \ <br/>
-    --rus ./resource/rus/ids.*.jsonl.gz \ <br/>
-    --jobs {number of process}<br/>
-</code>
-<p>
-After download, please also post-process the downloaded file to verify all and only specified documents are downloaded, and
-modify the ordering of the collection to match the original specified ordering in the id files.
-</p>
-<code>
-for lang in zho fas rus; do <br/>
-&nbsp;&nbsp;python fix_document_order.py --raw_download_file ~/.ir_datasets/neuclir/1/$lang/docs.jsonl \ <br/>
-&nbsp;&nbsp; --id_file ./resource/$lang/ids*.jsonl.gz \ <br/>
-&nbsp;&nbsp; --check_hash <br/>
-done
-</code>
-<p>
-You can also store the documents in other directory and create a soft link for <kbd>~/.ir_datasets/neuclir/22/</kbd>.
-</p>
-'
 
 1:
   desc: '
@@ -121,3 +90,58 @@ The 54 queries are the <a class="ds-ref">hc4/ru/dev</a> and <a class="ds-ref">hc
   docs_instructions: *inst
   bibtex_ids: ['Lawrie2022HC4']
 
+1/fa/trec-2022:
+  desc: '
+<p>
+Topics and assessments for TREC NeuCLIR 2022 (Persian-language CLIR).
+</p>
+'
+
+1/fa/trec-2023:
+  desc: '
+<p>
+Topics and assessments for TREC NeuCLIR 2023 (Persian-language CLIR).
+</p>
+'
+
+1/ru/trec-2022:
+  desc: '
+<p>
+Topics and assessments for TREC NeuCLIR 2022 (Russian-language CLIR).
+</p>
+'
+
+1/ru/trec-2023:
+  desc: '
+<p>
+Topics and assessments for TREC NeuCLIR 2023 (Russian-language CLIR).
+</p>
+'
+
+1/zh/trec-2022:
+  desc: '
+<p>
+Topics and assessments for TREC NeuCLIR 2022 (Chinese-language CLIR).
+</p>
+'
+
+1/zh/trec-2023:
+  desc: '
+<p>
+Topics and assessments for TREC NeuCLIR 2023 (Chinese-language CLIR).
+</p>
+'
+
+1/multi:
+  desc: '
+<p>
+The combined NeuCLIR v1 corpus, including all Persian, Russian, and Chinese documents.
+</p>
+'
+
+1/multi/trec-2023:
+  desc: '
+<p>
+Topics and assessments for the TREC NeuCLIR 2023 multilingual retrieval task.
+</p>
+'
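Each id documented above corresponds to a subset registered in neuclir.py; a small sanity-check sketch (loading alone does not trigger downloads):

import ir_datasets

for name in ['neuclir/1/fa/trec-2022', 'neuclir/1/zh/trec-2023',
             'neuclir/1/multi', 'neuclir/1/multi/trec-2023']:
    ds = ir_datasets.load(name)
    print(name, ds.has_docs(), ds.has_queries(), ds.has_qrels())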
72 changes: 64 additions & 8 deletions ir_datasets/etc/downloads.json
@@ -879,6 +879,29 @@
     }
   },
 
+  "csl": {
+    "docs": {
+      "url": "https://huggingface.co/datasets/neuclir/csl/resolve/main/data/csl.jsonl.gz?download=true",
+      "size_hint": 115749077,
+      "expected_md5": "4198f7b442187320e2351b3b473c1883",
+      "cache_path": "docs.jsonl.gz"
+    },
+    "trec-2023/queries": {
+      "url": "https://trec.nist.gov/data/neuclir/2023/neuclir-2023-technical_topics.0719.jsonl",
+      "size_hint": 86519,
+      "expected_md5": "0dd5ba173c695362a8705056edca481b",
+      "cache_path": "trec-2023/topics.jsonl",
+      "irds_mirror": true
+    },
+    "trec-2023/qrels": {
+      "url": "https://trec.nist.gov/data/neuclir/2023/neuclir-2023-qrels.final.tar.gz",
+      "size_hint": 6023886,
+      "expected_md5": "cea4ff3d9eba612c7119e6490217d4e1",
+      "cache_path": "trec-2023/qrels.tar.gz",
+      "irds_mirror": true
+    }
+  },
+
   "disks45": {
     "docs": {
       "instructions": "The TREC Robust document collection is from TREC disks 4 and 5. Due to the copyrighted nature of the documents, this collection is for research use only, which requires agreements to be filed with NIST. See details here: <https://trec.nist.gov/data/cd45/index.html>.\nMore details about the procedure can be found here: <https://ir-datasets.com/trec-robust04.html#DataAccess>.\nOnce completed, place the uncompressed source here: {path}\nThis should contain directories like NEWS_data/FBIS, NEWS_data/FR94, etc.",
@@ -5351,22 +5374,55 @@
     }
   },
 
+
   "neuclir": {
     "1/fa/docs": {
-      "instructions": "To use this dataset, you need to download the document files using the HC4 document download the post-processing script here: <https://github.com/NeuCLIR/download-collection>.\nTo proceed, symlink the source file here: {path}",
-      "cache_path": "1/fas/docs.jsonl"
+      "url": "https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/fas-00000-of-00001.jsonl.gz?download=true",
+      "size_hint": 2359094118,
+      "expected_md5": "c88f79f6b6da974db22cef3dd73fcee1",
+      "cache_path": "1/fas/docs.jsonl.gz"
     },
     "1/zh/docs": {
-      "instructions": "To use this dataset, you need to download the document files using the HC4 document download the post-processing script here: <https://github.com/NeuCLIR/download-collection>.\nTo proceed, symlink the source file here: {path}",
-      "cache_path": "1/zho/docs.jsonl"
+      "url": "https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/zho-00000-of-00001.jsonl.gz?download=true",
+      "size_hint": 3188072408,
+      "expected_md5": "99eb400f3a474603d1db5d41f606889b",
+      "cache_path": "1/zho/docs.jsonl.gz"
    },
     "1/ru/docs": {
-      "instructions": "To use this dataset, you need to download the document files using the HC4 document download the post-processing script here: <https://github.com/NeuCLIR/download-collection>.\nTo proceed, symlink the source file here: {path}",
-      "cache_path": "1/rus/docs.jsonl"
+      "url": "https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/rus-00000-of-00001.jsonl.gz?download=true",
+      "size_hint": 4504119267,
+      "expected_md5": "3aabc798a3b5dd92d7c47db9521870b1",
+      "cache_path": "1/rus/docs.jsonl.gz"
     },
+    "trec-2022/queries": {
+      "url": "https://trec.nist.gov/data/neuclir/2022/topics.0720.utf8.jsonl",
+      "size_hint": 662272,
+      "expected_md5": "264bf244f798670f063f32ff57ba6135",
+      "cache_path": "trec-2022/topics.jsonl",
+      "irds_mirror": true
+    },
+    "trec-2022/qrels": {
+      "url": "https://trec.nist.gov/data/neuclir/2022/2022-qrels.all",
+      "size_hint": 4785668,
+      "expected_md5": "8dc1aecf13fbe358eea74ade7496b085",
+      "cache_path": "trec-2022/qrels",
+      "irds_mirror": true
+    },
+    "trec-2023/queries": {
+      "url": "https://trec.nist.gov/data/neuclir/2023/neuclir-2023-topics.0605.jsonl",
+      "size_hint": 683779,
+      "expected_md5": "3dbb41b02bfbd719d8b55632d9b15b83",
+      "cache_path": "trec-2023/topics.jsonl",
+      "irds_mirror": true
+    },
+    "trec-2023/qrels": {
+      "url": "https://trec.nist.gov/data/neuclir/2023/neuclir-2023-qrels.final.tar.gz",
+      "size_hint": 6023886,
+      "expected_md5": "cea4ff3d9eba612c7119e6490217d4e1",
+      "cache_path": "trec-2023/qrels.tar.gz",
+      "irds_mirror": true
+    }
   },
 
   "nyt": {
     "source": {
       "instructions": "The New York Times Annotated Corpus. It is available from the LDC via: <https://catalog.ldc.upenn.edu/LDC2008T19>.\nMore details about the procedure can be found here: <https://ir-datasets.com/nyt.html#DataAccess>.\nTo proceed, symlink the source file here: {path}",
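These entries are the contract ir_datasets checks when fetching each file. A minimal sketch of that validation, with illustrative names (entry, verify) that are not part of the library:

import hashlib

entry = {  # the csl 'docs' entry from above
    "size_hint": 115749077,
    "expected_md5": "4198f7b442187320e2351b3b473c1883",
}

def verify(path, entry):
    md5, size = hashlib.md5(), 0
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            md5.update(chunk)
            size += len(chunk)
    assert size == entry["size_hint"], f"size mismatch: {size}"
    assert md5.hexdigest() == entry["expected_md5"], "md5 mismatch"

verify('docs.jsonl.gz', entry)  # path under ~/.ir_datasets/csl/ after download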