Merge pull request #258 from allenai/neuclir23

neuclir23

seanmacavaney authored Apr 15, 2024
2 parents 68df906 + 8ac78fe commit c4b9b98
Showing 11 changed files with 512 additions and 78 deletions.
1 change: 1 addition & 0 deletions ir_datasets/datasets/__init__.py
@@ -13,6 +13,7 @@
 from . import codec
 from . import cord19
 from . import cranfield
+from . import csl
 from . import disks45
 from . import dpr_w100
 from . import codesearchnet
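Importing the module is what makes the new datasets visible: a minimal sketch of this one-line change's side effect, assuming the rest of the PR (csl.py below) is applied.

import ir_datasets

# `from . import csl` runs csl._init() at import time, which registers
# 'csl' and 'csl/trec-2023' in the global registry; they are then loadable:
dataset = ir_datasets.load('csl/trec-2023')
print(dataset)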
60 changes: 60 additions & 0 deletions ir_datasets/datasets/csl.py
@@ -0,0 +1,60 @@
from typing import List, NamedTuple
from enum import Enum

import ir_datasets
from ir_datasets.util import DownloadConfig, GzipExtract
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats.trec import TrecQrels

from ir_datasets.formats import JsonlDocs, ExctractedCCQueries, ExctractedCCNoReportQuery

from ir_datasets.util.fileio import TarExtract

NAME = 'csl'

class CslDoc(NamedTuple):
    doc_id: str
    title: str
    abstract: str
    keywords: List[str]
    category: str
    category_eng: str
    discipline: str
    discipline_eng: str
    def default_text(self):
        return f'{self.title}\n{self.abstract}'

QREL_DEFS = {
    3: 'Very-valuable. Information in the document would be found in the lead paragraph of a report that is later written on the topic.',
    1: 'Somewhat-valuable. The most valuable information in the document would be found in the remainder of such a report.',
    0: 'Not-valuable. Information in the document might be included in a report footnote, or omitted entirely.',
}


def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    docs = JsonlDocs(GzipExtract(dlc['docs']), doc_cls=CslDoc, namespace=NAME, lang='zh', count_hint=395927)
    base = Dataset(
        docs,
        documentation('_')
    )

    subsets["trec-2023"] = Dataset(
        docs,
        ExctractedCCQueries(dlc['trec-2023/queries'], subset_lang='zh', filter_lwq=False, cls=ExctractedCCNoReportQuery, namespace=NAME),
        TrecQrels(TarExtract(dlc['trec-2023/qrels'], 'tech_final_qrels.txt'), QREL_DEFS),
        documentation('trec-2023'),
    )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets


base, subsets = _init()
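For context, a sketch of how the new dataset is consumed through the public ir_datasets API. Document fields follow CslDoc above and qrels are graded per QREL_DEFS; this assumes the downloads declared in ir_datasets/etc/downloads.json (below) succeed on first use.

import ir_datasets

csl = ir_datasets.load('csl')
doc = next(iter(csl.docs_iter()))          # CslDoc instances
print(doc.doc_id, doc.title, doc.discipline_eng)
print(doc.default_text())                  # title + abstract, per default_text()

trec23 = ir_datasets.load('csl/trec-2023')
for qrel in trec23.qrels_iter():           # relevance is 0, 1, or 3 (QREL_DEFS)
    print(qrel.query_id, qrel.doc_id, qrel.relevance)
    break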
63 changes: 55 additions & 8 deletions ir_datasets/datasets/neuclir.py
@@ -3,14 +3,14 @@
 from functools import lru_cache
 
 import ir_datasets
-from ir_datasets.util import DownloadConfig
-from ir_datasets.datasets.base import Dataset, YamlDocumentation
+from ir_datasets.util import DownloadConfig, Lazy
+from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries
 from ir_datasets.formats.trec import TrecQrels
 
-from ir_datasets.formats import ExctractedCCDocs, ExctractedCCQueries
+from ir_datasets.formats import ExctractedCCDocs, ExctractedCCQueries, ExctractedCCNoReportQuery, ExctractedCCNoReportNoHtNarQuery, ExctractedCCMultiMtQuery
 
 from ir_datasets.datasets.hc4 import NAME as HC4_NAME
-from ir_datasets.util.fileio import GzipExtract
+from ir_datasets.util.fileio import GzipExtract, TarExtract
 
 NAME = 'neuclir'

@@ -57,6 +57,17 @@ def qrels_iter(self):
             yield qrel
 
 
+class LangFilteredTrecQrels(TrecQrels):
+    def __init__(self, qrels_dlc, qrels_defs, lang, format_3col=False):
+        super().__init__(qrels_dlc, qrels_defs, format_3col)
+        self._lang = lang
+
+    def qrels_iter(self):
+        for qrel in super().qrels_iter():
+            if qrel.iteration == self._lang:
+                yield qrel
+
+
 QREL_DEFS = {
     3: 'Very-valuable. Information in the document would be found in the lead paragraph of a report that is later written on the topic.',
     1: 'Somewhat-valuable. The most valuable information in the document would be found in the remainder of such a report.',
@@ -73,25 +84,61 @@ def _init():
     base = Dataset(documentation('_')) # dummy top level ds
     subsets["1"] = Dataset(documentation('1')) # dummy year level ds
 
+    qrels2022 = dlc['trec-2022/qrels']
+    qrels2023 = TarExtract(dlc['trec-2023/qrels'], 'qrels.final')
+
     # For NeuCLIR Collection 1
     for lang in ['zh', 'fa', 'ru']:
-        lang_docs = ExctractedCCDocs(dlc[f'1/{lang}/docs'], subset_lang=lang, namespace=NAME, count=DOC_COUNTS[lang])
+        lang3 = {'fa': 'fas', 'zh': 'zho', 'ru': 'rus'}[lang]
+        lang_docs = ExctractedCCDocs(GzipExtract(dlc[f'1/{lang}/docs']), subset_lang=lang, namespace=NAME, count=DOC_COUNTS[lang])
         subsets[f"1/{lang}"] = Dataset(
             lang_docs,
             documentation(f"1/{lang}")
         )
+        qrels = LangFilteredTrecQrels(qrels2022, QREL_DEFS, lang3)
+        subsets[f"1/{lang}/trec-2022"] = Dataset(
+            lang_docs,
+            FilteredQueries(ExctractedCCQueries(dlc['trec-2022/queries'], subset_lang=lang, filter_lwq=False, cls=ExctractedCCNoReportQuery, namespace=NAME), _lazy_qids_set(qrels), mode='include'),
+            qrels,
+            documentation(f"1/{lang}/trec-2022"),
+        )
+        qrels = LangFilteredTrecQrels(qrels2023, QREL_DEFS, lang3)
+        subsets[f"1/{lang}/trec-2023"] = Dataset(
+            lang_docs,
+            FilteredQueries(ExctractedCCQueries(dlc['trec-2023/queries'], subset_lang=lang, filter_lwq=False, cls=ExctractedCCNoReportNoHtNarQuery, namespace=NAME), _lazy_qids_set(qrels), mode='include'),
+            qrels,
+            documentation(f"1/{lang}/trec-2023"),
+        )
         include_doc_id_dlc = hc4_dlc[f'{lang}/docs/ids'] if lang != 'ru' else tuple([ hc4_dlc[f'{lang}/docs/ids/{i}'] for i in range(8) ])
         subsets[f"1/{lang}/hc4-filtered"] = Dataset(
-            FilteredExctractedCCDocs(dlc[f'1/{lang}/docs'], subset_lang=lang, namespace=NAME, include_doc_id_dlc=include_doc_id_dlc),
-            ExctractedCCQueries([hc4_dlc[f'dev/topics'], hc4_dlc[f'test/topics']], subset_lang=lang, namespace=NAME),
+            FilteredExctractedCCDocs(GzipExtract(dlc[f'1/{lang}/docs']), subset_lang=lang, namespace=NAME, include_doc_id_dlc=include_doc_id_dlc),
+            ExctractedCCQueries([hc4_dlc['dev/topics'], hc4_dlc['test/topics']], subset_lang=lang, namespace=NAME),
             FilteredTrecQrels([ hc4_dlc[f'{lang}/dev/qrels'], hc4_dlc[f'{lang}/test/qrels'] ], QREL_DEFS, include_doc_id_dlc=include_doc_id_dlc),
             documentation(f"1/{lang}/hc4-filtered")
         )
 
+
+    multi_docs = ExctractedCCDocs([GzipExtract(dlc[f'1/{lang}/docs']) for lang in ['zh', 'fa', 'ru']], namespace=NAME, count=sum(DOC_COUNTS.values()), docstore_path=base_path/'1'/'multi')
+    subsets['1/multi'] = Dataset(
+        multi_docs,
+        documentation("1/multi")
+    )
+
+    subsets['1/multi/trec-2023'] = Dataset(
+        multi_docs,
+        ExctractedCCQueries(dlc['trec-2023/queries'], filter_lwq=False, cls=ExctractedCCMultiMtQuery, namespace=NAME),
+        TrecQrels(qrels2023, QREL_DEFS),
+        documentation("1/multi/trec-2023")
+    )
+
     ir_datasets.registry.register(NAME, base)
     for s in sorted(subsets):
         ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
 
     return base, subsets
 
 
+def _lazy_qids_set(qrels):
+    return Lazy(lambda: {qrel.query_id for qrel in qrels.qrels_iter()})
+
+
 base, subsets = _init()
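A sketch of what the new wiring provides, assuming the documents download successfully: FilteredQueries plus _lazy_qids_set restricts each language's topics to those that have judgments, and LangFilteredTrecQrels keeps only qrels whose iteration column matches the three-letter language code (lang3).

import ir_datasets

ds = ir_datasets.load('neuclir/1/fa/trec-2022')
judged = {qrel.query_id for qrel in ds.qrels_iter()}          # 'fas' rows only
assert all(q.query_id in judged for q in ds.queries_iter())   # mode='include'

multi = ir_datasets.load('neuclir/1/multi')                   # one docstore, three languages
print(multi.docs_count())                                     # == sum(DOC_COUNTS.values())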
14 changes: 14 additions & 0 deletions ir_datasets/docs/csl.yaml
@@ -0,0 +1,14 @@
_:
  pretty_name: 'CSL'
  desc: '
<p>
The CSL dataset, used for the TREC NeuCLIR technical document task.
</p>
'

trec-2023:
  desc: '
<p>
The TREC NeuCLIR 2023 technical document task.
</p>
'
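To see this YAML surface at runtime, a sketch using the documentation() accessor that YamlDocumentation provides to datasets elsewhere in the library (treat the exact return shape as an assumption):

import ir_datasets

meta = ir_datasets.load('csl').documentation()   # served from ir_datasets/docs/csl.yaml
print(meta.get('pretty_name'))                   # 'CSL'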
86 changes: 55 additions & 31 deletions ir_datasets/docs/neuclir.yaml
@@ -21,37 +21,6 @@ Users can conduct experiments on this collection with queries and qrels in HC4 f
 <li><a href="https://neuclir.github.io/">NeuCLIR Track Website</a></li>
 <li><a href="https://github.com/NeuCLIR/download-collection">Collection Repository</a></li>
 </ul>'
-  data_access: '
-<p>
-To access the docuemnts of this dataset, you will need to download the documents from Common Crawl.
-The script for downloading and validating the documents are in <a href="https://github.com/NeuCLIR/download-collection">NeuCLIR/download-collection </a>.
-Please use the following command to download the documents:
-</p>
-<code>
-git clone https://github.com/NeuCLIR/download-collection<br/>
-cd download-collection<br/>
-pip install -r requirements.txt<br/>
-python download_documents.py --storage ~/.ir_datasets/neuclir/1 \ <br/>
-    --zho ./resource/zho/ids.jsonl.gz \ <br/>
-    --fas ./resource/fas/ids.jsonl.gz \ <br/>
-    --rus ./resource/rus/ids.*.jsonl.gz \ <br/>
-    --jobs {number of process}<br/>
-</code>
-<p>
-After download, please also post-process the downloaded file to verify all and only specified documents are downloaded, and
-modify the ordering of the collection to match the original specified ordering in the id files.
-</p>
-<code>
-for lang in zho fas rus; do <br/>
-&nbsp;&nbsp;python fix_document_order.py --raw_download_file ~/.ir_datasets/neuclir/1/$lang/docs.jsonl \ <br/>
-&nbsp;&nbsp; --id_file ./resource/$lang/ids*.jsonl.gz \ <br/>
-&nbsp;&nbsp; --check_hash <br/>
-done
-</code>
-<p>
-You can also store the documents in other directory and create a soft link for <kbd>~/.ir_datasets/neuclir/22/</kbd>.
-</p>
-'
 
 1:
   desc: '
@@ -121,3 +90,58 @@ The 54 queries are the <a class="ds-ref">hc4/ru/dev</a> and <a class="ds-ref">hc
   docs_instructions: *inst
   bibtex_ids: ['Lawrie2022HC4']
 
+1/fa/trec-2022:
+  desc: '
+<p>
+Topics and assessments for TREC NeuCLIR 2022 (Persian-language CLIR).
+</p>
+'
+
+1/fa/trec-2023:
+  desc: '
+<p>
+Topics and assessments for TREC NeuCLIR 2023 (Persian-language CLIR).
+</p>
+'
+
+1/ru/trec-2022:
+  desc: '
+<p>
+Topics and assessments for TREC NeuCLIR 2022 (Russian-language CLIR).
+</p>
+'
+
+1/ru/trec-2023:
+  desc: '
+<p>
+Topics and assessments for TREC NeuCLIR 2023 (Russian-language CLIR).
+</p>
+'
+
+1/zh/trec-2022:
+  desc: '
+<p>
+Topics and assessments for TREC NeuCLIR 2022 (Chinese-language CLIR).
+</p>
+'
+
+1/zh/trec-2023:
+  desc: '
+<p>
+Topics and assessments for TREC NeuCLIR 2023 (Chinese-language CLIR).
+</p>
+'
+
+1/multi:
+  desc: '
+<p>
+The combined NeuCLIR v1 corpus, including all Persian, Russian, and Chinese documents.
+</p>
+'
+
+1/multi/trec-2023:
+  desc: '
+<p>
+Topics and assessments for the TREC NeuCLIR 2023 multilingual retrieval task.
+</p>
+'
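Each id documented above corresponds to a subset registered in neuclir.py; a small sanity-check sketch (loading alone does not trigger downloads):

import ir_datasets

for name in ['neuclir/1/fa/trec-2022', 'neuclir/1/zh/trec-2023',
             'neuclir/1/multi', 'neuclir/1/multi/trec-2023']:
    ds = ir_datasets.load(name)
    print(name, ds.has_docs(), ds.has_queries(), ds.has_qrels())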
72 changes: 64 additions & 8 deletions ir_datasets/etc/downloads.json
@@ -879,6 +879,29 @@
     }
   },
 
+  "csl": {
+    "docs": {
+      "url": "https://huggingface.co/datasets/neuclir/csl/resolve/main/data/csl.jsonl.gz?download=true",
+      "size_hint": 115749077,
+      "expected_md5": "4198f7b442187320e2351b3b473c1883",
+      "cache_path": "docs.jsonl.gz"
+    },
+    "trec-2023/queries": {
+      "url": "https://trec.nist.gov/data/neuclir/2023/neuclir-2023-technical_topics.0719.jsonl",
+      "size_hint": 86519,
+      "expected_md5": "0dd5ba173c695362a8705056edca481b",
+      "cache_path": "trec-2023/topics.jsonl",
+      "irds_mirror": true
+    },
+    "trec-2023/qrels": {
+      "url": "https://trec.nist.gov/data/neuclir/2023/neuclir-2023-qrels.final.tar.gz",
+      "size_hint": 6023886,
+      "expected_md5": "cea4ff3d9eba612c7119e6490217d4e1",
+      "cache_path": "trec-2023/qrels.tar.gz",
+      "irds_mirror": true
+    }
+  },
+
   "disks45": {
     "docs": {
       "instructions": "The TREC Robust document collection is from TREC disks 4 and 5. Due to the copyrighted nature of the documents, this collection is for research use only, which requires agreements to be filed with NIST. See details here: <https://trec.nist.gov/data/cd45/index.html>.\nMore details about the procedure can be found here: <https://ir-datasets.com/trec-robust04.html#DataAccess>.\nOnce completed, place the uncompressed source here: {path}\nThis should contain directories like NEWS_data/FBIS, NEWS_data/FR94, etc.",
@@ -5351,22 +5374,55 @@
     }
   },
 
+
   "neuclir": {
     "1/fa/docs": {
-      "instructions": "To use this dataset, you need to download the document files using the HC4 document download the post-processing script here: <https://github.com/NeuCLIR/download-collection>.\nTo proceed, symlink the source file here: {path}",
-      "cache_path": "1/fas/docs.jsonl"
+      "url": "https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/fas-00000-of-00001.jsonl.gz?download=true",
+      "size_hint": 2359094118,
+      "expected_md5": "c88f79f6b6da974db22cef3dd73fcee1",
+      "cache_path": "1/fas/docs.jsonl.gz"
     },
     "1/zh/docs": {
-      "instructions": "To use this dataset, you need to download the document files using the HC4 document download the post-processing script here: <https://github.com/NeuCLIR/download-collection>.\nTo proceed, symlink the source file here: {path}",
-      "cache_path": "1/zho/docs.jsonl"
+      "url": "https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/zho-00000-of-00001.jsonl.gz?download=true",
+      "size_hint": 3188072408,
+      "expected_md5": "99eb400f3a474603d1db5d41f606889b",
+      "cache_path": "1/zho/docs.jsonl.gz"
    },
     "1/ru/docs": {
-      "instructions": "To use this dataset, you need to download the document files using the HC4 document download the post-processing script here: <https://github.com/NeuCLIR/download-collection>.\nTo proceed, symlink the source file here: {path}",
-      "cache_path": "1/rus/docs.jsonl"
+      "url": "https://huggingface.co/datasets/neuclir/neuclir1/resolve/main/data/rus-00000-of-00001.jsonl.gz?download=true",
+      "size_hint": 4504119267,
+      "expected_md5": "3aabc798a3b5dd92d7c47db9521870b1",
+      "cache_path": "1/rus/docs.jsonl.gz"
     },
+    "trec-2022/queries": {
+      "url": "https://trec.nist.gov/data/neuclir/2022/topics.0720.utf8.jsonl",
+      "size_hint": 662272,
+      "expected_md5": "264bf244f798670f063f32ff57ba6135",
+      "cache_path": "trec-2022/topics.jsonl",
+      "irds_mirror": true
+    },
+    "trec-2022/qrels": {
+      "url": "https://trec.nist.gov/data/neuclir/2022/2022-qrels.all",
+      "size_hint": 4785668,
+      "expected_md5": "8dc1aecf13fbe358eea74ade7496b085",
+      "cache_path": "trec-2022/qrels",
+      "irds_mirror": true
+    },
+    "trec-2023/queries": {
+      "url": "https://trec.nist.gov/data/neuclir/2023/neuclir-2023-topics.0605.jsonl",
+      "size_hint": 683779,
+      "expected_md5": "3dbb41b02bfbd719d8b55632d9b15b83",
+      "cache_path": "trec-2023/topics.jsonl",
+      "irds_mirror": true
+    },
+    "trec-2023/qrels": {
+      "url": "https://trec.nist.gov/data/neuclir/2023/neuclir-2023-qrels.final.tar.gz",
+      "size_hint": 6023886,
+      "expected_md5": "cea4ff3d9eba612c7119e6490217d4e1",
+      "cache_path": "trec-2023/qrels.tar.gz",
+      "irds_mirror": true
+    }
   },
 
   "nyt": {
     "source": {
       "instructions": "The New York Times Annotated Corpus. It is available from the LDC via: <https://catalog.ldc.upenn.edu/LDC2008T19>.\nMore details about the procedure can be found here: <https://ir-datasets.com/nyt.html#DataAccess>.\nTo proceed, symlink the source file here: {path}",
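These entries are the contract ir_datasets checks when fetching each file. A minimal sketch of that validation, with illustrative names (entry, verify) that are not part of the library:

import hashlib

entry = {  # the csl 'docs' entry from above
    "size_hint": 115749077,
    "expected_md5": "4198f7b442187320e2351b3b473c1883",
}

def verify(path, entry):
    md5, size = hashlib.md5(), 0
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            md5.update(chunk)
            size += len(chunk)
    assert size == entry["size_hint"], f"size mismatch: {size}"
    assert md5.hexdigest() == entry["expected_md5"], "md5 mismatch"

verify('docs.jsonl.gz', entry)  # path under ~/.ir_datasets/csl/ after download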