-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataloader.py
92 lines (69 loc) · 2.25 KB
/
dataloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""Useful functions to load data from json files"""
import json
from typing import Dict, List, Tuple
def load_data(dataset_path: str) -> Tuple[List[str], List[Dict]]:
    """
    Load both contexts and questions from a json file in SQuAD format.

    Parameters
    ----------
    dataset_path : str
        path to the json file containing the dataset

    Returns
    -------
    contexts, questions: Tuple[List[str], List[Dict]]
        the contexts and questions in the dataset.
        The questions are dictionaries such as {'question': '...', 'context_id': 0},
        where 'context_id' is the index in ``contexts`` of the paragraph
        the question belongs to.

    Raises
    ------
    KeyError
        if an expected SQuAD key ("data", "paragraphs", "context", "qas",
        "question") is missing from the file.
    """
    # The file is read as bytes; json.load decodes the payload itself.
    with open(dataset_path, "rb") as file:
        data = json.load(file)

    # Flatten articles -> paragraphs lazily so that enumerate() hands out one
    # running index per paragraph across the whole dataset.
    paragraphs = (
        paragraph
        for article in data["data"]
        for paragraph in article["paragraphs"]
    )

    contexts = []
    questions = []
    for context_id, paragraph in enumerate(paragraphs):
        contexts.append(paragraph["context"])
        # Every question of a paragraph shares that paragraph's context index.
        questions.extend(
            {"question": qa["question"], "context_id": context_id}
            for qa in paragraph["qas"]
        )
    return contexts, questions
def load_contexts(dataset_path: str) -> List[str]:
    """
    Load contexts from a json file in SQuAD format.

    Parameters
    ----------
    dataset_path : str
        path to the json file containing the dataset

    Returns
    -------
    contexts: List[str]
        the contexts in the dataset, one per paragraph, in file order.

    Raises
    ------
    KeyError
        if an expected SQuAD key ("data", "paragraphs", "context") is
        missing from the file.
    """
    # The file is read as bytes; json.load decodes the payload itself.
    with open(dataset_path, "rb") as file:
        data = json.load(file)

    # Flatten articles -> paragraphs and keep only each paragraph's text.
    return [
        paragraph["context"]
        for article in data["data"]
        for paragraph in article["paragraphs"]
    ]
def load_questions(dataset_path: str) -> List[Dict]:
    """
    Load questions from a json file in SQuAD format.

    Parameters
    ----------
    dataset_path : str
        path to the json file containing the dataset

    Returns
    -------
    questions: List[Dict]
        the questions in the dataset. The questions are dictionaries such as
        {'question': '...', 'context_id': 0}, where 'context_id' is the
        running index of the paragraph the question belongs to, counted
        across all articles in file order. Paragraphs without questions
        still consume an index.

    Raises
    ------
    KeyError
        if an expected SQuAD key ("data", "paragraphs", "qas", "question")
        is missing from the file.
    """
    # The file is read as bytes; json.load decodes the payload itself.
    with open(dataset_path, "rb") as file:
        data = json.load(file)

    # Flatten articles -> paragraphs lazily so that enumerate() hands out one
    # running index per paragraph across the whole dataset.
    paragraphs = (
        paragraph
        for article in data["data"]
        for paragraph in article["paragraphs"]
    )
    return [
        {"question": qa["question"], "context_id": context_id}
        for context_id, paragraph in enumerate(paragraphs)
        for qa in paragraph["qas"]
    ]