-
Notifications
You must be signed in to change notification settings - Fork 8
/
conftest.py
220 lines (182 loc) · 7.08 KB
/
conftest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import os
import jwt
import pytest
from hydra import compose, initialize_config_dir
from hydra.utils import instantiate
from omegaconf import DictConfig
from kazu.data import Document, LinkingMetrics, CandidatesToMetrics
from kazu.annotation.label_studio import (
LabelStudioManager,
)
from kazu.database.in_memory_db import SynonymDatabase
from kazu.ontology_preprocessing.base import (
IDX,
DEFAULT_LABEL,
SYN,
MAPPING_TYPE,
OntologyParser,
)
from kazu.tests.utils import CONFIG_DIR, DummyParser
from kazu.utils.constants import HYDRA_VERSION_BASE
from kazu.web.server import start, stop
from kazu.utils.caching import kazu_disk_cache
from kazu.steps.linking.post_processing.disambiguation.context_scoring import (
TfIdfScorer,
GildaTfIdfScorer,
)
from kazu.utils.utils import Singleton
from kazu.steps.joint_ner_and_linking.memory_efficient_string_matching import (
MemoryEfficientStringMatchingStep,
)
@pytest.fixture(scope="session")
def override_kazu_test_config():
    """Provide a factory that composes the kazu test config with Hydra overrides."""

    def _compose_test_config(overrides: list[str]) -> DictConfig:
        """Compose the ``config`` Hydra config found in ``CONFIG_DIR``.

        :param overrides: override strings passed straight through to ``compose``.
        :return: the composed config.
        """
        # initialize_config_dir needs a str, it can't take a Path
        config_dir = str(CONFIG_DIR)
        with initialize_config_dir(version_base=HYDRA_VERSION_BASE, config_dir=config_dir):
            return compose(config_name="config", overrides=overrides)

    return _compose_test_config
@pytest.fixture(scope="session")
def kazu_test_config(override_kazu_test_config):
    """Return the kazu test config composed without any overrides."""
    no_overrides: list[str] = []
    return override_kazu_test_config(overrides=no_overrides)
# Pairs of (document text, label) used to build the simple NER test documents.
# NOTE(review): the second element looks like the expected entity class for the
# text - confirm against the tests that consume it.
_SIMPLE_TEST_CASE_DATA = [
    ("EGFR is a gene", "gene"),
    ("CAT1 is a gene", "gene"),
    ("my cat sat on the mat", "species"),
    ("For the treatment of anorexia nervosa.", "disease"),
]
@pytest.fixture(scope="function")
def ner_simple_test_cases() -> list[Document]:
    """Build one simple Document per entry of ``_SIMPLE_TEST_CASE_DATA``.

    Only the first element (the text) of each entry is used. The fixture is
    function-scoped because Documents can be mutated, so every test gets fresh ones.
    """
    return [
        Document.create_simple_document(text) for text, *_ in _SIMPLE_TEST_CASE_DATA
    ]
@pytest.fixture(scope="session")
def set_up_p27_test_case() -> tuple[CandidatesToMetrics, DummyParser]:
    """Populate a dummy ontology in which three ids all carry the synonym "p27".

    :return: a dict mapping each value returned by
        ``SynonymDatabase().get_all(parser.name)`` to a fresh ``LinkingMetrics``,
        together with the parser that populated the database.
    """
    # one (id, default label, synonyms) record per entity
    entities = [
        ("1", "CDKN1B", ["cyclin-dependent kinase inhibitor 1B (p27, Kip1)", "CDKN1B", "p27"]),
        ("2", "PAK2", ["PAK-2p27", "p27", "PAK2"]),
        ("3", "ZNRD2", ["Autoantigen p27", "ZNRD2", "p27"]),
    ]

    # flatten the records into the parallel columns the parser expects
    ids: list[str] = []
    labels: list[str] = []
    synonyms: list[str] = []
    for entity_id, label, entity_synonyms in entities:
        for synonym in entity_synonyms:
            ids.append(entity_id)
            labels.append(label)
            synonyms.append(synonym)

    dummy_data = {
        IDX: ids,
        DEFAULT_LABEL: labels,
        SYN: synonyms,
        MAPPING_TYPE: [""] * len(synonyms),
    }

    parser = DummyParser(data=dummy_data, name="test_tfidf_parser", source="test_tfidf_parser")
    parser.populate_databases()

    stored = SynonymDatabase().get_all(parser.name)
    candidates_with_metrics = {candidate: LinkingMetrics() for candidate in stored.values()}
    return candidates_with_metrics, parser
@pytest.fixture(scope="session")
def make_label_studio_manager():
    """Provide a factory that builds ``LabelStudioManager`` instances.

    Being a 'factory fixture', it lets tests share this set-up code while still
    asking for a manager that differs only in its project name.

    The default project name is read from ``LS_PROJECT_NAME`` when the fixture is
    set up; ``LS_URL_PORT`` and ``LS_TOKEN`` are read on every factory call.
    """

    def _make_label_studio_manager(
        project_name: str = os.environ["LS_PROJECT_NAME"],
    ) -> LabelStudioManager:
        url = os.environ["LS_URL_PORT"]
        token = os.environ["LS_TOKEN"]
        return LabelStudioManager(
            project_name=project_name,
            headers={
                "Authorization": f"Token {token}",
                "Content-Type": "application/json",
            },
            url=url,
        )

    return _make_label_studio_manager
@pytest.fixture(scope="module", params=[False, True])
def ray_server(override_kazu_test_config, request):
    """Start the kazu web server for a test module and yield request headers.

    The fixture is parametrised: ``False`` starts the server with no overrides and
    yields empty headers, ``True`` starts it with the ``Middlewares=jwt`` override
    and yields an ``Authorization: Bearer <token>`` header.

    The server is stopped on teardown, and ``KAZU_JWT_KEY`` is put back to the
    value it had before the fixture ran so the JWT run does not leak into the
    environment of later tests.
    """
    # clear any residual singleton info, as ray runs separate processes and
    # hanging resources can cause OOM
    Singleton.clear_all()

    jwt_env_var = "KAZU_JWT_KEY"
    use_jwt = request.param
    # remembered so the environment can be restored on teardown
    previous_jwt_key = os.environ.get(jwt_env_var)
    try:
        if use_jwt:
            # we want jwt auth; the key must be in the environment before start()
            jwt_key = "this secret key is not secret"
            os.environ[jwt_env_var] = jwt_key
            token = jwt.encode({"username": "user"}, jwt_key, algorithm="HS256")
            headers = {"Authorization": f"Bearer {token}"}
            overrides = ["Middlewares=jwt"]
        else:
            headers = {}
            overrides = []

        cfg = override_kazu_test_config(overrides=overrides)
        start(cfg)
        yield headers
        stop()
    finally:
        # only the jwt run touched the environment, so only it needs undoing
        if use_jwt:
            if previous_jwt_key is None:
                os.environ.pop(jwt_env_var, None)
            else:
                os.environ[jwt_env_var] = previous_jwt_key
@pytest.fixture(scope="function")
def mock_kazu_disk_cache_on_parsers(monkeypatch):
    """Switch off the caching functions on OntologyParsers for the calling test.

    Since diskcache is employed in a slightly unusual way, some python tricks are
    needed: each memoized method is swapped for its ``__wrapped__`` function, that
    function is given a no-op ``__cache_key__``, and ``kazu_disk_cache.delete`` is
    replaced with a no-op as well.
    """

    def do_nothing(*args, **kwargs):
        return

    # the memoized methods whose caching should be bypassed
    memoized = (
        OntologyParser._populate_databases,
        OntologyParser._export_metadata,
        OntologyParser._export_linking_candidates,
    )
    # ...paired with the function each one wraps.
    # type ignore needed because it would be a pain to try and tell mypy that
    # these are all 'wrapped' decorated functions so they have this attribute.
    unwrapped = [(cached, cached.__wrapped__) for cached in memoized]  # type: ignore[attr-defined]

    for cached, plain in unwrapped:
        # make the __cache_key__ a no-op
        plain.__cache_key__ = do_nothing
        # expose the plain function under the memoized method's name
        monkeypatch.setattr(OntologyParser, cached.__name__, plain)  # type: ignore[attr-defined] # doesn't know it will have __name__

    # also prevent the original cache from deleting anything
    monkeypatch.setattr(kazu_disk_cache, "delete", do_nothing)

    # run the calling test
    yield

    # remove the no-op __cache_key__ again once the test is over; this raises
    # AttributeError if something else already removed it
    for _, plain in unwrapped:
        del plain.__cache_key__
@pytest.fixture(scope="function")
def mock_build_vectoriser_cache(monkeypatch):
    """Patch ``TfIdfScorer.build_vectorizers`` with the function it wraps.

    NOTE(review): presumably this bypasses a caching decorator - confirm against
    ``TfIdfScorer``.
    """
    # type ignore because mypy doesn't know this function is 'wrapped'
    undecorated = TfIdfScorer.build_vectorizers.__wrapped__  # type: ignore[attr-defined]
    monkeypatch.setattr(TfIdfScorer, "build_vectorizers", undecorated)
@pytest.fixture(scope="function")
def mock_build_fast_string_matcher_cache(monkeypatch):
    """Patch ``MemoryEfficientStringMatchingStep._create_automaton`` with the function it wraps.

    NOTE(review): presumably this bypasses a caching decorator - confirm against
    ``MemoryEfficientStringMatchingStep``.
    """
    # type ignore because mypy doesn't know this function is 'wrapped'
    undecorated = MemoryEfficientStringMatchingStep._create_automaton.__wrapped__  # type: ignore[attr-defined]
    monkeypatch.setattr(MemoryEfficientStringMatchingStep, "_create_automaton", undecorated)
@pytest.fixture(scope="function")
def mock_build_gilda_vectoriser_cache(monkeypatch):
    """Patch ``GildaTfIdfScorer._calculate_id_vectors`` with the function it wraps.

    NOTE(review): presumably this bypasses a caching decorator - confirm against
    ``GildaTfIdfScorer``.
    """
    # type ignore because mypy doesn't know this function is 'wrapped'
    undecorated = GildaTfIdfScorer._calculate_id_vectors.__wrapped__  # type: ignore[attr-defined]
    monkeypatch.setattr(GildaTfIdfScorer, "_calculate_id_vectors", undecorated)
@pytest.fixture(scope="class")
def gliner_step(kazu_test_config):
    """Yield the object instantiated from the ``GLiNERStep`` entry of the test config."""
    step = instantiate(kazu_test_config.GLiNERStep)
    yield step