Skip to content

Commit

Permalink
feat(model, pyproject): develop reading test data from yaml
Browse files Browse the repository at this point in the history
  • Loading branch information
Iamhexi committed Oct 6, 2024
1 parent 36dcd3d commit a6b153e
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 18 deletions.
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ nltk = "^3.9.1"
rich = "^13.8.1"
tqdm = "^4.66.5"
sentence-transformers = "^3.1.1"
pyyaml = "^6.0.2"

[tool.poetry.group.test]

Expand Down
26 changes: 11 additions & 15 deletions tests/model/qg_experiment.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Module with performance experiments of Question Generation module."""

from pathlib import Path
import yaml # type: ignore[import-untyped]
import numpy as np
from sentence_transformers import SentenceTransformer

Expand All @@ -15,28 +17,22 @@ def measure_qg_performance_with_cosine_similarity() -> Result:
"""
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

test_data = [
{
'question': 'What color is the sky during the day?',
'context': 'During the day, the sky appears blue.',
'answer': 'blue',
},
{
'question': 'What is the function of the frontend in software development?',
'context': 'In software development, the terms frontend and backend refer to the distinct roles of the user interface (frontend) and the data management layer (backend) of an application. In a client-server architecture, the client typically represents the frontend, while the server represents the backend, even if some presentation tasks are handled by the server.',
'answer': 'presentation layer',
},
]
test_data = None
with open(
Path('tests/model/qg_test_data.yaml'), 'rt', encoding='utf-8'
) as fd:
test_data = yaml.safe_load(fd)

qg = QuestionGeneration()
metric = Metric.COSINE_SIMILARITY
model_name = qg.trained_model_path.split('/')[1]
data_points: np.ndarray = np.zeros(shape=(len(test_data), 1))

for i, test_item in enumerate(test_data):
suggested_answer = test_item['answer']
context = test_item['context']
reference_question = test_item['question']
item = test_item['item']
suggested_answer = item['answer']
context = item['context']
reference_question = item['question']

generated_question = qg.generate(
answer=suggested_answer, context=context
Expand Down
19 changes: 19 additions & 0 deletions tests/model/qg_test_data.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
- item:
context: In software development, the terms frontend and backend refer to the distinct roles of the user interface (frontend) and the data management layer (backend) of an application. In a client-server architecture, the client typically represents the frontend, while the server represents the backend, even if some presentation tasks are handled by the server.
question: What is the function of the frontend in software development?
answer: presentation layer

- item:
context: During the day, the sky appears blue.
question: What color is the sky during the day?
answer: blue

- item:
context: GNU Recutils is a set of tools and libraries to access human-editable, plain text databases called recfiles. The data is stored as a sequence of records, each record containing an arbitrary number of named fields.
question: What is an advantage of using GNU Recutils?
answer: There is free software to manipulate recfiles and a text editor is enough to edit it.

- item:
context: To calculate a word embedding vector for the provided sentence, you would typically use a pre-trained word embedding model such as Word2Vec, GloVe, or FastText. These models convert words into numerical vectors based on their semantic meanings and contexts.
question: What is used to convert a sentence to a word embedding vector?
answer: Pre-trained word embedding model, e.g. Word2Vec, GloVe, FastText.
13 changes: 11 additions & 2 deletions tests/model/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from tqdm import tqdm

from knowledge_verificator.utils.filesystem import create_text_file
from knowledge_verificator.io_handler import console


class Metric(Enum):
Expand Down Expand Up @@ -75,6 +76,9 @@ def _collect_experiments(self) -> list[Callable]:
if not os.path.isfile(file_path):
continue

if not file.endswith('.py'):
continue

experiment_functions.extend(
self._collect_functions_from_file(file_path=file_path)
)
Expand Down Expand Up @@ -106,9 +110,13 @@ def _collect_functions_from_file(self, file_path: Path) -> list[Callable]:

spec.loader.exec_module(module)

# Get all functions in the module.
# Keep only the experiment functions, i.e. those whose names start with `measure_`.
return [
func for _, func in inspect.getmembers(module, inspect.isfunction)
func
for func_name, func in inspect.getmembers(
module, inspect.isfunction
)
if func_name.startswith('measure_')
]

def run(self) -> None:
Expand All @@ -123,6 +131,7 @@ def run(self) -> None:
unit='experiment',
iterable=self._collect_experiments(),
):
console.print(f'Running {experiment.__name__}...')
result = experiment()
results.append(result)

Expand Down

0 comments on commit a6b153e

Please sign in to comment.