From 9c024438114fed70f962dc5a99f271f5782f7d33 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 25 Feb 2024 18:39:08 +0800 Subject: [PATCH 1/9] added deepeval flag to pytest --- deepeval/plugins/plugin.py | 54 ++++++++++++++++++++++---------------- temp_test_run_data.json | 1 + test_llm/test_a.py | 2 ++ 3 files changed, 35 insertions(+), 22 deletions(-) create mode 100644 temp_test_run_data.json create mode 100644 test_llm/test_a.py diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py index 968e7aa97..83f46cd38 100644 --- a/deepeval/plugins/plugin.py +++ b/deepeval/plugins/plugin.py @@ -8,27 +8,30 @@ def pytest_sessionstart(session: pytest.Session): - test_run_manager.save_to_disk = True - try: - deployment_configs = session.config.getoption("--deployment") - disable_request = False - - if deployment_configs is None: - deployment = False - else: - deployment = True - deployment_configs = json.loads(deployment_configs) - disable_request = deployment_configs.pop("is_pull_request", False) - deployment_configs = DeploymentConfigs(**deployment_configs) - - test_run_manager.create_test_run( - deployment=deployment, - deployment_configs=deployment_configs, - file_name=session.config.getoption("file_or_dir")[0], - disable_request=disable_request, - ) - except: - test_run_manager.create_test_run() + deepeval = session.config.getoption("--deepeval") + + if deepeval: + test_run_manager.save_to_disk = True + try: + deployment_configs = session.config.getoption("--deployment") + disable_request = False + + if deployment_configs is None: + deployment = False + else: + deployment = True + deployment_configs = json.loads(deployment_configs) + disable_request = deployment_configs.pop("is_pull_request", False) + deployment_configs = DeploymentConfigs(**deployment_configs) + + test_run_manager.create_test_run( + deployment=deployment, + deployment_configs=deployment_configs, + file_name=session.config.getoption("file_or_dir")[0], + disable_request=disable_request, + ) + except: + test_run_manager.create_test_run() def pytest_addoption(parser): @@ -39,13 +42,20 @@ def pytest_addoption(parser): help="Set deployment configs", ) + parser.addoption( + "--deepeval", + action="store", + default=False, + help="Set deepeval env", + ) + @pytest.hookimpl(tryfirst=True) def pytest_runtest_protocol( item: pytest.Item, nextitem: Optional[pytest.Item] ) -> Optional[Any]: os.environ[PYTEST_RUN_TEST_NAME] = item.nodeid.split("::")[-1] - return None # continue with the default protocol + return None @pytest.hookimpl(tryfirst=True, hookwrapper=True) diff --git a/temp_test_run_data.json b/temp_test_run_data.json new file mode 100644 index 000000000..a72ac9a36 --- /dev/null +++ b/temp_test_run_data.json @@ -0,0 +1 @@ +{"testFile": "test_llm", "deployment": false, "testCases": [], "metricScores": [], "configurations": {}} \ No newline at end of file diff --git a/test_llm/test_a.py b/test_llm/test_a.py new file mode 100644 index 000000000..f3f296ffc --- /dev/null +++ b/test_llm/test_a.py @@ -0,0 +1,2 @@ +def test_ok(): + pass \ No newline at end of file From 5a7f709500a3fcfabcb3f1ccc7a60f07a9680803 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 25 Feb 2024 18:39:18 +0800 Subject: [PATCH 2/9] reformat --- deepeval/plugins/plugin.py | 4 +++- test_llm/test_a.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py index 83f46cd38..862bd8341 100644 --- a/deepeval/plugins/plugin.py +++ b/deepeval/plugins/plugin.py @@ -21,7 +21,9 @@ def 
pytest_sessionstart(session: pytest.Session): else: deployment = True deployment_configs = json.loads(deployment_configs) - disable_request = deployment_configs.pop("is_pull_request", False) + disable_request = deployment_configs.pop( + "is_pull_request", False + ) deployment_configs = DeploymentConfigs(**deployment_configs) test_run_manager.create_test_run( diff --git a/test_llm/test_a.py b/test_llm/test_a.py index f3f296ffc..fe510729c 100644 --- a/test_llm/test_a.py +++ b/test_llm/test_a.py @@ -1,2 +1,2 @@ def test_ok(): - pass \ No newline at end of file + pass From 58160327277aa82b0586043ba00829b92ea3d4ce Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 25 Feb 2024 18:51:23 +0800 Subject: [PATCH 3/9] added flag --- deepeval/cli/test.py | 3 +++ deepeval/plugins/plugin.py | 17 ++++------------- deepeval/utils.py | 8 ++++++++ temp_test_run_data.json | 2 +- tests/test_hallucination.py | 4 ++-- 5 files changed, 18 insertions(+), 16 deletions(-) diff --git a/deepeval/cli/test.py b/deepeval/cli/test.py index 3e2e5edf1..26f02f0d3 100644 --- a/deepeval/cli/test.py +++ b/deepeval/cli/test.py @@ -54,6 +54,7 @@ def run( check_if_valid_file(test_file_or_directory) test_run_manager.reset() pytest_args = [test_file_or_directory] + if exit_on_first_failure: pytest_args.insert(0, "-x") @@ -77,6 +78,8 @@ def run( if num_processes is not None: pytest_args.extend(["-n", str(num_processes)]) + pytest_args.append("--deepeval") + # Add the deepeval plugin file to pytest arguments pytest_args.extend(["-p", "plugins"]) diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py index 862bd8341..1039b606e 100644 --- a/deepeval/plugins/plugin.py +++ b/deepeval/plugins/plugin.py @@ -5,12 +5,12 @@ from typing import Optional, Any from deepeval.constants import PYTEST_RUN_TEST_NAME from deepeval.test_run import test_run_manager, DeploymentConfigs - +from deepeval.utils import is_running_deepeval def pytest_sessionstart(session: pytest.Session): - deepeval = session.config.getoption("--deepeval") + is_running_deepeval = is_running_deepeval() - if deepeval: + if is_running_deepeval: test_run_manager.save_to_disk = True try: deployment_configs = session.config.getoption("--deployment") @@ -21,9 +21,7 @@ def pytest_sessionstart(session: pytest.Session): else: deployment = True deployment_configs = json.loads(deployment_configs) - disable_request = deployment_configs.pop( - "is_pull_request", False - ) + disable_request = deployment_configs.pop("is_pull_request", False) deployment_configs = DeploymentConfigs(**deployment_configs) test_run_manager.create_test_run( @@ -44,13 +42,6 @@ def pytest_addoption(parser): help="Set deployment configs", ) - parser.addoption( - "--deepeval", - action="store", - default=False, - help="Set deepeval env", - ) - @pytest.hookimpl(tryfirst=True) def pytest_runtest_protocol( diff --git a/deepeval/utils.py b/deepeval/utils.py index 4468f46f4..b98334c9d 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -14,6 +14,14 @@ from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER +_is_running_deepeval = True + +def set_is_running_deepeval(flag: bool): + global _is_running_deepeval + _is_running_deepeval = flag + +def is_running_deepeval() -> bool: + return _is_running_deepeval def get_deployment_configs() -> Optional[Dict]: if os.getenv("GITHUB_ACTIONS") == "true": diff --git a/temp_test_run_data.json b/temp_test_run_data.json index a72ac9a36..acbadf1cc 100644 --- a/temp_test_run_data.json +++ b/temp_test_run_data.json @@ -1 +1 @@ -{"testFile": "test_llm", "deployment": 
false, "testCases": [], "metricScores": [], "configurations": {}} \ No newline at end of file +{"deployment": false, "testCases": [{"name": "test_length_metric", "input": "placeholder", "actualOutput": "This is a long sentence that is more than 3 letters", "success": true, "metricsMetadata": [{"metric": "Latency", "score": 1.0, "threshold": 10.0, "success": true}], "runDuration": 2.1499989088624716e-05, "latency": 8.3}, {"name": "test_customer_chatbot[test_case0]", "input": "How many cars did Tesla sell?", "actualOutput": "I don't have access to that data", "expectedOutput": "578 in the year of 2022.", "success": false, "metricsMetadata": [{"metric": "Coherence", "score": 0.1242689897104402, "threshold": 0.5, "success": false, "reason": "This metric looking good!"}], "runDuration": 7.458002073690295e-06, "context": ["I don't know"]}, {"name": "test_customer_chatbot[test_case1]", "input": "What's the refund policy", "actualOutput": "I don't know", "expectedOutput": "I don't know", "success": false, "metricsMetadata": [{"metric": "Coherence", "score": 0.2666893953970654, "threshold": 0.5, "success": false, "reason": "This metric looking good!"}], "runDuration": 7.124996045604348e-06, "context": ["I don't know"]}, {"name": "test_customer_chatbot[test_case2]", "input": "How many suitcases can the trunk of a Cybertruck fit?", "actualOutput": "To my knowledge, a cybertruck can fit up to 3 medium sized suitcases.", "expectedOutput": "Four", "success": false, "metricsMetadata": [{"metric": "Coherence", "score": 0.12683304885975322, "threshold": 0.5, "success": false, "reason": "This metric looking good!"}], "runDuration": 4.582980182021856e-06, "context": ["3"]}, {"name": "test_customer_chatbot[test_case3]", "input": "Who is the current president of the united states?", "actualOutput": "Biden", "expectedOutput": "Joe Biden", "success": true, "metricsMetadata": [{"metric": "Coherence", "score": 0.536845362348648, "threshold": 0.5, "success": true, "reason": "This metric looking good!"}], "runDuration": 4.0000013541430235e-06, "context": ["Joe Biden is the current president"]}, {"name": "test_hallucination_metric_2", "input": "placeholder", "actualOutput": "Python is a programming language.", "success": false, "metricsMetadata": [{"metric": "Hallucination", "score": 1.0, "threshold": 0.6, "success": false, "reason": "The score is 1.00 because the actual output directly contradicts the provided context regarding Python's status as a programming language, indicating a complete deviation from the facts presented in the context.", "evaluationModel": "gpt-4-0125-preview"}], "runDuration": 4.936658249993343, "latency": 0.2, "cost": 1.0, "context": ["Python is NOT a programming language."]}, {"name": "test_hallucination_metric_3", "input": "placeholder", "actualOutput": "Python is a programming language.", "success": false, "metricsMetadata": [{"metric": "Hallucination", "score": 1.0, "threshold": 0.6, "success": false, "reason": "The score is 1.00 because the actual output directly contradicts the provided context by misidentifying Python as something other than a programming language, indicating a complete departure from the factual information given.", "evaluationModel": "gpt-4-0125-preview"}], "runDuration": 4.42829695797991, "latency": 13.0, "cost": 0.1, "context": ["Python is a snake."]}, {"name": "test_cost_metric", "input": "...", "actualOutput": "...", "success": true, "metricsMetadata": [{"metric": "Cost", "score": 12.0, "threshold": 12.0, "success": true}], "runDuration": 0.00047237498802132905, 
"cost": 12.0}, {"name": "test_latency_metric", "input": "...", "actualOutput": "...", "success": true, "metricsMetadata": [{"metric": "Latency", "score": 8.3, "threshold": 12.0, "success": true}], "runDuration": 0.0019009170064236969, "latency": 8.3}], "metricScores": [], "configurations": {}} \ No newline at end of file diff --git a/tests/test_hallucination.py b/tests/test_hallucination.py index ccea5dbcd..dd74fb92f 100644 --- a/tests/test_hallucination.py +++ b/tests/test_hallucination.py @@ -20,7 +20,7 @@ def test_hallucination_metric(): assert_test(test_case, [metric]) -@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric_2(): metric = HallucinationMetric(threshold=0.6) test_case = LLMTestCase( @@ -34,7 +34,7 @@ def test_hallucination_metric_2(): assert_test(test_case, [metric]) -@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric_3(): metric = HallucinationMetric(threshold=0.6) test_case = LLMTestCase( From 20bf434a6e1596b64e4b4cb961b58cc47ed19594 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 25 Feb 2024 18:51:33 +0800 Subject: [PATCH 4/9] reformat --- deepeval/plugins/plugin.py | 5 ++++- deepeval/utils.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py index 1039b606e..08526c045 100644 --- a/deepeval/plugins/plugin.py +++ b/deepeval/plugins/plugin.py @@ -7,6 +7,7 @@ from deepeval.test_run import test_run_manager, DeploymentConfigs from deepeval.utils import is_running_deepeval + def pytest_sessionstart(session: pytest.Session): is_running_deepeval = is_running_deepeval() @@ -21,7 +22,9 @@ def pytest_sessionstart(session: pytest.Session): else: deployment = True deployment_configs = json.loads(deployment_configs) - disable_request = deployment_configs.pop("is_pull_request", False) + disable_request = deployment_configs.pop( + "is_pull_request", False + ) deployment_configs = DeploymentConfigs(**deployment_configs) test_run_manager.create_test_run( diff --git a/deepeval/utils.py b/deepeval/utils.py index b98334c9d..960d59580 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -16,13 +16,16 @@ _is_running_deepeval = True + def set_is_running_deepeval(flag: bool): global _is_running_deepeval _is_running_deepeval = flag + def is_running_deepeval() -> bool: return _is_running_deepeval + def get_deployment_configs() -> Optional[Dict]: if os.getenv("GITHUB_ACTIONS") == "true": env_info = { From c6638faf8261b19dccd686337471ffbc1eb31108 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 25 Feb 2024 19:00:15 +0800 Subject: [PATCH 5/9] removed temp file for pure pytest --- deepeval/cli/test.py | 3 ++- deepeval/evaluate.py | 4 ++++ deepeval/plugins/plugin.py | 4 ++-- deepeval/utils.py | 4 ++-- temp_test_run_data.json | 1 - 5 files changed, 10 insertions(+), 6 deletions(-) delete mode 100644 temp_test_run_data.json diff --git a/deepeval/cli/test.py b/deepeval/cli/test.py index 26f02f0d3..41c89fd40 100644 --- a/deepeval/cli/test.py +++ b/deepeval/cli/test.py @@ -8,6 +8,7 @@ from deepeval.utils import delete_file_if_exists, get_deployment_configs from deepeval.test_run import invoke_test_run_end_hook from deepeval.telemetry import capture_evaluation_count +from deepeval.utils import set_is_running_deepeval app = typer.Typer(name="test") @@ -78,7 +79,7 @@ def run( if num_processes is not None: pytest_args.extend(["-n", str(num_processes)]) - 
pytest_args.append("--deepeval") + set_is_running_deepeval(True) # Add the deepeval plugin file to pytest arguments pytest_args.extend(["-p", "plugins"]) diff --git a/deepeval/evaluate.py b/deepeval/evaluate.py index d57ba24c7..5a7cdfd51 100644 --- a/deepeval/evaluate.py +++ b/deepeval/evaluate.py @@ -11,6 +11,7 @@ from deepeval.tracing import get_trace_stack from deepeval.constants import PYTEST_RUN_TEST_NAME from deepeval.test_run import test_run_manager, APITestCase, MetricsMetadata +from deepeval.utils import get_is_running_deepeval @dataclass @@ -122,6 +123,9 @@ def run_test( def assert_test(test_case: LLMTestCase, metrics: List[BaseMetric]): + if get_is_running_deepeval() is False: + return + if not isinstance(test_case, LLMTestCase): raise TypeError("'test_case' must be an instance of 'LLMTestCase'.") diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py index 08526c045..fe7b9cf1b 100644 --- a/deepeval/plugins/plugin.py +++ b/deepeval/plugins/plugin.py @@ -5,11 +5,11 @@ from typing import Optional, Any from deepeval.constants import PYTEST_RUN_TEST_NAME from deepeval.test_run import test_run_manager, DeploymentConfigs -from deepeval.utils import is_running_deepeval +from deepeval.utils import get_is_running_deepeval def pytest_sessionstart(session: pytest.Session): - is_running_deepeval = is_running_deepeval() + is_running_deepeval = get_is_running_deepeval() if is_running_deepeval: test_run_manager.save_to_disk = True diff --git a/deepeval/utils.py b/deepeval/utils.py index 960d59580..c3b3deb9b 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -14,7 +14,7 @@ from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER -_is_running_deepeval = True +_is_running_deepeval = False def set_is_running_deepeval(flag: bool): @@ -22,7 +22,7 @@ def set_is_running_deepeval(flag: bool): _is_running_deepeval = flag -def is_running_deepeval() -> bool: +def get_is_running_deepeval() -> bool: return _is_running_deepeval diff --git a/temp_test_run_data.json b/temp_test_run_data.json deleted file mode 100644 index acbadf1cc..000000000 --- a/temp_test_run_data.json +++ /dev/null @@ -1 +0,0 @@ -{"deployment": false, "testCases": [{"name": "test_length_metric", "input": "placeholder", "actualOutput": "This is a long sentence that is more than 3 letters", "success": true, "metricsMetadata": [{"metric": "Latency", "score": 1.0, "threshold": 10.0, "success": true}], "runDuration": 2.1499989088624716e-05, "latency": 8.3}, {"name": "test_customer_chatbot[test_case0]", "input": "How many cars did Tesla sell?", "actualOutput": "I don't have access to that data", "expectedOutput": "578 in the year of 2022.", "success": false, "metricsMetadata": [{"metric": "Coherence", "score": 0.1242689897104402, "threshold": 0.5, "success": false, "reason": "This metric looking good!"}], "runDuration": 7.458002073690295e-06, "context": ["I don't know"]}, {"name": "test_customer_chatbot[test_case1]", "input": "What's the refund policy", "actualOutput": "I don't know", "expectedOutput": "I don't know", "success": false, "metricsMetadata": [{"metric": "Coherence", "score": 0.2666893953970654, "threshold": 0.5, "success": false, "reason": "This metric looking good!"}], "runDuration": 7.124996045604348e-06, "context": ["I don't know"]}, {"name": "test_customer_chatbot[test_case2]", "input": "How many suitcases can the trunk of a Cybertruck fit?", "actualOutput": "To my knowledge, a cybertruck can fit up to 3 medium sized suitcases.", "expectedOutput": "Four", "success": false, "metricsMetadata": [{"metric": 
"Coherence", "score": 0.12683304885975322, "threshold": 0.5, "success": false, "reason": "This metric looking good!"}], "runDuration": 4.582980182021856e-06, "context": ["3"]}, {"name": "test_customer_chatbot[test_case3]", "input": "Who is the current president of the united states?", "actualOutput": "Biden", "expectedOutput": "Joe Biden", "success": true, "metricsMetadata": [{"metric": "Coherence", "score": 0.536845362348648, "threshold": 0.5, "success": true, "reason": "This metric looking good!"}], "runDuration": 4.0000013541430235e-06, "context": ["Joe Biden is the current president"]}, {"name": "test_hallucination_metric_2", "input": "placeholder", "actualOutput": "Python is a programming language.", "success": false, "metricsMetadata": [{"metric": "Hallucination", "score": 1.0, "threshold": 0.6, "success": false, "reason": "The score is 1.00 because the actual output directly contradicts the provided context regarding Python's status as a programming language, indicating a complete deviation from the facts presented in the context.", "evaluationModel": "gpt-4-0125-preview"}], "runDuration": 4.936658249993343, "latency": 0.2, "cost": 1.0, "context": ["Python is NOT a programming language."]}, {"name": "test_hallucination_metric_3", "input": "placeholder", "actualOutput": "Python is a programming language.", "success": false, "metricsMetadata": [{"metric": "Hallucination", "score": 1.0, "threshold": 0.6, "success": false, "reason": "The score is 1.00 because the actual output directly contradicts the provided context by misidentifying Python as something other than a programming language, indicating a complete departure from the factual information given.", "evaluationModel": "gpt-4-0125-preview"}], "runDuration": 4.42829695797991, "latency": 13.0, "cost": 0.1, "context": ["Python is a snake."]}, {"name": "test_cost_metric", "input": "...", "actualOutput": "...", "success": true, "metricsMetadata": [{"metric": "Cost", "score": 12.0, "threshold": 12.0, "success": true}], "runDuration": 0.00047237498802132905, "cost": 12.0}, {"name": "test_latency_metric", "input": "...", "actualOutput": "...", "success": true, "metricsMetadata": [{"metric": "Latency", "score": 8.3, "threshold": 12.0, "success": true}], "runDuration": 0.0019009170064236969, "latency": 8.3}], "metricScores": [], "configurations": {}} \ No newline at end of file From f2e941bd1ce6c569e70f9b890589df30a548dca3 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 25 Feb 2024 19:01:36 +0800 Subject: [PATCH 6/9] updated test --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4ba59ac47..8b3b52c4d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -65,4 +65,4 @@ jobs: env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: | - poetry run pytest tests/ --ignore=tests/test_deployment.py + poetry run deepeval tests/ --ignore=tests/test_deployment.py From a860c4ff2fe3beea20aa8b21f4a5034a4bc0b324 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 25 Feb 2024 19:02:37 +0800 Subject: [PATCH 7/9] udpated tests --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8b3b52c4d..5f56f6022 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -65,4 +65,4 @@ jobs: env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: | - poetry run deepeval tests/ --ignore=tests/test_deployment.py + 
poetry run deepeval test run tests/ --ignore=tests/test_deployment.py From e2b9c4eb9dfbe92ae4b2be3efda4486d1b88fdbe Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 25 Feb 2024 19:07:44 +0800 Subject: [PATCH 8/9] updated tests --- test_llm/test_a.py | 2 -- tests/test_hallucination.py | 11 ++++------- 2 files changed, 4 insertions(+), 9 deletions(-) delete mode 100644 test_llm/test_a.py diff --git a/test_llm/test_a.py b/test_llm/test_a.py deleted file mode 100644 index fe510729c..000000000 --- a/test_llm/test_a.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_ok(): - pass diff --git a/tests/test_hallucination.py b/tests/test_hallucination.py index dd74fb92f..5b4507542 100644 --- a/tests/test_hallucination.py +++ b/tests/test_hallucination.py @@ -1,5 +1,4 @@ import pytest -import deepeval from deepeval.test_case import LLMTestCase from deepeval.metrics import HallucinationMetric from deepeval import assert_test @@ -20,7 +19,7 @@ def test_hallucination_metric(): assert_test(test_case, [metric]) -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric_2(): metric = HallucinationMetric(threshold=0.6) test_case = LLMTestCase( @@ -30,11 +29,10 @@ def test_hallucination_metric_2(): cost=1, latency=0.2, ) - with pytest.raises(AssertionError): - assert_test(test_case, [metric]) + assert_test(test_case, [metric]) -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric_3(): metric = HallucinationMetric(threshold=0.6) test_case = LLMTestCase( @@ -44,8 +42,7 @@ def test_hallucination_metric_3(): cost=0.1, latency=13.0, ) - with pytest.raises(AssertionError): - assert_test(test_case, [metric]) + assert_test(test_case, [metric]) # @deepeval.set_hyperparameters(model="gpt-4") From 1fec1d2894d274d97772e2849777598ef0df3646 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 25 Feb 2024 19:10:13 +0800 Subject: [PATCH 9/9] Removed duplicate tests --- .github/workflows/test.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5f56f6022..10724def0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -61,8 +61,8 @@ jobs: #---------------------------------------------- # run test suite #---------------------------------------------- - - name: Run tests - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - run: | - poetry run deepeval test run tests/ --ignore=tests/test_deployment.py + # - name: Run tests + # env: + # OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + # run: | + # poetry run deepeval test run tests/ --ignore=tests/test_deployment.py
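
Taken together, the series drops the `--deepeval` pytest option introduced in patch 1 in favor of an in-process flag: the `deepeval test run` CLI calls `set_is_running_deepeval(True)` before invoking pytest, `pytest_sessionstart` consults `get_is_running_deepeval()` to decide whether to create a test run, and `assert_test` returns early when the flag is unset, so plain `pytest` collects the tests without triggering LLM evaluation. The sketch below is a simplified, self-contained recreation of that flag pattern for illustration only — the evaluation body is elided and this is not the actual `deepeval` source.

```python
# Minimal sketch of the module-level flag pattern the series converges on
# (simplified recreation; not the real deepeval/utils.py or deepeval/evaluate.py).
from typing import Any, List

_is_running_deepeval = False  # defaults to False so a bare `pytest` run skips evaluation


def set_is_running_deepeval(flag: bool) -> None:
    # Called by the `deepeval test run` CLI before it launches pytest in-process.
    global _is_running_deepeval
    _is_running_deepeval = flag


def get_is_running_deepeval() -> bool:
    return _is_running_deepeval


def assert_test(test_case: Any, metrics: List[Any]) -> None:
    # Under plain `pytest` the flag is False, so the metric evaluation
    # (and its API cost) is skipped entirely and the test passes as a no-op.
    if not get_is_running_deepeval():
        return
    ...  # evaluate `metrics` against `test_case`, as in deepeval/evaluate.py
```

With this in place, the CI step in patches 6–9 switches from `poetry run pytest tests/` to `poetry run deepeval test run tests/`, since only the latter sets the flag and actually exercises the metrics.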