diff --git a/notebooks/test_existing_company_reports.ipynb b/notebooks/test_existing_company_reports.ipynb index 52126bd..33b35da 100644 --- a/notebooks/test_existing_company_reports.ipynb +++ b/notebooks/test_existing_company_reports.ipynb @@ -4,7 +4,19 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Python-dotenv could not parse statement starting at line 15\n", + "Python-dotenv could not parse statement starting at line 18\n", + "Python-dotenv could not parse statement starting at line 20\n", + "Python-dotenv could not parse statement starting at line 23\n", + "Python-dotenv could not parse statement starting at line 25\n" + ] + } + ], "source": [ "from dataland_backend.models.data_type_enum import DataTypeEnum\n", "\n", @@ -70,7 +82,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "BPCE\n" + "Aktiebolaget Electrolux\n" ] } ], @@ -79,7 +91,7 @@ "extracted_yes_no_values = {}\n", "\n", "# check yes no values\n", - "for data_id, company_info in zip(data_ids[8:9], company_infos[8:9], strict=False):\n", + "for data_id, company_info in zip(data_ids[0:1], company_infos[0:1], strict=False):\n", " print(company_info.company_name)\n", " data = dataland_client.eu_taxonomy_nuclear_and_gas_api.get_company_associated_nuclear_and_gas_data(data_id=data_id)\n", " data_collection = NuclearAndGasDataCollection(dataset=data.data)\n", @@ -107,11 +119,11 @@ "output_type": "stream", "text": [ "\n", - "Company: BPCE\n", + "Company: Aktiebolaget Electrolux\n", "nuclear_energy_related_activities_section426: Dataland=YesNo.NO, Extracted=YesNo.NO\n", - "nuclear_energy_related_activities_section427: Dataland=YesNo.YES, Extracted=YesNo.YES\n", - "nuclear_energy_related_activities_section428: Dataland=YesNo.YES, Extracted=YesNo.YES\n", - "fossil_gas_related_activities_section429: Dataland=YesNo.YES, Extracted=YesNo.YES\n", + "nuclear_energy_related_activities_section427: Dataland=YesNo.NO, Extracted=YesNo.NO\n", + "nuclear_energy_related_activities_section428: Dataland=YesNo.NO, Extracted=YesNo.NO\n", + "fossil_gas_related_activities_section429: Dataland=YesNo.NO, Extracted=YesNo.NO\n", "fossil_gas_related_activities_section430: Dataland=YesNo.NO, Extracted=YesNo.NO\n", "fossil_gas_related_activities_section431: Dataland=YesNo.NO, Extracted=YesNo.NO\n", "1.0\n" @@ -143,46 +155,56 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Skipping company Aktiebolaget Electrolux due to missing data from Dataland: Error retrieving taxonomy-aligned revenue denominator: 'NoneType' object has no attribute 'value'\n" + ] + } + ], "source": [ "numeric_values_dataland = {}\n", "extracted_numeric_values = {}\n", "\n", "# check numeric values\n", - "for data_id, company_info in zip(data_ids[6:7], company_infos[6:7], strict=False):\n", + "for data_id, company_info in zip(data_ids[0:1], company_infos[0:1], strict=False):\n", " data = dataland_client.eu_taxonomy_nuclear_and_gas_api.get_company_associated_nuclear_and_gas_data(data_id=data_id)\n", " data_collection = NuclearAndGasDataCollection(dataset=data.data)\n", + " try:\n", + " # get values on Dataland\n", + " if company_info.company_name not in numeric_values_dataland:\n", + " numeric_values_dataland[company_info.company_name] = {}\n", "\n", - " # get values on Dataland\n", - " if company_info.company_name not in numeric_values_dataland:\n", - " numeric_values_dataland[company_info.company_name] = {}\n", - "\n", - " numeric_values_dataland[company_info.company_name][\"aligned_revenue_denominator\"] = (\n", - " get_taxonomy_aligned_revenue_denominator_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"aligned_capex_denominator\"] = (\n", - " get_taxonomy_aligned_capex_denominator_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"aligned_revenue_numerator\"] = (\n", - " get_taxonomy_aligned_revenue_numerator_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"aligned_capex_numerator\"] = (\n", - " get_taxonomy_aligned_capex_numerator_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"not_aligned_revenue\"] = (\n", - " get_taxonomy_eligible_but_not_aligned_revenue_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"not_aligned_capex\"] = (\n", - " get_taxonomy_eligible_but_not_aligned_capex_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"non_eligible_revenue\"] = (\n", - " get_taxonomy_non_eligible_revenue_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"non_eligible_capex\"] = (\n", - " get_taxonomy_non_eligible_capex_values_by_data(data=data_collection)\n", - " )\n", + " numeric_values_dataland[company_info.company_name][\"aligned_revenue_denominator\"] = (\n", + " get_taxonomy_aligned_revenue_denominator_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"aligned_capex_denominator\"] = (\n", + " get_taxonomy_aligned_capex_denominator_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"aligned_revenue_numerator\"] = (\n", + " get_taxonomy_aligned_revenue_numerator_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"aligned_capex_numerator\"] = (\n", + " get_taxonomy_aligned_capex_numerator_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"not_aligned_revenue\"] = (\n", + " get_taxonomy_eligible_but_not_aligned_revenue_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"not_aligned_capex\"] = (\n", + " get_taxonomy_eligible_but_not_aligned_capex_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"non_eligible_revenue\"] = (\n", + " get_taxonomy_non_eligible_revenue_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"non_eligible_capex\"] = (\n", + " get_taxonomy_non_eligible_capex_values_by_data(data=data_collection)\n", + " )\n", + " except AttributeError as e:\n", + " print(f\"Skipping company {company_info.company_name} due to missing data from Dataland: {e}\")\n", "\n", " # get values from AI\n", " try:\n", @@ -209,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -217,168 +239,8 @@ "output_type": "stream", "text": [ "\n", - "Company: Berliner Volksbank eG\n", - "Section 0: Dataland=0, Extracted=0.0\n", - "Section 1: Dataland=0, Extracted=0.0\n", - "Section 2: Dataland=0, Extracted=0.0\n", - "Section 3: Dataland=0, Extracted=0.0\n", - "Section 4: Dataland=0, Extracted=0.0\n", - "Section 5: Dataland=0, Extracted=0.0\n", - "Section 6: Dataland=0, Extracted=0.0\n", - "Section 7: Dataland=0, Extracted=0.0\n", - "Section 8: Dataland=0, Extracted=0.0\n", - "Section 9: Dataland=0, Extracted=0.0\n", - "Section 10: Dataland=0, Extracted=0.0\n", - "Section 11: Dataland=0, Extracted=0.0\n", - "Section 12: Dataland=0, Extracted=0.0\n", - "Section 13: Dataland=0, Extracted=0.0\n", - "Section 14: Dataland=0, Extracted=0.0\n", - "Section 15: Dataland=0, Extracted=0.0\n", - "Section 16: Dataland=0, Extracted=0.0\n", - "Section 17: Dataland=0, Extracted=0.0\n", - "Section 18: Dataland=0.1, Extracted=0.1\n", - "Section 19: Dataland=0.1, Extracted=0.1\n", - "Section 20: Dataland=0, Extracted=0.0\n", - "Section 21: Dataland=0.1, Extracted=0.1\n", - "Section 22: Dataland=0.1, Extracted=0.1\n", - "Section 23: Dataland=0, Extracted=0.0\n", - "Section 24: Dataland=0, Extracted=0.0\n", - "Section 25: Dataland=0, Extracted=0.0\n", - "Section 26: Dataland=0, Extracted=0.0\n", - "Section 27: Dataland=0, Extracted=0.0\n", - "Section 28: Dataland=0, Extracted=0.0\n", - "Section 29: Dataland=0, Extracted=0.0\n", - "Section 30: Dataland=0, Extracted=0.0\n", - "Section 31: Dataland=0, Extracted=0.0\n", - "Section 32: Dataland=0, Extracted=0.0\n", - "Section 33: Dataland=0, Extracted=0.0\n", - "Section 34: Dataland=0, Extracted=0.0\n", - "Section 35: Dataland=0, Extracted=0.0\n", - "Section 36: Dataland=0, Extracted=0.0\n", - "Section 37: Dataland=0, Extracted=0.0\n", - "Section 38: Dataland=0, Extracted=0.0\n", - "Section 39: Dataland=0, Extracted=0.0\n", - "Section 40: Dataland=0, Extracted=0.0\n", - "Section 41: Dataland=0, Extracted=0.0\n", - "Section 42: Dataland=0.1, Extracted=0.1\n", - "Section 43: Dataland=0.1, Extracted=0.1\n", - "Section 44: Dataland=0, Extracted=0.0\n", - "Section 45: Dataland=0.1, Extracted=0.1\n", - "Section 46: Dataland=0.1, Extracted=0.1\n", - "Section 47: Dataland=0, Extracted=0.0\n", - "Section 48: Dataland=0, Extracted=0.0\n", - "Section 49: Dataland=0, Extracted=0.0\n", - "Section 50: Dataland=0, Extracted=0.0\n", - "Section 51: Dataland=0, Extracted=0.0\n", - "Section 52: Dataland=0, Extracted=0.0\n", - "Section 53: Dataland=0, Extracted=0.0\n", - "Section 54: Dataland=0, Extracted=0.0\n", - "Section 55: Dataland=0, Extracted=0.0\n", - "Section 56: Dataland=0, Extracted=0.0\n", - "Section 57: Dataland=0, Extracted=0.0\n", - "Section 58: Dataland=0, Extracted=0.0\n", - "Section 59: Dataland=0, Extracted=0.0\n", - "Section 60: Dataland=0, Extracted=0.0\n", - "Section 61: Dataland=0, Extracted=0.0\n", - "Section 62: Dataland=0, Extracted=0.0\n", - "Section 63: Dataland=0, Extracted=0.0\n", - "Section 64: Dataland=0, Extracted=0.0\n", - "Section 65: Dataland=0, Extracted=0.0\n", - "Section 66: Dataland=100, Extracted=100.0\n", - "Section 67: Dataland=100, Extracted=100.0\n", - "Section 68: Dataland=0, Extracted=0.0\n", - "Section 69: Dataland=100, Extracted=100.0\n", - "Section 70: Dataland=100, Extracted=100.0\n", - "Section 71: Dataland=0, Extracted=0.0\n", - "Section 72: Dataland=0, Extracted=0.0\n", - "Section 73: Dataland=0, Extracted=0.0\n", - "Section 74: Dataland=0, Extracted=0.0\n", - "Section 75: Dataland=0, Extracted=0.0\n", - "Section 76: Dataland=0, Extracted=0.0\n", - "Section 77: Dataland=0, Extracted=0.0\n", - "Section 78: Dataland=0, Extracted=0.0\n", - "Section 79: Dataland=0, Extracted=0.0\n", - "Section 80: Dataland=0, Extracted=0.0\n", - "Section 81: Dataland=0, Extracted=0.0\n", - "Section 82: Dataland=0, Extracted=0.0\n", - "Section 83: Dataland=0, Extracted=0.0\n", - "Section 84: Dataland=0, Extracted=0.0\n", - "Section 85: Dataland=0, Extracted=0.0\n", - "Section 86: Dataland=0, Extracted=0.0\n", - "Section 87: Dataland=0, Extracted=0.0\n", - "Section 88: Dataland=0, Extracted=0.0\n", - "Section 89: Dataland=0, Extracted=0.0\n", - "Section 90: Dataland=100, Extracted=100.0\n", - "Section 91: Dataland=100, Extracted=100.0\n", - "Section 92: Dataland=0, Extracted=0.0\n", - "Section 93: Dataland=100, Extracted=100.0\n", - "Section 94: Dataland=100, Extracted=100.0\n", - "Section 95: Dataland=0, Extracted=0.0\n", - "Section 96: Dataland=0, Extracted=0.0\n", - "Section 97: Dataland=0, Extracted=0.0\n", - "Section 98: Dataland=0, Extracted=0.0\n", - "Section 99: Dataland=0, Extracted=0.0\n", - "Section 100: Dataland=0, Extracted=0.0\n", - "Section 101: Dataland=0, Extracted=0.0\n", - "Section 102: Dataland=0, Extracted=0.0\n", - "Section 103: Dataland=0, Extracted=0.0\n", - "Section 104: Dataland=0, Extracted=0.0\n", - "Section 105: Dataland=0, Extracted=0.0\n", - "Section 106: Dataland=0, Extracted=0.0\n", - "Section 107: Dataland=0, Extracted=0.0\n", - "Section 108: Dataland=0, Extracted=0.0\n", - "Section 109: Dataland=0, Extracted=0.0\n", - "Section 110: Dataland=0, Extracted=0.0\n", - "Section 111: Dataland=0, Extracted=0.0\n", - "Section 112: Dataland=0, Extracted=0.0\n", - "Section 113: Dataland=0, Extracted=0.0\n", - "Section 114: Dataland=7.82, Extracted=7.82\n", - "Section 115: Dataland=7.82, Extracted=7.82\n", - "Section 116: Dataland=0, Extracted=0.0\n", - "Section 117: Dataland=7.82, Extracted=7.82\n", - "Section 118: Dataland=7.82, Extracted=7.82\n", - "Section 119: Dataland=0, Extracted=0.0\n", - "Section 120: Dataland=0, Extracted=0.0\n", - "Section 121: Dataland=0, Extracted=0.0\n", - "Section 122: Dataland=0, Extracted=0.0\n", - "Section 123: Dataland=0, Extracted=0.0\n", - "Section 124: Dataland=0, Extracted=0.0\n", - "Section 125: Dataland=0, Extracted=0.0\n", - "Section 126: Dataland=0, Extracted=0.0\n", - "Section 127: Dataland=0, Extracted=0.0\n", - "Section 128: Dataland=0, Extracted=0.0\n", - "Section 129: Dataland=0, Extracted=0.0\n", - "Section 130: Dataland=0, Extracted=0.0\n", - "Section 131: Dataland=0, Extracted=0.0\n", - "Section 132: Dataland=0, Extracted=0.0\n", - "Section 133: Dataland=0, Extracted=0.0\n", - "Section 134: Dataland=0, Extracted=0.0\n", - "Section 135: Dataland=0, Extracted=0.0\n", - "Section 136: Dataland=0, Extracted=0.0\n", - "Section 137: Dataland=0, Extracted=0.0\n", - "Section 138: Dataland=7.82, Extracted=7.82\n", - "Section 139: Dataland=7.82, Extracted=7.82\n", - "Section 140: Dataland=0, Extracted=0.0\n", - "Section 141: Dataland=7.82, Extracted=7.82\n", - "Section 142: Dataland=7.82, Extracted=7.82\n", - "Section 143: Dataland=0, Extracted=0.0\n", - "Section 144: Dataland=0, Extracted=0.0\n", - "Section 145: Dataland=0, Extracted=0.0\n", - "Section 146: Dataland=0, Extracted=0.0\n", - "Section 147: Dataland=0, Extracted=0.0\n", - "Section 148: Dataland=0, Extracted=0.0\n", - "Section 149: Dataland=0, Extracted=0.0\n", - "Section 150: Dataland=4.17, Extracted=4.17\n", - "Section 151: Dataland=4.17, Extracted=4.17\n", - "Section 152: Dataland=0, Extracted=0.0\n", - "Section 153: Dataland=0, Extracted=0.0\n", - "Section 154: Dataland=0, Extracted=0.0\n", - "Section 155: Dataland=0, Extracted=0.0\n", - "Section 156: Dataland=0, Extracted=0.0\n", - "Section 157: Dataland=0, Extracted=0.0\n", - "Section 158: Dataland=4.17, Extracted=4.17\n", - "Section 159: Dataland=4.17, Extracted=4.17\n", - "Matching ratio: 100.00%\n" + "Company: Aktiebolaget Electrolux\n", + "Matching ratio: 0.00%\n" ] } ], diff --git a/src/dataland_qa_lab/dataland/data_provider.py b/src/dataland_qa_lab/dataland/data_provider.py index 123ff62..db29e5b 100644 --- a/src/dataland_qa_lab/dataland/data_provider.py +++ b/src/dataland_qa_lab/dataland/data_provider.py @@ -29,7 +29,8 @@ def get_taxonomy_aligned_revenue_denominator_values_by_data(data: NuclearAndGasD for field_name in NuclearAndGasAlignedDenominator.model_fields: denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name) except (AttributeError, KeyError, TypeError) as e: - print(f"Error processing taxonomy-aligned revenue denominator: {e}") + msg = f"Error retrieving taxonomy-aligned revenue denominator: {e}" + raise AttributeError(msg) from e return denominator_values_dict @@ -42,8 +43,8 @@ def get_taxonomy_aligned_capex_denominator_values_by_data(data: NuclearAndGasDat for field_name in NuclearAndGasAlignedDenominator.model_fields: denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name) except (AttributeError, KeyError, TypeError) as e: - print(f"Error processing taxonomy-aligned capex denominator: {e}") - + msg = f"Error retrieving taxonomy-aligned capex denominator: {e}" + raise AttributeError(msg) from e return denominator_values_dict @@ -55,8 +56,8 @@ def get_taxonomy_aligned_revenue_numerator_values_by_data(data: NuclearAndGasDat for field_name in NuclearAndGasAlignedNumerator.model_fields: numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name) except (AttributeError, KeyError, TypeError) as e: - print(f"Error processing taxonomy-aligned revenue numerator: {e}") - + msg = f"Error retrieving taxonomy-aligned revenue numerator: {e}" + raise AttributeError(msg) from e return numerator_values_dict @@ -68,8 +69,8 @@ def get_taxonomy_aligned_capex_numerator_values_by_data(data: NuclearAndGasDataC for field_name in NuclearAndGasAlignedNumerator.model_fields: numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name) except (AttributeError, KeyError, TypeError) as e: - print(f"Error processing taxonomy-aligned capex numerator: {e}") - + msg = f"Error retrieving taxonomy-aligned capex numerator: {e}" + raise AttributeError(msg) from e return numerator_values_dict @@ -81,8 +82,8 @@ def get_taxonomy_eligible_but_not_aligned_revenue_values_by_data(data: NuclearAn for field_name in NuclearAndGasEligibleButNotAligned.model_fields: eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name) except (AttributeError, KeyError, TypeError) as e: - print(f"Error processing taxonomy eligible but not aligned revenue: {e}") - + msg = f"Error retrieving taxonomy eligible but not aligned revenue: {e}" + raise AttributeError(msg) from e return eligible_but_not_aligned_dict @@ -94,8 +95,8 @@ def get_taxonomy_eligible_but_not_aligned_capex_values_by_data(data: NuclearAndG for field_name in NuclearAndGasEligibleButNotAligned.model_fields: eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name) except (AttributeError, KeyError, TypeError) as e: - print(f"Error processing taxonomy eligible but not aligned capex: {e}") - + msg = f"Error retrieving taxonomy eligible but not aligned capex: {e}" + raise AttributeError(msg) from e return eligible_but_not_aligned_dict @@ -108,8 +109,8 @@ def get_taxonomy_non_eligible_revenue_values_by_data(data: NuclearAndGasDataColl value = getattr(non_eligible_values, field_name, None) non_eligible_dict[field_name] = -1 if value is None else value except (AttributeError, KeyError, TypeError) as e: - print(f"Error processing taxonomy non-eligible revenue: {e}") - + msg = f"Error retrieving taxonomy non-eligible revenue: {e}" + raise AttributeError(msg) from e return non_eligible_dict @@ -122,8 +123,8 @@ def get_taxonomy_non_eligible_capex_values_by_data(data: NuclearAndGasDataCollec value = getattr(non_eligible_values, field_name, None) non_eligible_dict[field_name] = -1 if value is None else value except (AttributeError, KeyError, TypeError) as e: - print(f"Error processing taxonomy non-eligible capex: {e}") - + msg = f"Error retrieving taxonomy non-eligible capex: {e}" + raise AttributeError(msg) from e return non_eligible_dict diff --git a/src/dataland_qa_lab/review/dataset_reviewer.py b/src/dataland_qa_lab/review/dataset_reviewer.py index db33447..a211e51 100644 --- a/src/dataland_qa_lab/review/dataset_reviewer.py +++ b/src/dataland_qa_lab/review/dataset_reviewer.py @@ -34,6 +34,7 @@ def review_dataset(data_id: str) -> str | None: config.get_config().dataland_client.eu_taxonomy_nuclear_gas_qa_api.post_nuclear_and_gas_data_qa_report( data_id=data_id, nuclear_and_gas_data=report ) + logging.info("Successfully reviewed dataset %s.", data_id) # noqa: LOG015 except Exception as e: msg = f"Error reviewing dataset {data_id}: {e}" raise RuntimeError(msg) from e diff --git a/src/dataland_qa_lab/review/generate_gpt_request.py b/src/dataland_qa_lab/review/generate_gpt_request.py index 741a071..92ccea9 100644 --- a/src/dataland_qa_lab/review/generate_gpt_request.py +++ b/src/dataland_qa_lab/review/generate_gpt_request.py @@ -18,37 +18,79 @@ def generate_gpt_request(mainprompt: str, subprompt: str) -> list: Returns: List[str]: A list of extracted values from the GPT response. + + Raises: + ValueError: For any issues encountered during the process. """ - conf = config.get_config() - - client = AzureOpenAI( - api_key=conf.azure_openai_api_key, - api_version="2024-07-01-preview", - azure_endpoint=conf.azure_openai_endpoint, - ) - updated_openai_response = client.chat.completions.create( - model="gpt-4o", - temperature=0, - messages=[ - {"role": "system", "content": mainprompt}, - ], - tool_choice="required", - tools=[ - { - "type": "function", - "function": { - "name": "requested_information_precisely_found_in_relevant_documents", - "description": "Submit the requested information. " - "Use this function when the information is precisely stated in the relevant documents.", - "parameters": subprompt, - }, - } - ], - ) - if updated_openai_response.choices[0].message.tool_calls: - tool_call = updated_openai_response.choices[0].message.tool_calls[0].function - else: - msg = "No tool calls found in the GPT response." - raise ValueError(msg) - data_dict = ast.literal_eval(tool_call.arguments) - return list(data_dict.values()) + try: + try: + conf = config.get_config() + except Exception as e: + msg = f"Error loading configuration in Gpt_request generator: {e}" + raise ValueError(msg) from e + + # Initialize Azure OpenAI client + try: + client = AzureOpenAI( + api_key=conf.azure_openai_api_key, + api_version="2024-07-01-preview", + azure_endpoint=conf.azure_openai_endpoint, + ) + except Exception as e: + msg = f"Error initializing AzureOpenAI client: {e}" + raise ValueError(msg) from e + + # Create GPT request + try: + updated_openai_response = client.chat.completions.create( + model="gpt-4o", + temperature=0, + messages=[ + {"role": "system", "content": mainprompt}, + ], + tool_choice="required", + tools=[ + { + "type": "function", + "function": { + "name": "requested_information_precisely_found_in_relevant_documents", + "description": "Submit the requested information. " + "Use this function when the information is precisely stated in the relevant documents.", + "parameters": subprompt, + }, + } + ], + ) + except Exception as e: + msg = f"Error during GPT request creation: {e}" + raise ValueError(msg) from e + + # Extract tool calls from GPT response + try: + if updated_openai_response.choices[0].message.tool_calls: + tool_call = updated_openai_response.choices[0].message.tool_calls[0].function + else: + msg = "No tool calls found in the GPT response." + raise ValueError(msg) # noqa: TRY301 + except Exception as e: # noqa: BLE001 + msg = f"Error extracting tool calls: {e}" + raise ValueError(e) # noqa: B904 + + # Parse tool call arguments + try: + data_dict = ast.literal_eval(tool_call.arguments) + except Exception as e: # noqa: BLE001 + msg = f"Error parsing tool call arguments: {e}" + raise ValueError(msg) # noqa: B904 + + # Convert to list and return + try: + return list(data_dict.values()) + except Exception as e: # noqa: BLE001 + msg = f"Error converting parsed data to list: {e}" + raise ValueError(msg) # noqa: B904 + + except Exception as general_error: # noqa: BLE001 + # General error handling + msg = f"An unexpected error occurred: {general_error}" + raise ValueError(msg) # noqa: B904 diff --git a/src/dataland_qa_lab/review/numeric_value_generator.py b/src/dataland_qa_lab/review/numeric_value_generator.py index c10d3aa..902842d 100644 --- a/src/dataland_qa_lab/review/numeric_value_generator.py +++ b/src/dataland_qa_lab/review/numeric_value_generator.py @@ -1,8 +1,12 @@ +import logging + from azure.ai.documentintelligence.models import AnalyzeResult from dataland_qa_lab.prompting_services import prompting_service from dataland_qa_lab.review import generate_gpt_request +logger = logging.getLogger(__name__) + class NumericValueGenerator: """Extracts and stores all values of template 2 to 5 and compares them to the values in dataland.""" @@ -12,28 +16,62 @@ def get_taxonomy_alligned_denominator(readable_text: AnalyzeResult, kpi: str) -> """Extracts information from template 2 using Azure OpenAI and returns a list of results. Returns: - list: A list including the etracted values of template 2 + list: A list of extracted and converted float values from template 2. """ - dominator_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( - prompting_service.PromptingService.create_main_prompt(2, readable_text, kpi), - prompting_service.PromptingService.create_sub_prompt_template2to4(kpi), - ) - float_results = [float(value) for value in dominator_values] - return float_results + try: + # Generate GPT request + dominator_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( + prompting_service.PromptingService.create_main_prompt(2, readable_text, kpi), + prompting_service.PromptingService.create_sub_prompt_template2to4(kpi), + ) + # Check if the GPT response is empty + if not dominator_values: + logger.warning("Denominator values are empty. No results returned from GPT.") + msg = "No results returned from GPT for denominator values." + raise ValueError(msg) # noqa: TRY301 + # Convert the results to floats + try: + float_results = [float(value) for value in dominator_values] + except Exception as e: + logger.critical(f"Unexpected error during float conversion: {e}") # noqa: G004 + msg = f"Unexpected error during float conversion: {e}" + raise ValueError(msg) from e + return float_results # noqa: TRY300 + except ValueError as e: + logger.critical("Unexpected error in generate_gpt_request: %s", e) + msg = f"Error extracting values from template 2: {e}" + raise ValueError(msg) from e @staticmethod def get_taxonomy_alligned_numerator(readable_text: AnalyzeResult, kpi: str) -> list: """Extracts information from template 3 using Azure OpenAI and returns a list of results. Returns: - list: A list including the etracted values of template 3. + list: A list of extracted and converted float values from template 3. """ - numerator_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( - prompting_service.PromptingService.create_main_prompt(3, readable_text, kpi), - prompting_service.PromptingService.create_sub_prompt_template2to4(kpi), - ) - float_results = [float(value) for value in numerator_values] - return float_results + try: + # Generate GPT request + numerator_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( + prompting_service.PromptingService.create_main_prompt(3, readable_text, kpi), + prompting_service.PromptingService.create_sub_prompt_template2to4(kpi), + ) + # Check if the GPT response is empty + if not numerator_values: + logger.warning("Denominator values are empty. No results returned from GPT.") + msg = "No results returned from GPT for denominator values." + raise ValueError(msg) # noqa: TRY301 + # Convert the results to floats + try: + float_results = [float(value) for value in numerator_values] + except Exception as e: + logger.critical(f"Unexpected error during float conversion: {e}") # noqa: G004 + msg = f"Unexpected error during float conversion: {e}" + raise ValueError(msg) from e + return float_results # noqa: TRY300 + except ValueError as e: + logger.critical("Unexpected error in generate_gpt_request: %s", e) + msg = f"Error extracting values from template 3: {e}" + raise ValueError(msg) from e @staticmethod def get_taxonomy_eligible_not_alligned(readable_text: AnalyzeResult, kpi: str) -> list: @@ -42,12 +80,29 @@ def get_taxonomy_eligible_not_alligned(readable_text: AnalyzeResult, kpi: str) - Returns: list: A list including the etracted values of template 4. """ - eligible_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( - prompting_service.PromptingService.create_main_prompt(4, readable_text, kpi), - prompting_service.PromptingService.create_sub_prompt_template2to4(kpi), - ) - float_results = [float(value) for value in eligible_values] - return float_results + try: + # Generate GPT request + eligible_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( + prompting_service.PromptingService.create_main_prompt(4, readable_text, kpi), + prompting_service.PromptingService.create_sub_prompt_template2to4(kpi), + ) + # Check if the GPT response is empty + if not eligible_values: + logger.warning("Denominator values are empty. No results returned from GPT.") + msg = "No results returned from GPT for denominator values." + raise ValueError(msg) # noqa: TRY301 + # Convert the results to floats + try: + float_results = [float(value) for value in eligible_values] + except Exception as e: + logger.critical(f"Unexpected error during float conversion: {e}") # noqa: G004 + msg = f"Unexpected error during float conversion: {e}" + raise ValueError(msg) from e + return float_results # noqa: TRY300 + except ValueError as e: + logger.critical("Unexpected error in generate_gpt_request: %s", e) + msg = f"Error extracting values from template 4: {e}" + raise ValueError(msg) from e @staticmethod def get_taxonomy_non_eligible(readable_text: AnalyzeResult, kpi: str) -> list: @@ -56,9 +111,26 @@ def get_taxonomy_non_eligible(readable_text: AnalyzeResult, kpi: str) -> list: Returns: list: A list including the extracted values of template 5. """ - non_eligible_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( - prompting_service.PromptingService.create_main_prompt(5, readable_text, kpi), - prompting_service.PromptingService.create_sub_prompt_template5(kpi), - ) - float_results = [float(value) for value in non_eligible_values] - return float_results + try: + # Generate GPT request + non_eligible_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( + prompting_service.PromptingService.create_main_prompt(5, readable_text, kpi), + prompting_service.PromptingService.create_sub_prompt_template5(kpi), + ) + # Check if the GPT response is empty + if not non_eligible_values: + logger.warning("Denominator values are empty. No results returned from GPT.") + msg = "No results returned from GPT for denominator values." + raise ValueError(msg) # noqa: TRY301 + # Convert the results to floats + try: + float_results = [float(value) for value in non_eligible_values] + except Exception as e: + logger.critical(f"Unexpected error during float conversion: {e}") # noqa: G004 + msg = f"Unexpected error during float conversion: {e}" + raise ValueError(msg) from e + return float_results # noqa: TRY300 + except ValueError as e: + logger.critical("Unexpected error in generate_gpt_request: %s", e) + msg = f"Error extracting values from template 5: {e}" + raise ValueError(msg) from e diff --git a/src/dataland_qa_lab/review/report_generator/denominator_report_generator.py b/src/dataland_qa_lab/review/report_generator/denominator_report_generator.py index 67e78e1..3088323 100644 --- a/src/dataland_qa_lab/review/report_generator/denominator_report_generator.py +++ b/src/dataland_qa_lab/review/report_generator/denominator_report_generator.py @@ -34,8 +34,23 @@ def build_denominator_report_frame( dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult, kpi: str ) -> QaReportDataPointExtendedDataPointNuclearAndGasAlignedDenominator: """Build a report frame for a specific KPI denominator (Revenue or CapEx).""" - prompted_values = NumericValueGenerator.get_taxonomy_alligned_denominator(relevant_pages, kpi) - dataland_values = get_dataland_values(dataset, kpi) + try: + prompted_values = NumericValueGenerator.get_taxonomy_alligned_denominator(relevant_pages, kpi) + except Exception: # noqa: BLE001 + return QaReportDataPointExtendedDataPointNuclearAndGasAlignedDenominator( + comment="Error retrieving prompted values for template 2", + verdict=QaReportDataPointVerdict.QANOTATTEMPTED, + correctedData=ExtendedDataPointNuclearAndGasAlignedDenominator(), + ) + + try: + dataland_values = get_dataland_values(dataset, kpi) + except Exception: # noqa: BLE001 + return QaReportDataPointExtendedDataPointNuclearAndGasAlignedDenominator( + comment="Error retrieving dataland values for template 2", + verdict=QaReportDataPointVerdict.QANOTATTEMPTED, + correctedData=ExtendedDataPointNuclearAndGasAlignedDenominator(), + ) corrected_values, verdict, comment, quality = comparator.compare_values_template_2to4( prompted_values, dataland_values, NuclearAndGasAlignedDenominator @@ -60,10 +75,14 @@ def build_denominator_report_frame( def get_dataland_values(dataset: NuclearAndGasDataCollection, kpi: str) -> dict: """Retrieve dataland denominator values based on KPI.""" - if kpi == "Revenue": - data = data_provider.get_taxonomy_aligned_revenue_denominator_values_by_data(dataset) - else: - data = data_provider.get_taxonomy_aligned_capex_denominator_values_by_data(dataset) + try: + if kpi == "Revenue": + data = data_provider.get_taxonomy_aligned_revenue_denominator_values_by_data(dataset) + else: + data = data_provider.get_taxonomy_aligned_capex_denominator_values_by_data(dataset) + except Exception as e: + msg = f"Error retrieving dataland values for {kpi}: {e}" + raise RuntimeError(msg) from e return data diff --git a/tests/review/test_denominator_report_generator.py b/tests/review/test_denominator_report_generator.py index 7aa0ff3..3454ad7 100644 --- a/tests/review/test_denominator_report_generator.py +++ b/tests/review/test_denominator_report_generator.py @@ -157,3 +157,42 @@ def test_generate_taxonomy_aligned_denominator_report_edge_cases(mock_generate_g assert report is not None assert report.verdict == QaReportDataPointVerdict.QAREJECTED assert report.corrected_data.quality == "NoDataFound" + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +@patch("dataland_qa_lab.dataland.data_provider.get_taxonomy_aligned_revenue_denominator_values_by_data") +def test_generate_revenue_denominator_report_frame_not_attempted( + mock_get_dataland_values: Mock, mock_generate_gpt_request: Mock +) -> None: + dataset, relevant_pages = provide_test_data_collection() + + # Simulate an exception in dataland value retrieval + mock_generate_gpt_request.side_effect = Exception("Mock GPT error") + report = report_generator.build_denominator_report_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving prompted values for template 2" in report.comment + + # Simulate an exception in dataland retrieval + mock_generate_gpt_request.side_effect = None + mock_get_dataland_values.side_effect = Exception("Mock dataland error") + report = report_generator.build_denominator_report_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving dataland values for template 2" in report.comment + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_generate_taxonomy_aligned_denominator_report_edge_cases_not_attempted(mock_generate_gpt_request: Mock) -> None: + dataset, relevant_pages = provide_test_data_collection() + + # Simulate an exception in the GPT request generation + mock_generate_gpt_request.side_effect = Exception("Mock GPT error") + + report = report_generator.build_denominator_report_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving prompted values for template 2" in report.comment