Skip to content

Commit

Permalink
Ambiguity checker for Excel using ZIP and XML parsing, with no third-party library
Browse files Browse the repository at this point in the history
Signed-off-by: Jerry Guo <Jerry.Jinfeng.Guo@alliander.com>
  • Loading branch information
Jerry-Jinfeng-Guo committed Jul 2, 2024
1 parent 4004d24 commit 18ef330
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 0 deletions.
122 changes: 122 additions & 0 deletions src/power_grid_model_io/utils/excel_ambiguity_checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# SPDX-FileCopyrightText: Contributors to the Power Grid Model project <powergridmodel@lfenergy.org>
#
# SPDX-License-Identifier: MPL-2.0

import xml.etree.ElementTree as ET
import zipfile
from collections import Counter

XML_NAME_SPACE = {"": "http://schemas.openxmlformats.org/spreadsheetml/2006/main"}


class ExcelAmbiguityChecker:
    """
    A class to check for ambiguous column names within the sheets of an Excel (.xlsx) file.

    The workbook is read directly as a ZIP archive of XML parts; no third-party library is used.

    Attributes:
        _file_path: The path (or binary file-like object) of the Excel file to be checked.
        _col_name_in_row (int): Position (0-based) of the header row among the rows present in a sheet.
        sheets (dict): Sheet names as keys and lists of column names as values. Empty cells are stored
            as None. Sheets without a header row at the requested position are not included.

    Methods:
        check_ambiguity(self): Reports whether any sheet contains duplicated column names.
    """

    # Attribute on <sheet> in workbook.xml that points into xl/_rels/workbook.xml.rels
    _REL_ID_ATTR = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
    # Tag of a single entry in a package relationships part
    _PKG_REL_TAG = "{http://schemas.openxmlformats.org/package/2006/relationships}Relationship"

    def __init__(self, file_path, column_name_in_row=0) -> None:
        """
        Initializes the ExcelAmbiguityChecker and immediately parses the given Excel file.

        Parameters:
            file_path: The path to the Excel file (anything accepted by zipfile.ZipFile).
            column_name_in_row (int): The row index (0-based) where column names are expected. Default is 0.
        """
        self._file_path = file_path
        self._col_name_in_row = column_name_in_row
        self.sheets: dict = {}
        self._parse_excel_file()

    @staticmethod
    def _join_text_runs(element) -> str:
        """
        Concatenates the text of a string item (<si> or <is>).

        A string item holds either one plain <t> or several rich-text runs <r><t>…</t></r>.
        Phonetic runs (<rPh>) are deliberately not included.
        """
        runs = element.findall("t", namespaces=XML_NAME_SPACE) + element.findall("r/t", namespaces=XML_NAME_SPACE)
        return "".join(run.text or "" for run in runs)

    def _parse_zip(self, zip_file) -> list:
        """
        Parses the shared strings XML file within the Excel ZIP archive to extract all shared strings.

        Parameters:
            zip_file (zipfile.ZipFile): The opened Excel ZIP file.

        Returns:
            list: One string per <si> entry, in table order, so that cell indices line up.
                Empty when the workbook has no shared strings part.
        """
        shared_strings_path = "xl/sharedStrings.xml"
        try:
            shared_strings_file = zip_file.open(shared_strings_path)
        except KeyError:
            # The part is optional: a workbook without any string cell does not have it.
            return []
        with shared_strings_file as f:
            tree = ET.parse(f)
        # Iterate over <si>, not <t>: a rich-text entry has several <t> children but is ONE table slot.
        return [self._join_text_runs(si) for si in tree.findall(".//si", namespaces=XML_NAME_SPACE)]

    def _get_column_names_from_row(self, row, shared_strings) -> list:
        """
        Extracts column names from a specified row using shared strings for strings stored in the shared string table.

        Parameters:
            row (xml.etree.ElementTree.Element): The XML element representing the row.
            shared_strings (list): A list of shared strings extracted from the Excel file.

        Returns:
            list: The column names found in the row; None for cells without a value.
        """
        column_names = []
        for c in row.findall(".//c", namespaces=XML_NAME_SPACE):
            cell_type = c.get("t")
            value = c.find(".//v", namespaces=XML_NAME_SPACE)
            if cell_type == "s" and value is not None:
                # <v> holds an index into the shared string table
                column_names.append(shared_strings[int(value.text)])
            elif cell_type == "inlineStr":
                # Inline strings keep their text in <is> and have no <v>
                inline = c.find("is", namespaces=XML_NAME_SPACE)
                text = self._join_text_runs(inline) if inline is not None else ""
                column_names.append(text or None)
            elif value is not None:
                column_names.append(value.text)
            else:
                column_names.append(None)
        return column_names

    def _read_sheet_targets(self, zip_file) -> dict:
        """
        Reads the workbook relationships and maps relationship ids to archive member names.

        Parameters:
            zip_file (zipfile.ZipFile): The opened Excel ZIP file.

        Returns:
            dict: Relationship id -> member path inside the archive. Empty if the part is missing.
        """
        try:
            rels_xml = zip_file.read("xl/_rels/workbook.xml.rels")
        except KeyError:
            return {}
        targets = {}
        for rel in ET.fromstring(rels_xml).iter(self._PKG_REL_TAG):
            rel_id = rel.get("Id")
            target = rel.get("Target")
            if rel_id is None or target is None:
                continue
            # A leading "/" means package-absolute; otherwise the target is relative to xl/
            targets[rel_id] = target[1:] if target.startswith("/") else f"xl/{target}"
        return targets

    def _parse_excel_file(self) -> None:
        """
        Parses the Excel file to extract sheet names and their corresponding column names.

        Each sheet is located through its relationship id; when that cannot be resolved the
        conventional ``xl/worksheets/sheet{N}.xml`` name is used instead. Sheets that do not
        contain a row at the configured header position are skipped.
        """
        with zipfile.ZipFile(self._file_path) as z:
            shared_strings = self._parse_zip(z)
            sheet_targets = self._read_sheet_targets(z)
            member_names = set(z.namelist())
            xml_tree = ET.fromstring(z.read("xl/workbook.xml"))
            sheets = xml_tree.findall(".//sheet", namespaces=XML_NAME_SPACE)

            for index, sheet in enumerate(sheets, start=1):
                sheet_name = sheet.get("name")
                sheet_file_path = sheet_targets.get(sheet.get(self._REL_ID_ATTR))
                if sheet_file_path not in member_names:
                    # No usable relationship: fall back to the positional naming convention
                    sheet_file_path = f"xl/worksheets/sheet{index}.xml"

                with z.open(sheet_file_path) as f:
                    sheet_tree = ET.parse(f)
                rows = sheet_tree.findall(".//row", namespaces=XML_NAME_SPACE)
                try:
                    header_row = rows[self._col_name_in_row]
                except IndexError:
                    # Empty sheet, or fewer rows than the requested header position
                    continue
                self.sheets[sheet_name] = self._get_column_names_from_row(header_row, shared_strings)

    def check_ambiguity(self) -> bool:
        """
        Check if there is ambiguity in column names within any sheet.

        Every sheet with duplicated names is reported on stdout. Empty header cells are not
        names and are therefore ignored when counting.

        Returns:
            bool: True if at least one sheet has a column name that occurs more than once.
        """
        res = False
        for sheet_name, column_names in self.sheets.items():
            column_name_counts = Counter(name for name in column_names if name)
            duplicates = [name for name, count in column_name_counts.items() if count > 1]
            if duplicates:
                print(f"In sheet: {sheet_name}, ambiguious column names: {duplicates}\n")
                res = True
        return res


# Example usage: parse "data.xlsx" from the working directory and print
# every sheet whose header row contains a repeated column name.
if __name__ == "__main__":
    excel_file_checker = ExcelAmbiguityChecker("data.xlsx")
    excel_file_checker.check_ambiguity()
6 changes: 6 additions & 0 deletions tests/unit/converters/test_vision_excel_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,9 @@ def test_get_appliance_id(converter: VisionExcelConverter):

with pytest.raises(KeyError):
converter.get_appliance_id(table="Sources", node_number=1, sub_number=3)

@pytest.mark.parametrize("language", ["nl"])
def test_ambiguity_in_vision_excel(language: str):
    """Smoke test: converting the sample Vision workbook must complete without raising.

    The parametrized ``language`` has to appear in the signature; pytest rejects a
    parametrize decorator that names an argument the test function does not accept.
    """
    test_file = Path(__file__).parent.parent / "data" / "vision" / "OS_RAUWERD_Definitief_netwerk_1.xlsx"
    converter = VisionExcelConverter(source_file=test_file, mapping_file=DEFAULT_MAPPING_FILE, language=language)
    converter.convert()

0 comments on commit 18ef330

Please sign in to comment.