Skip to content

Commit

Permalink
Chore: pre-commit autoupdate
Browse files Browse the repository at this point in the history
  • Loading branch information
pre-commit-ci[bot] committed Aug 19, 2024
1 parent 86f3fa4 commit 662b675
Show file tree
Hide file tree
Showing 21 changed files with 430 additions and 136 deletions.
16 changes: 12 additions & 4 deletions osc_extraction_utils/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@ def create_multiple_xlsx_files(path_folder: Path) -> None:
create_single_xlsx_file(path_folder, file_name=f"xlsx_file_{i}.xlsx")


def modify_project_settings(project_settings: typing.Dict, *args: typing.Tuple[str, str, bool]) -> typing.Dict:
def modify_project_settings(
project_settings: typing.Dict, *args: typing.Tuple[str, str, bool]
) -> typing.Dict:
"""Returns a modified project settings dict based on the input args
:param project_settings: Project settings
Expand Down Expand Up @@ -128,7 +130,9 @@ def s3_settings() -> S3Settings:
# TODO add test mode paths?
@pytest.fixture(scope="session")
def project_paths(main_settings: MainSettings) -> ProjectPaths:
return ProjectPaths("test_project", main_settings, Path(__file__).parents[1].resolve())
return ProjectPaths(
"test_project", main_settings, Path(__file__).parents[1].resolve()
)


@pytest.fixture(scope="session")
Expand All @@ -155,8 +159,12 @@ def prerequisites_generate_text(
write_to_file(path_current_file, f"That is a test {i}", "HEADER")

with (
patch.object(project_paths, "path_folder_relevance", Path(path_folder_relevance)),
patch.object(project_paths, "path_folder_text_3434", Path(path_folder_text_3434)),
patch.object(
project_paths, "path_folder_relevance", Path(path_folder_relevance)
),
patch.object(
project_paths, "path_folder_text_3434", Path(path_folder_text_3434)
),
patch("osc_extraction_utils.merger.os.getenv", lambda *args: args[0]),
):
yield
Expand Down
12 changes: 9 additions & 3 deletions osc_extraction_utils/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ def convert(self) -> None:


class XlsToCsvConverter(Converter):
def __init__(self, path_folder_source: Path = Path(), path_folder_destination: Path = Path()):
def __init__(
self, path_folder_source: Path = Path(), path_folder_destination: Path = Path()
):
self.path_folder_source: Path = path_folder_source
self.path_folder_destination: Path = path_folder_destination

Expand Down Expand Up @@ -44,7 +46,9 @@ def convert(self) -> None:
self._convert_single_file_to_csv(list_paths_xlsx_files[0])

def _find_xlsx_files_in_source_folder(self) -> list[Path]:
list_paths_xlsx_files: list[Path] = list(self._path_folder_source.glob("*.xlsx"))
list_paths_xlsx_files: list[Path] = list(
self._path_folder_source.glob("*.xlsx")
)
return list_paths_xlsx_files

def _check_for_valid_paths(self) -> None:
Expand All @@ -62,5 +66,7 @@ def _check_xlsx_files(self, list_paths_xlsx_files: list[Path]) -> None:
def _convert_single_file_to_csv(self, path_file: Path) -> None:
print(f"Converting {path_file} to csv-format")
df_read_excel: pd.DataFrame = pd.read_excel(path_file, engine="openpyxl")
path_csv_file: Path = self._path_folder_destination / "aggregated_annotation.csv"
path_csv_file: Path = (
self._path_folder_destination / "aggregated_annotation.csv"
)
df_read_excel.to_csv(path_csv_file, index=False, header=True)
26 changes: 20 additions & 6 deletions osc_extraction_utils/core_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,16 @@ def _delete_file(path_file: Path) -> None:
print("Failed to delete %s. Reason: %s" % (str(path_file), exception))


def copy_file_without_overwrite(path_folder_source_as_str: str, path_folder_destination_as_str: str) -> bool:
def copy_file_without_overwrite(
path_folder_source_as_str: str, path_folder_destination_as_str: str
) -> bool:
path_folder_source = Path(path_folder_source_as_str)
path_folder_destination = Path(path_folder_destination_as_str)

for path_file_current_source in path_folder_source.iterdir():
path_file_current_destination = path_folder_destination / path_file_current_source.name
path_file_current_destination = (
path_folder_destination / path_file_current_source.name
)
if not path_file_current_destination.exists():
shutil.copyfile(path_file_current_source, path_file_current_destination)
return True
Expand Down Expand Up @@ -62,14 +66,24 @@ def copy_file_without_overwrite(path_folder_source_as_str: str, path_folder_dest


def download_data_from_s3_main_bucket_to_local_folder_if_required(
s3_bucket: S3Communication, path_s3_with_prefix_folder: Path, path_local_folder: Path, main_settings: MainSettings
s3_bucket: S3Communication,
path_s3_with_prefix_folder: Path,
path_local_folder: Path,
main_settings: MainSettings,
):
if main_settings.general.s3_usage:
s3_bucket.download_files_in_prefix_to_dir(path_s3_with_prefix_folder, path_local_folder)
s3_bucket.download_files_in_prefix_to_dir(
path_s3_with_prefix_folder, path_local_folder
)


def upload_data_from_local_folder_to_s3_interim_bucket_if_required(
s3_bucket: S3Communication, path_local_folder: Path, path_s3_with_prefix_folder: Path, main_settings: MainSettings
s3_bucket: S3Communication,
path_local_folder: Path,
path_s3_with_prefix_folder: Path,
main_settings: MainSettings,
):
if main_settings.general.s3_usage:
s3_bucket.upload_files_in_dir_to_prefix(path_local_folder, path_s3_with_prefix_folder)
s3_bucket.upload_files_in_dir_to_prefix(
path_local_folder, path_s3_with_prefix_folder
)
61 changes: 49 additions & 12 deletions osc_extraction_utils/merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@

class Merger:
# TODO finish Merger class
def __init__(self, main_settings: MainSettings, s3_settings: S3Settings, project_paths: ProjectPaths) -> None:
def __init__(
self,
main_settings: MainSettings,
s3_settings: S3Settings,
project_paths: ProjectPaths,
) -> None:
self.main_settings: MainSettings = main_settings
self.s3_settings: S3Settings = s3_settings
self.project_paths: ProjectPaths = project_paths
Expand All @@ -33,7 +38,9 @@ def _return_s3_communication_interim(self) -> S3Communication:
return S3Communication(
s3_endpoint_url=os.getenv(self.s3_settings.interim_bucket.s3_endpoint),
aws_access_key_id=os.getenv(self.s3_settings.interim_bucket.s3_access_key),
aws_secret_access_key=os.getenv(self.s3_settings.interim_bucket.s3_secret_key),
aws_secret_access_key=os.getenv(
self.s3_settings.interim_bucket.s3_secret_key
),
s3_bucket=os.getenv(self.s3_settings.interim_bucket.s3_bucket_name),
)

Expand All @@ -47,7 +54,9 @@ def _download_inference_related_files_from_s3(self) -> None:
/ "Text"
)
# TODO wrong type
self.s3_communication_main.download_files_in_prefix_to_dir(str(path_file_related_s3), str(self.project_paths.path_folder_relevance)) # type: ignore
self.s3_communication_main.download_files_in_prefix_to_dir(
str(path_file_related_s3), str(self.project_paths.path_folder_relevance)
) # type: ignore

def _upload_inference_related_files_to_s3(self) -> None:
path_file_upload_to_s3: Path = (
Expand All @@ -59,12 +68,20 @@ def _upload_inference_related_files_to_s3(self) -> None:
/ "text_3434.csv"
)
# TODO wrong type
self.s3_communication_interim.upload_file_to_s3(filepath=str(path_file_upload_to_s3), s3_prefix=str(path_file_upload_to_s3.parent), s3_key=str(path_file_upload_to_s3.name)) # type: ignore
self.s3_communication_interim.upload_file_to_s3(
filepath=str(path_file_upload_to_s3),
s3_prefix=str(path_file_upload_to_s3.parent),
s3_key=str(path_file_upload_to_s3.name),
) # type: ignore

def _weird_writing_stuff(self) -> bool:
with open(str(self.project_paths.path_folder_text_3434) + r"/text_3434.csv", "w") as file_out:
with open(
str(self.project_paths.path_folder_text_3434) + r"/text_3434.csv", "w"
) as file_out:
very_first = True
rel_inf_list = list(glob.iglob(str(self.project_paths.path_folder_relevance) + r"/*.csv"))
rel_inf_list = list(
glob.iglob(str(self.project_paths.path_folder_relevance) + r"/*.csv")
)
if len(rel_inf_list) == 0:
print("No relevance inference results found.")
return False
Expand All @@ -84,7 +101,12 @@ def _weird_writing_stuff(self) -> bool:
return False


def generate_text_3434(project_name: str, s3_usage: bool, s3_settings: S3Settings, project_paths: ProjectPaths):
def generate_text_3434(
project_name: str,
s3_usage: bool,
s3_settings: S3Settings,
project_paths: ProjectPaths,
):
"""
This function merges all infer relevance outputs into one large file, which is then
used to train the kpi extraction model.
Expand All @@ -102,12 +124,25 @@ def generate_text_3434(project_name: str, s3_usage: bool, s3_settings: S3Setting
s3_bucket=os.getenv(s3_settings.main_bucket.s3_bucket_name),
)
# Download infer relevance files
prefix_rel_infer = str(Path(s3_settings.prefix) / project_name / "data" / "output" / "RELEVANCE" / "Text")
s3c_main.download_files_in_prefix_to_dir(prefix_rel_infer, str(project_paths.path_folder_relevance))
prefix_rel_infer = str(
Path(s3_settings.prefix)
/ project_name
/ "data"
/ "output"
/ "RELEVANCE"
/ "Text"
)
s3c_main.download_files_in_prefix_to_dir(
prefix_rel_infer, str(project_paths.path_folder_relevance)
)

with open(str(project_paths.path_folder_text_3434) + r"/text_3434.csv", "w") as file_out:
with open(
str(project_paths.path_folder_text_3434) + r"/text_3434.csv", "w"
) as file_out:
very_first = True
rel_inf_list = list(glob.iglob(str(project_paths.path_folder_relevance) + r"/*.csv"))
rel_inf_list = list(
glob.iglob(str(project_paths.path_folder_relevance) + r"/*.csv")
)
if len(rel_inf_list) == 0:
print("No relevance inference results found.")
return False
Expand All @@ -132,7 +167,9 @@ def generate_text_3434(project_name: str, s3_usage: bool, s3_settings: S3Setting
aws_secret_access_key=os.getenv(s3_settings.interim_bucket.s3_secret_key),
s3_bucket=os.getenv(s3_settings.interim_bucket.s3_bucket_name),
)
project_prefix_text3434 = str(Path(s3_settings.prefix) / project_name / "data" / "interim" / "ml")
project_prefix_text3434 = str(
Path(s3_settings.prefix) / project_name / "data" / "interim" / "ml"
)
s3c_interim.upload_file_to_s3(
filepath=str(project_paths.path_folder_text_3434) + r"/text_3434.csv",
s3_prefix=project_prefix_text3434,
Expand Down
49 changes: 38 additions & 11 deletions osc_extraction_utils/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,26 +32,44 @@ class ProjectPaths(BaseSettings):
path_folder_source_annotation: Path = Field(default=Path("input/annotations"))
path_folder_source_mapping: Path = Field(default=Path("input/kpi_mapping"))
path_folder_destination_pdf: Path = Field(default=Path("interim/pdfs"))
path_folder_destination_annotation: Path = Field(default=Path("interim/ml/annotations"))
path_folder_destination_annotation: Path = Field(
default=Path("interim/ml/annotations")
)
path_folder_destination_mapping: Path = Field(default=Path("interim/kpi_mapping"))
path_folder_destination_extraction: Path = Field(default=Path("interim/ml/extraction"))
path_folder_destination_extraction: Path = Field(
default=Path("interim/ml/extraction")
)
path_folder_destination_curation: Path = Field(default=Path("interim/ml/curation"))
path_folder_destination_training: Path = Field(default=Path("interim/ml/training"))

path_folder_destination_saved_models_relevance: Path = Field(default=Path("RELEVANCE/Text"))
path_folder_destination_saved_models_inference: Path = Field(default=Path("KPI_EXTRACTION/Text"))
path_folder_destination_saved_models_relevance: Path = Field(
default=Path("RELEVANCE/Text")
)
path_folder_destination_saved_models_inference: Path = Field(
default=Path("KPI_EXTRACTION/Text")
)

path_folder_text_3434: Path = Field(default=Path("interim/ml"))
path_folder_relevance: Path = Field(default=Path("output/RELEVANCE/Text"))

def __init__(self, string_project_name: str, main_settings: MainSettings, path_folder_root: Path, **kwargs):
def __init__(
self,
string_project_name: str,
main_settings: MainSettings,
path_folder_root: Path,
**kwargs,
):
super().__init__(**kwargs)
if not isinstance(string_project_name, str):
raise TypeError
self._string_project_name: str = string_project_name
self._PATH_FOLDER_ROOT = path_folder_root.resolve()
self._path_project_data_folder: Path = self._PATH_FOLDER_DATA / Path(string_project_name)
self._path_project_model_folder: Path = self._PATH_FOLDER_MODEL / Path(string_project_name)
self._path_project_data_folder: Path = self._PATH_FOLDER_DATA / Path(
string_project_name
)
self._path_project_model_folder: Path = self._PATH_FOLDER_MODEL / Path(
string_project_name
)
self._main_settings: MainSettings = main_settings
self._update_all_paths_depending_on_path_project_data_folder()
self._update_all_paths_depending_on_path_project_model_folder()
Expand Down Expand Up @@ -118,13 +136,19 @@ def PYTHON_EXECUTABLE(self) -> str:

def _update_all_paths_depending_on_path_project_data_folder(self) -> None:
list_paths_model_fields_filtered: list[str] = [
path_model_field for path_model_field in self.model_fields.keys() if "saved_models" not in path_model_field
path_model_field
for path_model_field in self.model_fields.keys()
if "saved_models" not in path_model_field
]

for path_field in list_paths_model_fields_filtered:
path_field_default: Path = self.model_fields[path_field].default
setattr(
self, f"{path_field}", self._PATH_FOLDER_DATA / Path(self._string_project_name) / path_field_default
self,
f"{path_field}",
self._PATH_FOLDER_DATA
/ Path(self._string_project_name)
/ path_field_default,
)

def _update_all_root_related_paths(self) -> None:
Expand All @@ -133,7 +157,9 @@ def _update_all_root_related_paths(self) -> None:
self._PATH_FOLDER_DATA: Path = self.PATH_FOLDER_ROOT / "data"

def _create_all_root_related_folders(self) -> None:
self._PATH_FOLDER_MODEL.mkdir(parents=True, exist_ok=True) # includes root folder
self._PATH_FOLDER_MODEL.mkdir(
parents=True, exist_ok=True
) # includes root folder
self._PATH_FOLDER_DATA.mkdir(exist_ok=True)

def _update_all_paths_depending_on_path_project_model_folder(self) -> None:
Expand All @@ -147,7 +173,8 @@ def _update_all_paths_depending_on_path_project_model_folder(self) -> None:
]

for string_model_field, path_main_settings in zip(
list_string_paths_depending_on_path_project_model_folder, list_paths_main_settings
list_string_paths_depending_on_path_project_model_folder,
list_paths_main_settings,
):
setattr(
self,
Expand Down
Loading

0 comments on commit 662b675

Please sign in to comment.