Finishing error-free lint setup
Signed-off-by: ToWaIDS <tobias.watzel@investmentdataservices.com>
tobias-watzel committed Feb 27, 2024
1 parent 833ee7f commit 46886de
Showing 18 changed files with 183 additions and 173 deletions.
92 changes: 52 additions & 40 deletions data/TEST/settings.yaml
@@ -1,62 +1,70 @@
# SETTINGS FILE FOR PROJECT TEST
---
general:
ext_ip: '172.30.15.68'
ext_ip: 172.30.15.68
ext_port: 4000
infer_ip: '172.30.88.213'
infer_ip: 172.30.88.213
infer_port: 6000
rb_ip: '172.30.224.91'
rb_ip: 172.30.224.91
rb_port: 8000
delete_interim_files: true
#All the parameters for exporting data
data_export:
enable_db_export: false
db_dialect: oracle
db_sql_driver: cx_oracle
db_host: ''
db_port: '1521'
db_user: ''
db_password: ''
db_post_command: ''
# Next follow the parameters for the NLP Machine Learning Model
# All the input parameters for pdf text extraction stage
db_host: ""
db_port: "1521"
db_user: ""
db_password: ""
db_post_command: ""
extraction:
min_paragraph_length: 20
seed: 42
annotation_folder:
annotation_folder: null
skip_extracted_files: true
use_extractions: true
store_extractions: true
# All the input parameters for curation stage
curation:
retrieve_paragraph: false
neg_pos_ratio: 1
columns_to_read: ["company", "source_file", "source_page", "kpi_id", "year", "answer", "data_type", "relevant_paragraphs"]
columns_to_read:
- company
- source_file
- source_page
- kpi_id
- year
- answer
- data_type
- relevant_paragraphs
company_to_exclude: []
create_neg_samples: true
min_length_neg_sample: 50
seed: 41
# All the input parameters for the relevance training stage
train_relevance:
base_model: roberta-base
input_model_name:
input_model_name: null
output_model_name: TEST_1
train: true
seed: 42
processor: # farm.processor TextPairClassificationProcessor input
processor:
proc_max_seq_len: 512
proc_dev_split: 0.2
proc_label_list: ['0', '1']
proc_label_list:
- "0"
- "1"
proc_label_column_name: label
proc_delimiter: ","
proc_metric: acc
model: # farm.model TextClassificationHead input
model_layer_dims: [768, 2]
model_lm_output_types: ["per_sequence"]
training: # multiple farm input parameter for training
model:
model_layer_dims:
- 768
- 2
model_lm_output_types:
- per_sequence
training:
run_hyp_tuning: false
use_amp: true
distributed: false
learning_rate: 1.0e-05
learning_rate: 0.00001
n_epochs: 10
evaluate_every: 100
dropout: 0.2
@@ -65,19 +73,20 @@ train_relevance:
run_cv: false
xval_folds: 5
max_processes: 128
# All the input parameters for the application of inferance on relevance data in the training stage
infer_relevance:
skip_processed_files: true
batch_size: 16
gpu: true
num_processes:
num_processes: null
disable_tqdm: true
kpi_questions: []
sectors: ["OG", "CM", "CU"]
sectors:
- OG
- CM
- CU
return_class_probs: false
# All the input parameters for the kpi training stage
train_kpi:
input_model_name:
input_model_name: null
output_model_name: TEST_1
base_model: a-ware/roberta-large-squadv2
train: true
@@ -89,22 +98,27 @@ train_kpi:
create_unanswerable: true
data:
perform_splitting: true
dev_split: .2
dev_split: 0.2
mlflow:
track_experiment: false
url: http://localhost:5000
processor:
max_seq_len: 384
label_list: ["start_token", "end_token"]
label_list:
- start_token
- end_token
metric: squad
model:
model_layer_dims: [768, 2]
model_lm_output_types: ["per_token"]
model_layer_dims:
- 768
- 2
model_lm_output_types:
- per_token
training:
run_hyp_tuning: false
use_amp: true
distributed: false
learning_rate: 1.0e-05
learning_rate: 0.00001
n_epochs: 10
evaluate_every: 100
dropout: 0.3
@@ -113,16 +127,14 @@ train_kpi:
run_cv: false
xval_folds: 5
metric: f1
max_processes: 1 #processes used for splitting up the data. Leads in the moment to issues when not 1
# All the input parameters for the application of kpi inferance
max_processes: 1
infer_kpi:
skip_processed_files: false # If set to True, will skip inferring on already processed files
skip_processed_files: false
top_k: 4
batch_size: 16
gpu: true
num_processes: # Set to value 1 (or 0) to disable multiprocessing. Set to None to let Inferencer use all CPU cores minus one.
no_ans_boost: -15 # If increased, this will boost "No Answer" as prediction. Use large negative values (like -100) to disable giving "No answer" option.
#Rule-based settings
num_processes: null
no_ans_boost: -15
rule_based:
verbosity: 2
use_docker: true
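The relinted settings should parse to the same Python structures as before: block-style sequences are just another spelling of the old flow-style lists, an explicit null loads as None exactly like the former empty scalar, and 0.00001 is the same float as 1.0e-05. A minimal sketch of checking this, assuming PyYAML is installed and the script runs from the repository root (the assertions are illustrative, not part of the repository):

import yaml

with open("data/TEST/settings.yaml") as f:
    settings = yaml.safe_load(f)

# Block-style sequences load as plain lists, same as the old flow style.
assert settings["curation"]["columns_to_read"][0] == "company"
# An explicit `null` loads as None, the value the old empty scalar also produced.
assert settings["extraction"]["annotation_folder"] is None
# 0.00001 and 1.0e-05 are the same float, so the learning rate is unchanged.
assert settings["train_relevance"]["training"]["learning_rate"] == 1e-05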
3 changes: 1 addition & 2 deletions data/s3_settings.yaml
@@ -1,4 +1,4 @@
# global variables in the docker/pod for the s3 connection where the input and output will be stored
---
main_bucket:
s3_endpoint: LANDING_AWS_ENDPOINT
s3_access_key: LANDING_AWS_ACCESS_KEY
@@ -9,5 +9,4 @@ interim_bucket:
s3_access_key: INTERIM_AWS_ACCESS_KEY
s3_secret_key: INTERIM_AWS_SECRET_KEY
s3_bucket_name: INTERIM_AWS_BUCKET_NAME
# variables necessary to find the files in s3 bucket
prefix: corporate_data_extraction_projects
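The values stored under each bucket are names of environment variables, not literal credentials, so whatever consumes this file has to resolve them at runtime. A rough sketch of that resolution, assuming PyYAML; the helper below is hypothetical and not code from this repository:

import os

import yaml

def resolve_bucket(settings_path: str, bucket: str) -> dict[str, str]:
    """Map the placeholder names in s3_settings.yaml to their environment values."""
    with open(settings_path) as f:
        s3_settings = yaml.safe_load(f)
    # Each entry such as s3_access_key: LANDING_AWS_ACCESS_KEY names an env var to look up.
    return {key: os.environ.get(value, "") for key, value in s3_settings[bucket].items()}

main_credentials = resolve_bucket("data/s3_settings.yaml", "main_bucket")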
16 changes: 7 additions & 9 deletions src/osc_extraction_utils/merger.py
@@ -8,6 +8,7 @@


class Merger:
# TODO finish Merger class
def __init__(self, main_settings: MainSettings, s3_settings: S3Settings, project_paths: ProjectPaths) -> None:
self.main_settings: MainSettings = main_settings
self.s3_settings: S3Settings = s3_settings
@@ -45,9 +46,8 @@ def _download_inference_related_files_from_s3(self) -> None:
/ "RELEVANCE"
/ "Text"
)
self.s3_communication_main.download_files_in_prefix_to_dir(
str(path_file_related_s3), str(self.project_paths.path_folder_relevance)
)
# TODO wrong type
self.s3_communication_main.download_files_in_prefix_to_dir(str(path_file_related_s3), str(self.project_paths.path_folder_relevance)) # type: ignore

def _upload_inference_related_files_to_s3(self) -> None:
path_file_upload_to_s3: Path = (
@@ -58,13 +58,10 @@ def _upload_inference_related_files_to_s3(self) -> None:
/ "ml"
/ "text_3434.csv"
)
self.s3_communication_interim.upload_file_to_s3(
filepath=str(path_file_upload_to_s3),
s3_prefix=str(path_file_upload_to_s3.parent),
s3_key=str(path_file_upload_to_s3.name),
)
# TODO wrong type
self.s3_communication_interim.upload_file_to_s3(filepath=str(path_file_upload_to_s3), s3_prefix=str(path_file_upload_to_s3.parent), s3_key=str(path_file_upload_to_s3.name)) # type: ignore

def _weird_writing_stuff(self) -> None:
def _weird_writing_stuff(self) -> bool:
with open(str(self.project_paths.path_folder_text_3434) + r"/text_3434.csv", "w") as file_out:
very_first = True
rel_inf_list = list(glob.iglob(str(self.project_paths.path_folder_relevance) + r"/*.csv"))
@@ -82,6 +79,7 @@ def _weird_writing_stuff(self) -> None:
file_out.write(line)
first = False
very_first = False
return True # TODO added here to conform to mypy, is this required?
except Exception:
return False
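The trailing return True answers that TODO in part: once the signature is changed to -> bool, mypy flags any path that can fall off the end of the function and implicitly return None, so every branch has to return a bool explicitly. A standalone illustration of the pattern, not the Merger code itself:

def write_all(lines: list[str], path: str) -> bool:
    """Return True on success, False if writing fails."""
    try:
        with open(path, "w") as file_out:
            for line in lines:
                file_out.write(line)
        return True  # explicit success value keeps mypy satisfied
    except OSError:
        return False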

13 changes: 8 additions & 5 deletions src/osc_extraction_utils/s3_communication.py
@@ -4,6 +4,7 @@
import pathlib
from enum import Enum
from io import BytesIO
from pathlib import Path

import boto3
import pandas as pd
@@ -24,7 +25,9 @@ class S3Communication(object):
It connects with the bucket and provides methods to read and write data in parquet, csv, and json formats.
"""

def __init__(self, s3_endpoint_url, aws_access_key_id, aws_secret_access_key, s3_bucket):
def __init__(
self, s3_endpoint_url: str, aws_access_key_id: str, aws_secret_access_key: str, s3_bucket: str
) -> None:
"""Initialize communicator."""
self.s3_endpoint_url = s3_endpoint_url
self.aws_access_key_id = aws_access_key_id
@@ -43,20 +46,20 @@ def _upload_bytes(self, buffer_bytes, prefix, key):
status = s3_object.put(Body=buffer_bytes)
return status

def _download_bytes(self, prefix, key):
def _download_bytes(self, prefix: str, key: str) -> bytes:
"""Download byte content in bucket/prefix/key to buffer."""
buffer = BytesIO()
s3_object = self.s3_resource.Object(self.bucket, osp.join(prefix, key))
s3_object.download_fileobj(buffer)
return buffer.getvalue()

def upload_file_to_s3(self, filepath, s3_prefix, s3_key):
def upload_file_to_s3(self, filepath: Path | str, s3_prefix: str, s3_key: str):
"""Read file from disk and upload to s3 bucket/prefix/key."""
with open(filepath, "rb") as f:
status = self._upload_bytes(f.read(), s3_prefix, s3_key)
return status

def download_file_from_s3(self, filepath, s3_prefix, s3_key):
def download_file_from_s3(self, filepath: Path, s3_prefix: str, s3_key: str):
"""Download file from s3 bucket/prefix/key and save it to filepath on disk."""
buffer_bytes = self._download_bytes(s3_prefix, s3_key)
with open(filepath, "wb") as f:
@@ -110,7 +113,7 @@ def upload_files_in_dir_to_prefix(self, source_dir, s3_prefix):
for fpath in upload_files_paths:
self.upload_file_to_s3(fpath, s3_prefix, fpath.name)

def download_files_in_prefix_to_dir(self, s3_prefix, destination_dir):
def download_files_in_prefix_to_dir(self, s3_prefix, destination_dir) -> None:
"""
Download all files under a prefix to a directory.
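With the added annotations, the download_files_in_prefix_to_dir call in merger.py lines up with a method that takes a prefix and a destination directory and now explicitly returns None. A hedged usage sketch follows; the bucket-name variable LANDING_AWS_BUCKET_NAME is assumed by analogy with INTERIM_AWS_BUCKET_NAME, and the prefix and target directory are placeholders:

import os

from osc_extraction_utils.s3_communication import S3Communication

s3 = S3Communication(
    s3_endpoint_url=os.environ["LANDING_AWS_ENDPOINT"],
    aws_access_key_id=os.environ["LANDING_AWS_ACCESS_KEY"],
    aws_secret_access_key=os.environ["LANDING_AWS_SECRET_KEY"],
    s3_bucket=os.environ["LANDING_AWS_BUCKET_NAME"],
)
# Pull every object under the prefix into a local directory.
s3.download_files_in_prefix_to_dir("corporate_data_extraction_projects/TEST", "/tmp/relevance")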
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -4,7 +4,7 @@
from typing import Generator

import pytest
from utils_test import project_tests_root
from utils_tests import project_tests_root

from osc_extraction_utils.paths import ProjectPaths
from osc_extraction_utils.settings import (