Finishing error-free lint setup
Signed-off-by: ToWaIDS <tobias.watzel@investmentdataservices.com>
tobias-watzel committed Feb 27, 2024
1 parent 833ee7f commit 46886de
Showing 18 changed files with 183 additions and 173 deletions.
92 changes: 52 additions & 40 deletions data/TEST/settings.yaml
@@ -1,62 +1,70 @@
# SETTINGS FILE FOR PROJECT TEST
---
general:
ext_ip: '172.30.15.68'
ext_ip: 172.30.15.68
ext_port: 4000
infer_ip: '172.30.88.213'
infer_ip: 172.30.88.213
infer_port: 6000
rb_ip: '172.30.224.91'
rb_ip: 172.30.224.91
rb_port: 8000
delete_interim_files: true
#All the parameters for exporting data
data_export:
enable_db_export: false
db_dialect: oracle
db_sql_driver: cx_oracle
db_host: ''
db_port: '1521'
db_user: ''
db_password: ''
db_post_command: ''
# Next follow the parameters for the NLP Machine Learning Model
# All the input parameters for pdf text extraction stage
db_host: ""
db_port: "1521"
db_user: ""
db_password: ""
db_post_command: ""
extraction:
min_paragraph_length: 20
seed: 42
annotation_folder:
annotation_folder: null
skip_extracted_files: true
use_extractions: true
store_extractions: true
# All the input parameters for curation stage
curation:
retrieve_paragraph: false
neg_pos_ratio: 1
columns_to_read: ["company", "source_file", "source_page", "kpi_id", "year", "answer", "data_type", "relevant_paragraphs"]
columns_to_read:
- company
- source_file
- source_page
- kpi_id
- year
- answer
- data_type
- relevant_paragraphs
company_to_exclude: []
create_neg_samples: true
min_length_neg_sample: 50
seed: 41
# All the input parameters for the relevance training stage
train_relevance:
base_model: roberta-base
input_model_name:
input_model_name: null
output_model_name: TEST_1
train: true
seed: 42
processor: # farm.processor TextPairClassificationProcessor input
processor:
proc_max_seq_len: 512
proc_dev_split: 0.2
proc_label_list: ['0', '1']
proc_label_list:
- "0"
- "1"
proc_label_column_name: label
proc_delimiter: ","
proc_metric: acc
model: # farm.model TextClassificationHead input
model_layer_dims: [768, 2]
model_lm_output_types: ["per_sequence"]
training: # multiple farm input parameter for training
model:
model_layer_dims:
- 768
- 2
model_lm_output_types:
- per_sequence
training:
run_hyp_tuning: false
use_amp: true
distributed: false
learning_rate: 1.0e-05
learning_rate: 0.00001
n_epochs: 10
evaluate_every: 100
dropout: 0.2
@@ -65,19 +73,20 @@ train_relevance:
run_cv: false
xval_folds: 5
max_processes: 128
# All the input parameters for the application of inferance on relevance data in the training stage
infer_relevance:
skip_processed_files: true
batch_size: 16
gpu: true
num_processes:
num_processes: null
disable_tqdm: true
kpi_questions: []
sectors: ["OG", "CM", "CU"]
sectors:
- OG
- CM
- CU
return_class_probs: false
# All the input parameters for the kpi training stage
train_kpi:
input_model_name:
input_model_name: null
output_model_name: TEST_1
base_model: a-ware/roberta-large-squadv2
train: true
@@ -89,22 +98,27 @@ train_kpi:
create_unanswerable: true
data:
perform_splitting: true
dev_split: .2
dev_split: 0.2
mlflow:
track_experiment: false
url: http://localhost:5000
processor:
max_seq_len: 384
label_list: ["start_token", "end_token"]
label_list:
- start_token
- end_token
metric: squad
model:
model_layer_dims: [768, 2]
model_lm_output_types: ["per_token"]
model_layer_dims:
- 768
- 2
model_lm_output_types:
- per_token
training:
run_hyp_tuning: false
use_amp: true
distributed: false
learning_rate: 1.0e-05
learning_rate: 0.00001
n_epochs: 10
evaluate_every: 100
dropout: 0.3
@@ -113,16 +127,14 @@ train_kpi:
run_cv: false
xval_folds: 5
metric: f1
max_processes: 1 #processes used for splitting up the data. Leads in the moment to issues when not 1
# All the input parameters for the application of kpi inferance
max_processes: 1
infer_kpi:
skip_processed_files: false # If set to True, will skip inferring on already processed files
skip_processed_files: false
top_k: 4
batch_size: 16
gpu: true
num_processes: # Set to value 1 (or 0) to disable multiprocessing. Set to None to let Inferencer use all CPU cores minus one.
no_ans_boost: -15 # If increased, this will boost "No Answer" as prediction. Use large negative values (like -100) to disable giving "No answer" option.
#Rule-based settings
num_processes: null
no_ans_boost: -15
rule_based:
verbosity: 2
use_docker: true
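The relinted settings should parse to the same Python structures as before: block-style sequences are just another spelling of the old flow-style lists, an explicit null loads as None exactly like the former empty scalar, and 0.00001 is the same float as 1.0e-05. A minimal sketch of checking this, assuming PyYAML is installed and the script runs from the repository root (the assertions are illustrative, not part of the repository):

import yaml

with open("data/TEST/settings.yaml") as f:
    settings = yaml.safe_load(f)

# Block-style sequences load as plain lists, same as the old flow style.
assert settings["curation"]["columns_to_read"][0] == "company"
# An explicit `null` loads as None, the value the old empty scalar also produced.
assert settings["extraction"]["annotation_folder"] is None
# 0.00001 and 1.0e-05 are the same float, so the learning rate is unchanged.
assert settings["train_relevance"]["training"]["learning_rate"] == 1e-05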
3 changes: 1 addition & 2 deletions data/s3_settings.yaml
@@ -1,4 +1,4 @@
# global variables in the docker/pod for the s3 connection where the input and output will be stored
---
main_bucket:
s3_endpoint: LANDING_AWS_ENDPOINT
s3_access_key: LANDING_AWS_ACCESS_KEY
@@ -9,5 +9,4 @@ interim_bucket:
s3_access_key: INTERIM_AWS_ACCESS_KEY
s3_secret_key: INTERIM_AWS_SECRET_KEY
s3_bucket_name: INTERIM_AWS_BUCKET_NAME
# variables necessary to find the files in s3 bucket
prefix: corporate_data_extraction_projects
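The values stored under each bucket are names of environment variables, not literal credentials, so whatever consumes this file has to resolve them at runtime. A rough sketch of that resolution, assuming PyYAML; the helper below is hypothetical and not code from this repository:

import os

import yaml

def resolve_bucket(settings_path: str, bucket: str) -> dict[str, str]:
    """Map the placeholder names in s3_settings.yaml to their environment values."""
    with open(settings_path) as f:
        s3_settings = yaml.safe_load(f)
    # Each entry such as s3_access_key: LANDING_AWS_ACCESS_KEY names an env var to look up.
    return {key: os.environ.get(value, "") for key, value in s3_settings[bucket].items()}

main_credentials = resolve_bucket("data/s3_settings.yaml", "main_bucket")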
16 changes: 7 additions & 9 deletions src/osc_extraction_utils/merger.py
@@ -8,6 +8,7 @@


class Merger:
# TODO finish Merger class
def __init__(self, main_settings: MainSettings, s3_settings: S3Settings, project_paths: ProjectPaths) -> None:
self.main_settings: MainSettings = main_settings
self.s3_settings: S3Settings = s3_settings
@@ -45,9 +46,8 @@ def _download_inference_related_files_from_s3(self) -> None:
/ "RELEVANCE"
/ "Text"
)
self.s3_communication_main.download_files_in_prefix_to_dir(
str(path_file_related_s3), str(self.project_paths.path_folder_relevance)
)
# TODO wrong type
self.s3_communication_main.download_files_in_prefix_to_dir(str(path_file_related_s3), str(self.project_paths.path_folder_relevance)) # type: ignore

def _upload_inference_related_files_to_s3(self) -> None:
path_file_upload_to_s3: Path = (
@@ -58,13 +58,10 @@ def _upload_inference_related_files_to_s3(self) -> None:
/ "ml"
/ "text_3434.csv"
)
self.s3_communication_interim.upload_file_to_s3(
filepath=str(path_file_upload_to_s3),
s3_prefix=str(path_file_upload_to_s3.parent),
s3_key=str(path_file_upload_to_s3.name),
)
# TODO wrong type
self.s3_communication_interim.upload_file_to_s3(filepath=str(path_file_upload_to_s3), s3_prefix=str(path_file_upload_to_s3.parent), s3_key=str(path_file_upload_to_s3.name)) # type: ignore

def _weird_writing_stuff(self) -> None:
def _weird_writing_stuff(self) -> bool:
with open(str(self.project_paths.path_folder_text_3434) + r"/text_3434.csv", "w") as file_out:
very_first = True
rel_inf_list = list(glob.iglob(str(self.project_paths.path_folder_relevance) + r"/*.csv"))
@@ -82,6 +79,7 @@ def _weird_writing_stuff(self) -> None:
file_out.write(line)
first = False
very_first = False
return True # TODO added here to conform to mypy, is this required?
except Exception:
return False
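The trailing return True answers that TODO in part: once the signature is changed to -> bool, mypy flags any path that can fall off the end of the function and implicitly return None, so every branch has to return a bool explicitly. A standalone illustration of the pattern, not the Merger code itself:

def write_all(lines: list[str], path: str) -> bool:
    """Return True on success, False if writing fails."""
    try:
        with open(path, "w") as file_out:
            for line in lines:
                file_out.write(line)
        return True  # explicit success value keeps mypy satisfied
    except OSError:
        return False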

13 changes: 8 additions & 5 deletions src/osc_extraction_utils/s3_communication.py
@@ -4,6 +4,7 @@
import pathlib
from enum import Enum
from io import BytesIO
from pathlib import Path

import boto3
import pandas as pd
@@ -24,7 +25,9 @@ class S3Communication(object):
It connects with the bucket and provides methods to read and write data in parquet, csv, and json formats.
"""

def __init__(self, s3_endpoint_url, aws_access_key_id, aws_secret_access_key, s3_bucket):
def __init__(
self, s3_endpoint_url: str, aws_access_key_id: str, aws_secret_access_key: str, s3_bucket: str
) -> None:
"""Initialize communicator."""
self.s3_endpoint_url = s3_endpoint_url
self.aws_access_key_id = aws_access_key_id
@@ -43,20 +46,20 @@ def _upload_bytes(self, buffer_bytes, prefix, key):
status = s3_object.put(Body=buffer_bytes)
return status

def _download_bytes(self, prefix, key):
def _download_bytes(self, prefix: str, key: str) -> bytes:
"""Download byte content in bucket/prefix/key to buffer."""
buffer = BytesIO()
s3_object = self.s3_resource.Object(self.bucket, osp.join(prefix, key))
s3_object.download_fileobj(buffer)
return buffer.getvalue()

def upload_file_to_s3(self, filepath, s3_prefix, s3_key):
def upload_file_to_s3(self, filepath: Path | str, s3_prefix: str, s3_key: str):
"""Read file from disk and upload to s3 bucket/prefix/key."""
with open(filepath, "rb") as f:
status = self._upload_bytes(f.read(), s3_prefix, s3_key)
return status

def download_file_from_s3(self, filepath, s3_prefix, s3_key):
def download_file_from_s3(self, filepath: Path, s3_prefix: str, s3_key: str):
"""Download file from s3 bucket/prefix/key and save it to filepath on disk."""
buffer_bytes = self._download_bytes(s3_prefix, s3_key)
with open(filepath, "wb") as f:
@@ -110,7 +113,7 @@ def upload_files_in_dir_to_prefix(self, source_dir, s3_prefix):
for fpath in upload_files_paths:
self.upload_file_to_s3(fpath, s3_prefix, fpath.name)

def download_files_in_prefix_to_dir(self, s3_prefix, destination_dir):
def download_files_in_prefix_to_dir(self, s3_prefix, destination_dir) -> None:
"""
Download all files under a prefix to a directory.
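With the added annotations, the download_files_in_prefix_to_dir call in merger.py lines up with a method that takes a prefix and a destination directory and now explicitly returns None. A hedged usage sketch follows; the bucket-name variable LANDING_AWS_BUCKET_NAME is assumed by analogy with INTERIM_AWS_BUCKET_NAME, and the prefix and target directory are placeholders:

import os

from osc_extraction_utils.s3_communication import S3Communication

s3 = S3Communication(
    s3_endpoint_url=os.environ["LANDING_AWS_ENDPOINT"],
    aws_access_key_id=os.environ["LANDING_AWS_ACCESS_KEY"],
    aws_secret_access_key=os.environ["LANDING_AWS_SECRET_KEY"],
    s3_bucket=os.environ["LANDING_AWS_BUCKET_NAME"],
)
# Pull every object under the prefix into a local directory.
s3.download_files_in_prefix_to_dir("corporate_data_extraction_projects/TEST", "/tmp/relevance")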
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -4,7 +4,7 @@
from typing import Generator

import pytest
from utils_test import project_tests_root
from utils_tests import project_tests_root

from osc_extraction_utils.paths import ProjectPaths
from osc_extraction_utils.settings import (