Skip to content

Commit

Permalink
Automatic code formatting using black
Browse files Browse the repository at this point in the history
  • Loading branch information
aecio committed May 24, 2024
1 parent 322a5ac commit 75c19cf
Show file tree
Hide file tree
Showing 18 changed files with 375 additions and 270 deletions.
4 changes: 2 additions & 2 deletions bdikit/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Package version string.
__version__ = "0.2.0.dev0"
# To shortcut the import path
from bdikit.api import APIManager
from bdikit.api import APIManager
66 changes: 44 additions & 22 deletions bdikit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,24 @@
from bdikit.mapping_recommendation.scope_reducing_manager import ScopeReducingManager
from bdikit.mapping_recommendation.value_mapping_manager import ValueMappingManager
from bdikit.mapping_recommendation.column_mapping_manager import ColumnMappingManager
from bdikit.visualization.mappings import plot_reduce_scope, plot_column_mappings, plot_value_mappings
from bdikit.visualization.mappings import (
plot_reduce_scope,
plot_column_mappings,
plot_value_mappings,
)
from bdikit.utils import get_gdc_data
from os.path import join, dirname
import os

# Disable huggingface messages.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Path to the GDC table CSV, resolved relative to the directory of this module.
GDC_DATA_PATH = join(dirname(__file__), "./resource/gdc_table.csv")


class APIManager():

def __init__(self,):
class APIManager:
def __init__(
self,
):
# TODO: move into database object (in data_ingestion folder)
self.dataset = None
# TODO: move into database object (in data_ingestion folder)
Expand All @@ -23,8 +28,8 @@ def __init__(self,):
self.reduced_scope = None
self.column_manager = None
self.value_manager = None
self.column_mappings = None # TODO move this to a property in column_manager
self.value_mappings = None # TODO move this to a property in value_manager
self.column_mappings = None # TODO move this to a property in column_manager
self.value_mappings = None # TODO move this to a property in value_manager

def load_global_table(self, global_table_path=None):
if global_table_path is None:
Expand All @@ -47,41 +52,58 @@ def reduce_scope(self, num_columns=5, num_candidates=5):

return self.reduced_scope

def map_columns(self, algorithm="SimFloodAlgorithm"):
    """Compute, store, plot and return the column mappings.

    A ``ColumnMappingManager`` is built from ``self.dataset``,
    ``self.global_table`` and ``algorithm``, and is handed the current
    ``self.reduced_scope`` before its ``map()`` method is run.

    Args:
        algorithm: Algorithm identifier forwarded unchanged to
            ``ColumnMappingManager``.

    Returns:
        The result of ``ColumnMappingManager.map()``, which is also kept
        in ``self.column_mappings``.
    """
    self.column_manager = ColumnMappingManager(
        self.dataset, self.global_table, algorithm
    )
    # The manager reads the reduced scope from an attribute, not from its
    # constructor, so it has to be assigned before calling map().
    self.column_manager.reduced_scope = self.reduced_scope
    self.column_mappings = self.column_manager.map()
    plot_column_mappings(self.column_mappings)

    return self.column_mappings

def map_values(self, algorithm="EditAlgorithm"):
    """Compute, store, plot and return the value mappings.

    The values of ``self.column_mappings`` are passed to ``get_gdc_data``
    and the result is kept in ``self.global_table_all``. A
    ``ValueMappingManager`` is then built from the dataset, the column
    mappings, that table and ``algorithm``, and its ``map()`` is run.

    Args:
        algorithm: Algorithm identifier forwarded unchanged to
            ``ValueMappingManager``.

    Returns:
        The result of ``ValueMappingManager.map()``, which is also kept in
        ``self.value_mappings``.

    Raises:
        ValueError: If ``self.column_mappings`` is still ``None``.
    """
    if self.column_mappings is None:
        # Fail with an explicit message instead of an AttributeError on
        # ``None.values()`` below.
        raise ValueError(
            "Column mappings are not available; call map_columns() first."
        )

    self.global_table_all = get_gdc_data(self.column_mappings.values())
    self.value_manager = ValueMappingManager(
        self.dataset, self.column_mappings, self.global_table_all, algorithm
    )
    self.value_mappings = self.value_manager.map()
    plot_value_mappings(self.value_mappings)

    return self.value_mappings

def update_reduced_scope(
    self, original_column, new_candidate_name, new_candidate_sim=1.0
):
    """Append a candidate to the reduced-scope entry of ``original_column``.

    Only the first entry whose ``"Candidate column"`` equals
    ``original_column`` is updated; the reduced scope is then re-plotted.
    If no entry matches, nothing is changed, printed or plotted.

    Args:
        original_column: Value compared against each entry's
            ``"Candidate column"``.
        new_candidate_name: Name stored as the first element of the
            appended tuple.
        new_candidate_sim: Similarity stored as the second element of the
            appended tuple (default ``1.0``).
    """
    # NOTE(review): assumes self.reduced_scope is a list of dicts, so that
    # iterating it visits the same items as indexing 0..len-1 - confirm.
    for entry in self.reduced_scope:
        if entry["Candidate column"] == original_column:
            entry["Top k columns"].append((new_candidate_name, new_candidate_sim))
            print("Reduced scope updated!")
            plot_reduce_scope(self.reduced_scope)
            break

def update_column_mappings(self, new_mappings):
    """Overwrite entries of ``self.column_mappings`` and re-plot them.

    Args:
        new_mappings: Iterable of ``(original_column, new_target_column)``
            pairs; each pair is written as
            ``self.column_mappings[original_column] = new_target_column``.
    """
    for original_column, new_target_column in new_mappings:
        self.column_mappings[original_column] = new_target_column

    print("Column mapping updated!")
    plot_column_mappings(self.column_mappings)

def update_value_mappings(
    self, original_column, original_value, new_target_value, new_similarity=1.0
):
    """Replace the stored match for one source value of a column.

    The ``"matches"`` sequence of ``self.value_mappings[original_column]``
    is scanned; the first item whose element 0 equals ``original_value`` is
    replaced by ``(original_value, new_target_value, new_similarity)`` and
    the value mappings are re-plotted. If no item matches, nothing is
    changed, printed or plotted.

    Args:
        original_column: Key into ``self.value_mappings``.
        original_value: Source value identifying the match to replace.
        new_target_value: Target value stored in the replacement tuple.
        new_similarity: Similarity stored in the replacement tuple
            (default ``1.0``).
    """
    # Look the sequence up once; in-place assignment below still mutates the
    # object held by self.value_mappings.
    matches = self.value_mappings[original_column]["matches"]
    for index, match in enumerate(matches):
        if match[0] == original_value:
            matches[index] = (original_value, new_target_value, new_similarity)
            print("Value mapping updated!")
            plot_value_mappings(self.value_mappings)
            break
34 changes: 18 additions & 16 deletions bdikit/data_ingestion/column.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,43 @@
from enum import Enum


class ColumnType(Enum):
    """Enumeration of the column data types: string, float and integer."""

    STRING = "string"
    FLOAT = "float"
    INTEGER = "integer"
    # TODO semantic types?


class Column:
    """Description of one column of a named dataframe.

    Equality and hashing use only ``(df_name, column_name)``; the type,
    domain values and null representations are ignored by both.
    """

    def __init__(
        self,
        df_name,
        column_name,
        column_type=ColumnType.STRING,
        domain_values=None,
        null_values_representations=None,
    ):
        """Store the column description.

        Args:
            df_name: Name of the dataframe the column belongs to.
            column_name: Name of the column.
            column_type: A ``ColumnType`` member (default ``STRING``).
            domain_values: Optional iterable copied into a set; ``None``
                gives an empty set.
            null_values_representations: Optional iterable copied into a
                set; ``None`` gives an empty set.
        """
        self.df_name = df_name
        self.column_name = column_name
        self.column_type = column_type

        if domain_values is None:
            self.domain_values = set()
        else:
            self.domain_values = set(domain_values)

        if null_values_representations is None:
            self.null_values_representations = set()
        else:
            self.null_values_representations = set(null_values_representations)

    def __str__(self):
        """Return a one-line description listing every attribute."""
        return (
            f"Column(df_name={self.df_name}, column_name={self.column_name}, "
            f"column_type={self.column_type}, domain_values={self.domain_values}, "
            f"null_values_representations={self.null_values_representations})"
        )

    def __eq__(self, value):
        """Return True only for a ``Column`` with the same dataframe and column name."""
        if not isinstance(value, Column):
            return False
        return self.df_name == value.df_name and self.column_name == value.column_name

    def __hash__(self):
        """Hash on the same ``(df_name, column_name)`` pair that ``__eq__`` compares."""
        return hash((self.df_name, self.column_name))





12 changes: 6 additions & 6 deletions bdikit/data_ingestion/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from .column import Column, ColumnType


class Database:
"""
A class representing a database that stores dataframes.
Expand All @@ -14,7 +15,7 @@ class Database:
load_data(df_name, file_path): Load data from a CSV file into a dataframe and store it in the database.
load_data_from_folder(folder_path): Load data from all CSV files in a folder.
get_dataframe(df_name): Retrieve a dataframe by its name.
get_dataframe_names(): Get the names of all dataframes stored in the database.
get_dataframe_names(): Get the names of all dataframes stored in the database.
describe_database(): Print out the names, shape, columns, and head of all dataframes stored in the database.
"""

Expand All @@ -32,7 +33,8 @@ def load_data(self, df_name, file_path):
"""
if df_name in self.dataframes:
raise ValueError(
f"Dataframe associated with file name '{df_name}' already exists in the database.")
f"Dataframe associated with file name '{df_name}' already exists in the database."
)

df = pd.read_csv(file_path)
self.dataframes[df_name] = df
Expand All @@ -42,7 +44,6 @@ def load_data(self, df_name, file_path):
column = Column(df_name, c, ColumnType.STRING)
self.columns.add(column)


def load_data_from_folder(self, folder_path):
"""
Function to load data from all CSV files in a folder using the Database class.
Expand Down Expand Up @@ -76,7 +77,7 @@ def get_dataframe_names(self):
list: A list of dataframe names.
"""
return list(self.dataframes.keys())

def get_columns(self):
"""
Get the names of all columns stored in the database.
Expand All @@ -99,7 +100,6 @@ def describe_database(self):
# print(f"\t\t- Head: \n{self.dataframes[df_name].head()}")



# def main():
# col1 = Column('df1', 'col1', ColumnType.STRING, ['a', 'b', 'c'], ['n/a', 'na'])
# col2 = Column('df1', 'col2', ColumnType.INTEGER, [1, 2, 3], ['n/a', 'na'])
Expand All @@ -112,4 +112,4 @@ def describe_database(self):
# print(col3 == col4)

# if __name__ == "__main__":
# main()
# main()
2 changes: 1 addition & 1 deletion bdikit/data_ingestion/dataset_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
def load_dataframe(dataset_path):
    """Read the CSV file at ``dataset_path`` with ``pd.read_csv`` and return the result.

    Args:
        dataset_path: Path (or any other input accepted by ``pd.read_csv``)
            of the CSV file to load.

    Returns:
        The object returned by ``pd.read_csv``.
    """
    # NOTE(review): ``pd`` is imported outside the visible hunk; presumably
    # ``import pandas as pd`` - confirm.
    dataset = pd.read_csv(dataset_path)

    return dataset
2 changes: 1 addition & 1 deletion bdikit/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def get_cached_model_or_download(model_name: str):
if len(sys.argv) < 2:
print("Please provide a model_id as a command line argument.")
sys.exit(1)

model_id = sys.argv[1]
model_path = get_cached_model_or_download(model_id)
print(f"Downloaded model: {model_path}")
Loading

0 comments on commit 75c19cf

Please sign in to comment.