Skip to content

Commit

Permalink
Merge branch 'main' into 171-world-bank-projects-database
Browse files (browse the repository at this point in the history)
  • Loading branch information
lpicci96 committed Jul 19, 2023
2 parents 06530c3 + 55fa530 commit 06a5057
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 7 deletions.
26 changes: 20 additions & 6 deletions bblocks/cleaning_tools/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from numpy import nan


def clean_number(number: str, to: Type = float) -> float | int:
def clean_number(number: str | pd.Series, to: Type = float) -> float | int:
"""Clean a string and return as float or integer.
When selecting to=int, the default python round behaviour is used.
Expand All @@ -17,9 +17,21 @@ def clean_number(number: str, to: Type = float) -> float | int:
"""

if not isinstance(number, str):
# If series, vectorize for speed
if pd.api.types.is_number(number):
number = str(number)

if isinstance(number, pd.Series):
number = number.str.replace(r"[^\d.]", "", regex=True)
number = pd.to_numeric(number, errors="coerce")

if to == float:
return number.astype(float)

if to == int:
return number.round().astype("Int64")

# If string, clean
number = re.sub(r"[^\d.]", "", number)

if number == "":
Expand Down Expand Up @@ -50,20 +62,22 @@ def clean_numeric_series(
"""

data = data.copy(deep=True)

if isinstance(data, pd.DataFrame) and (series_columns is None):
raise ValueError("series_column must be specified when _data is a DataFrame")

if isinstance(data, pd.DataFrame):
if isinstance(series_columns, str):
series_columns = [series_columns]

data[series_columns] = data[series_columns].apply(
lambda s: s.apply(clean_number, to=to), axis=1
)
for col in series_columns:
data[col] = clean_number(data[col], to=to)

return data

if isinstance(data, pd.Series):
return data.apply(clean_number, to=to)
return clean_number(data, to=to)


def to_date_column(series: pd.Series, date_format: str | None = None) -> pd.Series:
Expand Down
2 changes: 1 addition & 1 deletion bblocks/import_tools/sdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def clean_df(df: pd.DataFrame, date: str) -> pd.DataFrame:

return (
df.melt(id_vars="entity", value_vars=["holdings", "allocations"])
.pipe(clean_numeric_series, series_columns="value")
.pipe(clean_numeric_series, series_columns="value", to=float)
.rename(columns={"variable": "indicator"})
.reset_index(drop=True)
.assign(date=date)
Expand Down

0 comments on commit 06a5057

Please sign in to comment.