Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve ml solution #11

Merged
merged 2 commits into from
Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Changelog

## v0.1.4.9005
## v0.1.4.9006

- New `cat2cat_ml_run` function to check the performance of the ml models before `cat2cat` is run with the ml option. This makes the ml models more transparent.
- Added the previously missing support for NaN and None in `get_mappings`.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "cat2cat"
version = "0.1.4.9004"
version = "0.1.4.9006"
description = "Unifying an inconsistently coded categorical variable in a panel/longitudinal dataset."
authors = ["Maciej Nasinski"]
license = "MIT"
Expand Down
54 changes: 36 additions & 18 deletions src/cat2cat/cat2cat_ml.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from pandas import DataFrame, concat
from numpy import repeat, setdiff1d, in1d, sum, NaN, nanmean, isnan, round
from numpy.ma import masked_invalid

from sklearn.model_selection import train_test_split

Expand All @@ -22,12 +23,12 @@ class cat2cat_ml_run_results:
kwargs (Dict): additional arguments passed to the `cat2cat_ml_run` function.
Returns:
cat2cat_ml_run_results class instance with the following attributes:
res (Dict): raw results from the cat2cat_ml_run function call
mean_acc (Dict): mean accuracy for each model
percent_failed (Dict): percent of failed models for each model
percent_better (Dict): percent of better models over most frequent category solution for each model
mappings (cat2cat_mappings): initial mappings dataclass with mappings related arguments.
ml (cat2cat_ml): initial ml dataclass with ml related arguments.
res (Dict): raw results from the cat2cat_ml_run function call
mean_acc (Dict): mean accuracy for each model
percent_failed (Dict): percent of failed models for each model
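percent_better_most (Dict): percent of models better than the most frequent category solution, for each model
percent_better_naive (Dict): percent of models better than the naive solution, for each model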
mappings (cat2cat_mappings): initial mappings dataclass with mappings related arguments.
ml (cat2cat_ml): initial ml dataclass with ml related arguments.
Methods:
get_raw: get raw results
"""
Expand All @@ -43,7 +44,8 @@ def __init__(

mean_acc = dict()
percent_failed = dict()
percent_better = dict()
percent_better_most = dict()
percent_better_naive = dict()

mean_acc["naive"] = round(
nanmean(
Expand All @@ -61,13 +63,17 @@ def __init__(
vals = [self.res.get(g, {}).get(m, NaN) for g in self.res.keys()]
mean_acc[m] = round(nanmean(vals), 3)
percent_failed[m] = round(sum(isnan(vals)) / len(vals) * 100, 3)
percent_better[m] = round(
sum(vals > mean_acc["most_freq"]) / len(vals) * 100, 3
percent_better_most[m] = round(
nanmean(masked_invalid(vals) > mean_acc["most_freq"]) * 100, 3
)
percent_better_naive[m] = round(
nanmean(masked_invalid(vals) > mean_acc["naive"]) * 100, 3
)

self.mean_acc = mean_acc
self.percent_failed = percent_failed
self.percent_better = percent_better
self.percent_better_most = percent_better_most
self.percent_better_naive = percent_better_naive

def get_raw(self) -> Dict:
"""Get raw results"""
Expand All @@ -81,16 +87,21 @@ def __repr__(self) -> str:
for k, v in self.percent_failed.items():
res += "Percent of failed {}: {}".format(k, v) + "\n"
res += "\n"
for k, v in self.percent_better.items():
for k, v in self.percent_better_most.items():
res += (
"Percent of better {} over most frequent category solution: {}".format(
k, v
)
+ "\n"
)
for k, v in self.percent_better_naive.items():
res += "Percent of better {} over naive solution: {}".format(k, v) + "\n"
res += "\n"
res += "Features: {}".format(self.ml.features) + "\n"
res += "Test sample size: {}".format(self.kwargs.get("test_size", 0.2)) + "\n"
res += (
"Test sample size: {}".format(self.kwargs.get("test_prop", 0.2) * 100)
+ "\n"
)
return res


Expand All @@ -106,7 +117,7 @@ def cat2cat_ml_run(
Please check out the `cat2cat.dataclass.cat2cat_ml` for more information.
**kwargs: additional arguments passed to the `cat2cat_ml_run` function.
min_match (float): minimum share of categories from the base period that have to be matched in the mapping table. Between 0 and 1. Default 0.8.
test_size (float): share of the data used for testing. Between 0 and 1. Default 0.2.
test_prop (float): share of the data used for testing. Between 0 and 1. Default 0.2.
split_seed (int): random seed for the train_test_split function. Default 42.

Returns:
Expand All @@ -126,14 +137,15 @@ def cat2cat_ml_run(
>>> occup = load_occup()
>>> o_old = occup.loc[occup.year == 2008, :].copy()
>>> o_new = occup.loc[occup.year == 2010, :].copy()
>>> mappings = cat2cat_mappings(trans = trans, direction = "forward")
>>> mappings = cat2cat_mappings(trans = trans, direction = "backward")
>>> ml = cat2cat_ml(
... occup.loc[occup.year <= 2008, :].copy(),
... occup.loc[occup.year >= 2010, :].copy(),
... "code",
... ["salary", "age", "edu", "sex"],
... [DecisionTreeClassifier(random_state=1234), LinearDiscriminantAnalysis()]
... )
>>> cat2cat_ml_run(mappings = mappings, ml = ml)
...

"""
assert isinstance(
Expand All @@ -142,8 +154,8 @@ def cat2cat_ml_run(
assert isinstance(ml, cat2cat_ml), "ml arg has to be cat2cat_ml instance"
assert isinstance(kwargs, dict), "kwargs arg has to be a dict"
assert set(kwargs.keys()).issubset(
["min_match", "test_size", "split_seed"]
), "possible kwargs are min_match, split_seed and test_size"
["min_match", "test_prop", "split_seed"]
), "possible kwargs are min_match, split_seed and test_prop"

mapps = get_mappings(mappings.trans)

Expand Down Expand Up @@ -176,7 +188,7 @@ def cat2cat_ml_run(
try:
matched_cat = mapp.get(cat, [])
res[cat] = {
"naive": 1 / len(matched_cat),
"naive": NaN,
"freq": NaN,
}
for m in models_names:
Expand All @@ -187,6 +199,7 @@ def cat2cat_ml_run(
if g not in train_g.keys():
continue
data_small_g_list.append(train_g.get(g))

if len(data_small_g_list) == 0:
continue

Expand All @@ -199,6 +212,8 @@ def cat2cat_ml_run(
):
continue

res[cat]["naive"] = 1 / len(matched_cat)

X_train, X_test, y_train, y_test = train_test_split(
data_small_g[features],
data_small_g[ml.cat_var],
Expand Down Expand Up @@ -239,6 +254,9 @@ def _cat2cat_ml(
data_ml_train = ml.data.loc[ml_cat_index, :]
data_ml_target = target_df.loc[target_cat_index, :]

if (data_ml_target.shape[0] == 0) or (data_ml_train.shape[0] < 5):
continue

target_cats = data_ml_target["g_new_c2c"]
data_ml_target_uniq = data_ml_target.drop_duplicates(
subset=["index_c2c"] + list(ml.features)
Expand Down
19 changes: 10 additions & 9 deletions tests/test_cat2cat_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from cat2cat.dataclass import cat2cat_data, cat2cat_mappings, cat2cat_ml
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from cat2cat.datasets import load_trans, load_occup
from numpy.random import seed
from numpy import nan
Expand All @@ -18,24 +19,24 @@ def test_cat2cat_ml_run_repr():
ml = cat2cat_ml(
occup.loc[occup.year >= 2010, :].copy(),
"code",
["salary", "age", "edu", "sex"],
["salary", "age", "edu", "sex", "parttime"],
[
DecisionTreeClassifier(random_state=1234),
LinearDiscriminantAnalysis(),
],
)
expected = repr(cat2cat_ml_run(mappings=mappings, ml=ml))
actual = "Average Accuracy naive: 0.302\nAverage Accuracy most_freq: 0.54\nAverage Accuracy DecisionTreeClassifier: 0.487\nAverage Accuracy LinearDiscriminantAnalysis: 0.553\n\nPercent of failed DecisionTreeClassifier: 35.369\nPercent of failed LinearDiscriminantAnalysis: 35.369\n\nPercent of better DecisionTreeClassifier over most frequent category solution: 27.99\nPercent of better LinearDiscriminantAnalysis over most frequent category solution: 32.57\n\nFeatures: ['salary', 'age', 'edu', 'sex']\nTest sample size: 0.2\n"
actual = "Average Accuracy naive: 0.18\nAverage Accuracy most_freq: 0.54\nAverage Accuracy DecisionTreeClassifier: 0.488\nAverage Accuracy LinearDiscriminantAnalysis: 0.542\n\nPercent of failed DecisionTreeClassifier: 35.369\nPercent of failed LinearDiscriminantAnalysis: 35.369\n\nPercent of better DecisionTreeClassifier over most frequent category solution: 42.52\nPercent of better LinearDiscriminantAnalysis over most frequent category solution: 49.606\nPercent of better DecisionTreeClassifier over naive solution: 88.976\nPercent of better LinearDiscriminantAnalysis over naive solution: 91.732\n\nFeatures: ['salary', 'age', 'edu', 'sex', 'parttime']\nTest sample size: 20.0\n"
assert actual == expected

expected = repr(cat2cat_ml_run(mappings=mappings, ml=ml, test_size=0.3))
actual = "Average Accuracy naive: 0.302\nAverage Accuracy most_freq: 0.55\nAverage Accuracy DecisionTreeClassifier: 0.477\nAverage Accuracy LinearDiscriminantAnalysis: 0.54\n\nPercent of failed DecisionTreeClassifier: 35.369\nPercent of failed LinearDiscriminantAnalysis: 35.369\n\nPercent of better DecisionTreeClassifier over most frequent category solution: 25.954\nPercent of better LinearDiscriminantAnalysis over most frequent category solution: 30.025\n\nFeatures: ['salary', 'age', 'edu', 'sex']\nTest sample size: 0.3\n"
expected = repr(cat2cat_ml_run(mappings=mappings, ml=ml, test_prop=0.3))
actual = "Average Accuracy naive: 0.18\nAverage Accuracy most_freq: 0.54\nAverage Accuracy DecisionTreeClassifier: 0.488\nAverage Accuracy LinearDiscriminantAnalysis: 0.542\n\nPercent of failed DecisionTreeClassifier: 35.369\nPercent of failed LinearDiscriminantAnalysis: 35.369\n\nPercent of better DecisionTreeClassifier over most frequent category solution: 42.52\nPercent of better LinearDiscriminantAnalysis over most frequent category solution: 49.606\nPercent of better DecisionTreeClassifier over naive solution: 88.976\nPercent of better LinearDiscriminantAnalysis over naive solution: 91.732\n\nFeatures: ['salary', 'age', 'edu', 'sex', 'parttime']\nTest sample size: 30.0\n"
assert actual == expected

expected = repr(
cat2cat_ml_run(mappings=mappings, ml=ml, test_size=0.9, split_seed=1234)
cat2cat_ml_run(mappings=mappings, ml=ml, test_prop=0.9, split_seed=1234)
)
actual = "Average Accuracy naive: 0.302\nAverage Accuracy most_freq: 0.49\nAverage Accuracy DecisionTreeClassifier: 0.47\nAverage Accuracy LinearDiscriminantAnalysis: 0.491\n\nPercent of failed DecisionTreeClassifier: 60.814\nPercent of failed LinearDiscriminantAnalysis: 60.814\n\nPercent of better DecisionTreeClassifier over most frequent category solution: 16.794\nPercent of better LinearDiscriminantAnalysis over most frequent category solution: 18.83\n\nFeatures: ['salary', 'age', 'edu', 'sex']\nTest sample size: 0.9\n"
actual = "Average Accuracy naive: 0.18\nAverage Accuracy most_freq: 0.53\nAverage Accuracy DecisionTreeClassifier: 0.48\nAverage Accuracy LinearDiscriminantAnalysis: 0.551\n\nPercent of failed DecisionTreeClassifier: 35.369\nPercent of failed LinearDiscriminantAnalysis: 35.369\n\nPercent of better DecisionTreeClassifier over most frequent category solution: 43.701\nPercent of better LinearDiscriminantAnalysis over most frequent category solution: 53.15\nPercent of better DecisionTreeClassifier over naive solution: 86.614\nPercent of better LinearDiscriminantAnalysis over naive solution: 91.339\n\nFeatures: ['salary', 'age', 'edu', 'sex', 'parttime']\nTest sample size: 90.0\n"
assert actual == expected

mappings = cat2cat_mappings(trans=trans, direction="forward")
Expand All @@ -45,8 +46,8 @@ def test_cat2cat_ml_run_repr():
["salary", "age", "edu", "sex"],
[DecisionTreeClassifier(random_state=1234), LinearDiscriminantAnalysis()],
)
expected = repr(cat2cat_ml_run(mappings=mappings, ml=ml, test_size=0.3))
actual = "Average Accuracy naive: 0.987\nAverage Accuracy most_freq: 0.69\nAverage Accuracy DecisionTreeClassifier: 0.647\nAverage Accuracy LinearDiscriminantAnalysis: 0.708\n\nPercent of failed DecisionTreeClassifier: 98.291\nPercent of failed LinearDiscriminantAnalysis: 98.291\n\nPercent of better DecisionTreeClassifier over most frequent category solution: 0.699\nPercent of better LinearDiscriminantAnalysis over most frequent category solution: 0.932\n\nFeatures: ['salary', 'age', 'edu', 'sex']\nTest sample size: 0.3\n"
expected = repr(cat2cat_ml_run(mappings=mappings, ml=ml, test_prop=0.3))
actual = "Average Accuracy naive: 0.439\nAverage Accuracy most_freq: 0.69\nAverage Accuracy DecisionTreeClassifier: 0.63\nAverage Accuracy LinearDiscriminantAnalysis: 0.692\n\nPercent of failed DecisionTreeClassifier: 98.291\nPercent of failed LinearDiscriminantAnalysis: 98.291\n\nPercent of better DecisionTreeClassifier over most frequent category solution: 43.182\nPercent of better LinearDiscriminantAnalysis over most frequent category solution: 54.545\nPercent of better DecisionTreeClassifier over naive solution: 81.818\nPercent of better LinearDiscriminantAnalysis over naive solution: 84.091\n\nFeatures: ['salary', 'age', 'edu', 'sex']\nTest sample size: 30.0\n"
assert actual == expected


Expand All @@ -60,7 +61,7 @@ def test_cat2cat_ml_run_get_raw():
)
expected = cat2cat_ml_run(mappings=mappings, ml=ml).get_raw()["7431"]
actual = {
"naive": 0.3333333333333333,
"naive": nan,
"freq": nan,
"DecisionTreeClassifier": nan,
"LinearDiscriminantAnalysis": nan,
Expand Down
Loading