diff --git a/.ci/aptPackagesToInstall.txt b/.ci/aptPackagesToInstall.txt new file mode 100644 index 0000000..bd4d2a3 --- /dev/null +++ b/.ci/aptPackagesToInstall.txt @@ -0,0 +1,2 @@ +python3-pandas +python3-numpy diff --git a/.ci/pythonStdlibFixes.sh b/.ci/pythonStdlibFixes.sh new file mode 100644 index 0000000..b6f8fba --- /dev/null +++ b/.ci/pythonStdlibFixes.sh @@ -0,0 +1,2 @@ +if $( python -c "import sys;sys.exit(int(not (sys.version_info < (3, 5)) ))" ); then curl -O https://raw.githubusercontent.com/python/cpython/3.6/Lib/typing.py; fi; +if $( python -c "import sys;sys.exit(int(not (sys.version_info < (3, 6)) ))" ); then curl -O https://raw.githubusercontent.com/python/cpython/3.7/Lib/enum.py; fi; diff --git a/.editorconfig b/.editorconfig new file mode 100755 index 0000000..c9162b9 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +root = true + +[*] +charset = utf-8 +indent_style = tab +indent_size = 4 +insert_final_newline = true +end_of_line = lf + +[*.{yml,yaml}] +indent_style = space +indent_size = 2 diff --git a/.github/.templateMarker b/.github/.templateMarker new file mode 100644 index 0000000..5e3a3e0 --- /dev/null +++ b/.github/.templateMarker @@ -0,0 +1 @@ +KOLANICH/python_project_boilerplate.py diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..89ff339 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,8 @@ +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "daily" + allow: + - dependency-type: "all" diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 0000000..7fe33b3 --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,15 @@ +name: CI +on: + push: + branches: [master] + pull_request: + branches: [master] + +jobs: + build: + runs-on: ubuntu-22.04 + steps: + - name: typical python workflow + uses: KOLANICH-GHActions/typical-python-workflow@master + with: + github_token: ${{ secrets.GITHUB_TOKEN }} diff --git 
a/.gitignore b/.gitignore new file mode 100644 index 0000000..59e6afa --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +__pycache__ +*.pyc +*.pyo +/*.egg-info +/build +/dist +/.eggs \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..7e11603 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,42 @@ +#image: travisci/ci-garnet:packer-1512502276-986baf0 +#image: pypy:latest +image: python:latest + +build: + tags: + - shared + stage: build + variables: + GIT_DEPTH: "1" + + before_script: + - source ./.ci/pythonStdlibFixes.sh + - pip3 install --upgrade setuptools + - pip3 install --upgrade pandas numpy + - pip3 install --upgrade coveralls setuptools_scm + + script: + - python3 setup.py bdist_wheel + - coverage run --source=Chassis setup.py test + - pip3 install --upgrade ./dist/*.whl + cache: + paths: + - /usr/local/site-packages + artifacts: + paths: + - dist + +sast: + tags: + - shared + image: docker:latest + variables: + DOCKER_DRIVER: overlay2 + allow_failure: true + services: + - docker:dind + script: + - docker run --env SAST_CONFIDENCE_LEVEL=5 --volume "$PWD:/code" --volume /var/run/docker.sock:/var/run/docker.sock "registry.gitlab.com/gitlab-org/security-products/sast:latest" /app/bin/run /code + artifacts: + paths: + - gl-sast-report.json diff --git a/Chassis.py b/Chassis.py new file mode 100644 index 0000000..898f074 --- /dev/null +++ b/Chassis.py @@ -0,0 +1,371 @@ +__all__ = ("Chassis", "MissingColumnsExplainer", "resampleDataFrame") + + +import typing +from inspect import signature + +import warnings +from functools import wraps + +import numpy as np +import pandas + + +class RecomputingDict(dict): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def mutateCallback(self): + pass + + def __setitem__(self, *args, **kwargs): + super().__setitem__(*args, **kwargs) + + +#_datasetsDb: typing.Optional[typing.Mapping[str, typing.Callable[[], object]]] = None +_datasetsDb = None + + +def 
def resampleDataFrame(pdf: pandas.DataFrame, balancer, columns: typing.Set[str] = None) -> pandas.DataFrame:
    """Resample a dataframe with an imblearn-style resampler, one target column at a time.

    For every name in `columns` that column is used as `y` and all the remaining columns as `x`;
    the resampled pair replaces the frame before the next column is processed.

    `pdf` is the frame to resample.
    `balancer` is an object exposing `fit_resample(x, y)` (or the legacy `fit_sample(x, y)`) that returns the resampled `(x, y)` pair.
    `columns` are the names of the columns to balance by. If it is None, every column is used in turn.

    Returns the resampled frame with the columns in the same order as in `pdf`.
    """
    origCols = pdf.columns
    if columns is None:
        columns = origCols

    # `fit_sample` is the legacy method name; prefer `fit_resample` and fall back for old resamplers.
    resample = getattr(balancer, "fit_resample", None)
    if resample is None:
        resample = balancer.fit_sample

    for cn in columns:
        # Keep the covariates in their original order instead of the arbitrary order of a set.
        x = pdf.loc[:, [c for c in origCols if c != cn]]
        y = pdf.loc[:, cn]
        x1, y1 = resample(x, y)
        x1 = pandas.DataFrame(x1, columns=x.columns)
        y1 = pandas.Series(y1, name=y.name)
        pdf = pandas.concat([x1, y1], axis=1)

    # `reindex` returns a new frame, so its result has to be returned to restore the column order.
    return pdf.reindex(origCols, axis=1)
# A single column name or an iterable of column names.
StrOrStrIter = typing.Union[str, typing.Iterable[str]]


def allowAcceptMultipleColumns(fOrSeriesClass=None, *, seriesClass: type = None):
    """Decorator making a per-column method also accept an iterable of column names.

    The decorated method must have the shape `f(self, cn: str, ...)` and be annotated as returning
    `np.ndarray`, `pandas.Series` or `pandas.DataFrame`.

    Supported usages:
        `@allowAcceptMultipleColumns` - bare decorator;
        `@allowAcceptMultipleColumns()` - same as bare;
        `@allowAcceptMultipleColumns(SomeClass)` or `@allowAcceptMultipleColumns(seriesClass=SomeClass)` -
        the result of a single-column call is additionally converted with `SomeClass(...)`.

    The resulting method returns the (optionally converted) result of `f` when called with a `str`,
    and the per-column results concatenated along `axis=1` when called with an iterable of names.

    Raises `ValueError` if the positional argument is neither a class nor a callable,
    and `TypeError` if the decorated function does not have the required signature.
    """
    if fOrSeriesClass is None:
        # `@allowAcceptMultipleColumns()` / `@allowAcceptMultipleColumns(seriesClass=...)`
        f = None
    elif isinstance(fOrSeriesClass, type):
        seriesClass = fOrSeriesClass
        f = None
    elif callable(fOrSeriesClass):
        f = fOrSeriesClass
    else:
        raise ValueError("Expected a function or a class, got " + repr(fOrSeriesClass))

    def _asPandas(res, cn):
        """Wraps a non-pandas per-column result into a named Series so that it can be concatenated."""
        if isinstance(res, (pandas.Series, pandas.DataFrame)):
            return res
        return pandas.Series(res, name=cn)

    def _allowAcceptMultipleColumns(f):
        s = signature(f)
        params = list(s.parameters.values())
        # params[0] is `self`, the column name must be the parameter right after it
        if len(params) < 2:
            raise TypeError(f.__name__ + " must accept `self` and a column name `cn`")
        columnParam = params[1]
        if columnParam.name != "cn":
            raise TypeError("The parameter after `self` must be named `cn`, not " + repr(columnParam.name))
        if columnParam.annotation != str:
            raise TypeError("`cn` must be annotated as `str`, not " + repr(columnParam.annotation))
        if s.return_annotation not in (np.ndarray, pandas.Series, pandas.DataFrame):
            raise TypeError("Unsupported return annotation: " + repr(s.return_annotation))

        if seriesClass is None or seriesClass is s.return_annotation:
            # no conversion needed
            singleSeriesCall = f
            seriesClass_ = s.return_annotation
        else:
            seriesClass_ = seriesClass

            def singleSeriesCall(self, cn: str, *args, **kwargs) -> seriesClass_:
                return seriesClass(f(self, cn, *args, **kwargs))

        @wraps(f)
        def modifiedF(self, cns: StrOrStrIter, *args, **kwargs) -> typing.Union[pandas.DataFrame, seriesClass_]:
            if isinstance(cns, str):
                return singleSeriesCall(self, cns, *args, **kwargs)

            return pandas.concat([_asPandas(f(self, cn, *args, **kwargs), cn) for cn in cns], axis=1)

        modifiedF.__name__ = f.__name__
        # `wraps` has copied the annotations of `f`; advertise that the column argument may be an iterable
        modifiedF.__annotations__["cn"] = StrOrStrIter
        return modifiedF

    if f is not None:
        return _allowAcceptMultipleColumns(f)

    return _allowAcceptMultipleColumns
def importSpec(self, spec: typing.Mapping[str, str]) -> None:
    """Imports specification dictionary.

    `spec` maps column names to the kind of the column. A kind is matched case-insensitively,
    either by the full name of a group from `groupsTypes` ("categorical", "numerical", "binary", "stop", "weight")
    or, failing that, by its first letter, so "Categoric", "Numeric" or "S" are accepted too.

    Fills `self.features` (the spec itself), `self.groups` (group name -> set of column names)
    and `self.columns` (all the columns except the "stop" and "weight" ones).

    Raises `KeyError` if a kind cannot be resolved to a group.
    """
    groupsTypes = self.__class__.groupsTypes

    # first letter -> group, only for the letters that identify a group unambiguously
    byInitial = {}
    ambiguous = set()
    for alias, group in groupsTypes.items():
        initial = alias[:1]
        if byInitial.setdefault(initial, group) != group:
            ambiguous.add(initial)
    for initial in ambiguous:
        del byInitial[initial]

    self.features = spec
    self.groups = {group: set() for group in set(groupsTypes.values())}
    for columnName, kind in self.features.items():
        normalized = kind.lower()
        group = groupsTypes.get(normalized)
        if group is None:
            group = byInitial.get(normalized[:1])
        if group is None:
            raise KeyError("Unknown kind " + repr(kind) + " for the column " + repr(columnName))
        self.groups[group].add(columnName)

    self.columns = set(self.features) - self.groups["stop"] - self.groups["weight"]
return + + pds.reindex() + if hasattr(pds, "infer_objects"): + pds = pds.infer_objects() + + presentStopColumns = self.groups["stop"] & set(pds.columns) + missingStopColumns = self.groups["stop"] - presentStopColumns + + if missingStopColumns and not self.dontWarnAboutMissingStopColumns: + warnings.warn("Following stop columns are missing: " + repr(missingStopColumns) + ". Using only present columns.") + self.columns -= missingStopColumns + self.groups["stop"] = presentStopColumns + + self.groups["weight"] = self.groups["weight"] & set(pds.columns) + if self.groups["weight"]: + assert len(self.groups["weight"]) == 1 + self.weights = pds.loc[:, list(self.groups["weight"])] + + self.stop = pds.loc[:, list(presentStopColumns)] + + pds = pds.loc[:, list(self.columns)] + + colz = [pds[cn].astype("float32") for cn in self.groups["binary"]] + catColz = {cn: pds[cn].astype("category") for cn in self.groups["categorical"]} + dummiez = {cn: pandas.get_dummies(col, prefix=cn) for cn, col in catColz.items()} + self.catIndex = {cn: col.cat.categories for cn, col in catColz.items()} + self.catRemap = {cn: list(col.columns) for cn, col in dummiez.items()} + #for cn in catColz: + # print(cn, len(self.catIndex[cn]), len(self.catRemap[cn])) + # assert len(self.catIndex[cn]) == len(self.catRemap[cn]) + colz.extend(dummiez.values()) + colz.extend([pandas.to_numeric(pds[c], "coerce") for c in self.groups["numerical"]]) + + self.pds = pandas.concat(colz, axis=1) + + @allowAcceptMultipleColumns(pandas.DataFrame) + def _colsNaEquiv(self, cn: str) -> pandas.Series: + """Returns a column suitable for checking if a value is nan. 
def prepareCovariates(self, cns: typing.Optional[StrOrStrIter] = (), dmat: typing.Optional[pandas.DataFrame] = None, excludeColumns: typing.Set[str] = None) -> pandas.DataFrame:
    """Returns matrix of the rest of covariates needed to fit the column(s) `cns`.

    `cns` is a name (or an iterable of names) of the original columns to leave out; for a categorical column
    all of its one-hot encoded columns are left out. None and an empty iterable leave out nothing.
    `dmat` is the matrix to take the columns from. If it is None, `self.pds` is used.
    `excludeColumns` are the names of additional columns of `dmat` to leave out.

    The columns of the result go in the same order as in `dmat`.
    """
    if dmat is None:
        dmat = self.pds

    if excludeColumns is None:
        dropped = set()
    elif isinstance(excludeColumns, str):
        # a lone name, not an iterable of characters
        dropped = {excludeColumns}
    else:
        dropped = set(excludeColumns)

    if cns is None:
        cns = ()
    elif isinstance(cns, str):
        cns = (cns,)

    for cn in cns:
        if cn in self.catRemap:
            dropped.update(self.catRemap[cn])
        else:
            dropped.add(cn)

    # A boolean mask keeps the column order of `dmat`, so the layout of the result is reproducible.
    return dmat.loc[:, ~dmat.columns.isin(dropped)]
def numpyToColumn(self, cn: str, data: np.ndarray, index=None) -> pandas.Series:
    """Converts a numpy array into a column.

    `cn` is the name given to the resulting column.
    `data` are the values of the column.
    `index` is the index of the resulting column. If it is None, the index of `self.pds` is used.
    """
    if index is None:
        index = self.pds.index
    # `pandas.Series` is the public name; `pandas.pandas` only exists as a side effect of pandas' own imports.
    return pandas.Series(data, index=index, name=cn)
@staticmethod
def specFromPandas(ds: pandas.DataFrame) -> typing.Mapping[str, str]:
    """Tries to reverse-engineer spec from data.

    Every column of `ds` gets one of the kinds:
        "binary" - a boolean column, or a numeric column whose non-missing values are exactly 0 and 1;
        "numerical" - any other float or integer (signed or unsigned) column;
        "categorical" - an object column all of whose non-missing values are strings;
        "stop" - everything else.

    Returns a dict mapping column names to kinds.
    """
    spec = {}
    for cn in ds.columns:
        column = ds.loc[:, cn]
        kind = column.dtype.kind
        role = "stop"
        if kind == "b":
            role = "binary"
        elif kind in ("f", "i", "u"):
            # Look at all the values, not at the label `0`: the index is arbitrary and the frame may be empty.
            present = set(column.dropna().unique())
            role = "binary" if present == {0, 1} else "numerical"
        elif kind == "O":
            # Missing values (None / NaN) must not affect the detected type.
            valueTypes = set(column.dropna().map(type))
            if len(valueTypes) == 1 and issubclass(next(iter(valueTypes)), str):
                role = "categorical"
        spec[cn] = role
    return spec
learning. It's my own reinvention of a ~~wheel~~ ![PyPI Status](https://img.shields.io/pypi/status/formulaic.svg)[![Build](https://img.shields.io/github/actions/workflow/status/matthewwardrop/formulaic/tests.yml?branch=main)](https://github.com/matthewwardrop/formulaic/actions?query=workflow%3A%22Run+Tox+Tests%22)[![docs](https://img.shields.io/github/actions/workflow/status/matthewwardrop/formulaic/publish_docs.yml?label=docs)](https://matthewwardrop.github.io/formulaic/)[![codecov](https://codecov.io/gh/matthewwardrop/formulaic/branch/main/graph/badge.svg)](https://codecov.io/gh/matthewwardrop/formulaic)[![Libraries.io Status](https://img.shields.io/librariesio/github/pydata/patsy.svg)](https://libraries.io/github/pydata/patsy), which doesn't fit my needs. + +It solves the following drawbacks of patsy: +* unpredictability + * the column names are changed in an unpredictable way depending on the **content** of the dataframe you pass to it. You also cannot retrieve the names and have to write very dirty code. Here you can retrieve columns by names. + * The content is often whatever `patsy` decides that we need. For example, it can remove a column if it finds the columns linearly dependent. Such matrices are not suitable for all the ML algorithms and currently there is no way to disable such a behavior. +* lack of automation - I have to do everything myself: construct the expression and evaluate it. 
+ +Requirements +------------ +* [`numpy`](https://github.com/numpy/numpy) ![Licence](https://img.shields.io/github/license/numpy/numpy.svg) [![PyPi Status](https://img.shields.io/pypi/v/numpy.svg)](https://pypi.org/project/numpy) [![Build status](https://github.com/numpy/numpy/actions/workflows/linux.yml/badge.svg?branch=main)](https://github.com/numpy/numpy/actions/workflows/linux.yml) [![Libraries.io Status](https://img.shields.io/librariesio/github/numpy/numpy.svg)](https://libraries.io/github/numpy/numpy) +* [`pandas`](https://github.com/pandas-dev/pandas) ![Licence](https://img.shields.io/github/license/pandas-dev/pandas.svg) [![PyPi Status](https://img.shields.io/pypi/v/pandas.svg)](https://pypi.python.org/pypi/pandas) [![CI](https://github.com/pandas-dev/pandas/actions/workflows/unit-tests.yml/badge.svg)](https://github.com/pandas-dev/pandas/actions/workflows/unit-tests.yml) [![CodeCov Coverage](https://codecov.io/github/pandas-dev/pandas/coverage.svg?branch=master)](https://codecov.io/github/pandas-dev/pandas/) [![Conda Latest Release](https://anaconda.org/conda-forge/pandas/badges/version.svg)](https://anaconda.org/conda-forge/pandas) [![License - BSD 3-Clause](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/main/LICENSE) [![Libraries.io Status](https://img.shields.io/librariesio/github/pandas-dev/pandas.svg)](https://libraries.io/github/pandas-dev/pandas) [![Gitter.im](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/pydata/pandas) diff --git a/Tutorial.ipynb b/Tutorial.ipynb new file mode 100644 index 0000000..c2033a2 --- /dev/null +++ b/Tutorial.ipynb @@ -0,0 +1,686 @@ +{ + "cells": + [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": + { + "collapsed": true + }, + "outputs": [], + "source": + [ + "import pandas\n", + "from Chassis import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": + { + "collapsed": false + }, + "outputs": + [ + { + "data": + { + 
"text/html": + [ + "
\n", + "