From 1c6a58fa14e4e64924900e7115c8f513e7f97561 Mon Sep 17 00:00:00 2001 From: samukweku Date: Thu, 19 Dec 2024 07:23:49 +0000 Subject: [PATCH] deploy: 1d274b515b9ba6e17867138ad2ef5a5a59480e78 --- .nojekyll | 0 404.html | 661 + AUTHORS/index.html | 893 + CHANGELOG/index.html | 1695 + api/biology/index.html | 1070 + api/chemistry/index.html | 2369 + api/engineering/index.html | 1122 + api/finance/index.html | 1567 + api/functions/index.html | 36199 ++++++++++++++++ api/io/index.html | 2919 ++ api/math/index.html | 2647 ++ api/ml/index.html | 1065 + api/polars/index.html | 5428 +++ api/timeseries/index.html | 1909 + api/xarray/index.html | 1430 + assets/_mkdocstrings.css | 143 + assets/images/favicon.png | Bin 0 -> 1870 bytes assets/javascripts/bundle.88dd0f4e.min.js | 16 + assets/javascripts/bundle.88dd0f4e.min.js.map | 7 + assets/javascripts/lunr/min/lunr.ar.min.js | 1 + assets/javascripts/lunr/min/lunr.da.min.js | 18 + assets/javascripts/lunr/min/lunr.de.min.js | 18 + assets/javascripts/lunr/min/lunr.du.min.js | 18 + assets/javascripts/lunr/min/lunr.el.min.js | 1 + assets/javascripts/lunr/min/lunr.es.min.js | 18 + assets/javascripts/lunr/min/lunr.fi.min.js | 18 + assets/javascripts/lunr/min/lunr.fr.min.js | 18 + assets/javascripts/lunr/min/lunr.he.min.js | 1 + assets/javascripts/lunr/min/lunr.hi.min.js | 1 + assets/javascripts/lunr/min/lunr.hu.min.js | 18 + assets/javascripts/lunr/min/lunr.hy.min.js | 1 + assets/javascripts/lunr/min/lunr.it.min.js | 18 + assets/javascripts/lunr/min/lunr.ja.min.js | 1 + assets/javascripts/lunr/min/lunr.jp.min.js | 1 + assets/javascripts/lunr/min/lunr.kn.min.js | 1 + assets/javascripts/lunr/min/lunr.ko.min.js | 1 + assets/javascripts/lunr/min/lunr.multi.min.js | 1 + assets/javascripts/lunr/min/lunr.nl.min.js | 18 + assets/javascripts/lunr/min/lunr.no.min.js | 18 + assets/javascripts/lunr/min/lunr.pt.min.js | 18 + assets/javascripts/lunr/min/lunr.ro.min.js | 18 + assets/javascripts/lunr/min/lunr.ru.min.js | 18 + 
assets/javascripts/lunr/min/lunr.sa.min.js | 1 + .../lunr/min/lunr.stemmer.support.min.js | 1 + assets/javascripts/lunr/min/lunr.sv.min.js | 18 + assets/javascripts/lunr/min/lunr.ta.min.js | 1 + assets/javascripts/lunr/min/lunr.te.min.js | 1 + assets/javascripts/lunr/min/lunr.th.min.js | 1 + assets/javascripts/lunr/min/lunr.tr.min.js | 18 + assets/javascripts/lunr/min/lunr.vi.min.js | 1 + assets/javascripts/lunr/min/lunr.zh.min.js | 1 + assets/javascripts/lunr/tinyseg.js | 206 + assets/javascripts/lunr/wordcut.js | 6708 +++ .../workers/search.6ce7567c.min.js | 42 + .../workers/search.6ce7567c.min.js.map | 7 + assets/stylesheets/main.6f8fc17f.min.css | 1 + assets/stylesheets/main.6f8fc17f.min.css.map | 1 + assets/stylesheets/palette.06af60db.min.css | 1 + .../stylesheets/palette.06af60db.min.css.map | 1 + css/apidocs.css | 53 + development/lazy_imports/index.html | 767 + devguide/index.html | 1262 + environment.yaml | 29 + index.html | 1221 + objects.inv | Bin 0 -> 2470 bytes search/search_index.json | 1 + sitemap.xml | 67 + sitemap.xml.gz | Bin 0 -> 327 bytes 68 files changed, 71794 insertions(+) create mode 100644 .nojekyll create mode 100644 404.html create mode 100644 AUTHORS/index.html create mode 100644 CHANGELOG/index.html create mode 100644 api/biology/index.html create mode 100644 api/chemistry/index.html create mode 100644 api/engineering/index.html create mode 100644 api/finance/index.html create mode 100644 api/functions/index.html create mode 100644 api/io/index.html create mode 100644 api/math/index.html create mode 100644 api/ml/index.html create mode 100644 api/polars/index.html create mode 100644 api/timeseries/index.html create mode 100644 api/xarray/index.html create mode 100644 assets/_mkdocstrings.css create mode 100644 assets/images/favicon.png create mode 100644 assets/javascripts/bundle.88dd0f4e.min.js create mode 100644 assets/javascripts/bundle.88dd0f4e.min.js.map create mode 100644 assets/javascripts/lunr/min/lunr.ar.min.js create mode 
100644 assets/javascripts/lunr/min/lunr.da.min.js create mode 100644 assets/javascripts/lunr/min/lunr.de.min.js create mode 100644 assets/javascripts/lunr/min/lunr.du.min.js create mode 100644 assets/javascripts/lunr/min/lunr.el.min.js create mode 100644 assets/javascripts/lunr/min/lunr.es.min.js create mode 100644 assets/javascripts/lunr/min/lunr.fi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.fr.min.js create mode 100644 assets/javascripts/lunr/min/lunr.he.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hu.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hy.min.js create mode 100644 assets/javascripts/lunr/min/lunr.it.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ja.min.js create mode 100644 assets/javascripts/lunr/min/lunr.jp.min.js create mode 100644 assets/javascripts/lunr/min/lunr.kn.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ko.min.js create mode 100644 assets/javascripts/lunr/min/lunr.multi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.nl.min.js create mode 100644 assets/javascripts/lunr/min/lunr.no.min.js create mode 100644 assets/javascripts/lunr/min/lunr.pt.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ro.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ru.min.js create mode 100644 assets/javascripts/lunr/min/lunr.sa.min.js create mode 100644 assets/javascripts/lunr/min/lunr.stemmer.support.min.js create mode 100644 assets/javascripts/lunr/min/lunr.sv.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ta.min.js create mode 100644 assets/javascripts/lunr/min/lunr.te.min.js create mode 100644 assets/javascripts/lunr/min/lunr.th.min.js create mode 100644 assets/javascripts/lunr/min/lunr.tr.min.js create mode 100644 assets/javascripts/lunr/min/lunr.vi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.zh.min.js create mode 100644 assets/javascripts/lunr/tinyseg.js create mode 100644 
assets/javascripts/lunr/wordcut.js create mode 100644 assets/javascripts/workers/search.6ce7567c.min.js create mode 100644 assets/javascripts/workers/search.6ce7567c.min.js.map create mode 100644 assets/stylesheets/main.6f8fc17f.min.css create mode 100644 assets/stylesheets/main.6f8fc17f.min.css.map create mode 100644 assets/stylesheets/palette.06af60db.min.css create mode 100644 assets/stylesheets/palette.06af60db.min.css.map create mode 100644 css/apidocs.css create mode 100644 development/lazy_imports/index.html create mode 100644 devguide/index.html create mode 100644 environment.yaml create mode 100644 index.html create mode 100644 objects.inv create mode 100644 search/search_index.json create mode 100644 sitemap.xml create mode 100644 sitemap.xml.gz diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 000000000..e69de29bb diff --git a/404.html b/404.html new file mode 100644 index 000000000..1c0f177b8 --- /dev/null +++ b/404.html @@ -0,0 +1,661 @@ + + + + + + + + + + + + + + + + + + + + + pyjanitor documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ +

404 - Not found

+ +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/AUTHORS/index.html b/AUTHORS/index.html new file mode 100644 index 000000000..fa500ee0e --- /dev/null +++ b/AUTHORS/index.html @@ -0,0 +1,893 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Authors - pyjanitor documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Contributors

+

Once you have added your contribution to pyjanitor, +please add your name using this markdown template:

+
[@githubname](https://github.com/githubname) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%3Agithubname)
+
+

You can copy/paste the template and replace githubname with your username.

+

Contributions that did not leave a commit trace +are indicated in bullet points below each user's username.

+

Leads

+ +

Contributors

+ + + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/CHANGELOG/index.html b/CHANGELOG/index.html new file mode 100644 index 000000000..a0e5e84c5 --- /dev/null +++ b/CHANGELOG/index.html @@ -0,0 +1,1695 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Changelog - pyjanitor documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + +

Changelog

+

Unreleased

+

v0.30.0 - 2024-12-04

+

v0.29.2 - 2024-09-28

+

v0.29.1 - 2024-09-23

+

v0.29.0 - 2024-09-15

+
    +
  • [DOC] Un-deprecate join_apply as no alternative currently exists - Issue #1399 @lbeltrame
  • +
+

v0.28.1 - 2024-08-09

+

v0.28.0 - 2024-08-03

+
    +
  • [ENH] Added a cartesian_product function, as well as an expand method for pandas. - Issue #1293 @samukweku
  • +
  • [ENH] Improve pivot_longer when sort_by_appearance is True. Added pivot_longer_spec for more control on how the dataframe should be unpivoted. -@samukweku #1361
  • +
  • [ENH] Added convert_excel_date and convert_matlab_date methods for polars - Issue #1352
  • +
  • [ENH] Added a complete method for polars. - Issue #1352 @samukweku
  • +
  • [ENH] Added a pivot_longer method, and a pivot_longer_spec function for polars - Issue #1352 @samukweku
  • +
  • [ENH] Added a row_to_names method for polars. Issue #1352 @samukweku
  • +
  • [ENH] read_commandline function now supports polars - Issue #1352 @samukweku
  • +
  • [ENH] xlsx_cells function now supports polars - Issue #1352 @samukweku
  • +
  • [ENH] xlsx_table function now supports polars - Issue #1352 @samukweku
  • +
  • [ENH] Added a clean_names method for polars - it can be used to clean the column names, or clean column values. Issue #1343 @samukweku
  • +
  • [ENH] Improved performance for non-equi joins when using numba - @samukweku PR #1341
  • +
  • [ENH] pandas Index,Series, DataFrame now supported in the complete method. - PR #1369 @samukweku
  • +
  • [ENH] Improve performance for first/last in conditional_join, when the join columns in the right dataframe are sorted. - PR #1382 @samukweku
  • +
+

v0.27.0 - 2024-03-21

+
    +
  • [BUG] Fix logic for groupby in complete. Index support deprecated. Fix deprecation warning for fillna in complete PR #1289 @samukweku
  • +
  • [ENH] select function now supports variable arguments - PR #1288 @samukweku
  • +
  • [ENH] conditional_join now supports timedelta dtype. - PR #1297 @samukweku
  • +
  • [ENH] get_join_indices function added - returns only join indices between two dataframes. Issue #1310 @samukweku
  • +
  • [ENH] explode_index function added. - Issue #1283
  • +
  • [ENH] conditional_join now supports timedelta dtype. - PR #1297
  • +
  • [ENH] change_index_dtype added. - @samukweku Issue #1314
  • +
  • [ENH] Add glue and axis parameters to collapse_levels. - Issue #211 @samukweku
  • +
  • [ENH] row_to_names now supports multiple rows conversion to columns. - @samukweku Issue #1333
  • +
  • [ENH] Fix warnings from Pandas. truncate_datetime now uses a vectorized option. -@samukweku #1337
  • +
+

v0.26.0 - 2023-09-18

+
    +
  • [ENH] clean_names can now be applied to column values. Issue #995 @samukweku
  • +
  • [BUG] Fix ImportError - Issue #1285 @samukweku
  • +
+

v0.25.0 - 2023-07-27

+
    +
  • [INF] Replace pytest.ini file with pyproject.toml file. PR #1204 @Zeroto521
  • +
  • [INF] Extract docstrings tests from all tests. PR #1205 @Zeroto521
  • +
  • [BUG] Address the TypeError when importing v0.24.0 (issue #1201 @xujiboy and @joranbeasley)
  • +
  • [INF] Fixed issue with missing PyPI README. PR #1216 @thatlittleboy
  • +
  • [INF] Update some mkdocs compatibility code. PR #1231 @thatlittleboy
  • +
  • [INF] Migrated docstring style from Sphinx to Google for better compatibility with mkdocstrings. PR #1235 @thatlittleboy
  • +
  • [INF] Prevent selection of chevrons (>>>) and outputs in Example code blocks. PR #1237 @thatlittleboy
  • +
  • [DEPR] Add deprecation warnings for process_text, rename_column, rename_columns, filter_on, remove_columns, fill_direction. Issue #1045 @samukweku
  • +
  • [ENH] pivot_longer now supports named groups where names_pattern is a regular expression. A dictionary can now be passed to names_pattern, and is internally evaluated as a list/tuple of regular expressions. Issue #1209 @samukweku
  • +
  • [ENH] Improve selection in conditional_join. Issue #1223 @samukweku
  • +
  • [ENH] Add col class for selecting columns within an expression. Currently limited to use within conditional_join. PR #1260 @samukweku.
  • +
  • [ENH] Performance improvement for range joins in conditional_join, when use_numba = False. Performance improvement for equi-join and a range join, when use_numba = True, for many to many join with wide ranges. PR #1256, #1267 @samukweku
  • +
  • [DEPR] Add deprecation warning for pivot_wider. Issue #1045 @samukweku
  • +
  • [BUG] Fix string column selection on a MultiIndex. Issue #1265. @samukweku
  • +
+

v0.24.0 - 2022-11-12

+
    +
  • [ENH] Add lazy imports to speed up the time taken to load pyjanitor (part 2)
  • +
  • [DOC] Updated developer guide docs.
  • +
  • [ENH] Allow column selection/renaming within conditional_join. Issue #1102. Also allow first or last match. Issue #1020 @samukweku.
  • +
  • [ENH] New decorator deprecated_kwargs for breaking API. #1103 @Zeroto521
  • +
  • [ENH] Extend select_columns to support non-string columns. Issue #1105 @samukweku
  • +
  • [ENH] Performance improvement for groupby_topk. Issue #1093 @samukweku
  • +
  • [ENH] min_max_scale drop old_min and old_max to fit sklearn's method API. Issue #1068 @Zeroto521
  • +
  • [ENH] Add jointly option for min_max_scale support to transform each column values or entire values. Default transform each column, similar behavior to sklearn.preprocessing.MinMaxScaler. (Issue #1067, PR #1112, PR #1123) @Zeroto521
  • +
  • [INF] Require pyspark minimum version v3.2.0 to cut duplicated code. Issue #1110 @Zeroto521
  • +
  • [ENH] Add support for extension arrays in expand_grid. Issue #1121 @samukweku
  • +
  • [ENH] Add names_expand and index_expand parameters to pivot_wider for exposing missing categoricals. Issue #1108 @samukweku
  • +
  • [ENH] Add fix for slicing error when selecting columns in pivot_wider. Issue #1134 @samukweku
  • +
  • [ENH] dropna parameter added to pivot_longer. Issue #1132 @samukweku
  • +
  • [INF] Update mkdocstrings version to fit its upcoming features. PR #1138 @Zeroto521
  • +
  • [BUG] Force math.softmax returning Series. PR #1139 @Zeroto521
  • +
  • [INF] Set independent environment for building documentation. PR #1141 @Zeroto521
  • +
  • [DOC] Add local documentation preview via github action artifact. PR #1149 @Zeroto521
  • +
  • [ENH] Enable encode_categorical to handle arrays with 2 (or more) dimensions. PR #1153 @Zeroto521
  • +
  • [TST] Fix test cases failing on Windows. Issue #1160 @Zeroto521, and @samukweku
  • +
  • [INF] Cancel old workflow runs via Github Action concurrency. PR #1161 @Zeroto521
  • +
  • [ENH] Faster computation for non-equi join, with a numba engine. Speed improvement for left/right joins when sort_by_appearance is False. Issue #1102 @samukweku
  • +
  • [BUG] Avoid change_type mutating original DataFrame. PR #1162 @Zeroto521
  • +
  • [ENH] The column_name parameter of change_type now fully supports multiple columns as input. #1163 @Zeroto521
  • +
  • [ENH] Fix error when sort_by_appearance=True is combined with dropna=True. Issue #1168 @samukweku
  • +
  • [ENH] Add explicit default parameter to case_when function. Issue #1159 @samukweku
  • +
  • [BUG] pandas 1.5.x _MergeOperation doesn't have copy keyword anymore. Issue #1174 @Zeroto521
  • +
  • [ENH] select_rows function added for flexible row selection. Generic select function added as well. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku
  • +
  • [TST] Compatibility with macOS and Windows, to fix FailedHealthCheck. Issue #1181 @Zeroto521
  • +
  • [INF] Merge two docs CIs (docs-preview.yml and docs.yml) to one. And add documentation pytest mark. PR #1183 @Zeroto521
  • +
  • [INF] Merge codecov.yml (only works for the dev branch pushing event) into tests.yml (only works for PR event). PR #1185 @Zeroto521
  • +
  • [TST] Fix failure for test/timeseries/test_fill_missing_timestamp. Issue #1184 @samukweku
  • +
  • [BUG] Import DataDescription to fix: AttributeError: 'DataFrame' object has no attribute 'data_description'. PR #1191 @Zeroto521
  • +
+

v0.23.1 - 2022-05-03

+
    +
  • [DOC] Updated fill.py and update_where.py documentation with working examples.
  • +
  • [ENH] Deprecate num_bins from bin_numeric in favour of bins, and allow generic **kwargs to be passed into pd.cut. Issue #969. @thatlittleboy
  • +
  • [ENH] Fix concatenate_columns not working on category inputs @zbarry
  • +
  • [INF] Simplify CI system @ericmjl
  • +
  • [ENH] Added "read_commandline" function to janitor.io @BaritoneBeard
  • +
  • [BUG] Fix bug with the complement parameter of filter_on. Issue #988. @thatlittleboy
  • +
  • [ENH] Add xlsx_table, for reading tables from an Excel sheet. @samukweku
  • +
  • [ENH] minor improvements for conditional_join; equality only joins are no longer supported; there has to be at least one non-equi join present. @samukweku
  • +
  • [BUG] sort_column_value_order no longer mutates original dataframe.
  • +
  • [BUG] Extend fill_empty's column_names type range. Issue #998. @Zeroto521
  • +
  • [BUG] Removed/updated error-inducing default arguments in row_to_names (#1004) and round_to_fraction (#1005). @thatlittleboy
  • +
  • [ENH] patterns deprecated in favour of importing re.compile. #1007 @samukweku
  • +
  • [ENH] Changes to kwargs in encode_categorical, where the values can either be a string or a 1D array. #1021 @samukweku
  • +
  • [ENH] Add fill_value and explicit parameters to the complete function. #1019 @samukweku
  • +
  • [ENH] Performance improvement for expand_grid. @samukweku
  • +
  • [BUG] Make factorize_columns (PR #1028) and truncate_datetime_dataframe (PR #1040) functions non-mutating. @thatlittleboy
  • +
  • [BUG] Fix SettingWithCopyWarning and other minor bugs when using truncate_datetime_dataframe, along with further performance improvements (PR #1040). @thatlittleboy
  • +
  • [ENH] Performance improvement for conditional_join. @samukweku
  • +
  • [ENH] Multiple .value is now supported in pivot_longer. Multiple values_to is also supported, when names_pattern is a list or tuple. names_transform parameter added, for efficient dtype transformation of unpivoted columns. #1034, #1048, #1051 @samukweku
  • +
  • [ENH] Add xlsx_cells for reading a spreadsheet as a table of individual cells. #929 @samukweku.
  • +
  • [ENH] Let filter_string suit parameters of Series.str.contains Issue #1003 and #1047. @Zeroto521
  • +
  • [ENH] names_glue in pivot_wider now takes a string form, using str.format_map under the hood. levels_order is also deprecated. @samukweku
  • +
  • [BUG] Fixed bug in transform_columns which ignored the column_names specification when new_column_names dictionary was provided as an argument, issue #1063. @thatlittleboy
  • +
  • [BUG] count_cumulative_unique no longer modifies the column being counted in the output when case_sensitive argument is set to False, issue #1065. @thatlittleboy
  • +
  • [BUG] Fix for gcc missing error in dev container
  • +
  • [DOC] Added a step in the dev guide to install Remote Container in VS Code. @ashenafiyb
  • +
  • [DOC] Convert expand_column and find_replace code examples to doctests, issue #972. @gahjelle
  • +
  • [DOC] Convert expand_column code examples to doctests, issue #972. @gahjelle
  • +
  • [DOC] Convert get_dupes code examples to doctests, issue #972. @ethompsy
  • +
  • [DOC] Convert engineering code examples to doctests, issue #972 @ashenafiyb
  • +
  • [DOC] Convert groupby_topk code examples to doctests, issue #972. @ethompsy
  • +
  • [DOC] Add doctests to math, issue #972. @gahjelle
  • +
  • [DOC] Add doctests to math and ml, issue #972. @gahjelle
  • +
  • [DOC] Add doctests to math, ml, and xarray, issue #972. @gahjelle
  • +
+

v0.22.0 - 2021-11-21

+
    +
  • [BUG] Fix conditional join issue for multiple conditions, where pd.eval fails to evaluate if numexpr is installed. #898 @samukweku
  • +
  • [ENH] Added case_when to handle multiple conditionals and replacement values. Issue #736. @robertmitchellv
  • +
  • [ENH] Deprecate new_column_names and merge_frame from process_text. Only existing columns are supported. @samukweku
  • +
  • [ENH] complete uses pd.merge internally, providing a simpler logic, with some speed improvements in certain cases over pd.reindex. @samukweku
  • +
  • [ENH] expand_grid returns a MultiIndex DataFrame, allowing the user to decide how to manipulate the columns. @samukweku
  • +
  • [INF] Simplify a bit linting, use pre-commit as the CI linting checker. @Zeroto521
  • +
  • [ENH] Fix bug in pivot_longer for wrong output when names_pattern is a sequence with a single value. Issue #885 @samukweku
  • +
  • [ENH] Deprecate aggfunc from pivot_wider; aggregation can be chained with pandas' groupby.
  • +
  • [ENH] As_Categorical deprecated from encode_categorical; a tuple of (categories, order) suffices for **kwargs. @samukweku
  • +
  • [ENH] Deprecate names_sort from pivot_wider. @samukweku
  • +
  • [ENH] Add softmax to math module. Issue #902. @loganthomas
  • +
+

v0.21.2 - 2021-09-01

+
    +
  • [ENH] Fix warning message in coalesce, from bfill/fill; coalesce now uses variable arguments. Issue #882 @samukweku
  • +
  • [INF] Add SciPy as explicit dependency in base.in. Issue #895 @ericmjl
  • +
+

v0.21.1 - 2021-08-29

+
    +
  • [DOC] Fix references and broken links in AUTHORS.rst. @loganthomas
  • +
  • [DOC] Updated Broken links in the README and contributing docs. @nvamsikrishna05
  • +
  • [INF] Update pre-commit hooks and remove mutable references. Issue #844. @loganthomas
  • +
  • [INF] Add GitHub Release pointer to auto-release script. Issue #818. @loganthomas
  • +
  • [INF] Updated black version in github actions code-checks to match pre-commit hooks. @nvamsikrishna05
  • +
  • [ENH] Add reset_index flag to row_to_names function. @fireddd
  • +
  • [ENH] Updated label_encode to use pandas factorize instead of scikit-learn LabelEncoder. @nvamsikrishna05
  • +
  • [INF] Removed the scikit-learn package from the dependencies from environment-dev.yml and base.in files. @nvamsikrishna05
  • +
  • [ENH] Add function to remove constant columns. @fireddd
  • +
  • [ENH] Added factorize_columns method which will deprecate the label_encode method in future release. @nvamsikrishna05
  • +
  • [DOC] Delete Read the Docs project and remove all readthedocs.io references from the repo. Issue #863. @loganthomas
  • +
  • [DOC] Updated various documentation sources to reflect pyjanitor-dev ownership. @loganthomas
  • +
  • [INF] Fix isort automatic checks. Issue #845. @loganthomas
  • +
  • [ENH] complete function now uses variable args (*args) - @samukweku
  • +
  • [ENH] Set the default of expand_column's sep to "|", the same as pandas.Series.str.get_dummies. Issue #876. @Zeroto521
  • +
  • [ENH] Deprecate limit from fill_direction. fill_direction now uses kwargs. @samukweku
  • +
  • [ENH] Added conditional_join function that supports joins on non-equi operators. @samukweku
  • +
  • [INF] Speed up pytest via -n (pytest-xdist) option. Issue #881. @Zeroto521
  • +
  • [DOC] Add list mark to keep select_columns's example same style. @Zeroto521
  • +
  • [ENH] Updated rename_columns to take optional function argument for mapping. @nvamsikrishna05
  • +
+

v0.21.0 - 2021-07-16

+
    +
  • [ENH] Drop fill_value parameter from complete. Users can use fillna instead. @samukweku
  • +
  • [BUG] Fix bug in pivot_longer with single level columns. @samukweku
  • +
  • [BUG] Disable exchange rates API until we can find another one to hit. @ericmjl
  • +
  • [ENH] Change coalesce to return columns; also use bfill, ffill, + which is faster than combine_first @samukweku
  • +
  • [ENH] Use eval for string conditions in update_where. @samukweku
  • +
  • [ENH] Add clearer error messages for pivot_longer. h/t to @tdhock + for the observation. Issue #836 @samukweku
  • +
  • [ENH] select_columns now uses variable arguments (*args), + to provide a simpler selection without the need for lists. - @samukweku
  • +
  • [ENH] encode_categoricals refactored to use generic functions + via functools.dispatch. - @samukweku
  • +
  • [ENH] Updated convert_excel_date to throw meaningful error when values contain non-numeric. @nvamsikrishna05
  • +
+

v0.20.14 - 2021-03-25

+
    +
  • [ENH] Add dropna parameter to groupby_agg. @samukweku
  • +
  • [ENH] complete adds a by parameter to expose explicit missing values per group, via groupby. @samukweku
  • +
  • [ENH] Fix check_column to support single inputs - fixes label_encode. @zbarry
  • +
+

v0.20.13 - 2021-02-25

+
    +
  • [ENH] Performance improvements to expand_grid. @samukweku
  • +
  • [HOTFIX] Add multipledispatch to pip requirements. @ericmjl
  • +
+

v0.20.12 - 2021-02-25

+
    +
  • [INF] Auto-release GitHub action maintenance. @loganthomas
  • +
+

v0.20.11 - 2021-02-24

+
    +
  • [INF] Setup auto-release GitHub action. @loganthomas
  • +
  • [INF] Deploy darglint package for docstring linting. Issue #745. @loganthomas
  • +
  • [ENH] Added optional truncation to clean_names function. Issue #753. @richardqiu
  • +
  • [ENH] Added timeseries.flag_jumps() function. Issue #711. @loganthomas
  • +
  • [ENH] pivot_longer can handle multiple values in paired columns, and can reshape + using a list/tuple of regular expressions in names_pattern. @samukweku
  • +
  • [ENH] Replaced default numeric conversion of dataframe with a dtypes parameter, + allowing the user to control the data types. - @samukweku
  • +
  • [INF] Loosen dependency specifications. Switch to pip-tools for managing + dependencies. Issue #760. @MinchinWeb
  • +
  • [DOC] added pipenv installation instructions @evan-anderson
  • +
  • [ENH] Add pivot_wider function, which is the inverse of the pivot_longer + function. @samukweku
  • +
  • [INF] Add openpyxl to environment-dev.yml. @samukweku
  • +
  • [ENH] Reduce code by reusing existing functions for fill_direction. @samukweku
  • +
  • [ENH] Improvements to pivot_longer function, with improved speed and cleaner code. + dtypes parameter dropped; user can change dtypes with pandas' astype method, or + pyjanitor's change_type method. @samukweku
  • +
  • [ENH] Add kwargs to encode_categorical function, to create ordered categorical columns, + or categorical columns with explicit categories. @samukweku
  • +
  • [ENH] Improvements to complete method. Use pd.merge to handle duplicates and + null values. @samukweku
  • +
  • [ENH] Add new_column_names parameter to process_text, allowing a user to + create a new column name after processing a text column. Also added a merge_frame + parameter, allowing dataframe merging, if the result of the text processing is a + dataframe.@samukweku
  • +
  • [ENH] Add aggfunc parameter to pivot_wider. @samukweku
  • +
  • [ENH] Modified the check function in utils to verify if a value is a callable. @samukweku
  • +
  • [ENH] Add a base _select_column function, using functools.singledispatch, + to allow for flexible columns selection. @samukweku
  • +
  • [ENH] pivot_longer and pivot_wider now support janitor.select_columns syntax, + allowing for more flexible and dynamic column selection. @samukweku
  • +
+

v0.20.10

+
    +
  • [ENH] Added function sort_timestamps_monotonically to timeseries functions @UGuntupalli
  • +
  • [ENH] Added the complete function for converting implicit missing values + to explicit ones. @samukweku
  • +
  • [ENH] Further simplification of expand_grid. @samukweku
  • +
  • [BUGFIX] Added copy() method to original dataframe, to avoid mutation. Issue #729. @samukweku
  • +
  • [ENH] Added also method for running functions in chain with no return values.
  • +
  • [DOC] Added a timeseries module section to website docs. Issue #742. @loganthomas
  • +
  • [ENH] Added a pivot_longer function, a wrapper around pd.melt and similar to + tidyr's pivot_longer function. Also added an example notebook. @samukweku
  • +
  • [ENH] Fixed code to return an error if fill_value is not a dictionary. @samukweku
  • +
  • [INF] Welcome bot (.github/config.yml) for new users added. Issue #739. @samukweku
  • +
+

v0.20.9

+
    +
  • [ENH] Updated groupby_agg function to account for null entries in the by argument. @samukweku
  • +
  • [ENH] Added function groupby_topk to janitor functions @mphirke
  • +
+

v0.20.8

+
    +
  • [ENH] Upgraded update_where function to use either the pandas query style, + or boolean indexing via the loc method. Also updated find_replace function to use the loc + method directly, instead of routing it through the update_where function. @samukweku
  • +
  • [INF] Update pandas minimum version to 1.0.0. @hectormz
  • +
  • [DOC] Updated the general functions API page to show all available functions. @samukweku
  • +
  • [DOC] Fix the few lacking type annotations of functions. @VPerrollaz
  • +
  • [DOC] Changed the signature from str to Optional[str] when initialized by None. @VPerrollaz
  • +
  • [DOC] Add the Optional type for all signatures of the API. @VPerrollaz
  • +
  • [TST] Updated test_expand_grid to account for int dtype difference in Windows OS @samukweku
  • +
  • [TST] Make importing pandas testing functions follow uniform pattern. @hectormz
  • +
  • [ENH] Added process_text wrapper function for all Pandas string methods. @samukweku
  • +
  • [TST] Only skip tests for non-installed libraries on local machine. @hectormz
  • +
  • [DOC] Fix minor issues in documentation. @hectormz
  • +
  • [ENH] Added fill_direction function for forward/backward fills on missing values + for selected columns in a dataframe. @samukweku
  • +
  • [ENH] Simpler logic and fewer lines of code for expand_grid function @samukweku
  • +
+

v0.20.7

+
    +
  • [TST] Add a test for transform_column to check for nonmutation. @VPerrollaz
  • +
  • [ENH] Contributed expand_grid function by @samukweku
  • +
+

v0.20.6

+
    +
  • [DOC] Pep8 all examples. @VPerrollaz
  • +
  • [TST] Add docstrings to tests @hectormz
  • +
  • [INF] Add debug-statements, requirements-txt-fixer, and interrogate to pre-commit. @hectormz
  • +
  • [ENH] Upgraded transform_column to use df.assign underneath the hood, + and also added option to transform column elementwise (via apply) + or columnwise (thus operating on a series). @ericmjl
  • +
+

v0.20.5

+
    +
  • [INF] Replace pycodestyle with flake8 in order to add pandas-vet linter @hectormz
  • +
  • [ENH] select_columns() now raises NameError if column label in + search_columns_labels is missing from DataFrame columns. @smu095
  • +
+

v0.20.1

+
    +
  • [DOC] Added an example for groupby_agg in general functions @samukweku
  • +
  • [ENH] Contributed sort_naturally() function. @ericmjl
  • +
+

v0.20.0

+
    +
  • [DOC] Edited transform_column dest_column_name kwarg description to be clearer on defaults by @evan-anderson.
  • +
  • [ENH] Replace apply() in favor of pandas functions in several functions. @hectormz
  • +
  • [ENH] Add ecdf() Series function by @ericmjl.
  • +
  • [DOC] Update API policy for clarity. @ericmjl
  • +
  • [ENH] Enforce string conversion when cleaning names. @ericmjl
  • +
  • [ENH] Change find_replace implementation to use keyword arguments to specify columns to perform find and replace on. @ericmjl
  • +
  • [ENH] Add jitter() dataframe function by @rahosbach
  • +
+

v0.19.0

+
    +
  • [ENH] Add xarray support and clone_using / convert_datetime_to_number funcs by @zbarry.
  • +
+

v0.18.3

+
    +
  • [ENH] Series toset() functionality #570 @eyaltrabelsi
  • +
  • [ENH] Added option to coalesce function to not delete coalesced columns. @gddcunh
  • +
  • [ENH] Added functionality to deconcatenate tuple/list/collections in a column to deconcatenate_column @zbarry
  • +
  • [ENH] Fix error message when length of new_column_names is wrong @DollofCutty
  • +
  • [DOC] Fixed several examples of functional syntax in functions.py. @bdice
  • +
  • [DOC] Fix #noqa comments showing up in docs by @hectormz
  • +
  • [ENH] Add support for unionizing the categoricals of a group of dataframes. @zbarry
  • +
  • [DOC] Fix contributions hyperlinks in AUTHORS.rst and contributions by @hectormz
  • +
  • [INF] Add pre-commit hooks to repository by @ericmjl
  • +
  • [DOC] Fix formatting code in CONTRIBUTING.rst by @hectormz
  • +
  • [DOC] Changed the typing for most "column_name(s)" to Hashable rather than enforcing strings, to more closely match Pandas API by @dendrondal
  • +
  • [INF] Edited pycodestyle and Black parameters to avoid venvs by @dendrondal
  • +
+

v0.18.2

+
    +
  • [INF] Make requirements.txt smaller @eyaltrabelsi
  • +
  • [ENH] Add a reset_index parameter to shuffle @eyaltrabelsi
  • +
  • [DOC] Added contribution page link to readme @eyaltrabelsi
  • +
  • [DOC] fix example for update_where, provide a bit more detail, and expand the bad_values example notebook to demonstrate its use by @anzelpwj.
  • +
  • [INF] Fix pytest marks by @ericmjl (issue #520)
  • +
  • [ENH] add example notebook with use of finance submodule methods by @rahosbach
  • +
  • [DOC] added a couple of admonitions for Windows users. h/t @anzelpwj for debugging + help when a few tests failed for win32 @Ram-N
  • +
  • [ENH] Pyjanitor for PySpark @zjpoh
  • +
  • [ENH] Add pyspark clean_names @zjpoh
  • +
  • [ENH] Convert asserts to raise exceptions by @hectormz
  • +
  • [ENH] Add decorator functions for missing and error handling @jiafengkevinchen
  • +
  • [DOC] Update README with functional pandas API example. @ericmjl
  • +
  • [INF] Move get_features_targets() to new ml.py module by @hectormz
  • +
  • [ENH] Add chirality to morgan fingerprints in janitor.chemistry submodule by @Clayton-Springer
  • +
  • [INF] import_message suggests installs appropriate to the Python distribution by @hectormz
  • +
  • [ENH] Add count_cumulative_unique() method to janitor.functions submodule by @rahosbach
  • +
  • [ENH] Add update_where() method to janitor.spark.functions submodule by @zjpoh
  • +
+

v0.18.1

+
    +
  • [ENH] extend find_replace functionality to allow both exact match and + regular-expression-based fuzzy match by @shandou
  • +
  • [ENH] add preserve_position kwarg to deconcatenate_column with tests + by @shandou and @ericmjl
  • +
  • [DOC] add contributions that did not leave git traces by @ericmjl
  • +
  • [ENH] add inflation adjustment in finance submodule by @rahosbach
  • +
  • [DOC] clarified how new functions should be implemented by @shandou
  • +
  • [ENH] add optional removal of accents on functions.clean_names, enabled by + default by @mralbu
  • +
  • [ENH] add camelCase conversion to snake_case on clean_names by @ericmjl, + h/t @jtaylor for sharing original
  • +
  • [ENH] Added null_flag function which can mark null values in rows. + Implemented by @anzelpwj
  • +
  • [ENH] add engineering submodule with unit conversion method by @rahosbach
  • +
  • [DOC] add PyPI project description
  • +
  • [ENH] add example notebook with use of finance submodule methods + by @rahosbach
  • +
+

For changes that happened prior to v0.18.1, +please consult the closed PRs, +which can be found here.

+

We thank all contributors +who have helped make pyjanitor +the package that it is today.

+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/api/biology/index.html b/api/biology/index.html new file mode 100644 index 000000000..d057a9246 --- /dev/null +++ b/api/biology/index.html @@ -0,0 +1,1070 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Biology - pyjanitor documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Biology

+ + +
+ + + + +
+ +

Biology and bioinformatics-oriented data cleaning functions.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ join_fasta(df, filename, id_col, column_name) + +

+ + +
+ +

Convenience method to join in a FASTA file as a column.

+

This allows us to add the string sequence of a FASTA file as a new column +of data in the dataframe.

+

This method only attaches the string representation of the SeqRecord.Seq +object from Biopython. It does not attach the full SeqRecord. The alphabet is +also not stored, under the assumption that the data scientist has domain +knowledge of what kind of sequence is being read in (nucleotide vs. amino +acid).

+

This method mutates the original DataFrame.

+

For more advanced functions, please use phylopandas.

+ + +

Examples:

+
>>> import tempfile
+>>> import pandas as pd
+>>> import janitor.biology
+>>> tf = tempfile.NamedTemporaryFile()
+>>> tf.write('''>SEQUENCE_1
+... MTEITAAMVKELRESTGAGMMDCK
+... >SEQUENCE_2
+... SATVSEINSETDFVAKN'''.encode('utf8'))
+66
+>>> tf.seek(0)
+0
+>>> df = pd.DataFrame({"sequence_accession":
+... ["SEQUENCE_1", "SEQUENCE_2", ]})
+>>> df = df.join_fasta(
+...     filename=tf.name,
+...     id_col='sequence_accession',
+...     column_name='sequence',
+... )
+>>> df.sequence
+0    MTEITAAMVKELRESTGAGMMDCK
+1           SATVSEINSETDFVAKN
+Name: sequence, dtype: object
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ filename + + str + +
+

Path to the FASTA file.

+
+
+ required +
+ id_col + + str + +
+

The column in the DataFrame that houses sequence IDs.

+
+
+ required +
+ column_name + + str + +
+

The name of the new column.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with new FASTA string sequence column.

+
+
+ +
+ Source code in janitor/biology.py +
20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
@pf.register_dataframe_method
+@deprecated_alias(col_name="column_name")
+def join_fasta(
+    df: pd.DataFrame, filename: str, id_col: str, column_name: str
+) -> pd.DataFrame:
+    """Convenience method to join in a FASTA file as a column.
+
+    This allows us to add the string sequence of a FASTA file as a new column
+    of data in the dataframe.
+
+    This method only attaches the string representation of the SeqRecord.Seq
+    object from Biopython. Does not attach the full SeqRecord. Alphabet is
+    also not stored, under the assumption that the data scientist has domain
+    knowledge of what kind of sequence is being read in (nucleotide vs. amino
+    acid.)
+
+    This method mutates the original DataFrame.
+
+    For more advanced functions, please use phylopandas.
+
+    Examples:
+        >>> import tempfile
+        >>> import pandas as pd
+        >>> import janitor.biology
+        >>> tf = tempfile.NamedTemporaryFile()
+        >>> tf.write('''>SEQUENCE_1
+        ... MTEITAAMVKELRESTGAGMMDCK
+        ... >SEQUENCE_2
+        ... SATVSEINSETDFVAKN'''.encode('utf8'))
+        66
+        >>> tf.seek(0)
+        0
+        >>> df = pd.DataFrame({"sequence_accession":
+        ... ["SEQUENCE_1", "SEQUENCE_2", ]})
+        >>> df = df.join_fasta(  # doctest: +SKIP
+        ...     filename=tf.name,
+        ...     id_col='sequence_accession',
+        ...     column_name='sequence',
+        ... )
+        >>> df.sequence  # doctest: +SKIP
+        0    MTEITAAMVKELRESTGAGMMDCK
+        1           SATVSEINSETDFVAKN
+        Name: sequence, dtype: object
+
+    Args:
+        df: A pandas DataFrame.
+        filename: Path to the FASTA file.
+        id_col: The column in the DataFrame that houses sequence IDs.
+        column_name: The name of the new column.
+
+    Returns:
+        A pandas DataFrame with new FASTA string sequence column.
+    """
+    seqrecords = {
+        x.id: x.seq.__str__() for x in SeqIO.parse(filename, "fasta")
+    }
+    seq_col = [seqrecords[i] for i in df[id_col]]
+    df[column_name] = seq_col
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/api/chemistry/index.html b/api/chemistry/index.html new file mode 100644 index 000000000..7bc4b71b8 --- /dev/null +++ b/api/chemistry/index.html @@ -0,0 +1,2369 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Chemistry - pyjanitor documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Chemistry

+ + +
+ + + + +
+ +

Chemistry and cheminformatics-oriented data cleaning functions.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ maccs_keys_fingerprint(df, mols_column_name) + +

+ + +
+ +

Convert a column of RDKIT mol objects into MACCS Keys Fingerprints.

+

Returns a new dataframe without any of the original data. +This is intentional to leave the user only with the data requested.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+

Functional usage

+
>>> import pandas as pd
+>>> import janitor.chemistry
+>>> df = pd.DataFrame({"smiles": ["O=C=O", "CCC(=O)O"]})
+>>> maccs = janitor.chemistry.maccs_keys_fingerprint(
+...     df=df.smiles2mol('smiles', 'mols'),
+...     mols_column_name='mols'
+... )
+>>> len(maccs.columns)
+167
+
+

Method chaining usage

+
>>> import pandas as pd
+>>> import janitor.chemistry
+>>> df = pd.DataFrame({"smiles": ["O=C=O", "CCC(=O)O"]})
+>>> maccs = (
+...     df.smiles2mol('smiles', 'mols')
+...         .maccs_keys_fingerprint(mols_column_name='mols')
+... )
+>>> len(maccs.columns)
+167
+
+

If you wish to join the maccs keys fingerprints back into the +original dataframe, this can be accomplished by doing a join, +because the indices are preserved:

+
>>> joined = df.join(maccs)
+>>> len(joined.columns)
+169
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ mols_column_name + + Hashable + +
+

The name of the column that has the RDKIT mol +objects.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A new pandas DataFrame of MACCS keys fingerprints.

+
+
+ +
+ Source code in janitor/chemistry.py +
422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
@pf.register_dataframe_method
+@deprecated_alias(mols_col="mols_column_name")
+def maccs_keys_fingerprint(
+    df: pd.DataFrame, mols_column_name: Hashable
+) -> pd.DataFrame:
+    """Convert a column of RDKIT mol objects into MACCS Keys Fingerprints.
+
+    Returns a new dataframe without any of the original data.
+    This is intentional to leave the user with the data requested.
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        Functional usage
+
+        >>> import pandas as pd
+        >>> import janitor.chemistry
+        >>> df = pd.DataFrame({"smiles": ["O=C=O", "CCC(=O)O"]})
+        >>> maccs = janitor.chemistry.maccs_keys_fingerprint(
+        ...     df=df.smiles2mol('smiles', 'mols'),
+        ...     mols_column_name='mols'
+        ... )
+        >>> len(maccs.columns)
+        167
+
+        Method chaining usage
+
+        >>> import pandas as pd
+        >>> import janitor.chemistry
+        >>> df = pd.DataFrame({"smiles": ["O=C=O", "CCC(=O)O"]})
+        >>> maccs = (
+        ...     df.smiles2mol('smiles', 'mols')
+        ...         .maccs_keys_fingerprint(mols_column_name='mols')
+        ... )
+        >>> len(maccs.columns)
+        167
+
+        If you wish to join the maccs keys fingerprints back into the
+        original dataframe, this can be accomplished by doing a `join`,
+        because the indices are preserved:
+
+        >>> joined = df.join(maccs)
+        >>> len(joined.columns)
+        169
+
+    Args:
+        df: A pandas DataFrame.
+        mols_column_name: The name of the column that has the RDKIT mol
+            objects.
+
+    Returns:
+        A new pandas DataFrame of MACCS keys fingerprints.
+    """
+
+    maccs = [GetMACCSKeysFingerprint(m) for m in df[mols_column_name]]
+
+    np_maccs = []
+
+    for macc in maccs:
+        arr = np.zeros((1,))
+        DataStructs.ConvertToNumpyArray(macc, arr)
+        np_maccs.append(arr)
+    np_maccs = np.vstack(np_maccs)
+    fmaccs = pd.DataFrame(np_maccs)
+    fmaccs.index = df.index
+    return fmaccs
+
+
+
+ +
+ +
+ + +

+ molecular_descriptors(df, mols_column_name) + +

+ + +
+ +

Convert a column of RDKIT mol objects into a Pandas DataFrame +of molecular descriptors.

+

Returns a new dataframe without any of the original data. This is +intentional to leave the user only with the data requested.

+

This method does not mutate the original DataFrame.

+

The molecular descriptors are from the rdkit.Chem.rdMolDescriptors:

+
Chi0n, Chi0v, Chi1n, Chi1v, Chi2n, Chi2v, Chi3n, Chi3v,
+Chi4n, Chi4v, ExactMolWt, FractionCSP3, HallKierAlpha, Kappa1,
+Kappa2, Kappa3, LabuteASA, NumAliphaticCarbocycles,
+NumAliphaticHeterocycles, NumAliphaticRings, NumAmideBonds,
+NumAromaticCarbocycles, NumAromaticHeterocycles, NumAromaticRings,
+NumAtomStereoCenters, NumBridgeheadAtoms, NumHBA, NumHBD,
+NumHeteroatoms, NumHeterocycles, NumLipinskiHBA, NumLipinskiHBD,
+NumRings, NumSaturatedCarbocycles, NumSaturatedHeterocycles,
+NumSaturatedRings, NumSpiroAtoms, NumUnspecifiedAtomStereoCenters,
+TPSA.
+
+ + +

Examples:

+

Functional usage

+
>>> import pandas as pd
+>>> import janitor.chemistry
+>>> df = pd.DataFrame({"smiles": ["O=C=O", "CCC(=O)O"]})
+>>> mol_desc = (
+...     janitor.chemistry.molecular_descriptors(
+...         df=df.smiles2mol('smiles', 'mols'),
+...         mols_column_name='mols'
+...     )
+... )
+>>> mol_desc.TPSA
+0    34.14
+1    37.30
+Name: TPSA, dtype: float64
+
+

Method chaining usage

+
>>> import pandas as pd
+>>> import janitor.chemistry
+>>> df = pd.DataFrame({"smiles": ["O=C=O", "CCC(=O)O"]})
+>>> mol_desc = (
+...     df.smiles2mol('smiles', 'mols')
+...     .molecular_descriptors(mols_column_name='mols')
+... )
+>>> mol_desc.TPSA
+0    34.14
+1    37.30
+Name: TPSA, dtype: float64
+
+

If you wish to join the molecular descriptors back into the original +dataframe, this can be accomplished by doing a join, +because the indices are preserved:

+
>>> joined = df.join(mol_desc)
+>>> len(joined.columns)
+41
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ mols_column_name + + Hashable + +
+

The name of the column that has the RDKIT mol +objects.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A new pandas DataFrame of molecular descriptors.

+
+
+ +
+ Source code in janitor/chemistry.py +
298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
@pf.register_dataframe_method
+@deprecated_alias(mols_col="mols_column_name")
+def molecular_descriptors(
+    df: pd.DataFrame, mols_column_name: Hashable
+) -> pd.DataFrame:
+    """Convert a column of RDKIT mol objects into a Pandas DataFrame
+    of molecular descriptors.
+
+    Returns a new dataframe without any of the original data. This is
+    intentional to leave the user only with the data requested.
+
+    This method does not mutate the original DataFrame.
+
+    The molecular descriptors are from the `rdkit.Chem.rdMolDescriptors`:
+
+    ```text
+    Chi0n, Chi0v, Chi1n, Chi1v, Chi2n, Chi2v, Chi3n, Chi3v,
+    Chi4n, Chi4v, ExactMolWt, FractionCSP3, HallKierAlpha, Kappa1,
+    Kappa2, Kappa3, LabuteASA, NumAliphaticCarbocycles,
+    NumAliphaticHeterocycles, NumAliphaticRings, NumAmideBonds,
+    NumAromaticCarbocycles, NumAromaticHeterocycles, NumAromaticRings,
+    NumAtomStereoCenters, NumBridgeheadAtoms, NumHBA, NumHBD,
+    NumHeteroatoms, NumHeterocycles, NumLipinskiHBA, NumLipinskiHBD,
+    NumRings, NumSaturatedCarbocycles, NumSaturatedHeterocycles,
+    NumSaturatedRings, NumSpiroAtoms, NumUnspecifiedAtomStereoCenters,
+    TPSA.
+    ```
+
+    Examples:
+        Functional usage
+
+        >>> import pandas as pd
+        >>> import janitor.chemistry
+        >>> df = pd.DataFrame({"smiles": ["O=C=O", "CCC(=O)O"]})
+        >>> mol_desc = (
+        ...     janitor.chemistry.molecular_descriptors(
+        ...         df=df.smiles2mol('smiles', 'mols'),
+        ...         mols_column_name='mols'
+        ...     )
+        ... )
+        >>> mol_desc.TPSA
+        0    34.14
+        1    37.30
+        Name: TPSA, dtype: float64
+
+        Method chaining usage
+
+        >>> import pandas as pd
+        >>> import janitor.chemistry
+        >>> df = pd.DataFrame({"smiles": ["O=C=O", "CCC(=O)O"]})
+        >>> mol_desc = (
+        ...     df.smiles2mol('smiles', 'mols')
+        ...     .molecular_descriptors(mols_column_name='mols')
+        ... )
+        >>> mol_desc.TPSA
+        0    34.14
+        1    37.30
+        Name: TPSA, dtype: float64
+
+        If you wish to join the molecular descriptors back into the original
+        dataframe, this can be accomplished by doing a `join`,
+        because the indices are preserved:
+
+        >>> joined = df.join(mol_desc)
+        >>> len(joined.columns)
+        41
+
+    Args:
+        df: A pandas DataFrame.
+        mols_column_name: The name of the column that has the RDKIT mol
+            objects.
+
+    Returns:
+        A new pandas DataFrame of molecular descriptors.
+    """
+    descriptors = [
+        CalcChi0n,
+        CalcChi0v,
+        CalcChi1n,
+        CalcChi1v,
+        CalcChi2n,
+        CalcChi2v,
+        CalcChi3n,
+        CalcChi3v,
+        CalcChi4n,
+        CalcChi4v,
+        CalcExactMolWt,
+        CalcFractionCSP3,
+        CalcHallKierAlpha,
+        CalcKappa1,
+        CalcKappa2,
+        CalcKappa3,
+        CalcLabuteASA,
+        CalcNumAliphaticCarbocycles,
+        CalcNumAliphaticHeterocycles,
+        CalcNumAliphaticRings,
+        CalcNumAmideBonds,
+        CalcNumAromaticCarbocycles,
+        CalcNumAromaticHeterocycles,
+        CalcNumAromaticRings,
+        CalcNumAtomStereoCenters,
+        CalcNumBridgeheadAtoms,
+        CalcNumHBA,
+        CalcNumHBD,
+        CalcNumHeteroatoms,
+        CalcNumHeterocycles,
+        CalcNumLipinskiHBA,
+        CalcNumLipinskiHBD,
+        CalcNumRings,
+        CalcNumSaturatedCarbocycles,
+        CalcNumSaturatedHeterocycles,
+        CalcNumSaturatedRings,
+        CalcNumSpiroAtoms,
+        CalcNumUnspecifiedAtomStereoCenters,
+        CalcTPSA,
+    ]
+    descriptors_mapping = {f.__name__.strip("Calc"): f for f in descriptors}
+
+    feats = dict()
+    for name, func in descriptors_mapping.items():
+        feats[name] = [func(m) for m in df[mols_column_name]]
+    return pd.DataFrame(feats)
+
+
+
+ +
+ +
+ + +

+ morgan_fingerprint(df, mols_column_name, radius=3, nbits=2048, kind='counts') + +

+ + +
+ +

Convert a column of RDKIT Mol objects into Morgan Fingerprints.

+

Returns a new dataframe without any of the original data. This is +intentional, as Morgan fingerprints are usually high-dimensional +features.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+

Functional usage

+
>>> import pandas as pd
+>>> import janitor.chemistry
+>>> df = pd.DataFrame({"smiles": ["O=C=O", "CCC(=O)O"]})
+
+
    +
  • For "counts" kind
  • +
+
>>> morgans = janitor.chemistry.morgan_fingerprint(
+...     df=df.smiles2mol('smiles', 'mols'),
+...     mols_column_name='mols',
+...     radius=3,      # Defaults to 3
+...     nbits=2048,    # Defaults to 2048
+...     kind='counts'  # Defaults to "counts"
+... )
+>>> set(morgans.iloc[0])
+{0.0, 1.0, 2.0}
+
+
    +
  • For "bits" kind
  • +
+
>>> morgans = janitor.chemistry.morgan_fingerprint(
+...     df=df.smiles2mol('smiles', 'mols'),
+...     mols_column_name='mols',
+...     radius=3,      # Defaults to 3
+...     nbits=2048,    # Defaults to 2048
+...     kind='bits'    # Defaults to "counts"
+...  )
+>>> set(morgans.iloc[0])
+{0.0, 1.0}
+
+

Method chaining usage

+
>>> import pandas as pd
+>>> import janitor.chemistry
+>>> df = pd.DataFrame({"smiles": ["O=C=O", "CCC(=O)O"]})
+
+
    +
  • For "counts" kind
  • +
+
>>> morgans = (
+...     df.smiles2mol('smiles', 'mols')
+...     .morgan_fingerprint(
+...         mols_column_name='mols',
+...         radius=3,      # Defaults to 3
+...         nbits=2048,    # Defaults to 2048
+...         kind='counts'  # Defaults to "counts"
+...     )
+... )
+>>> set(morgans.iloc[0])
+{0.0, 1.0, 2.0}
+
+
    +
  • For "bits" kind
  • +
+
>>> morgans = (
+...     df
+...     .smiles2mol('smiles', 'mols')
+...     .morgan_fingerprint(
+...         mols_column_name='mols',
+...         radius=3,    # Defaults to 3
+...         nbits=2048,  # Defaults to 2048
+...         kind='bits'  # Defaults to "counts"
+...     )
+... )
+>>> set(morgans.iloc[0])
+{0.0, 1.0}
+
+

If you wish to join the morgan fingerprints back into the original +dataframe, this can be accomplished by doing a join, +because the indices are preserved:

+
>>> joined = df.join(morgans)
+>>> len(joined.columns)
+2050
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ mols_column_name + + str + +
+

The name of the column that has the RDKIT +mol objects.

+
+
+ required +
+ radius + + int + +
+

Radius of Morgan fingerprints.

+
+
+ 3 +
+ nbits + + int + +
+

The length of the fingerprints.

+
+
+ 2048 +
+ kind + + Literal['counts', 'bits'] + +
+

Whether to return counts or bits.

+
+
+ 'counts' +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If kind is not one of "counts" or "bits".

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A new pandas DataFrame of Morgan fingerprints.

+
+
+ +
+ Source code in janitor/chemistry.py +
167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
@pf.register_dataframe_method
+@deprecated_alias(mols_col="mols_column_name")
+def morgan_fingerprint(
+    df: pd.DataFrame,
+    mols_column_name: str,
+    radius: int = 3,
+    nbits: int = 2048,
+    kind: Literal["counts", "bits"] = "counts",
+) -> pd.DataFrame:
+    """Convert a column of RDKIT Mol objects into Morgan Fingerprints.
+
+    Returns a new dataframe without any of the original data. This is
+    intentional, as Morgan fingerprints are usually high-dimensional
+    features.
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        Functional usage
+
+        >>> import pandas as pd
+        >>> import janitor.chemistry
+        >>> df = pd.DataFrame({"smiles": ["O=C=O", "CCC(=O)O"]})
+
+        - For "counts" kind
+
+        >>> morgans = janitor.chemistry.morgan_fingerprint(
+        ...     df=df.smiles2mol('smiles', 'mols'),
+        ...     mols_column_name='mols',
+        ...     radius=3,      # Defaults to 3
+        ...     nbits=2048,    # Defaults to 2048
+        ...     kind='counts'  # Defaults to "counts"
+        ... )
+        >>> set(morgans.iloc[0])
+        {0.0, 1.0, 2.0}
+
+        - For "bits" kind
+
+        >>> morgans = janitor.chemistry.morgan_fingerprint(
+        ...     df=df.smiles2mol('smiles', 'mols'),
+        ...     mols_column_name='mols',
+        ...     radius=3,      # Defaults to 3
+        ...     nbits=2048,    # Defaults to 2048
+        ...     kind='bits'    # Defaults to "counts"
+        ...  )
+        >>> set(morgans.iloc[0])
+        {0.0, 1.0}
+
+        Method chaining usage
+
+        >>> import pandas as pd
+        >>> import janitor.chemistry
+        >>> df = pd.DataFrame({"smiles": ["O=C=O", "CCC(=O)O"]})
+
+        - For "counts" kind
+
+        >>> morgans = (
+        ...     df.smiles2mol('smiles', 'mols')
+        ...     .morgan_fingerprint(
+        ...         mols_column_name='mols',
+        ...         radius=3,      # Defaults to 3
+        ...         nbits=2048,    # Defaults to 2048
+        ...         kind='counts'  # Defaults to "counts"
+        ...     )
+        ... )
+        >>> set(morgans.iloc[0])
+        {0.0, 1.0, 2.0}
+
+        - For "bits" kind
+
+        >>> morgans = (
+        ...     df
+        ...     .smiles2mol('smiles', 'mols')
+        ...     .morgan_fingerprint(
+        ...         mols_column_name='mols',
+        ...         radius=3,    # Defaults to 3
+        ...         nbits=2048,  # Defaults to 2048
+        ...         kind='bits'  # Defaults to "counts"
+        ...     )
+        ... )
+        >>> set(morgans.iloc[0])
+        {0.0, 1.0}
+
+        If you wish to join the morgan fingerprints back into the original
+        dataframe, this can be accomplished by doing a `join`,
+        because the indices are preserved:
+
+        >>> joined = df.join(morgans)
+        >>> len(joined.columns)
+        2050
+
+    Args:
+        df: A pandas DataFrame.
+        mols_column_name: The name of the column that has the RDKIT
+            mol objects
+        radius: Radius of Morgan fingerprints.
+        nbits: The length of the fingerprints.
+        kind: Whether to return counts or bits.
+
+    Raises:
+        ValueError: If `kind` is not one of `"counts"` or `"bits"`.
+
+    Returns:
+        A new pandas DataFrame of Morgan fingerprints.
+    """
+    acceptable_kinds = ["counts", "bits"]
+    if kind not in acceptable_kinds:
+        raise ValueError(f"`kind` must be one of {acceptable_kinds}")
+
+    if kind == "bits":
+        fps = [
+            GetMorganFingerprintAsBitVect(m, radius, nbits, useChirality=True)
+            for m in df[mols_column_name]
+        ]
+    elif kind == "counts":
+        fps = [
+            GetHashedMorganFingerprint(m, radius, nbits, useChirality=True)
+            for m in df[mols_column_name]
+        ]
+
+    np_fps = []
+    for fp in fps:
+        arr = np.zeros((1,))
+        DataStructs.ConvertToNumpyArray(fp, arr)
+        np_fps.append(arr)
+    np_fps = np.vstack(np_fps)
+    fpdf = pd.DataFrame(np_fps)
+    fpdf.index = df.index
+    return fpdf
+
+
+
+ +
+ +
+ + +

+ smiles2mol(df, smiles_column_name, mols_column_name, drop_nulls=True, progressbar=None) + +

+ + +
+ +

Convert a column of SMILES strings into RDKit Mol objects.

+

Automatically drops invalid SMILES, as determined by RDKIT.

+

This method mutates the original DataFrame.

+ + +

Examples:

+

Functional usage

+
>>> import pandas as pd
+>>> import janitor.chemistry
+>>> df = pd.DataFrame({"smiles": ["O=C=O", "CCC(=O)O"]})
+>>> df = janitor.chemistry.smiles2mol(
+...    df=df,
+...    smiles_column_name='smiles',
+...    mols_column_name='mols'
+... )
+>>> df.mols[0].GetNumAtoms(), df.mols[0].GetNumBonds()
+(3, 2)
+>>> df.mols[1].GetNumAtoms(), df.mols[1].GetNumBonds()
+(5, 4)
+
+

Method chaining usage

+
>>> import pandas as pd
+>>> import janitor.chemistry
+>>> df = df.smiles2mol(
+...     smiles_column_name='smiles',
+...     mols_column_name='rdkmol'
+... )
+>>> df.rdkmol[0].GetNumAtoms(), df.rdkmol[0].GetNumBonds()
+(3, 2)
+
+

A progressbar can be optionally used.

+
    +
  • Pass in "notebook" to show a tqdm notebook progressbar. + (ipywidgets must be enabled with your Jupyter installation.)
  • +
  • Pass in "terminal" to show a tqdm progressbar. Better suited for use + with scripts.
  • +
  • None is the default value - progress bar will not be shown.
  • +
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

pandas DataFrame.

+
+
+ required +
+ smiles_column_name + + Hashable + +
+

Name of column that holds the SMILES strings.

+
+
+ required +
+ mols_column_name + + Hashable + +
+

Name to be given to the new mols column.

+
+
+ required +
+ drop_nulls + + bool + +
+

Whether to drop rows whose mols failed to be +constructed.

+
+
+ True +
+ progressbar + + Optional[str] + +
+

Whether to show a progressbar or not.

+
+
+ None +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If progressbar is not one of +"notebook", "terminal", or None.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with new RDKIT Mol objects column.

+
+
+ +
+ Source code in janitor/chemistry.py +
 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
@pf.register_dataframe_method
+@deprecated_alias(smiles_col="smiles_column_name", mols_col="mols_column_name")
+def smiles2mol(
+    df: pd.DataFrame,
+    smiles_column_name: Hashable,
+    mols_column_name: Hashable,
+    drop_nulls: bool = True,
+    progressbar: Optional[str] = None,
+) -> pd.DataFrame:
+    """Convert a column of SMILES strings into RDKit Mol objects.
+
+    Automatically drops invalid SMILES, as determined by RDKIT.
+
+    This method mutates the original DataFrame.
+
+    Examples:
+        Functional usage
+
+        >>> import pandas as pd
+        >>> import janitor.chemistry
+        >>> df = pd.DataFrame({"smiles": ["O=C=O", "CCC(=O)O"]})
+        >>> df = janitor.chemistry.smiles2mol(
+        ...    df=df,
+        ...    smiles_column_name='smiles',
+        ...    mols_column_name='mols'
+        ... )
+        >>> df.mols[0].GetNumAtoms(), df.mols[0].GetNumBonds()
+        (3, 2)
+        >>> df.mols[1].GetNumAtoms(), df.mols[1].GetNumBonds()
+        (5, 4)
+
+        Method chaining usage
+
+        >>> import pandas as pd
+        >>> import janitor.chemistry
+        >>> df = df.smiles2mol(
+        ...     smiles_column_name='smiles',
+        ...     mols_column_name='rdkmol'
+        ... )
+        >>> df.rdkmol[0].GetNumAtoms(), df.rdkmol[0].GetNumBonds()
+        (3, 2)
+
+    A progressbar can be optionally used.
+
+    - Pass in "notebook" to show a `tqdm` notebook progressbar.
+      (`ipywidgets` must be enabled with your Jupyter installation.)
+    - Pass in "terminal" to show a `tqdm` progressbar. Better suited for use
+      with scripts.
+    - `None` is the default value - progress bar will not be shown.
+
+    Args:
+        df: pandas DataFrame.
+        smiles_column_name: Name of column that holds the SMILES strings.
+        mols_column_name: Name to be given to the new mols column.
+        drop_nulls: Whether to drop rows whose mols failed to be
+            constructed.
+        progressbar: Whether to show a progressbar or not.
+
+    Raises:
+        ValueError: If `progressbar` is not one of
+            `"notebook"`, `"terminal"`, or `None`.
+
+    Returns:
+        A pandas DataFrame with new RDKIT Mol objects column.
+    """
+    valid_progress = ["notebook", "terminal", None]
+    if progressbar not in valid_progress:
+        raise ValueError(f"progressbar kwarg must be one of {valid_progress}")
+
+    if progressbar is None:
+        df[mols_column_name] = df[smiles_column_name].apply(
+            lambda x: Chem.MolFromSmiles(x)
+        )
+    else:
+        if progressbar == "notebook":
+            tqdmn().pandas(desc="mols")
+        elif progressbar == "terminal":
+            tqdm.pandas(desc="mols")
+        df[mols_column_name] = df[smiles_column_name].progress_apply(
+            lambda x: Chem.MolFromSmiles(x)
+        )
+
+    if drop_nulls:
+        df = df.dropna(subset=[mols_column_name])
+    df = df.reset_index(drop=True)
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/api/engineering/index.html b/api/engineering/index.html new file mode 100644 index 000000000..dbe7d08e7 --- /dev/null +++ b/api/engineering/index.html @@ -0,0 +1,1122 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Engineering - pyjanitor documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Engineering

+ + +
+ + + + +
+ +

Engineering-specific data cleaning functions.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ convert_units(df, column_name=None, existing_units=None, to_units=None, dest_column_name=None) + +

+ + +
+ +

Converts a column of numeric values from one unit to another.

+

Unit conversion can only take place if the existing_units and +to_units are of the same type (e.g., temperature or pressure). +The provided unit types can be any unit name or alternate name provided +in the unyt package's Listing of Units table.

+

Volume units are not provided natively in unyt. However, exponents are +supported, and therefore some volume units can be converted. For example, +a volume in cubic centimeters can be converted to cubic meters using +existing_units='cm**3' and to_units='m**3'.

+

This method mutates the original DataFrame.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor.engineering
+>>> df = pd.DataFrame({"temp_F": [-40, 112]})
+>>> df = df.convert_units(
+...     column_name='temp_F',
+...     existing_units='degF',
+...     to_units='degC',
+...     dest_column_name='temp_C'
+... )
+>>> df
+   temp_F     temp_C
+0     -40 -40.000000
+1     112  44.444444
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + str + +
+

Name of the column containing numeric +values that are to be converted from one set of units to another.

+
+
+ None +
+ existing_units + + str + +
+

The unit type to convert from.

+
+
+ None +
+ to_units + + str + +
+

The unit type to convert to.

+
+
+ None +
+ dest_column_name + + str + +
+

The name of the new column containing the +converted values that will be created.

+
+
+ None +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ TypeError + +
+

If column is not numeric.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with a new column of unit-converted values.

+
+
+ +
+ Source code in janitor/engineering.py +
21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
@pf.register_dataframe_method
+def convert_units(
+    df: pd.DataFrame,
+    column_name: str = None,
+    existing_units: str = None,
+    to_units: str = None,
+    dest_column_name: str = None,
+) -> pd.DataFrame:
+    """Converts a column of numeric values from one unit to another.
+
+    Unit conversion can only take place if the `existing_units` and
+    `to_units` are of the same type (e.g., temperature or pressure).
+    The provided unit types can be any unit name or alternate name provided
+    in the `unyt` package's [Listing of Units table](
+    https://unyt.readthedocs.io/en/stable/unit_listing.html#unit-listing).
+
+    Volume units are not provided natively in `unyt`.  However, exponents are
+    supported, and therefore some volume units can be converted.  For example,
+    a volume in cubic centimeters can be converted to cubic meters using
+    `existing_units='cm**3'` and `to_units='m**3'`.
+
+    This method mutates the original DataFrame.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor.engineering
+        >>> df = pd.DataFrame({"temp_F": [-40, 112]})
+        >>> df = df.convert_units(
+        ...     column_name='temp_F',
+        ...     existing_units='degF',
+        ...     to_units='degC',
+        ...     dest_column_name='temp_C'
+        ... )
+        >>> df
+           temp_F     temp_C
+        0     -40 -40.000000
+        1     112  44.444444
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: Name of the column containing numeric
+            values that are to be converted from one set of units to another.
+        existing_units: The unit type to convert from.
+        to_units: The unit type to convert to.
+        dest_column_name: The name of the new column containing the
+            converted values that will be created.
+
+    Raises:
+        TypeError: If column is not numeric.
+
+    Returns:
+        A pandas DataFrame with a new column of unit-converted values.
+    """
+
+    # Check all inputs are correct data type
+    check("column_name", column_name, [str])
+    check("existing_units", existing_units, [str])
+    check("to_units", to_units, [str])
+    check("dest_column_name", dest_column_name, [str])
+
+    # Check that column_name is a numeric column
+    if not np.issubdtype(df[column_name].dtype, np.number):
+        raise TypeError(f"{column_name} must be a numeric column.")
+
+    original_vals = df[column_name].to_numpy() * unyt.Unit(existing_units)
+    converted_vals = original_vals.to(to_units)
+    df[dest_column_name] = np.array(converted_vals)
+
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/api/finance/index.html b/api/finance/index.html new file mode 100644 index 000000000..42bf4d8eb --- /dev/null +++ b/api/finance/index.html @@ -0,0 +1,1567 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Finance - pyjanitor documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Finance

+ + +
+ + + + +
+ +

Finance-specific data cleaning functions.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ convert_currency(df, api_key, column_name=None, from_currency=None, to_currency=None, historical_date=None, make_new_column=False) + +

+ + +
+ +

Deprecated function.

+ + +
+ Source code in janitor/finance.py +
405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
@pf.register_dataframe_method
+@deprecated_alias(colname="column_name")
+def convert_currency(
+    df: pd.DataFrame,
+    api_key: str,
+    column_name: str = None,
+    from_currency: str = None,
+    to_currency: str = None,
+    historical_date: date = None,
+    make_new_column: bool = False,
+) -> pd.DataFrame:
+    """Deprecated function.
+
+    <!--
+    # noqa: DAR101
+    # noqa: DAR401
+    -->
+    """
+    raise JanitorError(
+        "The `convert_currency` function has been temporarily disabled due to "
+        "exchangeratesapi.io disallowing free pinging of its API. "
+        "(Our tests started to fail due to this issue.) "
+        "There is no easy way around this problem "
+        "except to find a new API to call on."
+        "Please comment on issue #829 "
+        "(https://github.com/pyjanitor-devs/pyjanitor/issues/829) "
+        "if you know of an alternative API that we can call on, "
+        "otherwise the function will be removed in pyjanitor's 1.0 release."
+    )
+
+
+
+ +
+ +
+ + +

+ convert_stock(stock_symbol) + +

+ + +
+ +

This function takes in a stock symbol as a parameter, +queries an API for the company's full name, and returns +it.

+

Examples:

+
```python
+import janitor.finance
+janitor.finance.convert_stock("aapl")
+```
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ stock_symbol + + str + +
+

Stock ticker symbol.

+
+
+ required +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ConnectionError + +
+

Internet connection is not available

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ str + +
+

Full company name

+
+
+ +
+ Source code in janitor/finance.py +
697
+698
+699
+700
+701
+702
+703
+704
+705
+706
+707
+708
+709
+710
+711
+712
+713
+714
+715
+716
+717
+718
+719
+720
+721
+722
+723
+724
+725
def convert_stock(stock_symbol: str) -> str:
+    """
+    This function takes in a stock symbol as a parameter,
+    queries an API for the company's full name and returns
+    it
+
+    Examples:
+
+        ```python
+        import janitor.finance
+        janitor.finance.convert_stock("aapl")
+        ```
+
+    Args:
+        stock_symbol: Stock ticker Symbol
+
+    Raises:
+        ConnectionError: Internet connection is not available
+
+    Returns:
+        Full company name
+    """
+    if is_connected("www.google.com"):
+        stock_symbol = stock_symbol.upper()
+        return get_symbol(stock_symbol)
+    else:
+        raise ConnectionError(
+            "Connection Error: Client Not Connected to Internet"
+        )
+
+
+
+ +
+ +
+ + +

+ get_symbol(symbol) + +

+ + +
+ +

This is a helper function to get a company's full +name based on the stock symbol.

+

Examples:

+
```python
+import janitor.finance
+janitor.finance.get_symbol("aapl")
+```
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ symbol + + str + +
+

This is our stock symbol that we use +to query the API for the company's full name.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Optional[str] + +
+

Company full name

+
+
+ +
+ Source code in janitor/finance.py +
728
+729
+730
+731
+732
+733
+734
+735
+736
+737
+738
+739
+740
+741
+742
+743
+744
+745
+746
+747
+748
+749
+750
+751
+752
+753
+754
+755
+756
def get_symbol(symbol: str) -> Optional[str]:
+    """
+    This is a helper function to get a company's full
+    name based on the stock symbol.
+
+    Examples:
+
+        ```python
+        import janitor.finance
+        janitor.finance.get_symbol("aapl")
+        ```
+
+    Args:
+        symbol: This is our stock symbol that we use
+            to query the API for the company's full name.
+
+    Returns:
+        Company full name
+    """
+    result = requests.get(
+        "http://d.yimg.com/autoc."
+        + "finance.yahoo.com/autoc?query={}&region=1&lang=en".format(symbol)
+    ).json()
+
+    for x in result["ResultSet"]["Result"]:
+        if x["symbol"] == symbol:
+            return x["name"]
+        else:
+            return None
+
+
+
+ +
+ +
+ + +

+ inflate_currency(df, column_name=None, country=None, currency_year=None, to_year=None, make_new_column=False) + +

+ + +
+ +

Inflates a column of monetary values from one year to another, based on +the currency's country.

+

The provided country can be any economy name or code from the World Bank +list of economies.

+

Note: This method mutates the original DataFrame.

+

Examples:

+
>>> import pandas as pd
+>>> import janitor.finance
+>>> df = pd.DataFrame({"profit":[100.10, 200.20, 300.30, 400.40, 500.50]})
+>>> df
+   profit
+0   100.1
+1   200.2
+2   300.3
+3   400.4
+4   500.5
+>>> df.inflate_currency(
+...    column_name='profit',
+...    country='USA',
+...    currency_year=2015,
+...    to_year=2018,
+...    make_new_column=True
+... )
+   profit  profit_2018
+0   100.1   106.050596
+1   200.2   212.101191
+2   300.3   318.151787
+3   400.4   424.202382
+4   500.5   530.252978
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + str + +
+

Name of the column containing monetary +values to inflate.

+
+
+ None +
+ country + + str + +
+

The country associated with the currency being inflated. +May be any economy or code from the World Bank +List of economies.

+
+
+ None +
+ currency_year + + int + +
+

The currency year to inflate from. +The year should be 1960 or later.

+
+
+ None +
+ to_year + + int + +
+

The currency year to inflate to. +The year should be 1960 or later.

+
+
+ None +
+ make_new_column + + bool + +
+

Generates new column for inflated currency if +True, otherwise, inflates currency in place.

+
+
+ False +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

The DataFrame with inflated currency column.

+
+
+ +
+ Source code in janitor/finance.py +
623
+624
+625
+626
+627
+628
+629
+630
+631
+632
+633
+634
+635
+636
+637
+638
+639
+640
+641
+642
+643
+644
+645
+646
+647
+648
+649
+650
+651
+652
+653
+654
+655
+656
+657
+658
+659
+660
+661
+662
+663
+664
+665
+666
+667
+668
+669
+670
+671
+672
+673
+674
+675
+676
+677
+678
+679
+680
+681
+682
+683
+684
+685
+686
+687
+688
+689
+690
+691
+692
+693
+694
@pf.register_dataframe_method
+def inflate_currency(
+    df: pd.DataFrame,
+    column_name: str = None,
+    country: str = None,
+    currency_year: int = None,
+    to_year: int = None,
+    make_new_column: bool = False,
+) -> pd.DataFrame:
+    """
+    Inflates a column of monetary values from one year to another, based on
+    the currency's country.
+
+    The provided country can be any economy name or code from the World Bank
+    [list of economies](https://databank.worldbank.org/data/download/site-content/CLASS.xls).
+
+    **Note**: This method mutates the original DataFrame.
+
+    Examples:
+
+        >>> import pandas as pd
+        >>> import janitor.finance
+        >>> df = pd.DataFrame({"profit":[100.10, 200.20, 300.30, 400.40, 500.50]})
+        >>> df
+           profit
+        0   100.1
+        1   200.2
+        2   300.3
+        3   400.4
+        4   500.5
+        >>> df.inflate_currency(
+        ...    column_name='profit',
+        ...    country='USA',
+        ...    currency_year=2015,
+        ...    to_year=2018,
+        ...    make_new_column=True
+        ... )
+           profit  profit_2018
+        0   100.1   106.050596
+        1   200.2   212.101191
+        2   300.3   318.151787
+        3   400.4   424.202382
+        4   500.5   530.252978
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: Name of the column containing monetary
+            values to inflate.
+        country: The country associated with the currency being inflated.
+            May be any economy or code from the World Bank
+            [List of economies](https://databank.worldbank.org/data/download/site-content/CLASS.xls).
+        currency_year: The currency year to inflate from.
+            The year should be 1960 or later.
+        to_year: The currency year to inflate to.
+            The year should be 1960 or later.
+        make_new_column: Generates new column for inflated currency if
+            True, otherwise, inflates currency in place.
+
+    Returns:
+        The DataFrame with inflated currency column.
+    """  # noqa: E501
+
+    inflator = _inflate_currency(country, currency_year, to_year)
+
+    if make_new_column:
+        new_column_name = column_name + "_" + str(to_year)
+        df[new_column_name] = df[column_name] * inflator
+
+    else:
+        df[column_name] = df[column_name] * inflator
+
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/api/functions/index.html b/api/functions/index.html new file mode 100644 index 000000000..eb2a102da --- /dev/null +++ b/api/functions/index.html @@ -0,0 +1,36199 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Functions - pyjanitor documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Functions

+ + +
+ + + + +
+ +

General Functions

+

pyjanitor's general-purpose data cleaning functions.

+ + + + + + + + +
+ + + + + + + + + + +
+ + + +

+ add_columns + + +

+ +
+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ add_column(df, column_name, value, fill_remaining=False) + +

+ + +
+ +

Add a column to the dataframe.

+

Intended to be the method-chaining alternative to:

+
df[column_name] = value
+
+
+

Note

+

This function will be deprecated in a 1.x release. +Please use pd.DataFrame.assign instead.

+
+ + +

Examples:

+

Add a column of constant values to the dataframe.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": list(range(3)), "b": list("abc")})
+>>> df.add_column(column_name="c", value=1)
+   a  b  c
+0  0  a  1
+1  1  b  1
+2  2  c  1
+
+

Add a column of different values to the dataframe.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": list(range(3)), "b": list("abc")})
+>>> df.add_column(column_name="c", value=list("efg"))
+   a  b  c
+0  0  a  e
+1  1  b  f
+2  2  c  g
+
+

Add a column using an iterator.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": list(range(3)), "b": list("abc")})
+>>> df.add_column(column_name="c", value=range(4, 7))
+   a  b  c
+0  0  a  4
+1  1  b  5
+2  2  c  6
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + str + +
+

Name of the new column. Should be a string, in order +for the column name to be compatible with the Feather binary +format (this is a useful thing to have).

+
+
+ required +
+ value + + Union[List[Any], Tuple[Any], Any] + +
+

Either a single value, or a list/tuple of values.

+
+
+ required +
+ fill_remaining + + bool + +
+

If value is a tuple or list that is smaller than +the number of rows in the DataFrame, repeat the list or tuple +(R-style) to the end of the DataFrame.

+
+
+ False +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If attempting to add a column that already exists.

+
+
+ ValueError + +
+

If value has more elements than the number of +rows in the DataFrame.

+
+
+ ValueError + +
+

If attempting to add an iterable of values with +a length not equal to the number of DataFrame rows.

+
+
+ ValueError + +
+

If value has length of 0.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with an added column.

+
+
+ +
+ Source code in janitor/functions/add_columns.py +
 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `pd.DataFrame.assign` instead."
+    )
+)
+@deprecated_alias(col_name="column_name")
+def add_column(
+    df: pd.DataFrame,
+    column_name: str,
+    value: Union[List[Any], Tuple[Any], Any],
+    fill_remaining: bool = False,
+) -> pd.DataFrame:
+    """Add a column to the dataframe.
+
+    Intended to be the method-chaining alternative to:
+
+    ```python
+    df[column_name] = value
+    ```
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `pd.DataFrame.assign` instead.
+
+    Examples:
+        Add a column of constant values to the dataframe.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": list(range(3)), "b": list("abc")})
+        >>> df.add_column(column_name="c", value=1)
+           a  b  c
+        0  0  a  1
+        1  1  b  1
+        2  2  c  1
+
+        Add a column of different values to the dataframe.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": list(range(3)), "b": list("abc")})
+        >>> df.add_column(column_name="c", value=list("efg"))
+           a  b  c
+        0  0  a  e
+        1  1  b  f
+        2  2  c  g
+
+        Add a column using an iterator.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": list(range(3)), "b": list("abc")})
+        >>> df.add_column(column_name="c", value=range(4, 7))
+           a  b  c
+        0  0  a  4
+        1  1  b  5
+        2  2  c  6
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: Name of the new column. Should be a string, in order
+            for the column name to be compatible with the Feather binary
+            format (this is a useful thing to have).
+        value: Either a single value, or a list/tuple of values.
+        fill_remaining: If value is a tuple or list that is smaller than
+            the number of rows in the DataFrame, repeat the list or tuple
+            (R-style) to the end of the DataFrame.
+
+    Raises:
+        ValueError: If attempting to add a column that already exists.
+        ValueError: If `value` has more elements than the number of
+            rows in the DataFrame.
+        ValueError: If attempting to add an iterable of values with
+            a length not equal to the number of DataFrame rows.
+        ValueError: If `value` has length of `0`.
+
+    Returns:
+        A pandas DataFrame with an added column.
+    """
+    check("column_name", column_name, [str])
+
+    if column_name in df.columns:
+        raise ValueError(
+            f"Attempted to add column that already exists: " f"{column_name}."
+        )
+
+    nrows = len(df)
+
+    if hasattr(value, "__len__") and not isinstance(
+        value, (str, bytes, bytearray)
+    ):
+        len_value = len(value)
+
+        # if `value` is a list, ndarray, etc.
+        if len_value > nrows:
+            raise ValueError(
+                "`value` has more elements than number of rows "
+                f"in your `DataFrame`. vals: {len_value}, "
+                f"df: {nrows}"
+            )
+        if len_value != nrows and not fill_remaining:
+            raise ValueError(
+                "Attempted to add iterable of values with length"
+                " not equal to number of DataFrame rows"
+            )
+        if not len_value:
+            raise ValueError(
+                "`value` has to be an iterable of minimum length 1"
+            )
+
+    elif fill_remaining:
+        # relevant if a scalar val was passed, yet fill_remaining == True
+        len_value = 1
+        value = [value]
+
+    df = df.copy()
+    if fill_remaining:
+        times_to_loop = int(np.ceil(nrows / len_value))
+        fill_values = list(value) * times_to_loop
+        df[column_name] = fill_values[:nrows]
+    else:
+        df[column_name] = value
+
+    return df
+
+
+
+ +
+ +
+ + +

+ add_columns(df, fill_remaining=False, **kwargs) + +

+ + +
+ +

Add multiple columns to the dataframe.

+

This method does not mutate the original DataFrame.

+

Method to augment +add_column +with ability to add multiple columns in +one go. This replaces the need for multiple +add_column calls.

+

Usage is through supplying kwargs where the key is the col name and the +values correspond to the values of the new DataFrame column.

+

Values passed can be scalar or iterable (list, ndarray, etc.)

+
+

Note

+

This function will be deprecated in a 1.x release. +Please use pd.DataFrame.assign instead.

+
+ + +

Examples:

+

Inserting two more columns into a dataframe.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": list(range(3)), "b": list("abc")})
+>>> df.add_columns(x=4, y=list("def"))
+   a  b  x  y
+0  0  a  4  d
+1  1  b  4  e
+2  2  c  4  f
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ fill_remaining + + bool + +
+

If value is a tuple or list that is smaller than +the number of rows in the DataFrame, repeat the list or tuple +(R-style) to the end of the DataFrame. (Passed to +add_column)

+
+
+ False +
+ **kwargs + + Any + +
+

Column, value pairs which are looped through in +add_column calls.

+
+
+ {} +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with added columns.

+
+
+ +
+ Source code in janitor/functions/add_columns.py +
139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `pd.DataFrame.assign` instead."
+    )
+)
+def add_columns(
+    df: pd.DataFrame,
+    fill_remaining: bool = False,
+    **kwargs: Any,
+) -> pd.DataFrame:
+    """Add multiple columns to the dataframe.
+
+    This method does not mutate the original DataFrame.
+
+    Method to augment
+    [`add_column`][janitor.functions.add_columns.add_column]
+    with ability to add multiple columns in
+    one go. This replaces the need for multiple
+    [`add_column`][janitor.functions.add_columns.add_column] calls.
+
+    Usage is through supplying kwargs where the key is the col name and the
+    values correspond to the values of the new DataFrame column.
+
+    Values passed can be scalar or iterable (list, ndarray, etc.)
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `pd.DataFrame.assign` instead.
+
+    Examples:
+        Inserting two more columns into a dataframe.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": list(range(3)), "b": list("abc")})
+        >>> df.add_columns(x=4, y=list("def"))
+           a  b  x  y
+        0  0  a  4  d
+        1  1  b  4  e
+        2  2  c  4  f
+
+    Args:
+        df: A pandas DataFrame.
+        fill_remaining: If value is a tuple or list that is smaller than
+            the number of rows in the DataFrame, repeat the list or tuple
+            (R-style) to the end of the DataFrame. (Passed to
+            [`add_column`][janitor.functions.add_columns.add_column])
+        **kwargs: Column, value pairs which are looped through in
+            [`add_column`][janitor.functions.add_columns.add_column] calls.
+
+    Returns:
+        A pandas DataFrame with added columns.
+    """
+    # Note: error checking can pretty much be handled in `add_column`
+
+    for col_name, values in kwargs.items():
+        df = df.add_column(col_name, values, fill_remaining=fill_remaining)
+
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ also + + +

+ +
+ +

Implementation source for chainable function also.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ also(df, func, *args, **kwargs) + +

+ + +
+ +

Run a function with side effects.

+

This function allows you to run an arbitrary function +in the pyjanitor method chain. +Doing so will let you do things like save the dataframe to disk midway +while continuing to modify the dataframe afterwards.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = (
+...     pd.DataFrame({"a": [1, 2, 3], "b": list("abc")})
+...     .query("a > 1")
+...     .also(lambda df: print(f"DataFrame shape is: {df.shape}"))
+...     .rename_column(old_column_name="a", new_column_name="a_new")
+...     .also(lambda df: df.to_csv("midpoint.csv"))
+...     .also(
+...         lambda df: print(f"Columns: {df.columns}")
+...     )
+... )
+DataFrame shape is: (2, 2)
+Columns: Index(['a_new', 'b'], dtype='object')
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ func + + Callable + +
+

A function you would like to run in the method chain. +It should take one DataFrame object as a parameter and have no return. +If there is a return, it will be ignored.

+
+
+ required +
+ *args + + Any + +
+

Optional arguments for func.

+
+
+ () +
+ **kwargs + + Any + +
+

Optional keyword arguments for func.

+
+
+ {} +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

The input pandas DataFrame, unmodified.

+
+
+ +
+ Source code in janitor/functions/also.py +
 9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
@pf.register_dataframe_method
+def also(
+    df: pd.DataFrame, func: Callable, *args: Any, **kwargs: Any
+) -> pd.DataFrame:
+    """Run a function with side effects.
+
+    This function allows you to run an arbitrary function
+    in the `pyjanitor` method chain.
+    Doing so will let you do things like save the dataframe to disk midway
+    while continuing to modify the dataframe afterwards.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = (
+        ...     pd.DataFrame({"a": [1, 2, 3], "b": list("abc")})
+        ...     .query("a > 1")
+        ...     .also(lambda df: print(f"DataFrame shape is: {df.shape}"))
+        ...     .rename_column(old_column_name="a", new_column_name="a_new")
+        ...     .also(lambda df: df.to_csv("midpoint.csv"))
+        ...     .also(
+        ...         lambda df: print(f"Columns: {df.columns}")
+        ...     )
+        ... )
+        DataFrame shape is: (2, 2)
+        Columns: Index(['a_new', 'b'], dtype='object')
+
+    Args:
+        df: A pandas DataFrame.
+        func: A function you would like to run in the method chain.
+            It should take one DataFrame object as a parameter and have no return.
+            If there is a return, it will be ignored.
+        *args: Optional arguments for `func`.
+        **kwargs: Optional keyword arguments for `func`.
+
+    Returns:
+        The input pandas DataFrame, unmodified.
+    """  # noqa: E501
+    func(df.copy(), *args, **kwargs)
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ bin_numeric + + +

+ +
+ +

Implementation source for bin_numeric.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ bin_numeric(df, from_column_name, to_column_name, bins=5, **kwargs) + +

+ + +
+ +

Generate a new column that labels bins for a specified numeric column.

+

This method does not mutate the original DataFrame.

+

A wrapper around the pandas cut() function to bin data of +one column, generating a new column with the results.

+ + +

Examples:

+

Binning a numeric column with specific bin edges.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": [3, 6, 9, 12, 15]})
+>>> df.bin_numeric(
+...     from_column_name="a", to_column_name="a_binned",
+...     bins=[0, 5, 11, 15],
+... )
+    a  a_binned
+0   3    (0, 5]
+1   6   (5, 11]
+2   9   (5, 11]
+3  12  (11, 15]
+4  15  (11, 15]
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ from_column_name + + str + +
+

The column whose data you want binned.

+
+
+ required +
+ to_column_name + + str + +
+

The new column to be created with the binned data.

+
+
+ required +
+ bins + + Optional[Union[int, ScalarSequence, IntervalIndex]] + +
+

The binning strategy to be utilized. Read the pd.cut +documentation for more details.

+
+
+ 5 +
+ **kwargs + + Any + +
+

Additional kwargs to pass to pd.cut, except retbins.

+
+
+ {} +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If retbins is passed in as a kwarg.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/bin_numeric.py +
13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
@pf.register_dataframe_method
+@deprecated_alias(
+    from_column="from_column_name",
+    to_column="to_column_name",
+    num_bins="bins",
+)
+def bin_numeric(
+    df: pd.DataFrame,
+    from_column_name: str,
+    to_column_name: str,
+    bins: Optional[Union[int, ScalarSequence, pd.IntervalIndex]] = 5,
+    **kwargs: Any,
+) -> pd.DataFrame:
+    """Generate a new column that labels bins for a specified numeric column.
+
+    This method does not mutate the original DataFrame.
+
+    A wrapper around the pandas [`cut()`][pd_cut_docs] function to bin data of
+    one column, generating a new column with the results.
+
+    [pd_cut_docs]: https://pandas.pydata.org/docs/reference/api/pandas.cut.html
+
+    Examples:
+        Binning a numeric column with specific bin edges.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": [3, 6, 9, 12, 15]})
+        >>> df.bin_numeric(
+        ...     from_column_name="a", to_column_name="a_binned",
+        ...     bins=[0, 5, 11, 15],
+        ... )
+            a  a_binned
+        0   3    (0, 5]
+        1   6   (5, 11]
+        2   9   (5, 11]
+        3  12  (11, 15]
+        4  15  (11, 15]
+
+    Args:
+        df: A pandas DataFrame.
+        from_column_name: The column whose data you want binned.
+        to_column_name: The new column to be created with the binned data.
+        bins: The binning strategy to be utilized. Read the `pd.cut`
+            documentation for more details.
+        **kwargs: Additional kwargs to pass to `pd.cut`, except `retbins`.
+
+    Raises:
+        ValueError: If `retbins` is passed in as a kwarg.
+
+    Returns:
+        A pandas DataFrame.
+    """
+    if "retbins" in kwargs:
+        raise ValueError("`retbins` is not an acceptable keyword argument.")
+
+    check("from_column_name", from_column_name, [str])
+    check("to_column_name", to_column_name, [str])
+    check_column(df, from_column_name)
+
+    df = df.assign(
+        **{
+            to_column_name: pd.cut(df[from_column_name], bins=bins, **kwargs),
+        }
+    )
+
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ case_when + + +

+ +
+ +

Implementation source for case_when.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ case_when(df, *args, default=None, column_name) + +

+ + +
+ +

Create a column based on a condition or multiple conditions.

+

Similar to SQL and dplyr's case_when +with inspiration from pydatatable's if_else function.

+

If your scenario requires direct replacement of values, +pandas' replace method or map method should be better +suited and more efficient; if the conditions check +if a value is within a range of values, pandas' cut or qcut +should be more efficient; np.where/np.select are also +performant options.

+

This function relies on pd.Series.mask method.

+

When multiple conditions are satisfied, the first one is used.

+

The variadic *args parameter takes arguments of the form: +condition0, value0, condition1, value1, ..., default. +If condition0 evaluates to True, then value0 is assigned to +column_name; if condition1 evaluates to True, then +value1 is assigned to column_name, and so on. If none of the +conditions evaluate to True, default is assigned to +column_name.

+

This function can be likened to SQL's case_when:

+
CASE WHEN condition0 THEN value0
+    WHEN condition1 THEN value1
+    --- more conditions
+    ELSE default
+    END AS column_name
+
+

compared to python's if-elif-else:

+
if condition0:
+    value0
+elif condition1:
+    value1
+# more elifs
+else:
+    default
+
+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame(
+...     {
+...         "a": [0, 0, 1, 2, "hi"],
+...         "b": [0, 3, 4, 5, "bye"],
+...         "c": [6, 7, 8, 9, "wait"],
+...     }
+... )
+>>> df
+    a    b     c
+0   0    0     6
+1   0    3     7
+2   1    4     8
+3   2    5     9
+4  hi  bye  wait
+>>> df.case_when(
+...     ((df.a == 0) & (df.b != 0)) | (df.c == "wait"), df.a,
+...     (df.b == 0) & (df.a == 0), "x",
+...     default = df.c,
+...     column_name = "value",
+... )
+    a    b     c value
+0   0    0     6     x
+1   0    3     7     0
+2   1    4     8     8
+3   2    5     9     9
+4  hi  bye  wait    hi
+
+
+

Version Changed

+
    +
  • 0.24.0
      +
    • Added default parameter.
    • +
    +
  • +
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ *args + + Any + +
+

Variable argument of conditions and expected values. +Takes the form +condition0, value0, condition1, value1, ... . +condition can be a 1-D boolean array, a callable, or a string. +If condition is a callable, it should evaluate +to a 1-D boolean array. The array should have the same length +as the DataFrame. If it is a string, it is computed on the dataframe, +via df.eval, and should return a 1-D boolean array. +value can be a scalar, a 1-D array, or a callable. +If value is a callable, it should evaluate to a 1-D array. +For a 1-D array, it should have the same length as the DataFrame.

+
+
+ () +
+ default + + Any + +
+

This is the element inserted in the output +when all conditions evaluate to False. +Can be scalar, 1-D array or callable. +If callable, it should evaluate to a 1-D array. +The 1-D array should be the same length as the DataFrame.

+
+
+ None +
+ column_name + + str + +
+

Name of column to assign results to. A new column +is created if it does not already exist in the DataFrame.

+
+
+ required +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If condition/value fails to evaluate.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/case_when.py +
 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `pd.Series.case_when` instead."
+    )
+)
+def case_when(
+    df: pd.DataFrame, *args: Any, default: Any = None, column_name: str
+) -> pd.DataFrame:
+    """Create a column based on a condition or multiple conditions.
+
+    Similar to SQL and dplyr's case_when
+    with inspiration from `pydatatable` if_else function.
+
+    If your scenario requires direct replacement of values,
+    pandas' `replace` method or `map` method should be better
+    suited and more efficient; if the conditions check
+    if a value is within a range of values, pandas' `cut` or `qcut`
+    should be more efficient; `np.where/np.select` are also
+    performant options.
+
+    This function relies on `pd.Series.mask` method.
+
+    When multiple conditions are satisfied, the first one is used.
+
+    The variable `*args` parameters takes arguments of the form :
+    `condition0`, `value0`, `condition1`, `value1`, ..., `default`.
+    If `condition0` evaluates to `True`, then assign `value0` to
+    `column_name`, if `condition1` evaluates to `True`, then
+    assign `value1` to `column_name`, and so on. If none of the
+    conditions evaluate to `True`, assign `default` to
+    `column_name`.
+
+    This function can be likened to SQL's `case_when`:
+
+    ```sql
+    CASE WHEN condition0 THEN value0
+        WHEN condition1 THEN value1
+        --- more conditions
+        ELSE default
+        END AS column_name
+    ```
+
+    compared to python's `if-elif-else`:
+
+    ```python
+    if condition0:
+        value0
+    elif condition1:
+        value1
+    # more elifs
+    else:
+        default
+    ```
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "a": [0, 0, 1, 2, "hi"],
+        ...         "b": [0, 3, 4, 5, "bye"],
+        ...         "c": [6, 7, 8, 9, "wait"],
+        ...     }
+        ... )
+        >>> df
+            a    b     c
+        0   0    0     6
+        1   0    3     7
+        2   1    4     8
+        3   2    5     9
+        4  hi  bye  wait
+        >>> df.case_when(
+        ...     ((df.a == 0) & (df.b != 0)) | (df.c == "wait"), df.a,
+        ...     (df.b == 0) & (df.a == 0), "x",
+        ...     default = df.c,
+        ...     column_name = "value",
+        ... )
+            a    b     c value
+        0   0    0     6     x
+        1   0    3     7     0
+        2   1    4     8     8
+        3   2    5     9     9
+        4  hi  bye  wait    hi
+
+    !!! abstract "Version Changed"
+
+        - 0.24.0
+            - Added `default` parameter.
+
+    Args:
+        df: A pandas DataFrame.
+        *args: Variable argument of conditions and expected values.
+            Takes the form
+            `condition0`, `value0`, `condition1`, `value1`, ... .
+            `condition` can be a 1-D boolean array, a callable, or a string.
+            If `condition` is a callable, it should evaluate
+            to a 1-D boolean array. The array should have the same length
+            as the DataFrame. If it is a string, it is computed on the dataframe,
+            via `df.eval`, and should return a 1-D boolean array.
+            `result` can be a scalar, a 1-D array, or a callable.
+            If `result` is a callable, it should evaluate to a 1-D array.
+            For a 1-D array, it should have the same length as the DataFrame.
+        default: This is the element inserted in the output
+            when all conditions evaluate to False.
+            Can be scalar, 1-D array or callable.
+            If callable, it should evaluate to a 1-D array.
+            The 1-D array should be the same length as the DataFrame.
+        column_name: Name of column to assign results to. A new column
+            is created if it does not already exist in the DataFrame.
+
+    Raises:
+        ValueError: If condition/value fails to evaluate.
+
+    Returns:
+        A pandas DataFrame.
+    """  # noqa: E501
+    # Preliminary checks on the case_when function.
+    # The bare minimum checks are done; the remaining checks
+    # are done within `pd.Series.mask`.
+    check("column_name", column_name, [str])
+    len_args = len(args)
+    if len_args < 2:
+        raise ValueError(
+            "At least two arguments are required for the `args` parameter"
+        )
+
+    if len_args % 2:
+        if default is None:
+            warnings.warn(
+                "The last argument in the variable arguments "
+                "has been assigned as the default. "
+                "Note however that this will be deprecated "
+                "in a future release; use an even number "
+                "of boolean conditions and values, "
+                "and pass the default argument to the `default` "
+                "parameter instead.",
+                DeprecationWarning,
+                stacklevel=find_stack_level(),
+            )
+            *args, default = args
+        else:
+            raise ValueError(
+                "The number of conditions and values do not match. "
+                f"There are {len_args - len_args//2} conditions "
+                f"and {len_args//2} values."
+            )
+
+    booleans = []
+    replacements = []
+
+    for index, value in enumerate(args):
+        if index % 2:
+            if callable(value):
+                value = apply_if_callable(value, df)
+            replacements.append(value)
+        else:
+            if callable(value):
+                value = apply_if_callable(value, df)
+            elif isinstance(value, str):
+                value = df.eval(value)
+            booleans.append(value)
+
+    if callable(default):
+        default = apply_if_callable(default, df)
+    if is_scalar(default):
+        default = pd.Series([default]).repeat(len(df))
+    if not hasattr(default, "shape"):
+        default = pd.Series([*default])
+    if isinstance(default, pd.Index):
+        arr_ndim = default.nlevels
+    else:
+        arr_ndim = default.ndim
+    if arr_ndim != 1:
+        raise ValueError(
+            "The argument for the `default` parameter "
+            "should either be a 1-D array, a scalar, "
+            "or a callable that can evaluate to a 1-D array."
+        )
+    if not isinstance(default, pd.Series):
+        default = pd.Series(default)
+    default.index = df.index
+    # actual computation
+    # ensures value assignment is on a first come basis
+    booleans = booleans[::-1]
+    replacements = replacements[::-1]
+    for index, (condition, value) in enumerate(zip(booleans, replacements)):
+        try:
+            default = default.mask(condition, value)
+        # error `feedoff` idea from SO
+        # https://stackoverflow.com/a/46091127/7175713
+        except Exception as error:
+            raise ValueError(
+                f"condition{index} and value{index} failed to evaluate. "
+                f"Original error message: {error}"
+            ) from error
+
+    return df.assign(**{column_name: default})
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ change_index_dtype + + +

+ +
+ +

Implementation of the change_index_dtype function.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ change_index_dtype(df, dtype, axis='index') + +

+ + +
+ +

Cast an index to a specified dtype.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import numpy as np
+>>> import janitor
+>>> rng = np.random.default_rng(seed=0)
+>>> np.random.seed(0)
+>>> tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
+...             'foo', 'foo', 'qux', 'qux'],
+...              [1.0, 2.0, 1.0, 2.0,
+...               1.0, 2.0, 1.0, 2.0]]))
+>>> idx = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
+>>> df = pd.DataFrame(np.random.randn(8, 2), index=idx, columns=['A', 'B'])
+>>> df
+                     A         B
+first second
+bar   1.0     1.764052  0.400157
+      2.0     0.978738  2.240893
+baz   1.0     1.867558 -0.977278
+      2.0     0.950088 -0.151357
+foo   1.0    -0.103219  0.410599
+      2.0     0.144044  1.454274
+qux   1.0     0.761038  0.121675
+      2.0     0.443863  0.333674
+>>> outcome=df.change_index_dtype(dtype=str)
+>>> outcome
+                     A         B
+first second
+bar   1.0     1.764052  0.400157
+      2.0     0.978738  2.240893
+baz   1.0     1.867558 -0.977278
+      2.0     0.950088 -0.151357
+foo   1.0    -0.103219  0.410599
+      2.0     0.144044  1.454274
+qux   1.0     0.761038  0.121675
+      2.0     0.443863  0.333674
+>>> outcome.index.dtypes
+first     object
+second    object
+dtype: object
+>>> outcome=df.change_index_dtype(dtype={'second':int})
+>>> outcome
+                     A         B
+first second
+bar   1       1.764052  0.400157
+      2       0.978738  2.240893
+baz   1       1.867558 -0.977278
+      2       0.950088 -0.151357
+foo   1      -0.103219  0.410599
+      2       0.144044  1.454274
+qux   1       0.761038  0.121675
+      2       0.443863  0.333674
+>>> outcome.index.dtypes
+first     object
+second     int64
+dtype: object
+>>> outcome=df.change_index_dtype(dtype={0:'category',1:int})
+>>> outcome
+                     A         B
+first second
+bar   1       1.764052  0.400157
+      2       0.978738  2.240893
+baz   1       1.867558 -0.977278
+      2       0.950088 -0.151357
+foo   1      -0.103219  0.410599
+      2       0.144044  1.454274
+qux   1       0.761038  0.121675
+      2       0.443863  0.333674
+>>> outcome.index.dtypes
+first     category
+second       int64
+dtype: object
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ dtype + + +
+

Use a str or dtype to cast the entire Index +to the same type. +Alternatively, use a dictionary to change the MultiIndex +to new dtypes.

+
+
+ required +
+ axis + + str + +
+

Determines which axis to change the dtype(s). +Should be either 'index' or 'columns'.

+
+
+ 'index' +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with new Index.

+
+
+ +
+ Source code in janitor/functions/change_index_dtype.py +
 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
@pf.register_dataframe_method
+def change_index_dtype(
+    df: pd.DataFrame, dtype: Union[str, dict], axis: str = "index"
+) -> pd.DataFrame:
+    """Cast an index to a specified dtype ``dtype``.
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import numpy as np
+        >>> import janitor
+        >>> rng = np.random.default_rng(seed=0)
+        >>> np.random.seed(0)
+        >>> tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
+        ...             'foo', 'foo', 'qux', 'qux'],
+        ...              [1.0, 2.0, 1.0, 2.0,
+        ...               1.0, 2.0, 1.0, 2.0]]))
+        >>> idx = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
+        >>> df = pd.DataFrame(np.random.randn(8, 2), index=idx, columns=['A', 'B'])
+        >>> df
+                             A         B
+        first second
+        bar   1.0     1.764052  0.400157
+              2.0     0.978738  2.240893
+        baz   1.0     1.867558 -0.977278
+              2.0     0.950088 -0.151357
+        foo   1.0    -0.103219  0.410599
+              2.0     0.144044  1.454274
+        qux   1.0     0.761038  0.121675
+              2.0     0.443863  0.333674
+        >>> outcome=df.change_index_dtype(dtype=str)
+        >>> outcome
+                             A         B
+        first second
+        bar   1.0     1.764052  0.400157
+              2.0     0.978738  2.240893
+        baz   1.0     1.867558 -0.977278
+              2.0     0.950088 -0.151357
+        foo   1.0    -0.103219  0.410599
+              2.0     0.144044  1.454274
+        qux   1.0     0.761038  0.121675
+              2.0     0.443863  0.333674
+        >>> outcome.index.dtypes
+        first     object
+        second    object
+        dtype: object
+        >>> outcome=df.change_index_dtype(dtype={'second':int})
+        >>> outcome
+                             A         B
+        first second
+        bar   1       1.764052  0.400157
+              2       0.978738  2.240893
+        baz   1       1.867558 -0.977278
+              2       0.950088 -0.151357
+        foo   1      -0.103219  0.410599
+              2       0.144044  1.454274
+        qux   1       0.761038  0.121675
+              2       0.443863  0.333674
+        >>> outcome.index.dtypes
+        first     object
+        second     int64
+        dtype: object
+        >>> outcome=df.change_index_dtype(dtype={0:'category',1:int})
+        >>> outcome
+                             A         B
+        first second
+        bar   1       1.764052  0.400157
+              2       0.978738  2.240893
+        baz   1       1.867558 -0.977278
+              2       0.950088 -0.151357
+        foo   1      -0.103219  0.410599
+              2       0.144044  1.454274
+        qux   1       0.761038  0.121675
+              2       0.443863  0.333674
+        >>> outcome.index.dtypes
+        first     category
+        second       int64
+        dtype: object
+
+    Args:
+        df: A pandas DataFrame.
+        dtype : Use a str or dtype to cast the entire Index
+            to the same type.
+            Alternatively, use a dictionary to change the MultiIndex
+            to new dtypes.
+        axis: Determines which axis to change the dtype(s).
+            Should be either 'index' or 'columns'.
+
+    Returns:
+        A pandas DataFrame with new Index.
+    """  # noqa: E501
+
+    check("axis", axis, [str])
+    if axis not in {"index", "columns"}:
+        raise ValueError("axis should be either index or columns.")
+
+    df = df[:]
+    current_index = getattr(df, axis)
+    if not isinstance(current_index, pd.MultiIndex):
+        if isinstance(dtype, dict):
+            raise TypeError(
+                "Changing the dtype via a dictionary "
+                "is not supported for a single index."
+            )
+        current_index = current_index.astype(dtype)
+        setattr(df, axis, current_index)
+        return df
+
+    if not isinstance(dtype, dict):
+        dtype = {
+            level_number: dtype
+            for level_number in range(current_index.nlevels)
+        }
+
+    all_str = all(isinstance(level, str) for level in dtype)
+    all_int = all(isinstance(level, int) for level in dtype)
+    if not all_str | all_int:
+        raise TypeError(
+            "The levels in the dictionary "
+            "should be either all strings or all integers."
+        )
+
+    dtype = {
+        current_index._get_level_number(label): _dtype
+        for label, _dtype in dtype.items()
+    }
+
+    new_levels = []
+    codes = current_index.codes
+    levels = current_index.levels
+
+    for level_number in range(current_index.nlevels):
+        _index = levels[level_number]
+        if level_number in dtype:
+            _dtype = dtype[level_number]
+            _index = _index.astype(_dtype)
+        new_levels.append(_index)
+
+    current_index = pd.MultiIndex(
+        levels=new_levels,
+        codes=codes,
+        names=current_index.names,
+        copy=False,
+        verify_integrity=False,
+    )
+    setattr(df, axis, current_index)
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ change_type + + +

+ +
+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ change_type(df, column_name, dtype, ignore_exception=False) + +

+ + +
+ +

Change the type of a column.

+

This method does not mutate the original DataFrame.

+

Exceptions that are raised can be ignored. For example, if one has a mixed +dtype column that has non-integer strings and integers, and you want to +coerce everything to integers, you can optionally ignore the non-integer +strings and replace them with NaN or keep the original value.

+

Intended to be the method-chaining alternative to:

+
df[col] = df[col].astype(dtype)
+
+
+

Note

+

This function will be deprecated in a 1.x release. +Please use pd.DataFrame.astype instead.

+
+ + +

Examples:

+

Change the type of a column.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"col1": range(3), "col2": ["m", 5, True]})
+>>> df
+   col1  col2
+0     0     m
+1     1     5
+2     2  True
+>>> df.change_type(
+...     "col1", dtype=str,
+... ).change_type(
+...     "col2", dtype=float, ignore_exception="fillna",
+... )
+  col1  col2
+0    0   NaN
+1    1   5.0
+2    2   1.0
+
+

Change the type of multiple columns. To change the type of all columns, +please use DataFrame.astype instead.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"col1": range(3), "col2": ["m", 5, True]})
+>>> df.change_type(['col1', 'col2'], str)
+  col1  col2
+0    0     m
+1    1     5
+2    2  True
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + Hashable | list[Hashable] | Index + +
+

The column(s) in the dataframe.

+
+
+ required +
+ dtype + + type + +
+

The datatype to convert to. Should be one of the standard +Python types, or a numpy datatype.

+
+
+ required +
+ ignore_exception + + bool + +
+

One of {False, "fillna", "keep_values"}.

+
+
+ False +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If unknown option provided for ignore_exception.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with changed column types.

+
+
+ +
+ Source code in janitor/functions/change_type.py +
 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `pd.DataFrame.astype` instead."
+    )
+)
+@deprecated_alias(column="column_name")
+def change_type(
+    df: pd.DataFrame,
+    column_name: Hashable | list[Hashable] | pd.Index,
+    dtype: type,
+    ignore_exception: bool = False,
+) -> pd.DataFrame:
+    """Change the type of a column.
+
+    This method does not mutate the original DataFrame.
+
+    Exceptions that are raised can be ignored. For example, if one has a mixed
+    dtype column that has non-integer strings and integers, and you want to
+    coerce everything to integers, you can optionally ignore the non-integer
+    strings and replace them with `NaN` or keep the original value.
+
+    Intended to be the method-chaining alternative to:
+
+    ```python
+    df[col] = df[col].astype(dtype)
+    ```
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `pd.DataFrame.astype` instead.
+
+    Examples:
+        Change the type of a column.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"col1": range(3), "col2": ["m", 5, True]})
+        >>> df
+           col1  col2
+        0     0     m
+        1     1     5
+        2     2  True
+        >>> df.change_type(
+        ...     "col1", dtype=str,
+        ... ).change_type(
+        ...     "col2", dtype=float, ignore_exception="fillna",
+        ... )
+          col1  col2
+        0    0   NaN
+        1    1   5.0
+        2    2   1.0
+
+        Change the type of multiple columns. To change the type of all columns,
+        please use `DataFrame.astype` instead.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"col1": range(3), "col2": ["m", 5, True]})
+        >>> df.change_type(['col1', 'col2'], str)
+          col1  col2
+        0    0     m
+        1    1     5
+        2    2  True
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: The column(s) in the dataframe.
+        dtype: The datatype to convert to. Should be one of the standard
+            Python types, or a numpy datatype.
+        ignore_exception: One of `{False, "fillna", "keep_values"}`.
+
+    Raises:
+        ValueError: If unknown option provided for `ignore_exception`.
+
+    Returns:
+        A pandas DataFrame with changed column types.
+    """  # noqa: E501
+
+    df = df.copy()  # avoid mutating the original DataFrame
+    if not ignore_exception:
+        df[column_name] = df[column_name].astype(dtype)
+    elif ignore_exception == "keep_values":
+        df[column_name] = df[column_name].astype(dtype, errors="ignore")
+    elif ignore_exception == "fillna":
+        if isinstance(column_name, Hashable):
+            column_name = [column_name]
+        df[column_name] = df[column_name].map(_convert, dtype=dtype)
+    else:
+        raise ValueError("Unknown option for ignore_exception")
+
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ clean_names + + +

+ +
+ +

Functions for cleaning columns/index names and/or column values.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ clean_names(df, axis='columns', column_names=None, strip_underscores=None, case_type='lower', remove_special=False, strip_accents=True, preserve_original_labels=True, enforce_string=True, truncate_limit=None) + +

+ + +
+ +

Clean column/index names. It can also be applied to column values.

+

Takes all column names, converts them to lowercase, +then replaces all spaces with underscores.

+

By default, column names are converted to string types. +This can be switched off by passing in enforce_string=False.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame(
+...     {
+...         "Aloha": range(3),
+...         "Bell Chart": range(3),
+...         "Animals@#$%^": range(3)
+...     }
+... )
+>>> df
+   Aloha  Bell Chart  Animals@#$%^
+0      0           0             0
+1      1           1             1
+2      2           2             2
+>>> df.clean_names()
+   aloha  bell_chart  animals@#$%^
+0      0           0             0
+1      1           1             1
+2      2           2             2
+>>> df.clean_names(remove_special=True)
+   aloha  bell_chart  animals
+0      0           0        0
+1      1           1        1
+2      2           2        2
+
+
+

Version Changed

+
    +
  • 0.26.0
      +
    • Added axis and column_names parameters.
    • +
    +
  • +
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

The pandas DataFrame object.

+
+
+ required +
+ axis + + str + +
+

Whether to clean the labels on the index or columns. +If None, applies to a defined column +or columns in column_names.

+
+
+ 'columns' +
+ column_names + + str | list + +
+

Clean the values in a column. +axis should be None. +Column selection is possible using the +select syntax.

+
+
+ None +
+ strip_underscores + + str | bool + +
+

Removes the outer underscores from all +column names/values. Default None keeps outer underscores. +Values can be either 'left', 'right' or 'both' +or the respective shorthand 'l', +'r' and True.

+
+
+ None +
+ case_type + + str + +
+

Whether to make columns lower or uppercase. +Current case may be preserved with 'preserve', +while snake case conversion (from CamelCase or camelCase only) +can be turned on using "snake". +Default 'lower' makes all characters lowercase.

+
+
+ 'lower' +
+ remove_special + + bool + +
+

Remove special characters from columns. +Only letters, numbers and underscores are preserved.

+
+
+ False +
+ strip_accents + + bool + +
+

Whether or not to remove accents from +column names/values.

+
+
+ True +
+ preserve_original_labels + + bool + +
+

Preserve original names. +This is later retrievable using df.original_labels. +Applies if axis is not None.

+
+
+ True +
+ enforce_string + + bool + +
+

Whether or not to convert all +column names/values to string type. +Defaults to True, but can be turned off. +Columns with >1 levels will not be converted by default.

+
+
+ True +
+ truncate_limit + + int + +
+

Truncates formatted column names/values +to the specified length. +Default None does not truncate.

+
+
+ None +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If axis=None and column_names=None.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/clean_names.py +
 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
@pf.register_dataframe_method
+@deprecated_alias(preserve_original_columns="preserve_original_labels")
+def clean_names(
+    df: pd.DataFrame,
+    axis: str = "columns",
+    column_names: str | list = None,
+    strip_underscores: str | bool = None,
+    case_type: str = "lower",
+    remove_special: bool = False,
+    strip_accents: bool = True,
+    preserve_original_labels: bool = True,
+    enforce_string: bool = True,
+    truncate_limit: int = None,
+) -> pd.DataFrame:
+    """Clean column/index names. It can also be applied to column values.
+
+    Takes all column names, converts them to lowercase,
+    then replaces all spaces with underscores.
+
+    By default, column names are converted to string types.
+    This can be switched off by passing in `enforce_string=False`.
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "Aloha": range(3),
+        ...         "Bell Chart": range(3),
+        ...         "Animals@#$%^": range(3)
+        ...     }
+        ... )
+        >>> df
+           Aloha  Bell Chart  Animals@#$%^
+        0      0           0             0
+        1      1           1             1
+        2      2           2             2
+        >>> df.clean_names()
+           aloha  bell_chart  animals@#$%^
+        0      0           0             0
+        1      1           1             1
+        2      2           2             2
+        >>> df.clean_names(remove_special=True)
+           aloha  bell_chart  animals
+        0      0           0        0
+        1      1           1        1
+        2      2           2        2
+
+    !!! summary "Version Changed"
+
+        - 0.26.0
+             - Added `axis` and `column_names` parameters.
+
+    Args:
+        df: The pandas DataFrame object.
+        axis: Whether to clean the labels on the index or columns.
+            If `None`, applies to a defined column
+            or columns in `column_names`.
+        column_names: Clean the values in a column.
+            `axis` should be `None`.
+            Column selection is possible using the
+            [`select`][janitor.functions.select.select] syntax.
+        strip_underscores: Removes the outer underscores from all
+            column names/values. Default None keeps outer underscores.
+            Values can be either 'left', 'right' or 'both'
+            or the respective shorthand 'l',
+            'r' and True.
+        case_type: Whether to make columns lower or uppercase.
+            Current case may be preserved with 'preserve',
+            while snake case conversion (from CamelCase or camelCase only)
+            can be turned on using "snake".
+            Default 'lower' makes all characters lowercase.
+        remove_special: Remove special characters from columns.
+            Only letters, numbers and underscores are preserved.
+        strip_accents: Whether or not to remove accents from
+            columns names/values.
+        preserve_original_labels: Preserve original names.
+            This is later retrievable using `df.original_labels`.
+            Applies if `axis` is not None.
+        enforce_string: Whether or not to convert all
+            column names/values to string type.
+            Defaults to True, but can be turned off.
+            Columns with >1 levels will not be converted by default.
+        truncate_limit: Truncates formatted column names/values
+            to the specified length.
+            Default None does not truncate.
+
+    Raises:
+        ValueError: If `axis=None` and `column_names=None`.
+
+    Returns:
+        A pandas DataFrame.
+    """
+    if not axis and not column_names:
+        raise ValueError(
+            "Kindly provide an argument to `column_names`, if axis is None."
+        )
+    if axis is None:
+        column_names = get_index_labels(
+            arg=column_names, df=df, axis="columns"
+        )
+        if is_scalar(column_names):
+            column_names = [column_names]
+        df = df.copy()
+        for column_name in column_names:
+            df[column_name] = _clean_names(
+                obj=df[column_name],
+                enforce_string=enforce_string,
+                case_type=case_type,
+                remove_special=remove_special,
+                strip_accents=strip_accents,
+                strip_underscores=strip_underscores,
+                truncate_limit=truncate_limit,
+            )
+        return df
+
+    assert axis in {"index", "columns"}
+    df = df[:]
+    target_axis = getattr(df, axis)
+    if isinstance(target_axis, pd.MultiIndex):
+        target_axis = [
+            target_axis.get_level_values(number)
+            for number in range(target_axis.nlevels)
+        ]
+        target_axis = [
+            _clean_names(
+                obj=obj,
+                enforce_string=enforce_string,
+                case_type=case_type,
+                remove_special=remove_special,
+                strip_accents=strip_accents,
+                strip_underscores=strip_underscores,
+                truncate_limit=truncate_limit,
+            )
+            for obj in target_axis
+        ]
+    else:
+        target_axis = _clean_names(
+            obj=target_axis,
+            enforce_string=enforce_string,
+            case_type=case_type,
+            remove_special=remove_special,
+            strip_accents=strip_accents,
+            strip_underscores=strip_underscores,
+            truncate_limit=truncate_limit,
+        )
+    # Store the original column names, if enabled by user
+    if preserve_original_labels:
+        df.__dict__["original_labels"] = getattr(df, axis)
+    setattr(df, axis, target_axis)
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ coalesce + + +

+ +
+ +

Function for performing coalesce.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ coalesce(df, *column_names, target_column_name=None, default_value=None) + +

+ + +
+ +

Coalesce two or more columns of data in order of column names provided.

+

Given the variable arguments of column names, +coalesce finds and returns the first non-missing value +from these columns, for every row in the input dataframe. +If all the column values are null for a particular row, +then the default_value will be filled in.

+

If target_column_name is not provided, +then the first column is coalesced.

+

This method does not mutate the original DataFrame.

+

The select syntax +can be used in column_names.

+ + +

Examples:

+

Use coalesce with 3 columns, "a", "b" and "c".

+
>>> import pandas as pd
+>>> import numpy as np
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "a": [np.nan, 1, np.nan],
+...     "b": [2, 3, np.nan],
+...     "c": [4, np.nan, np.nan],
+... })
+>>> df.coalesce("a", "b", "c")
+     a    b    c
+0  2.0  2.0  4.0
+1  1.0  3.0  NaN
+2  NaN  NaN  NaN
+
+

Provide a target_column_name.

+
>>> df.coalesce("a", "b", "c", target_column_name="new_col")
+     a    b    c  new_col
+0  NaN  2.0  4.0      2.0
+1  1.0  3.0  NaN      1.0
+2  NaN  NaN  NaN      NaN
+
+

Provide a default value.

+
>>> import pandas as pd
+>>> import numpy as np
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "a": [1, np.nan, np.nan],
+...     "b": [2, 3, np.nan],
+... })
+>>> df.coalesce(
+...     "a", "b",
+...     target_column_name="new_col",
+...     default_value=-1,
+... )
+     a    b  new_col
+0  1.0  2.0      1.0
+1  NaN  3.0      3.0
+2  NaN  NaN     -1.0
+
+

This is more syntactic diabetes! For R users, this should look familiar to +dplyr's coalesce function; for Python users, the interface +should be more intuitive than the pandas.Series.combine_first +method.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_names + + Any + +
+

The column names to coalesce, passed as positional arguments in order of priority.

+
+
+ () +
+ target_column_name + + Optional[str] + +
+

The new column name after combining. +If None, then the first column in column_names is updated, +with the Null values replaced.

+
+
+ None +
+ default_value + + Optional[Union[int, float, str]] + +
+

A scalar to replace any remaining nulls +after coalescing.

+
+
+ None +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If length of column_names is less than 2.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with coalesced columns.

+
+
+ +
+ Source code in janitor/functions/coalesce.py +
 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
@pf.register_dataframe_method
+@deprecated_alias(columns="column_names", new_column_name="target_column_name")
+def coalesce(
+    df: pd.DataFrame,
+    *column_names: Any,
+    target_column_name: Optional[str] = None,
+    default_value: Optional[Union[int, float, str]] = None,
+) -> pd.DataFrame:
+    """Coalesce two or more columns of data in order of column names provided.
+
+    Given the variable arguments of column names,
+    `coalesce` finds and returns the first non-missing value
+    from these columns, for every row in the input dataframe.
+    If all the column values are null for a particular row,
+    then the `default_value` will be filled in.
+
+    If `target_column_name` is not provided,
+    then the first column is coalesced.
+
+    This method does not mutate the original DataFrame.
+
+    The [`select`][janitor.functions.select.select] syntax
+    can be used in `column_names`.
+
+    Examples:
+        Use `coalesce` with 3 columns, "a", "b" and "c".
+
+        >>> import pandas as pd
+        >>> import numpy as np
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "a": [np.nan, 1, np.nan],
+        ...     "b": [2, 3, np.nan],
+        ...     "c": [4, np.nan, np.nan],
+        ... })
+        >>> df.coalesce("a", "b", "c")
+             a    b    c
+        0  2.0  2.0  4.0
+        1  1.0  3.0  NaN
+        2  NaN  NaN  NaN
+
+        Provide a target_column_name.
+
+        >>> df.coalesce("a", "b", "c", target_column_name="new_col")
+             a    b    c  new_col
+        0  NaN  2.0  4.0      2.0
+        1  1.0  3.0  NaN      1.0
+        2  NaN  NaN  NaN      NaN
+
+        Provide a default value.
+
+        >>> import pandas as pd
+        >>> import numpy as np
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "a": [1, np.nan, np.nan],
+        ...     "b": [2, 3, np.nan],
+        ... })
+        >>> df.coalesce(
+        ...     "a", "b",
+        ...     target_column_name="new_col",
+        ...     default_value=-1,
+        ... )
+             a    b  new_col
+        0  1.0  2.0      1.0
+        1  NaN  3.0      3.0
+        2  NaN  NaN     -1.0
+
+    This is more syntactic diabetes! For R users, this should look familiar to
+    `dplyr`'s `coalesce` function; for Python users, the interface
+    should be more intuitive than the `pandas.Series.combine_first`
+    method.
+
+    Args:
+        df: A pandas DataFrame.
+        column_names: A list of column names.
+        target_column_name: The new column name after combining.
+            If `None`, then the first column in `column_names` is updated,
+            with the Null values replaced.
+        default_value: A scalar to replace any remaining nulls
+            after coalescing.
+
+    Raises:
+        ValueError: If length of `column_names` is less than 2.
+
+    Returns:
+        A pandas DataFrame with coalesced columns.
+    """
+
+    if not column_names:
+        return df
+
+    indexers = _select_index([*column_names], df, axis="columns")
+
+    if len(indexers) < 2:
+        raise ValueError(
+            "The number of columns to coalesce should be a minimum of 2."
+        )
+
+    if target_column_name:
+        check("target_column_name", target_column_name, [str])
+
+    if default_value:
+        check("default_value", default_value, [int, float, str])
+
+    df = df.copy()
+
+    outcome = df.iloc[:, indexers[0]]
+
+    for num in range(1, len(indexers)):
+        position = indexers[num]
+        replacement = df.iloc[:, position]
+        outcome = outcome.fillna(replacement)
+
+    if outcome.hasnans and (default_value is not None):
+        outcome = outcome.fillna(default_value)
+
+    if target_column_name is None:
+        df.iloc[:, indexers[0]] = outcome
+    else:
+        df[target_column_name] = outcome
+
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ collapse_levels + + +

+ +
+ +

Implementation of the collapse_levels function.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ collapse_levels(df, sep=None, glue=None, axis='columns') + +

+ + +
+ +

Flatten multi-level index/column dataframe to a single level.

+

This method does not mutate the original DataFrame.

+

Given a DataFrame containing multi-level index/columns, flatten to single-level +by string-joining the labels in each level.

+

After a groupby / aggregate operation where .agg() is passed a +list of multiple aggregation functions, a multi-level DataFrame is +returned with the name of the function applied in the second level.

+

It is sometimes convenient for later indexing to flatten out this +multi-level configuration back into a single level. This function does +this through a simple string-joining of all the names across different +levels in a single column.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "class": ["bird", "bird", "bird", "mammal", "mammal"],
+...     "max_speed": [389, 389, 24, 80, 21],
+...     "type": ["falcon", "falcon", "parrot", "Lion", "Monkey"],
+... })
+>>> df
+    class  max_speed    type
+0    bird        389  falcon
+1    bird        389  falcon
+2    bird         24  parrot
+3  mammal         80    Lion
+4  mammal         21  Monkey
+>>> grouped_df = df.groupby("class")[['max_speed']].agg(["mean", "median"])
+>>> grouped_df
+         max_speed
+              mean median
+class
+bird    267.333333  389.0
+mammal   50.500000   50.5
+>>> grouped_df.collapse_levels(sep="_")
+        max_speed_mean  max_speed_median
+class
+bird        267.333333             389.0
+mammal       50.500000              50.5
+
+

Before applying .collapse_levels, the .agg operation returns a +multi-level column DataFrame whose columns are (level 1, level 2):

+
[("max_speed", "mean"), ("max_speed", "median")]
+
+

.collapse_levels then flattens the column MultiIndex into a single +level index with names:

+
["max_speed_mean", "max_speed_median"]
+
+

For more control, a glue specification can be passed, +where the names of the levels are used to control the output of the +flattened index:

+
>>> (grouped_df
+...  .rename_axis(columns=['column_name', 'agg_name'])
+...  .collapse_levels(glue="{agg_name}_{column_name}")
+... )
+        mean_max_speed  median_max_speed
+class
+bird        267.333333             389.0
+mammal       50.500000              50.5
+
+

Note that for glue to work, the keyword arguments +in the glue specification +should be the names of the levels in the MultiIndex.

+
+

Version Changed

+
    +
  • 0.27.0
      +
    • Added glue and axis parameters.
    • +
    +
  • +
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ sep + + str + +
+

String separator used to join the column level names.

+
+
+ None +
+ glue + + str + +
+

A specification on how the column levels should be combined. +It allows for a more granular composition, +and serves as an alternative to sep.

+
+
+ None +
+ axis + + str + +
+

Determines whether to collapse the +levels on the index or columns.

+
+
+ 'columns' +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with the levels on the selected axis (index or columns) flattened into a single level.

+
+
+ +
+ Source code in janitor/functions/collapse_levels.py +
 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
@pf.register_dataframe_method
+def collapse_levels(
+    df: pd.DataFrame,
+    sep: str = None,
+    glue: str = None,
+    axis: str = "columns",
+) -> pd.DataFrame:
+    """Flatten multi-level index/column dataframe to a single level.
+
+    This method does not mutate the original DataFrame.
+
+    Given a DataFrame containing multi-level index/columns, flatten to single-level
+    by string-joining the labels in each level.
+
+    After a `groupby` / `aggregate` operation where `.agg()` is passed a
+    list of multiple aggregation functions, a multi-level DataFrame is
+    returned with the name of the function applied in the second level.
+
+    It is sometimes convenient for later indexing to flatten out this
+    multi-level configuration back into a single level. This function does
+    this through a simple string-joining of all the names across different
+    levels in a single column.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "class": ["bird", "bird", "bird", "mammal", "mammal"],
+        ...     "max_speed": [389, 389, 24, 80, 21],
+        ...     "type": ["falcon", "falcon", "parrot", "Lion", "Monkey"],
+        ... })
+        >>> df
+            class  max_speed    type
+        0    bird        389  falcon
+        1    bird        389  falcon
+        2    bird         24  parrot
+        3  mammal         80    Lion
+        4  mammal         21  Monkey
+        >>> grouped_df = df.groupby("class")[['max_speed']].agg(["mean", "median"])
+        >>> grouped_df  # doctest: +NORMALIZE_WHITESPACE
+                 max_speed
+                      mean median
+        class
+        bird    267.333333  389.0
+        mammal   50.500000   50.5
+        >>> grouped_df.collapse_levels(sep="_")  # doctest: +NORMALIZE_WHITESPACE
+                max_speed_mean  max_speed_median
+        class
+        bird        267.333333             389.0
+        mammal       50.500000              50.5
+
+        Before applying `.collapse_levels`, the `.agg` operation returns a
+        multi-level column DataFrame whose columns are `(level 1, level 2)`:
+
+        ```python
+        [("max_speed", "mean"), ("max_speed", "median")]
+        ```
+
+        `.collapse_levels` then flattens the column MultiIndex into a single
+        level index with names:
+
+        ```python
+        ["max_speed_mean", "max_speed_median"]
+        ```
+
+        For more control, a `glue` specification can be passed,
+        where the names of the levels are used to control the output of the
+        flattened index:
+        >>> (grouped_df
+        ...  .rename_axis(columns=['column_name', 'agg_name'])
+        ...  .collapse_levels(glue="{agg_name}_{column_name}")
+        ... )
+                mean_max_speed  median_max_speed
+        class
+        bird        267.333333             389.0
+        mammal       50.500000              50.5
+
+        Note that for `glue` to work, the keyword arguments
+        in the glue specification
+        should be the names of the levels in the MultiIndex.
+
+    !!! abstract "Version Changed"
+
+        - 0.27.0
+            - Added `glue` and `axis` parameters.
+
+    Args:
+        df: A pandas DataFrame.
+        sep: String separator used to join the column level names.
+        glue: A specification on how the column levels should be combined.
+            It allows for a more granular composition,
+            and serves as an alternative to `sep`.
+        axis: Determines whether to collapse the
+            levels on the index or columns.
+
+    Returns:
+        A pandas DataFrame with single-level column index.
+    """  # noqa: E501
+    if (sep is not None) and (glue is not None):
+        raise ValueError("Only one of sep or glue should be provided.")
+    if sep is not None:
+        check("sep", sep, [str])
+    if glue is not None:
+        check("glue", glue, [str])
+    check("axis", axis, [str])
+    if axis not in {"index", "columns"}:
+        raise ValueError(
+            "axis argument should be either 'index' or 'columns'."
+        )
+
+    if not isinstance(getattr(df, axis), pd.MultiIndex):
+        return df
+
+    # TODO: Pyarrow offers faster string computations
+    # future work should take this into consideration,
+    # which would require a different route from python's string.join
+    # since work is only on the columns
+    # it is safe, and more efficient to slice/view the dataframe
+    # plus Pandas creates a new Index altogether
+    # as such, the original dataframe is not modified
+    df = df[:]
+    new_index = getattr(df, axis)
+    if glue is not None:
+        new_index = [dict(zip(new_index.names, entry)) for entry in new_index]
+        new_index = [glue.format_map(mapping) for mapping in new_index]
+        setattr(df, axis, new_index)
+        return df
+    sep = "_" if sep is None else sep
+    levels = [level for level in new_index.levels]
+    all_strings = all(map(is_string_dtype, levels))
+    if all_strings:
+        no_empty_string = all((entry != "").all() for entry in levels)
+        if no_empty_string:
+            new_index = new_index.map(sep.join)
+            setattr(df, axis, new_index)
+            return df
+    new_index = (map(str, entry) for entry in new_index)
+    new_index = [
+        # faster to use a list comprehension within string.join
+        # compared to a generator
+        # https://stackoverflow.com/a/37782238
+        sep.join([entry for entry in word if entry])
+        for word in new_index
+    ]
+    setattr(df, axis, new_index)
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ complete + + +

+ +
+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ complete(df, *columns, sort=False, by=None, fill_value=None, explicit=True) + +

+ + +
+ +

Complete a data frame with missing combinations of data.

+

It is modeled after tidyr's complete function. +In a way, it is the inverse of pd.DataFrame.dropna, as it exposes +implicitly missing rows.

+

The variable columns parameter can be a column name, +a list of column names, +or a pandas Index, Series, or DataFrame. +If a pandas Index, Series, or DataFrame is passed, it should +have a name or names that exist in df.

+

A callable can also be passed - the callable should evaluate +to a pandas Index, Series, or DataFrame, +and the names of the pandas object should exist in df.

+

A dictionary can also be passed - +the values of the dictionary should be +either a 1D array +or a callable that evaluates to a +1D array, +while the keys of the dictionary +should exist in df.

+

Users should ensure that the pandas object is unique and/or sorted +- no checks are done to ensure uniqueness and/or sortedness.

+

If by is present, the DataFrame is completed per group. +by should be a column name, or a list of column names.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> import numpy as np
+>>> df = pd.DataFrame(
+...     {
+...         "Year": [1999, 2000, 2004, 1999, 2004],
+...         "Taxon": [
+...             "Saccharina",
+...             "Saccharina",
+...             "Saccharina",
+...             "Agarum",
+...             "Agarum",
+...         ],
+...         "Abundance": [4, 5, 2, 1, 8],
+...     }
+... )
+>>> df
+   Year       Taxon  Abundance
+0  1999  Saccharina          4
+1  2000  Saccharina          5
+2  2004  Saccharina          2
+3  1999      Agarum          1
+4  2004      Agarum          8
+
+

Expose missing pairings of Year and Taxon:

+
>>> df.complete("Year", "Taxon", sort=True)
+   Year       Taxon  Abundance
+0  1999      Agarum        1.0
+1  1999  Saccharina        4.0
+2  2000      Agarum        NaN
+3  2000  Saccharina        5.0
+4  2004      Agarum        8.0
+5  2004  Saccharina        2.0
+
+

Expose missing years from 1999 to 2004:

+
>>> index = pd.Index(range(1999,2005),name='Year')
+>>> df.complete(index, "Taxon", sort=True)
+    Year       Taxon  Abundance
+0   1999      Agarum        1.0
+1   1999  Saccharina        4.0
+2   2000      Agarum        NaN
+3   2000  Saccharina        5.0
+4   2001      Agarum        NaN
+5   2001  Saccharina        NaN
+6   2002      Agarum        NaN
+7   2002  Saccharina        NaN
+8   2003      Agarum        NaN
+9   2003  Saccharina        NaN
+10  2004      Agarum        8.0
+11  2004  Saccharina        2.0
+
+

A dictionary can be used as well:

+
>>> dictionary = {'Year':range(1999,2005)}
+>>> df.complete(dictionary, "Taxon", sort=True)
+    Year       Taxon  Abundance
+0   1999      Agarum        1.0
+1   1999  Saccharina        4.0
+2   2000      Agarum        NaN
+3   2000  Saccharina        5.0
+4   2001      Agarum        NaN
+5   2001  Saccharina        NaN
+6   2002      Agarum        NaN
+7   2002  Saccharina        NaN
+8   2003      Agarum        NaN
+9   2003  Saccharina        NaN
+10  2004      Agarum        8.0
+11  2004  Saccharina        2.0
+
+

Fill missing values:

+
>>> df = pd.DataFrame(
+...     dict(
+...         group=(1, 2, 1, 2),
+...         item_id=(1, 2, 2, 3),
+...         item_name=("a", "a", "b", "b"),
+...         value1=(1, np.nan, 3, 4),
+...         value2=range(4, 8),
+...     )
+... )
+>>> df
+   group  item_id item_name  value1  value2
+0      1        1         a     1.0       4
+1      2        2         a     NaN       5
+2      1        2         b     3.0       6
+3      2        3         b     4.0       7
+
+
>>> df.complete(
+...     "group",
+...     ["item_id", "item_name"],
+...     fill_value={"value1": 0, "value2": 99},
+...     sort=True
+... )
+   group  item_id item_name  value1  value2
+0      1        1         a     1.0     4.0
+1      1        2         a     0.0    99.0
+2      1        2         b     3.0     6.0
+3      1        3         b     0.0    99.0
+4      2        1         a     0.0    99.0
+5      2        2         a     0.0     5.0
+6      2        2         b     0.0    99.0
+7      2        3         b     4.0     7.0
+
+

Limit the fill to only implicit missing values +by setting explicit to False:

+
>>> df.complete(
+...     "group",
+...     ["item_id", "item_name"],
+...     fill_value={"value1": 0, "value2": 99},
+...     explicit=False,
+...     sort=True
+... )
+   group  item_id item_name  value1  value2
+0      1        1         a     1.0     4.0
+1      1        2         a     0.0    99.0
+2      1        2         b     3.0     6.0
+3      1        3         b     0.0    99.0
+4      2        1         a     0.0    99.0
+5      2        2         a     NaN     5.0
+6      2        2         b     0.0    99.0
+7      2        3         b     4.0     7.0
+
+

Expose missing rows per group, using a callable:

+
>>> df = pd.DataFrame(
+...     {
+...         "state": ["CA", "CA", "HI", "HI", "HI", "NY", "NY"],
+...         "year": [2010, 2013, 2010, 2012, 2016, 2009, 2013],
+...         "value": [1, 3, 1, 2, 3, 2, 5],
+...     }
+... )
+>>> df
+  state  year  value
+0    CA  2010      1
+1    CA  2013      3
+2    HI  2010      1
+3    HI  2012      2
+4    HI  2016      3
+5    NY  2009      2
+6    NY  2013      5
+
+
>>> def new_year_values(df):
+...     return pd.RangeIndex(start=df.year.min(), stop=df.year.max() + 1, name='year')
+>>> df.complete(new_year_values, by='state',sort=True)
+    state  year  value
+0     CA  2010    1.0
+1     CA  2011    NaN
+2     CA  2012    NaN
+3     CA  2013    3.0
+4     HI  2010    1.0
+5     HI  2011    NaN
+6     HI  2012    2.0
+7     HI  2013    NaN
+8     HI  2014    NaN
+9     HI  2015    NaN
+10    HI  2016    3.0
+11    NY  2009    2.0
+12    NY  2010    NaN
+13    NY  2011    NaN
+14    NY  2012    NaN
+15    NY  2013    5.0
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ *columns + + Any + +
+

This refers to the columns to be completed. +It could be a column name, +a list of column names, +or a pandas Index, Series, or DataFrame.

+

It can also be a callable that gets evaluated +to a pandas Index, Series, or DataFrame.

+

It can also be a dictionary, +where the values are either a 1D array +or a callable that evaluates to a +1D array, +while the keys of the dictionary +should exist in df.

+
+
+ () +
+ sort + + bool + +
+

Sort DataFrame based on *columns.

+
+
+ False +
+ by + + str | list + +
+

Label or list of labels to group by. +The explicit missing rows are returned per group.

+
+
+ None +
+ fill_value + + dict | Any + +
+

Scalar value to use instead of NaN +for missing combinations. A dictionary, mapping column names +to scalar values, is also accepted.

+
+
+ None +
+ explicit + + bool + +
+

Determines if only implicitly missing values +should be filled (False), or all nulls existing in the dataframe +(True). explicit is applicable only +if fill_value is not None.

+
+
+ True +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with explicit missing rows, if any.

+
+
+ +
+ Source code in janitor/functions/complete.py +
 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
@pf.register_dataframe_method
+def complete(
+    df: pd.DataFrame,
+    *columns: Any,
+    sort: bool = False,
+    by: str | list = None,
+    fill_value: dict | Any = None,
+    explicit: bool = True,
+) -> pd.DataFrame:
+    """
+    Complete a data frame with missing combinations of data.
+
+    It is modeled after tidyr's `complete` function.
+    In a way, it is the inverse of `pd.DataFrame.dropna`, as it exposes
+    implicitly missing rows.
+
+    The variable `columns` parameter can be a column name,
+    a list of column names,
+    or a pandas Index, Series, or DataFrame.
+    If a pandas Index, Series, or DataFrame is passed, it should
+    have a name or names that exist in `df`.
+
+    A callable can also be passed - the callable should evaluate
+    to a pandas Index, Series, or DataFrame,
+    and the names of the pandas object should exist in `df`.
+
+    A dictionary can also be passed -
+    the values of the dictionary should be
+    either a 1D array
+    or a callable that evaluates to a
+    1D array,
+    while the keys of the dictionary
+    should exist in `df`.
+
+    Users should ensure that the pandas object is unique and/or sorted
+    - no checks are done to ensure uniqueness and/or sortedness.
+
+    If `by` is present, the DataFrame is *completed* per group.
+    `by` should be a column name, or a list of column names.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> import numpy as np
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "Year": [1999, 2000, 2004, 1999, 2004],
+        ...         "Taxon": [
+        ...             "Saccharina",
+        ...             "Saccharina",
+        ...             "Saccharina",
+        ...             "Agarum",
+        ...             "Agarum",
+        ...         ],
+        ...         "Abundance": [4, 5, 2, 1, 8],
+        ...     }
+        ... )
+        >>> df
+           Year       Taxon  Abundance
+        0  1999  Saccharina          4
+        1  2000  Saccharina          5
+        2  2004  Saccharina          2
+        3  1999      Agarum          1
+        4  2004      Agarum          8
+
+        Expose missing pairings of `Year` and `Taxon`:
+        >>> df.complete("Year", "Taxon", sort=True)
+           Year       Taxon  Abundance
+        0  1999      Agarum        1.0
+        1  1999  Saccharina        4.0
+        2  2000      Agarum        NaN
+        3  2000  Saccharina        5.0
+        4  2004      Agarum        8.0
+        5  2004  Saccharina        2.0
+
+        Expose missing years from 1999 to 2004:
+        >>> index = pd.Index(range(1999,2005),name='Year')
+        >>> df.complete(index, "Taxon", sort=True)
+            Year       Taxon  Abundance
+        0   1999      Agarum        1.0
+        1   1999  Saccharina        4.0
+        2   2000      Agarum        NaN
+        3   2000  Saccharina        5.0
+        4   2001      Agarum        NaN
+        5   2001  Saccharina        NaN
+        6   2002      Agarum        NaN
+        7   2002  Saccharina        NaN
+        8   2003      Agarum        NaN
+        9   2003  Saccharina        NaN
+        10  2004      Agarum        8.0
+        11  2004  Saccharina        2.0
+
+        A dictionary can be used as well:
+        >>> dictionary = {'Year':range(1999,2005)}
+        >>> df.complete(dictionary, "Taxon", sort=True)
+            Year       Taxon  Abundance
+        0   1999      Agarum        1.0
+        1   1999  Saccharina        4.0
+        2   2000      Agarum        NaN
+        3   2000  Saccharina        5.0
+        4   2001      Agarum        NaN
+        5   2001  Saccharina        NaN
+        6   2002      Agarum        NaN
+        7   2002  Saccharina        NaN
+        8   2003      Agarum        NaN
+        9   2003  Saccharina        NaN
+        10  2004      Agarum        8.0
+        11  2004  Saccharina        2.0
+
+        Fill missing values:
+        >>> df = pd.DataFrame(
+        ...     dict(
+        ...         group=(1, 2, 1, 2),
+        ...         item_id=(1, 2, 2, 3),
+        ...         item_name=("a", "a", "b", "b"),
+        ...         value1=(1, np.nan, 3, 4),
+        ...         value2=range(4, 8),
+        ...     )
+        ... )
+        >>> df
+           group  item_id item_name  value1  value2
+        0      1        1         a     1.0       4
+        1      2        2         a     NaN       5
+        2      1        2         b     3.0       6
+        3      2        3         b     4.0       7
+
+        >>> df.complete(
+        ...     "group",
+        ...     ["item_id", "item_name"],
+        ...     fill_value={"value1": 0, "value2": 99},
+        ...     sort=True
+        ... )
+           group  item_id item_name  value1  value2
+        0      1        1         a     1.0     4.0
+        1      1        2         a     0.0    99.0
+        2      1        2         b     3.0     6.0
+        3      1        3         b     0.0    99.0
+        4      2        1         a     0.0    99.0
+        5      2        2         a     0.0     5.0
+        6      2        2         b     0.0    99.0
+        7      2        3         b     4.0     7.0
+
+        Limit the fill to only implicit missing values
+        by setting explicit to `False`:
+        >>> df.complete(
+        ...     "group",
+        ...     ["item_id", "item_name"],
+        ...     fill_value={"value1": 0, "value2": 99},
+        ...     explicit=False,
+        ...     sort=True
+        ... )
+           group  item_id item_name  value1  value2
+        0      1        1         a     1.0     4.0
+        1      1        2         a     0.0    99.0
+        2      1        2         b     3.0     6.0
+        3      1        3         b     0.0    99.0
+        4      2        1         a     0.0    99.0
+        5      2        2         a     NaN     5.0
+        6      2        2         b     0.0    99.0
+        7      2        3         b     4.0     7.0
+
+        Expose missing rows per group, using a callable:
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "state": ["CA", "CA", "HI", "HI", "HI", "NY", "NY"],
+        ...         "year": [2010, 2013, 2010, 2012, 2016, 2009, 2013],
+        ...         "value": [1, 3, 1, 2, 3, 2, 5],
+        ...     }
+        ... )
+        >>> df
+          state  year  value
+        0    CA  2010      1
+        1    CA  2013      3
+        2    HI  2010      1
+        3    HI  2012      2
+        4    HI  2016      3
+        5    NY  2009      2
+        6    NY  2013      5
+
+        >>> def new_year_values(df):
+        ...     return pd.RangeIndex(start=df.year.min(), stop=df.year.max() + 1, name='year')
+        >>> df.complete(new_year_values, by='state',sort=True)
+            state  year  value
+        0     CA  2010    1.0
+        1     CA  2011    NaN
+        2     CA  2012    NaN
+        3     CA  2013    3.0
+        4     HI  2010    1.0
+        5     HI  2011    NaN
+        6     HI  2012    2.0
+        7     HI  2013    NaN
+        8     HI  2014    NaN
+        9     HI  2015    NaN
+        10    HI  2016    3.0
+        11    NY  2009    2.0
+        12    NY  2010    NaN
+        13    NY  2011    NaN
+        14    NY  2012    NaN
+        15    NY  2013    5.0
+
+    Args:
+        df: A pandas DataFrame.
+        *columns: This refers to the columns to be completed.
+            It could be a column name,
+            a list of column names,
+            or a pandas Index, Series, or DataFrame.
+
+            It can also be a callable that gets evaluated
+            to a pandas Index, Series, or DataFrame.
+
+            It can also be a dictionary,
+            where the values are either a 1D array
+            or a callable that evaluates to a
+            1D array,
+            while the keys of the dictionary
+            should exist in `df`.
+        sort: Sort DataFrame based on *columns.
+        by: Label or list of labels to group by.
+            The explicit missing rows are returned per group.
+        fill_value: Scalar value to use instead of NaN
+            for missing combinations. A dictionary, mapping column names
+            to scalar values, is also accepted.
+        explicit: Determines if only implicitly missing values
+            should be filled (`False`), or all nulls existing in the dataframe
+            (`True`). `explicit` is applicable only
+            if `fill_value` is not `None`.
+
+    Returns:
+        A pandas DataFrame with explicit missing rows, if any.
+    """  # noqa: E501
+
+    # No columns requested: there is nothing to complete, so the input
+    # DataFrame is handed back as is (same object, not a copy).
+    if not columns:
+        return df
+    # All remaining work is delegated to the private helper.
+    return _computations_complete(df, columns, sort, by, fill_value, explicit)
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ concatenate_columns + + +

+ +
+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ concatenate_columns(df, column_names, new_column_name, sep='-', ignore_empty=True) + +

+ + +
+ +

Concatenates the set of columns into a single column.

+

Used to quickly generate an index based on a group of columns.

+

This method mutates the original DataFrame.

+ + +

Examples:

+

Concatenate two columns row-wise.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": [1, 3, 5], "b": list("xyz")})
+>>> df
+   a  b
+0  1  x
+1  3  y
+2  5  z
+>>> df.concatenate_columns(
+...     column_names=["a", "b"], new_column_name="m",
+... )
+   a  b    m
+0  1  x  1-x
+1  3  y  3-y
+2  5  z  5-z
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_names + + List[Hashable] + +
+

A list of columns to concatenate together.

+
+
+ required +
+ new_column_name + + Hashable + +
+

The name of the new column.

+
+
+ required +
+ sep + + str + +
+

The separator between each column's data.

+
+
+ '-' +
+ ignore_empty + + bool + +
+

Ignore null values if they exist.

+
+
+ True +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ JanitorError + +
+

If at least two columns are not provided +within column_names.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with concatenated columns.

+
+
+ +
+ Source code in janitor/functions/concatenate_columns.py +
10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
@pf.register_dataframe_method
+@deprecated_alias(columns="column_names")
+def concatenate_columns(
+    df: pd.DataFrame,
+    column_names: List[Hashable],
+    new_column_name: Hashable,
+    sep: str = "-",
+    ignore_empty: bool = True,
+) -> pd.DataFrame:
+    """Concatenates the set of columns into a single column.
+
+    Used to quickly generate an index based on a group of columns.
+
+    This method mutates the original DataFrame.
+
+    Examples:
+        Concatenate two columns row-wise.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": [1, 3, 5], "b": list("xyz")})
+        >>> df
+           a  b
+        0  1  x
+        1  3  y
+        2  5  z
+        >>> df.concatenate_columns(
+        ...     column_names=["a", "b"], new_column_name="m",
+        ... )
+           a  b    m
+        0  1  x  1-x
+        1  3  y  3-y
+        2  5  z  5-z
+
+    Args:
+        df: A pandas DataFrame.
+        column_names: A list of columns to concatenate together.
+        new_column_name: The name of the new column.
+        sep: The separator between each column's data.
+        ignore_empty: Ignore null values if exists.
+
+    Raises:
+        JanitorError: If at least two columns are not provided
+            within `column_names`.
+
+    Returns:
+        A pandas DataFrame with concatenated columns.
+    """
+    if len(column_names) < 2:
+        raise JanitorError("At least two columns must be specified")
+
+    df[new_column_name] = (
+        df[column_names].astype(str).fillna("").agg(sep.join, axis=1)
+    )
+
+    if ignore_empty:
+
+        def remove_empty_string(x):
+            """Ignore empty/null string values from the concatenated output."""
+            return sep.join(x for x in x.split(sep) if x)
+
+        df[new_column_name] = df[new_column_name].transform(
+            remove_empty_string
+        )
+
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ conditional_join + + +

+ +
+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ conditional_join(df, right, *conditions, how='inner', df_columns=slice(None), right_columns=slice(None), keep='all', use_numba=False, indicator=False, force=False) + +

+ + +
+ +

The conditional_join function operates similarly to pd.merge, +but supports joins on inequality operators, +or a combination of equi and non-equi joins.

+

Joins solely on equality are not supported.

+

If the join is solely on equality, pd.merge function +covers that; if you are interested in nearest joins, asof joins, +or rolling joins, then pd.merge_asof covers that. +There is also pandas' IntervalIndex, which is efficient for range joins, +especially if the intervals do not overlap.

+

Column selection in df_columns and right_columns is possible using the +select syntax.

+

Performance might be improved by setting use_numba to True - +this can be handy for equi joins that have lots of duplicated keys. +This can also be handy for non-equi joins, where there are more than +two join conditions, +or there is significant overlap in the range join columns. +This assumes that numba is installed.

+

A noticeable performance gain can be observed for range joins, +if both join columns from the right dataframe +are monotonically increasing.

+

This function returns rows, if any, where values from df meet the +condition(s) for values from right. The conditions are passed in +as a variable argument of tuples, where the tuple is of +the form (left_on, right_on, op); left_on is the column +label from df, right_on is the column label from right, +while op is the operator.

+

For multiple conditions, the and(&) +operator is used to combine the results of the individual conditions.

+

In some scenarios there might be performance gains if the less than join, +or the greater than join condition, or the range condition +is executed before the equi join - pass force=True to force this.

+

The operator can be any of ==, !=, <=, <, >=, >.

+

There is no optimisation for the != operator.

+

The join is done only on the columns.

+

For non-equi joins, only numeric, timedelta and date columns are supported.

+

inner, left, right and outer joins are supported.

+

If the columns from df and right have nothing in common, +a single index column is returned; else, a MultiIndex column +is returned.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df1 = pd.DataFrame({"value_1": [2, 5, 7, 1, 3, 4]})
+>>> df2 = pd.DataFrame({"value_2A": [0, 3, 7, 12, 0, 2, 3, 1],
+...                     "value_2B": [1, 5, 9, 15, 1, 4, 6, 3],
+...                    })
+>>> df1
+   value_1
+0        2
+1        5
+2        7
+3        1
+4        3
+5        4
+>>> df2
+   value_2A  value_2B
+0         0         1
+1         3         5
+2         7         9
+3        12        15
+4         0         1
+5         2         4
+6         3         6
+7         1         3
+
+
>>> df1.conditional_join(
+...     df2,
+...     ("value_1", "value_2A", ">"),
+...     ("value_1", "value_2B", "<")
+... )
+   value_1  value_2A  value_2B
+0        2         1         3
+1        5         3         6
+2        3         2         4
+3        4         3         5
+4        4         3         6
+
+

Select specific columns, after the join:

+
>>> df1.conditional_join(
+...     df2,
+...     ("value_1", "value_2A", ">"),
+...     ("value_1", "value_2B", "<"),
+...     right_columns='value_2B',
+...     how='left'
+... )
+   value_1  value_2B
+0        2       3.0
+1        5       6.0
+2        3       4.0
+3        4       5.0
+4        4       6.0
+5        7       NaN
+6        1       NaN
+
+

Rename columns, before the join:

+
>>> (df1
+...  .rename(columns={'value_1':'left_column'})
+...  .conditional_join(
+...      df2,
+...     ("left_column", "value_2A", ">"),
+...     ("left_column", "value_2B", "<"),
+...      right_columns='value_2B',
+...      how='outer')
+... )
+    left_column  value_2B
+0           2.0       3.0
+1           5.0       6.0
+2           3.0       4.0
+3           4.0       5.0
+4           4.0       6.0
+5           7.0       NaN
+6           1.0       NaN
+7           NaN       1.0
+8           NaN       9.0
+9           NaN      15.0
+10          NaN       1.0
+
+

Get the first match:

+
>>> df1.conditional_join(
+...     df2,
+...     ("value_1", "value_2A", ">"),
+...     ("value_1", "value_2B", "<"),
+...     keep='first'
+... )
+   value_1  value_2A  value_2B
+0        2         1         3
+1        5         3         6
+2        3         2         4
+3        4         3         5
+
+

Get the last match:

+
>>> df1.conditional_join(
+...     df2,
+...     ("value_1", "value_2A", ">"),
+...     ("value_1", "value_2B", "<"),
+...     keep='last'
+... )
+   value_1  value_2A  value_2B
+0        2         1         3
+1        5         3         6
+2        3         2         4
+3        4         3         6
+
+

Add an indicator column:

+
>>> df1.conditional_join(
+...     df2,
+...     ("value_1", "value_2A", ">"),
+...     ("value_1", "value_2B", "<"),
+...     how='outer',
+...     indicator=True
+... )
+    value_1  value_2A  value_2B      _merge
+0       2.0       1.0       3.0        both
+1       5.0       3.0       6.0        both
+2       3.0       2.0       4.0        both
+3       4.0       3.0       5.0        both
+4       4.0       3.0       6.0        both
+5       7.0       NaN       NaN   left_only
+6       1.0       NaN       NaN   left_only
+7       NaN       0.0       1.0  right_only
+8       NaN       7.0       9.0  right_only
+9       NaN      12.0      15.0  right_only
+10      NaN       0.0       1.0  right_only
+
+
+

Version Changed

+
    +
  • 0.24.0
      +
    • Added df_columns, right_columns, keep and use_numba parameters.
    • +
    +
  • +
  • 0.24.1
      +
    • Added indicator parameter.
    • +
    +
  • +
  • 0.25.0
      +
    • col class supported.
    • +
    • Outer join supported. sort_by_appearance deprecated.
    • +
    • Numba support for equi join
    • +
    +
  • +
  • 0.27.0
      +
    • Added support for timedelta dtype.
    • +
    +
  • +
  • 0.28.0
      +
    • col class deprecated.
    • +
    +
  • +
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ right + + Union[DataFrame, Series] + +
+

Named Series or DataFrame to join to.

+
+
+ required +
+ conditions + + Any + +
+

Variable argument of tuple(s) of the form +(left_on, right_on, op), where left_on is the column +label from df, right_on is the column label from right, +while op is the operator. +The col class is also supported (deprecated since 0.28.0). The operator can be any of +==, !=, <=, <, >=, >. For multiple conditions, +the and(&) operator is used to combine the results +of the individual conditions.

+
+
+ () +
+ how + + Literal['inner', 'left', 'right', 'outer'] + +
+

Indicates the type of join to be performed. +It can be one of inner, left, right or outer.

+
+
+ 'inner' +
+ df_columns + + Optional[Any] + +
+

Columns to select from df in the final output dataframe. +Column selection is based on the +select syntax.

+
+
+ slice(None) +
+ right_columns + + Optional[Any] + +
+

Columns to select from right in the final output dataframe. +Column selection is based on the +select syntax.

+
+
+ slice(None) +
+ use_numba + + bool + +
+

Use numba, if installed, to accelerate the computation.

+
+
+ False +
+ keep + + Literal['first', 'last', 'all'] + +
+

Choose whether to return the first match, last match or all matches.

+
+
+ 'all' +
+ indicator + + Optional[Union[bool, str]] + +
+

If True, adds a column to the output DataFrame +called _merge with information on the source of each row. +The column can be given a different name by providing a string argument. +The column will have a Categorical type with the value of left_only +for observations whose merge key only appears in the left DataFrame, +right_only for observations whose merge key +only appears in the right DataFrame, and both if the observation’s +merge key is found in both DataFrames.

+
+
+ False +
+ force + + bool + +
+

If True, force the non-equi join conditions to execute before the equi join.

+
+
+ False +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame of the two merged pandas objects.

+
+
+ +
+ Source code in janitor/functions/conditional_join.py +
 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
@pf.register_dataframe_method
+def conditional_join(
+    df: pd.DataFrame,
+    right: Union[pd.DataFrame, pd.Series],
+    *conditions: Any,
+    how: Literal["inner", "left", "right", "outer"] = "inner",
+    df_columns: Optional[Any] = slice(None),
+    right_columns: Optional[Any] = slice(None),
+    keep: Literal["first", "last", "all"] = "all",
+    use_numba: bool = False,
+    indicator: Optional[Union[bool, str]] = False,
+    force: bool = False,
+) -> pd.DataFrame:
+    """The conditional_join function operates similarly to `pd.merge`,
+    but supports joins on inequality operators,
+    or a combination of equi and non-equi joins.
+
+    Joins solely on equality are not supported.
+
+    If the join is solely on equality, `pd.merge` function
+    covers that; if you are interested in nearest joins, asof joins,
+    or rolling joins, then `pd.merge_asof` covers that.
+    There is also pandas' IntervalIndex, which is efficient for range joins,
+    especially if the intervals do not overlap.
+
+    Column selection in `df_columns` and `right_columns` is possible using the
+    [`select`][janitor.functions.select.select] syntax.
+
+    Performance might be improved by setting `use_numba` to `True` -
+    this can be handy for equi joins that have lots of duplicated keys.
+    This can also be handy for non-equi joins, where there are more than
+    two join conditions,
+    or there is significant overlap in the range join columns.
+    This assumes that `numba` is installed.
+
+    Noticeable performance can be observed for range joins,
+    if both join columns from the right dataframe
+    are monotonically increasing.
+
+    This function returns rows, if any, where values from `df` meet the
+    condition(s) for values from `right`. The conditions are passed in
+    as a variable argument of tuples, where the tuple is of
+    the form `(left_on, right_on, op)`; `left_on` is the column
+    label from `df`, `right_on` is the column label from `right`,
+    while `op` is the operator.
+
+    For multiple conditions, the and(`&`)
+    operator is used to combine the results of the individual conditions.
+
+    In some scenarios there might be performance gains if the less than join,
+    or the greater than join condition, or the range condition
+    is executed before the equi join - pass `force=True` to force this.
+
+    The operator can be any of `==`, `!=`, `<=`, `<`, `>=`, `>`.
+
+    There is no optimisation for the `!=` operator.
+
+    The join is done only on the columns.
+
+    For non-equi joins, only numeric, timedelta and date columns are supported.
+
+    `inner`, `left`, `right` and `outer` joins are supported.
+
+    If the columns from `df` and `right` have nothing in common,
+    a single index column is returned; else, a MultiIndex column
+    is returned.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df1 = pd.DataFrame({"value_1": [2, 5, 7, 1, 3, 4]})
+        >>> df2 = pd.DataFrame({"value_2A": [0, 3, 7, 12, 0, 2, 3, 1],
+        ...                     "value_2B": [1, 5, 9, 15, 1, 4, 6, 3],
+        ...                    })
+        >>> df1
+           value_1
+        0        2
+        1        5
+        2        7
+        3        1
+        4        3
+        5        4
+        >>> df2
+           value_2A  value_2B
+        0         0         1
+        1         3         5
+        2         7         9
+        3        12        15
+        4         0         1
+        5         2         4
+        6         3         6
+        7         1         3
+
+        >>> df1.conditional_join(
+        ...     df2,
+        ...     ("value_1", "value_2A", ">"),
+        ...     ("value_1", "value_2B", "<")
+        ... )
+           value_1  value_2A  value_2B
+        0        2         1         3
+        1        5         3         6
+        2        3         2         4
+        3        4         3         5
+        4        4         3         6
+
+        Select specific columns, after the join:
+        >>> df1.conditional_join(
+        ...     df2,
+        ...     ("value_1", "value_2A", ">"),
+        ...     ("value_1", "value_2B", "<"),
+        ...     right_columns='value_2B',
+        ...     how='left'
+        ... )
+           value_1  value_2B
+        0        2       3.0
+        1        5       6.0
+        2        3       4.0
+        3        4       5.0
+        4        4       6.0
+        5        7       NaN
+        6        1       NaN
+
+        Rename columns, before the join:
+        >>> (df1
+        ...  .rename(columns={'value_1':'left_column'})
+        ...  .conditional_join(
+        ...      df2,
+        ...     ("left_column", "value_2A", ">"),
+        ...     ("left_column", "value_2B", "<"),
+        ...      right_columns='value_2B',
+        ...      how='outer')
+        ... )
+            left_column  value_2B
+        0           2.0       3.0
+        1           5.0       6.0
+        2           3.0       4.0
+        3           4.0       5.0
+        4           4.0       6.0
+        5           7.0       NaN
+        6           1.0       NaN
+        7           NaN       1.0
+        8           NaN       9.0
+        9           NaN      15.0
+        10          NaN       1.0
+
+        Get the first match:
+        >>> df1.conditional_join(
+        ...     df2,
+        ...     ("value_1", "value_2A", ">"),
+        ...     ("value_1", "value_2B", "<"),
+        ...     keep='first'
+        ... )
+           value_1  value_2A  value_2B
+        0        2         1         3
+        1        5         3         6
+        2        3         2         4
+        3        4         3         5
+
+        Get the last match:
+        >>> df1.conditional_join(
+        ...     df2,
+        ...     ("value_1", "value_2A", ">"),
+        ...     ("value_1", "value_2B", "<"),
+        ...     keep='last'
+        ... )
+           value_1  value_2A  value_2B
+        0        2         1         3
+        1        5         3         6
+        2        3         2         4
+        3        4         3         6
+
+        Add an indicator column:
+        >>> df1.conditional_join(
+        ...     df2,
+        ...     ("value_1", "value_2A", ">"),
+        ...     ("value_1", "value_2B", "<"),
+        ...     how='outer',
+        ...     indicator=True
+        ... )
+            value_1  value_2A  value_2B      _merge
+        0       2.0       1.0       3.0        both
+        1       5.0       3.0       6.0        both
+        2       3.0       2.0       4.0        both
+        3       4.0       3.0       5.0        both
+        4       4.0       3.0       6.0        both
+        5       7.0       NaN       NaN   left_only
+        6       1.0       NaN       NaN   left_only
+        7       NaN       0.0       1.0  right_only
+        8       NaN       7.0       9.0  right_only
+        9       NaN      12.0      15.0  right_only
+        10      NaN       0.0       1.0  right_only
+
+    !!! abstract "Version Changed"
+
+        - 0.24.0
+            - Added `df_columns`, `right_columns`, `keep` and `use_numba` parameters.
+        - 0.24.1
+            - Added `indicator` parameter.
+        - 0.25.0
+            - `col` class supported.
+            - Outer join supported. `sort_by_appearance` deprecated.
+            - Numba support for equi join
+        - 0.27.0
+            - Added support for timedelta dtype.
+        - 0.28.0
+            - `col` class deprecated.
+
+    Args:
+        df: A pandas DataFrame.
+        right: Named Series or DataFrame to join to.
+        conditions: Variable argument of tuple(s) of the form
+            `(left_on, right_on, op)`, where `left_on` is the column
+            label from `df`, `right_on` is the column label from `right`,
+            while `op` is the operator.
+            The `col` class is also supported. The operator can be any of
+            `==`, `!=`, `<=`, `<`, `>=`, `>`. For multiple conditions,
+            the and(`&`) operator is used to combine the results
+            of the individual conditions.
+        how: Indicates the type of join to be performed.
+            It can be one of `inner`, `left`, `right` or `outer`.
+        df_columns: Columns to select from `df` in the final output dataframe.
+            Column selection is based on the
+            [`select`][janitor.functions.select.select] syntax.
+        right_columns: Columns to select from `right` in the final output dataframe.
+            Column selection is based on the
+            [`select`][janitor.functions.select.select] syntax.
+        use_numba: Use numba, if installed, to accelerate the computation.
+        keep: Choose whether to return the first match, last match or all matches.
+        indicator: If `True`, adds a column to the output DataFrame
+            called `_merge` with information on the source of each row.
+            The column can be given a different name by providing a string argument.
+            The column will have a Categorical type with the value of `left_only`
+            for observations whose merge key only appears in the left DataFrame,
+            `right_only` for observations whose merge key
+            only appears in the right DataFrame, and `both` if the observation’s
+            merge key is found in both DataFrames.
+        force: If `True`, force the non-equi join conditions to execute before the equi join.
+
+
+    Returns:
+        A pandas DataFrame of the two merged Pandas objects.
+    """  # noqa: E501
+
+    return _conditional_join_compute(
+        df=df,
+        right=right,
+        conditions=conditions,
+        how=how,
+        df_columns=df_columns,
+        right_columns=right_columns,
+        keep=keep,
+        use_numba=use_numba,
+        indicator=indicator,
+        force=force,
+    )
+
+
+
+ +
+ +
+ + +

+ get_join_indices(df, right, conditions, keep='all', use_numba=False, force=False, return_ragged_arrays=False) + +

+ + +
+ +

Convenience function to return the matching indices from an inner join.

+
+

New in version 0.27.0

+
+
+

Version Changed

+
    +
  • 0.29.0
      +
    • Add support for ragged array indices.
    • +
    +
  • +
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ right + + Union[DataFrame, Series] + +
+

Named Series or DataFrame to join to.

+
+
+ required +
+ conditions + + list[tuple[str]] + +
+

A list of tuples of the form +(left_on, right_on, op), where left_on is the column +label from df, right_on is the column label from right, +while op is the operator. +The col class is also supported. The operator can be any of +==, !=, <=, <, >=, >. For multiple conditions, +the and(&) operator is used to combine the results +of the individual conditions.

+
+
+ required +
+ use_numba + + bool + +
+

Use numba, if installed, to accelerate the computation.

+
+
+ False +
+ keep + + Literal['first', 'last', 'all'] + +
+

Choose whether to return the first match, last match or all matches.

+
+
+ 'all' +
+ force + + bool + +
+

If True, force the non-equi join conditions +to execute before the equi join.

+
+
+ False +
+ return_ragged_arrays + + bool + +
+

If True, return slices/ranges of matching right indices +for each matching left index. Not applicable if use_numba is True. +If return_ragged_arrays is True, the join condition +should be a single join, or a range join, +where the right columns are both monotonically increasing.

+
+
+ False +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ tuple[ndarray, ndarray] + +
+

A tuple of indices for the rows in the dataframes that match.

+
+
+ +
+ Source code in janitor/functions/conditional_join.py +
1466
+1467
+1468
+1469
+1470
+1471
+1472
+1473
+1474
+1475
+1476
+1477
+1478
+1479
+1480
+1481
+1482
+1483
+1484
+1485
+1486
+1487
+1488
+1489
+1490
+1491
+1492
+1493
+1494
+1495
+1496
+1497
+1498
+1499
+1500
+1501
+1502
+1503
+1504
+1505
+1506
+1507
+1508
+1509
+1510
+1511
+1512
+1513
+1514
+1515
+1516
+1517
+1518
+1519
+1520
+1521
def get_join_indices(
+    df: pd.DataFrame,
+    right: Union[pd.DataFrame, pd.Series],
+    conditions: list[tuple[str]],
+    keep: Literal["first", "last", "all"] = "all",
+    use_numba: bool = False,
+    force: bool = False,
+    return_ragged_arrays: bool = False,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Convenience function to return the matching indices from an inner join.
+
+    !!! info "New in version 0.27.0"
+
+    !!! abstract "Version Changed"
+
+        - 0.29.0
+            - Add support for ragged array indices.
+
+    Args:
+        df: A pandas DataFrame.
+        right: Named Series or DataFrame to join to.
+        conditions: List of arguments of tuple(s) of the form
+            `(left_on, right_on, op)`, where `left_on` is the column
+            label from `df`, `right_on` is the column label from `right`,
+            while `op` is the operator.
+            The `col` class is also supported. The operator can be any of
+            `==`, `!=`, `<=`, `<`, `>=`, `>`. For multiple conditions,
+            the and(`&`) operator is used to combine the results
+            of the individual conditions.
+        use_numba: Use numba, if installed, to accelerate the computation.
+        keep: Choose whether to return the first match, last match or all matches.
+        force: If `True`, force the non-equi join conditions
+            to execute before the equi join.
+        return_ragged_arrays: If `True`, return slices/ranges of matching right indices
+            for each matching left index. Not applicable if `use_numba` is `True`.
+            If `return_ragged_arrays` is `True`, the join condition
+            should be a single join, or a range join,
+            where the right columns are both monotonically increasing.
+
+    Returns:
+        A tuple of indices for the rows in the dataframes that match.
+    """
+    return _conditional_join_compute(
+        df=df,
+        right=right,
+        conditions=conditions,
+        how="inner",
+        df_columns=None,
+        right_columns=None,
+        keep=keep,
+        use_numba=use_numba,
+        indicator=False,
+        force=force,
+        return_matching_indices=True,
+        return_ragged_arrays=return_ragged_arrays,
+    )
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ convert_date + + +

+ +
+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ convert_excel_date(df, column_names) + +

+ + +
+ +

Convert Excel's serial date format into Python datetime format.

+

This method does not mutate the original DataFrame.

+

Implementation is based on +Stack Overflow.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"date": [39690, 39690, 37118]})
+>>> df
+    date
+0  39690
+1  39690
+2  37118
+>>> df.convert_excel_date('date')
+        date
+0 2008-08-30
+1 2008-08-30
+2 2001-08-15
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_names + + Union[Hashable, list] + +
+

A column name, or a list of column names.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with corrected dates.

+
+
+ +
+ Source code in janitor/functions/convert_date.py +
10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
@pf.register_dataframe_method
+@deprecated_alias(column="column_names")
+def convert_excel_date(
+    df: pd.DataFrame, column_names: Union[Hashable, list]
+) -> pd.DataFrame:
+    """Convert Excel's serial date format into Python datetime format.
+
+    This method does not mutate the original DataFrame.
+
+    Implementation is based on
+    [Stack Overflow](https://stackoverflow.com/questions/38454403/convert-excel-style-date-with-pandas).
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"date": [39690, 39690, 37118]})
+        >>> df
+            date
+        0  39690
+        1  39690
+        2  37118
+        >>> df.convert_excel_date('date')
+                date
+        0 2008-08-30
+        1 2008-08-30
+        2 2001-08-15
+
+    Args:
+        df: A pandas DataFrame.
+        column_names: A column name, or a list of column names.
+
+    Returns:
+        A pandas DataFrame with corrected dates.
+    """  # noqa: E501
+
+    if not isinstance(column_names, list):
+        column_names = [column_names]
+    # https://stackoverflow.com/a/65460255/7175713
+    dictionary = {
+        column_name: pd.to_datetime(
+            df[column_name], unit="D", origin="1899-12-30"
+        )
+        for column_name in column_names
+    }
+
+    return df.assign(**dictionary)
+
+
+
+ +
+ +
+ + +

+ convert_matlab_date(df, column_names) + +

+ + +
+ +

Convert Matlab's serial date number into Python datetime format.

+

Implementation is based on +Stack Overflow.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"date": [737125.0, 737124.815863, 737124.4985, 737124]})
+>>> df
+            date
+0  737125.000000
+1  737124.815863
+2  737124.498500
+3  737124.000000
+>>> df.convert_matlab_date('date')
+                           date
+0 2018-03-06 00:00:00.000000000
+1 2018-03-05 19:34:50.563199671
+2 2018-03-05 11:57:50.399998876
+3 2018-03-05 00:00:00.000000000
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_names + + Union[Hashable, list] + +
+

A column name, or a list of column names.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with corrected dates.

+
+
+ +
+ Source code in janitor/functions/convert_date.py +
 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
@pf.register_dataframe_method
+@deprecated_alias(column="column_names")
+def convert_matlab_date(
+    df: pd.DataFrame, column_names: Union[Hashable, list]
+) -> pd.DataFrame:
+    """Convert Matlab's serial date number into Python datetime format.
+
+    Implementation is based on
+    [Stack Overflow](https://stackoverflow.com/questions/13965740/converting-matlabs-datenum-format-to-python).
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"date": [737125.0, 737124.815863, 737124.4985, 737124]})
+        >>> df
+                    date
+        0  737125.000000
+        1  737124.815863
+        2  737124.498500
+        3  737124.000000
+        >>> df.convert_matlab_date('date')
+                                   date
+        0 2018-03-06 00:00:00.000000000
+        1 2018-03-05 19:34:50.563199671
+        2 2018-03-05 11:57:50.399998876
+        3 2018-03-05 00:00:00.000000000
+
+    Args:
+        df: A pandas DataFrame.
+        column_names: A column name, or a list of column names.
+
+    Returns:
+        A pandas DataFrame with corrected dates.
+    """  # noqa: E501
+    # https://stackoverflow.com/a/49135037/7175713
+    if not isinstance(column_names, list):
+        column_names = [column_names]
+    dictionary = {
+        column_name: pd.to_datetime(df[column_name] - 719529, unit="D")
+        for column_name in column_names
+    }
+
+    return df.assign(**dictionary)
+
+
+
+ +
+ +
+ + +

+ convert_unix_date(df, column_name) + +

+ + +
+ +

Convert unix epoch time into Python datetime format.

+

Note that this ignores local tz and converts all timestamps to naive +datetime based on UTC!

+

This method mutates the original DataFrame.

+
+

Note

+

This function will be deprecated in a 1.x release. +Please use pd.to_datetime instead.

+
+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"date": [1651510462, 53394822, 1126233195]})
+>>> df
+         date
+0  1651510462
+1    53394822
+2  1126233195
+>>> df.convert_unix_date('date')
+                 date
+0 2022-05-02 16:54:22
+1 1971-09-10 23:53:42
+2 2005-09-09 02:33:15
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + Hashable + +
+

A column name.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with corrected dates.

+
+
+ +
+ Source code in janitor/functions/convert_date.py +
105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `pd.to_datetime` instead."
+    )
+)
+@deprecated_alias(column="column_name")
+def convert_unix_date(df: pd.DataFrame, column_name: Hashable) -> pd.DataFrame:
+    """Convert unix epoch time into Python datetime format.
+
+    Note that this ignores local tz and convert all timestamps to naive
+    datetime based on UTC!
+
+    This method mutates the original DataFrame.
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `pd.to_datetime` instead.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"date": [1651510462, 53394822, 1126233195]})
+        >>> df
+                 date
+        0  1651510462
+        1    53394822
+        2  1126233195
+        >>> df.convert_unix_date('date')
+                         date
+        0 2022-05-02 16:54:22
+        1 1971-09-10 23:53:42
+        2 2005-09-09 02:33:15
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: A column name.
+
+    Returns:
+        A pandas DataFrame with corrected dates.
+    """
+
+    try:
+        df[column_name] = pd.to_datetime(df[column_name], unit="s")
+    except OutOfBoundsDatetime:  # Indicates time is in milliseconds.
+        df[column_name] = pd.to_datetime(df[column_name], unit="ms")
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ count_cumulative_unique + + +

+ +
+ +

Implementation of count_cumulative_unique.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ count_cumulative_unique(df, column_name, dest_column_name, case_sensitive=True) + +

+ + +
+ +

Generates a running total of cumulative unique values in a given column.

+

A new column will be created containing a running +count of unique values in the specified column. +If case_sensitive is True, then the case of +any letters will matter (i.e., a != A); +otherwise, the case of any letters will not matter.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "letters": list("aabABb"),
+...     "numbers": range(4, 10),
+... })
+>>> df
+  letters  numbers
+0       a        4
+1       a        5
+2       b        6
+3       A        7
+4       B        8
+5       b        9
+>>> df.count_cumulative_unique(
+...     column_name="letters",
+...     dest_column_name="letters_unique_count",
+... )
+  letters  numbers  letters_unique_count
+0       a        4                     1
+1       a        5                     1
+2       b        6                     2
+3       A        7                     3
+4       B        8                     4
+5       b        9                     4
+
+

Cumulative counts, ignoring casing.

+
>>> df.count_cumulative_unique(
+...     column_name="letters",
+...     dest_column_name="letters_unique_count",
+...     case_sensitive=False,
+... )
+  letters  numbers  letters_unique_count
+0       a        4                     1
+1       a        5                     1
+2       b        6                     2
+3       A        7                     2
+4       B        8                     2
+5       b        9                     2
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + Hashable + +
+

Name of the column containing values from which a +running count of unique values will be created.

+
+
+ required +
+ dest_column_name + + str + +
+

The name of the new column containing the +cumulative count of unique values that will be created.

+
+
+ required +
+ case_sensitive + + bool + +
+

Whether uppercase and lowercase letters +are treated as distinct values. Only valid with string-like columns.

+
+
+ True +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ TypeError + +
+

If case_sensitive is False when counting a non-string +column_name.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with a new column containing a cumulative +count of unique values from another column.

+
+
+ +
+ Source code in janitor/functions/count_cumulative_unique.py +
 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
@pf.register_dataframe_method
+def count_cumulative_unique(
+    df: pd.DataFrame,
+    column_name: Hashable,
+    dest_column_name: str,
+    case_sensitive: bool = True,
+) -> pd.DataFrame:
+    """Generates a running total of cumulative unique values in a given column.
+
+    A new column will be created containing a running
+    count of unique values in the specified column.
+    If `case_sensitive` is `True`, then the case of
+    any letters will matter (i.e., `a != A`);
+    otherwise, the case of any letters will not matter.
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "letters": list("aabABb"),
+        ...     "numbers": range(4, 10),
+        ... })
+        >>> df
+          letters  numbers
+        0       a        4
+        1       a        5
+        2       b        6
+        3       A        7
+        4       B        8
+        5       b        9
+        >>> df.count_cumulative_unique(
+        ...     column_name="letters",
+        ...     dest_column_name="letters_unique_count",
+        ... )
+          letters  numbers  letters_unique_count
+        0       a        4                     1
+        1       a        5                     1
+        2       b        6                     2
+        3       A        7                     3
+        4       B        8                     4
+        5       b        9                     4
+
+        Cumulative counts, ignoring casing.
+
+        >>> df.count_cumulative_unique(
+        ...     column_name="letters",
+        ...     dest_column_name="letters_unique_count",
+        ...     case_sensitive=False,
+        ... )
+          letters  numbers  letters_unique_count
+        0       a        4                     1
+        1       a        5                     1
+        2       b        6                     2
+        3       A        7                     2
+        4       B        8                     2
+        5       b        9                     2
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: Name of the column containing values from which a
+            running count of unique values will be created.
+        dest_column_name: The name of the new column containing the
+            cumulative count of unique values that will be created.
+        case_sensitive: Whether or not uppercase and lowercase letters
+            will be considered equal. Only valid with string-like columns.
+
+    Raises:
+        TypeError: If `case_sensitive` is False when counting a non-string
+            `column_name`.
+
+    Returns:
+        A pandas DataFrame with a new column containing a cumulative
+            count of unique values from another column.
+    """
+    check_column(df, column_name)
+    check_column(df, dest_column_name, present=False)
+
+    counter = df[column_name]
+    if not case_sensitive:
+        try:
+            # Make it so that the same uppercase and lowercase
+            # letter are treated as one unique value
+            counter = counter.str.lower()
+        except (AttributeError, TypeError) as e:
+            # AttributeError is raised by pandas when .str is used on
+            # non-string types, e.g. int.
+            # TypeError is raised by pandas when .str.lower is used on a
+            # forbidden string type, e.g. bytes.
+            raise TypeError(
+                "case_sensitive=False can only be used with a string-like "
+                f"type. Column {column_name} is {counter.dtype} type."
+            ) from e
+
+    counter = (
+        counter.groupby(counter, sort=False).cumcount().to_numpy(copy=False)
+    )
+    counter = np.cumsum(counter == 0)
+
+    return df.assign(**{dest_column_name: counter})
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ currency_column_to_numeric + + +

+ +
+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ currency_column_to_numeric(df, column_name, cleaning_style=None, cast_non_numeric=None, fill_all_non_numeric=None, remove_non_numeric=False) + +

+ + +
+ +

Convert currency column to numeric.

+

This method does not mutate the original DataFrame.

+

This method allows one to take a column containing currency values, +inadvertently imported as a string, and cast it as a float. This is +usually the case when reading CSV files that were modified in Excel. +Empty strings (i.e. '') are retained as NaN values.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "a_col": [" 24.56", "-", "(12.12)", "1,000,000"],
+...     "d_col": ["", "foo", "1.23 dollars", "-1,000 yen"],
+... })
+>>> df
+       a_col         d_col
+0      24.56
+1          -           foo
+2    (12.12)  1.23 dollars
+3  1,000,000    -1,000 yen
+
+

The default cleaning style.

+
>>> df.currency_column_to_numeric("d_col")
+       a_col    d_col
+0      24.56      NaN
+1          -      NaN
+2    (12.12)     1.23
+3  1,000,000 -1000.00
+
+

The accounting cleaning style.

+
>>> df.currency_column_to_numeric("a_col", cleaning_style="accounting")
+        a_col         d_col
+0       24.56
+1        0.00           foo
+2      -12.12  1.23 dollars
+3  1000000.00    -1,000 yen
+
+

Valid cleaning styles are:

+
    +
  • None: Default cleaning is applied. Empty strings are always retained as + NaN. Numbers, -, . are extracted and the resulting string + is cast to a float.
  • +
  • 'accounting': Replaces numbers in parentheses with negatives, removes commas.
  • +
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

The pandas DataFrame.

+
+
+ required +
+ column_name + + str + +
+

The column containing currency values to modify.

+
+
+ required +
+ cleaning_style + + Optional[str] + +
+

What style of cleaning to perform.

+
+
+ None +
+ cast_non_numeric + + Optional[dict] + +
+

A dict of how to coerce certain strings to numeric +type. For example, if there are values of 'REORDER' in the DataFrame, +{'REORDER': 0} will cast all instances of 'REORDER' to 0. +Only takes effect in the default cleaning style.

+
+
+ None +
+ fill_all_non_numeric + + Optional[Union[float, int]] + +
+

Similar to cast_non_numeric, but fills all +strings to the same value. For example, fill_all_non_numeric=1, will +make everything that doesn't coerce to a currency 1. +Only takes effect in the default cleaning style.

+
+
+ None +
+ remove_non_numeric + + bool + +
+

If set to True, rows of df that contain +non-numeric values in the column_name column will be removed. +Only takes effect in the default cleaning style.

+
+
+ False +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If cleaning_style is not one of the accepted styles.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/currency_column_to_numeric.py +
 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
@pf.register_dataframe_method
+@deprecated_alias(col_name="column_name", type="cleaning_style")
+def currency_column_to_numeric(
+    df: pd.DataFrame,
+    column_name: str,
+    cleaning_style: Optional[str] = None,
+    cast_non_numeric: Optional[dict] = None,
+    fill_all_non_numeric: Optional[Union[float, int]] = None,
+    remove_non_numeric: bool = False,
+) -> pd.DataFrame:
+    """Convert currency column to numeric.
+
+    This method does not mutate the original DataFrame.
+
+    This method allows one to take a column containing currency values,
+    inadvertently imported as a string, and cast it as a float. This is
+    usually the case when reading CSV files that were modified in Excel.
+    Empty strings (i.e. `''`) are retained as `NaN` values.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "a_col": [" 24.56", "-", "(12.12)", "1,000,000"],
+        ...     "d_col": ["", "foo", "1.23 dollars", "-1,000 yen"],
+        ... })
+        >>> df  # doctest: +NORMALIZE_WHITESPACE
+               a_col         d_col
+        0      24.56
+        1          -           foo
+        2    (12.12)  1.23 dollars
+        3  1,000,000    -1,000 yen
+
+        The default cleaning style.
+
+        >>> df.currency_column_to_numeric("d_col")
+               a_col    d_col
+        0      24.56      NaN
+        1          -      NaN
+        2    (12.12)     1.23
+        3  1,000,000 -1000.00
+
+        The accounting cleaning style.
+
+        >>> df.currency_column_to_numeric("a_col", cleaning_style="accounting")  # doctest: +NORMALIZE_WHITESPACE
+                a_col         d_col
+        0       24.56
+        1        0.00           foo
+        2      -12.12  1.23 dollars
+        3  1000000.00    -1,000 yen
+
+    Valid cleaning styles are:
+
+    - `None`: Default cleaning is applied. Empty strings are always retained as
+        `NaN`. Numbers, `-`, `.` are extracted and the resulting string
+        is cast to a float.
+    - `'accounting'`: Replaces numbers in parentheses with negatives, removes commas.
+
+    Args:
+        df: The pandas DataFrame.
+        column_name: The column containing currency values to modify.
+        cleaning_style: What style of cleaning to perform.
+        cast_non_numeric: A dict of how to coerce certain strings to numeric
+            type. For example, if there are values of 'REORDER' in the DataFrame,
+            `{'REORDER': 0}` will cast all instances of 'REORDER' to 0.
+            Only takes effect in the default cleaning style.
+        fill_all_non_numeric: Similar to `cast_non_numeric`, but fills all
+            strings to the same value. For example, `fill_all_non_numeric=1`, will
+            make everything that doesn't coerce to a currency `1`.
+            Only takes effect in the default cleaning style.
+        remove_non_numeric: If set to True, rows of `df` that contain
+            non-numeric values in the `column_name` column will be removed.
+            Only takes effect in the default cleaning style.
+
+    Raises:
+        ValueError: If `cleaning_style` is not one of the accepted styles.
+
+    Returns:
+        A pandas DataFrame.
+    """  # noqa: E501
+
+    check("column_name", column_name, [str])
+    check_column(df, column_name)
+
+    column_series = df[column_name]
+    if cleaning_style == "accounting":
+        outcome = (
+            df[column_name]
+            .str.strip()
+            .str.replace(",", "", regex=False)
+            .str.replace(")", "", regex=False)
+            .str.replace("(", "-", regex=False)
+            .replace({"-": 0.0})
+            .astype(float)
+        )
+        return df.assign(**{column_name: outcome})
+    if cleaning_style is not None:
+        raise ValueError(
+            "`cleaning_style` is expected to be one of ('accounting', None). "
+            f"Got {cleaning_style!r} instead."
+        )
+
+    if cast_non_numeric:
+        check("cast_non_numeric", cast_non_numeric, [dict])
+
+    _make_cc_patrial = partial(
+        _currency_column_to_numeric,
+        cast_non_numeric=cast_non_numeric,
+    )
+    column_series = column_series.apply(_make_cc_patrial)
+
+    if remove_non_numeric:
+        df = df.loc[column_series != "", :]
+
+    # _replace_empty_string_with_none is applied here after the check on
+    # remove_non_numeric since "" is our indicator that a string was coerced
+    # in the original column
+    column_series = _replace_empty_string_with_none(column_series)
+
+    if fill_all_non_numeric is not None:
+        check("fill_all_non_numeric", fill_all_non_numeric, [int, float])
+        column_series = column_series.fillna(fill_all_non_numeric)
+
+    column_series = _replace_original_empty_string_with_none(column_series)
+
+    df = df.assign(**{column_name: pd.to_numeric(column_series)})
+
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ deconcatenate_column + + +

+ +
+ +

Implementation of deconcatenating columns.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ deconcatenate_column(df, column_name, sep=None, new_column_names=None, autoname=None, preserve_position=False) + +

+ + +
+ +

De-concatenates a single column into multiple columns.

+

The column to de-concatenate can be either a collection (list, tuple, ...) +which can be separated out with pd.Series.tolist(), +or a string to slice based on sep.

+

To determine this behaviour automatically, +the first element in the column specified is inspected.

+

If it is a string, then sep must be specified. +Else, the function assumes that it is an iterable type +(e.g. list or tuple), +and will attempt to deconcatenate by splitting the list.

+

Given a column with string values, this is the inverse of the +concatenate_columns +function.

+

Used to quickly split columns out of a single column.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"m": ["1-x", "2-y", "3-z"]})
+>>> df
+     m
+0  1-x
+1  2-y
+2  3-z
+>>> df.deconcatenate_column("m", sep="-", autoname="col")
+     m col1 col2
+0  1-x    1    x
+1  2-y    2    y
+2  3-z    3    z
+
+

The keyword argument preserve_position +takes a boolean (True or False) +that controls whether the new_column_names +will take the original position +of the to-be-deconcatenated column_name:

+
    +
  • When preserve_position=False (default), df.columns change from + [..., column_name, ...] to [..., column_name, ..., new_column_names]. + In other words, the deconcatenated new columns are appended to the right + of the original dataframe and the original column_name is NOT dropped.
  • +
  • When preserve_position=True, df.columns change from + [..., column_name, ...] to [..., new_column_names, ...]. + In other words, the deconcatenated new columns will REPLACE the original + column_name at its original position, and column_name itself + is dropped.
  • +
+

The keyword argument autoname accepts a base string +and then automatically creates numbered column names +based off the base string. +For example, if col is passed in as the argument to autoname, +and 4 columns are created, then the resulting columns will be named +col1, col2, col3, col4. +Numbering is always 1-indexed, not 0-indexed, +in order to make the column names human-friendly.

+

This method does not mutate the original DataFrame.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + Hashable + +
+

The column to split.

+
+
+ required +
+ sep + + Optional[str] + +
+

The separator delimiting the column's data.

+
+
+ None +
+ new_column_names + + Optional[Union[List[str], Tuple[str]]] + +
+

A list of new column names post-splitting.

+
+
+ None +
+ autoname + + str + +
+

A base name for automatically naming the new columns. +Takes precedence over new_column_names if both are provided.

+
+
+ None +
+ preserve_position + + bool + +
+

Boolean for whether or not to preserve original +position of the column upon de-concatenation.

+
+
+ False +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If column_name is not present in the DataFrame.

+
+
+ ValueError + +
+

If sep is not provided and the column values +are of type str.

+
+
+ ValueError + +
+

If neither new_column_names nor autoname +is supplied.

+
+
+ JanitorError + +
+

If incorrect number of names is provided +within new_column_names.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with a deconcatenated column.

+
+
+ +
+ Source code in janitor/functions/deconcatenate_column.py +
 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
@pf.register_dataframe_method
+@deprecated_alias(column="column_name")
+def deconcatenate_column(
+    df: pd.DataFrame,
+    column_name: Hashable,
+    sep: Optional[str] = None,
+    new_column_names: Optional[Union[List[str], Tuple[str]]] = None,
+    autoname: str = None,
+    preserve_position: bool = False,
+) -> pd.DataFrame:
+    """De-concatenates a single column into multiple columns.
+
+    The column to de-concatenate can be either a collection (list, tuple, ...)
+    which can be separated out with `pd.Series.tolist()`,
+    or a string to slice based on `sep`.
+
+    To determine this behaviour automatically,
+    the first element in the column specified is inspected.
+
+    If it is a string, then `sep` must be specified.
+    Else, the function assumes that it is an iterable type
+    (e.g. `list` or `tuple`),
+    and will attempt to deconcatenate by splitting the list.
+
+    Given a column with string values, this is the inverse of the
+    [`concatenate_columns`][janitor.functions.concatenate_columns.concatenate_columns]
+    function.
+
+    Used to quickly split columns out of a single column.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"m": ["1-x", "2-y", "3-z"]})
+        >>> df
+             m
+        0  1-x
+        1  2-y
+        2  3-z
+        >>> df.deconcatenate_column("m", sep="-", autoname="col")
+             m col1 col2
+        0  1-x    1    x
+        1  2-y    2    y
+        2  3-z    3    z
+
+    The keyword argument `preserve_position`
+    takes `True` or `False` boolean
+    that controls whether the `new_column_names`
+    will take the original position
+    of the to-be-deconcatenated `column_name`:
+
+    - When `preserve_position=False` (default), `df.columns` change from
+      `[..., column_name, ...]` to `[..., column_name, ..., new_column_names]`.
+      In other words, the deconcatenated new columns are appended to the right
+      of the original dataframe and the original `column_name` is NOT dropped.
+    - When `preserve_position=True`, `df.columns` change from
+      `[..., column_name, ...]` to `[..., new_column_names, ...]`.
+      In other words, the deconcatenated new column will REPLACE the original
+      `column_name` at its original position, and `column_name` itself
+      is dropped.
+
+    The keyword argument `autoname` accepts a base string
+    and then automatically creates numbered column names
+    based off the base string.
+    For example, if `col` is passed in as the argument to `autoname`,
+    and 4 columns are created, then the resulting columns will be named
+    `col1, col2, col3, col4`.
+    Numbering is always 1-indexed, not 0-indexed,
+    in order to make the column names human-friendly.
+
+    This method does not mutate the original DataFrame.
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: The column to split.
+        sep: The separator delimiting the column's data.
+        new_column_names: A list of new column names post-splitting.
+        autoname: A base name for automatically naming the new columns.
+            Takes precedence over `new_column_names` if both are provided.
+        preserve_position: Boolean for whether or not to preserve original
+            position of the column upon de-concatenation.
+
+    Raises:
+        ValueError: If `column_name` is not present in the DataFrame.
+        ValueError: If `sep` is not provided and the column values
+            are of type `str`.
+        ValueError: If neither `new_column_names` nor `autoname`
+            is supplied.
+        JanitorError: If incorrect number of names is provided
+            within `new_column_names`.
+
+    Returns:
+        A pandas DataFrame with a deconcatenated column.
+    """  # noqa: E501
+
+    if column_name not in df.columns:
+        raise ValueError(f"column name {column_name} not present in DataFrame")
+
+    if isinstance(df[column_name].iloc[0], str):
+        if sep is None:
+            raise ValueError(
+                "`sep` must be specified if the column values "
+                "are of type `str`."
+            )
+        df_deconcat = df[column_name].str.split(sep, expand=True)
+    else:
+        df_deconcat = pd.DataFrame(
+            df[column_name].to_list(), columns=new_column_names, index=df.index
+        )
+
+    if new_column_names is None and autoname is None:
+        raise ValueError(
+            "One of `new_column_names` or `autoname` must be supplied."
+        )
+
+    if autoname:
+        new_column_names = [
+            f"{autoname}{i}" for i in range(1, df_deconcat.shape[1] + 1)
+        ]
+
+    if not len(new_column_names) == df_deconcat.shape[1]:
+        raise JanitorError(
+            f"You need to provide {len(df_deconcat.shape[1])} names "
+            "to `new_column_names`"
+        )
+
+    df_deconcat.columns = new_column_names
+    df_new = pd.concat([df, df_deconcat], axis=1)
+
+    if preserve_position:
+        df_original = df.copy()
+        cols = list(df_original.columns)
+        index_original = cols.index(column_name)
+
+        for i, col_new in enumerate(new_column_names):
+            cols.insert(index_original + i, col_new)
+
+        df_new = df_new.select(cols, axis="columns").drop(columns=column_name)
+
+    return df_new
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ drop_constant_columns + + +

+ +
+ +

Implementation of drop_constant_columns.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ drop_constant_columns(df) + +

+ + +
+ +

Finds and drops the constant columns from a Pandas DataFrame.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> data_dict = {
+...     "a": [1, 1, 1],
+...     "b": [1, 2, 3],
+...     "c": [1, 1, 1],
+...     "d": ["rabbit", "leopard", "lion"],
+...     "e": ["Cambridge", "Shanghai", "Basel"]
+... }
+>>> df = pd.DataFrame(data_dict)
+>>> df
+   a  b  c        d          e
+0  1  1  1   rabbit  Cambridge
+1  1  2  1  leopard   Shanghai
+2  1  3  1     lion      Basel
+>>> df.drop_constant_columns()
+   b        d          e
+0  1   rabbit  Cambridge
+1  2  leopard   Shanghai
+2  3     lion      Basel
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

Input Pandas DataFrame

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

The Pandas DataFrame with the constant columns dropped.

+
+
+ +
+ Source code in janitor/functions/drop_constant_columns.py +
 7
+ 8
+ 9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
@pf.register_dataframe_method
+def drop_constant_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """Finds and drops the constant columns from a Pandas DataFrame.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> data_dict = {
+        ...     "a": [1, 1, 1],
+        ...     "b": [1, 2, 3],
+        ...     "c": [1, 1, 1],
+        ...     "d": ["rabbit", "leopard", "lion"],
+        ...     "e": ["Cambridge", "Shanghai", "Basel"]
+        ... }
+        >>> df = pd.DataFrame(data_dict)
+        >>> df
+           a  b  c        d          e
+        0  1  1  1   rabbit  Cambridge
+        1  1  2  1  leopard   Shanghai
+        2  1  3  1     lion      Basel
+        >>> df.drop_constant_columns()
+           b        d          e
+        0  1   rabbit  Cambridge
+        1  2  leopard   Shanghai
+        2  3     lion      Basel
+
+    Args:
+        df: Input Pandas DataFrame
+
+    Returns:
+        The Pandas DataFrame with the constant columns dropped.
+    """
+    return df.loc[:, df.nunique().ne(1)]
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ drop_duplicate_columns + + +

+ +
+ +

Implementation for drop_duplicate_columns.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ drop_duplicate_columns(df, column_name, nth_index=0) + +

+ + +
+ +

Remove a duplicated column specified by column_name.

+

Specifying nth_index=0 will remove the first column, +nth_index=1 will remove the second column, +and so on and so forth.

+

The equivalent operation in R's tidyverse is: +select(-<column_name>_<nth_index + 1>)

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "a": range(2, 5),
+...     "b": range(3, 6),
+...     "A": range(4, 7),
+...     "a*": range(6, 9),
+... }).clean_names(remove_special=True)
+>>> df
+   a  b  a  a
+0  2  3  4  6
+1  3  4  5  7
+2  4  5  6  8
+>>> df.drop_duplicate_columns(column_name="a", nth_index=1)
+   a  b  a
+0  2  3  6
+1  3  4  7
+2  4  5  8
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame

+
+
+ required +
+ column_name + + Hashable + +
+

Name of duplicated columns.

+
+
+ required +
+ nth_index + + int + +
+

Among the duplicated columns, +select the nth column to drop.

+
+
+ 0 +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame

+
+
+ +
+ Source code in janitor/functions/drop_duplicate_columns.py +
 9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
@pf.register_dataframe_method
+def drop_duplicate_columns(
+    df: pd.DataFrame, column_name: Hashable, nth_index: int = 0
+) -> pd.DataFrame:
+    """Remove a duplicated column specified by `column_name`.
+
+    Specifying `nth_index=0` will remove the first column,
+    `nth_index=1` will remove the second column,
+    and so on and so forth.
+
+    The corresponding tidyverse R's library is:
+    `select(-<column_name>_<nth_index + 1>)`
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "a": range(2, 5),
+        ...     "b": range(3, 6),
+        ...     "A": range(4, 7),
+        ...     "a*": range(6, 9),
+        ... }).clean_names(remove_special=True)
+        >>> df
+           a  b  a  a
+        0  2  3  4  6
+        1  3  4  5  7
+        2  4  5  6  8
+        >>> df.drop_duplicate_columns(column_name="a", nth_index=1)
+           a  b  a
+        0  2  3  6
+        1  3  4  7
+        2  4  5  8
+
+    Args:
+        df: A pandas DataFrame
+        column_name: Name of duplicated columns.
+        nth_index: Among the duplicated columns,
+            select the nth column to drop.
+
+    Returns:
+        A pandas DataFrame
+    """
+    col_indexes = [
+        col_idx
+        for col_idx, col_name in enumerate(df.columns)
+        if col_name == column_name
+    ]
+
+    # Select the column to remove based on nth_index.
+    removed_col_idx = col_indexes[nth_index]
+    # Filter out columns except for the one to be removed.
+    filtered_cols = [
+        c_i for c_i, _ in enumerate(df.columns) if c_i != removed_col_idx
+    ]
+
+    return df.iloc[:, filtered_cols]
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ dropnotnull + + +

+ +
+ +

Implementation source for dropnotnull.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ dropnotnull(df, column_name) + +

+ + +
+ +

Drop rows that do not have null values in the given column.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+
>>> import numpy as np
+>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": [1., np.NaN, 3.], "b": [None, "y", "z"]})
+>>> df
+     a     b
+0  1.0  None
+1  NaN     y
+2  3.0     z
+>>> df.dropnotnull("a")
+    a  b
+1 NaN  y
+>>> df.dropnotnull("b")
+     a     b
+0  1.0  None
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + Hashable + +
+

The column name to drop rows from.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with dropped rows.

+
+
+ +
+ Source code in janitor/functions/dropnotnull.py +
11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
@pf.register_dataframe_method
+@deprecated_alias(column="column_name")
+def dropnotnull(df: pd.DataFrame, column_name: Hashable) -> pd.DataFrame:
+    """Drop rows that do *not* have null values in the given column.
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        >>> import numpy as np
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": [1., np.nan, 3.], "b": [None, "y", "z"]})
+        >>> df
+             a     b
+        0  1.0  None
+        1  NaN     y
+        2  3.0     z
+        >>> df.dropnotnull("a")
+            a  b
+        1 NaN  y
+        >>> df.dropnotnull("b")
+             a     b
+        0  1.0  None
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: The column to test; rows that are not null in it are dropped.
+
+    Returns:
+        A pandas DataFrame holding only the rows where `column_name` is null.
+    """
+    return df[pd.isna(df[column_name])]  # boolean mask keeps only the null rows
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ encode_categorical + + +

+ +
+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ encode_categorical(df, column_names=None, **kwargs) + +

+ + +
+ +

Encode the specified columns with Pandas' category dtype.

+

It is syntactic sugar around pd.Categorical.

+

This method does not mutate the original DataFrame.

+

Simply pass a string, or a sequence of column names to column_names; +alternatively, you can pass kwargs, where the keys are the column names +and the values can either be None, sort, appearance +or a 1-D array-like object.

+
    +
  • None: column is cast to an unordered categorical.
  • +
  • sort: column is cast to an ordered categorical, + with the order defined by the sort-order of the categories.
  • +
  • appearance: column is cast to an ordered categorical, + with the order defined by the order of appearance + in the original column.
  • +
  • 1d-array-like object: column is cast to an ordered categorical, + with the categories and order as specified + in the input array.
  • +
+

column_names and kwargs parameters cannot be used at the same time.

+ + +

Examples:

+

Using column_names

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "foo": ["b", "b", "a", "c", "b"],
+...     "bar": range(4, 9),
+... })
+>>> df
+  foo  bar
+0   b    4
+1   b    5
+2   a    6
+3   c    7
+4   b    8
+>>> df.dtypes
+foo    object
+bar     int64
+dtype: object
+>>> enc_df = df.encode_categorical(column_names="foo")
+>>> enc_df.dtypes
+foo    category
+bar       int64
+dtype: object
+>>> enc_df["foo"].cat.categories
+Index(['a', 'b', 'c'], dtype='object')
+>>> enc_df["foo"].cat.ordered
+False
+
+

Using kwargs to specify an ordered categorical.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "foo": ["b", "b", "a", "c", "b"],
+...     "bar": range(4, 9),
+... })
+>>> df.dtypes
+foo    object
+bar     int64
+dtype: object
+>>> enc_df = df.encode_categorical(foo="appearance")
+>>> enc_df.dtypes
+foo    category
+bar       int64
+dtype: object
+>>> enc_df["foo"].cat.categories
+Index(['b', 'a', 'c'], dtype='object')
+>>> enc_df["foo"].cat.ordered
+True
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame object.

+
+
+ required +
+ column_names + + Union[str, Iterable[str], Hashable] + +
+

A column name or an iterable (list or tuple) +of column names.

+
+
+ None +
+ **kwargs + + Any + +
+

A mapping from column name to either None, +'sort' or 'appearance', or a 1-D array. This is useful +in creating categorical columns that are ordered, or +if the user needs to explicitly specify the categories.

+
+
+ {} +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If both column_names and kwargs are provided.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/encode_categorical.py +
 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
@pf.register_dataframe_method
+@deprecated_alias(columns="column_names")
+def encode_categorical(
+    df: pd.DataFrame,
+    column_names: Union[str, Iterable[str], Hashable] = None,
+    **kwargs: Any,
+) -> pd.DataFrame:
+    """Encode the specified columns with Pandas' [category dtype][cat].
+
+    [cat]: http://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html
+
+    It is syntactic sugar around `pd.Categorical`.
+
+    This method does not mutate the original DataFrame.
+
+    Simply pass a string, or a sequence of column names to `column_names`;
+    alternatively, you can pass kwargs, where the keys are the column names
+    and the values can either be None, `sort`, `appearance`
+    or a 1-D array-like object.
+
+    - None: column is cast to an unordered categorical.
+    - `sort`: column is cast to an ordered categorical,
+              with the order defined by the sort-order of the categories.
+    - `appearance`: column is cast to an ordered categorical,
+                    with the order defined by the order of appearance
+                    in the original column.
+    - 1d-array-like object: column is cast to an ordered categorical,
+                            with the categories and order as specified
+                            in the input array.
+
+    `column_names` and `kwargs` parameters cannot be used at the same time.
+
+    Examples:
+        Using `column_names`
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "foo": ["b", "b", "a", "c", "b"],
+        ...     "bar": range(4, 9),
+        ... })
+        >>> df
+          foo  bar
+        0   b    4
+        1   b    5
+        2   a    6
+        3   c    7
+        4   b    8
+        >>> df.dtypes
+        foo    object
+        bar     int64
+        dtype: object
+        >>> enc_df = df.encode_categorical(column_names="foo")
+        >>> enc_df.dtypes
+        foo    category
+        bar       int64
+        dtype: object
+        >>> enc_df["foo"].cat.categories
+        Index(['a', 'b', 'c'], dtype='object')
+        >>> enc_df["foo"].cat.ordered
+        False
+
+        Using `kwargs` to specify an ordered categorical.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "foo": ["b", "b", "a", "c", "b"],
+        ...     "bar": range(4, 9),
+        ... })
+        >>> df.dtypes
+        foo    object
+        bar     int64
+        dtype: object
+        >>> enc_df = df.encode_categorical(foo="appearance")
+        >>> enc_df.dtypes
+        foo    category
+        bar       int64
+        dtype: object
+        >>> enc_df["foo"].cat.categories
+        Index(['b', 'a', 'c'], dtype='object')
+        >>> enc_df["foo"].cat.ordered
+        True
+
+    Args:
+        df: A pandas DataFrame object.
+        column_names: A column name or an iterable (list or tuple)
+            of column names.
+        **kwargs: A mapping from column name to either `None`,
+            `'sort'` or `'appearance'`, or a 1-D array. This is useful
+            in creating categorical columns that are ordered, or
+            if the user needs to explicitly specify the categories.
+
+    Raises:
+        ValueError: If both `column_names` and `kwargs` are provided.
+
+    Returns:
+        A pandas DataFrame.
+    """  # noqa: E501
+
+    if all((column_names, kwargs)):  # truthiness test: both were supplied
+        raise ValueError(
+            "Only one of `column_names` or `kwargs` can be provided."
+        )
+    # `column_names` covers only the plain (unordered) category dtype;
+    # `kwargs` covers the cases where the user wants an ordered category
+    # or supplies the specific categories used to build the categorical.
+    if column_names is not None:
+        column_names = get_index_labels([column_names], df, axis="columns")
+        dtypes = {col: "category" for col in column_names}
+        return df.astype(dtypes)
+
+    return _computations_as_categorical(df, **kwargs)
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ expand_column + + +

+ +
+ +

Implementation for expand_column.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ expand_column(df, column_name, sep='|', concat=True) + +

+ + +
+ +

Expand a categorical column with multiple labels into dummy-coded columns.

+

Super sugary syntax that wraps pandas.Series.str.get_dummies.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+

Functional usage syntax:

+
>>> import pandas as pd
+>>> df = pd.DataFrame(
+...     {
+...         "col1": ["A, B", "B, C, D", "E, F", "A, E, F"],
+...         "col2": [1, 2, 3, 4],
+...     }
+... )
+>>> df = expand_column(
+...     df,
+...     column_name="col1",
+...     sep=", "  # note space in sep
+... )
+>>> df
+      col1  col2  A  B  C  D  E  F
+0     A, B     1  1  1  0  0  0  0
+1  B, C, D     2  0  1  1  1  0  0
+2     E, F     3  0  0  0  0  1  1
+3  A, E, F     4  1  0  0  0  1  1
+
+

Method chaining syntax:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = (
+...     pd.DataFrame(
+...         {
+...             "col1": ["A, B", "B, C, D", "E, F", "A, E, F"],
+...             "col2": [1, 2, 3, 4],
+...         }
+...     )
+...     .expand_column(
+...         column_name='col1',
+...         sep=', '
+...     )
+... )
+>>> df
+      col1  col2  A  B  C  D  E  F
+0     A, B     1  1  1  0  0  0  0
+1  B, C, D     2  0  1  1  1  0  0
+2     E, F     3  0  0  0  0  1  1
+3  A, E, F     4  1  0  0  0  1  1
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + Hashable + +
+

Which column to expand.

+
+
+ required +
+ sep + + str + +
+

The delimiter, same as +pandas.Series.str.get_dummies's sep.

+
+
+ '|' +
+ concat + + bool + +
+

Whether to return the expanded column concatenated to +the original dataframe (concat=True), or to return it standalone +(concat=False).

+
+
+ True +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with an expanded column.

+
+
+ +
+ Source code in janitor/functions/expand_column.py +
11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
@pf.register_dataframe_method
+@deprecated_alias(column="column_name")
+def expand_column(
+    df: pd.DataFrame,
+    column_name: Hashable,
+    sep: str = "|",
+    concat: bool = True,
+) -> pd.DataFrame:
+    """Expand a categorical column with multiple labels into dummy-coded columns.
+
+    Super sugary syntax that wraps `pandas.Series.str.get_dummies`.
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        Functional usage syntax:
+
+        >>> import pandas as pd
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "col1": ["A, B", "B, C, D", "E, F", "A, E, F"],
+        ...         "col2": [1, 2, 3, 4],
+        ...     }
+        ... )
+        >>> df = expand_column(
+        ...     df,
+        ...     column_name="col1",
+        ...     sep=", "  # note space in sep
+        ... )
+        >>> df
+              col1  col2  A  B  C  D  E  F
+        0     A, B     1  1  1  0  0  0  0
+        1  B, C, D     2  0  1  1  1  0  0
+        2     E, F     3  0  0  0  0  1  1
+        3  A, E, F     4  1  0  0  0  1  1
+
+        Method chaining syntax:
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = (
+        ...     pd.DataFrame(
+        ...         {
+        ...             "col1": ["A, B", "B, C, D", "E, F", "A, E, F"],
+        ...             "col2": [1, 2, 3, 4],
+        ...         }
+        ...     )
+        ...     .expand_column(
+        ...         column_name='col1',
+        ...         sep=', '
+        ...     )
+        ... )
+        >>> df
+              col1  col2  A  B  C  D  E  F
+        0     A, B     1  1  1  0  0  0  0
+        1  B, C, D     2  0  1  1  1  0  0
+        2     E, F     3  0  0  0  0  1  1
+        3  A, E, F     4  1  0  0  0  1  1
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: Which column to expand.
+        sep: The delimiter, same as
+            `pandas.Series.str.get_dummies`'s `sep`.
+        concat: Whether to return the expanded column concatenated to
+            the original dataframe (`concat=True`), or to return it standalone
+            (`concat=False`).
+
+    Returns:
+        A pandas DataFrame with an expanded column.
+    """  # noqa: E501
+    expanded_df = df[column_name].str.get_dummies(sep=sep)  # split on `sep`, one dummy column per label
+    if concat:
+        return df.join(expanded_df)  # original columns first, dummy columns appended
+    return expanded_df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ expand_grid + + +

+ +
+ +

Implementation source for expand_grid.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ cartesian_product(*inputs, sort=False) + +

+ + +
+ +

Creates a DataFrame from a cartesian combination of all inputs.

+

Inspiration is from tidyr's expand_grid() function.

+

The input argument should be a pandas Index/Series/DataFrame, +or a dictionary - the values of the dictionary should be +a 1D array.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor as jn
+>>> df = pd.DataFrame({"x": [1, 2], "y": [2, 1]})
+>>> data = pd.Series([1, 2, 3], name='z')
+>>> jn.cartesian_product(df, data)
+   x  y  z
+0  1  2  1
+1  1  2  2
+2  1  2  3
+3  2  1  1
+4  2  1  2
+5  2  1  3
+
+

cartesian_product also works with non-pandas objects:

+
>>> data = {"x": [1, 2, 3], "y": [1, 2]}
+>>> cartesian_product(data)
+   x  y
+0  1  1
+1  1  2
+2  2  1
+3  2  2
+4  3  1
+5  3  2
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ *inputs + + tuple + +
+

Variable arguments. The arguments should be +a pandas Index/Series/DataFrame, or a dictionary, +where each value in the dictionary is a 1D array.

+
+
+ () +
+ sort + + bool + +
+

If True, sort the output DataFrame.

+
+
+ False +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/expand_grid.py +
406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
def cartesian_product(*inputs: tuple, sort: bool = False) -> pd.DataFrame:
+    """Creates a DataFrame from a cartesian combination of all inputs.
+
+    Inspiration is from tidyr's expand_grid() function.
+
+    The input argument should be a pandas Index/Series/DataFrame,
+    or a dictionary - each value of the dictionary should be
+    a 1D array.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor as jn
+        >>> df = pd.DataFrame({"x": [1, 2], "y": [2, 1]})
+        >>> data = pd.Series([1, 2, 3], name='z')
+        >>> jn.cartesian_product(df, data)
+           x  y  z
+        0  1  2  1
+        1  1  2  2
+        2  1  2  3
+        3  2  1  1
+        4  2  1  2
+        5  2  1  3
+
+        `cartesian_product` also works with non-pandas objects:
+
+        >>> data = {"x": [1, 2, 3], "y": [1, 2]}
+        >>> cartesian_product(data)
+           x  y
+        0  1  1
+        1  1  2
+        2  2  1
+        3  2  2
+        4  3  1
+        5  3  2
+
+    Args:
+        *inputs: Variable arguments. The arguments should be
+            a pandas Index/Series/DataFrame, or a dictionary,
+            where each value in the dictionary is a 1D array.
+        sort: If True, sort the output DataFrame.
+
+    Returns:
+        A pandas DataFrame.
+    """
+    contents = []
+    for entry in inputs:
+        if isinstance(entry, dict):
+            for label, value in entry.items():
+                arr = pd.Series(value, name=label)  # each dict entry becomes a Series named after its key
+                contents.append(arr)
+        else:
+            contents.append(entry)  # non-dict inputs are forwarded unchanged
+    outcome = _compute_cartesian_product(inputs=contents, sort=sort)
+    # the values in the outcome dictionary are copies,
+    # based on numpy indexing semantics;
+    # as such, it is safe to pass copy=False
+    return pd.DataFrame(data=outcome, copy=False)
+
+
+
+ +
+ +
+ + +

+ expand(df, *columns, sort=False, by=None) + +

+ + +
+ +

Creates a DataFrame from a cartesian combination of all inputs.

+

Inspiration is from tidyr's expand() function.

+

expand() is often useful with +pd.merge +to convert implicit +missing values to explicit missing values - similar to +complete.

+

It can also be used to figure out which combinations are missing +(e.g. identify gaps in your DataFrame).

+

The variable columns parameter can be a column name, +a list of column names, a pandas Index/Series/DataFrame, +or a callable, which when applied to the DataFrame, +evaluates to a pandas Index/Series/DataFrame.

+

A dictionary can also be passed +to the variable columns parameter - +the values of the dictionary should +either be a 1D array +or a callable that evaluates to a +1D array. The array should be unique; +no check is done to verify this.

+

If by is present, the DataFrame is expanded per group. +by should be a column name, or a list of column names.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> data = [{'type': 'apple', 'year': 2010, 'size': 'XS'},
+...         {'type': 'orange', 'year': 2010, 'size': 'S'},
+...         {'type': 'apple', 'year': 2012, 'size': 'M'},
+...         {'type': 'orange', 'year': 2010, 'size': 'S'},
+...         {'type': 'orange', 'year': 2011, 'size': 'S'},
+...         {'type': 'orange', 'year': 2012, 'size': 'M'}]
+>>> df = pd.DataFrame(data)
+>>> df
+     type  year size
+0   apple  2010   XS
+1  orange  2010    S
+2   apple  2012    M
+3  orange  2010    S
+4  orange  2011    S
+5  orange  2012    M
+
+

Get unique observations:

+
>>> df.expand('type')
+     type
+0   apple
+1  orange
+>>> df.expand('size')
+  size
+0   XS
+1    S
+2    M
+>>> df.expand('type', 'size')
+     type size
+0   apple   XS
+1   apple    S
+2   apple    M
+3  orange   XS
+4  orange    S
+5  orange    M
+>>> df.expand('type','size','year')
+      type size  year
+0    apple   XS  2010
+1    apple   XS  2012
+2    apple   XS  2011
+3    apple    S  2010
+4    apple    S  2012
+5    apple    S  2011
+6    apple    M  2010
+7    apple    M  2012
+8    apple    M  2011
+9   orange   XS  2010
+10  orange   XS  2012
+11  orange   XS  2011
+12  orange    S  2010
+13  orange    S  2012
+14  orange    S  2011
+15  orange    M  2010
+16  orange    M  2012
+17  orange    M  2011
+
+

Get observations that only occur in the data:

+
>>> df.expand(['type','size'])
+     type size
+0   apple   XS
+1  orange    S
+2   apple    M
+3  orange    M
+>>> df.expand(['type','size','year'])
+     type size  year
+0   apple   XS  2010
+1  orange    S  2010
+2   apple    M  2012
+3  orange    S  2011
+4  orange    M  2012
+
+

Expand the DataFrame to include new observations:

+
>>> df.expand('type','size',{'new_year':range(2010,2014)})
+      type size  new_year
+0    apple   XS      2010
+1    apple   XS      2011
+2    apple   XS      2012
+3    apple   XS      2013
+4    apple    S      2010
+5    apple    S      2011
+6    apple    S      2012
+7    apple    S      2013
+8    apple    M      2010
+9    apple    M      2011
+10   apple    M      2012
+11   apple    M      2013
+12  orange   XS      2010
+13  orange   XS      2011
+14  orange   XS      2012
+15  orange   XS      2013
+16  orange    S      2010
+17  orange    S      2011
+18  orange    S      2012
+19  orange    S      2013
+20  orange    M      2010
+21  orange    M      2011
+22  orange    M      2012
+23  orange    M      2013
+
+

Filter for missing observations:

+
>>> combo = df.expand('type','size','year')
+>>> anti_join = df.merge(combo, how='right', indicator=True)
+>>> anti_join.query("_merge=='right_only'").drop(columns="_merge")
+      type  year size
+1    apple  2012   XS
+2    apple  2011   XS
+3    apple  2010    S
+4    apple  2012    S
+5    apple  2011    S
+6    apple  2010    M
+8    apple  2011    M
+9   orange  2010   XS
+10  orange  2012   XS
+11  orange  2011   XS
+14  orange  2012    S
+16  orange  2010    M
+18  orange  2011    M
+
+

Expand within each group, using by:

+
>>> df.expand('year','size',by='type')
+        year size
+type
+apple   2010   XS
+apple   2010    M
+apple   2012   XS
+apple   2012    M
+orange  2010    S
+orange  2010    M
+orange  2011    S
+orange  2011    M
+orange  2012    S
+orange  2012    M
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ columns + + tuple + +
+

Specification of columns to expand. +It could be column labels, + a list/tuple of column labels, + or a pandas Index/Series/DataFrame.

+

It can also be a callable; +the callable will be applied to the +entire DataFrame. The callable should +return a pandas Series/Index/DataFrame.

+

It can also be a dictionary, +where the values are either a 1D array +or a callable that evaluates to a +1D array. +The array should be unique; +no check is done to verify this.

+
+
+ () +
+ sort + + bool + +
+

If True, sort the DataFrame.

+
+
+ False +
+ by + + str | list + +
+

Label or list of labels to group by.

+
+
+ None +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/expand_grid.py +
140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
@pf.register_dataframe_method
+def expand(
+    df: pd.DataFrame,
+    *columns: tuple,
+    sort: bool = False,
+    by: str | list = None,
+) -> pd.DataFrame:
+    """
+    Creates a DataFrame from a cartesian combination of all inputs.
+
+    Inspiration is from tidyr's expand() function.
+
+    expand() is often useful with
+    [pd.merge](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html)
+    to convert implicit
+    missing values to explicit missing values - similar to
+    [`complete`][janitor.functions.complete.complete].
+
+    It can also be used to figure out which combinations are missing
+    (e.g. identify gaps in your DataFrame).
+
+    The variable `columns` parameter can be a column name,
+    a list of column names, a pandas Index/Series/DataFrame,
+    or a callable, which when applied to the DataFrame,
+    evaluates to a pandas Index/Series/DataFrame.
+
+    A dictionary can also be passed
+    to the variable `columns` parameter -
+    the values of the dictionary should
+    either be a 1D array
+    or a callable that evaluates to a
+    1D array. The array should be unique;
+    no check is done to verify this.
+
+    If `by` is present, the DataFrame is *expanded* per group.
+    `by` should be a column name, or a list of column names.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> data = [{'type': 'apple', 'year': 2010, 'size': 'XS'},
+        ...         {'type': 'orange', 'year': 2010, 'size': 'S'},
+        ...         {'type': 'apple', 'year': 2012, 'size': 'M'},
+        ...         {'type': 'orange', 'year': 2010, 'size': 'S'},
+        ...         {'type': 'orange', 'year': 2011, 'size': 'S'},
+        ...         {'type': 'orange', 'year': 2012, 'size': 'M'}]
+        >>> df = pd.DataFrame(data)
+        >>> df
+             type  year size
+        0   apple  2010   XS
+        1  orange  2010    S
+        2   apple  2012    M
+        3  orange  2010    S
+        4  orange  2011    S
+        5  orange  2012    M
+
+        Get unique observations:
+        >>> df.expand('type')
+             type
+        0   apple
+        1  orange
+        >>> df.expand('size')
+          size
+        0   XS
+        1    S
+        2    M
+        >>> df.expand('type', 'size')
+             type size
+        0   apple   XS
+        1   apple    S
+        2   apple    M
+        3  orange   XS
+        4  orange    S
+        5  orange    M
+        >>> df.expand('type','size','year')
+              type size  year
+        0    apple   XS  2010
+        1    apple   XS  2012
+        2    apple   XS  2011
+        3    apple    S  2010
+        4    apple    S  2012
+        5    apple    S  2011
+        6    apple    M  2010
+        7    apple    M  2012
+        8    apple    M  2011
+        9   orange   XS  2010
+        10  orange   XS  2012
+        11  orange   XS  2011
+        12  orange    S  2010
+        13  orange    S  2012
+        14  orange    S  2011
+        15  orange    M  2010
+        16  orange    M  2012
+        17  orange    M  2011
+
+        Get observations that only occur in the data:
+        >>> df.expand(['type','size'])
+             type size
+        0   apple   XS
+        1  orange    S
+        2   apple    M
+        3  orange    M
+        >>> df.expand(['type','size','year'])
+             type size  year
+        0   apple   XS  2010
+        1  orange    S  2010
+        2   apple    M  2012
+        3  orange    S  2011
+        4  orange    M  2012
+
+        Expand the DataFrame to include new observations:
+        >>> df.expand('type','size',{'new_year':range(2010,2014)})
+              type size  new_year
+        0    apple   XS      2010
+        1    apple   XS      2011
+        2    apple   XS      2012
+        3    apple   XS      2013
+        4    apple    S      2010
+        5    apple    S      2011
+        6    apple    S      2012
+        7    apple    S      2013
+        8    apple    M      2010
+        9    apple    M      2011
+        10   apple    M      2012
+        11   apple    M      2013
+        12  orange   XS      2010
+        13  orange   XS      2011
+        14  orange   XS      2012
+        15  orange   XS      2013
+        16  orange    S      2010
+        17  orange    S      2011
+        18  orange    S      2012
+        19  orange    S      2013
+        20  orange    M      2010
+        21  orange    M      2011
+        22  orange    M      2012
+        23  orange    M      2013
+
+        Filter for missing observations:
+        >>> combo = df.expand('type','size','year')
+        >>> anti_join = df.merge(combo, how='right', indicator=True)
+        >>> anti_join.query("_merge=='right_only'").drop(columns="_merge")
+              type  year size
+        1    apple  2012   XS
+        2    apple  2011   XS
+        3    apple  2010    S
+        4    apple  2012    S
+        5    apple  2011    S
+        6    apple  2010    M
+        8    apple  2011    M
+        9   orange  2010   XS
+        10  orange  2012   XS
+        11  orange  2011   XS
+        14  orange  2012    S
+        16  orange  2010    M
+        18  orange  2011    M
+
+        Expand within each group, using `by`:
+        >>> df.expand('year','size',by='type')
+                year size
+        type
+        apple   2010   XS
+        apple   2010    M
+        apple   2012   XS
+        apple   2012    M
+        orange  2010    S
+        orange  2010    M
+        orange  2011    S
+        orange  2011    M
+        orange  2012    S
+        orange  2012    M
+
+    Args:
+        df: A pandas DataFrame.
+        columns: Specification of columns to expand.
+            It could be column labels,
+            a list/tuple of column labels,
+            or a pandas Index/Series/DataFrame.
+
+            It can also be a callable;
+            the callable will be applied to the
+            entire DataFrame. The callable should
+            return a pandas Series/Index/DataFrame.
+
+            It can also be a dictionary,
+            where the values are either a 1D array
+            or a callable that evaluates to a
+            1D array.
+            The array should be unique;
+            no check is done to verify this.
+        sort: If True, sort the DataFrame.
+        by: Scalar label or list of labels to group by; other types raise TypeError.
+
+    Returns:
+        A pandas DataFrame.
+    """  # noqa: E501
+    if by is None:
+        contents = _build_pandas_objects_for_expand(df=df, columns=columns)
+        return cartesian_product(*contents, sort=sort)
+    if not is_scalar(by) and not isinstance(by, list):
+        raise TypeError(
+            "The argument to the by parameter "
+            "should be a scalar or a list; "
+            f"instead got {type(by).__name__}"
+        )
+    check_column(df, column_names=by, present=True)
+    grouped = df.groupby(by=by, sort=False, dropna=False, observed=True)
+    index = grouped._grouper.result_index  # NOTE(review): private pandas attribute; verify across pandas versions
+    dictionary = defaultdict(list)
+    lengths = []
+    for _, frame in grouped:
+        objects = _build_pandas_objects_for_expand(df=frame, columns=columns)
+        objects = _compute_cartesian_product(inputs=objects, sort=False)
+        length = objects[next(iter(objects))].size  # size of the first array in the mapping
+        lengths.append(length)
+        for k, v in objects.items():
+            dictionary[k].append(v)  # collect each column's per-group pieces
+    dictionary = {
+        key: concat_compat(value) for key, value in dictionary.items()
+    }
+    index = index.repeat(lengths)  # repeat each group label by that group's recorded length
+    out = pd.DataFrame(data=dictionary, index=index, copy=False)
+    if sort:
+        headers = out.columns.tolist()
+        return out.sort_values(headers)
+    return out
+
+
+
+ +
+ +
+ + +

+ expand_grid(df=None, df_key=None, *, others=None) + +

+ + +
+ +

Creates a DataFrame from a cartesian combination of all inputs.

+
+

Note

+

This function will be deprecated in a 1.x release; +use cartesian_product +instead.

+
+

It is not restricted to a pandas DataFrame; +it can work with any list-like structure +that is 1 or 2 dimensional.

+

If method-chaining to a DataFrame, a string argument +to df_key parameter must be provided.

+

Data types are preserved in this function, +including pandas' extension array dtypes.

+

The output will always be a DataFrame, usually with a MultiIndex column, +with the keys of the others dictionary serving as the top level columns.

+

If a pandas Series/DataFrame is passed, and has a labeled index, or +a MultiIndex index, the index is discarded; the final DataFrame +will have a RangeIndex.

+

The MultiIndexed DataFrame can be flattened using pyjanitor's +collapse_levels +method; the user can also decide to drop any of the levels, via pandas' +droplevel method.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor as jn
+>>> df = pd.DataFrame({"x": [1, 2], "y": [2, 1]})
+>>> data = {"z": [1, 2, 3]}
+>>> df.expand_grid(df_key="df", others=data)
+  df     z
+   x  y  0
+0  1  2  1
+1  1  2  2
+2  1  2  3
+3  2  1  1
+4  2  1  2
+5  2  1  3
+
+

expand_grid works with non-pandas objects:

+
>>> data = {"x": [1, 2, 3], "y": [1, 2]}
+>>> jn.expand_grid(others=data)
+   x  y
+   0  0
+0  1  1
+1  1  2
+2  2  1
+3  2  2
+4  3  1
+5  3  2
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + Optional[DataFrame] + +
+

A pandas DataFrame.

+
+
+ None +
+ df_key + + Optional[str] + +
+

Name of key for the dataframe. +It becomes part of the column names of the dataframe.

+
+
+ None +
+ others + + Optional[dict] + +
+

A dictionary that contains the data +to be combined with the dataframe. +If no dataframe exists, all inputs +in others will be combined to create a DataFrame.

+
+
+ None +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ KeyError + +
+

If there is a DataFrame and df_key is not provided.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + + + + + + + + + +
TypeDescription
+ Union[DataFrame, None] + +
+

A pandas DataFrame of the cartesian product.

+
+
+ Union[DataFrame, None] + +
+

If df is not provided, and others is not provided,

+
+
+ Union[DataFrame, None] + +
+

None is returned.

+
+
+ +
+ Source code in janitor/functions/expand_grid.py +
 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `janitor.cartesian_product` instead."
+    )
+)
+def expand_grid(
+    df: Optional[pd.DataFrame] = None,
+    df_key: Optional[str] = None,
+    *,
+    others: Optional[dict] = None,
+) -> Union[pd.DataFrame, None]:
+    """
+    Creates a DataFrame from a cartesian combination of all inputs.
+
+    !!!note
+
+        This function will be deprecated in a 1.x release;
+        use [`cartesian_product`][janitor.functions.expand_grid.cartesian_product]
+        instead.
+
+    It is not restricted to a pandas DataFrame;
+    it can work with any list-like structure
+    that is 1 or 2 dimensional.
+
+    If method-chaining to a DataFrame, a string argument
+    to `df_key` parameter must be provided.
+
+    Data types are preserved in this function,
+    including pandas' extension array dtypes.
+
+    The output will always be a DataFrame, usually with a MultiIndex column,
+    with the keys of the `others` dictionary serving as the top level columns.
+
+    If a pandas Series/DataFrame is passed, and has a labeled index, or
+    a MultiIndex index, the index is discarded; the final DataFrame
+    will have a RangeIndex.
+
+    The MultiIndexed DataFrame can be flattened using pyjanitor's
+    [`collapse_levels`][janitor.functions.collapse_levels.collapse_levels]
+    method; the user can also decide to drop any of the levels, via pandas'
+    `droplevel` method.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor as jn
+        >>> df = pd.DataFrame({"x": [1, 2], "y": [2, 1]})
+        >>> data = {"z": [1, 2, 3]}
+        >>> df.expand_grid(df_key="df", others=data)
+          df     z
+           x  y  0
+        0  1  2  1
+        1  1  2  2
+        2  1  2  3
+        3  2  1  1
+        4  2  1  2
+        5  2  1  3
+
+        `expand_grid` works with non-pandas objects:
+
+        >>> data = {"x": [1, 2, 3], "y": [1, 2]}
+        >>> jn.expand_grid(others=data)
+           x  y
+           0  0
+        0  1  1
+        1  1  2
+        2  2  1
+        3  2  2
+        4  3  1
+        5  3  2
+
+    Args:
+        df: A pandas DataFrame.
+        df_key: Name of key for the dataframe.
+            It becomes part of the column names of the dataframe.
+        others: A dictionary that contains the data
+            to be combined with the dataframe.
+            If no dataframe exists, all inputs
+            in `others` will be combined to create a DataFrame.
+
+    Raises:
+        KeyError: If there is a DataFrame and `df_key` is not provided.
+
+    Returns:
+        A pandas DataFrame of the cartesian product.
+        If `df` is not provided, and `others` is not provided,
+        None is returned.
+    """  # noqa: E501
+
+    if df is not None:
+        check("df", df, [pd.DataFrame])
+        if not df_key:
+            raise KeyError(
+                "Using `expand_grid` as part of a "
+                "DataFrame method chain requires that "
+                "a string argument be provided for "
+                "the `df_key` parameter. "
+            )
+
+        check("df_key", df_key, [str])
+
+    if not others and (df is not None):
+        return df
+
+    if not others:
+        return None
+
+    check("others", others, [dict])
+
+    for key in others:
+        check("key", key, [str])
+
+    if df is not None:
+        others = {**{df_key: df}, **others}
+
+    others = _computations_expand_grid(others)
+    return pd.DataFrame(others, copy=False)
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ explode_index + + +

+ +
+ +

Implementation of the explode_index function.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ explode_index(df, names_sep=None, names_pattern=None, axis='columns', level_names=None) + +

+ + +
+ +

Explode a single index DataFrame into a MultiIndex DataFrame.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame(
+...          {'max_speed_mean': [267.3333333333333, 50.5],
+...           'max_speed_median': [389.0, 50.5]})
+>>> df
+   max_speed_mean  max_speed_median
+0      267.333333             389.0
+1       50.500000              50.5
+>>> df.explode_index(names_sep='_',axis='columns')
+          max
+        speed
+         mean median
+0  267.333333  389.0
+1   50.500000   50.5
+>>> df.explode_index(names_pattern=r"(.+speed)_(.+)",axis='columns')
+    max_speed
+         mean median
+0  267.333333  389.0
+1   50.500000   50.5
+>>> df.explode_index(
+...     names_pattern=r"(?P<measurement>.+speed)_(?P<aggregation>.+)",
+...     axis='columns'
+... )
+measurement   max_speed
+aggregation        mean median
+0            267.333333  389.0
+1             50.500000   50.5
+>>> df.explode_index(
+...     names_sep='_',
+...     axis='columns',
+...     level_names = ['min or max', 'measurement','aggregation']
+... )
+min or max          max
+measurement       speed
+aggregation        mean median
+0            267.333333  389.0
+1             50.500000   50.5
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ names_sep + + Union[str, None] + +
+

String (or regular-expression pattern written as a string) used to split the column/index into levels; compiled regex objects are not accepted.

+
+
+ None +
+ names_pattern + + Union[str, None] + +
+

regex to extract new levels from the column/index.

+
+
+ None +
+ axis + + str + +
+

Either 'index' or 'columns'. Determines which axis to explode.

+
+
+ 'columns' +
+ level_names + + list + +
+

names of the levels in the MultiIndex.

+
+
+ None +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with a MultiIndex.

+
+
+ +
+ Source code in janitor/functions/explode_index.py +
 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
@pf.register_dataframe_method
+def explode_index(
+    df: pd.DataFrame,
+    names_sep: Union[str, None] = None,
+    names_pattern: Union[str, None] = None,
+    axis: str = "columns",
+    level_names: list = None,
+) -> pd.DataFrame:
+    """Explode a single index DataFrame into a MultiIndex DataFrame.
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame(
+        ...          {'max_speed_mean': [267.3333333333333, 50.5],
+        ...           'max_speed_median': [389.0, 50.5]})
+        >>> df
+           max_speed_mean  max_speed_median
+        0      267.333333             389.0
+        1       50.500000              50.5
+        >>> df.explode_index(names_sep='_',axis='columns')  # doctest: +NORMALIZE_WHITESPACE
+                  max
+                speed
+                 mean median
+        0  267.333333  389.0
+        1   50.500000   50.5
+        >>> df.explode_index(names_pattern=r"(.+speed)_(.+)",axis='columns') # doctest: +NORMALIZE_WHITESPACE
+            max_speed
+                 mean median
+        0  267.333333  389.0
+        1   50.500000   50.5
+        >>> df.explode_index(
+        ...     names_pattern=r"(?P<measurement>.+speed)_(?P<aggregation>.+)",
+        ...     axis='columns'
+        ... ) # doctest: +NORMALIZE_WHITESPACE
+        measurement   max_speed
+        aggregation        mean median
+        0            267.333333  389.0
+        1             50.500000   50.5
+        >>> df.explode_index(
+        ...     names_sep='_',
+        ...     axis='columns',
+        ...     level_names = ['min or max', 'measurement','aggregation']
+        ... ) # doctest: +NORMALIZE_WHITESPACE
+        min or max          max
+        measurement       speed
+        aggregation        mean median
+        0            267.333333  389.0
+        1             50.500000   50.5
+
+    Args:
+        df: A pandas DataFrame.
+        names_sep: string or compiled regex used to split the column/index into levels.
+        names_pattern: regex to extract new levels from the column/index.
+        axis: 'index/columns'. Determines which axis to explode.
+        level_names: names of the levels in the MultiIndex.
+
+    Returns:
+        A pandas DataFrame with a MultiIndex.
+    """  # noqa: E501
+    check("axis", axis, [str])
+    if axis not in {"index", "columns"}:
+        raise ValueError("axis should be either index or columns.")
+    if (names_sep is None) and (names_pattern is None):
+        raise ValueError(
+            "Provide argument for either names_sep or names_pattern."
+        )
+    if (names_sep is not None) and (names_pattern is not None):
+        raise ValueError(
+            "Provide argument for either names_sep or names_pattern, not both."
+        )
+    if names_sep is not None:
+        check("names_sep", names_sep, [str])
+    if names_pattern is not None:
+        check("names_pattern", names_pattern, [str])
+    if level_names is not None:
+        check("level_names", level_names, [list])
+
+    new_index = getattr(df, axis)
+    if isinstance(new_index, pd.MultiIndex):
+        return df
+    # avoid a copy - Index is immutable; a slice is safe to use.
+    df = df[:]
+    if names_sep:
+        new_index = new_index.str.split(names_sep, expand=True)
+    else:
+        named_groups = re.compile(names_pattern).groupindex
+        if named_groups and not level_names:
+            level_names = list(named_groups)
+        new_index = new_index.str.extract(names_pattern)
+        new_index = [arr.array for _, arr in new_index.items()]
+        new_index = pd.MultiIndex.from_arrays(new_index)
+    if level_names:
+        new_index.names = level_names
+
+    setattr(df, axis, new_index)
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ factorize_columns + + +

+ +
+ +

Implementation of the factorize_columns function.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ factorize_columns(df, column_names, suffix='_enc', **kwargs) + +

+ + +
+ +

Converts labels into numerical data.

+

This method will create a new column with the string _enc appended +after the original column's name. +This can be overridden with the suffix parameter.

+

Internally, this method uses the pandas factorize method. +It also accepts an optional suffix and keyword arguments. +An empty string as suffix will overwrite the existing column.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "foo": ["b", "b", "a", "c", "b"],
+...     "bar": range(4, 9),
+... })
+>>> df
+  foo  bar
+0   b    4
+1   b    5
+2   a    6
+3   c    7
+4   b    8
+>>> df.factorize_columns(column_names="foo")
+  foo  bar  foo_enc
+0   b    4        0
+1   b    5        0
+2   a    6        1
+3   c    7        2
+4   b    8        0
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

The pandas DataFrame object.

+
+
+ required +
+ column_names + + Union[str, Iterable[str], Hashable] + +
+

A column name or an iterable (list or tuple) of +column names.

+
+
+ required +
+ suffix + + str + +
+

Suffix to be used for the new column. +An empty string suffix means the existing column will be overwritten.

+
+
+ '_enc' +
+ **kwargs + + Any + +
+

Keyword arguments. It takes any of the keyword arguments, +which the pandas factorize method takes like sort, na_sentinel, +size_hint.

+
+
+ {} +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/factorize_columns.py +
11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
@pf.register_dataframe_method
+def factorize_columns(
+    df: pd.DataFrame,
+    column_names: Union[str, Iterable[str], Hashable],
+    suffix: str = "_enc",
+    **kwargs: Any,
+) -> pd.DataFrame:
+    """Converts labels into numerical data.
+
+    This method will create a new column with the string `_enc` appended
+    after the original column's name.
+    This can be overridden with the suffix parameter.
+
+    Internally, this method uses pandas `factorize` method.
+    It takes in an optional suffix and keyword arguments also.
+    An empty string as suffix will override the existing column.
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "foo": ["b", "b", "a", "c", "b"],
+        ...     "bar": range(4, 9),
+        ... })
+        >>> df
+          foo  bar
+        0   b    4
+        1   b    5
+        2   a    6
+        3   c    7
+        4   b    8
+        >>> df.factorize_columns(column_names="foo")
+          foo  bar  foo_enc
+        0   b    4        0
+        1   b    5        0
+        2   a    6        1
+        3   c    7        2
+        4   b    8        0
+
+    Args:
+        df: The pandas DataFrame object.
+        column_names: A column name or an iterable (list or tuple) of
+            column names.
+        suffix: Suffix to be used for the new column.
+            An empty string suffix means, it will override the existing column.
+        **kwargs: Keyword arguments. It takes any of the keyword arguments,
+            which the pandas factorize method takes like `sort`, `na_sentinel`,
+            `size_hint`.
+
+    Returns:
+        A pandas DataFrame.
+    """
+    df = _factorize(df.copy(), column_names, suffix, **kwargs)
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ fill + + +

+ +
+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ fill_direction(df, **kwargs) + +

+ + +
+ +

Provide a method-chainable function for filling missing values +in selected columns.

+

It is a wrapper for pd.Series.ffill and pd.Series.bfill, +and pairs the column name with one of up, down, updown, +and downup.

+
+

Note

+

This function will be deprecated in a 1.x release. +Please use pd.DataFrame.assign instead.

+
+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor as jn
+>>> df = pd.DataFrame(
+...    {
+...        'col1': [1, 2, 3, 4],
+...        'col2': [None, 5, 6, 7],
+...        'col3': [8, 9, 10, None],
+...        'col4': [None, None, 11, None],
+...        'col5': [None, 12, 13, None]
+...    }
+... )
+>>> df
+   col1  col2  col3  col4  col5
+0     1   NaN   8.0   NaN   NaN
+1     2   5.0   9.0   NaN  12.0
+2     3   6.0  10.0  11.0  13.0
+3     4   7.0   NaN   NaN   NaN
+>>> df.fill_direction(
+... col2 = 'up',
+... col3 = 'down',
+... col4 = 'downup',
+... col5 = 'updown'
+... )
+   col1  col2  col3  col4  col5
+0     1   5.0   8.0  11.0  12.0
+1     2   5.0   9.0  11.0  12.0
+2     3   6.0  10.0  11.0  13.0
+3     4   7.0  10.0  11.0  13.0
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ **kwargs + + Any + +
+

Key-value pairs of columns and directions. +Each direction must be one of down, up, updown +(fill up then down), or downup (fill down then up).

+
+
+ {} +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If direction supplied is not one of down, up, +updown, or downup.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with modified column(s).

+
+
+ +
+ Source code in janitor/functions/fill.py +
 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `pd.DataFrame.assign` instead."
+    )
+)
+def fill_direction(df: pd.DataFrame, **kwargs: Any) -> pd.DataFrame:
+    """Provide a method-chainable function for filling missing values
+    in selected columns.
+
+    It is a wrapper for `pd.Series.ffill` and `pd.Series.bfill`,
+    and pairs the column name with one of `up`, `down`, `updown`,
+    and `downup`.
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `pd.DataFrame.assign` instead.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor as jn
+        >>> df = pd.DataFrame(
+        ...    {
+        ...        'col1': [1, 2, 3, 4],
+        ...        'col2': [None, 5, 6, 7],
+        ...        'col3': [8, 9, 10, None],
+        ...        'col4': [None, None, 11, None],
+        ...        'col5': [None, 12, 13, None]
+        ...    }
+        ... )
+        >>> df
+           col1  col2  col3  col4  col5
+        0     1   NaN   8.0   NaN   NaN
+        1     2   5.0   9.0   NaN  12.0
+        2     3   6.0  10.0  11.0  13.0
+        3     4   7.0   NaN   NaN   NaN
+        >>> df.fill_direction(
+        ... col2 = 'up',
+        ... col3 = 'down',
+        ... col4 = 'downup',
+        ... col5 = 'updown'
+        ... )
+           col1  col2  col3  col4  col5
+        0     1   5.0   8.0  11.0  12.0
+        1     2   5.0   9.0  11.0  12.0
+        2     3   6.0  10.0  11.0  13.0
+        3     4   7.0  10.0  11.0  13.0
+
+    Args:
+        df: A pandas DataFrame.
+        **kwargs: Key - value pairs of columns and directions.
+            Directions can be either `down`, `up`, `updown`
+            (fill up then down) and `downup` (fill down then up).
+
+    Raises:
+        ValueError: If direction supplied is not one of `down`, `up`,
+            `updown`, or `downup`.
+
+    Returns:
+        A pandas DataFrame with modified column(s).
+    """  # noqa: E501
+
+    if not kwargs:
+        return df
+
+    fill_types = {fill.name for fill in _FILLTYPE}
+    for column_name, fill_type in kwargs.items():
+        check("column_name", column_name, [str])
+        check("fill_type", fill_type, [str])
+        if fill_type.upper() not in fill_types:
+            raise ValueError(
+                "fill_type should be one of up, down, updown, or downup."
+            )
+
+    check_column(df, kwargs)
+
+    new_values = {}
+    for column_name, fill_type in kwargs.items():
+        direction = _FILLTYPE[f"{fill_type.upper()}"].value
+        if len(direction) == 1:
+            direction = methodcaller(direction[0])
+            output = direction(df[column_name])
+        else:
+            direction = [methodcaller(entry) for entry in direction]
+            output = _chain_func(df[column_name], *direction)
+        new_values[column_name] = output
+
+    return df.assign(**new_values)
+
+
+
+ +
+ +
+ + +

+ fill_empty(df, column_names, value) + +

+ + +
+ +

Fill NaN values in specified columns with a given value.

+

Super sugary syntax that wraps pandas.DataFrame.fillna.

+

This method mutates the original DataFrame.

+
+

Note

+

This function will be deprecated in a 1.x release. +Please use jn.impute instead.

+
+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame(
+...        {
+...            'col1': [1, 2, 3],
+...            'col2': [None, 4, None ],
+...            'col3': [None, 5, 6]
+...        }
+...    )
+>>> df
+   col1  col2  col3
+0     1   NaN   NaN
+1     2   4.0   5.0
+2     3   NaN   6.0
+>>> df.fill_empty(column_names = 'col2', value = 0)
+   col1  col2  col3
+0     1   0.0   NaN
+1     2   4.0   5.0
+2     3   0.0   6.0
+>>> df.fill_empty(column_names = ['col2', 'col3'], value = 0)
+   col1  col2  col3
+0     1   0.0   0.0
+1     2   4.0   5.0
+2     3   0.0   6.0
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_names + + Union[str, Iterable[str], Hashable] + +
+

A column name or an iterable (list +or tuple) of column names. If a single column name is passed in, +then only that column will be filled; if a list or tuple is passed +in, then those columns will all be filled with the same value.

+
+
+ required +
+ value + + Any + +
+

The value that replaces the NaN values.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with NaN values filled.

+
+
+ +
+ Source code in janitor/functions/fill.py +
131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
@pf.register_dataframe_method
+@refactored_function(
+    message="This function will be deprecated in a 1.x release. "
+    "Kindly use `jn.impute` instead."
+)
+@deprecated_alias(columns="column_names")
+def fill_empty(
+    df: pd.DataFrame,
+    column_names: Union[str, Iterable[str], Hashable],
+    value: Any,
+) -> pd.DataFrame:
+    """Fill `NaN` values in specified columns with a given value.
+
+    Super sugary syntax that wraps `pandas.DataFrame.fillna`.
+
+    This method mutates the original DataFrame.
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use [`jn.impute`][janitor.functions.impute.impute] instead.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame(
+        ...        {
+        ...            'col1': [1, 2, 3],
+        ...            'col2': [None, 4, None ],
+        ...            'col3': [None, 5, 6]
+        ...        }
+        ...    )
+        >>> df
+           col1  col2  col3
+        0     1   NaN   NaN
+        1     2   4.0   5.0
+        2     3   NaN   6.0
+        >>> df.fill_empty(column_names = 'col2', value = 0)
+           col1  col2  col3
+        0     1   0.0   NaN
+        1     2   4.0   5.0
+        2     3   0.0   6.0
+        >>> df.fill_empty(column_names = ['col2', 'col3'], value = 0)
+           col1  col2  col3
+        0     1   0.0   0.0
+        1     2   4.0   5.0
+        2     3   0.0   6.0
+
+    Args:
+        df: A pandas DataFrame.
+        column_names: A column name or an iterable (list
+            or tuple) of column names. If a single column name is passed in,
+            then only that column will be filled; if a list or tuple is passed
+            in, then those columns will all be filled with the same value.
+        value: The value that replaces the `NaN` values.
+
+    Returns:
+        A pandas DataFrame with `NaN` values filled.
+    """
+
+    check_column(df, column_names)
+    return _fill_empty(df, column_names, value=value)
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ filter + + +

+ +
+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ filter_column_isin(df, column_name, iterable, complement=False) + +

+ + +
+ +

Filter a dataframe for values in a column that exist in the given iterable.

+

This method does not mutate the original DataFrame.

+

Assumes exact matching; fuzzy matching not implemented.

+ + +

Examples:

+

Filter the dataframe to retain rows for which names +are exactly James or John.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"names": ["Jane", "Jeremy", "John"], "foo": list("xyz")})
+>>> df
+    names foo
+0    Jane   x
+1  Jeremy   y
+2    John   z
+>>> df.filter_column_isin(column_name="names", iterable=["James", "John"])
+  names foo
+2  John   z
+
+

This is the method-chaining alternative to:

+
df = df[df["names"].isin(["James", "John"])]
+
+

If complement=True, then we will only get rows for which the names +are neither James nor John.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + Hashable + +
+

The column on which to filter.

+
+
+ required +
+ iterable + + Iterable + +
+

An iterable. Could be a list, tuple, another pandas +Series.

+
+
+ required +
+ complement + + bool + +
+

Whether to return the complement of the selection or +not.

+
+
+ False +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If iterable does not have a length of 1 +or greater.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A filtered pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/filter.py +
296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
@pf.register_dataframe_method
+@deprecated_alias(column="column_name")
+def filter_column_isin(
+    df: pd.DataFrame,
+    column_name: Hashable,
+    iterable: Iterable,
+    complement: bool = False,
+) -> pd.DataFrame:
+    """Filter a dataframe for values in a column that exist in the given iterable.
+
+    This method does not mutate the original DataFrame.
+
+    Assumes exact matching; fuzzy matching not implemented.
+
+    Examples:
+        Filter the dataframe to retain rows for which `names`
+        are exactly `James` or `John`.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"names": ["Jane", "Jeremy", "John"], "foo": list("xyz")})
+        >>> df
+            names foo
+        0    Jane   x
+        1  Jeremy   y
+        2    John   z
+        >>> df.filter_column_isin(column_name="names", iterable=["James", "John"])
+          names foo
+        2  John   z
+
+        This is the method-chaining alternative to:
+
+        ```python
+        df = df[df["names"].isin(["James", "John"])]
+        ```
+
+        If `complement=True`, then we will only get rows for which the names
+        are neither `James` nor `John`.
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: The column on which to filter.
+        iterable: An iterable. Could be a list, tuple, another pandas
+            Series.
+        complement: Whether to return the complement of the selection or
+            not.
+
+    Raises:
+        ValueError: If `iterable` does not have a length of `1`
+            or greater.
+
+    Returns:
+        A filtered pandas DataFrame.
+    """  # noqa: E501
+    if len(iterable) == 0:
+        raise ValueError(
+            "`iterable` kwarg must be given an iterable of length 1 "
+            "or greater."
+        )
+    criteria = df[column_name].isin(iterable)
+
+    if complement:
+        return df[~criteria]
+    return df[criteria]
+
+
+
+ +
+ +
+ + +

+ filter_date(df, column_name, start_date=None, end_date=None, years=None, months=None, days=None, column_date_options=None, format=None) + +

+ + +
+ +

Filter a date-based column based on certain criteria.

+

This method does not mutate the original DataFrame.

+

Dates may be finicky and this function builds on top of the magic from +the pandas to_datetime function that is able to parse dates well.

+

Additional options to parse the date type of your column may be found at +the official pandas documentation.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "a": range(5, 9),
+...     "dt": ["2021-11-12", "2021-12-15", "2022-01-03", "2022-01-09"],
+... })
+>>> df
+   a          dt
+0  5  2021-11-12
+1  6  2021-12-15
+2  7  2022-01-03
+3  8  2022-01-09
+>>> df.filter_date("dt", start_date="2021-12-01", end_date="2022-01-05")
+   a         dt
+1  6 2021-12-15
+2  7 2022-01-03
+>>> df.filter_date("dt", years=[2021], months=[12])
+   a         dt
+1  6 2021-12-15
+
+
+

Note

+

This method will cast your column to a Timestamp!

+
+
+

Note

+

The format parameter only affects how the start_date and end_date +parameters are parsed. If there's an issue with the format of the DataFrame column being +parsed, pass {'format': your_format} to column_date_options instead.

+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

The dataframe to filter on.

+
+
+ required +
+ column_name + + Hashable + +
+

The date column on which to filter.

+
+
+ required +
+ start_date + + Optional[date] + +
+

The beginning date to use to filter the DataFrame.

+
+
+ None +
+ end_date + + Optional[date] + +
+

The end date to use to filter the DataFrame.

+
+
+ None +
+ years + + Optional[List] + +
+

The years to use to filter the DataFrame.

+
+
+ None +
+ months + + Optional[List] + +
+

The months to use to filter the DataFrame.

+
+
+ None +
+ days + + Optional[List] + +
+

The days to use to filter the DataFrame.

+
+
+ None +
+ column_date_options + + Optional[Dict] + +
+

Special options to use when parsing the date +column in the original DataFrame. The options may be found at the +official Pandas documentation.

+
+
+ None +
+ format + + Optional[str] + +
+

If you're using a format for start_date or end_date +that is not recognized natively by pandas' to_datetime function, you +may supply the format yourself. Python date and time formats may be +found here.

+
+
+ None +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A filtered pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/filter.py +
184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
@pf.register_dataframe_method
+@deprecated_alias(column="column_name", start="start_date", end="end_date")
+def filter_date(
+    df: pd.DataFrame,
+    column_name: Hashable,
+    start_date: Optional[dt.date] = None,
+    end_date: Optional[dt.date] = None,
+    years: Optional[List] = None,
+    months: Optional[List] = None,
+    days: Optional[List] = None,
+    column_date_options: Optional[Dict] = None,
+    format: Optional[str] = None,  # skipcq: PYL-W0622
+) -> pd.DataFrame:
+    """Filter a date-based column based on certain criteria.
+
+    This method mutates the original DataFrame: `column_name` is cast to datetime in place.
+
+    Dates may be finicky and this function builds on top of the *magic* from
+    the pandas `to_datetime` function that is able to parse dates well.
+
+    Additional options to parse the date type of your column may be found at
+    the official pandas [documentation][datetime].
+
+    [datetime]: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "a": range(5, 9),
+        ...     "dt": ["2021-11-12", "2021-12-15", "2022-01-03", "2022-01-09"],
+        ... })
+        >>> df
+           a          dt
+        0  5  2021-11-12
+        1  6  2021-12-15
+        2  7  2022-01-03
+        3  8  2022-01-09
+        >>> df.filter_date("dt", start_date="2021-12-01", end_date="2022-01-05")
+           a         dt
+        1  6 2021-12-15
+        2  7 2022-01-03
+        >>> df.filter_date("dt", years=[2021], months=[12])
+           a         dt
+        1  6 2021-12-15
+
+    !!!note
+
+        This method will cast your column to a Timestamp!
+
+    !!!note
+
+        This only affects the format of the `start_date` and `end_date`
+        parameters. If there's an issue with the format of the DataFrame being
+        parsed, you would pass `{'format': your_format}` to `column_date_options`.
+
+    Args:
+        df: The dataframe to filter on.
+        column_name: The date column to which the filter is applied.
+        start_date: The beginning date to use to filter the DataFrame.
+        end_date: The end date to use to filter the DataFrame.
+        years: The years to use to filter the DataFrame.
+        months: The months to use to filter the DataFrame.
+        days: The days to use to filter the DataFrame.
+        column_date_options: Special options to use when parsing the date
+            column in the original DataFrame. The options may be found at the
+            official Pandas documentation.
+        format: If you're using a format for `start_date` or `end_date`
+            that is not recognized natively by pandas' `to_datetime` function, you
+            may supply the format yourself. Python date and time formats may be
+            found [here](http://strftime.org/).
+
+    Returns:
+        A filtered pandas DataFrame.
+    """  # noqa: E501
+
+    def _date_filter_conditions(conditions):
+        """Taken from: https://stackoverflow.com/a/13616382."""
+        return reduce(np.logical_and, conditions)
+
+    if column_date_options is None:
+        column_date_options = {}
+    df[column_name] = pd.to_datetime(df[column_name], **column_date_options)
+
+    _filter_list = []
+
+    if start_date:
+        start_date = pd.to_datetime(start_date, format=format)
+        _filter_list.append(df[column_name] >= start_date)
+
+    if end_date:
+        end_date = pd.to_datetime(end_date, format=format)
+        _filter_list.append(df[column_name] <= end_date)
+
+    if years:
+        _filter_list.append(df[column_name].dt.year.isin(years))
+
+    if months:
+        _filter_list.append(df[column_name].dt.month.isin(months))
+
+    if days:
+        _filter_list.append(df[column_name].dt.day.isin(days))
+
+    if start_date and end_date and start_date > end_date:
+        warnings.warn(
+            f"Your start date of {start_date} is after your end date of "
+            f"{end_date}. Is this intended?"
+        )
+
+    return df.loc[_date_filter_conditions(_filter_list), :]
+
+
+
+ +
+ +
+ + +

+ filter_on(df, criteria, complement=False) + +

+ + +
+ +

Return a dataframe filtered on a particular criteria.

+

This method does not mutate the original DataFrame.

+

This is super-sugary syntax that wraps the pandas .query() API, enabling +users to use strings to quickly specify filters for filtering their +dataframe. The intent is that filter_on as a verb better matches the +intent of a pandas user than the verb query.

+

This is intended to be the method-chaining equivalent of the following:

+
df = df[df["score"] < 3]
+
+
+

Note

+

This function will be deprecated in a 1.x release. +Please use pd.DataFrame.query instead.

+
+ + +

Examples:

+

Filter students who failed an exam (scored less than 50).

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "student_id": ["S1", "S2", "S3"],
+...     "score": [40, 60, 85],
+... })
+>>> df
+  student_id  score
+0         S1     40
+1         S2     60
+2         S3     85
+>>> df.filter_on("score < 50", complement=False)
+  student_id  score
+0         S1     40
+
+

Credit to Brant Peterson for the name.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ criteria + + str + +
+

A filtering criterion that returns an array or Series of +booleans, on which pandas can filter.

+
+
+ required +
+ complement + + bool + +
+

Whether to return the complement of the filter or not. +If set to True, then the rows for which the criteria is False are +retained instead.

+
+
+ False +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A filtered pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/filter.py +
107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `pd.DataFrame.query` instead."
+    )
+)
+def filter_on(
+    df: pd.DataFrame,
+    criteria: str,
+    complement: bool = False,
+) -> pd.DataFrame:
+    """Return a dataframe filtered on a particular criteria.
+
+    This method does not mutate the original DataFrame.
+
+    This is super-sugary syntax that wraps the pandas `.query()` API, enabling
+    users to use strings to quickly specify filters for filtering their
+    dataframe. The intent is that `filter_on` as a verb better matches the
+    intent of a pandas user than the verb `query`.
+
+    This is intended to be the method-chaining equivalent of the following:
+
+    ```python
+    df = df[df["score"] < 3]
+    ```
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `pd.DataFrame.query` instead.
+
+
+    Examples:
+        Filter students who failed an exam (scored less than 50).
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "student_id": ["S1", "S2", "S3"],
+        ...     "score": [40, 60, 85],
+        ... })
+        >>> df
+          student_id  score
+        0         S1     40
+        1         S2     60
+        2         S3     85
+        >>> df.filter_on("score < 50", complement=False)
+          student_id  score
+        0         S1     40
+
+    Credit to Brant Peterson for the name.
+
+    Args:
+        df: A pandas DataFrame.
+        criteria: A filtering criterion that returns an array or Series of
+            booleans, on which pandas can filter.
+        complement: Whether to return the complement of the filter or not.
+            If set to True, then the rows for which the criteria is False are
+            retained instead.
+
+    Returns:
+        A filtered pandas DataFrame.
+    """
+
+    warnings.warn(
+        "This function will be deprecated in a 1.x release. "
+        "Kindly use `pd.DataFrame.query` instead.",
+        DeprecationWarning,
+        stacklevel=find_stack_level(),
+    )
+
+    if complement:
+        return df.query(f"not ({criteria})")
+    return df.query(criteria)
+
+
+
+ +
+ +
+ + +

+ filter_string(df, column_name, search_string, complement=False, case=True, flags=0, na=None, regex=True) + +

+ + +
+ +

Filter a string-based column according to whether it contains a substring.

+

This is super sugary syntax that builds on top of pandas.Series.str.contains. +It is meant to be the method-chaining equivalent of the following:

+
df = df[df[column_name].str.contains(search_string)]
+
+

This method does not mutate the original DataFrame.

+ + +

Examples:

+

Retain rows whose column values contain a particular substring.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": range(3, 6), "b": ["bear", "peeL", "sail"]})
+>>> df
+   a     b
+0  3  bear
+1  4  peeL
+2  5  sail
+>>> df.filter_string(column_name="b", search_string="ee")
+   a     b
+1  4  peeL
+>>> df.filter_string(column_name="b", search_string="L", case=False)
+   a     b
+1  4  peeL
+2  5  sail
+
+

Filter names that do not contain '.' (regex mode disabled).

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.Series(["JoseChen", "Brian.Salvi"], name="Name").to_frame()
+>>> df
+          Name
+0     JoseChen
+1  Brian.Salvi
+>>> df.filter_string(column_name="Name", search_string=".", regex=False, complement=True)
+       Name
+0  JoseChen
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + Hashable + +
+

The column to filter. The column should contain strings.

+
+
+ required +
+ search_string + + str + +
+

A regex pattern or a (sub-)string to search.

+
+
+ required +
+ complement + + bool + +
+

Whether to return the complement of the filter or not. If +set to True, then the rows for which the string search fails are retained +instead.

+
+
+ False +
+ case + + bool + +
+

If True, case sensitive.

+
+
+ True +
+ flags + + int + +
+

Flags to pass through to the re module, e.g. re.IGNORECASE.

+
+
+ 0 +
+ na + + Any + +
+

Fill value for missing values. The default depends on dtype of +the array. For object-dtype, numpy.nan is used. For StringDtype, +pandas.NA is used.

+
+
+ None +
+ regex + + bool + +
+

If True, assumes search_string is a regular expression. If False, +treats the search_string as a literal string.

+
+
+ True +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A filtered pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/filter.py +
 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
@pf.register_dataframe_method
+@deprecated_alias(column="column_name")
+def filter_string(
+    df: pd.DataFrame,
+    column_name: Hashable,
+    search_string: str,
+    complement: bool = False,
+    case: bool = True,
+    flags: int = 0,
+    na: Any = None,
+    regex: bool = True,
+) -> pd.DataFrame:
+    """Filter a string-based column according to whether it contains a substring.
+
+    This is super sugary syntax that builds on top of `pandas.Series.str.contains`.
+    It is meant to be the method-chaining equivalent of the following:
+
+    ```python
+    df = df[df[column_name].str.contains(search_string)]
+    ```
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        Retain rows whose column values contain a particular substring.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": range(3, 6), "b": ["bear", "peeL", "sail"]})
+        >>> df
+           a     b
+        0  3  bear
+        1  4  peeL
+        2  5  sail
+        >>> df.filter_string(column_name="b", search_string="ee")
+           a     b
+        1  4  peeL
+        >>> df.filter_string(column_name="b", search_string="L", case=False)
+           a     b
+        1  4  peeL
+        2  5  sail
+
+        Filter names that do not contain `'.'` (regex mode disabled).
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.Series(["JoseChen", "Brian.Salvi"], name="Name").to_frame()
+        >>> df
+                  Name
+        0     JoseChen
+        1  Brian.Salvi
+        >>> df.filter_string(column_name="Name", search_string=".", regex=False, complement=True)
+               Name
+        0  JoseChen
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: The column to filter. The column should contain strings.
+        search_string: A regex pattern or a (sub-)string to search.
+        complement: Whether to return the complement of the filter or not. If
+            set to True, then the rows for which the string search fails are retained
+            instead.
+        case: If True, case sensitive.
+        flags: Flags to pass through to the re module, e.g. re.IGNORECASE.
+        na: Fill value for missing values. The default depends on dtype of
+            the array. For object-dtype, `numpy.nan` is used. For `StringDtype`,
+            `pandas.NA` is used.
+        regex: If True, assumes `search_string` is a regular expression. If False,
+            treats the `search_string` as a literal string.
+
+    Returns:
+        A filtered pandas DataFrame.
+    """  # noqa: E501
+
+    criteria = df[column_name].str.contains(
+        pat=search_string,
+        case=case,
+        flags=flags,
+        na=na,
+        regex=regex,
+    )
+
+    if complement:
+        return df[~criteria]
+
+    return df[criteria]
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ find_replace + + +

+ +
+ +

Implementation for find_replace.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ find_replace(df, match='exact', **mappings) + +

+ + +
+ +

Perform a find-and-replace action on provided columns.

+
+

Note

+

This function will be deprecated in a 1.x release. +Please use pd.DataFrame.replace instead.

+
+

Depending on use case, users can choose either exact, full-value matching, +or regular-expression-based fuzzy matching +(hence allowing substring matching in the latter case). +For strings, the matching is always case sensitive.

+ + +

Examples:

+

For instance, given a DataFrame containing orders at a coffee shop:

+
>>> df = pd.DataFrame({
+...     "customer": ["Mary", "Tom", "Lila"],
+...     "order": ["ice coffee", "lemonade", "regular coffee"]
+... })
+>>> df
+  customer           order
+0     Mary      ice coffee
+1      Tom        lemonade
+2     Lila  regular coffee
+
+

Our task is to replace the values ice coffee and regular coffee +in the order column with latte.

+

Example 1 - exact matching (functional usage):

+
>>> df = find_replace(
+...     df,
+...     match="exact",
+...     order={"ice coffee": "latte", "regular coffee": "latte"},
+... )
+>>> df
+  customer     order
+0     Mary     latte
+1      Tom  lemonade
+2     Lila     latte
+
+

Example 1 - exact matching (method chaining):

+
>>> df = df.find_replace(
+...     match="exact",
+...     order={"ice coffee": "latte", "regular coffee": "latte"},
+... )
+>>> df
+  customer     order
+0     Mary     latte
+1      Tom  lemonade
+2     Lila     latte
+
+

Example 2 - Regular-expression-based matching (functional usage):

+
>>> df = find_replace(
+...     df,
+...     match='regex',
+...     order={'coffee$': 'latte'},
+... )
+>>> df
+  customer     order
+0     Mary     latte
+1      Tom  lemonade
+2     Lila     latte
+
+

Example 2 - Regular-expression-based matching (method chaining usage):

+
>>> df = df.find_replace(
+...     match='regex',
+...     order={'coffee$': 'latte'},
+... )
+>>> df
+  customer     order
+0     Mary     latte
+1      Tom  lemonade
+2     Lila     latte
+
+

To perform a find and replace on the entire DataFrame, +pandas' df.replace() function provides the appropriate functionality. +You can find more detail on the replace docs.

+

This function only works with column names that have no spaces +or punctuation in them. +For example, a column name item_name would work with find_replace, +because it is a contiguous string that can be parsed correctly, +but item name would not be parsed correctly by the Python interpreter.

+

If you have column names that might not be compatible, +we recommend calling on clean_names() +as the first method call. If, for whatever reason, that is not possible, +then _find_replace is available as a function +that you can do a pandas pipe call on.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ match + + str + +
+

Whether to perform exact or regular-expression matching. +Valid values are "exact" or "regex".

+
+
+ 'exact' +
+ **mappings + + Any + +
+

keyword arguments corresponding to column names +that have dictionaries passed in indicating what to find (keys) +and what to replace with (values).

+
+
+ {} +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with replaced values.

+
+
+ +
+ Source code in janitor/functions/find_replace.py +
 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `pd.DataFrame.replace` instead."
+    )
+)
+def find_replace(
+    df: pd.DataFrame, match: str = "exact", **mappings: Any
+) -> pd.DataFrame:
+    """Perform a find-and-replace action on provided columns.
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `pd.DataFrame.replace` instead.
+
+    Depending on use case, users can choose either exact, full-value matching,
+    or regular-expression-based fuzzy matching
+    (hence allowing substring matching in the latter case).
+    For strings, the matching is always case sensitive.
+
+    Examples:
+        For instance, given a DataFrame containing orders at a coffee shop:
+
+        >>> df = pd.DataFrame({
+        ...     "customer": ["Mary", "Tom", "Lila"],
+        ...     "order": ["ice coffee", "lemonade", "regular coffee"]
+        ... })
+        >>> df
+          customer           order
+        0     Mary      ice coffee
+        1      Tom        lemonade
+        2     Lila  regular coffee
+
+        Our task is to replace the values `ice coffee` and `regular coffee`
+        in the `order` column with `latte`.
+
+        Example 1 - exact matching (functional usage):
+
+        >>> df = find_replace(
+        ...     df,
+        ...     match="exact",
+        ...     order={"ice coffee": "latte", "regular coffee": "latte"},
+        ... )
+        >>> df
+          customer     order
+        0     Mary     latte
+        1      Tom  lemonade
+        2     Lila     latte
+
+        Example 1 - exact matching (method chaining):
+
+        >>> df = df.find_replace(
+        ...     match="exact",
+        ...     order={"ice coffee": "latte", "regular coffee": "latte"},
+        ... )
+        >>> df
+          customer     order
+        0     Mary     latte
+        1      Tom  lemonade
+        2     Lila     latte
+
+        Example 2 - Regular-expression-based matching (functional usage):
+
+        >>> df = find_replace(
+        ...     df,
+        ...     match='regex',
+        ...     order={'coffee$': 'latte'},
+        ... )
+        >>> df
+          customer     order
+        0     Mary     latte
+        1      Tom  lemonade
+        2     Lila     latte
+
+        Example 2 - Regular-expression-based matching (method chaining usage):
+
+        >>> df = df.find_replace(
+        ...     match='regex',
+        ...     order={'coffee$': 'latte'},
+        ... )
+        >>> df
+          customer     order
+        0     Mary     latte
+        1      Tom  lemonade
+        2     Lila     latte
+
+    To perform a find and replace on the entire DataFrame,
+    pandas' `df.replace()` function provides the appropriate functionality.
+    You can find more detail on the [replace] docs.
+
+    [replace]: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.replace.html
+
+    This function only works with column names that have no spaces
+    or punctuation in them.
+    For example, a column name `item_name` would work with `find_replace`,
+    because it is a contiguous string that can be parsed correctly,
+    but `item name` would not be parsed correctly by the Python interpreter.
+
+    If you have column names that might not be compatible,
+    we recommend calling on [`clean_names()`][janitor.functions.clean_names.clean_names]
+    as the first method call. If, for whatever reason, that is not possible,
+    then `_find_replace` is available as a function
+    that you can do a pandas [pipe](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pipe.html) call on.
+
+    Args:
+        df: A pandas DataFrame.
+        match: Whether to perform exact or regular-expression matching.
+            Valid values are "exact" or "regex".
+        **mappings: keyword arguments corresponding to column names
+            that have dictionaries passed in indicating what to find (keys)
+            and what to replace with (values).
+
+    Returns:
+        A pandas DataFrame with replaced values.
+    """  # noqa: E501
+    for column_name, mapper in mappings.items():
+        df = _find_replace(df, column_name, mapper, match=match)
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ flag_nulls + + +

+ +
+ +

Implementation source for flag_nulls.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ flag_nulls(df, column_name='null_flag', columns=None) + +

+ + +
+ +

Creates a new column to indicate whether you have null values in a given +row.

+

If the columns parameter is not set, looks across the entire +DataFrame, otherwise will look only in the columns you set.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "a": ["w", "x", None, "z"], "b": [5, None, 7, 8],
+... })
+>>> df.flag_nulls()
+      a    b  null_flag
+0     w  5.0          0
+1     x  NaN          1
+2  None  7.0          1
+3     z  8.0          0
+>>> df.flag_nulls(columns="b")
+      a    b  null_flag
+0     w  5.0          0
+1     x  NaN          1
+2  None  7.0          0
+3     z  8.0          0
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

Input pandas DataFrame.

+
+
+ required +
+ column_name + + Optional[Hashable] + +
+

Name for the output column.

+
+
+ 'null_flag' +
+ columns + + Optional[Union[str, Iterable[str], Hashable]] + +
+

List of columns to look at for finding null values. If you +only want to look at one column, you can simply give its name. +If set to None (default), all DataFrame columns are used.

+
+
+ None +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If column_name is already present in the +DataFrame.

+
+
+ ValueError + +
+

If any column within columns is not present in +the DataFrame.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

Input dataframe with the null flag column.

+
+
+ + +
+ Source code in janitor/functions/flag_nulls.py +
12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
@pf.register_dataframe_method
+def flag_nulls(
+    df: pd.DataFrame,
+    column_name: Optional[Hashable] = "null_flag",
+    columns: Optional[Union[str, Iterable[str], Hashable]] = None,
+) -> pd.DataFrame:
+    """Creates a new column to indicate whether you have null values in a given
+    row.
+
+    If the columns parameter is not set, looks across the entire
+    DataFrame, otherwise will look only in the columns you set.
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "a": ["w", "x", None, "z"], "b": [5, None, 7, 8],
+        ... })
+        >>> df.flag_nulls()
+              a    b  null_flag
+        0     w  5.0          0
+        1     x  NaN          1
+        2  None  7.0          1
+        3     z  8.0          0
+        >>> df.flag_nulls(columns="b")
+              a    b  null_flag
+        0     w  5.0          0
+        1     x  NaN          1
+        2  None  7.0          0
+        3     z  8.0          0
+
+    Args:
+        df: Input pandas DataFrame.
+        column_name: Name for the output column.
+        columns: List of columns to look at for finding null values. If you
+            only want to look at one column, you can simply give its name.
+            If set to None (default), all DataFrame columns are used.
+
+    Raises:
+        ValueError: If `column_name` is already present in the
+            DataFrame.
+        ValueError: If any column within `columns` is not present in
+            the DataFrame.
+
+    Returns:
+        Input dataframe with the null flag column.
+
+    <!--
+    # noqa: DAR402
+    -->
+    """
+    # Sort out columns input
+    if isinstance(columns, str):
+        columns = [columns]
+    elif columns is None:
+        columns = df.columns
+    elif not isinstance(columns, Iterable):
+        # catches other hashable types
+        columns = [columns]
+
+    # Input sanitation checks
+    check_column(df, columns)
+    check_column(df, [column_name], present=False)
+
+    # This algorithm works best for n_rows >> n_cols. See issue #501
+    null_array = np.zeros(len(df))
+    for col in columns:
+        null_array = np.logical_or(null_array, pd.isna(df[col]))
+
+    df = df.copy()
+    df[column_name] = null_array.astype(int)
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ get_dupes + + +

+ +
+ +

Implementation of the get_dupes function

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ get_dupes(df, column_names=None) + +

+ + +
+ +

Return all duplicate rows.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+

Method chaining syntax:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "item": ["shoe", "shoe", "bag", "shoe", "bag"],
+...     "quantity": [100, 100, 75, 200, 75],
+... })
+>>> df
+   item  quantity
+0  shoe       100
+1  shoe       100
+2   bag        75
+3  shoe       200
+4   bag        75
+>>> df.get_dupes()
+   item  quantity
+0  shoe       100
+1  shoe       100
+2   bag        75
+4   bag        75
+
+

Optional column_names usage:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "item": ["shoe", "shoe", "bag", "shoe", "bag"],
+...     "quantity": [100, 100, 75, 200, 75],
+... })
+>>> df
+   item  quantity
+0  shoe       100
+1  shoe       100
+2   bag        75
+3  shoe       200
+4   bag        75
+>>> df.get_dupes(column_names=["item"])
+   item  quantity
+0  shoe       100
+1  shoe       100
+2   bag        75
+3  shoe       200
+4   bag        75
+>>> df.get_dupes(column_names=["quantity"])
+   item  quantity
+0  shoe       100
+1  shoe       100
+2   bag        75
+4   bag        75
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

The pandas DataFrame object.

+
+
+ required +
+ column_names + + Optional[Union[str, Iterable[str], Hashable]] + +
+

A column name or an iterable +(list or tuple) of column names. Following pandas API, this only +considers certain columns for identifying duplicates. Defaults +to using all columns.

+
+
+ None +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

The duplicate rows, as a pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/get_dupes.py +
11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
@pf.register_dataframe_method
+@deprecated_alias(columns="column_names")
+def get_dupes(
+    df: pd.DataFrame,
+    column_names: Optional[Union[str, Iterable[str], Hashable]] = None,
+) -> pd.DataFrame:
+    """
+    Return all duplicate rows.
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        Method chaining syntax:
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "item": ["shoe", "shoe", "bag", "shoe", "bag"],
+        ...     "quantity": [100, 100, 75, 200, 75],
+        ... })
+        >>> df
+           item  quantity
+        0  shoe       100
+        1  shoe       100
+        2   bag        75
+        3  shoe       200
+        4   bag        75
+        >>> df.get_dupes()
+           item  quantity
+        0  shoe       100
+        1  shoe       100
+        2   bag        75
+        4   bag        75
+
+        Optional `column_names` usage:
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "item": ["shoe", "shoe", "bag", "shoe", "bag"],
+        ...     "quantity": [100, 100, 75, 200, 75],
+        ... })
+        >>> df
+           item  quantity
+        0  shoe       100
+        1  shoe       100
+        2   bag        75
+        3  shoe       200
+        4   bag        75
+        >>> df.get_dupes(column_names=["item"])
+           item  quantity
+        0  shoe       100
+        1  shoe       100
+        2   bag        75
+        3  shoe       200
+        4   bag        75
+        >>> df.get_dupes(column_names=["quantity"])
+           item  quantity
+        0  shoe       100
+        1  shoe       100
+        2   bag        75
+        4   bag        75
+
+    Args:
+        df: The pandas DataFrame object.
+        column_names: A column name or an iterable
+            (list or tuple) of column names. Following pandas API, this only
+            considers certain columns for identifying duplicates. Defaults
+            to using all columns.
+
+    Returns:
+        The duplicate rows, as a pandas DataFrame.
+    """
+    return df.loc[df.duplicated(subset=column_names, keep=False)]
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ groupby_agg + + +

+ +
+ +

Implementation source for groupby_agg.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ groupby_agg(df, by, new_column_name, agg_column_name, agg, dropna=True) + +

+ + +
+ +

Shortcut for assigning a groupby-transform to a new column.

+

This method does not mutate the original DataFrame.

+

Intended to be the method-chaining equivalent of:

+
df = df.assign(...=df.groupby(...)[...].transform(...))
+
+
+

Note

+

This function will be deprecated in a 1.x release. +Please use +jn.transform_column +instead.

+
+ + +

Examples:

+

Basic usage.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "item": ["shoe", "shoe", "bag", "shoe", "bag"],
+...     "quantity": [100, 120, 75, 200, 25],
+... })
+>>> df.groupby_agg(
+...     by="item",
+...     agg="mean",
+...     agg_column_name="quantity",
+...     new_column_name="avg_quantity",
+... )
+   item  quantity  avg_quantity
+0  shoe       100         140.0
+1  shoe       120         140.0
+2   bag        75          50.0
+3  shoe       200         140.0
+4   bag        25          50.0
+
+

Set dropna=False to compute the aggregation, treating the null +values in the by column as an isolated "group".

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "x": ["a", "a", None, "b"], "y": [9, 9, 9, 9],
+... })
+>>> df.groupby_agg(
+...     by="x",
+...     agg="count",
+...     agg_column_name="y",
+...     new_column_name="y_count",
+...     dropna=False,
+... )
+      x  y  y_count
+0     a  9        2
+1     a  9        2
+2  None  9        1
+3     b  9        1
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ by + + Union[List, Callable, str] + +
+

Column(s) to groupby on, will be passed into DataFrame.groupby.

+
+
+ required +
+ new_column_name + + str + +
+

Name of the aggregation output column.

+
+
+ required +
+ agg_column_name + + str + +
+

Name of the column to aggregate over.

+
+
+ required +
+ agg + + Union[Callable, str] + +
+

How to aggregate.

+
+
+ required +
+ dropna + + bool + +
+

Whether or not to drop null values, if present in the +by column(s), before grouping. Default is True (rows with null values in by are assigned NaN in +the new column).

+
+
+ True +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/groupby_agg.py +
 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
@pf.register_dataframe_method
+@deprecated_alias(new_column="new_column_name", agg_column="agg_column_name")
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `janitor.transform_column` instead."
+    )
+)
+def groupby_agg(
+    df: pd.DataFrame,
+    by: Union[List, Callable, str],
+    new_column_name: str,
+    agg_column_name: str,
+    agg: Union[Callable, str],
+    dropna: bool = True,
+) -> pd.DataFrame:
+    """Shortcut for assigning a groupby-transform to a new column.
+
+    This method does not mutate the original DataFrame.
+
+    Intended to be the method-chaining equivalent of:
+
+    ```python
+    df = df.assign(...=df.groupby(...)[...].transform(...))
+    ```
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use
+        [`jn.transform_column`][janitor.functions.transform_columns.transform_column]
+        instead.
+
+    Examples:
+        Basic usage.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "item": ["shoe", "shoe", "bag", "shoe", "bag"],
+        ...     "quantity": [100, 120, 75, 200, 25],
+        ... })
+        >>> df.groupby_agg(
+        ...     by="item",
+        ...     agg="mean",
+        ...     agg_column_name="quantity",
+        ...     new_column_name="avg_quantity",
+        ... )
+           item  quantity  avg_quantity
+        0  shoe       100         140.0
+        1  shoe       120         140.0
+        2   bag        75          50.0
+        3  shoe       200         140.0
+        4   bag        25          50.0
+
+        Set `dropna=False` to compute the aggregation, treating the null
+        values in the `by` column as an isolated "group".
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "x": ["a", "a", None, "b"], "y": [9, 9, 9, 9],
+        ... })
+        >>> df.groupby_agg(
+        ...     by="x",
+        ...     agg="count",
+        ...     agg_column_name="y",
+        ...     new_column_name="y_count",
+        ...     dropna=False,
+        ... )
+              x  y  y_count
+        0     a  9        2
+        1     a  9        2
+        2  None  9        1
+        3     b  9        1
+
+    Args:
+        df: A pandas DataFrame.
+        by: Column(s) to groupby on, will be passed into `DataFrame.groupby`.
+        new_column_name: Name of the aggregation output column.
+        agg_column_name: Name of the column to aggregate over.
+        agg: How to aggregate.
+        dropna: Whether or not to drop null values, if present in the
+            `by` column(s), before grouping. Default is True (rows with null
+            values in `by` are assigned NaN in the new column).
+
+    Returns:
+        A pandas DataFrame.
+    """  # noqa: E501
+
+    return df.assign(
+        **{
+            new_column_name: df.groupby(by, dropna=dropna)[
+                agg_column_name
+            ].transform(agg),
+        }
+    )
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ groupby_topk + + +

+ +
+ +

Implementation of the groupby_topk function

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ groupby_topk(df, by, column, k, dropna=True, ascending=True, ignore_index=True) + +

+ + +
+ +

Return top k rows from a groupby of a set of columns.

+

Returns a DataFrame that has the top k values per column, +grouped by by. Under the hood it uses nlargest/nsmallest, +for numeric columns, which avoids sorting the entire dataframe, +and is usually more performant. For non-numeric columns, pd.sort_values +is used. +No sorting is done to the by column(s); the order is maintained +in the final output.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame(
+...     {
+...         "age": [20, 23, 22, 43, 21],
+...         "id": [1, 4, 6, 2, 5],
+...         "result": ["pass", "pass", "fail", "pass", "fail"],
+...     }
+... )
+>>> df
+   age  id result
+0   20   1   pass
+1   23   4   pass
+2   22   6   fail
+3   43   2   pass
+4   21   5   fail
+
+

Ascending top 3:

+
>>> df.groupby_topk(by="result", column="age", k=3)
+   age  id result
+0   20   1   pass
+1   23   4   pass
+2   43   2   pass
+3   21   5   fail
+4   22   6   fail
+
+

Descending top 2:

+
>>> df.groupby_topk(
+...     by="result", column="age", k=2, ascending=False, ignore_index=False
+... )
+   age  id result
+3   43   2   pass
+1   23   4   pass
+2   22   6   fail
+4   21   5   fail
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ by + + Union[list, Hashable] + +
+

Column name(s) to group input DataFrame df by.

+
+
+ required +
+ column + + Hashable + +
+

Name of the column that determines k rows +to return.

+
+
+ required +
+ k + + int + +
+

Number of top rows to return for each group.

+
+
+ required +
+ dropna + + bool + +
+

If True, and NA values exist in by, the NA +values are not used in the groupby computation to get the relevant +k rows. If False, and NA values exist in by, then the NA +values are used in the groupby computation to get the relevant +k rows.

+
+
+ True +
+ ascending + + bool + +
+

If True, the smallest top k rows, +determined by column are returned; if False, the largest top k +rows, determined by column are returned.

+
+
+ True +
+ ignore_index + + bool + +
+

If True, the original index is ignored. +If False, the original index for the top k rows is retained.

+
+
+ True +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If k is less than 1.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with top k rows per column, grouped by by.

+
+
+ +
+ Source code in janitor/functions/groupby_topk.py +
 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
@pf.register_dataframe_method
+@deprecated_alias(groupby_column_name="by", sort_column_name="column")
+def groupby_topk(
+    df: pd.DataFrame,
+    by: Union[list, Hashable],
+    column: Hashable,
+    k: int,
+    dropna: bool = True,
+    ascending: bool = True,
+    ignore_index: bool = True,
+) -> pd.DataFrame:
+    """Return top `k` rows from a groupby of a set of columns.
+
+    Returns a DataFrame that has the top `k` values per `column`,
+    grouped by `by`. Under the hood it uses `nlargest/nsmallest`,
+    for numeric columns, which avoids sorting the entire dataframe,
+    and is usually more performant. For non-numeric columns, `pd.sort_values`
+    is used.
+    No sorting is done to the `by` column(s); the order is maintained
+    in the final output.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "age": [20, 23, 22, 43, 21],
+        ...         "id": [1, 4, 6, 2, 5],
+        ...         "result": ["pass", "pass", "fail", "pass", "fail"],
+        ...     }
+        ... )
+        >>> df
+           age  id result
+        0   20   1   pass
+        1   23   4   pass
+        2   22   6   fail
+        3   43   2   pass
+        4   21   5   fail
+
+        Ascending top 3:
+
+        >>> df.groupby_topk(by="result", column="age", k=3)
+           age  id result
+        0   20   1   pass
+        1   23   4   pass
+        2   43   2   pass
+        3   21   5   fail
+        4   22   6   fail
+
+        Descending top 2:
+
+        >>> df.groupby_topk(
+        ...     by="result", column="age", k=2, ascending=False, ignore_index=False
+        ... )
+           age  id result
+        3   43   2   pass
+        1   23   4   pass
+        2   22   6   fail
+        4   21   5   fail
+
+    Args:
+        df: A pandas DataFrame.
+        by: Column name(s) to group input DataFrame `df` by.
+        column: Name of the column that determines `k` rows
+            to return.
+        k: Number of top rows to return for each group.
+        dropna: If `True`, and `NA` values exist in `by`, the `NA`
+            values are not used in the groupby computation to get the relevant
+            `k` rows. If `False`, and `NA` values exist in `by`, then the `NA`
+            values are used in the groupby computation to get the relevant
+            `k` rows.
+        ascending: If `True`, the smallest top `k` rows,
+            determined by `column` are returned; if `False`, the largest top `k`
+            rows, determined by `column` are returned.
+        ignore_index: If `True`, the original index is ignored.
+            If `False`, the original index for the top `k` rows is retained.
+
+    Raises:
+        ValueError: If `k` is less than 1.
+
+    Returns:
+        A pandas DataFrame with top `k` rows per `column`, grouped by `by`.
+    """  # noqa: E501
+
+    if isinstance(by, Hashable):
+        by = [by]
+
+    check("by", by, [Hashable, list])
+
+    check_column(df, [column])
+    check_column(df, by)
+
+    if k < 1:
+        raise ValueError(
+            "Numbers of rows per group "
+            "to be returned must be greater than 0."
+        )
+
+    indices = df.groupby(by=by, dropna=dropna, sort=False, observed=True)
+    indices = indices[column]
+
+    try:
+        if ascending:
+            indices = indices.nsmallest(n=k)
+        else:
+            indices = indices.nlargest(n=k)
+    except TypeError:
+        indices = indices.apply(
+            lambda d: d.sort_values(ascending=ascending).head(k)
+        )
+
+    indices = indices.index.get_level_values(-1)
+    if ignore_index:
+        return df.loc[indices].reset_index(drop=True)
+    return df.loc[indices]
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ impute + + +

+ +
+ +

Implementation of impute function

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ impute(df, column_names, value=None, statistic_column_name=None) + +

+ + +
+ +

Method-chainable imputation of values in a column.

+

This method does not mutate the original DataFrame.

+

Underneath the hood, this function calls the .fillna() method available +to every pandas.Series object.

+

Either one of value or statistic_column_name should be provided.

+

If value is provided, then all null values in the selected column will +take on the value provided.

+

If statistic_column_name is provided, then all null values in the +selected column(s) will take on the summary statistic value +of other non-null values.

+

Column selection in column_names is possible using the +select syntax.

+

Currently supported statistics include:

+
    +
  • mean (also aliased by average)
  • +
  • median
  • +
  • mode
  • +
  • minimum (also aliased by min)
  • +
  • maximum (also aliased by max)
  • +
+ + +

Examples:

+
>>> import numpy as np
+>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "a": [1, 2, 3],
+...     "sales": np.nan,
+...     "score": [np.nan, 3, 2],
+... })
+>>> df
+   a  sales  score
+0  1    NaN    NaN
+1  2    NaN    3.0
+2  3    NaN    2.0
+
+

Imputing null values with 0 (using the value parameter):

+
>>> df.impute(column_names="sales", value=0.0)
+   a  sales  score
+0  1    0.0    NaN
+1  2    0.0    3.0
+2  3    0.0    2.0
+
+

Imputing null values with median (using the statistic_column_name +parameter):

+
>>> df.impute(column_names="score", statistic_column_name="median")
+   a  sales  score
+0  1    NaN    2.5
+1  2    NaN    3.0
+2  3    NaN    2.0
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_names + + Any + +
+

The name of the column(s) on which to impute values.

+
+
+ required +
+ value + + Optional[Any] + +
+

The value used for imputation, passed into .fillna method +of the underlying pandas Series.

+
+
+ None +
+ statistic_column_name + + Optional[str] + +
+

The column statistic to impute.

+
+
+ None +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If both value and statistic_column_name are +provided, or if neither of them is provided.

+
+
+ KeyError + +
+

If statistic_column_name is not one of mean, +average, median, mode, minimum, min, maximum, or +max.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

An imputed pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/impute.py +
 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
@pf.register_dataframe_method
+@deprecated_alias(column="column_name")
+@deprecated_alias(column_name="column_names")
+@deprecated_alias(statistic="statistic_column_name")
+def impute(
+    df: pd.DataFrame,
+    column_names: Any,
+    value: Optional[Any] = None,
+    statistic_column_name: Optional[str] = None,
+) -> pd.DataFrame:
+    """Method-chainable imputation of values in a column.
+
+    This method does not mutate the original DataFrame.
+
+    Underneath the hood, this function calls the `.fillna()` method available
+    to every `pandas.Series` object.
+
+    Either one of `value` or `statistic_column_name` should be provided.
+
+    If `value` is provided, then all null values in the selected column will
+    take on the value provided.
+
+    If `statistic_column_name` is provided, then all null values in the
+    selected column(s) will take on the summary statistic value
+    of other non-null values.
+
+    Column selection in `column_names` is possible using the
+    [`select`][janitor.functions.select.select] syntax.
+
+    Currently supported statistics include:
+
+    - `mean` (also aliased by `average`)
+    - `median`
+    - `mode`
+    - `minimum` (also aliased by `min`)
+    - `maximum` (also aliased by `max`)
+
+    Examples:
+        >>> import numpy as np
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "a": [1, 2, 3],
+        ...     "sales": np.nan,
+        ...     "score": [np.nan, 3, 2],
+        ... })
+        >>> df
+           a  sales  score
+        0  1    NaN    NaN
+        1  2    NaN    3.0
+        2  3    NaN    2.0
+
+        Imputing null values with 0 (using the `value` parameter):
+
+        >>> df.impute(column_names="sales", value=0.0)
+           a  sales  score
+        0  1    0.0    NaN
+        1  2    0.0    3.0
+        2  3    0.0    2.0
+
+        Imputing null values with median (using the `statistic_column_name`
+        parameter):
+
+        >>> df.impute(column_names="score", statistic_column_name="median")
+           a  sales  score
+        0  1    NaN    2.5
+        1  2    NaN    3.0
+        2  3    NaN    2.0
+
+    Args:
+        df: A pandas DataFrame.
+        column_names: The name of the column(s) on which to impute values.
+        value: The value used for imputation, passed into `.fillna` method
+            of the underlying pandas Series.
+        statistic_column_name: The column statistic to impute.
+
+    Raises:
+        ValueError: If both `value` and `statistic_column_name` are
+            provided.
+        KeyError: If `statistic_column_name` is not one of `mean`,
+            `average`, `median`, `mode`, `minimum`, `min`, `maximum`, or
+            `max`.
+
+    Returns:
+        An imputed pandas DataFrame.
+    """
+    # First, check that exactly one of `value` or `statistic_column_name`
+    # is provided.
+    if (value is None) and (statistic_column_name is None):
+        raise ValueError("Kindly specify a value or a statistic_column_name")
+
+    if value is not None and statistic_column_name is not None:
+        raise ValueError(
+            "Only one of `value` or `statistic_column_name` should be "
+            "provided."
+        )
+
+    column_names = get_index_labels([column_names], df, axis="columns")
+
+    if value is not None:
+        value = dict(product(column_names, [value]))
+
+    else:
+        # If statistic is provided, then we compute
+        # the relevant summary statistic
+        # from the other data.
+        funcs = {
+            "mean": "mean",
+            "average": "mean",  # aliased
+            "median": "median",
+            "mode": "mode",
+            "minimum": "min",
+            "min": "min",  # aliased
+            "maximum": "max",
+            "max": "max",  # aliased
+        }
+        # Check that the statistic keyword argument is one of the approved.
+        if statistic_column_name not in funcs:
+            raise KeyError(
+                f"`statistic_column_name` must be one of {funcs.keys()}."
+            )
+
+        value = dict(product(column_names, [funcs[statistic_column_name]]))
+
+        value = df.agg(value)
+
+        # special treatment for mode
+        if statistic_column_name == "mode":
+            value = {key: val.at[0] for key, val in value.items()}
+
+    return df.fillna(value=value)
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ jitter + + +

+ +
+ +

Implementation of the jitter function.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ jitter(df, column_name, dest_column_name, scale, clip=None, random_state=None) + +

+ + +
+ +

Adds Gaussian noise (jitter) to the values of a column.

+

A new column will be created containing the values of the original column +with Gaussian noise added. +For each value in the column, a Gaussian distribution is created +having a location (mean) equal to the value +and a scale (standard deviation) equal to scale. +A random value is then sampled from this distribution, +which is the jittered value. +If a tuple is supplied for clip, +then any values of the new column less than clip[0] +will be set to clip[0], +and any values greater than clip[1] will be set to clip[1]. +Additionally, if a numeric value is supplied for random_state, +this value will be used to set the random seed used for sampling. +NaN values are ignored in this method.

+

This method mutates the original DataFrame.

+ + +

Examples:

+
>>> import numpy as np
+>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": [3, 4, 5, np.nan]})
+>>> df
+     a
+0  3.0
+1  4.0
+2  5.0
+3  NaN
+>>> df.jitter("a", dest_column_name="a_jit", scale=1, random_state=42)
+     a     a_jit
+0  3.0  3.496714
+1  4.0  3.861736
+2  5.0  5.647689
+3  NaN       NaN
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + Hashable + +
+

Name of the column containing +values to add Gaussian jitter to.

+
+
+ required +
+ dest_column_name + + str + +
+

The name of the new column containing the +jittered values that will be created.

+
+
+ required +
+ scale + + number + +
+

A positive value used as the scale (standard deviation) of the +Gaussian distribution to sample from. Must be greater than zero; +a value of zero or less raises a ValueError.

+
+
+ required +
+ clip + + Optional[Iterable[number]] + +
+

An iterable of two values (minimum and maximum) to clip +the jittered values to. Defaults to None.

+
+
+ None +
+ random_state + + Optional[number] + +
+

An integer or 1-d array value used to set the random +seed. Defaults to None.

+
+
+ None +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + + + + + + + + + +
TypeDescription
+ TypeError + +
+

If column_name is not numeric.

+
+
+ ValueError + +
+

If scale is not a numerical value +greater than 0.

+
+
+ ValueError + +
+

If clip is not an iterable of length 2.

+
+
+ ValueError + +
+

If clip[0] is greater than clip[1].

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with a new column containing +Gaussian-jittered values from another column.

+
+
+ +
+ Source code in janitor/functions/jitter.py +
 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
@pf.register_dataframe_method
+def jitter(
+    df: pd.DataFrame,
+    column_name: Hashable,
+    dest_column_name: str,
+    scale: np.number,
+    clip: Optional[Iterable[np.number]] = None,
+    random_state: Optional[np.number] = None,
+) -> pd.DataFrame:
+    """Adds Gaussian noise (jitter) to the values of a column.
+
+    A new column will be created containing the values of the original column
+    with Gaussian noise added.
+    For each value in the column, a Gaussian distribution is created
+    having a location (mean) equal to the value
+    and a scale (standard deviation) equal to `scale`.
+    A random value is then sampled from this distribution,
+    which is the jittered value.
+    If a tuple is supplied for `clip`,
+    then any values of the new column less than `clip[0]`
+    will be set to `clip[0]`,
+    and any values greater than `clip[1]` will be set to `clip[1]`.
+    Additionally, if a numeric value is supplied for `random_state`,
+    this value will be used to set the random seed used for sampling.
+    NaN values are ignored in this method.
+
+    This method mutates the original DataFrame.
+
+    Examples:
+        >>> import numpy as np
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": [3, 4, 5, np.nan]})
+        >>> df
+             a
+        0  3.0
+        1  4.0
+        2  5.0
+        3  NaN
+        >>> df.jitter("a", dest_column_name="a_jit", scale=1, random_state=42)
+             a     a_jit
+        0  3.0  3.496714
+        1  4.0  3.861736
+        2  5.0  5.647689
+        3  NaN       NaN
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: Name of the column containing
+            values to add Gaussian jitter to.
+        dest_column_name: The name of the new column containing the
+            jittered values that will be created.
+        scale: A positive value used as the scale (standard deviation)
+            of the Gaussian distribution to sample from. Must be greater
+            than zero; a value of zero or less raises a `ValueError`.
+        clip: An iterable of two values (minimum and maximum) to clip
+            the jittered values to, default to None.
+        random_state: An integer or 1-d array value used to set the random
+            seed, default to None.
+
+    Raises:
+        TypeError: If `column_name` is not numeric.
+        ValueError: If `scale` is not a numerical value
+            greater than `0`.
+        ValueError: If `clip` is not an iterable of length `2`.
+        ValueError: If `clip[0]` is greater than `clip[1]`.
+
+    Returns:
+        A pandas DataFrame with a new column containing
+            Gaussian-jittered values from another column.
+    """
+
+    # Check types
+    check("scale", scale, [int, float])
+
+    # Check that `column_name` is a numeric column
+    if not np.issubdtype(df[column_name].dtype, np.number):
+        raise TypeError(f"{column_name} must be a numeric column.")
+
+    if scale <= 0:
+        raise ValueError("`scale` must be a numeric value greater than 0.")
+    values = df[column_name]
+    if random_state is not None:
+        np.random.seed(random_state)
+    result = np.random.normal(loc=values, scale=scale)
+    if clip:
+        # Ensure `clip` has length 2
+        if len(clip) != 2:
+            raise ValueError("`clip` must be an iterable of length 2.")
+        # Ensure the values in `clip` are ordered as min, max
+        if clip[1] < clip[0]:
+            raise ValueError(
+                "`clip[0]` must be less than or equal to `clip[1]`."
+            )
+        result = np.clip(result, *clip)
+    df[dest_column_name] = result
+
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ join_apply + + +

+ +
+ +

Implementation of the join_apply function

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ join_apply(df, func, new_column_name) + +

+ + +
+ +

Join the result of applying a function across dataframe rows.

+

This method does not mutate the original DataFrame.

+

This is a convenience function that allows us to apply arbitrary functions +that take any combination of information from any of the columns. The only +requirement is that the function signature takes in a row from the +DataFrame.

+ + +

Examples:

+

Sum the result of two columns into a new column.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a":[1, 2, 3], "b": [2, 3, 4]})
+>>> df
+   a  b
+0  1  2
+1  2  3
+2  3  4
+>>> df.join_apply(
+...     func=lambda x: 2 * x["a"] + x["b"],
+...     new_column_name="2a+b",
+... )
+   a  b  2a+b
+0  1  2     4
+1  2  3     7
+2  3  4    10
+
+

Incorporating conditionals in func.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": [1, 2, 3], "b": [20, 30, 40]})
+>>> df
+   a   b
+0  1  20
+1  2  30
+2  3  40
+>>> def take_a_if_even(x):
+...     if x["a"] % 2 == 0:
+...         return x["a"]
+...     else:
+...         return x["b"]
+>>> df.join_apply(take_a_if_even, "a_if_even")
+   a   b  a_if_even
+0  1  20         20
+1  2  30          2
+2  3  40         40
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ func + + Callable + +
+

A function that is applied elementwise across all rows of the +DataFrame.

+
+
+ required +
+ new_column_name + + str + +
+

Name of the resulting column.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with new column appended.

+
+
+ +
+ Source code in janitor/functions/join_apply.py +
 9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
@pf.register_dataframe_method
+def join_apply(
+    df: pd.DataFrame,
+    func: Callable,
+    new_column_name: str,
+) -> pd.DataFrame:
+    """Join the result of applying a function across dataframe rows.
+
+    This method does not mutate the original DataFrame.
+
+    This is a convenience function that allows us to apply arbitrary functions
+    that take any combination of information from any of the columns. The only
+    requirement is that the function signature takes in a row from the
+    DataFrame.
+
+    Examples:
+        Sum the result of two columns into a new column.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a":[1, 2, 3], "b": [2, 3, 4]})
+        >>> df
+           a  b
+        0  1  2
+        1  2  3
+        2  3  4
+        >>> df.join_apply(
+        ...     func=lambda x: 2 * x["a"] + x["b"],
+        ...     new_column_name="2a+b",
+        ... )
+           a  b  2a+b
+        0  1  2     4
+        1  2  3     7
+        2  3  4    10
+
+        Incorporating conditionals in `func`.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": [1, 2, 3], "b": [20, 30, 40]})
+        >>> df
+           a   b
+        0  1  20
+        1  2  30
+        2  3  40
+        >>> def take_a_if_even(x):
+        ...     if x["a"] % 2 == 0:
+        ...         return x["a"]
+        ...     else:
+        ...         return x["b"]
+        >>> df.join_apply(take_a_if_even, "a_if_even")
+           a   b  a_if_even
+        0  1  20         20
+        1  2  30          2
+        2  3  40         40
+
+    Args:
+        df: A pandas DataFrame.
+        func: A function that is applied elementwise across all rows of the
+            DataFrame.
+        new_column_name: Name of the resulting column.
+
+    Returns:
+        A pandas DataFrame with new column appended.
+    """  # noqa: E501
+    df = df.copy().join(df.apply(func, axis=1).rename(new_column_name))
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ label_encode + + +

+ +
+ +

Implementation of label_encode function

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ label_encode(df, column_names) + +

+ + +
+ +

Convert labels into numerical data.

+

This method will create a new column with the string _enc appended +after the original column's name. +Consider this to be syntactic sugar. +This function uses the factorize pandas function under the hood.

+

This method behaves differently from +encode_categorical. +This method creates a new column of numeric data. +encode_categorical +replaces the dtype of the original column with a categorical dtype.

+

This method mutates the original DataFrame.

+
+

Note

+

This function will be deprecated in a 1.x release. +Please use factorize_columns +instead.

+
+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "foo": ["b", "b", "a", "c", "b"],
+...     "bar": range(4, 9),
+... })
+>>> df
+  foo  bar
+0   b    4
+1   b    5
+2   a    6
+3   c    7
+4   b    8
+>>> df.label_encode(column_names="foo")
+  foo  bar  foo_enc
+0   b    4        0
+1   b    5        0
+2   a    6        1
+3   c    7        2
+4   b    8        0
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

The pandas DataFrame object.

+
+
+ required +
+ column_names + + Union[str, Iterable[str], Hashable] + +
+

A column name or an iterable (list +or tuple) of column names.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/label_encode.py +
13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `janitor.factorize_columns` instead."
+    )
+)
+@deprecated_alias(columns="column_names")
+def label_encode(
+    df: pd.DataFrame,
+    column_names: Union[str, Iterable[str], Hashable],
+) -> pd.DataFrame:
+    """Convert labels into numerical data.
+
+    This method will create a new column with the string `_enc` appended
+    after the original column's name.
+    Consider this to be syntactic sugar.
+    This function uses the `factorize` pandas function under the hood.
+
+    This method behaves differently from
+    [`encode_categorical`][janitor.functions.encode_categorical.encode_categorical].
+    This method creates a new column of numeric data.
+    [`encode_categorical`][janitor.functions.encode_categorical.encode_categorical]
+    replaces the dtype of the original column with a *categorical* dtype.
+
+    This method mutates the original DataFrame.
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use [`factorize_columns`][janitor.functions.factorize_columns.factorize_columns]
+        instead.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "foo": ["b", "b", "a", "c", "b"],
+        ...     "bar": range(4, 9),
+        ... })
+        >>> df
+          foo  bar
+        0   b    4
+        1   b    5
+        2   a    6
+        3   c    7
+        4   b    8
+        >>> df.label_encode(column_names="foo")
+          foo  bar  foo_enc
+        0   b    4        0
+        1   b    5        0
+        2   a    6        1
+        3   c    7        2
+        4   b    8        0
+
+    Args:
+        df: The pandas DataFrame object.
+        column_names: A column name or an iterable (list
+            or tuple) of column names.
+
+    Returns:
+        A pandas DataFrame.
+    """  # noqa: E501
+    warnings.warn(
+        "`label_encode` will be deprecated in a 1.x release. "
+        "Please use `factorize_columns` instead."
+    )
+    df = _factorize(df, column_names, "_enc")
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ limit_column_characters + + +

+ +
+ +

Implementation of limit_column_characters.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ limit_column_characters(df, column_length, col_separator='_') + +

+ + +
+ +

Truncate column sizes to a specific length.

+

This method mutates the original DataFrame.

+

Method chaining will truncate all columns to a given length and append +a given separator character with the index of duplicate columns, except +for the first distinct column name.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> data_dict = {
+...     "really_long_name": [9, 8, 7],
+...     "another_really_long_name": [2, 4, 6],
+...     "another_really_longer_name": list("xyz"),
+...     "this_is_getting_out_of_hand": list("pqr"),
+... }
+>>> df = pd.DataFrame(data_dict)
+>>> df
+   really_long_name  another_really_long_name another_really_longer_name this_is_getting_out_of_hand
+0                 9                         2                          x                           p
+1                 8                         4                          y                           q
+2                 7                         6                          z                           r
+>>> df.limit_column_characters(7)
+   really_  another another_1 this_is
+0        9        2         x       p
+1        8        4         y       q
+2        7        6         z       r
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_length + + int + +
+

Character length to which all column names are truncated. +The column separator and the number appended to duplicate column names do +not count toward this length. Therefore, if all columns are truncated to 10 +characters, the first distinct column will be 10 characters and the +remaining will be 12 characters (assuming a column separator of one +character).

+
+
+ required +
+ col_separator + + str + +
+

The separator to use for counting distinct column +values, for example, '_' or '.'. +Supply an empty string (i.e. '') to remove the separator.

+
+
+ '_' +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with truncated column lengths.

+
+
+ +
+ Source code in janitor/functions/limit_column_characters.py +
 9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
@pf.register_dataframe_method
+def limit_column_characters(
+    df: pd.DataFrame,
+    column_length: int,
+    col_separator: str = "_",
+) -> pd.DataFrame:
+    """Truncate column sizes to a specific length.
+
+    This method mutates the original DataFrame.
+
+    Method chaining will truncate all columns to a given length and append
+    a given separator character with the index of duplicate columns, except
+    for the first distinct column name.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> data_dict = {
+        ...     "really_long_name": [9, 8, 7],
+        ...     "another_really_long_name": [2, 4, 6],
+        ...     "another_really_longer_name": list("xyz"),
+        ...     "this_is_getting_out_of_hand": list("pqr"),
+        ... }
+        >>> df = pd.DataFrame(data_dict)
+        >>> df  # doctest: +SKIP
+           really_long_name  another_really_long_name another_really_longer_name this_is_getting_out_of_hand
+        0                 9                         2                          x                           p
+        1                 8                         4                          y                           q
+        2                 7                         6                          z                           r
+        >>> df.limit_column_characters(7)
+           really_  another another_1 this_is
+        0        9        2         x       p
+        1        8        4         y       q
+        2        7        6         z       r
+
+    Args:
+        df: A pandas DataFrame.
+        column_length: Character length for which to truncate all columns.
+            The column separator value and number for duplicate column name does
+            not contribute. Therefore, if all columns are truncated to 10
+            characters, the first distinct column will be 10 characters and the
+            remaining will be 12 characters (assuming a column separator of one
+            character).
+        col_separator: The separator to use for counting distinct column
+            values, for example, `'_'` or `'.'`.
+            Supply an empty string (i.e. `''`) to remove the separator.
+
+    Returns:
+        A pandas DataFrame with truncated column lengths.
+    """  # noqa: E501
+
+    check("column_length", column_length, [int])
+    check("col_separator", col_separator, [str])
+
+    col_names = df.columns
+    col_names = [col_name[:column_length] for col_name in col_names]
+
+    col_name_set = set(col_names)
+    col_name_count = {}
+
+    # If no columns are duplicates, we can skip the loops below.
+    if len(col_name_set) == len(col_names):
+        df.columns = col_names
+        return df
+
+    for col_name_to_check in col_name_set:
+        count = 0
+        for idx, col_name in enumerate(col_names):
+            if col_name_to_check == col_name:
+                col_name_count[idx] = count
+                count += 1
+
+    final_col_names = []
+    for idx, col_name in enumerate(col_names):
+        if col_name_count[idx] > 0:
+            col_name_to_append = (
+                col_name + col_separator + str(col_name_count[idx])
+            )
+            final_col_names.append(col_name_to_append)
+        else:
+            final_col_names.append(col_name)
+
+    df.columns = final_col_names
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ min_max_scale + + +

+ +
+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ min_max_scale(df, feature_range=(0, 1), column_name=None, jointly=False) + +

+ + +
+ +

Scales DataFrame to between a minimum and maximum value.

+

One can optionally set a new target minimum and maximum value +using the feature_range keyword argument.

+

If column_name is specified, then only that column(s) of data is scaled. +Otherwise, the entire dataframe is scaled. +If jointly is True, the columns given by column_name (or the entire dataframe) will +be recognized as the one to jointly scale. Otherwise, each column of data +will be scaled separately.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
+>>> df.min_max_scale()
+     a    b
+0  0.0  0.0
+1  1.0  1.0
+>>> df.min_max_scale(jointly=True)
+     a    b
+0  0.5  0.0
+1  1.0  0.5
+
+

Setting custom minimum and maximum.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
+>>> df.min_max_scale(feature_range=(0, 100))
+       a      b
+0    0.0    0.0
+1  100.0  100.0
+>>> df.min_max_scale(feature_range=(0, 100), jointly=True)
+       a     b
+0   50.0   0.0
+1  100.0  50.0
+
+

Apply min-max to the selected columns.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1], 'c': [1, 0]})
+>>> df.min_max_scale(
+...     feature_range=(0, 100),
+...     column_name=["a", "c"],
+... )
+       a  b      c
+0    0.0  0  100.0
+1  100.0  1    0.0
+>>> df.min_max_scale(
+...     feature_range=(0, 100),
+...     column_name=["a", "c"],
+...     jointly=True,
+... )
+       a  b     c
+0   50.0  0  50.0
+1  100.0  1   0.0
+>>> df.min_max_scale(feature_range=(0, 100), column_name='a')
+       a  b  c
+0    0.0  0  1
+1  100.0  1  0
+
+

The aforementioned example might be applied to something like scaling the +isoelectric points of amino acids. While technically they range from +approx 3-10, we can also think of them on the pH scale which ranges from +1 to 14. Hence, 3 gets scaled not to 0 but approx. 0.15 instead, while 10 +gets scaled to approx. 0.69 instead.

+
+

Version Changed

+
    +
  • 0.24.0
      +
    • Deleted old_min, old_max, new_min, and new_max options.
    • +
    • Added feature_range, and jointly options.
    • +
    +
  • +
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ feature_range + + tuple[int | float, int | float] + +
+

Desired range of transformed data.

+
+
+ (0, 1) +
+ column_name + + str | int | list[str | int] | Index + +
+

The column on which to perform scaling.

+
+
+ None +
+ jointly + + bool + +
+

Scale the entire data if True.

+
+
+ False +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If feature_range isn't a tuple or list.

+
+
+ ValueError + +
+

If the length of feature_range isn't equal to two.

+
+
+ ValueError + +
+

If any element of feature_range isn't a number.

+
+
+ ValueError + +
+

If feature_range[1] <= feature_range[0].

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with scaled data.

+
+
+ +
+ Source code in janitor/functions/min_max_scale.py +
  9
+ 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
@pf.register_dataframe_method
+@deprecated_kwargs(
+    "old_min",
+    "old_max",
+    "new_min",
+    "new_max",
+    message=(
+        "The keyword argument {argument!r} of {func_name!r} is deprecated. "
+        "Please use 'feature_range' instead."
+    ),
+)
+@deprecated_alias(col_name="column_name")
+def min_max_scale(
+    df: pd.DataFrame,
+    feature_range: tuple[int | float, int | float] = (0, 1),
+    column_name: str | int | list[str | int] | pd.Index = None,
+    jointly: bool = False,
+) -> pd.DataFrame:
+    """Scales DataFrame to between a minimum and maximum value.
+
+    One can optionally set a new target **minimum** and **maximum** value
+    using the `feature_range` keyword argument.
+
+    If `column_name` is specified, then only that column(s) of data is scaled.
+    Otherwise, the entire dataframe is scaled.
+    If `jointly` is `True`, the `column_names` provided entire dataframe will
+    be regnozied as the one to jointly scale. Otherwise, each column of data
+    will be scaled separately.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
+        >>> df.min_max_scale()
+             a    b
+        0  0.0  0.0
+        1  1.0  1.0
+        >>> df.min_max_scale(jointly=True)
+             a    b
+        0  0.5  0.0
+        1  1.0  0.5
+
+        Setting custom minimum and maximum.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
+        >>> df.min_max_scale(feature_range=(0, 100))
+               a      b
+        0    0.0    0.0
+        1  100.0  100.0
+        >>> df.min_max_scale(feature_range=(0, 100), jointly=True)
+               a     b
+        0   50.0   0.0
+        1  100.0  50.0
+
+        Apply min-max to the selected columns.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1], 'c': [1, 0]})
+        >>> df.min_max_scale(
+        ...     feature_range=(0, 100),
+        ...     column_name=["a", "c"],
+        ... )
+               a  b      c
+        0    0.0  0  100.0
+        1  100.0  1    0.0
+        >>> df.min_max_scale(
+        ...     feature_range=(0, 100),
+        ...     column_name=["a", "c"],
+        ...     jointly=True,
+        ... )
+               a  b     c
+        0   50.0  0  50.0
+        1  100.0  1   0.0
+        >>> df.min_max_scale(feature_range=(0, 100), column_name='a')
+               a  b  c
+        0    0.0  0  1
+        1  100.0  1  0
+
+        The aforementioned example might be applied to something like scaling the
+        isoelectric points of amino acids. While technically they range from
+        approx 3-10, we can also think of them on the pH scale which ranges from
+        1 to 14. Hence, 3 gets scaled not to 0 but approx. 0.15 instead, while 10
+        gets scaled to approx. 0.69 instead.
+
+    !!! summary "Version Changed"
+
+        - 0.24.0
+            - Deleted `old_min`, `old_max`, `new_min`, and `new_max` options.
+            - Added `feature_range`, and `jointly` options.
+
+    Args:
+        df: A pandas DataFrame.
+        feature_range: Desired range of transformed data.
+        column_name: The column on which to perform scaling.
+        jointly: Scale the entire data if True.
+
+    Raises:
+        ValueError: If `feature_range` isn't tuple type.
+        ValueError: If the length of `feature_range` isn't equal to two.
+        ValueError: If the element of `feature_range` isn't number type.
+        ValueError: If `feature_range[1]` <= `feature_range[0]`.
+
+    Returns:
+        A pandas DataFrame with scaled data.
+    """  # noqa: E501
+
+    if not (
+        isinstance(feature_range, (tuple, list))
+        and len(feature_range) == 2
+        and all((isinstance(i, (int, float))) for i in feature_range)
+        and feature_range[1] > feature_range[0]
+    ):
+        raise ValueError(
+            "`feature_range` should be a range type contains number element, "
+            "the first element must be greater than the second one"
+        )
+
+    if column_name is not None:
+        df = df.copy()  # Avoid to change the original DataFrame.
+
+        old_feature_range = df[column_name].pipe(_min_max_value, jointly)
+        df[column_name] = df[column_name].pipe(
+            _apply_min_max,
+            *old_feature_range,
+            *feature_range,
+        )
+    else:
+        old_feature_range = df.pipe(_min_max_value, jointly)
+        df = df.pipe(
+            _apply_min_max,
+            *old_feature_range,
+            *feature_range,
+        )
+
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ move + + +

+ +
+ +

Implementation of move.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ move(df, source, target=None, position='before', axis=0) + +

+ + +
+ +

Changes rows or columns positions in the dataframe.

+

It uses the +select syntax, +making it easy to move blocks of rows or columns at once.

+

This operation does not reset the index of the dataframe. User must +explicitly do so.

+

The dataframe must have unique column names or indices.

+ + +

Examples:

+

Move a row:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": [2, 4, 6, 8], "b": list("wxyz")})
+>>> df
+   a  b
+0  2  w
+1  4  x
+2  6  y
+3  8  z
+>>> df.move(source=0, target=3, position="before", axis=0)
+   a  b
+1  4  x
+2  6  y
+0  2  w
+3  8  z
+
+

Move a column:

+
>>> import pandas as pd
+>>> import janitor
+>>> data = [{"a": 1, "b": 1, "c": 1,
+...          "d": "a", "e": "a","f": "a"}]
+>>> df = pd.DataFrame(data)
+>>> df
+   a  b  c  d  e  f
+0  1  1  1  a  a  a
+>>> df.move(source="a", target="c", position="after", axis=1)
+   b  c  a  d  e  f
+0  1  1  1  a  a  a
+>>> df.move(source="f", target="b", position="before", axis=1)
+   a  f  b  c  d  e
+0  1  a  1  1  a  a
+>>> df.move(source="a", target=None, position="after", axis=1)
+   b  c  d  e  f  a
+0  1  1  a  a  a  1
+
+

Move columns:

+
>>> from pandas.api.types import is_numeric_dtype, is_string_dtype
+>>> df.move(source=is_string_dtype, target=None, position="before", axis=1)
+   d  e  f  a  b  c
+0  a  a  a  1  1  1
+>>> df.move(source=is_numeric_dtype, target=None, position="after", axis=1)
+   d  e  f  a  b  c
+0  a  a  a  1  1  1
+>>> df.move(source = ["d", "f"], target=is_numeric_dtype, position="before", axis=1)
+   d  f  a  b  c  e
+0  a  a  1  1  1  a
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

The pandas DataFrame object.

+
+
+ required +
+ source + + Any + +
+

Columns or rows to move.

+
+
+ required +
+ target + + Any + +
+

Columns or rows to move adjacent to. +If None and position == 'before', source +is moved to the beginning; if position == 'after', +source is moved to the end.

+
+
+ None +
+ position + + str + +
+

Specifies the destination of the columns/rows. +Values can be either before or after; defaults to before.

+
+
+ 'before' +
+ axis + + int + +
+

Axis along which the function is applied. 0 to move along +the index, 1 to move along the columns.

+
+
+ 0 +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If axis is not 0 or 1.

+
+
+ ValueError + +
+

If position is not before or after.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

The dataframe with the rows or columns moved.

+
+
+ +
+ Source code in janitor/functions/move.py +
 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
@pf.register_dataframe_method
+def move(
+    df: pd.DataFrame,
+    source: Any,
+    target: Any = None,
+    position: str = "before",
+    axis: int = 0,
+) -> pd.DataFrame:
+    """Changes rows or columns positions in the dataframe.
+
+    It uses the
+    [`select`][janitor.functions.select.select] syntax,
+    making it easy to move blocks of rows or columns at once.
+
+    This operation does not reset the index of the dataframe. User must
+    explicitly do so.
+
+    The dataframe must have unique column names or indices.
+
+    Examples:
+        Move a row:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": [2, 4, 6, 8], "b": list("wxyz")})
+        >>> df
+           a  b
+        0  2  w
+        1  4  x
+        2  6  y
+        3  8  z
+        >>> df.move(source=0, target=3, position="before", axis=0)
+           a  b
+        1  4  x
+        2  6  y
+        0  2  w
+        3  8  z
+
+        Move a column:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> data = [{"a": 1, "b": 1, "c": 1,
+        ...          "d": "a", "e": "a","f": "a"}]
+        >>> df = pd.DataFrame(data)
+        >>> df
+           a  b  c  d  e  f
+        0  1  1  1  a  a  a
+        >>> df.move(source="a", target="c", position="after", axis=1)
+           b  c  a  d  e  f
+        0  1  1  1  a  a  a
+        >>> df.move(source="f", target="b", position="before", axis=1)
+           a  f  b  c  d  e
+        0  1  a  1  1  a  a
+        >>> df.move(source="a", target=None, position="after", axis=1)
+           b  c  d  e  f  a
+        0  1  1  a  a  a  1
+
+        Move columns:
+        >>> from pandas.api.types import is_numeric_dtype, is_string_dtype
+        >>> df.move(source=is_string_dtype, target=None, position="before", axis=1)
+           d  e  f  a  b  c
+        0  a  a  a  1  1  1
+        >>> df.move(source=is_numeric_dtype, target=None, position="after", axis=1)
+           d  e  f  a  b  c
+        0  a  a  a  1  1  1
+        >>> df.move(source = ["d", "f"], target=is_numeric_dtype, position="before", axis=1)
+           d  f  a  b  c  e
+        0  a  a  1  1  1  a
+
+    Args:
+        df: The pandas DataFrame object.
+        source: Columns or rows to move.
+        target: Columns or rows to move adjacent to.
+            If `None` and `position == 'before'`, `source`
+            is moved to the beginning; if `position == 'after'`,
+            `source` is moved to the end.
+        position: Specifies the destination of the columns/rows.
+            Values can be either `before` or `after`; defaults to `before`.
+        axis: Axis along which the function is applied. 0 to move along
+            the index, 1 to move along the columns.
+
+    Raises:
+        ValueError: If `axis` is not `0` or `1`.
+        ValueError: If `position` is not `before` or `after`.
+
+    Returns:
+        The dataframe with the Series moved.
+    """  # noqa: E501
+    if axis not in [0, 1]:
+        raise ValueError(f"Invalid axis '{axis}'. Can only be 0 or 1.")
+
+    if position not in ["before", "after"]:
+        raise ValueError(
+            f"Invalid position '{position}'. Can only be 'before' or 'after'."
+        )
+
+    mapping = {0: "index", 1: "columns"}
+    names = getattr(df, mapping[axis])
+
+    assert names.is_unique
+
+    index = np.arange(names.size)
+    source = _select_index([source], df, mapping[axis])
+    source = _index_converter(source, index)
+    if target is None:
+        if position == "after":
+            target = np.array([names.size])
+        else:
+            target = np.array([0])
+    else:
+        target = _select_index([target], df, mapping[axis])
+        target = _index_converter(target, index)
+    index = np.delete(index, source)
+
+    if position == "before":
+        position = index.searchsorted(target[0])
+    else:
+        position = index.searchsorted(target[-1]) + 1
+    start = index[:position]
+    end = index[position:]
+    position = np.concatenate([start, source, end])
+
+    return df.iloc(axis=axis)[position]
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ pivot + + +

+ +
+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ pivot_longer(df, index=None, column_names=None, names_to=None, values_to='value', column_level=None, names_sep=None, names_pattern=None, names_transform=None, dropna=False, sort_by_appearance=False, ignore_index=True) + +

+ + +
+ +

Unpivots a DataFrame from wide to long format.

+

This method does not mutate the original DataFrame.

+

It is modeled after the pivot_longer function in R's tidyr package, +and also takes inspiration from R's data.table package.

+

This function is useful to massage a DataFrame into a format where +one or more columns are considered measured variables, and all other +columns are considered as identifier variables.

+

All measured variables are unpivoted (and typically duplicated) along the +row axis.

+

Column selection in index and column_names is possible using the +select syntax.

+

For more granular control on the unpivoting, have a look at +pivot_longer_spec.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame(
+...     {
+...         "Sepal.Length": [5.1, 5.9],
+...         "Sepal.Width": [3.5, 3.0],
+...         "Petal.Length": [1.4, 5.1],
+...         "Petal.Width": [0.2, 1.8],
+...         "Species": ["setosa", "virginica"],
+...     }
+... )
+>>> df
+   Sepal.Length  Sepal.Width  Petal.Length  Petal.Width    Species
+0           5.1          3.5           1.4          0.2     setosa
+1           5.9          3.0           5.1          1.8  virginica
+
+

Replicate pandas' melt:

+
>>> df.pivot_longer(index = 'Species')
+     Species      variable  value
+0     setosa  Sepal.Length    5.1
+1  virginica  Sepal.Length    5.9
+2     setosa   Sepal.Width    3.5
+3  virginica   Sepal.Width    3.0
+4     setosa  Petal.Length    1.4
+5  virginica  Petal.Length    5.1
+6     setosa   Petal.Width    0.2
+7  virginica   Petal.Width    1.8
+
+

Convenient, flexible column selection in the index via the +select syntax:

+
>>> from pandas.api.types import is_string_dtype
+>>> df.pivot_longer(index = is_string_dtype)
+     Species      variable  value
+0     setosa  Sepal.Length    5.1
+1  virginica  Sepal.Length    5.9
+2     setosa   Sepal.Width    3.5
+3  virginica   Sepal.Width    3.0
+4     setosa  Petal.Length    1.4
+5  virginica  Petal.Length    5.1
+6     setosa   Petal.Width    0.2
+7  virginica   Petal.Width    1.8
+
+

Split the column labels into individual columns:

+
>>> df.pivot_longer(
+...     index = 'Species',
+...     names_to = ('part', 'dimension'),
+...     names_sep = '.',
+...     sort_by_appearance = True,
+... )
+     Species   part dimension  value
+0     setosa  Sepal    Length    5.1
+1     setosa  Sepal     Width    3.5
+2     setosa  Petal    Length    1.4
+3     setosa  Petal     Width    0.2
+4  virginica  Sepal    Length    5.9
+5  virginica  Sepal     Width    3.0
+6  virginica  Petal    Length    5.1
+7  virginica  Petal     Width    1.8
+
+

Retain parts of the column names as headers:

+
>>> df.pivot_longer(
+...     index = 'Species',
+...     names_to = ('part', '.value'),
+...     names_sep = '.',
+...     sort_by_appearance = True,
+... )
+     Species   part  Length  Width
+0     setosa  Sepal     5.1    3.5
+1     setosa  Petal     1.4    0.2
+2  virginica  Sepal     5.9    3.0
+3  virginica  Petal     5.1    1.8
+
+

Split the column labels based on regex:

+
>>> df = pd.DataFrame({"id": [1], "new_sp_m5564": [2], "newrel_f65": [3]})
+>>> df
+   id  new_sp_m5564  newrel_f65
+0   1             2           3
+>>> df.pivot_longer(
+...     index = 'id',
+...     names_to = ('diagnosis', 'gender', 'age'),
+...     names_pattern = r"new_?(.+)_(.)(\d+)",
+... )
+   id diagnosis gender   age  value
+0   1        sp      m  5564      2
+1   1       rel      f    65      3
+
+

Split the column labels for the above dataframe using named groups in names_pattern:

+
>>> df.pivot_longer(
+...     index = 'id',
+...     names_pattern = r"new_?(?P<diagnosis>.+)_(?P<gender>.)(?P<age>\d+)",
+... )
+   id diagnosis gender   age  value
+0   1        sp      m  5564      2
+1   1       rel      f    65      3
+
+

Convert the dtypes of specific columns with names_transform:

+
>>> result = (df
+...          .pivot_longer(
+...              index = 'id',
+...              names_to = ('diagnosis', 'gender', 'age'),
+...              names_pattern = r"new_?(.+)_(.)(\d+)",
+...              names_transform = {'gender': 'category', 'age':'int'})
+... )
+>>> result.dtypes
+id           int64
+diagnosis   object
+gender    category
+age          int64
+value        int64
+dtype: object
+
+

Use multiple .value to reshape the dataframe:

+
>>> df = pd.DataFrame(
+...     [
+...         {
+...             "x_1_mean": 10,
+...             "x_2_mean": 20,
+...             "y_1_mean": 30,
+...             "y_2_mean": 40,
+...             "unit": 50,
+...         }
+...     ]
+... )
+>>> df
+   x_1_mean  x_2_mean  y_1_mean  y_2_mean  unit
+0        10        20        30        40    50
+>>> df.pivot_longer(
+...     index="unit",
+...     names_to=(".value", "time", ".value"),
+...     names_pattern=r"(x|y)_([0-9])(_mean)",
+... )
+   unit time  x_mean  y_mean
+0    50    1      10      30
+1    50    2      20      40
+
+

Replicate the above with named groups in names_pattern - use _ instead of .value:

+
>>> df.pivot_longer(
+...     index="unit",
+...     names_pattern=r"(?P<_>x|y)_(?P<time>[0-9])(?P<__>_mean)",
+... )
+   unit time  x_mean  y_mean
+0    50    1      10      30
+1    50    2      20      40
+
+

Convenient, flexible column selection in the column_names via +the select syntax:

+
>>> df.pivot_longer(
+...     column_names="*mean",
+...     names_to=(".value", "time", ".value"),
+...     names_pattern=r"(x|y)_([0-9])(_mean)",
+... )
+   unit time  x_mean  y_mean
+0    50    1      10      30
+1    50    2      20      40
+
+
>>> df.pivot_longer(
+...     column_names=slice("x_1_mean", "y_2_mean"),
+...     names_to=(".value", "time", ".value"),
+...     names_pattern=r"(x|y)_([0-9])(_mean)",
+... )
+   unit time  x_mean  y_mean
+0    50    1      10      30
+1    50    2      20      40
+
+

Reshape the dataframe by passing a sequence to names_pattern:

+
>>> df = pd.DataFrame({'hr1': [514, 573],
+...                    'hr2': [545, 526],
+...                    'team': ['Red Sox', 'Yankees'],
+...                    'year1': [2007, 2007],
+...                    'year2': [2008, 2008]})
+>>> df
+   hr1  hr2     team  year1  year2
+0  514  545  Red Sox   2007   2008
+1  573  526  Yankees   2007   2008
+>>> df.pivot_longer(
+...     index = 'team',
+...     names_to = ['year', 'hr'],
+...     names_pattern = ['year', 'hr']
+... )
+      team   hr  year
+0  Red Sox  514  2007
+1  Yankees  573  2007
+2  Red Sox  545  2008
+3  Yankees  526  2008
+
+

Reshape the above dataframe by passing a dictionary to names_pattern:

+
>>> df.pivot_longer(
+...     index = 'team',
+...     names_pattern = {"year":"year", "hr":"hr"}
+... )
+      team   hr  year
+0  Red Sox  514  2007
+1  Yankees  573  2007
+2  Red Sox  545  2008
+3  Yankees  526  2008
+
+

Multiple values_to:

+
>>> df = pd.DataFrame(
+...         {
+...             "City": ["Houston", "Austin", "Hoover"],
+...             "State": ["Texas", "Texas", "Alabama"],
+...             "Name": ["Aria", "Penelope", "Niko"],
+...             "Mango": [4, 10, 90],
+...             "Orange": [10, 8, 14],
+...             "Watermelon": [40, 99, 43],
+...             "Gin": [16, 200, 34],
+...             "Vodka": [20, 33, 18],
+...         },
+...     )
+>>> df
+      City    State      Name  Mango  Orange  Watermelon  Gin  Vodka
+0  Houston    Texas      Aria      4      10          40   16     20
+1   Austin    Texas  Penelope     10       8          99  200     33
+2   Hoover  Alabama      Niko     90      14          43   34     18
+>>> df.pivot_longer(
+...         index=["City", "State"],
+...         column_names=slice("Mango", "Vodka"),
+...         names_to=("Fruit", "Drink"),
+...         values_to=("Pounds", "Ounces"),
+...         names_pattern=["M|O|W", "G|V"],
+...     )
+      City    State       Fruit  Drink  Pounds  Ounces
+0  Houston    Texas       Mango    Gin       4    16.0
+1   Austin    Texas       Mango    Gin      10   200.0
+2   Hoover  Alabama       Mango    Gin      90    34.0
+3  Houston    Texas      Orange  Vodka      10    20.0
+4   Austin    Texas      Orange  Vodka       8    33.0
+5   Hoover  Alabama      Orange  Vodka      14    18.0
+6  Houston    Texas  Watermelon   None      40     NaN
+7   Austin    Texas  Watermelon   None      99     NaN
+8   Hoover  Alabama  Watermelon   None      43     NaN
+
+

Replicate the above transformation with a nested dictionary passed to names_pattern +- the outer keys in the names_pattern dictionary are passed to names_to, +while the inner keys are passed to values_to:

+
>>> df.pivot_longer(
+...     index=["City", "State"],
+...     column_names=slice("Mango", "Vodka"),
+...     names_pattern={
+...         "Fruit": {"Pounds": "M|O|W"},
+...         "Drink": {"Ounces": "G|V"},
+...     },
+... )
+      City    State       Fruit  Drink  Pounds  Ounces
+0  Houston    Texas       Mango    Gin       4    16.0
+1   Austin    Texas       Mango    Gin      10   200.0
+2   Hoover  Alabama       Mango    Gin      90    34.0
+3  Houston    Texas      Orange  Vodka      10    20.0
+4   Austin    Texas      Orange  Vodka       8    33.0
+5   Hoover  Alabama      Orange  Vodka      14    18.0
+6  Houston    Texas  Watermelon   None      40     NaN
+7   Austin    Texas  Watermelon   None      99     NaN
+8   Hoover  Alabama  Watermelon   None      43     NaN
+
+
+

Version Changed

+
    +
  • 0.24.0
      +
    • Added dropna parameter.
    • +
    +
  • +
  • 0.24.1
      +
    • names_pattern can accept a dictionary.
    • +
    • named groups supported in names_pattern.
    • +
    +
  • +
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ index + + list | tuple | str | Pattern + +
+

Name(s) of columns to use as identifier variables. +Should be either a single column name, or a list/tuple of +column names. +index should be a list of tuples if the columns are a MultiIndex. +Column selection is possible using the +select syntax.

+
+
+ None +
+ column_names + + list | tuple | str | Pattern + +
+

Name(s) of columns to unpivot. Should be either +a single column name or a list/tuple of column names. +column_names should be a list of tuples +if the columns are a MultiIndex. +Column selection is possible using the +select syntax.

+
+
+ None +
+ names_to + + list | tuple | str + +
+

Name of new column as a string that will contain +what were previously the column names in column_names. +The default is variable if no value is provided. It can +also be a list/tuple of strings that will serve as new column +names, if names_sep or names_pattern is provided. +If .value is in names_to, new column names will be extracted +from part of the existing column names and will override values_to.

+
+
+ None +
+ values_to + + str + +
+

Name of new column as a string that will contain what +were previously the values of the columns in column_names. +values_to can also be a list/tuple +and requires that names_pattern is also a list/tuple.

+
+
+ 'value' +
+ column_level + + int | str + +
+

If columns are a MultiIndex, then use this level to +unpivot the DataFrame. Provided for compatibility with pandas' melt, +and applies only if neither names_sep nor names_pattern is +provided.

+
+
+ None +
+ names_sep + + str | Pattern + +
+

Determines how the column name is broken up, if +names_to contains multiple values. It takes the same +specification as pandas' str.split method, and can be a string +or regular expression. names_sep does not work with MultiIndex +columns.

+
+
+ None +
+ names_pattern + + list | tuple | str | Pattern + +
+

Determines how the column name is broken up. +It can be a regular expression containing matching groups. +Under the hood it is processed with pandas' str.extract function. +If it is a single regex, the number of groups must match +the length of names_to. +Named groups are supported, if names_to is None. _ is used +instead of .value as a placeholder in named groups. +_ can be overloaded for multiple .value +calls - _, __, ___, ... +names_pattern can also be a list/tuple of regular expressions. +It can also be a list/tuple of strings; +the strings will be treated as regular expressions. +Under the hood it is processed with pandas' str.contains function. +For a list/tuple of regular expressions, +names_to must also be a list/tuple and the lengths of both +arguments must match. +names_pattern can also be a dictionary, where the keys are +the new column names, while the values can be a regular expression +or a string which will be evaluated as a regular expression. +Alternatively, a nested dictionary can be used, where the sub +key(s) are associated with values_to. Please have a look +at the examples for usage. +names_pattern does not work with MultiIndex columns.

+
+
+ None +
+ names_transform + + str | Callable | dict + +
+

Use this option to change the types of columns that +have been transformed to rows. This does not apply to the values' columns. +Accepts any argument that is acceptable by pd.astype.

+
+
+ None +
+ dropna + + bool + +
+

Determines whether or not to drop nulls +from the values columns. Default is False.

+
+
+ False +
+ sort_by_appearance + + bool + +
+

Boolean value that determines +the final look of the DataFrame. If True, the unpivoted DataFrame +will be stacked in order of first appearance.

+
+
+ False +
+ ignore_index + + bool + +
+

If True, +the original index is ignored. If False, the original index +is retained and the index labels will be repeated as necessary.

+
+
+ True +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame that has been unpivoted from wide to long +format.

+
+
+ +
+ Source code in janitor/functions/pivot.py +
 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
@pf.register_dataframe_method
+def pivot_longer(
+    df: pd.DataFrame,
+    index: list | tuple | str | Pattern = None,
+    column_names: list | tuple | str | Pattern = None,
+    names_to: list | tuple | str = None,
+    values_to: str = "value",
+    column_level: int | str = None,
+    names_sep: str | Pattern = None,
+    names_pattern: list | tuple | str | Pattern = None,
+    names_transform: str | Callable | dict = None,
+    dropna: bool = False,
+    sort_by_appearance: bool = False,
+    ignore_index: bool = True,
+) -> pd.DataFrame:
+    """Unpivots a DataFrame from *wide* to *long* format.
+
+    This method does not mutate the original DataFrame.
+
+    It is modeled after the `pivot_longer` function in R's tidyr package,
+    and also takes inspiration from R's data.table package.
+
+    This function is useful to massage a DataFrame into a format where
+    one or more columns are considered measured variables, and all other
+    columns are considered as identifier variables.
+
+    All measured variables are *unpivoted* (and typically duplicated) along the
+    row axis.
+
+    Column selection in `index` and `column_names` is possible using the
+    [`select`][janitor.functions.select.select] syntax.
+
+    For more granular control on the unpivoting, have a look at
+    [`pivot_longer_spec`][janitor.functions.pivot.pivot_longer_spec].
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "Sepal.Length": [5.1, 5.9],
+        ...         "Sepal.Width": [3.5, 3.0],
+        ...         "Petal.Length": [1.4, 5.1],
+        ...         "Petal.Width": [0.2, 1.8],
+        ...         "Species": ["setosa", "virginica"],
+        ...     }
+        ... )
+        >>> df
+           Sepal.Length  Sepal.Width  Petal.Length  Petal.Width    Species
+        0           5.1          3.5           1.4          0.2     setosa
+        1           5.9          3.0           5.1          1.8  virginica
+
+        Replicate pandas' melt:
+        >>> df.pivot_longer(index = 'Species')
+             Species      variable  value
+        0     setosa  Sepal.Length    5.1
+        1  virginica  Sepal.Length    5.9
+        2     setosa   Sepal.Width    3.5
+        3  virginica   Sepal.Width    3.0
+        4     setosa  Petal.Length    1.4
+        5  virginica  Petal.Length    5.1
+        6     setosa   Petal.Width    0.2
+        7  virginica   Petal.Width    1.8
+
+        Convenient, flexible column selection in the `index` via the
+        [`select`][janitor.functions.select.select] syntax:
+        >>> from pandas.api.types import is_string_dtype
+        >>> df.pivot_longer(index = is_string_dtype)
+             Species      variable  value
+        0     setosa  Sepal.Length    5.1
+        1  virginica  Sepal.Length    5.9
+        2     setosa   Sepal.Width    3.5
+        3  virginica   Sepal.Width    3.0
+        4     setosa  Petal.Length    1.4
+        5  virginica  Petal.Length    5.1
+        6     setosa   Petal.Width    0.2
+        7  virginica   Petal.Width    1.8
+
+        Split the column labels into individual columns:
+        >>> df.pivot_longer(
+        ...     index = 'Species',
+        ...     names_to = ('part', 'dimension'),
+        ...     names_sep = '.',
+        ...     sort_by_appearance = True,
+        ... )
+             Species   part dimension  value
+        0     setosa  Sepal    Length    5.1
+        1     setosa  Sepal     Width    3.5
+        2     setosa  Petal    Length    1.4
+        3     setosa  Petal     Width    0.2
+        4  virginica  Sepal    Length    5.9
+        5  virginica  Sepal     Width    3.0
+        6  virginica  Petal    Length    5.1
+        7  virginica  Petal     Width    1.8
+
+        Retain parts of the column names as headers:
+        >>> df.pivot_longer(
+        ...     index = 'Species',
+        ...     names_to = ('part', '.value'),
+        ...     names_sep = '.',
+        ...     sort_by_appearance = True,
+        ... )
+             Species   part  Length  Width
+        0     setosa  Sepal     5.1    3.5
+        1     setosa  Petal     1.4    0.2
+        2  virginica  Sepal     5.9    3.0
+        3  virginica  Petal     5.1    1.8
+
+        Split the column labels based on regex:
+        >>> df = pd.DataFrame({"id": [1], "new_sp_m5564": [2], "newrel_f65": [3]})
+        >>> df
+           id  new_sp_m5564  newrel_f65
+        0   1             2           3
+        >>> df.pivot_longer(
+        ...     index = 'id',
+        ...     names_to = ('diagnosis', 'gender', 'age'),
+        ...     names_pattern = r"new_?(.+)_(.)(\\d+)",
+        ... )
+           id diagnosis gender   age  value
+        0   1        sp      m  5564      2
+        1   1       rel      f    65      3
+
+        Split the column labels for the above dataframe using named groups in `names_pattern`:
+        >>> df.pivot_longer(
+        ...     index = 'id',
+        ...     names_pattern = r"new_?(?P<diagnosis>.+)_(?P<gender>.)(?P<age>\\d+)",
+        ... )
+           id diagnosis gender   age  value
+        0   1        sp      m  5564      2
+        1   1       rel      f    65      3
+
+        Convert the dtypes of specific columns with `names_transform`:
+        >>> result = (df
+        ...          .pivot_longer(
+        ...              index = 'id',
+        ...              names_to = ('diagnosis', 'gender', 'age'),
+        ...              names_pattern = r"new_?(.+)_(.)(\\d+)",
+        ...              names_transform = {'gender': 'category', 'age':'int'})
+        ... )
+        >>> result.dtypes
+        id           int64
+        diagnosis   object
+        gender    category
+        age          int64
+        value        int64
+        dtype: object
+
+        Use multiple `.value` to reshape the dataframe:
+        >>> df = pd.DataFrame(
+        ...     [
+        ...         {
+        ...             "x_1_mean": 10,
+        ...             "x_2_mean": 20,
+        ...             "y_1_mean": 30,
+        ...             "y_2_mean": 40,
+        ...             "unit": 50,
+        ...         }
+        ...     ]
+        ... )
+        >>> df
+           x_1_mean  x_2_mean  y_1_mean  y_2_mean  unit
+        0        10        20        30        40    50
+        >>> df.pivot_longer(
+        ...     index="unit",
+        ...     names_to=(".value", "time", ".value"),
+        ...     names_pattern=r"(x|y)_([0-9])(_mean)",
+        ... )
+           unit time  x_mean  y_mean
+        0    50    1      10      30
+        1    50    2      20      40
+
+        Replicate the above with named groups in `names_pattern` - use `_` instead of `.value`:
+        >>> df.pivot_longer(
+        ...     index="unit",
+        ...     names_pattern=r"(?P<_>x|y)_(?P<time>[0-9])(?P<__>_mean)",
+        ... )
+           unit time  x_mean  y_mean
+        0    50    1      10      30
+        1    50    2      20      40
+
+        Convenient, flexible column selection in the `column_names` via
+        the [`select`][janitor.functions.select.select] syntax:
+        >>> df.pivot_longer(
+        ...     column_names="*mean",
+        ...     names_to=(".value", "time", ".value"),
+        ...     names_pattern=r"(x|y)_([0-9])(_mean)",
+        ... )
+           unit time  x_mean  y_mean
+        0    50    1      10      30
+        1    50    2      20      40
+
+        >>> df.pivot_longer(
+        ...     column_names=slice("x_1_mean", "y_2_mean"),
+        ...     names_to=(".value", "time", ".value"),
+        ...     names_pattern=r"(x|y)_([0-9])(_mean)",
+        ... )
+           unit time  x_mean  y_mean
+        0    50    1      10      30
+        1    50    2      20      40
+
+        Reshape the dataframe by passing a sequence to `names_pattern`:
+        >>> df = pd.DataFrame({'hr1': [514, 573],
+        ...                    'hr2': [545, 526],
+        ...                    'team': ['Red Sox', 'Yankees'],
+        ...                    'year1': [2007, 2007],
+        ...                    'year2': [2008, 2008]})
+        >>> df
+           hr1  hr2     team  year1  year2
+        0  514  545  Red Sox   2007   2008
+        1  573  526  Yankees   2007   2008
+        >>> df.pivot_longer(
+        ...     index = 'team',
+        ...     names_to = ['year', 'hr'],
+        ...     names_pattern = ['year', 'hr']
+        ... )
+              team   hr  year
+        0  Red Sox  514  2007
+        1  Yankees  573  2007
+        2  Red Sox  545  2008
+        3  Yankees  526  2008
+
+
+        Reshape the above dataframe by passing a dictionary to `names_pattern`:
+        >>> df.pivot_longer(
+        ...     index = 'team',
+        ...     names_pattern = {"year":"year", "hr":"hr"}
+        ... )
+              team   hr  year
+        0  Red Sox  514  2007
+        1  Yankees  573  2007
+        2  Red Sox  545  2008
+        3  Yankees  526  2008
+
+        Multiple values_to:
+        >>> df = pd.DataFrame(
+        ...         {
+        ...             "City": ["Houston", "Austin", "Hoover"],
+        ...             "State": ["Texas", "Texas", "Alabama"],
+        ...             "Name": ["Aria", "Penelope", "Niko"],
+        ...             "Mango": [4, 10, 90],
+        ...             "Orange": [10, 8, 14],
+        ...             "Watermelon": [40, 99, 43],
+        ...             "Gin": [16, 200, 34],
+        ...             "Vodka": [20, 33, 18],
+        ...         },
+        ...     )
+        >>> df
+              City    State      Name  Mango  Orange  Watermelon  Gin  Vodka
+        0  Houston    Texas      Aria      4      10          40   16     20
+        1   Austin    Texas  Penelope     10       8          99  200     33
+        2   Hoover  Alabama      Niko     90      14          43   34     18
+        >>> df.pivot_longer(
+        ...         index=["City", "State"],
+        ...         column_names=slice("Mango", "Vodka"),
+        ...         names_to=("Fruit", "Drink"),
+        ...         values_to=("Pounds", "Ounces"),
+        ...         names_pattern=["M|O|W", "G|V"],
+        ...     )
+              City    State       Fruit  Drink  Pounds  Ounces
+        0  Houston    Texas       Mango    Gin       4    16.0
+        1   Austin    Texas       Mango    Gin      10   200.0
+        2   Hoover  Alabama       Mango    Gin      90    34.0
+        3  Houston    Texas      Orange  Vodka      10    20.0
+        4   Austin    Texas      Orange  Vodka       8    33.0
+        5   Hoover  Alabama      Orange  Vodka      14    18.0
+        6  Houston    Texas  Watermelon   None      40     NaN
+        7   Austin    Texas  Watermelon   None      99     NaN
+        8   Hoover  Alabama  Watermelon   None      43     NaN
+
+        Replicate the above transformation with a nested dictionary passed to `names_pattern`
+        - the outer keys in the `names_pattern` dictionary are passed to `names_to`,
+        while the inner keys are passed to `values_to`:
+        >>> df.pivot_longer(
+        ...     index=["City", "State"],
+        ...     column_names=slice("Mango", "Vodka"),
+        ...     names_pattern={
+        ...         "Fruit": {"Pounds": "M|O|W"},
+        ...         "Drink": {"Ounces": "G|V"},
+        ...     },
+        ... )
+              City    State       Fruit  Drink  Pounds  Ounces
+        0  Houston    Texas       Mango    Gin       4    16.0
+        1   Austin    Texas       Mango    Gin      10   200.0
+        2   Hoover  Alabama       Mango    Gin      90    34.0
+        3  Houston    Texas      Orange  Vodka      10    20.0
+        4   Austin    Texas      Orange  Vodka       8    33.0
+        5   Hoover  Alabama      Orange  Vodka      14    18.0
+        6  Houston    Texas  Watermelon   None      40     NaN
+        7   Austin    Texas  Watermelon   None      99     NaN
+        8   Hoover  Alabama  Watermelon   None      43     NaN
+
+    !!! abstract "Version Changed"
+
+        - 0.24.0
+            - Added `dropna` parameter.
+        - 0.24.1
+            - `names_pattern` can accept a dictionary.
+            - named groups supported in `names_pattern`.
+
+    Args:
+        df: A pandas DataFrame.
+        index: Name(s) of columns to use as identifier variables.
+            Should be either a single column name, or a list/tuple of
+            column names.
+            `index` should be a list of tuples if the columns are a MultiIndex.
+            Column selection is possible using the
+            [`select`][janitor.functions.select.select] syntax.
+        column_names: Name(s) of columns to unpivot. Should be either
+            a single column name or a list/tuple of column names.
+            `column_names` should be a list of tuples
+            if the columns are a MultiIndex.
+            Column selection is possible using the
+            [`select`][janitor.functions.select.select] syntax.
+        names_to: Name of new column as a string that will contain
+            what were previously the column names in `column_names`.
+            The default is `variable` if no value is provided. It can
+            also be a list/tuple of strings that will serve as new column
+            names, if `names_sep` or `names_pattern` is provided.
+            If `.value` is in `names_to`, new column names will be extracted
+            from part of the existing column names and overrides `values_to`.
+        values_to: Name of new column as a string that will contain what
+            were previously the values of the columns in `column_names`.
+            values_to can also be a list/tuple
+            and requires that names_pattern is also a list/tuple.
+        column_level: If columns are a MultiIndex, then use this level to
+            unpivot the DataFrame. Provided for compatibility with pandas' melt,
+            and applies only if neither `names_sep` nor `names_pattern` is
+            provided.
+        names_sep: Determines how the column name is broken up, if
+            `names_to` contains multiple values. It takes the same
+            specification as pandas' `str.split` method, and can be a string
+            or regular expression. `names_sep` does not work with MultiIndex
+            columns.
+        names_pattern: Determines how the column name is broken up.
+            It can be a regular expression containing matching groups.
+            Under the hood it is processed with pandas' `str.extract` function.
+            If it is a single regex, the number of groups must match
+            the length of `names_to`.
+            Named groups are supported, if `names_to` is none. `_` is used
+            instead of `.value` as a placeholder in named groups.
+            `_` can be overloaded for multiple `.value`
+            calls - `_`, `__`, `___`, ...
+            `names_pattern` can also be a list/tuple of regular expressions
+            It can also be a list/tuple of strings;
+            the strings will be treated as regular expressions.
+            Under the hood it is processed with pandas' `str.contains` function.
+            For a list/tuple of regular expressions,
+            `names_to` must also be a list/tuple and the lengths of both
+            arguments must match.
+            `names_pattern` can also be a dictionary, where the keys are
+            the new column names, while the values can be a regular expression
+            or a string which will be evaluated as a regular expression.
+            Alternatively, a nested dictionary can be used, where the sub
+            key(s) are associated with `values_to`. Please have a look
+            at the examples for usage.
+            `names_pattern` does not work with MultiIndex columns.
+        names_transform: Use this option to change the types of columns that
+            have been transformed to rows. This does not apply to the values' columns.
+            Accepts any argument that is acceptable by `pd.astype`.
+        dropna: Determines whether or not to drop nulls
+            from the values columns. Default is `False`.
+        sort_by_appearance: Boolean value that determines
+            the final look of the DataFrame. If `True`, the unpivoted DataFrame
+            will be stacked in order of first appearance.
+        ignore_index: If `True`,
+            the original index is ignored. If `False`, the original index
+            is retained and the index labels will be repeated as necessary.
+
+    Returns:
+        A pandas DataFrame that has been unpivoted from wide to long
+            format.
+    """  # noqa: E501
+
+    # this code builds on the wonderful work of @benjaminjack’s PR
+    # https://github.com/benjaminjack/pyjanitor/commit/e3df817903c20dd21634461c8a92aec137963ed0
+
+    return _computations_pivot_longer(
+        df=df,
+        index=index,
+        column_names=column_names,
+        column_level=column_level,
+        names_to=names_to,
+        values_to=values_to,
+        names_sep=names_sep,
+        names_pattern=names_pattern,
+        names_transform=names_transform,
+        dropna=dropna,
+        sort_by_appearance=sort_by_appearance,
+        ignore_index=ignore_index,
+    )
+
+
+
+ +
+ +
+ + +

+ pivot_longer_spec(df, spec, sort_by_appearance=False, ignore_index=True, dropna=False, df_columns_is_unique=True) + +

+ + +
+ +

A declarative interface to pivot a DataFrame from wide to long form, +where you describe how the data will be unpivoted, +using a DataFrame.

+

This gives you, the user, +more control over unpivoting, where you create a “spec” +data frame that describes exactly how data stored +in the column names becomes variables.

+

It can come in handy for situations where +pivot_longer +seems inadequate for the transformation.

+
+

New in version 0.28.0

+
+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame(
+...     {
+...         "Sepal.Length": [5.1, 5.9],
+...         "Sepal.Width": [3.5, 3.0],
+...         "Petal.Length": [1.4, 5.1],
+...         "Petal.Width": [0.2, 1.8],
+...         "Species": ["setosa", "virginica"],
+...     }
+... )
+>>> df
+   Sepal.Length  Sepal.Width  Petal.Length  Petal.Width    Species
+0           5.1          3.5           1.4          0.2     setosa
+1           5.9          3.0           5.1          1.8  virginica
+>>> spec = {'.name':['Sepal.Length','Petal.Length',
+...                  'Sepal.Width','Petal.Width'],
+...         '.value':['Length','Length','Width','Width'],
+...         'part':['Sepal','Petal','Sepal','Petal']}
+>>> spec = pd.DataFrame(spec)
+>>> spec
+          .name  .value   part
+0  Sepal.Length  Length  Sepal
+1  Petal.Length  Length  Petal
+2   Sepal.Width   Width  Sepal
+3   Petal.Width   Width  Petal
+>>> pivot_longer_spec(df=df,spec=spec)
+     Species   part  Length  Width
+0     setosa  Sepal     5.1    3.5
+1  virginica  Sepal     5.9    3.0
+2     setosa  Petal     1.4    0.2
+3  virginica  Petal     5.1    1.8
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

The source DataFrame to unpivot.

+
+
+ required +
+ spec + + DataFrame + +
+

A specification DataFrame. +At a minimum, the spec DataFrame +must have '.name' and '.value' columns. +The '.name' column should contain the +columns in the source DataFrame that will be +transformed to long form. +The '.value' column gives the name of the column(s) +that the values in the source DataFrame will go into. +Additional columns in spec should be named to match columns +in the long format of the dataset and contain values +corresponding to columns pivoted from the wide format. +Note that these additional columns should not already exist +in the source DataFrame.

+
+
+ required +
+ sort_by_appearance + + bool + +
+

Boolean value that determines +the final look of the DataFrame. If True, the unpivoted DataFrame +will be stacked in order of first appearance.

+
+
+ False +
+ ignore_index + + bool + +
+

If True, +the original index is ignored. If False, the original index +is retained and the index labels will be repeated as necessary.

+
+
+ True +
+ dropna + + bool + +
+

Determines whether or not to drop nulls +from the values columns. Default is False.

+
+
+ False +
+ df_columns_is_unique + + bool + +
+

Boolean value to indicate if the source +DataFrame's columns are unique. Default is True.

+
+
+ True +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ KeyError + +
+

If '.name' or '.value' is missing from the spec's columns.

+
+
+ ValueError + +
+

If the spec's columns are not unique, +or the labels in spec['.name'] are not unique.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/pivot.py +
417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
+498
+499
+500
+501
+502
+503
+504
+505
+506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517
+518
+519
+520
+521
+522
+523
+524
+525
+526
+527
+528
+529
+530
+531
+532
+533
+534
+535
+536
+537
+538
+539
+540
+541
+542
+543
+544
+545
+546
+547
+548
+549
+550
+551
+552
+553
+554
+555
+556
+557
+558
+559
def pivot_longer_spec(
+    df: pd.DataFrame,
+    spec: pd.DataFrame,
+    sort_by_appearance: bool = False,
+    ignore_index: bool = True,
+    dropna: bool = False,
+    df_columns_is_unique: bool = True,
+) -> pd.DataFrame:
+    """A declarative interface to pivot a DataFrame from wide to long form,
+    where you describe how the data will be unpivoted,
+    using a DataFrame.
+
+    This gives you, the user,
+    more control over unpivoting, where you create a “spec”
+    data frame that describes exactly how data stored
+    in the column names becomes variables.
+
+    It can come in handy for situations where
+    [`pivot_longer`][janitor.functions.pivot.pivot_longer]
+    seems inadequate for the transformation.
+
+    !!! info "New in version 0.28.0"
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "Sepal.Length": [5.1, 5.9],
+        ...         "Sepal.Width": [3.5, 3.0],
+        ...         "Petal.Length": [1.4, 5.1],
+        ...         "Petal.Width": [0.2, 1.8],
+        ...         "Species": ["setosa", "virginica"],
+        ...     }
+        ... )
+        >>> df
+           Sepal.Length  Sepal.Width  Petal.Length  Petal.Width    Species
+        0           5.1          3.5           1.4          0.2     setosa
+        1           5.9          3.0           5.1          1.8  virginica
+        >>> spec = {'.name':['Sepal.Length','Petal.Length',
+        ...                  'Sepal.Width','Petal.Width'],
+        ...         '.value':['Length','Length','Width','Width'],
+        ...         'part':['Sepal','Petal','Sepal','Petal']}
+        >>> spec = pd.DataFrame(spec)
+        >>> spec
+                  .name  .value   part
+        0  Sepal.Length  Length  Sepal
+        1  Petal.Length  Length  Petal
+        2   Sepal.Width   Width  Sepal
+        3   Petal.Width   Width  Petal
+        >>> pivot_longer_spec(df=df,spec=spec)
+             Species   part  Length  Width
+        0     setosa  Sepal     5.1    3.5
+        1  virginica  Sepal     5.9    3.0
+        2     setosa  Petal     1.4    0.2
+        3  virginica  Petal     5.1    1.8
+
+    Args:
+        df: The source DataFrame to unpivot.
+        spec: A specification DataFrame.
+            At a minimum, the spec DataFrame
+            must have a '.name' and a '.value' columns.
+            The '.name' column  should contain the
+            columns in the source DataFrame that will be
+            transformed to long form.
+            The '.value' column gives the name of the column(s)
+            that the values in the source DataFrame will go into.
+            Additional columns in spec should be named to match columns
+            in the long format of the dataset and contain values
+            corresponding to columns pivoted from the wide format.
+            Note that these additional columns should not already exist
+            in the source DataFrame.
+        sort_by_appearance: Boolean value that determines
+            the final look of the DataFrame. If `True`, the unpivoted DataFrame
+            will be stacked in order of first appearance.
+        ignore_index: If `True`,
+            the original index is ignored. If `False`, the original index
+            is retained and the index labels will be repeated as necessary.
+        dropna: Determines whether or not to drop nulls
+            from the values columns. Default is `False`.
+        df_columns_is_unique: Boolean value to indicate if the source
+            DataFrame's columns is unique. Default is `True`.
+
+    Raises:
+        KeyError: If '.name' or '.value' is missing from the spec's columns.
+        ValueError: If the spec's columns is not unique,
+            or the labels in spec['.name'] is not unique.
+
+    Returns:
+        A pandas DataFrame.
+    """
+    check("spec", spec, [pd.DataFrame])
+    if not spec.columns.is_unique:
+        raise ValueError("Kindly ensure the spec's columns is unique.")
+    if ".name" not in spec.columns:
+        raise KeyError(
+            "Kindly ensure the spec DataFrame has a `.name` column."
+        )
+    if ".value" not in spec.columns:
+        raise KeyError(
+            "Kindly ensure the spec DataFrame has a `.value` column."
+        )
+    if spec.columns.tolist()[:2] != [".name", ".value"]:
+        raise ValueError(
+            "The first two columns of the spec DataFrame "
+            "should be '.name' and '.value', "
+            "with '.name' coming before '.value'."
+        )
+    if not spec[".name"].is_unique:
+        raise ValueError("The labels in the `.name` column should be unique.")
+
+    exclude = df.columns.intersection(spec.columns)
+    if not exclude.empty:
+        raise ValueError(
+            f"Labels {*exclude, } in the spec DataFrame already exist "
+            "as column labels in the source DataFrame. "
+            "Kindly ensure the spec DataFrame's columns "
+            "are not present in the source DataFrame."
+        )
+
+    check("dropna", dropna, [bool])
+    check("sort_by_appearance", sort_by_appearance, [bool])
+    check("ignore_index", ignore_index, [bool])
+    check("df_columns_is_unique", df_columns_is_unique, [bool])
+
+    index = df.columns.difference(spec[".name"], sort=False)
+    index = {name: df[name]._values for name in index}
+
+    df = df.loc[:, spec[".name"]]
+    if not df_columns_is_unique:
+        spec = pd.DataFrame({".name": df.columns}).merge(
+            spec, on=".name", how="inner"
+        )
+    others = [label for label in spec if label not in {".name", ".value"}]
+    return _pivot_longer_dot_value(
+        df=df,
+        spec=spec.drop(columns=".name"),
+        index=index,
+        others=others,
+        sort_by_appearance=sort_by_appearance,
+        ignore_index=ignore_index,
+        dropna=dropna,
+    )
+
+
+
+ +
+ +
+ + +

+ pivot_wider(df, index=None, names_from=None, values_from=None, flatten_levels=True, names_sep='_', names_glue=None, reset_index=True, names_expand=False, index_expand=False) + +

+ + +
+ +

Reshapes data from long to wide form.

+
+

Note

+

This function will be deprecated in a 1.x release. +Please use pd.DataFrame.pivot instead.

+
+

The number of columns is increased, while decreasing +the number of rows. It is the inverse of the +pivot_longer +method, and is a wrapper around the pd.DataFrame.pivot method.

+

This method does not mutate the original DataFrame.

+

Column selection in index, names_from and values_from +is possible using the +select syntax.

+

A ValueError is raised if the combination +of the index and names_from is not unique.

+

By default, values from values_from are always +at the top level if the columns are not flattened. +If flattened, the values from values_from are usually +at the start of each label in the columns.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = [{'dep': 5.5, 'step': 1, 'a': 20, 'b': 30},
+...       {'dep': 5.5, 'step': 2, 'a': 25, 'b': 37},
+...       {'dep': 6.1, 'step': 1, 'a': 22, 'b': 19},
+...       {'dep': 6.1, 'step': 2, 'a': 18, 'b': 29}]
+>>> df = pd.DataFrame(df)
+>>> df
+   dep  step   a   b
+0  5.5     1  20  30
+1  5.5     2  25  37
+2  6.1     1  22  19
+3  6.1     2  18  29
+
+

Pivot and flatten columns:

+
>>> df.pivot_wider(
+...     index = "dep",
+...     names_from = "step",
+... )
+   dep  a_1  a_2  b_1  b_2
+0  5.5   20   25   30   37
+1  6.1   22   18   19   29
+
+

Modify columns with names_sep:

+
>>> df.pivot_wider(
+...     index = "dep",
+...     names_from = "step",
+...     names_sep = "",
+... )
+   dep  a1  a2  b1  b2
+0  5.5  20  25  30  37
+1  6.1  22  18  19  29
+
+

Modify columns with names_glue:

+
>>> df.pivot_wider(
+...     index = "dep",
+...     names_from = "step",
+...     names_glue = "{_value}_step{step}",
+... )
+   dep  a_step1  a_step2  b_step1  b_step2
+0  5.5       20       25       30       37
+1  6.1       22       18       19       29
+
+

Expand columns to expose implicit missing values +- this applies only to categorical columns:

+
>>> weekdays = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
+>>> daily = pd.DataFrame(
+...     {
+...         "day": pd.Categorical(
+...             values=("Tue", "Thu", "Fri", "Mon"), categories=weekdays
+...         ),
+...         "value": (2, 3, 1, 5),
+...     },
+... index=[0, 0, 0, 0],
+... )
+>>> daily
+   day  value
+0  Tue      2
+0  Thu      3
+0  Fri      1
+0  Mon      5
+>>> daily.pivot_wider(names_from='day', values_from='value')
+   Tue  Thu  Fri  Mon
+0    2    3    1    5
+>>> (daily
+... .pivot_wider(
+...     names_from='day',
+...     values_from='value',
+...     names_expand=True)
+... )
+   Mon  Tue  Wed  Thu  Fri  Sat  Sun
+0    5    2  NaN    3    1  NaN  NaN
+
+

Expand the index to expose implicit missing values +- this applies only to categorical columns:

+
>>> daily = daily.assign(letter = list('ABBA'))
+>>> daily
+   day  value letter
+0  Tue      2      A
+0  Thu      3      B
+0  Fri      1      B
+0  Mon      5      A
+>>> daily.pivot_wider(index='day',names_from='letter',values_from='value')
+   day    A    B
+0  Tue  2.0  NaN
+1  Thu  NaN  3.0
+2  Fri  NaN  1.0
+3  Mon  5.0  NaN
+>>> (daily
+... .pivot_wider(
+...     index='day',
+...     names_from='letter',
+...     values_from='value',
+...     index_expand=True)
+... )
+   day    A    B
+0  Mon  5.0  NaN
+1  Tue  2.0  NaN
+2  Wed  NaN  NaN
+3  Thu  NaN  3.0
+4  Fri  NaN  1.0
+5  Sat  NaN  NaN
+6  Sun  NaN  NaN
+
+
+

Version Changed

+
    +
  • 0.24.0
      +
    • Added reset_index, names_expand and index_expand parameters.
    • +
    +
  • +
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ index + + list | str + +
+

Name(s) of columns to use as identifier variables. +It should be either a single column name, or a list of column names. +If index is not provided, the DataFrame's index is used.

+
+
+ None +
+ names_from + + list | str + +
+

Name(s) of column(s) to use to make the new +DataFrame's columns. Should be either a single column name, +or a list of column names.

+
+
+ None +
+ values_from + + list | str + +
+

Name(s) of column(s) that will be used for populating +the new DataFrame's values. +If values_from is not specified, all remaining columns +will be used.

+
+
+ None +
+ flatten_levels + + bool + +
+

If False, the DataFrame stays as a MultiIndex.

+
+
+ True +
+ names_sep + + str + +
+

If names_from or values_from contain multiple +variables, this will be used to join the values into a single string +to use as a column name. Default is _. +Applicable only if flatten_levels is True.

+
+
+ '_' +
+ names_glue + + str + +
+

A string to control the output of the flattened columns. +It offers more flexibility in creating custom column names, +and uses python's str.format_map under the hood. +Simply create the string template, +using the column labels in names_from, +and special _value as a placeholder for values_from. +Applicable only if flatten_levels is True.

+
+
+ None +
+ reset_index + + bool + +
+

Determines whether to restore index +as a column/columns. Applicable only if index is provided, +and flatten_levels is True.

+
+
+ True +
+ names_expand + + bool + +
+

Expand columns to show all the categories. +Applies only if names_from is a categorical column.

+
+
+ False +
+ index_expand + + bool + +
+

Expand the index to show all the categories. +Applies only if index is a categorical column.

+
+
+ False +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame that has been pivoted from long to wide form.

+
+
+ +
+ Source code in janitor/functions/pivot.py +
1865
+1866
+1867
+1868
+1869
+1870
+1871
+1872
+1873
+1874
+1875
+1876
+1877
+1878
+1879
+1880
+1881
+1882
+1883
+1884
+1885
+1886
+1887
+1888
+1889
+1890
+1891
+1892
+1893
+1894
+1895
+1896
+1897
+1898
+1899
+1900
+1901
+1902
+1903
+1904
+1905
+1906
+1907
+1908
+1909
+1910
+1911
+1912
+1913
+1914
+1915
+1916
+1917
+1918
+1919
+1920
+1921
+1922
+1923
+1924
+1925
+1926
+1927
+1928
+1929
+1930
+1931
+1932
+1933
+1934
+1935
+1936
+1937
+1938
+1939
+1940
+1941
+1942
+1943
+1944
+1945
+1946
+1947
+1948
+1949
+1950
+1951
+1952
+1953
+1954
+1955
+1956
+1957
+1958
+1959
+1960
+1961
+1962
+1963
+1964
+1965
+1966
+1967
+1968
+1969
+1970
+1971
+1972
+1973
+1974
+1975
+1976
+1977
+1978
+1979
+1980
+1981
+1982
+1983
+1984
+1985
+1986
+1987
+1988
+1989
+1990
+1991
+1992
+1993
+1994
+1995
+1996
+1997
+1998
+1999
+2000
+2001
+2002
+2003
+2004
+2005
+2006
+2007
+2008
+2009
+2010
+2011
+2012
+2013
+2014
+2015
+2016
+2017
+2018
+2019
+2020
+2021
+2022
+2023
+2024
+2025
+2026
+2027
+2028
+2029
+2030
+2031
+2032
+2033
+2034
+2035
+2036
+2037
+2038
+2039
+2040
+2041
+2042
+2043
+2044
+2045
+2046
+2047
+2048
+2049
+2050
+2051
+2052
+2053
+2054
+2055
+2056
+2057
+2058
+2059
+2060
+2061
+2062
+2063
+2064
+2065
+2066
+2067
+2068
+2069
+2070
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `pd.DataFrame.pivot` instead."
+    )
+)
+def pivot_wider(
+    df: pd.DataFrame,
+    index: list | str = None,
+    names_from: list | str = None,
+    values_from: list | str = None,
+    flatten_levels: bool = True,
+    names_sep: str = "_",
+    names_glue: str = None,
+    reset_index: bool = True,
+    names_expand: bool = False,
+    index_expand: bool = False,
+) -> pd.DataFrame:
+    """Reshapes data from *long* to *wide* form.
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `pd.DataFrame.pivot` instead.
+
+    The number of columns are increased, while decreasing
+    the number of rows. It is the inverse of the
+    [`pivot_longer`][janitor.functions.pivot.pivot_longer]
+    method, and is a wrapper around `pd.DataFrame.pivot` method.
+
+    This method does not mutate the original DataFrame.
+
+    Column selection in `index`, `names_from` and `values_from`
+    is possible using the
+    [`select`][janitor.functions.select.select] syntax.
+
+    A ValueError is raised if the combination
+    of the `index` and `names_from` is not unique.
+
+    By default, values from `values_from` are always
+    at the top level if the columns are not flattened.
+    If flattened, the values from `values_from` are usually
+    at the start of each label in the columns.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = [{'dep': 5.5, 'step': 1, 'a': 20, 'b': 30},
+        ...       {'dep': 5.5, 'step': 2, 'a': 25, 'b': 37},
+        ...       {'dep': 6.1, 'step': 1, 'a': 22, 'b': 19},
+        ...       {'dep': 6.1, 'step': 2, 'a': 18, 'b': 29}]
+        >>> df = pd.DataFrame(df)
+        >>> df
+           dep  step   a   b
+        0  5.5     1  20  30
+        1  5.5     2  25  37
+        2  6.1     1  22  19
+        3  6.1     2  18  29
+
+        Pivot and flatten columns:
+        >>> df.pivot_wider( # doctest: +SKIP
+        ...     index = "dep",
+        ...     names_from = "step",
+        ... )
+           dep  a_1  a_2  b_1  b_2
+        0  5.5   20   25   30   37
+        1  6.1   22   18   19   29
+
+        Modify columns with `names_sep`:
+        >>> df.pivot_wider( # doctest: +SKIP
+        ...     index = "dep",
+        ...     names_from = "step",
+        ...     names_sep = "",
+        ... )
+           dep  a1  a2  b1  b2
+        0  5.5  20  25  30  37
+        1  6.1  22  18  19  29
+
+        Modify columns with `names_glue`:
+        >>> df.pivot_wider( # doctest: +SKIP
+        ...     index = "dep",
+        ...     names_from = "step",
+        ...     names_glue = "{_value}_step{step}",
+        ... )
+           dep  a_step1  a_step2  b_step1  b_step2
+        0  5.5       20       25       30       37
+        1  6.1       22       18       19       29
+
+        Expand columns to expose implicit missing values
+        - this applies only to categorical columns:
+        >>> weekdays = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
+        >>> daily = pd.DataFrame(
+        ...     {
+        ...         "day": pd.Categorical(
+        ...             values=("Tue", "Thu", "Fri", "Mon"), categories=weekdays
+        ...         ),
+        ...         "value": (2, 3, 1, 5),
+        ...     },
+        ... index=[0, 0, 0, 0],
+        ... )
+        >>> daily
+           day  value
+        0  Tue      2
+        0  Thu      3
+        0  Fri      1
+        0  Mon      5
+        >>> daily.pivot_wider(names_from='day', values_from='value') # doctest: +SKIP
+           Tue  Thu  Fri  Mon
+        0    2    3    1    5
+        >>> (daily # doctest: +SKIP
+        ... .pivot_wider(
+        ...     names_from='day',
+        ...     values_from='value',
+        ...     names_expand=True)
+        ... )
+           Mon  Tue  Wed  Thu  Fri  Sat  Sun
+        0    5    2  NaN    3    1  NaN  NaN
+
+        Expand the index to expose implicit missing values
+        - this applies only to categorical columns:
+        >>> daily = daily.assign(letter = list('ABBA'))
+        >>> daily
+           day  value letter
+        0  Tue      2      A
+        0  Thu      3      B
+        0  Fri      1      B
+        0  Mon      5      A
+        >>> daily.pivot_wider(index='day',names_from='letter',values_from='value') # doctest: +SKIP
+           day    A    B
+        0  Tue  2.0  NaN
+        1  Thu  NaN  3.0
+        2  Fri  NaN  1.0
+        3  Mon  5.0  NaN
+        >>> (daily # doctest: +SKIP
+        ... .pivot_wider(
+        ...     index='day',
+        ...     names_from='letter',
+        ...     values_from='value',
+        ...     index_expand=True)
+        ... )
+           day    A    B
+        0  Mon  5.0  NaN
+        1  Tue  2.0  NaN
+        2  Wed  NaN  NaN
+        3  Thu  NaN  3.0
+        4  Fri  NaN  1.0
+        5  Sat  NaN  NaN
+        6  Sun  NaN  NaN
+
+
+    !!! abstract "Version Changed"
+
+        - 0.24.0
+            - Added `reset_index`, `names_expand` and `index_expand` parameters.
+
+    Args:
+        df: A pandas DataFrame.
+        index: Name(s) of columns to use as identifier variables.
+            It should be either a single column name, or a list of column names.
+            If `index` is not provided, the DataFrame's index is used.
+        names_from: Name(s) of column(s) to use to make the new
+            DataFrame's columns. Should be either a single column name,
+            or a list of column names.
+        values_from: Name(s) of column(s) that will be used for populating
+            the new DataFrame's values.
+            If `values_from` is not specified,  all remaining columns
+            will be used.
+        flatten_levels: If `False`, the DataFrame stays as a MultiIndex.
+        names_sep: If `names_from` or `values_from` contain multiple
+            variables, this will be used to join the values into a single string
+            to use as a column name. Default is `_`.
+            Applicable only if `flatten_levels` is `True`.
+        names_glue: A string to control the output of the flattened columns.
+            It offers more flexibility in creating custom column names,
+            and uses python's `str.format_map` under the hood.
+            Simply create the string template,
+            using the column labels in `names_from`,
+            and special `_value` as a placeholder for `values_from`.
+            Applicable only if `flatten_levels` is `True`.
+        reset_index: Determines whether to restore `index`
+            as a column/columns. Applicable only if `index` is provided,
+            and `flatten_levels` is `True`.
+        names_expand: Expand columns to show all the categories.
+            Applies only if `names_from` is a categorical column.
+        index_expand: Expand the index to show all the categories.
+            Applies only if `index` is a categorical column.
+
+    Returns:
+        A pandas DataFrame that has been unpivoted from long to wide form.
+    """  # noqa: E501
+
+    # no need for an explicit copy --> df = df.copy()
+    # `pd.pivot` creates one
+    return _computations_pivot_wider(
+        df,
+        index,
+        names_from,
+        values_from,
+        flatten_levels,
+        names_sep,
+        names_glue,
+        reset_index,
+        names_expand,
+        index_expand,
+    )
+
+
+
+ +
+ +
+ + +

+ pivot_wider_spec(df, spec, index=None, reset_index=True) + +

+ + +
+ +

A declarative interface to pivot a DataFrame from long to wide form, +where you describe how the data will be pivoted, +using a DataFrame.

+

This gives you, the user, +more control over pivoting, where you create a “spec” +data frame that describes exactly how data stored +in the column names becomes variables.

+

It can come in handy for situations where +pd.DataFrame.pivot +seems inadequate for the transformation.

+
+

New in version 0.31.0

+
+ + +

Examples:

+
>>> import pandas as pd
+>>> from janitor import pivot_wider_spec
+>>> df = pd.DataFrame(
+... [
+...    {"famid": 1, "birth": 1, "age": 1, "ht": 2.8},
+...    {"famid": 1, "birth": 1, "age": 2, "ht": 3.4},
+...    {"famid": 1, "birth": 2, "age": 1, "ht": 2.9},
+...    {"famid": 1, "birth": 2, "age": 2, "ht": 3.8},
+...    {"famid": 1, "birth": 3, "age": 1, "ht": 2.2},
+...    {"famid": 1, "birth": 3, "age": 2, "ht": 2.9},
+...    {"famid": 2, "birth": 1, "age": 1, "ht": 2.0},
+...    {"famid": 2, "birth": 1, "age": 2, "ht": 3.2},
+...    {"famid": 2, "birth": 2, "age": 1, "ht": 1.8},
+...    {"famid": 2, "birth": 2, "age": 2, "ht": 2.8},
+...    {"famid": 2, "birth": 3, "age": 1, "ht": 1.9},
+...    {"famid": 2, "birth": 3, "age": 2, "ht": 2.4},
+...    {"famid": 3, "birth": 1, "age": 1, "ht": 2.2},
+...    {"famid": 3, "birth": 1, "age": 2, "ht": 3.3},
+...    {"famid": 3, "birth": 2, "age": 1, "ht": 2.3},
+...    {"famid": 3, "birth": 2, "age": 2, "ht": 3.4},
+...    {"famid": 3, "birth": 3, "age": 1, "ht": 2.1},
+...    {"famid": 3, "birth": 3, "age": 2, "ht": 2.9},
+... ]
+... )
+>>> df
+    famid  birth  age   ht
+0       1      1    1  2.8
+1       1      1    2  3.4
+2       1      2    1  2.9
+3       1      2    2  3.8
+4       1      3    1  2.2
+5       1      3    2  2.9
+6       2      1    1  2.0
+7       2      1    2  3.2
+8       2      2    1  1.8
+9       2      2    2  2.8
+10      2      3    1  1.9
+11      2      3    2  2.4
+12      3      1    1  2.2
+13      3      1    2  3.3
+14      3      2    1  2.3
+15      3      2    2  3.4
+16      3      3    1  2.1
+17      3      3    2  2.9
+>>> spec = {".name": ["ht1", "ht2"],
+...         ".value": ["ht", "ht"],
+...         "age": [1, 2]}
+>>> spec = pd.DataFrame(spec)
+>>> spec
+  .name .value  age
+0   ht1     ht    1
+1   ht2     ht    2
+>>> pivot_wider_spec(df=df,spec=spec, index=['famid','birth'])
+   famid  birth  ht1  ht2
+0      1      1  2.8  3.4
+1      1      2  2.9  3.8
+2      1      3  2.2  2.9
+3      2      1  2.0  3.2
+4      2      2  1.8  2.8
+5      2      3  1.9  2.4
+6      3      1  2.2  3.3
+7      3      2  2.3  3.4
+8      3      3  2.1  2.9
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ spec + + DataFrame + +
+

A specification DataFrame. +At a minimum, the spec DataFrame +must have '.name' and '.value' columns. +The '.name' column should contain +the names of the columns in the output DataFrame. +The '.value' column should contain the name of the column(s) +in the source DataFrame that will serve as the values. +Additional columns in spec will serve as the columns +to be flipped to wide form. +Note that these additional columns should already exist +in the source DataFrame.

+
+
+ required +
+ index + + list | tuple | str | Pattern + +
+

Name(s) of columns to use as identifier variables. +It should be either a single column name, or a list of column names. +If index is not provided, the DataFrame's index is used. +Column selection is possible using the +select syntax.

+
+
+ None +
+ reset_index + + bool + +
+

Determines whether to reset the index. +Applicable only if index is provided.

+
+
+ True +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame that has been pivoted from long to wide form.

+
+
+ +
+ Source code in janitor/functions/pivot.py +
2392
+2393
+2394
+2395
+2396
+2397
+2398
+2399
+2400
+2401
+2402
+2403
+2404
+2405
+2406
+2407
+2408
+2409
+2410
+2411
+2412
+2413
+2414
+2415
+2416
+2417
+2418
+2419
+2420
+2421
+2422
+2423
+2424
+2425
+2426
+2427
+2428
+2429
+2430
+2431
+2432
+2433
+2434
+2435
+2436
+2437
+2438
+2439
+2440
+2441
+2442
+2443
+2444
+2445
+2446
+2447
+2448
+2449
+2450
+2451
+2452
+2453
+2454
+2455
+2456
+2457
+2458
+2459
+2460
+2461
+2462
+2463
+2464
+2465
+2466
+2467
+2468
+2469
+2470
+2471
+2472
+2473
+2474
+2475
+2476
+2477
+2478
+2479
+2480
+2481
+2482
+2483
+2484
+2485
+2486
+2487
+2488
+2489
+2490
+2491
+2492
+2493
+2494
+2495
+2496
+2497
+2498
+2499
+2500
+2501
+2502
+2503
+2504
+2505
+2506
+2507
+2508
+2509
+2510
+2511
+2512
+2513
+2514
+2515
+2516
+2517
+2518
+2519
+2520
+2521
+2522
+2523
+2524
+2525
+2526
+2527
+2528
+2529
+2530
+2531
+2532
+2533
+2534
+2535
+2536
+2537
def pivot_wider_spec(
+    df: pd.DataFrame,
+    spec: pd.DataFrame,
+    index: list | tuple | str | Pattern = None,
+    reset_index: bool = True,
+) -> pd.DataFrame:
+    """A declarative interface to pivot a DataFrame from long to wide form,
+    where you describe how the data will be pivoted,
+    using a DataFrame.
+
+    This gives you, the user,
+    more control over pivoting, where you create a “spec”
+    data frame that describes exactly how data stored
+    in the column names becomes variables.
+
+    It can come in handy for situations where
+    `pd.DataFrame.pivot`
+    seems inadequate for the transformation.
+
+    !!! info "New in version 0.31.0"
+
+    Examples:
+        >>> import pandas as pd
+        >>> from janitor import pivot_wider_spec
+        >>> df = pd.DataFrame(
+        ... [
+        ...    {"famid": 1, "birth": 1, "age": 1, "ht": 2.8},
+        ...    {"famid": 1, "birth": 1, "age": 2, "ht": 3.4},
+        ...    {"famid": 1, "birth": 2, "age": 1, "ht": 2.9},
+        ...    {"famid": 1, "birth": 2, "age": 2, "ht": 3.8},
+        ...    {"famid": 1, "birth": 3, "age": 1, "ht": 2.2},
+        ...    {"famid": 1, "birth": 3, "age": 2, "ht": 2.9},
+        ...    {"famid": 2, "birth": 1, "age": 1, "ht": 2.0},
+        ...    {"famid": 2, "birth": 1, "age": 2, "ht": 3.2},
+        ...    {"famid": 2, "birth": 2, "age": 1, "ht": 1.8},
+        ...    {"famid": 2, "birth": 2, "age": 2, "ht": 2.8},
+        ...    {"famid": 2, "birth": 3, "age": 1, "ht": 1.9},
+        ...    {"famid": 2, "birth": 3, "age": 2, "ht": 2.4},
+        ...    {"famid": 3, "birth": 1, "age": 1, "ht": 2.2},
+        ...    {"famid": 3, "birth": 1, "age": 2, "ht": 3.3},
+        ...    {"famid": 3, "birth": 2, "age": 1, "ht": 2.3},
+        ...    {"famid": 3, "birth": 2, "age": 2, "ht": 3.4},
+        ...    {"famid": 3, "birth": 3, "age": 1, "ht": 2.1},
+        ...    {"famid": 3, "birth": 3, "age": 2, "ht": 2.9},
+        ... ]
+        ... )
+        >>> df
+            famid  birth  age   ht
+        0       1      1    1  2.8
+        1       1      1    2  3.4
+        2       1      2    1  2.9
+        3       1      2    2  3.8
+        4       1      3    1  2.2
+        5       1      3    2  2.9
+        6       2      1    1  2.0
+        7       2      1    2  3.2
+        8       2      2    1  1.8
+        9       2      2    2  2.8
+        10      2      3    1  1.9
+        11      2      3    2  2.4
+        12      3      1    1  2.2
+        13      3      1    2  3.3
+        14      3      2    1  2.3
+        15      3      2    2  3.4
+        16      3      3    1  2.1
+        17      3      3    2  2.9
+        >>> spec = {".name": ["ht1", "ht2"],
+        ...         ".value": ["ht", "ht"],
+        ...         "age": [1, 2]}
+        >>> spec = pd.DataFrame(spec)
+        >>> spec
+          .name .value  age
+        0   ht1     ht    1
+        1   ht2     ht    2
+        >>> pivot_wider_spec(df=df,spec=spec, index=['famid','birth'])
+           famid  birth  ht1  ht2
+        0      1      1  2.8  3.4
+        1      1      2  2.9  3.8
+        2      1      3  2.2  2.9
+        3      2      1  2.0  3.2
+        4      2      2  1.8  2.8
+        5      2      3  1.9  2.4
+        6      3      1  2.2  3.3
+        7      3      2  2.3  3.4
+        8      3      3  2.1  2.9
+
+    Args:
+        df: A pandas DataFrame.
+        spec: A specification DataFrame.
+            At a minimum, the spec DataFrame
+            must have a '.name' and a '.value' column.
+            The '.name' column should contain
+            the names of the columns in the output DataFrame.
+            The '.value' column should contain the name of the column(s)
+            in the source DataFrame that will serve as the values.
+            Additional columns in spec will serve as the columns
+            to be flipped to wide form.
+            Note that these additional columns should already exist
+            in the source DataFrame.
+        index: Name(s) of columns to use as identifier variables.
+            It should be either a single column name, or a list of column names.
+            If `index` is not provided, the DataFrame's index is used.
+            Column selection is possible using the
+            [`select`][janitor.functions.select.select] syntax.
+        reset_index: Determines whether to reset the `index`.
+            Applicable only if `index` is provided.
+
+    Returns:
+        A pandas DataFrame that has been pivoted from long to wide form.
+    """  # noqa: E501
+    check("spec", spec, [pd.DataFrame])
+    check("reset_index", reset_index, [bool])
+    if not spec.columns.is_unique:
+        raise ValueError("Kindly ensure the spec's columns is unique.")
+    if ".name" not in spec.columns:
+        raise KeyError(
+            "Kindly ensure the spec DataFrame has a `.name` column."
+        )
+    if ".value" not in spec.columns:
+        raise KeyError(
+            "Kindly ensure the spec DataFrame has a `.value` column."
+        )
+    if spec.columns.tolist()[:2] != [".name", ".value"]:
+        raise ValueError(
+            "The first two columns of the spec DataFrame "
+            "should be '.name' and '.value', "
+            "with '.name' coming before '.value'."
+        )
+    if spec.columns.size == 2:
+        raise ValueError(
+            "Kindly provide the column(s) "
+            "to use to make new frame’s columns"
+        )
+    columns = spec.columns[2:]
+    values = spec[".value"].unique()
+    if index is not None:
+        index = _select_index([index], df, axis="columns")
+        index = df.columns[index].tolist()
+    df = df.pivot(index=index, columns=columns, values=values)
+    _index = spec.columns[1:].tolist()
+    spec = spec.set_index(_index).squeeze()
+    df = df.reindex(columns=spec.index)
+    df.columns = df.columns.map(spec)
+    if reset_index and index:
+        return df.reset_index()
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ process_text + + +

+ +
+ +

Implementation source for process_text.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ process_text(df, column_name, string_function, **kwargs) + +

+ + +
+ +

Apply a Pandas string method to an existing column.

+

This function aims to make string cleaning easy, while chaining, +by simply passing the string method name, +along with keyword arguments, if any, to the function.

+

This modifies an existing column; it does not create a new column; +new columns can be created via pyjanitor's +transform_columns.

+

A list of all the string methods in Pandas can be accessed here.

+
+

Note

+

This function will be deprecated in a 1.x release. +Please use jn.transform_column +instead.

+
+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> import re
+>>> df = pd.DataFrame({"text": ["Ragnar", "sammywemmy", "ginger"],
+... "code": [1, 2, 3]})
+>>> df
+         text  code
+0      Ragnar     1
+1  sammywemmy     2
+2      ginger     3
+>>> df.process_text(column_name="text", string_function="lower")
+         text  code
+0      ragnar     1
+1  sammywemmy     2
+2      ginger     3
+
+

For string methods with parameters, simply pass the keyword arguments:

+
>>> df.process_text(
+...     column_name="text",
+...     string_function="extract",
+...     pat=r"(ag)",
+...     expand=False,
+...     flags=re.IGNORECASE,
+... )
+  text  code
+0   ag     1
+1  NaN     2
+2  NaN     3
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + str + +
+

String column to be operated on.

+
+
+ required +
+ string_function + + str + +
+

pandas string method to be applied.

+
+
+ required +
+ **kwargs + + Any + +
+

Keyword arguments for parameters of the string_function.

+
+
+ {} +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ KeyError + +
+

If string_function is not a Pandas string method.

+
+
+ ValueError + +
+

If the text function returns a DataFrame, instead of a Series.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with modified column.

+
+
+ +
+ Source code in janitor/functions/process_text.py +
 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `jn.transform_columns` instead."
+    )
+)
+@deprecated_alias(column="column_name")
+def process_text(
+    df: pd.DataFrame,
+    column_name: str,
+    string_function: str,
+    **kwargs: Any,
+) -> pd.DataFrame:
+    """Apply a Pandas string method to an existing column.
+
+    This function aims to make string cleaning easy, while chaining,
+    by simply passing the string method name,
+    along with keyword arguments, if any, to the function.
+
+    This modifies an existing column; it does not create a new column;
+    new columns can be created via pyjanitor's
+    [`transform_columns`][janitor.functions.transform_columns.transform_columns].
+
+    A list of all the string methods in Pandas can be accessed [here](https://pandas.pydata.org/docs/user_guide/text.html#method-summary).
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use [`jn.transform_column`][janitor.functions.transform_columns.transform_column]
+        instead.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> import re
+        >>> df = pd.DataFrame({"text": ["Ragnar", "sammywemmy", "ginger"],
+        ... "code": [1, 2, 3]})
+        >>> df
+                 text  code
+        0      Ragnar     1
+        1  sammywemmy     2
+        2      ginger     3
+        >>> df.process_text(column_name="text", string_function="lower")
+                 text  code
+        0      ragnar     1
+        1  sammywemmy     2
+        2      ginger     3
+
+        For string methods with parameters, simply pass the keyword arguments:
+
+        >>> df.process_text(
+        ...     column_name="text",
+        ...     string_function="extract",
+        ...     pat=r"(ag)",
+        ...     expand=False,
+        ...     flags=re.IGNORECASE,
+        ... )
+          text  code
+        0   ag     1
+        1  NaN     2
+        2  NaN     3
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: String column to be operated on.
+        string_function: pandas string method to be applied.
+        **kwargs: Keyword arguments for parameters of the `string_function`.
+
+    Raises:
+        KeyError: If `string_function` is not a Pandas string method.
+        ValueError: If the text function returns a DataFrame, instead of a Series.
+
+    Returns:
+        A pandas DataFrame with modified column.
+    """  # noqa: E501
+
+    check("column_name", column_name, [str])
+    check("string_function", string_function, [str])
+    check_column(df, [column_name])
+
+    pandas_string_methods = [
+        func.__name__
+        for _, func in inspect.getmembers(pd.Series.str, inspect.isfunction)
+        if not func.__name__.startswith("_")
+    ]
+
+    if string_function not in pandas_string_methods:
+        raise KeyError(f"{string_function} is not a Pandas string method.")
+
+    result = getattr(df[column_name].str, string_function)(**kwargs)
+
+    if isinstance(result, pd.DataFrame):
+        raise ValueError(
+            "The outcome of the processed text is a DataFrame, "
+            "which is not supported in `process_text`."
+        )
+
+    return df.assign(**{column_name: result})
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ remove_columns + + +

+ +
+ +

Implementation of remove_columns.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ remove_columns(df, column_names) + +

+ + +
+ +

Remove the set of columns specified in column_names.

+

This method does not mutate the original DataFrame.

+

Intended to be the method-chaining alternative to del df[col].

+
+

Note

+

This function will be deprecated in a 1.x release. +Kindly use pd.DataFrame.drop instead.

+
+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": [2, 4, 6], "b": [1, 3, 5], "c": [7, 8, 9]})
+>>> df
+   a  b  c
+0  2  1  7
+1  4  3  8
+2  6  5  9
+>>> df.remove_columns(column_names=['a', 'c'])
+   b
+0  1
+1  3
+2  5
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_names + + Union[str, Iterable[str], Hashable] + +
+

The columns to remove.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/remove_columns.py +
11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `pd.DataFrame.drop` instead."
+    )
+)
+@deprecated_alias(columns="column_names")
+def remove_columns(
+    df: pd.DataFrame,
+    column_names: Union[str, Iterable[str], Hashable],
+) -> pd.DataFrame:
+    """Remove the set of columns specified in `column_names`.
+
+    This method does not mutate the original DataFrame.
+
+    Intended to be the method-chaining alternative to `del df[col]`.
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Kindly use `pd.DataFrame.drop` instead.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": [2, 4, 6], "b": [1, 3, 5], "c": [7, 8, 9]})
+        >>> df
+           a  b  c
+        0  2  1  7
+        1  4  3  8
+        2  6  5  9
+        >>> df.remove_columns(column_names=['a', 'c'])
+           b
+        0  1
+        1  3
+        2  5
+
+    Args:
+        df: A pandas DataFrame.
+        column_names: The columns to remove.
+
+    Returns:
+        A pandas DataFrame.
+    """
+
+    return df.drop(columns=column_names)
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ remove_empty + + +

+ +
+ +

Implementation of remove_empty.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ remove_empty(df, reset_index=True) + +

+ + +
+ +

Drop all rows and columns that are completely null.

+

This method does not mutate the original DataFrame.

+

Implementation is inspired by StackOverflow.

+ + +

Examples:

+
>>> import numpy as np
+>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "a": [1, np.nan, 2],
+...     "b": [3, np.nan, 4],
+...     "c": [np.nan, np.nan, np.nan],
+... })
+>>> df
+     a    b   c
+0  1.0  3.0 NaN
+1  NaN  NaN NaN
+2  2.0  4.0 NaN
+>>> df.remove_empty()
+     a    b
+0  1.0  3.0
+1  2.0  4.0
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

The pandas DataFrame object.

+
+
+ required +
+ reset_index + + bool + +
+

Determines if the index is reset.

+
+
+ True +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/remove_empty.py +
 7
+ 8
+ 9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
@pf.register_dataframe_method
+def remove_empty(df: pd.DataFrame, reset_index: bool = True) -> pd.DataFrame:
+    """Drop all rows and columns that are completely null.
+
+    This method does not mutate the original DataFrame.
+
+    Implementation is inspired by [StackOverflow][so].
+
+    [so]: https://stackoverflow.com/questions/38884538/python-pandas-find-all-rows-where-all-values-are-nan
+
+    Examples:
+        >>> import numpy as np
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "a": [1, np.nan, 2],
+        ...     "b": [3, np.nan, 4],
+        ...     "c": [np.nan, np.nan, np.nan],
+        ... })
+        >>> df
+             a    b   c
+        0  1.0  3.0 NaN
+        1  NaN  NaN NaN
+        2  2.0  4.0 NaN
+        >>> df.remove_empty()
+             a    b
+        0  1.0  3.0
+        1  2.0  4.0
+
+    Args:
+        df: The pandas DataFrame object.
+        reset_index: Determines if the index is reset.
+
+    Returns:
+        A pandas DataFrame.
+    """  # noqa: E501
+    outcome = df.isna()
+    outcome = df.loc[~outcome.all(axis=1), ~outcome.all(axis=0)]
+    if reset_index:
+        return outcome.reset_index(drop=True)
+    return outcome
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ rename_columns + + +

+ +
+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ rename_column(df, old_column_name, new_column_name) + +

+ + +
+ +

Rename a column.

+

This method does not mutate the original DataFrame.

+
+

Note

+

This function will be deprecated in a 1.x release. +Please use pd.DataFrame.rename instead.

+
+

This is just syntactic sugar/a convenience function for renaming one column at a time. +If you are convinced that there are multiple columns in need of changing, +then use the pandas.DataFrame.rename method.

+ + +

Examples:

+

Change the name of column 'a' to 'a_new'.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": list(range(3)), "b": list("abc")})
+>>> df.rename_column(old_column_name='a', new_column_name='a_new')
+   a_new  b
+0      0  a
+1      1  b
+2      2  c
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

The pandas DataFrame object.

+
+
+ required +
+ old_column_name + + str + +
+

The old column name.

+
+
+ required +
+ new_column_name + + str + +
+

The new column name.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with renamed columns.

+
+
+ +
+ Source code in janitor/functions/rename_columns.py +
 9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `pd.DataFrame.rename` instead."
+    )
+)
+@deprecated_alias(old="old_column_name", new="new_column_name")
+def rename_column(
+    df: pd.DataFrame,
+    old_column_name: str,
+    new_column_name: str,
+) -> pd.DataFrame:
+    """Rename a column.
+
+    This method does not mutate the original DataFrame.
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `pd.DataFrame.rename` instead.
+
+    This is just syntactic sugar/a convenience function for renaming one column at a time.
+    If you are convinced that there are multiple columns in need of changing,
+    then use the `pandas.DataFrame.rename` method.
+
+    Examples:
+        Change the name of column 'a' to 'a_new'.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": list(range(3)), "b": list("abc")})
+        >>> df.rename_column(old_column_name='a', new_column_name='a_new')
+           a_new  b
+        0      0  a
+        1      1  b
+        2      2  c
+
+    Args:
+        df: The pandas DataFrame object.
+        old_column_name: The old column name.
+        new_column_name: The new column name.
+
+    Returns:
+        A pandas DataFrame with renamed columns.
+    """  # noqa: E501
+
+    check_column(df, [old_column_name])
+
+    return df.rename(columns={old_column_name: new_column_name})
+
+
+
+ +
+ +
+ + +

+ rename_columns(df, new_column_names=None, function=None) + +

+ + +
+ +

Rename columns.

+

This method does not mutate the original DataFrame.

+
+

Note

+

This function will be deprecated in a 1.x release. +Please use pd.DataFrame.rename instead.

+
+

At least one of new_column_names or function must be provided. +If both are provided, then new_column_names takes priority and function +is never executed.

+ + +

Examples:

+

Rename columns using a dictionary which maps old names to new names.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": list(range(3)), "b": list("xyz")})
+>>> df
+   a  b
+0  0  x
+1  1  y
+2  2  z
+>>> df.rename_columns(new_column_names={"a": "a_new", "b": "b_new"})
+   a_new b_new
+0      0     x
+1      1     y
+2      2     z
+
+

Rename columns using a generic callable.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": list(range(3)), "b": list("xyz")})
+>>> df.rename_columns(function=str.upper)
+   A  B
+0  0  x
+1  1  y
+2  2  z
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

The pandas DataFrame object.

+
+
+ required +
+ new_column_names + + Union[Dict, None] + +
+

A dictionary of old and new column names.

+
+
+ None +
+ function + + Callable + +
+

A function which should be applied to all the columns.

+
+
+ None +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If both new_column_names and function are None.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with renamed columns.

+
+
+ +
+ Source code in janitor/functions/rename_columns.py +
 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `pd.DataFrame.rename` instead."
+    )
+)
+def rename_columns(
+    df: pd.DataFrame,
+    new_column_names: Union[Dict, None] = None,
+    function: Callable = None,
+) -> pd.DataFrame:
+    """Rename columns.
+
+    This method does not mutate the original DataFrame.
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `pd.DataFrame.rename` instead.
+
+    At least one of `new_column_names` or `function` must be provided.
+    If both are provided, then `new_column_names` takes priority and `function`
+    is never executed.
+
+    Examples:
+        Rename columns using a dictionary which maps old names to new names.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": list(range(3)), "b": list("xyz")})
+        >>> df
+           a  b
+        0  0  x
+        1  1  y
+        2  2  z
+        >>> df.rename_columns(new_column_names={"a": "a_new", "b": "b_new"})
+           a_new b_new
+        0      0     x
+        1      1     y
+        2      2     z
+
+        Rename columns using a generic callable.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": list(range(3)), "b": list("xyz")})
+        >>> df.rename_columns(function=str.upper)
+           A  B
+        0  0  x
+        1  1  y
+        2  2  z
+
+    Args:
+        df: The pandas DataFrame object.
+        new_column_names: A dictionary of old and new column names.
+        function: A function which should be applied to all the columns.
+
+    Raises:
+        ValueError: If both `new_column_names` and `function` are None.
+
+    Returns:
+        A pandas DataFrame with renamed columns.
+    """  # noqa: E501
+
+    if new_column_names is None and function is None:
+        raise ValueError(
+            "One of new_column_names or function must be provided"
+        )
+
+    if new_column_names is not None:
+        check_column(df, new_column_names)
+        return df.rename(columns=new_column_names)
+
+    return df.rename(mapper=function, axis="columns")
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ reorder_columns + + +

+ +
+ +

Implementation source for reorder_columns.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ reorder_columns(df, column_order) + +

+ + +
+ +

Reorder DataFrame columns by specifying desired order as list of col names.

+

Columns not specified retain their order and follow after the columns specified +in column_order.

+

All columns specified within the column_order list must be present within df.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"col1": [1, 1, 1], "col2": [2, 2, 2], "col3": [3, 3, 3]})
+>>> df
+   col1  col2  col3
+0     1     2     3
+1     1     2     3
+2     1     2     3
+>>> df.reorder_columns(['col3', 'col1'])
+   col3  col1  col2
+0     3     1     2
+1     3     1     2
+2     3     1     2
+
+

Notice that the column order of df is now col3, col1, col2.

+

Internally, this function uses DataFrame.reindex with copy=False +to avoid unnecessary data duplication.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

DataFrame to reorder

+
+
+ required +
+ column_order + + Union[Iterable[str], Index, Hashable] + +
+

A list of column names or Pandas Index +specifying their order in the returned DataFrame.

+
+
+ required +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ IndexError + +
+

If a column within column_order is not found +within the DataFrame.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with reordered columns.

+
+
+ +
+ Source code in janitor/functions/reorder_columns.py +
11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
@pf.register_dataframe_method
+def reorder_columns(
+    df: pd.DataFrame, column_order: Union[Iterable[str], pd.Index, Hashable]
+) -> pd.DataFrame:
+    """Reorder DataFrame columns by specifying desired order as list of col names.
+
+    Columns not specified retain their order and follow after the columns specified
+    in `column_order`.
+
+    All columns specified within the `column_order` list must be present within `df`.
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"col1": [1, 1, 1], "col2": [2, 2, 2], "col3": [3, 3, 3]})
+        >>> df
+           col1  col2  col3
+        0     1     2     3
+        1     1     2     3
+        2     1     2     3
+        >>> df.reorder_columns(['col3', 'col1'])
+           col3  col1  col2
+        0     3     1     2
+        1     3     1     2
+        2     3     1     2
+
+        Notice that the column order of `df` is now `col3`, `col1`, `col2`.
+
+    Internally, this function uses `DataFrame.reindex` with `copy=False`
+    to avoid unnecessary data duplication.
+
+    Args:
+        df: `DataFrame` to reorder
+        column_order: A list of column names or Pandas `Index`
+            specifying their order in the returned `DataFrame`.
+
+    Raises:
+        IndexError: If a column within `column_order` is not found
+            within the DataFrame.
+
+    Returns:
+        A pandas DataFrame with reordered columns.
+    """  # noqa: E501
+    check("column_order", column_order, [list, tuple, pd.Index])
+
+    if any(col not in df.columns for col in column_order):
+        raise IndexError(
+            "One or more columns in `column_order` were not found in the "
+            "DataFrame."
+        )
+
+    # if column_order is a Pandas index, needs conversion to list:
+    column_order = list(column_order)
+
+    return df.reindex(
+        columns=(
+            column_order
+            + [col for col in df.columns if col not in column_order]
+        ),
+        copy=False,
+    )
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ round_to_fraction + + +

+ +
+ +

Implementation of round_to_fraction

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ round_to_fraction(df, column_name, denominator, digits=np.inf) + +

+ + +
+ +

Round all values in a column to a fraction.

+

This method mutates the original DataFrame.

+

Taken from the R package.

+

Also, optionally round to a specified number of digits.

+ + +

Examples:

+

Round numeric column to the nearest 1/4 value.

+
>>> import numpy as np
+>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "a1": [1.263, 2.499, np.nan],
+...     "a2": ["x", "y", "z"],
+... })
+>>> df
+      a1 a2
+0  1.263  x
+1  2.499  y
+2    NaN  z
+>>> df.round_to_fraction("a1", denominator=4)
+     a1 a2
+0  1.25  x
+1  2.50  y
+2   NaN  z
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + Hashable + +
+

Name of column to round to fraction.

+
+
+ required +
+ denominator + + float + +
+

The denominator of the fraction for rounding. Must be +a positive number.

+
+
+ required +
+ digits + + float + +
+

The number of digits for rounding after rounding to the +fraction. Default is np.inf (i.e. no subsequent rounding).

+
+
+ inf +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If denominator is not a positive number.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with a column's values rounded.

+
+
+ +
+ Source code in janitor/functions/round_to_fraction.py +
12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
@pf.register_dataframe_method
+@deprecated_alias(col_name="column_name")
+def round_to_fraction(
+    df: pd.DataFrame,
+    column_name: Hashable,
+    denominator: float,
+    digits: float = np.inf,
+) -> pd.DataFrame:
+    """Round all values in a column to a fraction.
+
+    This method mutates the original DataFrame.
+
+    Taken from [the R package](https://github.com/sfirke/janitor/issues/235).
+
+    Also, optionally round to a specified number of digits.
+
+    Examples:
+        Round numeric column to the nearest 1/4 value.
+
+        >>> import numpy as np
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "a1": [1.263, 2.499, np.nan],
+        ...     "a2": ["x", "y", "z"],
+        ... })
+        >>> df
+              a1 a2
+        0  1.263  x
+        1  2.499  y
+        2    NaN  z
+        >>> df.round_to_fraction("a1", denominator=4)
+             a1 a2
+        0  1.25  x
+        1  2.50  y
+        2   NaN  z
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: Name of column to round to fraction.
+        denominator: The denominator of the fraction for rounding. Must be
+            a positive number.
+        digits: The number of digits for rounding after rounding to the
+            fraction. Default is np.inf (i.e. no subsequent rounding).
+
+    Raises:
+        ValueError: If `denominator` is not a positive number.
+
+    Returns:
+        A pandas DataFrame with a column's values rounded.
+    """
+    check_column(df, column_name)
+    check("denominator", denominator, [float, int])
+    check("digits", digits, [float, int])
+
+    if denominator <= 0:
+        raise ValueError("denominator is expected to be a positive number.")
+
+    df[column_name] = round(df[column_name] * denominator, 0) / denominator
+    if not np.isinf(digits):
+        df[column_name] = round(df[column_name], digits)
+
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ row_to_names + + +

+ +
+ +

Implementation of the row_to_names function.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ row_to_names(df, row_numbers=0, remove_rows=False, remove_rows_above=False, reset_index=False) + +

+ + +
+ +

Elevates a row, or rows, to be the column names of a DataFrame.

+

This method does not mutate the original DataFrame.

+

Contains options to remove the elevated row from the DataFrame along with +removing the rows above the selected row.

+ + +

Examples:

+

Replace column names with the first row and reset the index.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "a": ["nums", 6, 9],
+...     "b": ["chars", "x", "y"],
+... })
+>>> df
+      a      b
+0  nums  chars
+1     6      x
+2     9      y
+>>> df.row_to_names(0, remove_rows=True, reset_index=True)
+  nums chars
+0    6     x
+1    9     y
+>>> df.row_to_names([0,1], remove_rows=True, reset_index=True)
+  nums chars
+     6     x
+0    9     y
+
+

Remove rows above the elevated row and the elevated row itself.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "a": ["bla1", "nums", 6, 9],
+...     "b": ["bla2", "chars", "x", "y"],
+... })
+>>> df
+      a      b
+0  bla1   bla2
+1  nums  chars
+2     6      x
+3     9      y
+>>> df.row_to_names(1, remove_rows=True, remove_rows_above=True, reset_index=True)
+  nums chars
+0    6     x
+1    9     y
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ row_numbers + + int | list | slice + +
+

Position of the row(s) containing the variable names. +It can be an integer, a list or a slice. +Defaults to 0 (first row).

+
+
+ 0 +
+ remove_rows + + bool + +
+

Whether the row(s) should be removed from the DataFrame.

+
+
+ False +
+ remove_rows_above + + bool + +
+

Whether the row(s) above the selected row should +be removed from the DataFrame.

+
+
+ False +
+ reset_index + + bool + +
+

Whether the index should be reset on the returning DataFrame.

+
+
+ False +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with set column names.

+
+
+ +
+ Source code in janitor/functions/row_to_names.py +
14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
@pf.register_dataframe_method
+@deprecated_alias(row_number="row_numbers", remove_row="remove_rows")
+def row_to_names(
+    df: pd.DataFrame,
+    row_numbers: int | list | slice = 0,
+    remove_rows: bool = False,
+    remove_rows_above: bool = False,
+    reset_index: bool = False,
+) -> pd.DataFrame:
+    """Elevates a row, or rows, to be the column names of a DataFrame.
+
+    This method does not mutate the original DataFrame.
+
+    Contains options to remove the elevated row from the DataFrame along with
+    removing the rows above the selected row.
+
+    When several rows are selected, each one becomes a level of the column
+    header (see the `[0,1]` call in the first example below).
+
+    Examples:
+        Replace column names with the first row and reset the index.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "a": ["nums", 6, 9],
+        ...     "b": ["chars", "x", "y"],
+        ... })
+        >>> df
+              a      b
+        0  nums  chars
+        1     6      x
+        2     9      y
+        >>> df.row_to_names(0, remove_rows=True, reset_index=True)
+          nums chars
+        0    6     x
+        1    9     y
+        >>> df.row_to_names([0,1], remove_rows=True, reset_index=True)
+          nums chars
+             6     x
+        0    9     y
+
+        Remove rows above the elevated row and the elevated row itself.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "a": ["bla1", "nums", 6, 9],
+        ...     "b": ["bla2", "chars", "x", "y"],
+        ... })
+        >>> df
+              a      b
+        0  bla1   bla2
+        1  nums  chars
+        2     6      x
+        3     9      y
+        >>> df.row_to_names(1, remove_rows=True, remove_rows_above=True, reset_index=True)
+          nums chars
+        0    6     x
+        1    9     y
+
+    Args:
+        df: A pandas DataFrame.
+        row_numbers: Position of the row(s) containing the variable names.
+            It can be an integer, a list or a slice.
+            Defaults to 0 (first row).
+        remove_rows: Whether the row(s) should be removed from the DataFrame.
+        remove_rows_above: Whether the row(s) above the selected row should
+            be removed from the DataFrame.
+        reset_index: Whether the index should be reset on the returning DataFrame.
+
+    Returns:
+        A pandas DataFrame with set column names.
+    """  # noqa: E501
+
+    # All the work is delegated to the private helper. `row_numbers` is the
+    # only positional argument; presumably `_row_to_names` dispatches on its
+    # type (int / list / slice) -- TODO confirm against the helper.
+    return _row_to_names(
+        row_numbers,
+        df=df,
+        remove_rows=remove_rows,
+        remove_rows_above=remove_rows_above,
+        reset_index=reset_index,
+    )
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ select + + +

+ +
+ + + + + + + + +
+ + + + + + + + +
+ + + +

+ DropLabel + + + + dataclass + + +

+ + +
+ + +

Helper class for removing labels within the select syntax.

+

label can be any of the types supported in the select, +select_rows and select_columns functions. +An array of integers not matching the labels is returned.

+
+

New in version 0.24.0

+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ label + + Any + +
+

Label(s) to be dropped from the index.

+
+
+ required +
+ + + + + + +
+ Source code in janitor/functions/select.py +
538
+539
+540
+541
+542
+543
+544
+545
+546
+547
+548
+549
+550
+551
+552
@dataclass
+class DropLabel:
+    """Helper class for removing labels within the `select` syntax.
+
+    `label` can be any of the types supported in the `select`,
+    `select_rows` and `select_columns` functions.
+    An array of integers not matching the labels is returned.
+
+    !!! info "New in version 0.24.0"
+
+    Args:
+        label: Label(s) to be dropped from the index.
+    """
+
+    # The selection to exclude. It is stored untouched; this class defines
+    # no logic of its own beyond what `@dataclass` generates.
+    label: Any
+
+
+ + + +
+ + + + + + + + + + + +
+ +
+ +
+ + +
+ + +

+ get_columns(group, label) + +

+ + +
+ +

Helper function for selecting columns on a grouped object, +using the +select syntax.

+
+

New in version 0.25.0

+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ group + + DataFrameGroupBy | SeriesGroupBy + +
+

A Pandas GroupBy object.

+
+
+ required +
+ label + + Any + +
+

column(s) to select.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrameGroupBy | SeriesGroupBy + +
+

A pandas groupby object.

+
+
+ +
+ Source code in janitor/functions/select.py +
477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
def get_columns(
+    group: DataFrameGroupBy | SeriesGroupBy, label: Any
+) -> DataFrameGroupBy | SeriesGroupBy:
+    """
+    Helper function for selecting columns on a grouped object,
+    using the
+    [`select`][janitor.functions.select.select] syntax.
+
+    !!! info "New in version 0.25.0"
+
+    Args:
+        group: A Pandas GroupBy object.
+        label: column(s) to select.
+
+    Returns:
+        A pandas groupby object.
+    """
+    check("groupby object", group, [DataFrameGroupBy, SeriesGroupBy])
+    label = get_index_labels(label, group.obj, axis="columns")
+    label = label if is_scalar(label) else list(label)
+    return group[label]
+
+
+
+ +
+ +
+ + +

+ get_index_labels(arg, df, axis) + +

+ + +
+ +

Convenience function to get the actual labels from the columns or the index.

+
+

New in version 0.25.0

+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ arg + + Any + +
+

Valid inputs include: an exact column name to look for, +a shell-style glob string (e.g. *_thing_*), +a regular expression, +a callable, +or variable arguments of all the aforementioned. +A sequence of booleans is also acceptable. +A dictionary can be used for selection +on a MultiIndex on different levels.

+
+
+ required +
+ df + + DataFrame + +
+

The pandas DataFrame object.

+
+
+ required +
+ axis + + Literal['index', 'columns'] + +
+

Should be either index or columns.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Index + +
+

A pandas Index.

+
+
+ +
+ Source code in janitor/functions/select.py +
450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
def get_index_labels(
+    arg: Any, df: pd.DataFrame, axis: Literal["index", "columns"]
+) -> pd.Index:
+    """Convenience function to get actual labels from column/index
+
+    !!! info "New in version 0.25.0"
+
+    Args:
+        arg: Valid inputs include: an exact column name to look for,
+            a shell-style glob string (e.g. `*_thing_*`),
+            a regular expression,
+            a callable,
+            or variable arguments of all the aforementioned.
+            A sequence of booleans is also acceptable.
+            A dictionary can be used for selection
+            on a MultiIndex on different levels.
+        df: The pandas DataFrame object.
+        axis: Should be either `index` or `columns`.
+
+    Returns:
+        A pandas Index.
+    """
+    assert axis in {"index", "columns"}
+    index = getattr(df, axis)
+    return index[_select_index(arg, df, axis)]
+
+
+
+ +
+ +
+ + +

+ select(df, *args, index=None, columns=None, axis='columns', invert=False) + +

+ + +
+ +

Method-chainable selection of rows and columns.

+

It accepts a string, shell-like glob strings (*string*), +regex, slice, array-like object, or a list of the previous options.

+

Selection on a MultiIndex on a level, or multiple levels, +is possible with a dictionary.

+

This method does not mutate the original DataFrame.

+

Selection can be inverted with the DropLabel class.

+

Optional ability to invert selection of index/columns available as well.

+
+

New in version 0.24.0

+
+
+

Note

+

The preferred option when selecting columns or rows in a Pandas DataFrame +is with .loc or .iloc methods, as they are generally performant. +select is primarily for convenience.

+
+
+

Version Changed

+
    +
  • 0.26.0
      +
    • Added variable args, invert and axis parameters.
    • +
    • rows keyword deprecated in favour of index.
    • +
    +
  • +
+
+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
+...      index=['cobra', 'viper', 'sidewinder'],
+...      columns=['max_speed', 'shield'])
+>>> df
+            max_speed  shield
+cobra               1       2
+viper               4       5
+sidewinder          7       8
+>>> df.select(index='cobra', columns='shield')
+       shield
+cobra       2
+
+

Labels can be dropped with the DropLabel class:

+
>>> df.select(index=DropLabel('cobra'))
+            max_speed  shield
+viper               4       5
+sidewinder          7       8
+
+

More examples can be found in the +select_columns section.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ *args + + tuple + +
+

Valid inputs include: an exact index name to look for, +a shell-style glob string (e.g. *_thing_*), +a regular expression, +a callable, +or variable arguments of all the aforementioned. +A sequence of booleans is also acceptable. +A dictionary can be used for selection +on a MultiIndex on different levels.

+
+
+ () +
+ index + + Any + +
+

Valid inputs include: an exact label to look for, +a shell-style glob string (e.g. *_thing_*), +a regular expression, +a callable, +or variable arguments of all the aforementioned. +A sequence of booleans is also acceptable. +A dictionary can be used for selection +on a MultiIndex on different levels.

+
+
+ None +
+ columns + + Any + +
+

Valid inputs include: an exact label to look for, +a shell-style glob string (e.g. *_thing_*), +a regular expression, +a callable, +or variable arguments of all the aforementioned. +A sequence of booleans is also acceptable. +A dictionary can be used for selection +on a MultiIndex on different levels.

+
+
+ None +
+ invert + + bool + +
+

Whether or not to invert the selection. +This will result in the selection +of the complement of the rows/columns provided.

+
+
+ False +
+ axis + + str + +
+

Whether the selection should be on the index('index'), +or columns('columns'). +Applicable only for the variable args parameter.

+
+
+ 'columns' +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If variable args are provided together with index/columns, or if axis is neither 'index' nor 'columns'.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with the specified rows and/or columns selected.

+
+
+ +
+ Source code in janitor/functions/select.py +
330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
@pf.register_dataframe_method
+@deprecated_alias(rows="index")
+def select(
+    df: pd.DataFrame,
+    *args: tuple,
+    index: Any = None,
+    columns: Any = None,
+    axis: str = "columns",
+    invert: bool = False,
+) -> pd.DataFrame:
+    """Method-chainable selection of rows and columns.
+
+    It accepts a string, shell-like glob strings `(*string*)`,
+    regex, slice, array-like object, or a list of the previous options.
+
+    Selection on a MultiIndex on a level, or multiple levels,
+    is possible with a dictionary.
+
+    This method does not mutate the original DataFrame.
+
+    Selection can be inverted with the `DropLabel` class.
+
+    Optional ability to invert selection of index/columns available as well.
+
+
+    !!! info "New in version 0.24.0"
+
+
+    !!!note
+
+        The preferred option when selecting columns or rows in a Pandas DataFrame
+        is with `.loc` or `.iloc` methods, as they are generally performant.
+        `select` is primarily for convenience.
+
+    !!! abstract "Version Changed"
+
+        - 0.26.0
+            - Added variable `args`, `invert` and `axis` parameters.
+            - `rows` keyword deprecated in favour of `index`.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
+        ...      index=['cobra', 'viper', 'sidewinder'],
+        ...      columns=['max_speed', 'shield'])
+        >>> df
+                    max_speed  shield
+        cobra               1       2
+        viper               4       5
+        sidewinder          7       8
+        >>> df.select(index='cobra', columns='shield')
+               shield
+        cobra       2
+
+        Labels can be dropped with the `DropLabel` class:
+
+        >>> df.select(index=DropLabel('cobra'))
+                    max_speed  shield
+        viper               4       5
+        sidewinder          7       8
+
+    More examples can be found in the
+    [`select_columns`][janitor.functions.select.select_columns] section.
+
+    Args:
+        df: A pandas DataFrame.
+        *args: Valid inputs include: an exact index name to look for,
+            a shell-style glob string (e.g. `*_thing_*`),
+            a regular expression,
+            a callable,
+            or variable arguments of all the aforementioned.
+            A sequence of booleans is also acceptable.
+            A dictionary can be used for selection
+            on a MultiIndex on different levels.
+        index: Valid inputs include: an exact label to look for,
+            a shell-style glob string (e.g. `*_thing_*`),
+            a regular expression,
+            a callable,
+            or variable arguments of all the aforementioned.
+            A sequence of booleans is also acceptable.
+            A dictionary can be used for selection
+            on a MultiIndex on different levels.
+        columns: Valid inputs include: an exact label to look for,
+            a shell-style glob string (e.g. `*_thing_*`),
+            a regular expression,
+            a callable,
+            or variable arguments of all the aforementioned.
+            A sequence of booleans is also acceptable.
+            A dictionary can be used for selection
+            on a MultiIndex on different levels.
+        invert: Whether or not to invert the selection.
+            This will result in the selection
+            of the complement of the rows/columns provided.
+        axis: Whether the selection should be on the index('index'),
+            or columns('columns').
+            Applicable only for the variable args parameter.
+
+    Raises:
+        ValueError: If args and index/columns are provided.
+
+    Returns:
+        A pandas DataFrame with the specified rows and/or columns selected.
+    """  # noqa: E501
+
+    if args:
+        check("invert", invert, [bool])
+        if (index is not None) or (columns is not None):
+            raise ValueError(
+                "Either provide variable args with the axis parameter, "
+                "or provide arguments to the index and/or columns parameters."
+            )
+        if axis == "index":
+            return _select(df, rows=list(args), columns=columns, invert=invert)
+        if axis == "columns":
+            return _select(df, columns=list(args), rows=index, invert=invert)
+        raise ValueError("axis should be either 'index' or 'columns'.")
+    return _select(df, rows=index, columns=columns, invert=invert)
+
+
+
+ +
+ +
+ + +

+ select_columns(df, *args, invert=False) + +

+ + +
+ +

Method-chainable selection of columns.

+

It accepts a string, shell-like glob strings (*string*), +regex, slice, array-like object, or a list of the previous options.

+

Selection on a MultiIndex on a level, or multiple levels, +is possible with a dictionary.

+

This method does not mutate the original DataFrame.

+

Optional ability to invert selection of columns available as well.

+
+

Note

+

The preferred option when selecting columns or rows in a Pandas DataFrame +is with .loc or .iloc methods. +select_columns is primarily for convenience.

+
+
+

Note

+

This function will be deprecated in a 1.x release. +Please use jn.select instead.

+
+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> from numpy import nan
+>>> pd.set_option("display.max_columns", None)
+>>> pd.set_option("display.expand_frame_repr", False)
+>>> pd.set_option("max_colwidth", None)
+>>> data = {'name': ['Cheetah','Owl monkey','Mountain beaver',
+...                  'Greater short-tailed shrew','Cow'],
+...         'genus': ['Acinonyx', 'Aotus', 'Aplodontia', 'Blarina', 'Bos'],
+...         'vore': ['carni', 'omni', 'herbi', 'omni', 'herbi'],
+...         'order': ['Carnivora','Primates','Rodentia','Soricomorpha','Artiodactyla'],
+...         'conservation': ['lc', nan, 'nt', 'lc', 'domesticated'],
+...         'sleep_total': [12.1, 17.0, 14.4, 14.9, 4.0],
+...         'sleep_rem': [nan, 1.8, 2.4, 2.3, 0.7],
+...         'sleep_cycle': [nan, nan, nan, 0.133333333, 0.666666667],
+...         'awake': [11.9, 7.0, 9.6, 9.1, 20.0],
+...         'brainwt': [nan, 0.0155, nan, 0.00029, 0.423],
+...         'bodywt': [50.0, 0.48, 1.35, 0.019, 600.0]}
+>>> df = pd.DataFrame(data)
+>>> df
+                         name       genus   vore         order  conservation  sleep_total  sleep_rem  sleep_cycle  awake  brainwt   bodywt
+0                     Cheetah    Acinonyx  carni     Carnivora            lc         12.1        NaN          NaN   11.9      NaN   50.000
+1                  Owl monkey       Aotus   omni      Primates           NaN         17.0        1.8          NaN    7.0  0.01550    0.480
+2             Mountain beaver  Aplodontia  herbi      Rodentia            nt         14.4        2.4          NaN    9.6      NaN    1.350
+3  Greater short-tailed shrew     Blarina   omni  Soricomorpha            lc         14.9        2.3     0.133333    9.1  0.00029    0.019
+4                         Cow         Bos  herbi  Artiodactyla  domesticated          4.0        0.7     0.666667   20.0  0.42300  600.000
+
+

Explicit label selection:

+
>>> df.select_columns('name', 'order')
+                         name         order
+0                     Cheetah     Carnivora
+1                  Owl monkey      Primates
+2             Mountain beaver      Rodentia
+3  Greater short-tailed shrew  Soricomorpha
+4                         Cow  Artiodactyla
+
+

Selection via globbing:

+
>>> df.select_columns("sleep*", "*wt")
+   sleep_total  sleep_rem  sleep_cycle  brainwt   bodywt
+0         12.1        NaN          NaN      NaN   50.000
+1         17.0        1.8          NaN  0.01550    0.480
+2         14.4        2.4          NaN      NaN    1.350
+3         14.9        2.3     0.133333  0.00029    0.019
+4          4.0        0.7     0.666667  0.42300  600.000
+
+

Selection via regex:

+
>>> import re
+>>> df.select_columns(re.compile(r"o.+er"))
+          order  conservation
+0     Carnivora            lc
+1      Primates           NaN
+2      Rodentia            nt
+3  Soricomorpha            lc
+4  Artiodactyla  domesticated
+
+

Selection via slicing:

+
>>> df.select_columns(slice('name','order'), slice('sleep_total','sleep_cycle'))
+                         name       genus   vore         order  sleep_total  sleep_rem  sleep_cycle
+0                     Cheetah    Acinonyx  carni     Carnivora         12.1        NaN          NaN
+1                  Owl monkey       Aotus   omni      Primates         17.0        1.8          NaN
+2             Mountain beaver  Aplodontia  herbi      Rodentia         14.4        2.4          NaN
+3  Greater short-tailed shrew     Blarina   omni  Soricomorpha         14.9        2.3     0.133333
+4                         Cow         Bos  herbi  Artiodactyla          4.0        0.7     0.666667
+
+

Selection via callable:

+
>>> from pandas.api.types import is_numeric_dtype
+>>> df.select_columns(is_numeric_dtype)
+   sleep_total  sleep_rem  sleep_cycle  awake  brainwt   bodywt
+0         12.1        NaN          NaN   11.9      NaN   50.000
+1         17.0        1.8          NaN    7.0  0.01550    0.480
+2         14.4        2.4          NaN    9.6      NaN    1.350
+3         14.9        2.3     0.133333    9.1  0.00029    0.019
+4          4.0        0.7     0.666667   20.0  0.42300  600.000
+>>> df.select_columns(lambda f: f.isna().any())
+   conservation  sleep_rem  sleep_cycle  brainwt
+0            lc        NaN          NaN      NaN
+1           NaN        1.8          NaN  0.01550
+2            nt        2.4          NaN      NaN
+3            lc        2.3     0.133333  0.00029
+4  domesticated        0.7     0.666667  0.42300
+
+

Exclude columns with the invert parameter:

+
>>> df.select_columns(is_numeric_dtype, invert=True)
+                         name       genus   vore         order  conservation
+0                     Cheetah    Acinonyx  carni     Carnivora            lc
+1                  Owl monkey       Aotus   omni      Primates           NaN
+2             Mountain beaver  Aplodontia  herbi      Rodentia            nt
+3  Greater short-tailed shrew     Blarina   omni  Soricomorpha            lc
+4                         Cow         Bos  herbi  Artiodactyla  domesticated
+
+

Exclude columns with the DropLabel class:

+
>>> from janitor import DropLabel
+>>> df.select_columns(DropLabel(slice("name", "awake")), "conservation")
+   brainwt   bodywt  conservation
+0      NaN   50.000            lc
+1  0.01550    0.480           NaN
+2      NaN    1.350            nt
+3  0.00029    0.019            lc
+4  0.42300  600.000  domesticated
+
+

Selection on MultiIndex columns:

+
>>> d = {'num_legs': [4, 4, 2, 2],
+...      'num_wings': [0, 0, 2, 2],
+...      'class': ['mammal', 'mammal', 'mammal', 'bird'],
+...      'animal': ['cat', 'dog', 'bat', 'penguin'],
+...      'locomotion': ['walks', 'walks', 'flies', 'walks']}
+>>> df = pd.DataFrame(data=d)
+>>> df = df.set_index(['class', 'animal', 'locomotion']).T
+>>> df
+class      mammal                bird
+animal        cat   dog   bat penguin
+locomotion  walks walks flies   walks
+num_legs        4     4     2       2
+num_wings       0     0     2       2
+
+

Selection with a scalar:

+
>>> df.select_columns('mammal')
+class      mammal
+animal        cat   dog   bat
+locomotion  walks walks flies
+num_legs        4     4     2
+num_wings       0     0     2
+
+

Selection with a tuple:

+
>>> df.select_columns(('mammal','bat'))
+class      mammal
+animal        bat
+locomotion  flies
+num_legs        2
+num_wings       2
+
+

Selection within a level is possible with a dictionary, +where the key is either a level name or number:

+
>>> df.select_columns({'animal':'cat'})
+class      mammal
+animal        cat
+locomotion  walks
+num_legs        4
+num_wings       0
+>>> df.select_columns({1:["bat", "cat"]})
+class      mammal
+animal        bat   cat
+locomotion  flies walks
+num_legs        2     4
+num_wings       2     0
+
+

Selection on multiple levels:

+
>>> df.select_columns({"class":"mammal", "locomotion":"flies"})
+class      mammal
+animal        bat
+locomotion  flies
+num_legs        2
+num_wings       2
+
+

Selection with a regex on a level:

+
>>> df.select_columns({"animal":re.compile(".+t$")})
+class      mammal
+animal        cat   bat
+locomotion  walks flies
+num_legs        4     2
+num_wings       0     2
+
+

Selection with a callable on a level:

+
>>> df.select_columns({"animal":lambda f: f.str.endswith('t')})
+class      mammal
+animal        cat   bat
+locomotion  walks flies
+num_legs        4     2
+num_wings       0     2
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ *args + + Any + +
+

Valid inputs include: an exact column name to look for, +a shell-style glob string (e.g. *_thing_*), +a regular expression, +a callable, +or variable arguments of all the aforementioned. +A sequence of booleans is also acceptable. +A dictionary can be used for selection +on a MultiIndex on different levels.

+
+
+ () +
+ invert + + bool + +
+

Whether or not to invert the selection. +This will result in the selection +of the complement of the columns provided.

+
+
+ False +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with the specified columns selected.

+
+
+ +
+ Source code in janitor/functions/select.py +
 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `jn.select` instead."
+    )
+)
+def select_columns(
+    df: pd.DataFrame,
+    *args: Any,
+    invert: bool = False,
+) -> pd.DataFrame:
+    """Method-chainable selection of columns.
+
+    It accepts a string, shell-like glob strings `(*string*)`,
+    regex, slice, array-like object, or a list of the previous options.
+
+    Selection on a MultiIndex on a level, or multiple levels,
+    is possible with a dictionary.
+
+    This method does not mutate the original DataFrame.
+
+    Optional ability to invert selection of columns available as well.
+
+    !!!note
+
+        The preferred option when selecting columns or rows in a Pandas DataFrame
+        is with `.loc` or `.iloc` methods.
+        `select_columns` is primarily for convenience.
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `jn.select` instead.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> from numpy import nan
+        >>> pd.set_option("display.max_columns", None)
+        >>> pd.set_option("display.expand_frame_repr", False)
+        >>> pd.set_option("max_colwidth", None)
+        >>> data = {'name': ['Cheetah','Owl monkey','Mountain beaver',
+        ...                  'Greater short-tailed shrew','Cow'],
+        ...         'genus': ['Acinonyx', 'Aotus', 'Aplodontia', 'Blarina', 'Bos'],
+        ...         'vore': ['carni', 'omni', 'herbi', 'omni', 'herbi'],
+        ...         'order': ['Carnivora','Primates','Rodentia','Soricomorpha','Artiodactyla'],
+        ...         'conservation': ['lc', nan, 'nt', 'lc', 'domesticated'],
+        ...         'sleep_total': [12.1, 17.0, 14.4, 14.9, 4.0],
+        ...         'sleep_rem': [nan, 1.8, 2.4, 2.3, 0.7],
+        ...         'sleep_cycle': [nan, nan, nan, 0.133333333, 0.666666667],
+        ...         'awake': [11.9, 7.0, 9.6, 9.1, 20.0],
+        ...         'brainwt': [nan, 0.0155, nan, 0.00029, 0.423],
+        ...         'bodywt': [50.0, 0.48, 1.35, 0.019, 600.0]}
+        >>> df = pd.DataFrame(data)
+        >>> df
+                                 name       genus   vore         order  conservation  sleep_total  sleep_rem  sleep_cycle  awake  brainwt   bodywt
+        0                     Cheetah    Acinonyx  carni     Carnivora            lc         12.1        NaN          NaN   11.9      NaN   50.000
+        1                  Owl monkey       Aotus   omni      Primates           NaN         17.0        1.8          NaN    7.0  0.01550    0.480
+        2             Mountain beaver  Aplodontia  herbi      Rodentia            nt         14.4        2.4          NaN    9.6      NaN    1.350
+        3  Greater short-tailed shrew     Blarina   omni  Soricomorpha            lc         14.9        2.3     0.133333    9.1  0.00029    0.019
+        4                         Cow         Bos  herbi  Artiodactyla  domesticated          4.0        0.7     0.666667   20.0  0.42300  600.000
+
+        Explicit label selection:
+        >>> df.select_columns('name', 'order')
+                                 name         order
+        0                     Cheetah     Carnivora
+        1                  Owl monkey      Primates
+        2             Mountain beaver      Rodentia
+        3  Greater short-tailed shrew  Soricomorpha
+        4                         Cow  Artiodactyla
+
+        Selection via globbing:
+        >>> df.select_columns("sleep*", "*wt")
+           sleep_total  sleep_rem  sleep_cycle  brainwt   bodywt
+        0         12.1        NaN          NaN      NaN   50.000
+        1         17.0        1.8          NaN  0.01550    0.480
+        2         14.4        2.4          NaN      NaN    1.350
+        3         14.9        2.3     0.133333  0.00029    0.019
+        4          4.0        0.7     0.666667  0.42300  600.000
+
+        Selection via regex:
+        >>> import re
+        >>> df.select_columns(re.compile(r"o.+er"))
+                  order  conservation
+        0     Carnivora            lc
+        1      Primates           NaN
+        2      Rodentia            nt
+        3  Soricomorpha            lc
+        4  Artiodactyla  domesticated
+
+        Selection via slicing:
+        >>> df.select_columns(slice('name','order'), slice('sleep_total','sleep_cycle'))
+                                 name       genus   vore         order  sleep_total  sleep_rem  sleep_cycle
+        0                     Cheetah    Acinonyx  carni     Carnivora         12.1        NaN          NaN
+        1                  Owl monkey       Aotus   omni      Primates         17.0        1.8          NaN
+        2             Mountain beaver  Aplodontia  herbi      Rodentia         14.4        2.4          NaN
+        3  Greater short-tailed shrew     Blarina   omni  Soricomorpha         14.9        2.3     0.133333
+        4                         Cow         Bos  herbi  Artiodactyla          4.0        0.7     0.666667
+
+        Selection via callable:
+        >>> from pandas.api.types import is_numeric_dtype
+        >>> df.select_columns(is_numeric_dtype)
+           sleep_total  sleep_rem  sleep_cycle  awake  brainwt   bodywt
+        0         12.1        NaN          NaN   11.9      NaN   50.000
+        1         17.0        1.8          NaN    7.0  0.01550    0.480
+        2         14.4        2.4          NaN    9.6      NaN    1.350
+        3         14.9        2.3     0.133333    9.1  0.00029    0.019
+        4          4.0        0.7     0.666667   20.0  0.42300  600.000
+        >>> df.select_columns(lambda f: f.isna().any())
+           conservation  sleep_rem  sleep_cycle  brainwt
+        0            lc        NaN          NaN      NaN
+        1           NaN        1.8          NaN  0.01550
+        2            nt        2.4          NaN      NaN
+        3            lc        2.3     0.133333  0.00029
+        4  domesticated        0.7     0.666667  0.42300
+
+        Exclude columns with the `invert` parameter:
+        >>> df.select_columns(is_numeric_dtype, invert=True)
+                                 name       genus   vore         order  conservation
+        0                     Cheetah    Acinonyx  carni     Carnivora            lc
+        1                  Owl monkey       Aotus   omni      Primates           NaN
+        2             Mountain beaver  Aplodontia  herbi      Rodentia            nt
+        3  Greater short-tailed shrew     Blarina   omni  Soricomorpha            lc
+        4                         Cow         Bos  herbi  Artiodactyla  domesticated
+
+        Exclude columns with the `DropLabel` class:
+        >>> from janitor import DropLabel
+        >>> df.select_columns(DropLabel(slice("name", "awake")), "conservation")
+           brainwt   bodywt  conservation
+        0      NaN   50.000            lc
+        1  0.01550    0.480           NaN
+        2      NaN    1.350            nt
+        3  0.00029    0.019            lc
+        4  0.42300  600.000  domesticated
+
+        Selection on MultiIndex columns:
+        >>> d = {'num_legs': [4, 4, 2, 2],
+        ...      'num_wings': [0, 0, 2, 2],
+        ...      'class': ['mammal', 'mammal', 'mammal', 'bird'],
+        ...      'animal': ['cat', 'dog', 'bat', 'penguin'],
+        ...      'locomotion': ['walks', 'walks', 'flies', 'walks']}
+        >>> df = pd.DataFrame(data=d)
+        >>> df = df.set_index(['class', 'animal', 'locomotion']).T
+        >>> df
+        class      mammal                bird
+        animal        cat   dog   bat penguin
+        locomotion  walks walks flies   walks
+        num_legs        4     4     2       2
+        num_wings       0     0     2       2
+
+        Selection with a scalar:
+        >>> df.select_columns('mammal')
+        class      mammal
+        animal        cat   dog   bat
+        locomotion  walks walks flies
+        num_legs        4     4     2
+        num_wings       0     0     2
+
+        Selection with a tuple:
+        >>> df.select_columns(('mammal','bat'))
+        class      mammal
+        animal        bat
+        locomotion  flies
+        num_legs        2
+        num_wings       2
+
+        Selection within a level is possible with a dictionary,
+        where the key is either a level name or number:
+        >>> df.select_columns({'animal':'cat'})
+        class      mammal
+        animal        cat
+        locomotion  walks
+        num_legs        4
+        num_wings       0
+        >>> df.select_columns({1:["bat", "cat"]})
+        class      mammal
+        animal        bat   cat
+        locomotion  flies walks
+        num_legs        2     4
+        num_wings       2     0
+
+        Selection on multiple levels:
+        >>> df.select_columns({"class":"mammal", "locomotion":"flies"})
+        class      mammal
+        animal        bat
+        locomotion  flies
+        num_legs        2
+        num_wings       2
+
+        Selection with a regex on a level:
+        >>> df.select_columns({"animal":re.compile(".+t$")})
+        class      mammal
+        animal        cat   bat
+        locomotion  walks flies
+        num_legs        4     2
+        num_wings       0     2
+
+        Selection with a callable on a level:
+        >>> df.select_columns({"animal":lambda f: f.str.endswith('t')})
+        class      mammal
+        animal        cat   bat
+        locomotion  walks flies
+        num_legs        4     2
+        num_wings       0     2
+
+    Args:
+        df: A pandas DataFrame.
+        *args: Valid inputs include: an exact column name to look for,
+            a shell-style glob string (e.g. `*_thing_*`),
+            a regular expression,
+            a callable,
+            or variable arguments of all the aforementioned.
+            A sequence of booleans is also acceptable.
+            A dictionary can be used for selection
+            on a MultiIndex on different levels.
+        invert: Whether or not to invert the selection.
+            This will result in the selection
+            of the complement of the columns provided.
+
+    Returns:
+        A pandas DataFrame with the specified columns selected.
+    """  # noqa: E501
+
+    return _select(df, columns=list(args), invert=invert)
+
+
+
+ +
+ +
+ + +

+ select_rows(df, *args, invert=False) + +

+ + +
+ +

Method-chainable selection of rows.

+

It accepts a string, shell-like glob strings (*string*), +regex, slice, array-like object, or a list of the previous options.

+

Selection on a MultiIndex on a level, or multiple levels, +is possible with a dictionary.

+

This method does not mutate the original DataFrame.

+

Optional ability to invert selection of rows available as well.

+
+

New in version 0.24.0

+
+
+

Note

+

The preferred option when selecting columns or rows in a Pandas DataFrame +is with .loc or .iloc methods, as they are generally performant. +select_rows is primarily for convenience.

+
+
+

Note

+

This function will be deprecated in a 1.x release. +Please use jn.select instead.

+
+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = {"col1": [1, 2], "foo": [3, 4], "col2": [5, 6]}
+>>> df = pd.DataFrame.from_dict(df, orient='index')
+>>> df
+      0  1
+col1  1  2
+foo   3  4
+col2  5  6
+>>> df.select_rows("col*")
+      0  1
+col1  1  2
+col2  5  6
+
+

More examples can be found in the +select_columns section.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ *args + + Any + +
+

Valid inputs include: an exact index name to look for, +a shell-style glob string (e.g. *_thing_*), +a regular expression, +a callable, +or variable arguments of all the aforementioned. +A sequence of booleans is also acceptable. +A dictionary can be used for selection +on a MultiIndex on different levels.

+
+
+ () +
+ invert + + bool + +
+

Whether or not to invert the selection. +This will result in the selection +of the complement of the rows provided.

+
+
+ False +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with the specified rows selected.

+
+
+ +
+ Source code in janitor/functions/select.py +
254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
@pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `jn.select` instead."
+    )
+)
+def select_rows(
+    df: pd.DataFrame,
+    *args: Any,
+    invert: bool = False,
+) -> pd.DataFrame:
+    """Method-chainable selection of rows.
+
+    It accepts a string, shell-like glob strings `(*string*)`,
+    regex, slice, array-like object, or a list of the previous options.
+
+    Selection on a MultiIndex on a level, or multiple levels,
+    is possible with a dictionary.
+
+    This method does not mutate the original DataFrame.
+
+    Optional ability to invert selection of rows available as well.
+
+
+    !!! info "New in version 0.24.0"
+
+    !!!note
+
+        The preferred option when selecting columns or rows in a Pandas DataFrame
+        is with `.loc` or `.iloc` methods, as they are generally performant.
+        `select_rows` is primarily for convenience.
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `jn.select` instead.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = {"col1": [1, 2], "foo": [3, 4], "col2": [5, 6]}
+        >>> df = pd.DataFrame.from_dict(df, orient='index')
+        >>> df
+              0  1
+        col1  1  2
+        foo   3  4
+        col2  5  6
+        >>> df.select_rows("col*")
+              0  1
+        col1  1  2
+        col2  5  6
+
+    More examples can be found in the
+    [`select_columns`][janitor.functions.select.select_columns] section.
+
+    Args:
+        df: A pandas DataFrame.
+        *args: Valid inputs include: an exact index name to look for,
+            a shell-style glob string (e.g. `*_thing_*`),
+            a regular expression,
+            a callable,
+            or variable arguments of all the aforementioned.
+            A sequence of booleans is also acceptable.
+            A dictionary can be used for selection
+            on a MultiIndex on different levels.
+        invert: Whether or not to invert the selection.
+            This will result in the selection
+            of the complement of the rows provided.
+
+    Returns:
+        A pandas DataFrame with the specified rows selected.
+    """  # noqa: E501
+    return _select(df, rows=list(args), invert=invert)
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ shuffle + + +

+ +
+ +

Implementation of shuffle functions.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ shuffle(df, random_state=None, reset_index=True) + +

+ + +
+ +

Shuffle the rows of the DataFrame.

+

This method does not mutate the original DataFrame.

+

Super-sugary syntax! Underneath the hood, we use df.sample(frac=1), +with the option to set the random state.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "col1": range(5),
+...     "col2": list("abcde"),
+... })
+>>> df
+   col1 col2
+0     0    a
+1     1    b
+2     2    c
+3     3    d
+4     4    e
+>>> df.shuffle(random_state=42)
+   col1 col2
+0     1    b
+1     4    e
+2     2    c
+3     0    a
+4     3    d
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ random_state + + Any + +
+

If provided, set a seed for the random number +generator. Passed to pd.DataFrame.sample().

+
+
+ None +
+ reset_index + + bool + +
+

If True, reset the dataframe index to the default +RangeIndex.

+
+
+ True +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A shuffled pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/shuffle.py +
 9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
@pf.register_dataframe_method
+def shuffle(
+    df: pd.DataFrame, random_state: Any = None, reset_index: bool = True
+) -> pd.DataFrame:
+    """Shuffle the rows of the DataFrame.
+
+    This method does not mutate the original DataFrame.
+
+    Super-sugary syntax! Underneath the hood, we use `df.sample(frac=1)`,
+    with the option to set the random state.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "col1": range(5),
+        ...     "col2": list("abcde"),
+        ... })
+        >>> df
+           col1 col2
+        0     0    a
+        1     1    b
+        2     2    c
+        3     3    d
+        4     4    e
+        >>> df.shuffle(random_state=42)
+           col1 col2
+        0     1    b
+        1     4    e
+        2     2    c
+        3     0    a
+        4     3    d
+
+    Args:
+        df: A pandas DataFrame.
+        random_state: If provided, set a seed for the random number
+            generator. Passed to `pd.DataFrame.sample()`.
+        reset_index: If True, reset the dataframe index to the default
+            RangeIndex.
+
+    Returns:
+        A shuffled pandas DataFrame.
+    """
+    result = df.sample(frac=1, random_state=random_state)
+    if reset_index:
+        result = result.reset_index(drop=True)
+    return result
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ sort_column_value_order + + +

+ +
+ +

Implementation of the sort_column_value_order function.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ sort_column_value_order(df, column, column_value_order, columns=None) + +

+ + +
+ +

This function adds precedence to certain values in a specified column, +then sorts based on that column and any other specified columns.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> import numpy as np
+>>> company_sales = {
+...     "SalesMonth": ["Jan", "Feb", "Feb", "Mar", "April"],
+...     "Company1": [150.0, 200.0, 200.0, 300.0, 400.0],
+...     "Company2": [180.0, 250.0, 250.0, np.nan, 500.0],
+...     "Company3": [400.0, 500.0, 500.0, 600.0, 675.0],
+... }
+>>> df = pd.DataFrame.from_dict(company_sales)
+>>> df
+  SalesMonth  Company1  Company2  Company3
+0        Jan     150.0     180.0     400.0
+1        Feb     200.0     250.0     500.0
+2        Feb     200.0     250.0     500.0
+3        Mar     300.0       NaN     600.0
+4      April     400.0     500.0     675.0
+>>> df.sort_column_value_order(
+...     "SalesMonth",
+...     {"April": 1, "Mar": 2, "Feb": 3, "Jan": 4}
+... )
+  SalesMonth  Company1  Company2  Company3
+4      April     400.0     500.0     675.0
+3        Mar     300.0       NaN     600.0
+1        Feb     200.0     250.0     500.0
+2        Feb     200.0     250.0     500.0
+0        Jan     150.0     180.0     400.0
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

pandas DataFrame that we are manipulating

+
+
+ required +
+ column + + str + +
+

This is a column name as a string we are using to specify +which column to sort by

+
+
+ required +
+ column_value_order + + dict + +
+

Dictionary of values that will +represent precedence of the values in the specified column

+
+
+ required +
+ columns + + str + +
+

A list of additional columns that we can sort by

+
+
+ None +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If the chosen column name is not in the +DataFrame, or if the column_value_order dictionary is empty.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A sorted pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/sort_column_value_order.py +
 9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
@pf.register_dataframe_method
+def sort_column_value_order(
+    df: pd.DataFrame,
+    column: str,
+    column_value_order: dict,
+    columns: str = None,
+) -> pd.DataFrame:
+    """This function adds precedence to certain values in a specified column,
+    then sorts based on that column and any other specified columns.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> import numpy as np
+        >>> company_sales = {
+        ...     "SalesMonth": ["Jan", "Feb", "Feb", "Mar", "April"],
+        ...     "Company1": [150.0, 200.0, 200.0, 300.0, 400.0],
+        ...     "Company2": [180.0, 250.0, 250.0, np.nan, 500.0],
+        ...     "Company3": [400.0, 500.0, 500.0, 600.0, 675.0],
+        ... }
+        >>> df = pd.DataFrame.from_dict(company_sales)
+        >>> df
+          SalesMonth  Company1  Company2  Company3
+        0        Jan     150.0     180.0     400.0
+        1        Feb     200.0     250.0     500.0
+        2        Feb     200.0     250.0     500.0
+        3        Mar     300.0       NaN     600.0
+        4      April     400.0     500.0     675.0
+        >>> df.sort_column_value_order(
+        ...     "SalesMonth",
+        ...     {"April": 1, "Mar": 2, "Feb": 3, "Jan": 4}
+        ... )
+          SalesMonth  Company1  Company2  Company3
+        4      April     400.0     500.0     675.0
+        3        Mar     300.0       NaN     600.0
+        1        Feb     200.0     250.0     500.0
+        2        Feb     200.0     250.0     500.0
+        0        Jan     150.0     180.0     400.0
+
+    Args:
+        df: pandas DataFrame that we are manipulating
+        column: This is a column name as a string we are using to specify
+            which column to sort by
+        column_value_order: Dictionary of values that will
+            represent precedence of the values in the specified column
+        columns: A list of additional columns that we can sort by
+
+    Raises:
+        ValueError: If chosen Column Name is not in
+            Dataframe, or if `column_value_order` dictionary is empty.
+
+    Returns:
+        A sorted pandas DataFrame.
+    """
+    # Validation checks
+    check_column(df, column, present=True)
+    check("column_value_order", column_value_order, [dict])
+    if not column_value_order:
+        raise ValueError("column_value_order dictionary cannot be empty")
+
+    df = df.assign(cond_order=df[column].map(column_value_order))
+
+    sort_by = ["cond_order"]
+    if columns is not None:
+        sort_by = ["cond_order"] + columns
+
+    df = df.sort_values(sort_by).remove_columns("cond_order")
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ sort_naturally + + +

+ +
+ +

Implementation of the sort_naturally function.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ sort_naturally(df, column_name, **natsorted_kwargs) + +

+ + +
+ +

Sort a DataFrame by a column using natural sorting.

+

Natural sorting is distinct from +the default lexicographical sorting provided by pandas. +For example, given the following list of items:

+
["A1", "A11", "A3", "A2", "A10"]
+
+

Lexicographical sorting would give us:

+
["A1", "A10", "A11", "A2", "A3"]
+
+

By contrast, "natural" sorting would give us:

+
["A1", "A2", "A3", "A10", "A11"]
+
+

This function thus provides natural sorting +on a single column of a dataframe.

+

To accomplish this, we do a natural sort +on the unique values that are present in the dataframe. +Then, we reconstitute the entire dataframe +in the naturally sorted order.

+

Natural sorting is provided by the Python package +natsort.

+

All keyword arguments to natsort should be provided +after the column name to sort by is provided. +They are passed through to the natsorted function.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame(
+...     {
+...         "Well": ["A21", "A3", "A21", "B2", "B51", "B12"],
+...         "Value": [1, 2, 13, 3, 4, 7],
+...     }
+... )
+>>> df
+  Well  Value
+0  A21      1
+1   A3      2
+2  A21     13
+3   B2      3
+4  B51      4
+5  B12      7
+>>> df.sort_naturally("Well")
+  Well  Value
+1   A3      2
+0  A21      1
+2  A21     13
+3   B2      3
+5  B12      7
+4  B51      4
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + str + +
+

The column on which natural sorting should take place.

+
+
+ required +
+ **natsorted_kwargs + + Any + +
+

Keyword arguments to be passed +to natsort's natsorted function.

+
+
+ {} +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A sorted pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/sort_naturally.py +
10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
@pf.register_dataframe_method
+def sort_naturally(
+    df: pd.DataFrame, column_name: str, **natsorted_kwargs: Any
+) -> pd.DataFrame:
+    """Sort a DataFrame by a column using *natural* sorting.
+
+    Natural sorting is distinct from
+    the default lexicographical sorting provided by `pandas`.
+    For example, given the following list of items:
+
+    ```python
+    ["A1", "A11", "A3", "A2", "A10"]
+    ```
+
+    Lexicographical sorting would give us:
+
+    ```python
+    ["A1", "A10", "A11", "A2", "A3"]
+    ```
+
+    By contrast, "natural" sorting would give us:
+
+    ```python
+    ["A1", "A2", "A3", "A10", "A11"]
+    ```
+
+    This function thus provides *natural* sorting
+    on a single column of a dataframe.
+
+    To accomplish this, we do a natural sort
+    on the unique values that are present in the dataframe.
+    Then, we reconstitute the entire dataframe
+    in the naturally sorted order.
+
+    Natural sorting is provided by the Python package
+    [natsort](https://natsort.readthedocs.io/en/master/index.html).
+
+    All keyword arguments to `natsort` should be provided
+    after the column name to sort by is provided.
+    They are passed through to the `natsorted` function.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "Well": ["A21", "A3", "A21", "B2", "B51", "B12"],
+        ...         "Value": [1, 2, 13, 3, 4, 7],
+        ...     }
+        ... )
+        >>> df
+          Well  Value
+        0  A21      1
+        1   A3      2
+        2  A21     13
+        3   B2      3
+        4  B51      4
+        5  B12      7
+        >>> df.sort_naturally("Well")
+          Well  Value
+        1   A3      2
+        0  A21      1
+        2  A21     13
+        3   B2      3
+        5  B12      7
+        4  B51      4
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: The column on which natural sorting should take place.
+        **natsorted_kwargs: Keyword arguments to be passed
+            to natsort's `natsorted` function.
+
+    Returns:
+        A sorted pandas DataFrame.
+    """
+    new_order = index_natsorted(df[column_name], **natsorted_kwargs)
+    return df.iloc[new_order, :]
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ take_first + + +

+ +
+ +

Implementation of take_first function.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ take_first(df, subset, by, ascending=True) + +

+ + +
+ +

Take the first row within each group specified by subset.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [0, 1, 2, 3]})
+>>> df
+   a  b
+0  x  0
+1  x  1
+2  y  2
+3  y  3
+>>> df.take_first(subset="a", by="b")
+   a  b
+0  x  0
+2  y  2
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ subset + + Union[Hashable, Iterable[Hashable]] + +
+

Column(s) defining the group.

+
+
+ required +
+ by + + Hashable + +
+

Column to sort by.

+
+
+ required +
+ ascending + + bool + +
+

Whether or not to sort in ascending order (a bool).

+
+
+ True +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/take_first.py +
 9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
@pf.register_dataframe_method
+def take_first(
+    df: pd.DataFrame,
+    subset: Union[Hashable, Iterable[Hashable]],
+    by: Hashable,
+    ascending: bool = True,
+) -> pd.DataFrame:
+    """Take the first row within each group specified by `subset`.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [0, 1, 2, 3]})
+        >>> df
+           a  b
+        0  x  0
+        1  x  1
+        2  y  2
+        3  y  3
+        >>> df.take_first(subset="a", by="b")
+           a  b
+        0  x  0
+        2  y  2
+
+    Args:
+        df: A pandas DataFrame.
+        subset: Column(s) defining the group.
+        by: Column to sort by.
+        ascending: Whether or not to sort in ascending order, `bool`.
+
+    Returns:
+        A pandas DataFrame.
+    """
+    result = df.sort_values(by=by, ascending=ascending).drop_duplicates(
+        subset=subset, keep="first"
+    )
+
+    return result
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ then + + +

+ +
+ +

Implementation source for then.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ then(df, func) + +

+ + +
+ +

Add an arbitrary function to run in the pyjanitor method chain.

+

This method does not mutate the original DataFrame.

+
+

Note

+

This function will be deprecated in a 1.x release. +Please use pd.DataFrame.pipe instead.

+
+ + +

Examples:

+

A trivial example using a lambda func.

+
>>> import pandas as pd
+>>> import janitor
+>>> (pd.DataFrame({"a": [1, 2, 3], "b": [7, 8, 9]})
+...  .then(lambda df: df * 2))
+   a   b
+0  2  14
+1  4  16
+2  6  18
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ func + + Callable + +
+

A function you would like to run in the method chain. +It should take one parameter and return one value, each being +a DataFrame object. Beyond that, do whatever you want in the +middle. Go crazy.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/then.py +
11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
@pf.register_dataframe_method
+@refactored_function(
+    message="This function will be deprecated in a 1.x release. "
+    "Kindly use `pd.DataFrame.pipe` instead."
+)
+def then(df: pd.DataFrame, func: Callable) -> pd.DataFrame:
+    """Add an arbitrary function to run in the `pyjanitor` method chain.
+
+    This method does not mutate the original DataFrame.
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `pd.DataFrame.pipe` instead.
+
+    Examples:
+        A trivial example using a lambda `func`.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> (pd.DataFrame({"a": [1, 2, 3], "b": [7, 8, 9]})
+        ...  .then(lambda df: df * 2))
+           a   b
+        0  2  14
+        1  4  16
+        2  6  18
+
+    Args:
+        df: A pandas DataFrame.
+        func: A function you would like to run in the method chain.
+            It should take one parameter and return one parameter, each being
+            the DataFrame object. After that, do whatever you want in the
+            middle. Go crazy.
+
+    Returns:
+        A pandas DataFrame.
+    """
+    df = func(df)
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ to_datetime + + +

+ +
+ +

Implementation source for to_datetime.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ to_datetime(df, column_name, **kwargs) + +

+ + +
+ +

Convert column to a datetime type, in-place.

+

Intended to be the method-chaining equivalent of:

+
df[column_name] = pd.to_datetime(df[column_name], **kwargs)
+
+

This method mutates the original DataFrame.

+
+

Note

+

This function will be deprecated in a 1.x release. +Please use jn.transform_column +instead.

+
+ + +

Examples:

+

Converting a string column to datetime type with custom format.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({'date': ['20200101', '20200202', '20200303']})
+>>> df
+       date
+0  20200101
+1  20200202
+2  20200303
+>>> df.to_datetime('date', format='%Y%m%d')
+        date
+0 2020-01-01
+1 2020-02-02
+2 2020-03-03
+
+

Read the pandas documentation for to_datetime for more information.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + Hashable + +
+

Column name.

+
+
+ required +
+ **kwargs + + Any + +
+

Provide any kwargs that pd.to_datetime can take.

+
+
+ {} +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with updated datetime data.

+
+
+ +
+ Source code in janitor/functions/to_datetime.py +
11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
@pf.register_dataframe_method
+@deprecated_alias(column="column_name")
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `jn.transform_columns` instead."
+    )
+)
+def to_datetime(
+    df: pd.DataFrame, column_name: Hashable, **kwargs: Any
+) -> pd.DataFrame:
+    """Convert column to a datetime type, in-place.
+
+    Intended to be the method-chaining equivalent of:
+
+    ```python
+    df[column_name] = pd.to_datetime(df[column_name], **kwargs)
+    ```
+
+    This method mutates the original DataFrame.
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use [`jn.transform_column`][janitor.functions.transform_columns.transform_column]
+        instead.
+
+    Examples:
+        Converting a string column to datetime type with custom format.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({'date': ['20200101', '20200202', '20200303']})
+        >>> df
+               date
+        0  20200101
+        1  20200202
+        2  20200303
+        >>> df.to_datetime('date', format='%Y%m%d')
+                date
+        0 2020-01-01
+        1 2020-02-02
+        2 2020-03-03
+
+    Read the pandas documentation for [`to_datetime`][pd_docs] for more information.
+
+    [pd_docs]: https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: Column name.
+        **kwargs: Provide any kwargs that `pd.to_datetime` can take.
+
+    Returns:
+        A pandas DataFrame with updated datetime data.
+    """  # noqa: E501
+    df[column_name] = pd.to_datetime(df[column_name], **kwargs)
+
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ toset + + +

+ +
+ +

Implementation of the toset function.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ toset(series) + +

+ + +
+ +

Return a set of the values.

+
+

Note

+

This function will be deprecated in a 1.x release. +Please use set(df[column]) instead.

+
+

These are each a scalar type, which is a Python scalar +(for str, int, float) or a pandas scalar +(for Timestamp/Timedelta/Interval/Period)

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> s = pd.Series([1, 2, 3, 5, 5], index=["a", "b", "c", "d", "e"])
+>>> s
+a    1
+b    2
+c    3
+d    5
+e    5
+dtype: int64
+>>> s.toset()
+{1, 2, 3, 5}
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ series + + Series + +
+

A pandas series.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Set + +
+

A set of values.

+
+
+ +
+ Source code in janitor/functions/toset.py +
11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
@pf.register_series_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `set(df[column])` instead."
+    )
+)
+def toset(series: pd.Series) -> Set:
+    """Return a set of the values.
+
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `set(df[column])` instead.
+
+    These are each a scalar type, which is a Python scalar
+    (for str, int, float) or a pandas scalar
+    (for Timestamp/Timedelta/Interval/Period)
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> s = pd.Series([1, 2, 3, 5, 5], index=["a", "b", "c", "d", "e"])
+        >>> s
+        a    1
+        b    2
+        c    3
+        d    5
+        e    5
+        dtype: int64
+        >>> s.toset()
+        {1, 2, 3, 5}
+
+    Args:
+        series: A pandas series.
+
+    Returns:
+        A set of values.
+    """
+
+    return set(series.tolist())
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ transform_columns + + +

+ +
+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ transform_column(df, column_name, function, dest_column_name=None, elementwise=True) + +

+ + +
+ +

Transform the given column using the provided function.

+

Meant to be the method-chaining equivalent of: +

df[dest_column_name] = df[column_name].apply(function)
+

+

Functions can be applied in one of two ways:

+
    +
  • Element-wise (default; elementwise=True). Then, the individual +column elements will be passed in as the first argument of function.
  • +
  • Column-wise (elementwise=False). Then, function is expected to +take in a pandas Series and return a sequence that is of identical length +to the original.
  • +
+

If dest_column_name is provided, then the transformation result is stored +in that column. Otherwise, the transformed result is stored under the name +of the original column.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+

Transform a column in-place with an element-wise function.

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "a": [2, 3, 4],
+...     "b": ["area", "pyjanitor", "grapefruit"],
+... })
+>>> df
+   a           b
+0  2        area
+1  3   pyjanitor
+2  4  grapefruit
+>>> df.transform_column(
+...     column_name="a",
+...     function=lambda x: x**2 - 1,
+... )
+    a           b
+0   3        area
+1   8   pyjanitor
+2  15  grapefruit
+
+ + +

Examples:

+

Transform a column in-place with a column-wise function.

+
>>> df.transform_column(
+...     column_name="b",
+...     function=lambda srs: srs.str[:5],
+...     elementwise=False,
+... )
+   a      b
+0  2   area
+1  3  pyjan
+2  4  grape
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_name + + Hashable + +
+

The column to transform.

+
+
+ required +
+ function + + Callable + +
+

A function to apply on the column.

+
+
+ required +
+ dest_column_name + + Optional[str] + +
+

The column name to store the transformation result +in. Defaults to None, which will result in the original column +name being overwritten. If a name is provided here, then a new +column with the transformed values will be created.

+
+
+ None +
+ elementwise + + bool + +
+

Whether to apply the function elementwise or not. +If elementwise is True, then the function's first argument +should be the data type of each datum in the column of data, +and should return a transformed datum. +If elementwise is False, then the function should expect +a pandas Series passed into it, and return a pandas Series.

+
+
+ True +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with a transformed column.

+
+
+ +
+ Source code in janitor/functions/transform_columns.py +
 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
@pf.register_dataframe_method
+@deprecated_alias(col_name="column_name", dest_col_name="dest_column_name")
+def transform_column(
+    df: pd.DataFrame,
+    column_name: Hashable,
+    function: Callable,
+    dest_column_name: Optional[str] = None,
+    elementwise: bool = True,
+) -> pd.DataFrame:
+    """Transform the given column using the provided function.
+
+    Meant to be the method-chaining equivalent of:
+    ```python
+    df[dest_column_name] = df[column_name].apply(function)
+    ```
+
+    Functions can be applied in one of two ways:
+
+    - **Element-wise** (default; `elementwise=True`). Then, the individual
+    column elements will be passed in as the first argument of `function`.
+    - **Column-wise** (`elementwise=False`). Then, `function` is expected to
+    take in a pandas Series and return a sequence that is of identical length
+    to the original.
+
+    If `dest_column_name` is provided, then the transformation result is stored
+    in that column. Otherwise, the transformed result is stored under the name
+    of the original column.
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        Transform a column in-place with an element-wise function.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "a": [2, 3, 4],
+        ...     "b": ["area", "pyjanitor", "grapefruit"],
+        ... })
+        >>> df
+           a           b
+        0  2        area
+        1  3   pyjanitor
+        2  4  grapefruit
+        >>> df.transform_column(
+        ...     column_name="a",
+        ...     function=lambda x: x**2 - 1,
+        ... )
+            a           b
+        0   3        area
+        1   8   pyjanitor
+        2  15  grapefruit
+
+    Examples:
+        Transform a column in-place with a column-wise function.
+
+        >>> df.transform_column(
+        ...     column_name="b",
+        ...     function=lambda srs: srs.str[:5],
+        ...     elementwise=False,
+        ... )
+           a      b
+        0  2   area
+        1  3  pyjan
+        2  4  grape
+
+    Args:
+        df: A pandas DataFrame.
+        column_name: The column to transform.
+        function: A function to apply on the column.
+        dest_column_name: The column name to store the transformation result
+            in. Defaults to None, which will result in the original column
+            name being overwritten. If a name is provided here, then a new
+            column with the transformed values will be created.
+        elementwise: Whether to apply the function elementwise or not.
+            If `elementwise` is True, then the function's first argument
+            should be the data type of each datum in the column of data,
+            and should return a transformed datum.
+            If `elementwise` is False, then the function should expect
+            a pandas Series passed into it, and return a pandas Series.
+
+    Returns:
+        A pandas DataFrame with a transformed column.
+    """
+    check_column(df, column_name)
+
+    if dest_column_name is None:
+        dest_column_name = column_name
+    elif dest_column_name != column_name:
+        # If `dest_column_name` is provided and equals `column_name`, then we
+        # assume that the user's intent is to perform an in-place
+        # transformation (Same behaviour as when `dest_column_name` = None).
+        # Otherwise we throw an error if `dest_column_name` already exists in
+        # df.
+        check_column(df, dest_column_name, present=False)
+
+    result = _get_transform_column_result(
+        df[column_name],
+        function,
+        elementwise,
+    )
+
+    return df.assign(**{dest_column_name: result})
+
+
+
+ +
+ +
+ + +

+ transform_columns(df, column_names, function, suffix=None, elementwise=True, new_column_names=None) + +

+ + +
+ +

Transform multiple columns through the same transformation.

+

This method does not mutate the original DataFrame.

+

Super syntactic sugar! +Essentially wraps transform_column +and calls it repeatedly over all column names provided.

+

User can optionally supply either a suffix to create a new set of columns +with the specified suffix, or provide a dictionary mapping each original +column name in column_names to its corresponding new column name. +Note that all column names must be strings.

+ + +

Examples:

+

log10 transform a list of columns, replacing original columns.

+
>>> import numpy as np
+>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "col1": [5, 10, 15],
+...     "col2": [3, 6, 9],
+...     "col3": [10, 100, 1_000],
+... })
+>>> df
+   col1  col2  col3
+0     5     3    10
+1    10     6   100
+2    15     9  1000
+>>> df.transform_columns(["col1", "col2", "col3"], np.log10)
+       col1      col2  col3
+0  0.698970  0.477121   1.0
+1  1.000000  0.778151   2.0
+2  1.176091  0.954243   3.0
+
+

Using the suffix parameter to create new columns.

+
>>> df.transform_columns(["col1", "col3"], np.log10, suffix="_log")
+   col1  col2  col3  col1_log  col3_log
+0     5     3    10  0.698970       1.0
+1    10     6   100  1.000000       2.0
+2    15     9  1000  1.176091       3.0
+
+

Using the new_column_names parameter to create new columns.

+
>>> df.transform_columns(
+...     ["col1", "col3"],
+...     np.log10,
+...     new_column_names={"col1": "transform1"},
+... )
+   col1  col2  col3  transform1
+0     5     3   1.0    0.698970
+1    10     6   2.0    1.000000
+2    15     9   3.0    1.176091
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

A pandas DataFrame.

+
+
+ required +
+ column_names + + Union[List[str], Tuple[str]] + +
+

An iterable of columns to transform.

+
+
+ required +
+ function + + Callable + +
+

A function to apply on each column.

+
+
+ required +
+ suffix + + Optional[str] + +
+

Suffix to use when creating new columns to hold +the transformed values.

+
+
+ None +
+ elementwise + + bool + +
+

Passed on to transform_column; whether or not +to apply the transformation function elementwise (True) +or columnwise (False).

+
+
+ True +
+ new_column_names + + Optional[Dict[str, str]] + +
+

An explicit mapping of old column names in +column_names to new column names. If any column specified in +column_names is not a key in this dictionary, the transformation +will happen in-place for that column.

+
+
+ None +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If both suffix and new_column_names are specified.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with transformed columns.

+
+
+ +
+ Source code in janitor/functions/transform_columns.py +
125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
@pf.register_dataframe_method
+@deprecated_alias(columns="column_names", new_names="new_column_names")
+def transform_columns(
+    df: pd.DataFrame,
+    column_names: Union[List[str], Tuple[str]],
+    function: Callable,
+    suffix: Optional[str] = None,
+    elementwise: bool = True,
+    new_column_names: Optional[Dict[str, str]] = None,
+) -> pd.DataFrame:
+    """Transform multiple columns through the same transformation.
+
+    This method does not mutate the original DataFrame.
+
+    Super syntactic sugar!
+    Essentially wraps [`transform_column`][janitor.functions.transform_columns.transform_column]
+    and calls it repeatedly over all column names provided.
+
+    User can optionally supply either a suffix to create a new set of columns
+    with the specified suffix, or provide a dictionary mapping each original
+    column name in `column_names` to its corresponding new column name.
+    Note that all column names must be strings.
+
+    Examples:
+        log10 transform a list of columns, replacing original columns.
+
+        >>> import numpy as np
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "col1": [5, 10, 15],
+        ...     "col2": [3, 6, 9],
+        ...     "col3": [10, 100, 1_000],
+        ... })
+        >>> df
+           col1  col2  col3
+        0     5     3    10
+        1    10     6   100
+        2    15     9  1000
+        >>> df.transform_columns(["col1", "col2", "col3"], np.log10)
+               col1      col2  col3
+        0  0.698970  0.477121   1.0
+        1  1.000000  0.778151   2.0
+        2  1.176091  0.954243   3.0
+
+        Using the `suffix` parameter to create new columns.
+
+        >>> df.transform_columns(["col1", "col3"], np.log10, suffix="_log")
+           col1  col2  col3  col1_log  col3_log
+        0     5     3    10  0.698970       1.0
+        1    10     6   100  1.000000       2.0
+        2    15     9  1000  1.176091       3.0
+
+        Using the `new_column_names` parameter to create new columns.
+
+        >>> df.transform_columns(
+        ...     ["col1", "col3"],
+        ...     np.log10,
+        ...     new_column_names={"col1": "transform1"},
+        ... )
+           col1  col2  col3  transform1
+        0     5     3   1.0    0.698970
+        1    10     6   2.0    1.000000
+        2    15     9   3.0    1.176091
+
+    Args:
+        df: A pandas DataFrame.
+        column_names: An iterable of columns to transform.
+        function: A function to apply on each column.
+        suffix: Suffix to use when creating new columns to hold
+            the transformed values.
+        elementwise: Passed on to [`transform_column`][janitor.functions.transform_columns.transform_column]; whether or not
+            to apply the transformation function elementwise (True)
+            or columnwise (False).
+        new_column_names: An explicit mapping of old column names in
+            `column_names` to new column names. If any column specified in
+            `column_names` is not a key in this dictionary, the transformation
+            will happen in-place for that column.
+
+    Raises:
+        ValueError: If both `suffix` and `new_column_names` are specified.
+
+    Returns:
+        A pandas DataFrame with transformed columns.
+    """  # noqa: E501
+    check("column_names", column_names, [list, tuple])
+    check_column(df, column_names)
+
+    if suffix is not None and new_column_names is not None:
+        raise ValueError(
+            "Only one of `suffix` or `new_column_names` should be specified."
+        )
+
+    if suffix:
+        check("suffix", suffix, [str])
+        dest_column_names = {col: col + suffix for col in column_names}
+    elif new_column_names:
+        check("new_column_names", new_column_names, [dict])
+        dest_column_names = {
+            col: new_column_names.get(col, col) for col in column_names
+        }
+    else:
+        dest_column_names = dict(zip(column_names, column_names))
+
+    results = {}
+    for old_col, new_col in dest_column_names.items():
+        if old_col != new_col:
+            check_column(df, new_col, present=False)
+        results[new_col] = _get_transform_column_result(
+            df[old_col],
+            function,
+            elementwise=elementwise,
+        )
+
+    return df.assign(**results)
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ truncate_datetime + + +

+ +
+ +

Implementation of the truncate_datetime family of functions.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ truncate_datetime_dataframe(df, datepart) + +

+ + +
+ +

Truncate times down to a user-specified precision of +year, month, day, hour, minute, or second.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> df = pd.DataFrame({
+...     "foo": ["xxxx", "yyyy", "zzzz"],
+...     "dt": pd.date_range("2020-03-11", periods=3, freq="15H"),
+... })
+>>> df
+    foo                  dt
+0  xxxx 2020-03-11 00:00:00
+1  yyyy 2020-03-11 15:00:00
+2  zzzz 2020-03-12 06:00:00
+>>> df.truncate_datetime_dataframe("day")
+    foo         dt
+0  xxxx 2020-03-11
+1  yyyy 2020-03-11
+2  zzzz 2020-03-12
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

The pandas DataFrame on which to truncate datetime.

+
+
+ required +
+ datepart + + str + +
+

Truncation precision, YEAR, MONTH, DAY, +HOUR, MINUTE, SECOND. (String is automagically +capitalized)

+
+
+ required +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If an invalid datepart precision is passed in.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame with all valid datetimes truncated down +to the specified precision.

+
+
+ +
+ Source code in janitor/functions/truncate_datetime.py +
 9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
@pf.register_dataframe_method
+def truncate_datetime_dataframe(
+    df: pd.DataFrame,
+    datepart: str,
+) -> pd.DataFrame:
+    """Truncate times down to a user-specified precision of
+    year, month, day, hour, minute, or second.
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "foo": ["xxxx", "yyyy", "zzzz"],
+        ...     "dt": pd.date_range("2020-03-11", periods=3, freq="15H"),
+        ... })
+        >>> df
+            foo                  dt
+        0  xxxx 2020-03-11 00:00:00
+        1  yyyy 2020-03-11 15:00:00
+        2  zzzz 2020-03-12 06:00:00
+        >>> df.truncate_datetime_dataframe("day")
+            foo         dt
+        0  xxxx 2020-03-11
+        1  yyyy 2020-03-11
+        2  zzzz 2020-03-12
+
+    Args:
+        df: The pandas DataFrame on which to truncate datetime.
+        datepart: Truncation precision, YEAR, MONTH, DAY,
+            HOUR, MINUTE, SECOND. (String is automagically
+            capitalized)
+
+    Raises:
+        ValueError: If an invalid `datepart` precision is passed in.
+
+    Returns:
+        A pandas DataFrame with all valid datetimes truncated down
+            to the specified precision.
+    """
+    # idea from Stack Overflow
+    # https://stackoverflow.com/a/28783971/7175713
+    # https://numpy.org/doc/stable/reference/arrays.datetime.html
+    ACCEPTABLE_DATEPARTS = {
+        "YEAR": "datetime64[Y]",
+        "MONTH": "datetime64[M]",
+        "DAY": "datetime64[D]",
+        "HOUR": "datetime64[h]",
+        "MINUTE": "datetime64[m]",
+        "SECOND": "datetime64[s]",
+    }
+    datepart = datepart.upper()
+    if datepart not in ACCEPTABLE_DATEPARTS:
+        raise ValueError(
+            "Received an invalid `datepart` precision. "
+            f"Please enter any one of {ACCEPTABLE_DATEPARTS}."
+        )
+
+    dictionary = {}
+
+    for label, series in df.items():
+        if is_datetime64_any_dtype(series):
+            dtype = ACCEPTABLE_DATEPARTS[datepart]
+            # TODO: add branch for pyarrow arrays
+            series = np.array(series._values, dtype=dtype)
+        dictionary[label] = series
+
+    return pd.DataFrame(dictionary)
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ update_where + + +

+ +
+ +

Function for updating values based on other column values.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ update_where(df, conditions, target_column_name, target_val) + +

+ + +
+ +

Add multiple conditions to update a column in the dataframe.

+

This method does not mutate the original DataFrame.

+ + +

Examples:

+
>>> import janitor
+>>> data = {
+...    "a": [1, 2, 3, 4],
+...    "b": [5, 6, 7, 8],
+...    "c": [0, 0, 0, 0],
+... }
+>>> df = pd.DataFrame(data)
+>>> df
+   a  b  c
+0  1  5  0
+1  2  6  0
+2  3  7  0
+3  4  8  0
+>>> df.update_where(
+...    conditions = (df.a > 2) & (df.b < 8),
+...    target_column_name = 'c',
+...    target_val = 10
+... )
+   a  b   c
+0  1  5   0
+1  2  6   0
+2  3  7  10
+3  4  8   0
+>>> df.update_where( # supports pandas *query* style string expressions
+...    conditions = "a > 2 and b < 8",
+...    target_column_name = 'c',
+...    target_val = 10
+... )
+   a  b   c
+0  1  5   0
+1  2  6   0
+2  3  7  10
+3  4  8   0
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

The pandas DataFrame object.

+
+
+ required +
+ conditions + + Any + +
+

Conditions used to update a target column +and target value.

+
+
+ required +
+ target_column_name + + Hashable + +
+

Column to be updated. If column does not exist +in DataFrame, a new column will be created; note that entries +that do not get set in the new column will be null.

+
+
+ required +
+ target_val + + Any + +
+

Value to be updated.

+
+
+ required +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If conditions does not return a boolean array-like +data structure.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A pandas DataFrame.

+
+
+ +
+ Source code in janitor/functions/update_where.py +
12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
@pf.register_dataframe_method
+@deprecated_alias(target_col="target_column_name")
+def update_where(
+    df: pd.DataFrame,
+    conditions: Any,
+    target_column_name: Hashable,
+    target_val: Any,
+) -> pd.DataFrame:
+    """Add multiple conditions to update a column in the dataframe.
+
+    This method does not mutate the original DataFrame.
+
+    Examples:
+        >>> import janitor
+        >>> data = {
+        ...    "a": [1, 2, 3, 4],
+        ...    "b": [5, 6, 7, 8],
+        ...    "c": [0, 0, 0, 0],
+        ... }
+        >>> df = pd.DataFrame(data)
+        >>> df
+           a  b  c
+        0  1  5  0
+        1  2  6  0
+        2  3  7  0
+        3  4  8  0
+        >>> df.update_where(
+        ...    conditions = (df.a > 2) & (df.b < 8),
+        ...    target_column_name = 'c',
+        ...    target_val = 10
+        ... )
+           a  b   c
+        0  1  5   0
+        1  2  6   0
+        2  3  7  10
+        3  4  8   0
+        >>> df.update_where( # supports pandas *query* style string expressions
+        ...    conditions = "a > 2 and b < 8",
+        ...    target_column_name = 'c',
+        ...    target_val = 10
+        ... )
+           a  b   c
+        0  1  5   0
+        1  2  6   0
+        2  3  7  10
+        3  4  8   0
+
+    Args:
+        df: The pandas DataFrame object.
+        conditions: Conditions used to update a target column
+            and target value.
+        target_column_name: Column to be updated. If column does not exist
+            in DataFrame, a new column will be created; note that entries
+            that do not get set in the new column will be null.
+        target_val: Value to be updated.
+
+    Raises:
+        ValueError: If `conditions` does not return a boolean array-like
+            data structure.
+
+    Returns:
+        A pandas DataFrame.
+    """
+
+    df = df.copy()
+
+    # use query mode if a string expression is passed
+    if isinstance(conditions, str):
+        conditions = df.eval(conditions)
+
+    if not is_bool_dtype(conditions):
+        raise ValueError(
+            """
+            Kindly ensure that `conditions` passed
+            evaluates to a Boolean dtype.
+            """
+        )
+
+    df.loc[conditions, target_column_name] = target_val
+
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ utils + + +

+ +
+ +

Utility functions for all of the functions submodule.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ patterns(regex_pattern) + +

+ + +
+ +

This function converts a string into a compiled regular expression.

+

It can be used to select columns in the index or column_names +arguments of the pivot_longer function.

+
+

Warning

+

This function is deprecated. Kindly use re.compile instead.

+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ regex_pattern + + Union[str, Pattern] + +
+

String to be converted to compiled regular +expression.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Pattern + +
+

A compiled regular expression from the provided regex_pattern.

+
+
+ +
+ Source code in janitor/functions/utils.py +
140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
def patterns(regex_pattern: Union[str, Pattern]) -> Pattern:
+    """This function converts a string into a compiled regular expression.
+
+    It can be used to select columns in the index or columns_names
+    arguments of `pivot_longer` function.
+
+    !!!warning
+
+        This function is deprecated. Kindly use `re.compile` instead.
+
+    Args:
+        regex_pattern: String to be converted to compiled regular
+            expression.
+
+    Returns:
+        A compiled regular expression from the provided `regex_pattern`.
+    """
+    warnings.warn(
+        "This function is deprecated. Kindly use `re.compile` instead.",
+        DeprecationWarning,
+        stacklevel=find_stack_level(),
+    )
+    check("regular expression", regex_pattern, [str, Pattern])
+
+    return re.compile(regex_pattern)
+
+
+
+ +
+ +
+ + +

+ unionize_dataframe_categories(*dataframes, column_names=None) + +

+ + +
+ +

Given a group of dataframes which contain some categorical columns, for +each categorical column present, find all the possible categories across +all the dataframes which have that column. +Update each dataframe's corresponding column with a new categorical object +that contains the original data +but has labels for all the possible categories from all dataframes. +This is useful when concatenating a list of dataframes which all have the +same categorical columns into one dataframe.

+

If, for a given categorical column, all input dataframes do not have at +least one instance of all the possible categories, +Pandas will change the output dtype of that column from category to +object, losing out on dramatic speed gains you get from the former +format.

+ + +

Examples:

+

Usage example for concatenation of categorical column-containing +dataframes:

+

Instead of:

+
concatenated_df = pd.concat([df1, df2, df3], ignore_index=True)
+
+

which in your case has resulted in category -> object conversion, +use:

+
unionized_dataframes = unionize_dataframe_categories(df1, df2, df3)
+concatenated_df = pd.concat(unionized_dataframes, ignore_index=True)
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ *dataframes + + Any + +
+

The dataframes you wish to unionize the categorical +objects for.

+
+
+ () +
+ column_names + + Optional[Iterable[CategoricalDtype]] + +
+

If supplied, only unionize this subset of columns.

+
+
+ None +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ TypeError + +
+

If any of the inputs are not pandas DataFrames.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ List[DataFrame] + +
+

A list of the category-unioned dataframes in the same order they +were provided.

+
+
+ +
+ Source code in janitor/functions/utils.py +
 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
def unionize_dataframe_categories(
+    *dataframes: Any,
+    column_names: Optional[Iterable[pd.CategoricalDtype]] = None,
+) -> List[pd.DataFrame]:
+    """
+    Given a group of dataframes which contain some categorical columns, for
+    each categorical column present, find all the possible categories across
+    all the dataframes which have that column.
+    Update each dataframe's corresponding column with a new categorical object
+    that contains the original data
+    but has labels for all the possible categories from all dataframes.
+    This is useful when concatenating a list of dataframes which all have the
+    same categorical columns into one dataframe.
+
+    If, for a given categorical column, all input dataframes do not have at
+    least one instance of all the possible categories,
+    Pandas will change the output dtype of that column from `category` to
+    `object`, losing out on dramatic speed gains you get from the former
+    format.
+
+    Examples:
+        Usage example for concatenation of categorical column-containing
+        dataframes:
+
+        Instead of:
+
+        ```python
+        concatenated_df = pd.concat([df1, df2, df3], ignore_index=True)
+        ```
+
+        which in your case has resulted in `category` -> `object` conversion,
+        use:
+
+        ```python
+        unionized_dataframes = unionize_dataframe_categories(df1, df2, df3)
+        concatenated_df = pd.concat(unionized_dataframes, ignore_index=True)
+        ```
+
+    Args:
+        *dataframes: The dataframes you wish to unionize the categorical
+            objects for.
+        column_names: If supplied, only unionize this subset of columns.
+
+    Raises:
+        TypeError: If any of the inputs are not pandas DataFrames.
+
+    Returns:
+        A list of the category-unioned dataframes in the same order they
+            were provided.
+    """
+
+    if any(not isinstance(df, pd.DataFrame) for df in dataframes):
+        raise TypeError("Inputs must all be dataframes.")
+
+    if column_names is None:
+        # Find all columns across all dataframes that are categorical
+
+        column_names = set()
+
+        for dataframe in dataframes:
+            column_names = column_names.union(
+                [
+                    column_name
+                    for column_name in dataframe.columns
+                    if isinstance(
+                        dataframe[column_name].dtype, pd.CategoricalDtype
+                    )
+                ]
+            )
+
+    else:
+        column_names = [column_names]
+    # For each categorical column, find all possible values across the DFs
+
+    category_unions = {
+        column_name: union_categoricals(
+            [df[column_name] for df in dataframes if column_name in df.columns]
+        )
+        for column_name in column_names
+    }
+
+    # Make a shallow copy of all DFs and modify the categorical columns
+    # such that they can encode the union of all possible categories for each.
+
+    refactored_dfs = []
+
+    for df in dataframes:
+        df = df.copy(deep=False)
+
+        for column_name, categorical in category_unions.items():
+            if column_name in df.columns:
+                df[column_name] = pd.Categorical(
+                    df[column_name], categories=categorical.categories
+                )
+
+        refactored_dfs.append(df)
+
+    return refactored_dfs
+
+
+
+ +
+ + + +
+ +
+ +
+ + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/api/io/index.html b/api/io/index.html new file mode 100644 index 000000000..93bcbce81 --- /dev/null +++ b/api/io/index.html @@ -0,0 +1,2919 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Input/Output (io) - pyjanitor documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Input/Output (io)

+ + +
+ + + + +
+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ read_commandline(cmd, engine='pandas', **kwargs) + +

+ + +
+ +

Read a CSV file based on a command-line command.

+

For example, you may wish to run the following command on sep-quarter.csv +before reading it into a pandas DataFrame:

+
cat sep-quarter.csv | grep .SEA1AA
+
+

In this case, you can use the following Python code to load the dataframe:

+
import janitor as jn
+df = jn.read_commandline("cat sep-quarter.csv | grep .SEA1AA")
+
+

This function assumes that your command-line command will return +an output that is parsable using the relevant engine and StringIO. +This function defaults to using pd.read_csv under the hood. +Keyword arguments are passed through as-is.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ cmd + + str + +
+

Shell command to preprocess a file on disk.

+
+
+ required +
+ engine + + str + +
+

DataFrame engine to process the output of the shell command. +Currently supports both pandas and polars.

+
+
+ 'pandas' +
+ **kwargs + + Any + +
+

Keyword arguments that are passed through to +the engine's csv reader.

+
+
+ {} +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Mapping + +
+

A DataFrame parsed from the stdout of the underlying +shell.

+
+
+ +
+ Source code in janitor/io.py +
 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
def read_commandline(
+    cmd: str, engine: str = "pandas", **kwargs: Any
+) -> Mapping:
+    """Read a CSV file based on a command-line command.
+
+    For example, you may wish to run the following command on `sep-quarter.csv`
+    before reading it into a pandas DataFrame:
+
+    ```bash
+    cat sep-quarter.csv | grep .SEA1AA
+    ```
+
+    In this case, you can use the following Python code to load the dataframe:
+
+    ```python
+    import janitor as jn
+    df = jn.read_commandline("cat sep-quarter.csv | grep .SEA1AA")
+    ```
+
+    This function assumes that your command line command will return
+    an output that is parsable using the relevant engine and StringIO.
+    This function defaults to using `pd.read_csv` underneath the hood.
+    Keyword arguments are passed through as-is.
+
+    Args:
+        cmd: Shell command to preprocess a file on disk.
+        engine: DataFrame engine to process the output of the shell command.
+            Currently supports both pandas and polars.
+        **kwargs: Keyword arguments that are passed through to
+            the engine's csv reader.
+
+
+    Returns:
+        A DataFrame parsed from the stdout of the underlying
+            shell.
+    """
+
+    check("cmd", cmd, [str])
+    if engine not in {"pandas", "polars"}:
+        raise ValueError("engine should be either pandas or polars.")
+    # adding check=True ensures that an explicit, clear error
+    # is raised, so that the user can see the reason for the failure
+    outcome = subprocess.run(
+        cmd, shell=True, capture_output=True, text=True, check=True
+    )
+    if engine == "polars":
+        try:
+            import polars as pl
+        except ImportError:
+            import_message(
+                submodule="polars",
+                package="polars",
+                conda_channel="conda-forge",
+                pip_install=True,
+            )
+        return pl.read_csv(StringIO(outcome.stdout), **kwargs)
+    return pd.read_csv(StringIO(outcome.stdout), **kwargs)
+
+
+
+ +
+ +
+ + +

+ read_csvs(files_path, separate_df=False, **kwargs) + +

+ + +
+ +

Read multiple CSV files and return a dictionary of DataFrames, or +one concatenated DataFrame.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ files_path + + Union[str, Iterable[str]] + +
+

The filepath pattern matching the CSV files. +Accepts glob patterns (shell-style wildcards), with or without .csv extension. +Also accepts an iterable of file paths.

+
+
+ required +
+ separate_df + + bool + +
+

If False (default), returns a single DataFrame +with the concatenation of the CSV files. +If True, returns a dictionary of separate DataFrames +for each CSV file.

+
+
+ False +
+ **kwargs + + Any + +
+

Keyword arguments to pass into the +original pandas read_csv.

+
+
+ {} +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + + + + + + + + + +
TypeDescription
+ JanitorError + +
+

If None provided for files_path.

+
+
+ JanitorError + +
+

If length of files_path is 0.

+
+
+ ValueError + +
+

If no CSV files exist in files_path.

+
+
+ ValueError + +
+

If columns in input CSV files do not match.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Union[DataFrame, dict] + +
+

DataFrame of concatenated DataFrames or dictionary of DataFrames.

+
+
+ +
+ Source code in janitor/io.py +
27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
+93
@deprecated_alias(seperate_df="separate_df", filespath="files_path")
+def read_csvs(
+    files_path: Union[str, Iterable[str]],
+    separate_df: bool = False,
+    **kwargs: Any,
+) -> Union[pd.DataFrame, dict]:
+    """Read multiple CSV files and return a dictionary of DataFrames, or
+    one concatenated DataFrame.
+
+    Args:
+        files_path: The filepath pattern matching the CSV files.
+            Accepts glob patterns, with or without `.csv` extension.
+            Also accepts iterable of file paths.
+        separate_df: If `False` (default), returns a single DataFrame
+            with the concatenation of the csv files.
+            If `True`, returns a dictionary of separate DataFrames
+            for each CSV file.
+        **kwargs: Keyword arguments to pass into the
+            original pandas `read_csv`.
+
+    Raises:
+        JanitorError: If `None` provided for `files_path`.
+        JanitorError: If length of `files_path` is `0`.
+        ValueError: If no CSV files exist in `files_path`.
+        ValueError: If columns in input CSV files do not match.
+
+    Returns:
+        DataFrame of concatenated DataFrames or dictionary of DataFrames.
+    """
+    # Sanitize input
+    if files_path is None:
+        raise JanitorError("`None` provided for `files_path`")
+    if not files_path:
+        raise JanitorError("0 length `files_path` provided")
+
+    # Read the csv files
+    # String to file/folder or file pattern provided
+    if isinstance(files_path, str):
+        dfs_dict = {
+            os.path.basename(f): pd.read_csv(f, **kwargs)
+            for f in glob(files_path)
+        }
+    # Iterable of file paths provided
+    else:
+        dfs_dict = {
+            os.path.basename(f): pd.read_csv(f, **kwargs) for f in files_path
+        }
+    # Check if dataframes have been read
+    if not dfs_dict:
+        raise ValueError("No CSV files to read with the given `files_path`")
+    # Concatenate the dataframes if requested (default)
+    col_names = list(dfs_dict.values())[0].columns  # noqa: PD011
+    if not separate_df:
+        # If columns do not match raise an error
+        for df in dfs_dict.values():  # noqa: PD011
+            if not all(df.columns == col_names):
+                raise ValueError(
+                    "Columns in input CSV files do not match."
+                    "Files cannot be concatenated."
+                )
+        return pd.concat(
+            list(dfs_dict.values()),
+            ignore_index=True,
+            sort=False,  # noqa: PD011
+            copy=False,
+        )
+    return dfs_dict
+
+
+
+ +
+ +
+ + +

+ xlsx_cells(path, sheetnames=None, start_point=None, end_point=None, read_only=True, include_blank_cells=True, fill=False, font=False, alignment=False, border=False, protection=False, comment=False, engine='pandas', **kwargs) + +

+ + +
+ +

Imports data from spreadsheet without coercing it into a rectangle.

+

Each cell is represented by a row in a dataframe, and includes the +cell's coordinates, the value, row and column position. +The cell formatting (fill, font, border, etc) can also be accessed; +usually this is returned as a dictionary in the cell, and the specific +cell format attribute can be accessed using pd.Series.str.get +or pl.struct.field if it is a polars DataFrame.

+

Inspiration for this comes from R's tidyxl package.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import polars as pl
+>>> from janitor import xlsx_cells
+>>> pd.set_option("display.max_columns", None)
+>>> pd.set_option("display.expand_frame_repr", False)
+>>> pd.set_option("max_colwidth", None)
+>>> filename = "../pyjanitor/tests/test_data/worked-examples.xlsx"
+
+

Each cell is returned as a row:

+
>>> xlsx_cells(filename, sheetnames="highlights")
+    value internal_value coordinate  row  column data_type  is_date number_format
+0     Age            Age         A1    1       1         s    False       General
+1  Height         Height         B1    1       2         s    False       General
+2       1              1         A2    2       1         n    False       General
+3       2              2         B2    2       2         n    False       General
+4       3              3         A3    3       1         n    False       General
+5       4              4         B3    3       2         n    False       General
+6       5              5         A4    4       1         n    False       General
+7       6              6         B4    4       2         n    False       General
+
+

Access cell formatting such as fill:

+
>>> out=xlsx_cells(filename, sheetnames="highlights", fill=True).select("value", "fill", axis='columns')
+>>> out
+    value                                                                                                                                              fill
+0     Age     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
+1  Height     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
+2       1     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
+3       2     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
+4       3  {'patternType': 'solid', 'fgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}}
+5       4  {'patternType': 'solid', 'fgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}}
+6       5     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
+7       6     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
+
+

Specific cell attributes can be accessed by using Pandas' series.str.get:

+
>>> out.fill.str.get("fgColor").str.get("rgb")
+0    00000000
+1    00000000
+2    00000000
+3    00000000
+4    FFFFFF00
+5    FFFFFF00
+6    00000000
+7    00000000
+Name: fill, dtype: object
+
+

Access cell formatting in a polars DataFrame:

+
>>> out = xlsx_cells(filename, sheetnames="highlights", engine='polars', fill=True).get_column('fill')
+>>> out
+shape: (8,)
+Series: 'fill' [struct[3]]
+[
+   {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
+   {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
+   {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
+   {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
+   {"solid",{"FFFFFF00","rgb",0.0},{"FFFFFF00","rgb",0.0}}
+   {"solid",{"FFFFFF00","rgb",0.0},{"FFFFFF00","rgb",0.0}}
+   {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
+   {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
+]
+
+

Specific cell attributes can be accessed via Polars' struct:

+
>>> out.struct.field('fgColor').struct.field('rgb')
+shape: (8,)
+Series: 'rgb' [str]
+[
+   "00000000"
+   "00000000"
+   "00000000"
+   "00000000"
+   "FFFFFF00"
+   "FFFFFF00"
+   "00000000"
+   "00000000"
+]
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ path + + Union[str, Workbook] + +
+

Path to the Excel File. It can also be an openpyxl Workbook.

+
+
+ required +
+ sheetnames + + Union[str, list, tuple] + +
+

Names of the sheets from which the cells are to be extracted. +If None, all the sheets in the file are extracted; +if it is a string, or list or tuple, only the specified sheets are extracted.

+
+
+ None +
+ start_point + + Union[str, int] + +
+

Start coordinates of the Excel sheet. This is useful +if the user is only interested in a subsection of the sheet. +If start_point is provided, end_point must be provided as well.

+
+
+ None +
+ end_point + + Union[str, int] + +
+

End coordinates of the Excel sheet. This is useful +if the user is only interested in a subsection of the sheet. +If end_point is provided, start_point must be provided as well.

+
+
+ None +
+ read_only + + bool + +
+

Determines if the entire file is loaded in memory, +or streamed. For memory efficiency, read_only should be set to True. +Some cell properties like comment, can only be accessed by +setting read_only to False.

+
+
+ True +
+ include_blank_cells + + bool + +
+

Determines if cells without a value should be included.

+
+
+ True +
+ fill + + bool + +
+

If True, return fill properties of the cell. +It is usually returned as a dictionary.

+
+
+ False +
+ font + + bool + +
+

If True, return font properties of the cell. +It is usually returned as a dictionary.

+
+
+ False +
+ alignment + + bool + +
+

If True, return alignment properties of the cell. +It is usually returned as a dictionary.

+
+
+ False +
+ border + + bool + +
+

If True, return border properties of the cell. +It is usually returned as a dictionary.

+
+
+ False +
+ protection + + bool + +
+

If True, return protection properties of the cell. +It is usually returned as a dictionary.

+
+
+ False +
+ comment + + bool + +
+

If True, return comment properties of the cell. +It is usually returned as a dictionary.

+
+
+ False +
+ engine + + str + +
+

DataFrame engine. Should be either pandas or polars.

+
+
+ 'pandas' +
+ **kwargs + + Any + +
+

Any other attributes of the cell, that can be accessed from openpyxl.

+
+
+ {} +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If kwargs is provided, and one of the keys is a default column.

+
+
+ AttributeError + +
+

If kwargs is provided and any of the keys +is not an openpyxl cell attribute.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Mapping + +
+

A DataFrame, or a dictionary of DataFrames.

+
+
+ +
+ Source code in janitor/io.py +
345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
+498
+499
+500
+501
+502
+503
+504
+505
+506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517
+518
+519
+520
+521
+522
+523
+524
+525
+526
+527
+528
+529
+530
+531
+532
+533
+534
+535
+536
+537
+538
+539
+540
+541
+542
+543
+544
+545
+546
+547
+548
+549
+550
+551
+552
+553
+554
+555
+556
+557
+558
+559
+560
+561
+562
+563
+564
+565
+566
+567
+568
+569
+570
+571
+572
+573
+574
+575
+576
+577
+578
+579
+580
+581
+582
+583
+584
+585
+586
+587
+588
+589
+590
+591
+592
+593
+594
+595
+596
+597
+598
+599
+600
+601
+602
+603
+604
+605
+606
+607
+608
+609
+610
+611
+612
+613
+614
+615
+616
+617
def xlsx_cells(
+    path: Union[str, Workbook],
+    sheetnames: Union[str, list, tuple] = None,
+    start_point: Union[str, int] = None,
+    end_point: Union[str, int] = None,
+    read_only: bool = True,
+    include_blank_cells: bool = True,
+    fill: bool = False,
+    font: bool = False,
+    alignment: bool = False,
+    border: bool = False,
+    protection: bool = False,
+    comment: bool = False,
+    engine: str = "pandas",
+    **kwargs: Any,
+) -> Mapping:
+    """Imports data from spreadsheet without coercing it into a rectangle.
+
+    Each cell is represented by a row in a dataframe, and includes the
+    cell's coordinates, the value, row and column position.
+    The cell formatting (fill, font, border, etc) can also be accessed;
+    usually this is returned as a dictionary in the cell, and the specific
+    cell format attribute can be accessed using `pd.Series.str.get`
+    or `pl.struct.field` if it is a polars DataFrame.
+
+    Inspiration for this comes from R's [tidyxl][link] package.
+    [link]: https://nacnudus.github.io/tidyxl/reference/tidyxl.html
+
+    Examples:
+        >>> import pandas as pd
+        >>> import polars as pl
+        >>> from janitor import xlsx_cells
+        >>> pd.set_option("display.max_columns", None)
+        >>> pd.set_option("display.expand_frame_repr", False)
+        >>> pd.set_option("max_colwidth", None)
+        >>> filename = "../pyjanitor/tests/test_data/worked-examples.xlsx"
+
+        Each cell is returned as a row:
+
+        >>> xlsx_cells(filename, sheetnames="highlights")
+            value internal_value coordinate  row  column data_type  is_date number_format
+        0     Age            Age         A1    1       1         s    False       General
+        1  Height         Height         B1    1       2         s    False       General
+        2       1              1         A2    2       1         n    False       General
+        3       2              2         B2    2       2         n    False       General
+        4       3              3         A3    3       1         n    False       General
+        5       4              4         B3    3       2         n    False       General
+        6       5              5         A4    4       1         n    False       General
+        7       6              6         B4    4       2         n    False       General
+
+        Access cell formatting such as fill:
+
+        >>> out=xlsx_cells(filename, sheetnames="highlights", fill=True).select("value", "fill", axis='columns')
+        >>> out
+            value                                                                                                                                              fill
+        0     Age     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
+        1  Height     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
+        2       1     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
+        3       2     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
+        4       3  {'patternType': 'solid', 'fgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}}
+        5       4  {'patternType': 'solid', 'fgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}}
+        6       5     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
+        7       6     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
+
+        Specific cell attributes can be accessed by using Pandas' `series.str.get`:
+
+        >>> out.fill.str.get("fgColor").str.get("rgb")
+        0    00000000
+        1    00000000
+        2    00000000
+        3    00000000
+        4    FFFFFF00
+        5    FFFFFF00
+        6    00000000
+        7    00000000
+        Name: fill, dtype: object
+
+        Access cell formatting in a polars DataFrame:
+
+        >>> out = xlsx_cells(filename, sheetnames="highlights", engine='polars', fill=True).get_column('fill')
+        >>> out
+        shape: (8,)
+        Series: 'fill' [struct[3]]
+        [
+           {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
+           {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
+           {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
+           {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
+           {"solid",{"FFFFFF00","rgb",0.0},{"FFFFFF00","rgb",0.0}}
+           {"solid",{"FFFFFF00","rgb",0.0},{"FFFFFF00","rgb",0.0}}
+           {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
+           {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
+        ]
+
+        Specific cell attributes can be acessed via Polars' struct:
+
+        >>> out.struct.field('fgColor').struct.field('rgb')
+        shape: (8,)
+        Series: 'rgb' [str]
+        [
+           "00000000"
+           "00000000"
+           "00000000"
+           "00000000"
+           "FFFFFF00"
+           "FFFFFF00"
+           "00000000"
+           "00000000"
+        ]
+
+
+    Args:
+        path: Path to the Excel File. It can also be an openpyxl Workbook.
+        sheetnames: Names of the sheets from which the cells are to be extracted.
+            If `None`, all the sheets in the file are extracted;
+            if it is a string, or list or tuple, only the specified sheets are extracted.
+        start_point: Start coordinates of the Excel sheet. This is useful
+            if the user is only interested in a subsection of the sheet.
+            If `start_point` is provided, `end_point` must be provided as well.
+        end_point: End coordinates of the Excel sheet. This is useful
+            if the user is only interested in a subsection of the sheet.
+            If `end_point` is provided, `start_point` must be provided as well.
+        read_only: Determines if the entire file is loaded in memory,
+            or streamed. For memory efficiency, read_only should be set to `True`.
+            Some cell properties like `comment`, can only be accessed by
+            setting `read_only` to `False`.
+        include_blank_cells: Determines if cells without a value should be included.
+        fill: If `True`, return fill properties of the cell.
+            It is usually returned as a dictionary.
+        font: If `True`, return font properties of the cell.
+            It is usually returned as a dictionary.
+        alignment: If `True`, return alignment properties of the cell.
+            It is usually returned as a dictionary.
+        border: If `True`, return border properties of the cell.
+            It is usually returned as a dictionary.
+        protection: If `True`, return protection properties of the cell.
+            It is usually returned as a dictionary.
+        comment: If `True`, return comment properties of the cell.
+            It is usually returned as a dictionary.
+        engine: DataFrame engine. Should be either pandas or polars.
+        **kwargs: Any other attributes of the cell, that can be accessed from openpyxl.
+
+    Raises:
+        ValueError: If kwargs is provided, and one of the keys is a default column.
+        AttributeError: If kwargs is provided and any of the keys
+            is not a openpyxl cell attribute.
+
+    Returns:
+        A DataFrame, or a dictionary of DataFrames.
+    """  # noqa : E501
+
+    try:
+        from openpyxl import load_workbook
+        from openpyxl.cell.cell import Cell
+        from openpyxl.cell.read_only import ReadOnlyCell
+        from openpyxl.workbook.workbook import Workbook
+    except ImportError:
+        import_message(
+            submodule="io",
+            package="openpyxl",
+            conda_channel="conda-forge",
+            pip_install=True,
+        )
+
+    path_is_workbook = isinstance(path, Workbook)
+    if not path_is_workbook:
+        # for memory efficiency, read_only is set to True
+        # if comments is True, read_only has to be False,
+        # as lazy loading is not enabled for comments
+        if comment and read_only:
+            raise ValueError(
+                "To access comments, kindly set 'read_only' to False."
+            )
+        path = load_workbook(
+            filename=path, read_only=read_only, keep_links=False
+        )
+    if engine not in {"pandas", "polars"}:
+        raise ValueError("engine should be one of pandas or polars.")
+    base_engine = pd
+    if engine == "polars":
+        try:
+            import polars as pl
+
+            base_engine = pl
+        except ImportError:
+            import_message(
+                submodule="polars",
+                package="polars",
+                conda_channel="conda-forge",
+                pip_install=True,
+            )
+    # start_point and end_point applies if the user is interested in
+    # only a subset of the Excel File and knows the coordinates
+    if start_point or end_point:
+        check("start_point", start_point, [str, int])
+        check("end_point", end_point, [str, int])
+
+    defaults = (
+        "value",
+        "internal_value",
+        "coordinate",
+        "row",
+        "column",
+        "data_type",
+        "is_date",
+        "number_format",
+    )
+
+    parameters = {
+        "fill": fill,
+        "font": font,
+        "alignment": alignment,
+        "border": border,
+        "protection": protection,
+        "comment": comment,
+    }
+
+    if kwargs:
+        if path_is_workbook:
+            if path.read_only:
+                _cell = ReadOnlyCell
+            else:
+                _cell = Cell
+        else:
+            if read_only:
+                _cell = ReadOnlyCell
+            else:
+                _cell = Cell
+
+        attrs = {
+            attr
+            for attr, _ in inspect.getmembers(_cell, not (inspect.isroutine))
+            if not attr.startswith("_")
+        }
+
+        for key in kwargs:
+            if key in defaults:
+                raise ValueError(
+                    f"{key} is part of the default attributes "
+                    "returned as a column."
+                )
+            elif key not in attrs:
+                raise AttributeError(
+                    f"{key} is not a recognized attribute of {_cell}."
+                )
+        parameters.update(kwargs)
+
+    if not sheetnames:
+        sheetnames = path.sheetnames
+    elif isinstance(sheetnames, str):
+        sheetnames = [sheetnames]
+    else:
+        check("sheetnames", sheetnames, [str, list, tuple])
+
+    out = {
+        sheetname: _xlsx_cells(
+            path[sheetname],
+            defaults,
+            parameters,
+            start_point,
+            end_point,
+            include_blank_cells,
+            base_engine=base_engine,
+        )
+        for sheetname in sheetnames
+    }
+    if len(out) == 1:
+        _, out = out.popitem()
+
+    if (not path_is_workbook) and path.read_only:
+        path.close()
+
+    return out
+
+
+
+ +
+ +
+ + +

+ xlsx_table(path, sheetname=None, table=None, engine='pandas') + +

+ + +
+ +

Returns a DataFrame of values in a table in the Excel file.

+

This applies to an Excel file, where the data range is explicitly +specified as a Microsoft Excel table.

+

If there is a single table in the sheet, or a string is provided +as an argument to the table parameter, a DataFrame is returned; +if there is more than one table in the sheet, +and the table argument is None, or a list/tuple of names, +a dictionary of DataFrames is returned, where the keys of the dictionary +are the table names.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import polars as pl
+>>> from janitor import xlsx_table
+>>> filename="../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx"
+
+

Single table:

+
>>> xlsx_table(filename, table='dCategory')
+   CategoryID       Category
+0           1       Beginner
+1           2       Advanced
+2           3      Freestyle
+3           4    Competition
+4           5  Long Distance
+
+
>>> xlsx_table(filename, table='dCategory', engine='polars')
+shape: (5, 2)
+┌────────────┬───────────────┐
+│ CategoryID ┆ Category      │
+│ ---        ┆ ---           │
+│ i64        ┆ str           │
+╞════════════╪═══════════════╡
+│ 1          ┆ Beginner      │
+│ 2          ┆ Advanced      │
+│ 3          ┆ Freestyle     │
+│ 4          ┆ Competition   │
+│ 5          ┆ Long Distance │
+└────────────┴───────────────┘
+
+

Multiple tables:

+
>>> out=xlsx_table(filename, table=["dCategory", "dSalesReps"])
+>>> out["dCategory"]
+   CategoryID       Category
+0           1       Beginner
+1           2       Advanced
+2           3      Freestyle
+3           4    Competition
+4           5  Long Distance
+>>> out["dSalesReps"].head(3)
+   SalesRepID             SalesRep Region
+0           1  Sioux Radcoolinator     NW
+1           2        Tyrone Smithe     NE
+2           3         Chantel Zoya     SW
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ path + + Union[str, IO, Workbook] + +
+

Path to the Excel File. It can also be an openpyxl Workbook.

+
+
+ required +
+ table + + Union[str, list, tuple] + +
+

Name of a table, or list of tables in the sheet.

+
+
+ None +
+ engine + + str + +
+

DataFrame engine. Should be either pandas or polars. +Defaults to pandas.

+
+
+ 'pandas' +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + + + + + +
TypeDescription
+ AttributeError + +
+

If a workbook is provided, and is a ReadOnlyWorksheet.

+
+
+ ValueError + +
+

If there are no tables in the sheet.

+
+
+ KeyError + +
+

If the provided table does not exist in the sheet.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Mapping + +
+

A DataFrame, or a dictionary of DataFrames, +if there are multiple arguments for the table parameter, +or the argument to table is None.

+
+
+ +
+ Source code in janitor/io.py +
159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
def xlsx_table(
+    path: Union[str, IO, Workbook],
+    sheetname: str = None,
+    table: Union[str, list, tuple] = None,
+    engine: str = "pandas",
+) -> Mapping:
+    """Returns a DataFrame of values in a table in the Excel file.
+
+    This applies to an Excel file, where the data range is explicitly
+    specified as a Microsoft Excel table.
+
+    If there is a single table in the sheet, or a string is provided
+    as an argument to the `table` parameter, a DataFrame is returned;
+    if there is more than one table in the sheet,
+    and the `table` argument is `None`, or a list/tuple of names,
+    a dictionary of DataFrames is returned, where the keys of the dictionary
+    are the table names.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import polars as pl
+        >>> from janitor import xlsx_table
+        >>> filename="../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx"
+
+        Single table:
+
+        >>> xlsx_table(filename, table='dCategory')
+           CategoryID       Category
+        0           1       Beginner
+        1           2       Advanced
+        2           3      Freestyle
+        3           4    Competition
+        4           5  Long Distance
+
+        >>> xlsx_table(filename, table='dCategory', engine='polars')
+        shape: (5, 2)
+        ┌────────────┬───────────────┐
+        │ CategoryID ┆ Category      │
+        │ ---        ┆ ---           │
+        │ i64        ┆ str           │
+        ╞════════════╪═══════════════╡
+        │ 1          ┆ Beginner      │
+        │ 2          ┆ Advanced      │
+        │ 3          ┆ Freestyle     │
+        │ 4          ┆ Competition   │
+        │ 5          ┆ Long Distance │
+        └────────────┴───────────────┘
+
+        Multiple tables:
+
+        >>> out=xlsx_table(filename, table=["dCategory", "dSalesReps"])
+        >>> out["dCategory"]
+           CategoryID       Category
+        0           1       Beginner
+        1           2       Advanced
+        2           3      Freestyle
+        3           4    Competition
+        4           5  Long Distance
+        >>> out["dSalesReps"].head(3)
+           SalesRepID             SalesRep Region
+        0           1  Sioux Radcoolinator     NW
+        1           2        Tyrone Smithe     NE
+        2           3         Chantel Zoya     SW
+
+    Args:
+        path: Path to the Excel File. It can also be an openpyxl Workbook.
+        table: Name of a table, or list of tables in the sheet.
+        engine: DataFrame engine. Should be either pandas or polars.
+            Defaults to pandas
+
+    Raises:
+        AttributeError: If a workbook is provided, and is a ReadOnlyWorksheet.
+        ValueError: If there are no tables in the sheet.
+        KeyError: If the provided table does not exist in the sheet.
+
+    Returns:
+        A DataFrame, or a dictionary of DataFrames,
+            if there are multiple arguments for the `table` parameter,
+            or the argument to `table` is `None`.
+    """  # noqa : E501
+
+    try:
+        from openpyxl import load_workbook
+        from openpyxl.workbook.workbook import Workbook
+    except ImportError:
+        import_message(
+            submodule="io",
+            package="openpyxl",
+            conda_channel="conda-forge",
+            pip_install=True,
+        )
+    # TODO: remove in version 1.0
+    if sheetname:
+        warnings.warn(
+            "The keyword argument "
+            "'sheetname' of 'xlsx_tables' is deprecated.",
+            DeprecationWarning,
+            stacklevel=find_stack_level(),
+        )
+    if engine not in {"pandas", "polars"}:
+        raise ValueError("engine should be one of pandas or polars.")
+    base_engine = pd
+    if engine == "polars":
+        try:
+            import polars as pl
+
+            base_engine = pl
+        except ImportError:
+            import_message(
+                submodule="polars",
+                package="polars",
+                conda_channel="conda-forge",
+                pip_install=True,
+            )
+
+    if table is not None:
+        check("table", table, [str, list, tuple])
+        if isinstance(table, (list, tuple)):
+            for num, entry in enumerate(table):
+                check(f"entry{num} in the table argument", entry, [str])
+    if isinstance(path, Workbook):
+        ws = path
+    else:
+        ws = load_workbook(
+            filename=path, read_only=False, keep_links=False, data_only=True
+        )
+    if ws.read_only:
+        raise ValueError("xlsx_table does not work in read only mode.")
+
+    def _create_dataframe_or_dictionary_from_table(
+        table_name_and_worksheet: tuple,
+    ):
+        """
+        Create DataFrame/dictionary if table exists in Workbook
+        """
+        dictionary = {}
+        for table_name, worksheet in table_name_and_worksheet:
+            contents = worksheet.tables[table_name]
+            header_exist = contents.headerRowCount
+            coordinates = contents.ref
+            data = worksheet[coordinates]
+            if header_exist:
+                header, *data = data
+                header = [cell.value for cell in header]
+            else:
+                header = [f"C{num}" for num in range(len(data[0]))]
+            data = zip(*data)
+            data = ([entry.value for entry in cell] for cell in data)
+            data = dict(zip(header, data))
+            dictionary[table_name] = base_engine.DataFrame(data)
+        return dictionary
+
+    worksheets = [worksheet for worksheet in ws if worksheet.tables.items()]
+    if not any(worksheets):
+        raise ValueError("There are no tables in the Workbook.")
+    table_is_a_string = False
+    if table:
+        if isinstance(table, str):
+            table_is_a_string = True
+            table = [table]
+        table_names = (
+            entry for worksheet in worksheets for entry in worksheet.tables
+        )
+        missing = set(table).difference(table_names)
+        if missing:
+            raise KeyError(f"Tables {*missing,} do not exist in the Workbook.")
+        tables = [
+            (entry, worksheet)
+            for worksheet in worksheets
+            for entry in worksheet.tables
+            if entry in table
+        ]
+    else:
+        tables = [
+            (entry, worksheet)
+            for worksheet in worksheets
+            for entry in worksheet.tables
+        ]
+    data = _create_dataframe_or_dictionary_from_table(
+        table_name_and_worksheet=tables
+    )
+    if table_is_a_string:
+        return data[table[0]]
+    return data
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/api/math/index.html b/api/math/index.html new file mode 100644 index 000000000..17d0b6e83 --- /dev/null +++ b/api/math/index.html @@ -0,0 +1,2647 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Math - pyjanitor documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Math

+ + +
+ + + + +
+ +

Miscellaneous mathematical operators.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ ecdf(s) + +

+ + +
+ +

Return cumulative distribution of values in a series.

+

Null values must be dropped from the series, +otherwise a ValueError is raised.

+

Also, if the dtype of the series is not numeric, +a TypeError is raised.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> s = pd.Series([0, 4, 0, 1, 2, 1, 1, 3])
+>>> x, y = s.ecdf()
+>>> x
+array([0, 0, 1, 1, 1, 2, 3, 4])
+>>> y
+array([0.125, 0.25 , 0.375, 0.5  , 0.625, 0.75 , 0.875, 1.   ])
+
+

You can then plot the ECDF values, for example:

+
>>> from matplotlib import pyplot as plt
+>>> plt.scatter(x, y)
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ s + + Series + +
+

A pandas series. dtype should be numeric.

+
+
+ required +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ TypeError + +
+

If series is not numeric.

+
+
+ ValueError + +
+

If series contains nulls.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + + + + + +
Name TypeDescription
x + ndarray + +
+

Sorted array of values.

+
+
y + ndarray + +
+

Cumulative fraction of data points with value x or lower.

+
+
+ +
+ Source code in janitor/math.py +
329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
@pf.register_series_method
+def ecdf(s: "Series") -> Tuple["ndarray", "ndarray"]:
+    """Return cumulative distribution of values in a series.
+
+    Null values must be dropped from the series,
+    otherwise a `ValueError` is raised.
+
+    Also, if the `dtype` of the series is not numeric,
+    a `TypeError` is raised.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> s = pd.Series([0, 4, 0, 1, 2, 1, 1, 3])
+        >>> x, y = s.ecdf()
+        >>> x  # doctest: +SKIP
+        array([0, 0, 1, 1, 1, 2, 3, 4])
+        >>> y  # doctest: +SKIP
+        array([0.125, 0.25 , 0.375, 0.5  , 0.625, 0.75 , 0.875, 1.   ])
+
+        You can then plot the ECDF values, for example:
+
+        >>> from matplotlib import pyplot as plt
+        >>> plt.scatter(x, y)  # doctest: +SKIP
+
+    Args:
+        s: A pandas series. `dtype` should be numeric.
+
+    Raises:
+        TypeError: If series is not numeric.
+        ValueError: If series contains nulls.
+
+    Returns:
+        x: Sorted array of values.
+        y: Cumulative fraction of data points with value `x` or lower.
+    """
+    import numpy as np
+    import pandas.api.types as pdtypes
+
+    if not pdtypes.is_numeric_dtype(s):
+        raise TypeError(f"series {s.name} must be numeric!")
+    if not s.isna().sum() == 0:
+        raise ValueError(f"series {s.name} contains nulls. Please drop them.")
+
+    n = len(s)
+    x = np.sort(s)
+    y = np.arange(1, n + 1) / n
+
+    return x, y
+
+
+
+ +
+ +
+ + +

+ exp(s) + +

+ + +
+ +

Take the exponential transform of the series.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> s = pd.Series([0, 1, 3], name="numbers")
+>>> s.exp()
+0     1.000000
+1     2.718282
+2    20.085537
+Name: numbers, dtype: float64
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ s + + Series + +
+

Input Series.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Series + +
+

Transformed Series.

+
+
+ +
+ Source code in janitor/math.py +
61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
@pf.register_series_method
+def exp(s: "Series") -> "Series":
+    """Take the exponential transform of the series.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> s = pd.Series([0, 1, 3], name="numbers")
+        >>> s.exp()
+        0     1.000000
+        1     2.718282
+        2    20.085537
+        Name: numbers, dtype: float64
+
+    Args:
+        s: Input Series.
+
+    Returns:
+        Transformed Series.
+    """
+    import numpy as np
+
+    return np.exp(s)
+
+
+
+ +
+ +
+ + +

+ log(s, error='warn') + +

+ + +
+ +

Take natural logarithm of the Series.

+

Each value in the series should be positive. Use error to control the +behavior if there are nonpositive entries in the series.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> s = pd.Series([0, 1, 3], name="numbers")
+>>> s.log(error="ignore")
+0         NaN
+1    0.000000
+2    1.098612
+Name: numbers, dtype: float64
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ s + + Series + +
+

Input Series.

+
+
+ required +
+ error + + str + +
+

Determines behavior when taking the log of nonpositive +entries. If 'warn' then a RuntimeWarning is thrown. If +'raise', then a RuntimeError is thrown. Otherwise, nothing +is thrown and log of nonpositive values is np.nan.

+
+
+ 'warn' +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ RuntimeError + +
+

Raised when there are nonpositive values in the +Series and error='raise'.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Series + +
+

Transformed Series.

+
+
+ +
+ Source code in janitor/math.py +
13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
@pf.register_series_method
+def log(s: "Series", error: str = "warn") -> "Series":
+    """
+    Take natural logarithm of the Series.
+
+    Each value in the series should be positive. Use `error` to control the
+    behavior if there are nonpositive entries in the series.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> s = pd.Series([0, 1, 3], name="numbers")
+        >>> s.log(error="ignore")
+        0         NaN
+        1    0.000000
+        2    1.098612
+        Name: numbers, dtype: float64
+
+    Args:
+        s: Input Series.
+        error: Determines behavior when taking the log of nonpositive
+            entries. If `'warn'` then a `RuntimeWarning` is thrown. If
+            `'raise'`, then a `RuntimeError` is thrown. Otherwise, nothing
+            is thrown and log of nonpositive values is `np.nan`.
+
+    Raises:
+        RuntimeError: Raised when there are nonpositive values in the
+            Series and `error='raise'`.
+
+    Returns:
+        Transformed Series.
+    """
+    import numpy as np
+
+    s = s.copy()
+    nonpositive = s <= 0
+    if (nonpositive).any():
+        msg = f"Log taken on {nonpositive.sum()} nonpositive value(s)"
+        if error.lower() == "warn":
+            warnings.warn(msg, RuntimeWarning)
+        if error.lower() == "raise":
+            raise RuntimeError(msg)
+        else:
+            pass
+    s[nonpositive] = np.nan
+    return np.log(s)
+
+
+
+ +
+ +
+ + +

+ logit(s, error='warn') + +

+ + +
+ +

Take logit transform of the Series.

+

The logit transform is defined as:

+
logit(p) = log(p/(1-p))
+
+

Each value in the series should be between 0 and 1. Use error to +control the behavior if any series entries are outside of (0, 1).

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> s = pd.Series([0.1, 0.5, 0.9], name="numbers")
+>>> s.logit()
+0   -2.197225
+1    0.000000
+2    2.197225
+Name: numbers, dtype: float64
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ s + + Series + +
+

Input Series.

+
+
+ required +
+ error + + str + +
+

Determines behavior when s is outside of (0, 1). +If 'warn' then a RuntimeWarning is thrown. If 'raise', then a +RuntimeError is thrown. Otherwise, nothing is thrown and np.nan +is returned for the problematic entries; defaults to 'warn'.

+
+
+ 'warn' +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ RuntimeError + +
+

If error is set to 'raise'.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Series + +
+

Transformed Series.

+
+
+ +
+ Source code in janitor/math.py +
153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
@pf.register_series_method
def logit(s: "Series", error: str = "warn") -> "Series":
    """Take logit transform of the Series.

    The logit transform is defined:

    ```python
    logit(p) = log(p/(1-p))
    ```

    Each value in the series should be between 0 and 1. Use `error` to
    control the behavior if any series entries are outside of (0, 1).

    Examples:
        >>> import pandas as pd
        >>> import janitor
        >>> s = pd.Series([0.1, 0.5, 0.9], name="numbers")
        >>> s.logit()
        0   -2.197225
        1    0.000000
        2    2.197225
        Name: numbers, dtype: float64

    Args:
        s: Input Series. It is not modified; the transform is applied
            to a copy.
        error: Determines behavior when `s` is outside of `(0, 1)`.
            If `'warn'` then a `RuntimeWarning` is issued. If `'raise'`,
            then a `RuntimeError` is raised. Otherwise, nothing is reported
            and `np.nan` is returned for the problematic entries; defaults
            to `'warn'`. The comparison is case-insensitive.

    Raises:
        RuntimeError: If `error` is set to `'raise'` and at least one
            value lies outside of `(0, 1)`.

    Returns:
        Transformed Series.
    """
    import numpy as np

    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.special` is loaded on older SciPy releases.
    from scipy import special

    outside_support = (s <= 0) | (s >= 1)
    if outside_support.any():
        msg = f"{outside_support.sum()} value(s) are outside of (0, 1)"
        mode = error.lower()
        if mode == "raise":
            raise RuntimeError(msg)
        if mode == "warn":
            warnings.warn(msg, RuntimeWarning)
        # Any other value of `error` silently falls through.

    # Work on a copy so the caller's Series is left untouched, and blank
    # out the entries for which the logit is undefined.
    s = s.copy()
    s[outside_support] = np.nan
    return special.logit(s)
+
+
+
+ +
+ +
+ + +

+ normal_cdf(s) + +

+ + +
+ +

Transforms the Series via the CDF of the Normal distribution.

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> s = pd.Series([-1, 0, 3], name="numbers")
+>>> s.normal_cdf()
+0    0.158655
+1    0.500000
+2    0.998650
+dtype: float64
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ s + + Series + +
+

Input Series.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Series + +
+

Transformed Series.

+
+
+ +
+ Source code in janitor/math.py +
206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
@pf.register_series_method
def normal_cdf(s: "Series") -> "Series":
    """Transforms the Series via the CDF of the Normal distribution.

    The result keeps the index of `s`; the name of `s` is not carried
    over (see the example output below).

    Examples:
        >>> import pandas as pd
        >>> import janitor
        >>> s = pd.Series([-1, 0, 3], name="numbers")
        >>> s.normal_cdf()
        0    0.158655
        1    0.500000
        2    0.998650
        dtype: float64

    Args:
        s: Input Series.

    Returns:
        Transformed Series.
    """
    import pandas as pd

    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.stats` is loaded on older SciPy releases.
    from scipy import stats

    return pd.Series(stats.norm.cdf(s), index=s.index)
+
+
+
+ +
+ +
+ + +

+ probit(s, error='warn') + +

+ + +
+ +

Transforms the Series via the inverse CDF of the Normal distribution.

+

Each value in the series should be between 0 and 1. Use error to +control the behavior if any series entries are outside of (0, 1).

+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> s = pd.Series([0.1, 0.5, 0.8], name="numbers")
+>>> s.probit()
+0   -1.281552
+1    0.000000
+2    0.841621
+dtype: float64
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ s + + Series + +
+

Input Series.

+
+
+ required +
+ error + + str + +
+

Determines behavior when s is outside of (0, 1). +If 'warn' then a RuntimeWarning is thrown. If 'raise', then +a RuntimeError is thrown. Otherwise, nothing is thrown and +np.nan is returned for the problematic entries.

+
+
+ 'warn' +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ RuntimeError + +
+

When there are problematic values +in the Series and error='raise'.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Series + +
+

Transformed Series

+
+
+ +
+ Source code in janitor/math.py +
232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
@pf.register_series_method
def probit(s: "Series", error: str = "warn") -> "Series":
    """Transforms the Series via the inverse CDF of the Normal distribution.

    Each value in the series should be between 0 and 1. Use `error` to
    control the behavior if any series entries are outside of (0, 1).

    The result keeps the index of `s`; the name of `s` is not carried
    over (see the example output below).

    Examples:
        >>> import pandas as pd
        >>> import janitor
        >>> s = pd.Series([0.1, 0.5, 0.8], name="numbers")
        >>> s.probit()
        0   -1.281552
        1    0.000000
        2    0.841621
        dtype: float64

    Args:
        s: Input Series. It is not modified; the transform is applied
            to a copy.
        error: Determines behavior when `s` is outside of `(0, 1)`.
            If `'warn'` then a `RuntimeWarning` is issued. If `'raise'`,
            then a `RuntimeError` is raised. Otherwise, nothing is reported
            and `np.nan` is returned for the problematic entries.
            The comparison is case-insensitive.

    Raises:
        RuntimeError: When there are problematic values
            in the Series and `error='raise'`.

    Returns:
        Transformed Series
    """
    import numpy as np
    import pandas as pd

    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.stats` is loaded on older SciPy releases.
    from scipy import stats

    outside_support = (s <= 0) | (s >= 1)
    if outside_support.any():
        msg = f"{outside_support.sum()} value(s) are outside of (0, 1)"
        mode = error.lower()
        if mode == "raise":
            raise RuntimeError(msg)
        if mode == "warn":
            warnings.warn(msg, RuntimeWarning)
        # Any other value of `error` silently falls through.

    # Work on a copy so the caller's Series is left untouched, and blank
    # out the entries for which the probit is undefined.
    s = s.copy()
    s[outside_support] = np.nan
    # Silence NumPy floating-point warnings while evaluating the NaN entries.
    with np.errstate(all="ignore"):
        out = pd.Series(stats.norm.ppf(s), index=s.index)
    return out
+
+
+
+ +
+ +
+ + +

+ sigmoid(s) + +

+ + +
+ +

Take the sigmoid transform of the series.

+

The sigmoid function is defined:

+
sigmoid(x) = 1 / (1 + exp(-x))
+
+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> s = pd.Series([-1, 0, 4], name="numbers")
+>>> s.sigmoid()
+0    0.268941
+1    0.500000
+2    0.982014
+Name: numbers, dtype: float64
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ s + + Series + +
+

Input Series.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Series + +
+

Transformed Series.

+
+
+ +
+ Source code in janitor/math.py +
 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
@pf.register_series_method
def sigmoid(s: "Series") -> "Series":
    """Take the sigmoid transform of the series.

    The sigmoid function is defined:

    ```python
    sigmoid(x) = 1 / (1 + exp(-x))
    ```

    Examples:
        >>> import pandas as pd
        >>> import janitor
        >>> s = pd.Series([-1, 0, 4], name="numbers")
        >>> s.sigmoid()
        0    0.268941
        1    0.500000
        2    0.982014
        Name: numbers, dtype: float64

    Args:
        s: Input Series.

    Returns:
        Transformed Series.
    """
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.special` is loaded on older SciPy releases.
    from scipy import special

    # `expit` is SciPy's name for the logistic sigmoid.
    return special.expit(s)
+
+
+
+ +
+ +
+ + +

+ softmax(s) + +

+ + +
+ +

Take the softmax transform of the series.

+

The softmax function transforms each element of a collection by +computing the exponential of each element divided by the sum of the +exponentials of all the elements.

+

That is, if x is a one-dimensional numpy array or pandas Series:

+
softmax(x) = exp(x)/sum(exp(x))
+
+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> s = pd.Series([0, 1, 3], name="numbers")
+>>> s.softmax()
+0    0.042010
+1    0.114195
+2    0.843795
+Name: numbers, dtype: float64
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ s + + Series + +
+

Input Series.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Series + +
+

Transformed Series.

+
+
+ +
+ Source code in janitor/math.py +
117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
@pf.register_series_method
def softmax(s: "Series") -> "Series":
    """Take the softmax transform of the series.

    The softmax function transforms each element of a collection by
    computing the exponential of each element divided by the sum of the
    exponentials of all the elements.

    That is, if x is a one-dimensional numpy array or pandas Series:

    ```python
    softmax(x) = exp(x)/sum(exp(x))
    ```

    Examples:
        >>> import pandas as pd
        >>> import janitor
        >>> s = pd.Series([0, 1, 3], name="numbers")
        >>> s.softmax()
        0    0.042010
        1    0.114195
        2    0.843795
        Name: numbers, dtype: float64

    Args:
        s: Input Series.

    Returns:
        Transformed Series, carrying the index and name of `s`.
    """
    import pandas as pd

    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.special` is loaded on older SciPy releases.
    from scipy import special

    return pd.Series(special.softmax(s), index=s.index, name=s.name)
+
+
+
+ +
+ +
+ + +

+ z_score(s, moments_dict=None, keys=('mean', 'std')) + +

+ + +
+ +

Transforms the Series into z-scores.

+

The z-score is defined:

+
z = (s - s.mean()) / s.std()
+
+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor
+>>> s = pd.Series([0, 1, 3], name="numbers")
+>>> s.z_score()
+0   -0.872872
+1   -0.218218
+2    1.091089
+Name: numbers, dtype: float64
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ s + + Series + +
+

Input Series.

+
+
+ required +
+ moments_dict + + dict + +
+

If not None, then the mean and standard +deviation used to compute the z-score transformation are +saved as entries in moments_dict with keys determined by +the keys argument; defaults to None.

+
+
+ None +
+ keys + + Tuple[str, str] + +
+

Determines the keys saved in moments_dict +if moments are saved; defaults to ('mean', 'std').

+
+
+ ('mean', 'std') +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Series + +
+

Transformed Series.

+
+
+ +
+ Source code in janitor/math.py +
283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
@pf.register_series_method
def z_score(
    s: "Series",
    moments_dict: dict = None,
    keys: Tuple[str, str] = ("mean", "std"),
) -> "Series":
    """Transforms the Series into z-scores.

    The z-score is defined:

    ```python
    z = (s - s.mean()) / s.std()
    ```

    If the standard deviation is exactly zero (a constant Series), the
    division is skipped and the centred values `s - s.mean()` are
    returned instead, so the result is still a Series aligned with `s`.

    Examples:
        >>> import pandas as pd
        >>> import janitor
        >>> s = pd.Series([0, 1, 3], name="numbers")
        >>> s.z_score()
        0   -0.872872
        1   -0.218218
        2    1.091089
        Name: numbers, dtype: float64

    Args:
        s: Input Series.
        moments_dict: If not `None`, then the mean and standard
            deviation used to compute the z-score transformation are
            saved as entries in `moments_dict` with keys determined by
            the `keys` argument; defaults to `None`.
        keys: Determines the keys saved in `moments_dict`
            if moments are saved; defaults to (`'mean'`, `'std'`).

    Returns:
        Transformed Series.
    """
    mean = s.mean()
    std = s.std()

    # Record the moments before any early return so that the caller
    # receives them for a constant Series as well.
    if moments_dict is not None:
        moments_dict[keys[0]] = mean
        moments_dict[keys[1]] = std

    centered = s - mean
    if std == 0:
        # Constant Series: dividing by zero would turn every entry into
        # NaN. Return the centred values (expected to be zero wherever
        # `s` is not missing) so the index and name of `s` are preserved.
        return centered
    return centered / std
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/api/ml/index.html b/api/ml/index.html new file mode 100644 index 000000000..8705705e2 --- /dev/null +++ b/api/ml/index.html @@ -0,0 +1,1065 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Machine Learning - pyjanitor documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Machine Learning

+ + +
+ + + + +
+ +

Machine learning specific functions.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ get_features_targets(df, target_column_names, feature_column_names=None) + +

+ + +
+ +

Get the features and targets as separate DataFrames/Series.

+

This method does not mutate the original DataFrame.

+

The behaviour is as follows:

+
    +
  • target_column_names is mandatory.
  • +
  • If feature_column_names is present, then we will respect the column + names inside there.
  • +
  • If feature_column_names is not passed in, then we will assume that +the rest of the columns are feature columns, and return them.
  • +
+ + +

Examples:

+
>>> import pandas as pd
+>>> import janitor.ml
+>>> df = pd.DataFrame(
+...     {"a": [1, 2, 3], "b": [-2, 0, 4], "c": [1.23, 7.89, 4.56]}
+... )
+>>> X, Y = df.get_features_targets(target_column_names=["a", "c"])
+>>> X
+   b
+0 -2
+1  0
+2  4
+>>> Y
+   a     c
+0  1  1.23
+1  2  7.89
+2  3  4.56
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

The pandas DataFrame object.

+
+
+ required +
+ target_column_names + + Union[str, Union[List, Tuple], Hashable] + +
+

Either a column name or an +iterable (list or tuple) of column names that are the target(s) to +be predicted.

+
+
+ required +
+ feature_column_names + + Optional[Union[str, Iterable[str], Hashable]] + +
+

The column name or +iterable of column names that are the features (a.k.a. predictors) +used to predict the targets.

+
+
+ None +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[DataFrame, DataFrame] + +
+

(X, Y): the feature matrix (X) and the target matrix (Y). +Each is a pandas DataFrame, or a Series when a single column name is selected.

+
+
+ +
+ Source code in janitor/ml.py +
11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
@pf.register_dataframe_method
@deprecated_alias(
    target_columns="target_column_names",
    feature_columns="feature_column_names",
)
def get_features_targets(
    df: pd.DataFrame,
    target_column_names: Union[str, Union[List, Tuple], Hashable],
    feature_column_names: Optional[Union[str, Iterable[str], Hashable]] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Get the features and targets as separate DataFrames/Series.

    This method does not mutate the original DataFrame.

    The behaviour is as follows:

    - `target_column_names` is mandatory.
    - If `feature_column_names` is present, then we will respect the column
        names inside there.
    - If `feature_column_names` is not passed in (or is an empty list or
        tuple), then we will assume that the rest of the columns are
        feature columns, and return them.

    Examples:
        >>> import pandas as pd
        >>> import janitor.ml
        >>> df = pd.DataFrame(
        ...     {"a": [1, 2, 3], "b": [-2, 0, 4], "c": [1.23, 7.89, 4.56]}
        ... )
        >>> X, Y = df.get_features_targets(target_column_names=["a", "c"])
        >>> X
           b
        0 -2
        1  0
        2  4
        >>> Y
           a     c
        0  1  1.23
        1  2  7.89
        2  3  4.56

    Args:
        df: The pandas DataFrame object.
        target_column_names: Either a column name or an
            iterable (list or tuple) of column names that are the target(s) to
            be predicted.
        feature_column_names: The column name or
            iterable of column names that are the features (a.k.a. predictors)
            used to predict the targets.

    Returns:
        `(X, Y)` the feature matrix (`X`) and the target matrix (`Y`).
            Each is the result of `df[...]` on the corresponding column
            selection, i.e. a pandas DataFrame, or a Series when a single
            column name is selected.
    """
    Y = df[target_column_names]

    # Decide whether features were supplied by comparing against None
    # instead of relying on truthiness: a falsy but valid column label
    # (0, "", False) must be honoured, and array-like selections have no
    # unambiguous truth value. An empty list/tuple keeps its historical
    # meaning of "use every non-target column".
    features_given = feature_column_names is not None and not (
        isinstance(feature_column_names, (list, tuple))
        and len(feature_column_names) == 0
    )

    if features_given:
        X = df[feature_column_names]
    else:
        if isinstance(target_column_names, (list, tuple)):
            # Several targets: drop every column listed among them.
            xcols = [c for c in df.columns if c not in target_column_names]
        else:
            # Single target label: drop just that column.
            xcols = [c for c in df.columns if target_column_names != c]

        X = df[xcols]
    return X, Y
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/api/polars/index.html b/api/polars/index.html new file mode 100644 index 000000000..9dc6c6857 --- /dev/null +++ b/api/polars/index.html @@ -0,0 +1,5428 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Polars - pyjanitor documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Polars

+ + +
+ + + + +
+ + + + + + + + +
+ + + + + + + + + + +
+ + + +

+ clean_names + + +

+ +
+ +

clean_names implementation for polars.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ clean_names(df, strip_underscores=None, case_type='lower', remove_special=False, strip_accents=False, truncate_limit=None) + +

+ + +
+ +

Clean the column names in a polars DataFrame.

+

clean_names can also be applied to a LazyFrame.

+ + +

Examples:

+
>>> import polars as pl
+>>> import janitor.polars
+>>> df = pl.DataFrame(
+...     {
+...         "Aloha": range(3),
+...         "Bell Chart": range(3),
+...         "Animals@#$%^": range(3)
+...     }
+... )
+>>> df
+shape: (3, 3)
+┌───────┬────────────┬──────────────┐
+│ Aloha ┆ Bell Chart ┆ Animals@#$%^ │
+│ ---   ┆ ---        ┆ ---          │
+│ i64   ┆ i64        ┆ i64          │
+╞═══════╪════════════╪══════════════╡
+│ 0     ┆ 0          ┆ 0            │
+│ 1     ┆ 1          ┆ 1            │
+│ 2     ┆ 2          ┆ 2            │
+└───────┴────────────┴──────────────┘
+>>> df.clean_names(remove_special=True)
+shape: (3, 3)
+┌───────┬────────────┬─────────┐
+│ aloha ┆ bell_chart ┆ animals │
+│ ---   ┆ ---        ┆ ---     │
+│ i64   ┆ i64        ┆ i64     │
+╞═══════╪════════════╪═════════╡
+│ 0     ┆ 0          ┆ 0       │
+│ 1     ┆ 1          ┆ 1       │
+│ 2     ┆ 2          ┆ 2       │
+└───────┴────────────┴─────────┘
+
+
+

New in version 0.28.0

+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ strip_underscores + + str | bool + +
+

Removes the outer underscores from all +column names. Default None keeps outer underscores. Values can be +either 'left', 'right' or 'both' or the respective shorthand 'l', +'r' and True.

+
+
+ None +
+ case_type + + str + +
+

Whether to make the column names lower or uppercase. +Current case may be preserved with 'preserve', +while snake case conversion (from CamelCase or camelCase only) +can be turned on using "snake". +Default 'lower' makes all characters lowercase.

+
+
+ 'lower' +
+ remove_special + + bool + +
+

Remove special characters from the column names. +Only letters, numbers and underscores are preserved.

+
+
+ False +
+ strip_accents + + bool + +
+

Whether or not to remove accents from +the labels.

+
+
+ False +
+ truncate_limit + + int + +
+

Truncates formatted column names to +the specified length. Default None does not truncate.

+
+
+ None +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame | LazyFrame + +
+

A polars DataFrame/LazyFrame.

+
+
+ +
+ Source code in janitor/polars/clean_names.py +
 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
@register_lazyframe_method
@register_dataframe_method
def clean_names(
    df: pl.DataFrame | pl.LazyFrame,
    strip_underscores: str | bool = None,
    case_type: str = "lower",
    remove_special: bool = False,
    strip_accents: bool = False,
    truncate_limit: int = None,
) -> pl.DataFrame | pl.LazyFrame:
    """
    Tidy up the column names of a polars DataFrame.

    The same method is also registered on LazyFrame.

    Examples:
        >>> import polars as pl
        >>> import janitor.polars
        >>> df = pl.DataFrame(
        ...     {
        ...         "Aloha": range(3),
        ...         "Bell Chart": range(3),
        ...         "Animals@#$%^": range(3)
        ...     }
        ... )
        >>> df
        shape: (3, 3)
        ┌───────┬────────────┬──────────────┐
        │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │
        │ ---   ┆ ---        ┆ ---          │
        │ i64   ┆ i64        ┆ i64          │
        ╞═══════╪════════════╪══════════════╡
        │ 0     ┆ 0          ┆ 0            │
        │ 1     ┆ 1          ┆ 1            │
        │ 2     ┆ 2          ┆ 2            │
        └───────┴────────────┴──────────────┘
        >>> df.clean_names(remove_special=True)
        shape: (3, 3)
        ┌───────┬────────────┬─────────┐
        │ aloha ┆ bell_chart ┆ animals │
        │ ---   ┆ ---        ┆ ---     │
        │ i64   ┆ i64        ┆ i64     │
        ╞═══════╪════════════╪═════════╡
        │ 0     ┆ 0          ┆ 0       │
        │ 1     ┆ 1          ┆ 1       │
        │ 2     ┆ 2          ┆ 2       │
        └───────┴────────────┴─────────┘

    !!! info "New in version 0.28.0"

    Args:
        df: The polars DataFrame or LazyFrame whose columns are renamed.
        strip_underscores: Controls removal of leading/trailing
            underscores from every column name. The default, None, leaves
            them in place. Accepted values are 'left', 'right' or 'both',
            or the shorthands 'l', 'r' and True.
        case_type: Letter-case policy for the column names.
            'lower' (the default) lowercases every character;
            'preserve' keeps the existing case; "snake" converts
            CamelCase or camelCase names to snake case.
        remove_special: If True, strip special characters so that only
            letters, numbers and underscores remain.
        strip_accents: If True, remove accents from the labels.
        truncate_limit: Maximum length of each formatted column name.
            The default, None, applies no truncation.

    Returns:
        A polars DataFrame/LazyFrame.
    """  # noqa: E501

    def _tidy_label(label):
        # Forward one column label, together with the caller's options,
        # to the `_clean_column_names` helper.
        return _clean_column_names(
            obj=label,
            strip_accents=strip_accents,
            strip_underscores=strip_underscores,
            case_type=case_type,
            remove_special=remove_special,
            truncate_limit=truncate_limit,
        )

    # `rename` is handed the callable and uses it to map the column names.
    return df.rename(_tidy_label)
+
+
+
+ +
+ +
+ + +

+ make_clean_names(expression, strip_underscores=None, case_type='lower', remove_special=False, strip_accents=False, enforce_string=False, truncate_limit=None) + +

+ + +
+ +

Clean the labels in a polars Expression.

+ + +

Examples:

+
>>> import polars as pl
+>>> import janitor.polars
+>>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]})
+>>> df
+shape: (1, 1)
+┌─────────────┐
+│ raw         │
+│ ---         │
+│ str         │
+╞═════════════╡
+│ Abçdê fgí j │
+└─────────────┘
+
+

Clean the column values:

+
>>> df.with_columns(pl.col("raw").make_clean_names(strip_accents=True))
+shape: (1, 1)
+┌─────────────┐
+│ raw         │
+│ ---         │
+│ str         │
+╞═════════════╡
+│ abcde_fgi_j │
+└─────────────┘
+
+
+

New in version 0.28.0

+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ strip_underscores + + str | bool + +
+

Removes the outer underscores +from all labels in the expression. +Default None keeps outer underscores. +Values can be either 'left', 'right' +or 'both' or the respective shorthand 'l', +'r' and True.

+
+
+ None +
+ case_type + + str + +
+

Whether to make the labels in the expression lower or uppercase. +Current case may be preserved with 'preserve', +while snake case conversion (from CamelCase or camelCase only) +can be turned on using "snake". +Default 'lower' makes all characters lowercase.

+
+
+ 'lower' +
+ remove_special + + bool + +
+

Remove special characters from the values in the expression. +Only letters, numbers and underscores are preserved.

+
+
+ False +
+ strip_accents + + bool + +
+

Whether or not to remove accents from +the expression.

+
+
+ False +
+ enforce_string + + bool + +
+

Whether or not to cast the expression to a string type.

+
+
+ False +
+ truncate_limit + + int + +
+

Truncates formatted labels in the expression to +the specified length. Default None does not truncate.

+
+
+ None +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Expr + +
+

A polars Expression.

+
+
+ +
+ Source code in janitor/polars/clean_names.py +
117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
@register_expr_method
+def make_clean_names(
+    expression,
+    strip_underscores: str | bool = None,
+    case_type: str = "lower",
+    remove_special: bool = False,
+    strip_accents: bool = False,
+    enforce_string: bool = False,
+    truncate_limit: int = None,
+) -> pl.Expr:
+    """
+    Clean the labels in a polars Expression.
+
+    Examples:
+        >>> import polars as pl
+        >>> import janitor.polars
+        >>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]})
+        >>> df
+        shape: (1, 1)
+        ┌─────────────┐
+        │ raw         │
+        │ ---         │
+        │ str         │
+        ╞═════════════╡
+        │ Abçdê fgí j │
+        └─────────────┘
+
+        Clean the column values:
+        >>> df.with_columns(pl.col("raw").make_clean_names(strip_accents=True))
+        shape: (1, 1)
+        ┌─────────────┐
+        │ raw         │
+        │ ---         │
+        │ str         │
+        ╞═════════════╡
+        │ abcde_fgi_j │
+        └─────────────┘
+
+    !!! info "New in version 0.28.0"
+
+    Args:
+        strip_underscores: Removes the outer underscores
+            from all labels in the expression.
+            Default None keeps outer underscores.
+            Values can be either 'left', 'right'
+            or 'both' or the respective shorthand 'l',
+            'r' and True.
+        case_type: Whether to make the labels in the expression lower or uppercase.
+            Current case may be preserved with 'preserve',
+            while snake case conversion (from CamelCase or camelCase only)
+            can be turned on using "snake".
+            Default 'lower' makes all characters lowercase.
+        remove_special: Remove special characters from the values in the expression.
+            Only letters, numbers and underscores are preserved.
+        strip_accents: Whether or not to remove accents from
+            the expression.
+        enforce_string: Whether or not to cast the expression to a string type.
+        truncate_limit: Truncates formatted labels in the expression to
+            the specified length. Default None does not truncate.
+
+    Returns:
+        A polars Expression.
+    """
+    return _clean_expr_names(
+        obj=expression,
+        strip_accents=strip_accents,
+        strip_underscores=strip_underscores,
+        case_type=case_type,
+        remove_special=remove_special,
+        enforce_string=enforce_string,
+        truncate_limit=truncate_limit,
+    )
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ complete + + +

+ +
+ +

complete implementation for polars.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ complete(df, *columns, fill_value=None, explicit=True, sort=False, by=None) + +

+ + +
+ +

Turns implicit missing values into explicit missing values

+

It is modeled after tidyr's complete function. +In a way, it is the inverse of pl.drop_nulls, +as it exposes implicitly missing rows.

+

If new values need to be introduced, a polars Expression +or a polars Series with the new values can be passed, +as long as the polars Expression/Series +has a name that already exists in the DataFrame.

+

complete can also be applied to a LazyFrame.

+ + +

Examples:

+
>>> import polars as pl
+>>> import janitor.polars
+>>> df = pl.DataFrame(
+...     dict(
+...         group=(1, 2, 1, 2),
+...         item_id=(1, 2, 2, 3),
+...         item_name=("a", "a", "b", "b"),
+...         value1=(1, None, 3, 4),
+...         value2=range(4, 8),
+...     )
+... )
+>>> df
+shape: (4, 5)
+┌───────┬─────────┬───────────┬────────┬────────┐
+│ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
+│ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
+│ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
+╞═══════╪═════════╪═══════════╪════════╪════════╡
+│ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
+│ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
+│ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
+│ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
+└───────┴─────────┴───────────┴────────┴────────┘
+
+

Generate all possible combinations of +group, item_id, and item_name +(whether or not they appear in the data)

+
>>> with pl.Config(tbl_rows=-1):
+...     df.complete("group", "item_id", "item_name", sort=True)
+shape: (12, 5)
+┌───────┬─────────┬───────────┬────────┬────────┐
+│ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
+│ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
+│ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
+╞═══════╪═════════╪═══════════╪════════╪════════╡
+│ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
+│ 1     ┆ 1       ┆ b         ┆ null   ┆ null   │
+│ 1     ┆ 2       ┆ a         ┆ null   ┆ null   │
+│ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
+│ 1     ┆ 3       ┆ a         ┆ null   ┆ null   │
+│ 1     ┆ 3       ┆ b         ┆ null   ┆ null   │
+│ 2     ┆ 1       ┆ a         ┆ null   ┆ null   │
+│ 2     ┆ 1       ┆ b         ┆ null   ┆ null   │
+│ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
+│ 2     ┆ 2       ┆ b         ┆ null   ┆ null   │
+│ 2     ┆ 3       ┆ a         ┆ null   ┆ null   │
+│ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
+└───────┴─────────┴───────────┴────────┴────────┘
+
+

Cross all possible group values with the unique pairs of +(item_id, item_name) that already exist in the data.

+
>>> with pl.Config(tbl_rows=-1):
+...     df.select(
+...         "group", pl.struct("item_id", "item_name"), "value1", "value2"
+...     ).complete("group", "item_id", sort=True).unnest("item_id")
+shape: (8, 5)
+┌───────┬─────────┬───────────┬────────┬────────┐
+│ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
+│ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
+│ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
+╞═══════╪═════════╪═══════════╪════════╪════════╡
+│ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
+│ 1     ┆ 2       ┆ a         ┆ null   ┆ null   │
+│ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
+│ 1     ┆ 3       ┆ b         ┆ null   ┆ null   │
+│ 2     ┆ 1       ┆ a         ┆ null   ┆ null   │
+│ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
+│ 2     ┆ 2       ┆ b         ┆ null   ┆ null   │
+│ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
+└───────┴─────────┴───────────┴────────┴────────┘
+
+

Fill in nulls:

+
>>> with pl.Config(tbl_rows=-1):
+...     df.select(
+...         "group", pl.struct("item_id", "item_name"), "value1", "value2"
+...     ).complete(
+...         "group",
+...         "item_id",
+...         fill_value={"value1": 0, "value2": 99},
+...         explicit=True,
+...         sort=True,
+...     ).unnest("item_id")
+shape: (8, 5)
+┌───────┬─────────┬───────────┬────────┬────────┐
+│ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
+│ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
+│ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
+╞═══════╪═════════╪═══════════╪════════╪════════╡
+│ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
+│ 1     ┆ 2       ┆ a         ┆ 0      ┆ 99     │
+│ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
+│ 1     ┆ 3       ┆ b         ┆ 0      ┆ 99     │
+│ 2     ┆ 1       ┆ a         ┆ 0      ┆ 99     │
+│ 2     ┆ 2       ┆ a         ┆ 0      ┆ 5      │
+│ 2     ┆ 2       ┆ b         ┆ 0      ┆ 99     │
+│ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
+└───────┴─────────┴───────────┴────────┴────────┘
+
+

Limit the fill to only the newly created +missing values with explicit=False:

+
>>> with pl.Config(tbl_rows=-1):
+...     df.select(
+...         "group", pl.struct("item_id", "item_name"), "value1", "value2"
+...     ).complete(
+...         "group",
+...         "item_id",
+...         fill_value={"value1": 0, "value2": 99},
+...         explicit=False,
+...         sort=True,
+...     ).unnest("item_id").sort(pl.all())
+shape: (8, 5)
+┌───────┬─────────┬───────────┬────────┬────────┐
+│ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
+│ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
+│ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
+╞═══════╪═════════╪═══════════╪════════╪════════╡
+│ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
+│ 1     ┆ 2       ┆ a         ┆ 0      ┆ 99     │
+│ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
+│ 1     ┆ 3       ┆ b         ┆ 0      ┆ 99     │
+│ 2     ┆ 1       ┆ a         ┆ 0      ┆ 99     │
+│ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
+│ 2     ┆ 2       ┆ b         ┆ 0      ┆ 99     │
+│ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
+└───────┴─────────┴───────────┴────────┴────────┘
+
+
>>> df = pl.DataFrame(
+...     {
+...         "Year": [1999, 2000, 2004, 1999, 2004],
+...         "Taxon": [
+...             "Saccharina",
+...             "Saccharina",
+...             "Saccharina",
+...             "Agarum",
+...             "Agarum",
+...         ],
+...         "Abundance": [4, 5, 2, 1, 8],
+...     }
+... )
+>>> df
+shape: (5, 3)
+┌──────┬────────────┬───────────┐
+│ Year ┆ Taxon      ┆ Abundance │
+│ ---  ┆ ---        ┆ ---       │
+│ i64  ┆ str        ┆ i64       │
+╞══════╪════════════╪═══════════╡
+│ 1999 ┆ Saccharina ┆ 4         │
+│ 2000 ┆ Saccharina ┆ 5         │
+│ 2004 ┆ Saccharina ┆ 2         │
+│ 1999 ┆ Agarum     ┆ 1         │
+│ 2004 ┆ Agarum     ┆ 8         │
+└──────┴────────────┴───────────┘
+
+

Expose missing years from 1999 to 2004 - +pass a polars expression with the new dates, +and ensure the expression's name already exists +in the DataFrame:

+
>>> expression = pl.int_range(1999,2005).alias('Year')
+>>> with pl.Config(tbl_rows=-1):
+...     df.complete(expression,'Taxon',sort=True)
+shape: (12, 3)
+┌──────┬────────────┬───────────┐
+│ Year ┆ Taxon      ┆ Abundance │
+│ ---  ┆ ---        ┆ ---       │
+│ i64  ┆ str        ┆ i64       │
+╞══════╪════════════╪═══════════╡
+│ 1999 ┆ Agarum     ┆ 1         │
+│ 1999 ┆ Saccharina ┆ 4         │
+│ 2000 ┆ Agarum     ┆ null      │
+│ 2000 ┆ Saccharina ┆ 5         │
+│ 2001 ┆ Agarum     ┆ null      │
+│ 2001 ┆ Saccharina ┆ null      │
+│ 2002 ┆ Agarum     ┆ null      │
+│ 2002 ┆ Saccharina ┆ null      │
+│ 2003 ┆ Agarum     ┆ null      │
+│ 2003 ┆ Saccharina ┆ null      │
+│ 2004 ┆ Agarum     ┆ 8         │
+│ 2004 ┆ Saccharina ┆ 2         │
+└──────┴────────────┴───────────┘
+
+

Expose missing rows per group:

+
>>> df = pl.DataFrame(
+...     {
+...         "state": ["CA", "CA", "HI", "HI", "HI", "NY", "NY"],
+...         "year": [2010, 2013, 2010, 2012, 2016, 2009, 2013],
+...         "value": [1, 3, 1, 2, 3, 2, 5],
+...     }
+... )
+>>> df
+shape: (7, 3)
+┌───────┬──────┬───────┐
+│ state ┆ year ┆ value │
+│ ---   ┆ ---  ┆ ---   │
+│ str   ┆ i64  ┆ i64   │
+╞═══════╪══════╪═══════╡
+│ CA    ┆ 2010 ┆ 1     │
+│ CA    ┆ 2013 ┆ 3     │
+│ HI    ┆ 2010 ┆ 1     │
+│ HI    ┆ 2012 ┆ 2     │
+│ HI    ┆ 2016 ┆ 3     │
+│ NY    ┆ 2009 ┆ 2     │
+│ NY    ┆ 2013 ┆ 5     │
+└───────┴──────┴───────┘
+>>> low = pl.col('year').min()
+>>> high = pl.col('year').max().add(1)
+>>> new_year_values=pl.int_range(low,high).alias('year')
+>>> with pl.Config(tbl_rows=-1):
+...     df.complete(new_year_values,by='state',sort=True)
+shape: (16, 3)
+┌───────┬──────┬───────┐
+│ state ┆ year ┆ value │
+│ ---   ┆ ---  ┆ ---   │
+│ str   ┆ i64  ┆ i64   │
+╞═══════╪══════╪═══════╡
+│ CA    ┆ 2010 ┆ 1     │
+│ CA    ┆ 2011 ┆ null  │
+│ CA    ┆ 2012 ┆ null  │
+│ CA    ┆ 2013 ┆ 3     │
+│ HI    ┆ 2010 ┆ 1     │
+│ HI    ┆ 2011 ┆ null  │
+│ HI    ┆ 2012 ┆ 2     │
+│ HI    ┆ 2013 ┆ null  │
+│ HI    ┆ 2014 ┆ null  │
+│ HI    ┆ 2015 ┆ null  │
+│ HI    ┆ 2016 ┆ 3     │
+│ NY    ┆ 2009 ┆ 2     │
+│ NY    ┆ 2010 ┆ null  │
+│ NY    ┆ 2011 ┆ null  │
+│ NY    ┆ 2012 ┆ null  │
+│ NY    ┆ 2013 ┆ 5     │
+└───────┴──────┴───────┘
+
+
+

New in version 0.28.0

+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ *columns + + ColumnNameOrSelector + +
+

This refers to the columns to be completed. +It can be a string, a column selector, or a polars expression. +A polars expression can be used to introduce new values, +as long as the polars expression has a name that already exists +in the DataFrame.

+
+
+ () +
+ fill_value + + dict | Any | Expr + +
+

Scalar value or polars expression to use instead of nulls +for missing combinations. A dictionary mapping column names +to scalar values is also accepted.

+
+
+ None +
+ explicit + + bool + +
+

Determines whether only implicitly missing values +should be filled (False), or all nulls existing in the DataFrame/LazyFrame +(True). explicit is applicable only +if fill_value is not None.

+
+
+ True +
+ sort + + bool + +
+

Sort the DataFrame based on *columns.

+
+
+ False +
+ by + + ColumnNameOrSelector + +
+

Column(s) to group by. +The explicit missing rows are returned per group.

+
+
+ None +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame | LazyFrame + +
+

A polars DataFrame/LazyFrame.

+
+
+ +
+ Source code in janitor/polars/complete.py +
268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
+498
+499
+500
+501
+502
+503
+504
+505
+506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517
+518
+519
+520
+521
+522
+523
+524
+525
+526
+527
+528
+529
+530
+531
+532
+533
+534
+535
+536
+537
+538
+539
+540
+541
+542
+543
+544
+545
+546
+547
+548
+549
+550
+551
+552
+553
+554
+555
+556
+557
@register_lazyframe_method
+@register_dataframe_method
+def complete(
+    df: pl.DataFrame | pl.LazyFrame,
+    *columns: ColumnNameOrSelector,
+    fill_value: dict | Any | pl.Expr = None,
+    explicit: bool = True,
+    sort: bool = False,
+    by: ColumnNameOrSelector = None,
+) -> pl.DataFrame | pl.LazyFrame:
+    """
+    Turns implicit missing values into explicit missing values
+
+    It is modeled after tidyr's `complete` function.
+    In a way, it is the inverse of `pl.drop_nulls`,
+    as it exposes implicitly missing rows.
+
+    If new values need to be introduced, a polars Expression
+    or a polars Series with the new values can be passed,
+    as long as the polars Expression/Series
+    has a name that already exists in the DataFrame.
+
+    `complete` can also be applied to a LazyFrame.
+
+    Examples:
+        >>> import polars as pl
+        >>> import janitor.polars
+        >>> df = pl.DataFrame(
+        ...     dict(
+        ...         group=(1, 2, 1, 2),
+        ...         item_id=(1, 2, 2, 3),
+        ...         item_name=("a", "a", "b", "b"),
+        ...         value1=(1, None, 3, 4),
+        ...         value2=range(4, 8),
+        ...     )
+        ... )
+        >>> df
+        shape: (4, 5)
+        ┌───────┬─────────┬───────────┬────────┬────────┐
+        │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
+        │ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
+        │ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
+        ╞═══════╪═════════╪═══════════╪════════╪════════╡
+        │ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
+        │ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
+        │ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
+        │ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
+        └───────┴─────────┴───────────┴────────┴────────┘
+
+        Generate all possible combinations of
+        `group`, `item_id`, and `item_name`
+        (whether or not they appear in the data)
+        >>> with pl.Config(tbl_rows=-1):
+        ...     df.complete("group", "item_id", "item_name", sort=True)
+        shape: (12, 5)
+        ┌───────┬─────────┬───────────┬────────┬────────┐
+        │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
+        │ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
+        │ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
+        ╞═══════╪═════════╪═══════════╪════════╪════════╡
+        │ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
+        │ 1     ┆ 1       ┆ b         ┆ null   ┆ null   │
+        │ 1     ┆ 2       ┆ a         ┆ null   ┆ null   │
+        │ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
+        │ 1     ┆ 3       ┆ a         ┆ null   ┆ null   │
+        │ 1     ┆ 3       ┆ b         ┆ null   ┆ null   │
+        │ 2     ┆ 1       ┆ a         ┆ null   ┆ null   │
+        │ 2     ┆ 1       ┆ b         ┆ null   ┆ null   │
+        │ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
+        │ 2     ┆ 2       ┆ b         ┆ null   ┆ null   │
+        │ 2     ┆ 3       ┆ a         ┆ null   ┆ null   │
+        │ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
+        └───────┴─────────┴───────────┴────────┴────────┘
+
+        Cross all possible `group` values with the unique pairs of
+        `(item_id, item_name)` that already exist in the data.
+        >>> with pl.Config(tbl_rows=-1):
+        ...     df.select(
+        ...         "group", pl.struct("item_id", "item_name"), "value1", "value2"
+        ...     ).complete("group", "item_id", sort=True).unnest("item_id")
+        shape: (8, 5)
+        ┌───────┬─────────┬───────────┬────────┬────────┐
+        │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
+        │ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
+        │ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
+        ╞═══════╪═════════╪═══════════╪════════╪════════╡
+        │ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
+        │ 1     ┆ 2       ┆ a         ┆ null   ┆ null   │
+        │ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
+        │ 1     ┆ 3       ┆ b         ┆ null   ┆ null   │
+        │ 2     ┆ 1       ┆ a         ┆ null   ┆ null   │
+        │ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
+        │ 2     ┆ 2       ┆ b         ┆ null   ┆ null   │
+        │ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
+        └───────┴─────────┴───────────┴────────┴────────┘
+
+        Fill in nulls:
+        >>> with pl.Config(tbl_rows=-1):
+        ...     df.select(
+        ...         "group", pl.struct("item_id", "item_name"), "value1", "value2"
+        ...     ).complete(
+        ...         "group",
+        ...         "item_id",
+        ...         fill_value={"value1": 0, "value2": 99},
+        ...         explicit=True,
+        ...         sort=True,
+        ...     ).unnest("item_id")
+        shape: (8, 5)
+        ┌───────┬─────────┬───────────┬────────┬────────┐
+        │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
+        │ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
+        │ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
+        ╞═══════╪═════════╪═══════════╪════════╪════════╡
+        │ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
+        │ 1     ┆ 2       ┆ a         ┆ 0      ┆ 99     │
+        │ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
+        │ 1     ┆ 3       ┆ b         ┆ 0      ┆ 99     │
+        │ 2     ┆ 1       ┆ a         ┆ 0      ┆ 99     │
+        │ 2     ┆ 2       ┆ a         ┆ 0      ┆ 5      │
+        │ 2     ┆ 2       ┆ b         ┆ 0      ┆ 99     │
+        │ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
+        └───────┴─────────┴───────────┴────────┴────────┘
+
+        Limit the fill to only the newly created
+        missing values with `explicit=False`:
+        >>> with pl.Config(tbl_rows=-1):
+        ...     df.select(
+        ...         "group", pl.struct("item_id", "item_name"), "value1", "value2"
+        ...     ).complete(
+        ...         "group",
+        ...         "item_id",
+        ...         fill_value={"value1": 0, "value2": 99},
+        ...         explicit=False,
+        ...         sort=True,
+        ...     ).unnest("item_id").sort(pl.all())
+        shape: (8, 5)
+        ┌───────┬─────────┬───────────┬────────┬────────┐
+        │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
+        │ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
+        │ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
+        ╞═══════╪═════════╪═══════════╪════════╪════════╡
+        │ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
+        │ 1     ┆ 2       ┆ a         ┆ 0      ┆ 99     │
+        │ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
+        │ 1     ┆ 3       ┆ b         ┆ 0      ┆ 99     │
+        │ 2     ┆ 1       ┆ a         ┆ 0      ┆ 99     │
+        │ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
+        │ 2     ┆ 2       ┆ b         ┆ 0      ┆ 99     │
+        │ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
+        └───────┴─────────┴───────────┴────────┴────────┘
+
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "Year": [1999, 2000, 2004, 1999, 2004],
+        ...         "Taxon": [
+        ...             "Saccharina",
+        ...             "Saccharina",
+        ...             "Saccharina",
+        ...             "Agarum",
+        ...             "Agarum",
+        ...         ],
+        ...         "Abundance": [4, 5, 2, 1, 8],
+        ...     }
+        ... )
+        >>> df
+        shape: (5, 3)
+        ┌──────┬────────────┬───────────┐
+        │ Year ┆ Taxon      ┆ Abundance │
+        │ ---  ┆ ---        ┆ ---       │
+        │ i64  ┆ str        ┆ i64       │
+        ╞══════╪════════════╪═══════════╡
+        │ 1999 ┆ Saccharina ┆ 4         │
+        │ 2000 ┆ Saccharina ┆ 5         │
+        │ 2004 ┆ Saccharina ┆ 2         │
+        │ 1999 ┆ Agarum     ┆ 1         │
+        │ 2004 ┆ Agarum     ┆ 8         │
+        └──────┴────────────┴───────────┘
+
+        Expose missing years from 1999 to 2004 -
+        pass a polars expression with the new dates,
+        and ensure the expression's name already exists
+        in the DataFrame:
+        >>> expression = pl.int_range(1999,2005).alias('Year')
+        >>> with pl.Config(tbl_rows=-1):
+        ...     df.complete(expression,'Taxon',sort=True)
+        shape: (12, 3)
+        ┌──────┬────────────┬───────────┐
+        │ Year ┆ Taxon      ┆ Abundance │
+        │ ---  ┆ ---        ┆ ---       │
+        │ i64  ┆ str        ┆ i64       │
+        ╞══════╪════════════╪═══════════╡
+        │ 1999 ┆ Agarum     ┆ 1         │
+        │ 1999 ┆ Saccharina ┆ 4         │
+        │ 2000 ┆ Agarum     ┆ null      │
+        │ 2000 ┆ Saccharina ┆ 5         │
+        │ 2001 ┆ Agarum     ┆ null      │
+        │ 2001 ┆ Saccharina ┆ null      │
+        │ 2002 ┆ Agarum     ┆ null      │
+        │ 2002 ┆ Saccharina ┆ null      │
+        │ 2003 ┆ Agarum     ┆ null      │
+        │ 2003 ┆ Saccharina ┆ null      │
+        │ 2004 ┆ Agarum     ┆ 8         │
+        │ 2004 ┆ Saccharina ┆ 2         │
+        └──────┴────────────┴───────────┘
+
+        Expose missing rows per group:
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "state": ["CA", "CA", "HI", "HI", "HI", "NY", "NY"],
+        ...         "year": [2010, 2013, 2010, 2012, 2016, 2009, 2013],
+        ...         "value": [1, 3, 1, 2, 3, 2, 5],
+        ...     }
+        ... )
+        >>> df
+        shape: (7, 3)
+        ┌───────┬──────┬───────┐
+        │ state ┆ year ┆ value │
+        │ ---   ┆ ---  ┆ ---   │
+        │ str   ┆ i64  ┆ i64   │
+        ╞═══════╪══════╪═══════╡
+        │ CA    ┆ 2010 ┆ 1     │
+        │ CA    ┆ 2013 ┆ 3     │
+        │ HI    ┆ 2010 ┆ 1     │
+        │ HI    ┆ 2012 ┆ 2     │
+        │ HI    ┆ 2016 ┆ 3     │
+        │ NY    ┆ 2009 ┆ 2     │
+        │ NY    ┆ 2013 ┆ 5     │
+        └───────┴──────┴───────┘
+        >>> low = pl.col('year').min()
+        >>> high = pl.col('year').max().add(1)
+        >>> new_year_values=pl.int_range(low,high).alias('year')
+        >>> with pl.Config(tbl_rows=-1):
+        ...     df.complete(new_year_values,by='state',sort=True)
+        shape: (16, 3)
+        ┌───────┬──────┬───────┐
+        │ state ┆ year ┆ value │
+        │ ---   ┆ ---  ┆ ---   │
+        │ str   ┆ i64  ┆ i64   │
+        ╞═══════╪══════╪═══════╡
+        │ CA    ┆ 2010 ┆ 1     │
+        │ CA    ┆ 2011 ┆ null  │
+        │ CA    ┆ 2012 ┆ null  │
+        │ CA    ┆ 2013 ┆ 3     │
+        │ HI    ┆ 2010 ┆ 1     │
+        │ HI    ┆ 2011 ┆ null  │
+        │ HI    ┆ 2012 ┆ 2     │
+        │ HI    ┆ 2013 ┆ null  │
+        │ HI    ┆ 2014 ┆ null  │
+        │ HI    ┆ 2015 ┆ null  │
+        │ HI    ┆ 2016 ┆ 3     │
+        │ NY    ┆ 2009 ┆ 2     │
+        │ NY    ┆ 2010 ┆ null  │
+        │ NY    ┆ 2011 ┆ null  │
+        │ NY    ┆ 2012 ┆ null  │
+        │ NY    ┆ 2013 ┆ 5     │
+        └───────┴──────┴───────┘
+
+
+    !!! info "New in version 0.28.0"
+
+    Args:
+        *columns: This refers to the columns to be completed.
+            It can be a string or a column selector or a polars expression.
+            A polars expression can be used to introduce new values,
+            as long as the polars expression has a name that already exists
+            in the DataFrame.
+        fill_value: Scalar value or polars expression to use instead of nulls
+            for missing combinations. A dictionary, mapping columns names
+            to a scalar value is also accepted.
+        explicit: Determines if only implicitly missing values
+            should be filled (`False`), or all nulls existing in the LazyFrame
+            (`True`). `explicit` is applicable only
+            if `fill_value` is not `None`.
+        sort: Sort the DataFrame based on *columns.
+        by: Column(s) to group by.
+            The explicit missing rows are returned per group.
+
+    Returns:
+        A polars DataFrame/LazyFrame.
+    """  # noqa: E501
+    if not columns:
+        return df
+    return _complete(
+        df=df,
+        columns=columns,
+        fill_value=fill_value,
+        explicit=explicit,
+        sort=sort,
+        by=by,
+    )
+
+
+
+ +
+ +
+ + +

+ expand(df, *columns, sort=False, by=None) + +

+ + +
+ +

Creates a DataFrame from a cartesian combination of all inputs.

+

Inspiration is from tidyr's expand() function.

+

expand() is often useful with +pl.DataFrame.join +to convert implicit +missing values to explicit missing values - similar to +complete.

+

It can also be used to figure out which combinations are missing +(e.g., to identify gaps in your DataFrame).

+

The variable columns parameter can be a string, +a ColumnSelector, a polars expression, or a polars Series.

+

expand can also be applied to a LazyFrame.

+ + +

Examples:

+
>>> import polars as pl
+>>> import janitor.polars
+>>> data = [{'type': 'apple', 'year': 2010, 'size': 'XS'},
+...         {'type': 'orange', 'year': 2010, 'size': 'S'},
+...         {'type': 'apple', 'year': 2012, 'size': 'M'},
+...         {'type': 'orange', 'year': 2010, 'size': 'S'},
+...         {'type': 'orange', 'year': 2011, 'size': 'S'},
+...         {'type': 'orange', 'year': 2012, 'size': 'M'}]
+>>> df = pl.DataFrame(data)
+>>> df
+shape: (6, 3)
+┌────────┬──────┬──────┐
+│ type   ┆ year ┆ size │
+│ ---    ┆ ---  ┆ ---  │
+│ str    ┆ i64  ┆ str  │
+╞════════╪══════╪══════╡
+│ apple  ┆ 2010 ┆ XS   │
+│ orange ┆ 2010 ┆ S    │
+│ apple  ┆ 2012 ┆ M    │
+│ orange ┆ 2010 ┆ S    │
+│ orange ┆ 2011 ┆ S    │
+│ orange ┆ 2012 ┆ M    │
+└────────┴──────┴──────┘
+
+

Get unique observations:

+
>>> df.expand('type',sort=True)
+shape: (2, 1)
+┌────────┐
+│ type   │
+│ ---    │
+│ str    │
+╞════════╡
+│ apple  │
+│ orange │
+└────────┘
+>>> df.expand('size',sort=True)
+shape: (3, 1)
+┌──────┐
+│ size │
+│ ---  │
+│ str  │
+╞══════╡
+│ M    │
+│ S    │
+│ XS   │
+└──────┘
+>>> df.expand('type', 'size',sort=True)
+shape: (6, 2)
+┌────────┬──────┐
+│ type   ┆ size │
+│ ---    ┆ ---  │
+│ str    ┆ str  │
+╞════════╪══════╡
+│ apple  ┆ M    │
+│ apple  ┆ S    │
+│ apple  ┆ XS   │
+│ orange ┆ M    │
+│ orange ┆ S    │
+│ orange ┆ XS   │
+└────────┴──────┘
+>>> with pl.Config(tbl_rows=-1):
+...     df.expand('type','size','year',sort=True)
+shape: (18, 3)
+┌────────┬──────┬──────┐
+│ type   ┆ size ┆ year │
+│ ---    ┆ ---  ┆ ---  │
+│ str    ┆ str  ┆ i64  │
+╞════════╪══════╪══════╡
+│ apple  ┆ M    ┆ 2010 │
+│ apple  ┆ M    ┆ 2011 │
+│ apple  ┆ M    ┆ 2012 │
+│ apple  ┆ S    ┆ 2010 │
+│ apple  ┆ S    ┆ 2011 │
+│ apple  ┆ S    ┆ 2012 │
+│ apple  ┆ XS   ┆ 2010 │
+│ apple  ┆ XS   ┆ 2011 │
+│ apple  ┆ XS   ┆ 2012 │
+│ orange ┆ M    ┆ 2010 │
+│ orange ┆ M    ┆ 2011 │
+│ orange ┆ M    ┆ 2012 │
+│ orange ┆ S    ┆ 2010 │
+│ orange ┆ S    ┆ 2011 │
+│ orange ┆ S    ┆ 2012 │
+│ orange ┆ XS   ┆ 2010 │
+│ orange ┆ XS   ┆ 2011 │
+│ orange ┆ XS   ┆ 2012 │
+└────────┴──────┴──────┘
+
+

Get observations that only occur in the data:

+
>>> df.expand(pl.struct('type','size'),sort=True).unnest('type')
+shape: (4, 2)
+┌────────┬──────┐
+│ type   ┆ size │
+│ ---    ┆ ---  │
+│ str    ┆ str  │
+╞════════╪══════╡
+│ apple  ┆ M    │
+│ apple  ┆ XS   │
+│ orange ┆ M    │
+│ orange ┆ S    │
+└────────┴──────┘
+>>> df.expand(pl.struct('type','size','year'),sort=True).unnest('type')
+shape: (5, 3)
+┌────────┬──────┬──────┐
+│ type   ┆ size ┆ year │
+│ ---    ┆ ---  ┆ ---  │
+│ str    ┆ str  ┆ i64  │
+╞════════╪══════╪══════╡
+│ apple  ┆ M    ┆ 2012 │
+│ apple  ┆ XS   ┆ 2010 │
+│ orange ┆ M    ┆ 2012 │
+│ orange ┆ S    ┆ 2010 │
+│ orange ┆ S    ┆ 2011 │
+└────────┴──────┴──────┘
+
+

Expand the DataFrame to include new observations:

+
>>> with pl.Config(tbl_rows=-1):
+...     df.expand('type','size',pl.int_range(2010,2014).alias('new_year'),sort=True)
+shape: (24, 3)
+┌────────┬──────┬──────────┐
+│ type   ┆ size ┆ new_year │
+│ ---    ┆ ---  ┆ ---      │
+│ str    ┆ str  ┆ i64      │
+╞════════╪══════╪══════════╡
+│ apple  ┆ M    ┆ 2010     │
+│ apple  ┆ M    ┆ 2011     │
+│ apple  ┆ M    ┆ 2012     │
+│ apple  ┆ M    ┆ 2013     │
+│ apple  ┆ S    ┆ 2010     │
+│ apple  ┆ S    ┆ 2011     │
+│ apple  ┆ S    ┆ 2012     │
+│ apple  ┆ S    ┆ 2013     │
+│ apple  ┆ XS   ┆ 2010     │
+│ apple  ┆ XS   ┆ 2011     │
+│ apple  ┆ XS   ┆ 2012     │
+│ apple  ┆ XS   ┆ 2013     │
+│ orange ┆ M    ┆ 2010     │
+│ orange ┆ M    ┆ 2011     │
+│ orange ┆ M    ┆ 2012     │
+│ orange ┆ M    ┆ 2013     │
+│ orange ┆ S    ┆ 2010     │
+│ orange ┆ S    ┆ 2011     │
+│ orange ┆ S    ┆ 2012     │
+│ orange ┆ S    ┆ 2013     │
+│ orange ┆ XS   ┆ 2010     │
+│ orange ┆ XS   ┆ 2011     │
+│ orange ┆ XS   ┆ 2012     │
+│ orange ┆ XS   ┆ 2013     │
+└────────┴──────┴──────────┘
+
+

Filter for missing observations:

+
>>> columns = ('type','size','year')
+>>> with pl.Config(tbl_rows=-1):
+...     df.expand(*columns).join(df, how='anti', on=columns).sort(by=pl.all())
+shape: (13, 3)
+┌────────┬──────┬──────┐
+│ type   ┆ size ┆ year │
+│ ---    ┆ ---  ┆ ---  │
+│ str    ┆ str  ┆ i64  │
+╞════════╪══════╪══════╡
+│ apple  ┆ M    ┆ 2010 │
+│ apple  ┆ M    ┆ 2011 │
+│ apple  ┆ S    ┆ 2010 │
+│ apple  ┆ S    ┆ 2011 │
+│ apple  ┆ S    ┆ 2012 │
+│ apple  ┆ XS   ┆ 2011 │
+│ apple  ┆ XS   ┆ 2012 │
+│ orange ┆ M    ┆ 2010 │
+│ orange ┆ M    ┆ 2011 │
+│ orange ┆ S    ┆ 2012 │
+│ orange ┆ XS   ┆ 2010 │
+│ orange ┆ XS   ┆ 2011 │
+│ orange ┆ XS   ┆ 2012 │
+└────────┴──────┴──────┘
+
+

Expand within each group, using by:

+
>>> with pl.Config(tbl_rows=-1):
+...     df.expand('year','size',by='type',sort=True)
+shape: (10, 3)
+┌────────┬──────┬──────┐
+│ type   ┆ year ┆ size │
+│ ---    ┆ ---  ┆ ---  │
+│ str    ┆ i64  ┆ str  │
+╞════════╪══════╪══════╡
+│ apple  ┆ 2010 ┆ M    │
+│ apple  ┆ 2010 ┆ XS   │
+│ apple  ┆ 2012 ┆ M    │
+│ apple  ┆ 2012 ┆ XS   │
+│ orange ┆ 2010 ┆ M    │
+│ orange ┆ 2010 ┆ S    │
+│ orange ┆ 2011 ┆ M    │
+│ orange ┆ 2011 ┆ S    │
+│ orange ┆ 2012 ┆ M    │
+│ orange ┆ 2012 ┆ S    │
+└────────┴──────┴──────┘
+
+
+

New in version 0.28.0

+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ *columns + + ColumnNameOrSelector + +
+

This refers to the columns to be completed. +It can be a string or a column selector or a polars expression. +A polars expression can be used to introduce new values, +as long as the polars expression has a name that already exists +in the DataFrame.

+
+
+ () +
+ sort + + bool + +
+

Sort the DataFrame based on *columns.

+
+
+ False +
+ by + + ColumnNameOrSelector + +
+

Column(s) to group by.

+
+
+ None +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame | LazyFrame + +
+

A polars DataFrame/LazyFrame.

+
+
+ +
+ Source code in janitor/polars/complete.py +
 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
@register_lazyframe_method
+@register_dataframe_method
+def expand(
+    df: pl.DataFrame | pl.LazyFrame,
+    *columns: ColumnNameOrSelector,
+    sort: bool = False,
+    by: ColumnNameOrSelector = None,
+) -> pl.DataFrame | pl.LazyFrame:
+    """
+    Creates a DataFrame from a cartesian combination of all inputs.
+
+    Inspiration is from tidyr's expand() function.
+
+    expand() is often useful with
+    [pl.DataFrame.join](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join.html)
+    to convert implicit
+    missing values to explicit missing values - similar to
+    [`complete`][janitor.polars.complete.complete].
+
+    It can also be used to figure out which combinations are missing
+    (e.g. identify gaps in your DataFrame).
+
+    The variable `columns` parameter can be a string,
+    a ColumnSelector, a polars expression, or a polars Series.
+
+    `expand` can also be applied to a LazyFrame.
+
+    Examples:
+        >>> import polars as pl
+        >>> import janitor.polars
+        >>> data = [{'type': 'apple', 'year': 2010, 'size': 'XS'},
+        ...         {'type': 'orange', 'year': 2010, 'size': 'S'},
+        ...         {'type': 'apple', 'year': 2012, 'size': 'M'},
+        ...         {'type': 'orange', 'year': 2010, 'size': 'S'},
+        ...         {'type': 'orange', 'year': 2011, 'size': 'S'},
+        ...         {'type': 'orange', 'year': 2012, 'size': 'M'}]
+        >>> df = pl.DataFrame(data)
+        >>> df
+        shape: (6, 3)
+        ┌────────┬──────┬──────┐
+        │ type   ┆ year ┆ size │
+        │ ---    ┆ ---  ┆ ---  │
+        │ str    ┆ i64  ┆ str  │
+        ╞════════╪══════╪══════╡
+        │ apple  ┆ 2010 ┆ XS   │
+        │ orange ┆ 2010 ┆ S    │
+        │ apple  ┆ 2012 ┆ M    │
+        │ orange ┆ 2010 ┆ S    │
+        │ orange ┆ 2011 ┆ S    │
+        │ orange ┆ 2012 ┆ M    │
+        └────────┴──────┴──────┘
+
+        Get unique observations:
+        >>> df.expand('type',sort=True)
+        shape: (2, 1)
+        ┌────────┐
+        │ type   │
+        │ ---    │
+        │ str    │
+        ╞════════╡
+        │ apple  │
+        │ orange │
+        └────────┘
+        >>> df.expand('size',sort=True)
+        shape: (3, 1)
+        ┌──────┐
+        │ size │
+        │ ---  │
+        │ str  │
+        ╞══════╡
+        │ M    │
+        │ S    │
+        │ XS   │
+        └──────┘
+        >>> df.expand('type', 'size',sort=True)
+        shape: (6, 2)
+        ┌────────┬──────┐
+        │ type   ┆ size │
+        │ ---    ┆ ---  │
+        │ str    ┆ str  │
+        ╞════════╪══════╡
+        │ apple  ┆ M    │
+        │ apple  ┆ S    │
+        │ apple  ┆ XS   │
+        │ orange ┆ M    │
+        │ orange ┆ S    │
+        │ orange ┆ XS   │
+        └────────┴──────┘
+        >>> with pl.Config(tbl_rows=-1):
+        ...     df.expand('type','size','year',sort=True)
+        shape: (18, 3)
+        ┌────────┬──────┬──────┐
+        │ type   ┆ size ┆ year │
+        │ ---    ┆ ---  ┆ ---  │
+        │ str    ┆ str  ┆ i64  │
+        ╞════════╪══════╪══════╡
+        │ apple  ┆ M    ┆ 2010 │
+        │ apple  ┆ M    ┆ 2011 │
+        │ apple  ┆ M    ┆ 2012 │
+        │ apple  ┆ S    ┆ 2010 │
+        │ apple  ┆ S    ┆ 2011 │
+        │ apple  ┆ S    ┆ 2012 │
+        │ apple  ┆ XS   ┆ 2010 │
+        │ apple  ┆ XS   ┆ 2011 │
+        │ apple  ┆ XS   ┆ 2012 │
+        │ orange ┆ M    ┆ 2010 │
+        │ orange ┆ M    ┆ 2011 │
+        │ orange ┆ M    ┆ 2012 │
+        │ orange ┆ S    ┆ 2010 │
+        │ orange ┆ S    ┆ 2011 │
+        │ orange ┆ S    ┆ 2012 │
+        │ orange ┆ XS   ┆ 2010 │
+        │ orange ┆ XS   ┆ 2011 │
+        │ orange ┆ XS   ┆ 2012 │
+        └────────┴──────┴──────┘
+
+        Get observations that only occur in the data:
+        >>> df.expand(pl.struct('type','size'),sort=True).unnest('type')
+        shape: (4, 2)
+        ┌────────┬──────┐
+        │ type   ┆ size │
+        │ ---    ┆ ---  │
+        │ str    ┆ str  │
+        ╞════════╪══════╡
+        │ apple  ┆ M    │
+        │ apple  ┆ XS   │
+        │ orange ┆ M    │
+        │ orange ┆ S    │
+        └────────┴──────┘
+        >>> df.expand(pl.struct('type','size','year'),sort=True).unnest('type')
+        shape: (5, 3)
+        ┌────────┬──────┬──────┐
+        │ type   ┆ size ┆ year │
+        │ ---    ┆ ---  ┆ ---  │
+        │ str    ┆ str  ┆ i64  │
+        ╞════════╪══════╪══════╡
+        │ apple  ┆ M    ┆ 2012 │
+        │ apple  ┆ XS   ┆ 2010 │
+        │ orange ┆ M    ┆ 2012 │
+        │ orange ┆ S    ┆ 2010 │
+        │ orange ┆ S    ┆ 2011 │
+        └────────┴──────┴──────┘
+
+        Expand the DataFrame to include new observations:
+        >>> with pl.Config(tbl_rows=-1):
+        ...     df.expand('type','size',pl.int_range(2010,2014).alias('new_year'),sort=True)
+        shape: (24, 3)
+        ┌────────┬──────┬──────────┐
+        │ type   ┆ size ┆ new_year │
+        │ ---    ┆ ---  ┆ ---      │
+        │ str    ┆ str  ┆ i64      │
+        ╞════════╪══════╪══════════╡
+        │ apple  ┆ M    ┆ 2010     │
+        │ apple  ┆ M    ┆ 2011     │
+        │ apple  ┆ M    ┆ 2012     │
+        │ apple  ┆ M    ┆ 2013     │
+        │ apple  ┆ S    ┆ 2010     │
+        │ apple  ┆ S    ┆ 2011     │
+        │ apple  ┆ S    ┆ 2012     │
+        │ apple  ┆ S    ┆ 2013     │
+        │ apple  ┆ XS   ┆ 2010     │
+        │ apple  ┆ XS   ┆ 2011     │
+        │ apple  ┆ XS   ┆ 2012     │
+        │ apple  ┆ XS   ┆ 2013     │
+        │ orange ┆ M    ┆ 2010     │
+        │ orange ┆ M    ┆ 2011     │
+        │ orange ┆ M    ┆ 2012     │
+        │ orange ┆ M    ┆ 2013     │
+        │ orange ┆ S    ┆ 2010     │
+        │ orange ┆ S    ┆ 2011     │
+        │ orange ┆ S    ┆ 2012     │
+        │ orange ┆ S    ┆ 2013     │
+        │ orange ┆ XS   ┆ 2010     │
+        │ orange ┆ XS   ┆ 2011     │
+        │ orange ┆ XS   ┆ 2012     │
+        │ orange ┆ XS   ┆ 2013     │
+        └────────┴──────┴──────────┘
+
+        Filter for missing observations:
+        >>> columns = ('type','size','year')
+        >>> with pl.Config(tbl_rows=-1):
+        ...     df.expand(*columns).join(df, how='anti', on=columns).sort(by=pl.all())
+        shape: (13, 3)
+        ┌────────┬──────┬──────┐
+        │ type   ┆ size ┆ year │
+        │ ---    ┆ ---  ┆ ---  │
+        │ str    ┆ str  ┆ i64  │
+        ╞════════╪══════╪══════╡
+        │ apple  ┆ M    ┆ 2010 │
+        │ apple  ┆ M    ┆ 2011 │
+        │ apple  ┆ S    ┆ 2010 │
+        │ apple  ┆ S    ┆ 2011 │
+        │ apple  ┆ S    ┆ 2012 │
+        │ apple  ┆ XS   ┆ 2011 │
+        │ apple  ┆ XS   ┆ 2012 │
+        │ orange ┆ M    ┆ 2010 │
+        │ orange ┆ M    ┆ 2011 │
+        │ orange ┆ S    ┆ 2012 │
+        │ orange ┆ XS   ┆ 2010 │
+        │ orange ┆ XS   ┆ 2011 │
+        │ orange ┆ XS   ┆ 2012 │
+        └────────┴──────┴──────┘
+
+        Expand within each group, using `by`:
+        >>> with pl.Config(tbl_rows=-1):
+        ...     df.expand('year','size',by='type',sort=True)
+        shape: (10, 3)
+        ┌────────┬──────┬──────┐
+        │ type   ┆ year ┆ size │
+        │ ---    ┆ ---  ┆ ---  │
+        │ str    ┆ i64  ┆ str  │
+        ╞════════╪══════╪══════╡
+        │ apple  ┆ 2010 ┆ M    │
+        │ apple  ┆ 2010 ┆ XS   │
+        │ apple  ┆ 2012 ┆ M    │
+        │ apple  ┆ 2012 ┆ XS   │
+        │ orange ┆ 2010 ┆ M    │
+        │ orange ┆ 2010 ┆ S    │
+        │ orange ┆ 2011 ┆ M    │
+        │ orange ┆ 2011 ┆ S    │
+        │ orange ┆ 2012 ┆ M    │
+        │ orange ┆ 2012 ┆ S    │
+        └────────┴──────┴──────┘
+
+    !!! info "New in version 0.28.0"
+
+    Args:
+        *columns: This refers to the columns to be completed.
+            It can be a string or a column selector or a polars expression.
+            A polars expression can be used to introduce new values,
+            as long as the polars expression has a name that already exists
+            in the DataFrame.
+        sort: Sort the DataFrame based on *columns.
+        by: Column(s) to group by.
+
+    Returns:
+        A polars DataFrame/LazyFrame.
+    """  # noqa: E501
+    if not columns:
+        return df
+    uniques, _ = _expand(df=df, columns=columns, by=by, sort=sort)
+    return uniques
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ pivot_longer + + +

+ +
+ +

pivot_longer implementation for polars.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ pivot_longer(df, index=None, column_names=None, names_to='variable', values_to='value', names_sep=None, names_pattern=None, names_transform=None) + +

+ + +
+ +

Unpivots a DataFrame from wide to long format.

+

It is modeled after the pivot_longer function in R's tidyr package, +and also takes inspiration from the melt function in R's data.table package.

+

This function is useful to massage a DataFrame into a format where +one or more columns are considered measured variables, and all other +columns are considered as identifier variables.

+

All measured variables are unpivoted (and typically duplicated) along the +row axis.

+

If names_pattern is provided, it must be a valid regular expression pattern containing at least +one capture group, compatible with the regex crate.

+

For more granular control on the unpivoting, have a look at +pivot_longer_spec.

+

pivot_longer can also be applied to a LazyFrame.

+ + +

Examples:

+
>>> import polars as pl
+>>> import polars.selectors as cs
+>>> import janitor.polars
+>>> df = pl.DataFrame(
+...     {
+...         "Sepal.Length": [5.1, 5.9],
+...         "Sepal.Width": [3.5, 3.0],
+...         "Petal.Length": [1.4, 5.1],
+...         "Petal.Width": [0.2, 1.8],
+...         "Species": ["setosa", "virginica"],
+...     }
+... )
+>>> df
+shape: (2, 5)
+┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐
+│ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species   │
+│ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---       │
+│ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str       │
+╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡
+│ 5.1          ┆ 3.5         ┆ 1.4          ┆ 0.2         ┆ setosa    │
+│ 5.9          ┆ 3.0         ┆ 5.1          ┆ 1.8         ┆ virginica │
+└──────────────┴─────────────┴──────────────┴─────────────┴───────────┘
+
+

Replicate polars' melt:

+
>>> df.pivot_longer(index = 'Species').sort(by=pl.all())
+shape: (8, 3)
+┌───────────┬──────────────┬───────┐
+│ Species   ┆ variable     ┆ value │
+│ ---       ┆ ---          ┆ ---   │
+│ str       ┆ str          ┆ f64   │
+╞═══════════╪══════════════╪═══════╡
+│ setosa    ┆ Petal.Length ┆ 1.4   │
+│ setosa    ┆ Petal.Width  ┆ 0.2   │
+│ setosa    ┆ Sepal.Length ┆ 5.1   │
+│ setosa    ┆ Sepal.Width  ┆ 3.5   │
+│ virginica ┆ Petal.Length ┆ 5.1   │
+│ virginica ┆ Petal.Width  ┆ 1.8   │
+│ virginica ┆ Sepal.Length ┆ 5.9   │
+│ virginica ┆ Sepal.Width  ┆ 3.0   │
+└───────────┴──────────────┴───────┘
+
+

Split the column labels into individual columns:

+
>>> df.pivot_longer(
+...     index = 'Species',
+...     names_to = ('part', 'dimension'),
+...     names_sep = '.',
+... ).select('Species','part','dimension','value').sort(by=pl.all())
+shape: (8, 4)
+┌───────────┬───────┬───────────┬───────┐
+│ Species   ┆ part  ┆ dimension ┆ value │
+│ ---       ┆ ---   ┆ ---       ┆ ---   │
+│ str       ┆ str   ┆ str       ┆ f64   │
+╞═══════════╪═══════╪═══════════╪═══════╡
+│ setosa    ┆ Petal ┆ Length    ┆ 1.4   │
+│ setosa    ┆ Petal ┆ Width     ┆ 0.2   │
+│ setosa    ┆ Sepal ┆ Length    ┆ 5.1   │
+│ setosa    ┆ Sepal ┆ Width     ┆ 3.5   │
+│ virginica ┆ Petal ┆ Length    ┆ 5.1   │
+│ virginica ┆ Petal ┆ Width     ┆ 1.8   │
+│ virginica ┆ Sepal ┆ Length    ┆ 5.9   │
+│ virginica ┆ Sepal ┆ Width     ┆ 3.0   │
+└───────────┴───────┴───────────┴───────┘
+
+

Retain parts of the column names as headers:

+
>>> df.pivot_longer(
+...     index = 'Species',
+...     names_to = ('part', '.value'),
+...     names_sep = '.',
+... ).select('Species','part','Length','Width').sort(by=pl.all())
+shape: (4, 4)
+┌───────────┬───────┬────────┬───────┐
+│ Species   ┆ part  ┆ Length ┆ Width │
+│ ---       ┆ ---   ┆ ---    ┆ ---   │
+│ str       ┆ str   ┆ f64    ┆ f64   │
+╞═══════════╪═══════╪════════╪═══════╡
+│ setosa    ┆ Petal ┆ 1.4    ┆ 0.2   │
+│ setosa    ┆ Sepal ┆ 5.1    ┆ 3.5   │
+│ virginica ┆ Petal ┆ 5.1    ┆ 1.8   │
+│ virginica ┆ Sepal ┆ 5.9    ┆ 3.0   │
+└───────────┴───────┴────────┴───────┘
+
+

Split the column labels based on regex:

+
>>> df = pl.DataFrame({"id": [1], "new_sp_m5564": [2], "newrel_f65": [3]})
+>>> df
+shape: (1, 3)
+┌─────┬──────────────┬────────────┐
+│ id  ┆ new_sp_m5564 ┆ newrel_f65 │
+│ --- ┆ ---          ┆ ---        │
+│ i64 ┆ i64          ┆ i64        │
+╞═════╪══════════════╪════════════╡
+│ 1   ┆ 2            ┆ 3          │
+└─────┴──────────────┴────────────┘
+>>> df.pivot_longer(
+...     index = 'id',
+...     names_to = ('diagnosis', 'gender', 'age'),
+...     names_pattern = r"new_?(.+)_(.)([0-9]+)",
+... ).select('id','diagnosis','gender','age','value').sort(by=pl.all())
+shape: (2, 5)
+┌─────┬───────────┬────────┬──────┬───────┐
+│ id  ┆ diagnosis ┆ gender ┆ age  ┆ value │
+│ --- ┆ ---       ┆ ---    ┆ ---  ┆ ---   │
+│ i64 ┆ str       ┆ str    ┆ str  ┆ i64   │
+╞═════╪═══════════╪════════╪══════╪═══════╡
+│ 1   ┆ rel       ┆ f      ┆ 65   ┆ 3     │
+│ 1   ┆ sp        ┆ m      ┆ 5564 ┆ 2     │
+└─────┴───────────┴────────┴──────┴───────┘
+
+

Convert the dtypes of specific columns with names_transform:

+
>>> df.pivot_longer(
+...     index = "id",
+...     names_pattern=r"new_?(.+)_(.)([0-9]+)",
+...     names_to=("diagnosis", "gender", "age"),
+...     names_transform=pl.col('age').cast(pl.Int32),
+... ).select("id", "diagnosis", "gender", "age", "value").sort(by=pl.all())
+shape: (2, 5)
+┌─────┬───────────┬────────┬──────┬───────┐
+│ id  ┆ diagnosis ┆ gender ┆ age  ┆ value │
+│ --- ┆ ---       ┆ ---    ┆ ---  ┆ ---   │
+│ i64 ┆ str       ┆ str    ┆ i32  ┆ i64   │
+╞═════╪═══════════╪════════╪══════╪═══════╡
+│ 1   ┆ rel       ┆ f      ┆ 65   ┆ 3     │
+│ 1   ┆ sp        ┆ m      ┆ 5564 ┆ 2     │
+└─────┴───────────┴────────┴──────┴───────┘
+
+

Use multiple .value to reshape the dataframe:

+
>>> df = pl.DataFrame(
+...     [
+...         {
+...             "x_1_mean": 10,
+...             "x_2_mean": 20,
+...             "y_1_mean": 30,
+...             "y_2_mean": 40,
+...             "unit": 50,
+...         }
+...     ]
+... )
+>>> df
+shape: (1, 5)
+┌──────────┬──────────┬──────────┬──────────┬──────┐
+│ x_1_mean ┆ x_2_mean ┆ y_1_mean ┆ y_2_mean ┆ unit │
+│ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---  │
+│ i64      ┆ i64      ┆ i64      ┆ i64      ┆ i64  │
+╞══════════╪══════════╪══════════╪══════════╪══════╡
+│ 10       ┆ 20       ┆ 30       ┆ 40       ┆ 50   │
+└──────────┴──────────┴──────────┴──────────┴──────┘
+>>> df.pivot_longer(
+...     index="unit",
+...     names_to=(".value", "time", ".value"),
+...     names_pattern=r"(x|y)_([0-9])(_mean)",
+... ).select('unit','time','x_mean','y_mean').sort(by=pl.all())
+shape: (2, 4)
+┌──────┬──────┬────────┬────────┐
+│ unit ┆ time ┆ x_mean ┆ y_mean │
+│ ---  ┆ ---  ┆ ---    ┆ ---    │
+│ i64  ┆ str  ┆ i64    ┆ i64    │
+╞══════╪══════╪════════╪════════╡
+│ 50   ┆ 1    ┆ 10     ┆ 30     │
+│ 50   ┆ 2    ┆ 20     ┆ 40     │
+└──────┴──────┴────────┴────────┘
+
+
+

New in version 0.28.0

+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ index + + ColumnNameOrSelector + +
+

Column(s) or selector(s) to use as identifier variables.

+
+
+ None +
+ column_names + + ColumnNameOrSelector + +
+

Column(s) or selector(s) to unpivot.

+
+
+ None +
+ names_to + + list | tuple | str + +
+

Name of new column as a string that will contain +what were previously the column names in column_names. +The default is variable if no value is provided. It can +also be a list/tuple of strings that will serve as new column +names, if names_sep or names_pattern is provided. +If .value is in names_to, new column names will be extracted +from part of the existing column names and overrides values_to.

+
+
+ 'variable' +
+ values_to + + str + +
+

Name of new column as a string that will contain what +were previously the values of the columns in column_names.

+
+
+ 'value' +
+ names_sep + + str + +
+

Determines how the column name is broken up, if +names_to contains multiple values. It takes the same +specification as polars' str.split method.

+
+
+ None +
+ names_pattern + + str + +
+

Determines how the column name is broken up. +It can be a regular expression containing matching groups. +It takes the same specification as +polars' str.extract_groups method.

+
+
+ None +
+ names_transform + + Expr + +
+

Use this option to change the types of columns that +have been transformed to rows. +This does not apply to the values' columns. +Accepts a polars expression or a list of polars expressions. +Applicable only if one of names_sep +or names_pattern is provided.

+
+
+ None +
+ + +

Returns:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ DataFrame | LazyFrame + +
+

A polars DataFrame/LazyFrame that has been unpivoted

+
+
+ DataFrame | LazyFrame + +
+

from wide to long format.

+
+
+ +
+ Source code in janitor/polars/pivot_longer.py +
187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
@register_lazyframe_method
+@register_dataframe_method
+def pivot_longer(
+    df: pl.DataFrame | pl.LazyFrame,
+    index: ColumnNameOrSelector = None,
+    column_names: ColumnNameOrSelector = None,
+    names_to: list | tuple | str = "variable",
+    values_to: str = "value",
+    names_sep: str = None,
+    names_pattern: str = None,
+    names_transform: pl.Expr = None,
+) -> pl.DataFrame | pl.LazyFrame:
+    """
+    Unpivots a DataFrame from *wide* to *long* format.
+
+    It is modeled after the `pivot_longer` function in R's tidyr package,
+    and also takes inspiration from the `melt` function in R's data.table package.
+
+    This function is useful to massage a DataFrame into a format where
+    one or more columns are considered measured variables, and all other
+    columns are considered as identifier variables.
+
+    All measured variables are *unpivoted* (and typically duplicated) along the
+    row axis.
+
+    If `names_pattern` is provided, it must be a valid regular expression pattern containing at least
+    one capture group, compatible with the [regex crate](https://docs.rs/regex/latest/regex/).
+
+    For more granular control on the unpivoting, have a look at
+    [`pivot_longer_spec`][janitor.polars.pivot_longer.pivot_longer_spec].
+
+    `pivot_longer` can also be applied to a LazyFrame.
+
+    Examples:
+        >>> import polars as pl
+        >>> import polars.selectors as cs
+        >>> import janitor.polars
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "Sepal.Length": [5.1, 5.9],
+        ...         "Sepal.Width": [3.5, 3.0],
+        ...         "Petal.Length": [1.4, 5.1],
+        ...         "Petal.Width": [0.2, 1.8],
+        ...         "Species": ["setosa", "virginica"],
+        ...     }
+        ... )
+        >>> df
+        shape: (2, 5)
+        ┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐
+        │ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species   │
+        │ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---       │
+        │ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str       │
+        ╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡
+        │ 5.1          ┆ 3.5         ┆ 1.4          ┆ 0.2         ┆ setosa    │
+        │ 5.9          ┆ 3.0         ┆ 5.1          ┆ 1.8         ┆ virginica │
+        └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘
+
+        Replicate polars' [melt](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.unpivot.html#polars-dataframe-melt):
+        >>> df.pivot_longer(index = 'Species').sort(by=pl.all())
+        shape: (8, 3)
+        ┌───────────┬──────────────┬───────┐
+        │ Species   ┆ variable     ┆ value │
+        │ ---       ┆ ---          ┆ ---   │
+        │ str       ┆ str          ┆ f64   │
+        ╞═══════════╪══════════════╪═══════╡
+        │ setosa    ┆ Petal.Length ┆ 1.4   │
+        │ setosa    ┆ Petal.Width  ┆ 0.2   │
+        │ setosa    ┆ Sepal.Length ┆ 5.1   │
+        │ setosa    ┆ Sepal.Width  ┆ 3.5   │
+        │ virginica ┆ Petal.Length ┆ 5.1   │
+        │ virginica ┆ Petal.Width  ┆ 1.8   │
+        │ virginica ┆ Sepal.Length ┆ 5.9   │
+        │ virginica ┆ Sepal.Width  ┆ 3.0   │
+        └───────────┴──────────────┴───────┘
+
+        Split the column labels into individual columns:
+        >>> df.pivot_longer(
+        ...     index = 'Species',
+        ...     names_to = ('part', 'dimension'),
+        ...     names_sep = '.',
+        ... ).select('Species','part','dimension','value').sort(by=pl.all())
+        shape: (8, 4)
+        ┌───────────┬───────┬───────────┬───────┐
+        │ Species   ┆ part  ┆ dimension ┆ value │
+        │ ---       ┆ ---   ┆ ---       ┆ ---   │
+        │ str       ┆ str   ┆ str       ┆ f64   │
+        ╞═══════════╪═══════╪═══════════╪═══════╡
+        │ setosa    ┆ Petal ┆ Length    ┆ 1.4   │
+        │ setosa    ┆ Petal ┆ Width     ┆ 0.2   │
+        │ setosa    ┆ Sepal ┆ Length    ┆ 5.1   │
+        │ setosa    ┆ Sepal ┆ Width     ┆ 3.5   │
+        │ virginica ┆ Petal ┆ Length    ┆ 5.1   │
+        │ virginica ┆ Petal ┆ Width     ┆ 1.8   │
+        │ virginica ┆ Sepal ┆ Length    ┆ 5.9   │
+        │ virginica ┆ Sepal ┆ Width     ┆ 3.0   │
+        └───────────┴───────┴───────────┴───────┘
+
+        Retain parts of the column names as headers:
+        >>> df.pivot_longer(
+        ...     index = 'Species',
+        ...     names_to = ('part', '.value'),
+        ...     names_sep = '.',
+        ... ).select('Species','part','Length','Width').sort(by=pl.all())
+        shape: (4, 4)
+        ┌───────────┬───────┬────────┬───────┐
+        │ Species   ┆ part  ┆ Length ┆ Width │
+        │ ---       ┆ ---   ┆ ---    ┆ ---   │
+        │ str       ┆ str   ┆ f64    ┆ f64   │
+        ╞═══════════╪═══════╪════════╪═══════╡
+        │ setosa    ┆ Petal ┆ 1.4    ┆ 0.2   │
+        │ setosa    ┆ Sepal ┆ 5.1    ┆ 3.5   │
+        │ virginica ┆ Petal ┆ 5.1    ┆ 1.8   │
+        │ virginica ┆ Sepal ┆ 5.9    ┆ 3.0   │
+        └───────────┴───────┴────────┴───────┘
+
+        Split the column labels based on regex:
+        >>> df = pl.DataFrame({"id": [1], "new_sp_m5564": [2], "newrel_f65": [3]})
+        >>> df
+        shape: (1, 3)
+        ┌─────┬──────────────┬────────────┐
+        │ id  ┆ new_sp_m5564 ┆ newrel_f65 │
+        │ --- ┆ ---          ┆ ---        │
+        │ i64 ┆ i64          ┆ i64        │
+        ╞═════╪══════════════╪════════════╡
+        │ 1   ┆ 2            ┆ 3          │
+        └─────┴──────────────┴────────────┘
+        >>> df.pivot_longer(
+        ...     index = 'id',
+        ...     names_to = ('diagnosis', 'gender', 'age'),
+        ...     names_pattern = r"new_?(.+)_(.)([0-9]+)",
+        ... ).select('id','diagnosis','gender','age','value').sort(by=pl.all())
+        shape: (2, 5)
+        ┌─────┬───────────┬────────┬──────┬───────┐
+        │ id  ┆ diagnosis ┆ gender ┆ age  ┆ value │
+        │ --- ┆ ---       ┆ ---    ┆ ---  ┆ ---   │
+        │ i64 ┆ str       ┆ str    ┆ str  ┆ i64   │
+        ╞═════╪═══════════╪════════╪══════╪═══════╡
+        │ 1   ┆ rel       ┆ f      ┆ 65   ┆ 3     │
+        │ 1   ┆ sp        ┆ m      ┆ 5564 ┆ 2     │
+        └─────┴───────────┴────────┴──────┴───────┘
+
+        Convert the dtypes of specific columns with `names_transform`:
+        >>> df.pivot_longer(
+        ...     index = "id",
+        ...     names_pattern=r"new_?(.+)_(.)([0-9]+)",
+        ...     names_to=("diagnosis", "gender", "age"),
+        ...     names_transform=pl.col('age').cast(pl.Int32),
+        ... ).select("id", "diagnosis", "gender", "age", "value").sort(by=pl.all())
+        shape: (2, 5)
+        ┌─────┬───────────┬────────┬──────┬───────┐
+        │ id  ┆ diagnosis ┆ gender ┆ age  ┆ value │
+        │ --- ┆ ---       ┆ ---    ┆ ---  ┆ ---   │
+        │ i64 ┆ str       ┆ str    ┆ i32  ┆ i64   │
+        ╞═════╪═══════════╪════════╪══════╪═══════╡
+        │ 1   ┆ rel       ┆ f      ┆ 65   ┆ 3     │
+        │ 1   ┆ sp        ┆ m      ┆ 5564 ┆ 2     │
+        └─────┴───────────┴────────┴──────┴───────┘
+
+        Use multiple `.value` to reshape the dataframe:
+        >>> df = pl.DataFrame(
+        ...     [
+        ...         {
+        ...             "x_1_mean": 10,
+        ...             "x_2_mean": 20,
+        ...             "y_1_mean": 30,
+        ...             "y_2_mean": 40,
+        ...             "unit": 50,
+        ...         }
+        ...     ]
+        ... )
+        >>> df
+        shape: (1, 5)
+        ┌──────────┬──────────┬──────────┬──────────┬──────┐
+        │ x_1_mean ┆ x_2_mean ┆ y_1_mean ┆ y_2_mean ┆ unit │
+        │ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---  │
+        │ i64      ┆ i64      ┆ i64      ┆ i64      ┆ i64  │
+        ╞══════════╪══════════╪══════════╪══════════╪══════╡
+        │ 10       ┆ 20       ┆ 30       ┆ 40       ┆ 50   │
+        └──────────┴──────────┴──────────┴──────────┴──────┘
+        >>> df.pivot_longer(
+        ...     index="unit",
+        ...     names_to=(".value", "time", ".value"),
+        ...     names_pattern=r"(x|y)_([0-9])(_mean)",
+        ... ).select('unit','time','x_mean','y_mean').sort(by=pl.all())
+        shape: (2, 4)
+        ┌──────┬──────┬────────┬────────┐
+        │ unit ┆ time ┆ x_mean ┆ y_mean │
+        │ ---  ┆ ---  ┆ ---    ┆ ---    │
+        │ i64  ┆ str  ┆ i64    ┆ i64    │
+        ╞══════╪══════╪════════╪════════╡
+        │ 50   ┆ 1    ┆ 10     ┆ 30     │
+        │ 50   ┆ 2    ┆ 20     ┆ 40     │
+        └──────┴──────┴────────┴────────┘
+
+    !!! info "New in version 0.28.0"
+
+    Args:
+        index: Column(s) or selector(s) to use as identifier variables.
+        column_names: Column(s) or selector(s) to unpivot.
+        names_to: Name of new column as a string that will contain
+            what were previously the column names in `column_names`.
+            The default is `variable` if no value is provided. It can
+            also be a list/tuple of strings that will serve as new column
+            names, if `names_sep` or `names_pattern` is provided.
+            If `.value` is in `names_to`, new column names will be extracted
+            from part of the existing column names and overrides `values_to`.
+        values_to: Name of new column as a string that will contain what
+            were previously the values of the columns in `column_names`.
+        names_sep: Determines how the column name is broken up, if
+            `names_to` contains multiple values. It takes the same
+            specification as polars' `str.split` method.
+        names_pattern: Determines how the column name is broken up.
+            It can be a regular expression containing matching groups.
+            It takes the same specification as
+            polars' `str.extract_groups` method.
+        names_transform: Use this option to change the types of columns that
+            have been transformed to rows.
+            This does not apply to the values' columns.
+            Accepts a polars expression or a list of polars expressions.
+            Applicable only if one of names_sep
+            or names_pattern is provided.
+
+    Returns:
+        A polars DataFrame/LazyFrame that has been unpivoted
+        from wide to long format.
+    """  # noqa: E501
+    return _pivot_longer(
+        df=df,
+        index=index,
+        column_names=column_names,
+        names_pattern=names_pattern,
+        names_sep=names_sep,
+        names_to=names_to,
+        values_to=values_to,
+        names_transform=names_transform,
+    )
+
+
+
+ +
+ +
+ + +

+ pivot_longer_spec(df, spec) + +

+ + +
+ +

A declarative interface to pivot a Polars Frame +from wide to long form, +where you describe how the data will be unpivoted, +using a DataFrame.

+

It is modeled after tidyr's pivot_longer_spec.

+

This gives you, the user, +more control over the transformation to long form, +using a spec DataFrame that describes exactly +how data stored in the column names +becomes variables.

+

It can come in handy for situations where +pivot_longer +seems inadequate for the transformation.

+
+

New in version 0.28.0

+
+ + +

Examples:

+
>>> import polars as pl
+>>> from janitor.polars import pivot_longer_spec
+>>> df = pl.DataFrame(
+...     {
+...         "Sepal.Length": [5.1, 5.9],
+...         "Sepal.Width": [3.5, 3.0],
+...         "Petal.Length": [1.4, 5.1],
+...         "Petal.Width": [0.2, 1.8],
+...         "Species": ["setosa", "virginica"],
+...     }
+... )
+>>> df
+shape: (2, 5)
+┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐
+│ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species   │
+│ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---       │
+│ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str       │
+╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡
+│ 5.1          ┆ 3.5         ┆ 1.4          ┆ 0.2         ┆ setosa    │
+│ 5.9          ┆ 3.0         ┆ 5.1          ┆ 1.8         ┆ virginica │
+└──────────────┴─────────────┴──────────────┴─────────────┴───────────┘
+>>> spec = {'.name':['Sepal.Length','Petal.Length',
+...                  'Sepal.Width','Petal.Width'],
+...         '.value':['Length','Length','Width','Width'],
+...         'part':['Sepal','Petal','Sepal','Petal']}
+>>> spec = pl.DataFrame(spec)
+>>> spec
+shape: (4, 3)
+┌──────────────┬────────┬───────┐
+│ .name        ┆ .value ┆ part  │
+│ ---          ┆ ---    ┆ ---   │
+│ str          ┆ str    ┆ str   │
+╞══════════════╪════════╪═══════╡
+│ Sepal.Length ┆ Length ┆ Sepal │
+│ Petal.Length ┆ Length ┆ Petal │
+│ Sepal.Width  ┆ Width  ┆ Sepal │
+│ Petal.Width  ┆ Width  ┆ Petal │
+└──────────────┴────────┴───────┘
+>>> df.pipe(pivot_longer_spec,spec=spec).sort(by=pl.all())
+shape: (4, 4)
+┌───────────┬───────┬────────┬───────┐
+│ Species   ┆ part  ┆ Length ┆ Width │
+│ ---       ┆ ---   ┆ ---    ┆ ---   │
+│ str       ┆ str   ┆ f64    ┆ f64   │
+╞═══════════╪═══════╪════════╪═══════╡
+│ setosa    ┆ Petal ┆ 1.4    ┆ 0.2   │
+│ setosa    ┆ Sepal ┆ 5.1    ┆ 3.5   │
+│ virginica ┆ Petal ┆ 5.1    ┆ 1.8   │
+│ virginica ┆ Sepal ┆ 5.9    ┆ 3.0   │
+└───────────┴───────┴────────┴───────┘
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame | LazyFrame + +
+

The source DataFrame to unpivot. +It can also be a LazyFrame.

+
+
+ required +
+ spec + + DataFrame + +
+

A specification DataFrame. +At a minimum, the spec DataFrame +must have a .name column +and a .value column. +The .name column should contain the +columns in the source DataFrame that will be +transformed to long form. +The .value column gives the name of the column +that the values in the source DataFrame will go into. +Additional columns in the spec DataFrame +should be named to match columns +in the long format of the dataset and contain values +corresponding to columns pivoted from the wide format. +Note that these additional columns should not already exist +in the source DataFrame. +If there are additional columns, the combination of these columns +and the .value column must be unique.

+
+
+ required +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ KeyError + +
+

If .name or .value is missing from the spec's columns.

+
+
+ ValueError + +
+

If the labels in spec's .name column are not unique.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame | LazyFrame + +
+

A polars DataFrame/LazyFrame.

+
+
+ +
+ Source code in janitor/polars/pivot_longer.py +
 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
def pivot_longer_spec(
+    df: pl.DataFrame | pl.LazyFrame,
+    spec: pl.DataFrame,
+) -> pl.DataFrame | pl.LazyFrame:
+    """
+    A declarative interface to pivot a Polars Frame
+    from wide to long form,
+    where you describe how the data will be unpivoted,
+    using a DataFrame.
+
+    It is modeled after tidyr's `pivot_longer_spec`.
+
+    This gives you, the user,
+    more control over the transformation to long form,
+    using a *spec* DataFrame that describes exactly
+    how data stored in the column names
+    becomes variables.
+
+    It can come in handy for situations where
+    [`pivot_longer`][janitor.polars.pivot_longer.pivot_longer]
+    seems inadequate for the transformation.
+
+    !!! info "New in version 0.28.0"
+
+    Examples:
+        >>> import polars as pl
+        >>> from janitor.polars import pivot_longer_spec
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "Sepal.Length": [5.1, 5.9],
+        ...         "Sepal.Width": [3.5, 3.0],
+        ...         "Petal.Length": [1.4, 5.1],
+        ...         "Petal.Width": [0.2, 1.8],
+        ...         "Species": ["setosa", "virginica"],
+        ...     }
+        ... )
+        >>> df
+        shape: (2, 5)
+        ┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐
+        │ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species   │
+        │ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---       │
+        │ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str       │
+        ╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡
+        │ 5.1          ┆ 3.5         ┆ 1.4          ┆ 0.2         ┆ setosa    │
+        │ 5.9          ┆ 3.0         ┆ 5.1          ┆ 1.8         ┆ virginica │
+        └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘
+        >>> spec = {'.name':['Sepal.Length','Petal.Length',
+        ...                  'Sepal.Width','Petal.Width'],
+        ...         '.value':['Length','Length','Width','Width'],
+        ...         'part':['Sepal','Petal','Sepal','Petal']}
+        >>> spec = pl.DataFrame(spec)
+        >>> spec
+        shape: (4, 3)
+        ┌──────────────┬────────┬───────┐
+        │ .name        ┆ .value ┆ part  │
+        │ ---          ┆ ---    ┆ ---   │
+        │ str          ┆ str    ┆ str   │
+        ╞══════════════╪════════╪═══════╡
+        │ Sepal.Length ┆ Length ┆ Sepal │
+        │ Petal.Length ┆ Length ┆ Petal │
+        │ Sepal.Width  ┆ Width  ┆ Sepal │
+        │ Petal.Width  ┆ Width  ┆ Petal │
+        └──────────────┴────────┴───────┘
+        >>> df.pipe(pivot_longer_spec,spec=spec).sort(by=pl.all())
+        shape: (4, 4)
+        ┌───────────┬───────┬────────┬───────┐
+        │ Species   ┆ part  ┆ Length ┆ Width │
+        │ ---       ┆ ---   ┆ ---    ┆ ---   │
+        │ str       ┆ str   ┆ f64    ┆ f64   │
+        ╞═══════════╪═══════╪════════╪═══════╡
+        │ setosa    ┆ Petal ┆ 1.4    ┆ 0.2   │
+        │ setosa    ┆ Sepal ┆ 5.1    ┆ 3.5   │
+        │ virginica ┆ Petal ┆ 5.1    ┆ 1.8   │
+        │ virginica ┆ Sepal ┆ 5.9    ┆ 3.0   │
+        └───────────┴───────┴────────┴───────┘
+
+    Args:
+        df: The source DataFrame to unpivot.
+            It can also be a LazyFrame.
+        spec: A specification DataFrame.
+            At a minimum, the spec DataFrame
+            must have a `.name` column
+            and a `.value` column.
+            The `.name` column should contain the
+            columns in the source DataFrame that will be
+            transformed to long form.
+            The `.value` column gives the name of the column
+            that the values in the source DataFrame will go into.
+            Additional columns in the spec DataFrame
+            should be named to match columns
+            in the long format of the dataset and contain values
+            corresponding to columns pivoted from the wide format.
+            Note that these additional columns should not already exist
+            in the source DataFrame.
+            If there are additional columns, the combination of these columns
+            and the `.value` column must be unique.
+
+    Raises:
+        KeyError: If `.name` or `.value` is missing from the spec's columns.
+        ValueError: If the labels in spec's `.name` column are not unique.
+
+    Returns:
+        A polars DataFrame/LazyFrame.
+    """
+    check("spec", spec, [pl.DataFrame])
+    spec_columns = spec.collect_schema().names()
+    if ".name" not in spec_columns:
+        raise KeyError(
+            "Kindly ensure the spec DataFrame has a `.name` column."
+        )
+    if ".value" not in spec_columns:
+        raise KeyError(
+            "Kindly ensure the spec DataFrame has a `.value` column."
+        )
+    if spec.get_column(".name").is_duplicated().any():
+        raise ValueError("The labels in the `.name` column should be unique.")
+    df_columns = df.collect_schema().names()
+    exclude = set(df_columns).intersection(spec_columns)
+    if exclude:
+        raise ValueError(
+            f"Labels {*exclude, } in the spec dataframe already exist "
+            "as column labels in the source dataframe. "
+            "Kindly ensure the spec DataFrame's columns "
+            "are not present in the source DataFrame."
+        )
+
+    index = [
+        label for label in df_columns if label not in spec.get_column(".name")
+    ]
+    others = [
+        label for label in spec_columns if label not in {".name", ".value"}
+    ]
+    if others:
+        if (len(others) == 1) & (
+            spec.get_column(others[0]).dtype == pl.String
+        ):
+            # shortcut that avoids the implode/explode approach - and is faster
+            # if the requirements are met
+            # inspired by https://github.com/pola-rs/polars/pull/18519#issue-2500860927
+            return _pivot_longer_dot_value_string(
+                df=df,
+                index=index,
+                spec=spec,
+                variable_name=others[0],
+            )
+        variable_name = "".join(df_columns + spec_columns)
+        variable_name = f"{variable_name}_"
+        dot_value_only = False
+        expression = pl.struct(others).alias(variable_name)
+        spec = spec.select(".name", ".value", expression)
+    else:
+        variable_name = "".join(df_columns + spec_columns)
+        variable_name = f"{variable_name}_"
+        dot_value_only = True
+        expression = pl.cum_count(".value").over(".value").alias(variable_name)
+        spec = spec.with_columns(expression)
+    return _pivot_longer_dot_value(
+        df=df,
+        index=index,
+        spec=spec,
+        variable_name=variable_name,
+        dot_value_only=dot_value_only,
+        names_transform=None,
+    )
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ row_to_names + + +

+ +
+ +

row_to_names implementation for polars.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ row_to_names(df, row_numbers=0, remove_rows=False, remove_rows_above=False, separator='_') + +

+ + +
+ +

Elevates a row, or rows, to be the column names of a DataFrame.

+ + +

Examples:

+

Replace column names with the first row.

+
>>> import polars as pl
+>>> import janitor.polars
+>>> df = pl.DataFrame({
+...     "a": ["nums", '6', '9'],
+...     "b": ["chars", "x", "y"],
+... })
+>>> df
+shape: (3, 2)
+┌──────┬───────┐
+│ a    ┆ b     │
+│ ---  ┆ ---   │
+│ str  ┆ str   │
+╞══════╪═══════╡
+│ nums ┆ chars │
+│ 6    ┆ x     │
+│ 9    ┆ y     │
+└──────┴───────┘
+>>> df.row_to_names(0, remove_rows=True)
+shape: (2, 2)
+┌──────┬───────┐
+│ nums ┆ chars │
+│ ---  ┆ ---   │
+│ str  ┆ str   │
+╞══════╪═══════╡
+│ 6    ┆ x     │
+│ 9    ┆ y     │
+└──────┴───────┘
+>>> df.row_to_names(row_numbers=[0,1], remove_rows=True)
+shape: (1, 2)
+┌────────┬─────────┐
+│ nums_6 ┆ chars_x │
+│ ---    ┆ ---     │
+│ str    ┆ str     │
+╞════════╪═════════╡
+│ 9      ┆ y       │
+└────────┴─────────┘
+
+

Remove rows above the elevated row and the elevated row itself.

+
>>> df = pl.DataFrame({
+...     "a": ["bla1", "nums", '6', '9'],
+...     "b": ["bla2", "chars", "x", "y"],
+... })
+>>> df
+shape: (4, 2)
+┌──────┬───────┐
+│ a    ┆ b     │
+│ ---  ┆ ---   │
+│ str  ┆ str   │
+╞══════╪═══════╡
+│ bla1 ┆ bla2  │
+│ nums ┆ chars │
+│ 6    ┆ x     │
+│ 9    ┆ y     │
+└──────┴───────┘
+>>> df.row_to_names(1, remove_rows=True, remove_rows_above=True)
+shape: (2, 2)
+┌──────┬───────┐
+│ nums ┆ chars │
+│ ---  ┆ ---   │
+│ str  ┆ str   │
+╞══════╪═══════╡
+│ 6    ┆ x     │
+│ 9    ┆ y     │
+└──────┴───────┘
+
+
+

New in version 0.28.0

+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ row_numbers + + int | list | slice + +
+

Position of the row(s) containing the variable names. +It can be an integer, list or a slice.

+
+
+ 0 +
+ remove_rows + + bool + +
+

Whether the row(s) should be removed from the DataFrame.

+
+
+ False +
+ remove_rows_above + + bool + +
+

Whether the row(s) above the selected row should +be removed from the DataFrame.

+
+
+ False +
+ separator + + str + +
+

Combines the labels into a single string, +if row_numbers is a list of integers. Default is '_'.

+
+
+ '_' +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

A polars DataFrame.

+
+
+ +
+ Source code in janitor/polars/row_to_names.py +
 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
@register_dataframe_method
+def row_to_names(
+    df: pl.DataFrame,
+    row_numbers: int | list | slice = 0,
+    remove_rows: bool = False,
+    remove_rows_above: bool = False,
+    separator: str = "_",
+) -> pl.DataFrame:
+    """
+    Elevates a row, or rows, to be the column names of a DataFrame.
+
+    Examples:
+        Replace column names with the first row.
+
+        >>> import polars as pl
+        >>> import janitor.polars
+        >>> df = pl.DataFrame({
+        ...     "a": ["nums", '6', '9'],
+        ...     "b": ["chars", "x", "y"],
+        ... })
+        >>> df
+        shape: (3, 2)
+        ┌──────┬───────┐
+        │ a    ┆ b     │
+        │ ---  ┆ ---   │
+        │ str  ┆ str   │
+        ╞══════╪═══════╡
+        │ nums ┆ chars │
+        │ 6    ┆ x     │
+        │ 9    ┆ y     │
+        └──────┴───────┘
+        >>> df.row_to_names(0, remove_rows=True)
+        shape: (2, 2)
+        ┌──────┬───────┐
+        │ nums ┆ chars │
+        │ ---  ┆ ---   │
+        │ str  ┆ str   │
+        ╞══════╪═══════╡
+        │ 6    ┆ x     │
+        │ 9    ┆ y     │
+        └──────┴───────┘
+        >>> df.row_to_names(row_numbers=[0,1], remove_rows=True)
+        shape: (1, 2)
+        ┌────────┬─────────┐
+        │ nums_6 ┆ chars_x │
+        │ ---    ┆ ---     │
+        │ str    ┆ str     │
+        ╞════════╪═════════╡
+        │ 9      ┆ y       │
+        └────────┴─────────┘
+
+        Remove rows above the elevated row and the elevated row itself.
+
+        >>> df = pl.DataFrame({
+        ...     "a": ["bla1", "nums", '6', '9'],
+        ...     "b": ["bla2", "chars", "x", "y"],
+        ... })
+        >>> df
+        shape: (4, 2)
+        ┌──────┬───────┐
+        │ a    ┆ b     │
+        │ ---  ┆ ---   │
+        │ str  ┆ str   │
+        ╞══════╪═══════╡
+        │ bla1 ┆ bla2  │
+        │ nums ┆ chars │
+        │ 6    ┆ x     │
+        │ 9    ┆ y     │
+        └──────┴───────┘
+        >>> df.row_to_names(1, remove_rows=True, remove_rows_above=True)
+        shape: (2, 2)
+        ┌──────┬───────┐
+        │ nums ┆ chars │
+        │ ---  ┆ ---   │
+        │ str  ┆ str   │
+        ╞══════╪═══════╡
+        │ 6    ┆ x     │
+        │ 9    ┆ y     │
+        └──────┴───────┘
+
+    !!! info "New in version 0.28.0"
+
+    Args:
+        row_numbers: Position of the row(s) containing the variable names.
+            It can be an integer, list or a slice.
+        remove_rows: Whether the row(s) should be removed from the DataFrame.
+        remove_rows_above: Whether the row(s) above the selected row should
+            be removed from the DataFrame.
+        separator: Combines the labels into a single string,
+            if row_numbers is a list of integers. Default is '_'.
+
+    Returns:
+        A polars DataFrame.
+    """  # noqa: E501
+    return _row_to_names(
+        row_numbers,
+        df=df,
+        remove_rows=remove_rows,
+        remove_rows_above=remove_rows_above,
+        separator=separator,
+    )
+
+
+
+ +
+ + + +
+ +
+ +
+ + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/api/timeseries/index.html b/api/timeseries/index.html new file mode 100644 index 000000000..02822e6b2 --- /dev/null +++ b/api/timeseries/index.html @@ -0,0 +1,1909 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Timeseries - pyjanitor documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Timeseries

+ + +
+ + + + +
+ +

Time series-specific data cleaning functions.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ fill_missing_timestamps(df, frequency, first_time_stamp=None, last_time_stamp=None) + +

+ + +
+ +

Fills a DataFrame with missing timestamps based on a defined frequency.

+

If timestamps are missing, this function will re-index the DataFrame. +If timestamps are not missing, then the function will return the DataFrame +unmodified.

+ + +

Examples:

+

Functional usage

+
>>> import pandas as pd
+>>> import janitor.timeseries
+>>> df = janitor.timeseries.fill_missing_timestamps(
+...     df=pd.DataFrame(...),
+...     frequency="1H",
+... )
+
+

Method chaining example:

+
>>> import pandas as pd
+>>> import janitor.timeseries
+>>> df = (
+...     pd.DataFrame(...)
+...     .fill_missing_timestamps(frequency="1H")
+... )
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

DataFrame which needs to be tested for missing timestamps

+
+
+ required +
+ frequency + + str + +
+

Sampling frequency of the data. +Acceptable frequency strings are available +here. +Check offset aliases under time series in user guide

+
+
+ required +
+ first_time_stamp + + Timestamp + +
+

Timestamp expected to start from; +defaults to None. If no input is provided, assumes the +minimum value of the DataFrame's index.

+
+
+ None +
+ last_time_stamp + + Timestamp + +
+

Timestamp expected to end with; defaults to None. +If no input is provided, assumes the maximum value of +the DataFrame's index.

+
+
+ None +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

DataFrame that has a complete set of contiguous datetimes.

+
+
+ +
+ Source code in janitor/timeseries.py +
13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
@pf.register_dataframe_method
+def fill_missing_timestamps(
+    df: pd.DataFrame,
+    frequency: str,
+    first_time_stamp: pd.Timestamp = None,
+    last_time_stamp: pd.Timestamp = None,
+) -> pd.DataFrame:
+    """Fills a DataFrame with missing timestamps based on a defined frequency.
+
+    If timestamps are missing, this function will re-index the DataFrame.
+    If timestamps are not missing, then the function will return the DataFrame
+    unmodified.
+
+    Examples:
+        Functional usage
+
+        >>> import pandas as pd
+        >>> import janitor.timeseries
+        >>> df = janitor.timeseries.fill_missing_timestamps(
+        ...     df=pd.DataFrame(...),
+        ...     frequency="1H",
+        ... )  # doctest: +SKIP
+
+        Method chaining example:
+
+        >>> import pandas as pd
+        >>> import janitor.timeseries
+        >>> df = (
+        ...     pd.DataFrame(...)
+        ...     .fill_missing_timestamps(frequency="1H")
+        ... )  # doctest: +SKIP
+
+    Args:
+        df: DataFrame which needs to be tested for missing timestamps
+        frequency: Sampling frequency of the data.
+            Acceptable frequency strings are available
+            [here](https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases).
+            Check offset aliases under time series in user guide
+        first_time_stamp: Timestamp expected to start from;
+            defaults to `None`. If no input is provided, assumes the
+            minimum value of the DataFrame's index.
+        last_time_stamp: Timestamp expected to end with; defaults to `None`.
+            If no input is provided, assumes the maximum value of
+            the DataFrame's index.
+
+    Returns:
+        DataFrame that has a complete set of contiguous datetimes.
+    """
+    # Check all the inputs are the correct data type
+    check("frequency", frequency, [str])
+    check("first_time_stamp", first_time_stamp, [pd.Timestamp, type(None)])
+    check("last_time_stamp", last_time_stamp, [pd.Timestamp, type(None)])
+
+    if first_time_stamp is None:
+        first_time_stamp = df.index.min()
+    if last_time_stamp is None:
+        last_time_stamp = df.index.max()
+
+    # Generate expected timestamps
+    expected_timestamps = pd.date_range(
+        start=first_time_stamp, end=last_time_stamp, freq=frequency
+    )
+
+    return df.reindex(expected_timestamps)
+
+
+
+ +
+ +
+ + +

+ flag_jumps(df, scale='percentage', direction='any', threshold=0.0, strict=False) + +

+ + +
+ +

Create boolean column(s) that flag whether or not the change +between consecutive rows exceeds a provided threshold.

+

Examples:

+
Applies specified criteria across all columns of the DataFrame
+and appends a flag column for each column in the DataFrame
+
+>>> df = (
+...     pd.DataFrame(...)
+...     .flag_jumps(
+...         scale="absolute",
+...         direction="any",
+...         threshold=2
+...     )
+... )  # doctest: +SKIP
+
+Applies specific criteria to certain DataFrame columns,
+applies default criteria to columns *not* specifically listed and
+appends a flag column for each column in the DataFrame
+
+>>> df = (
+...     pd.DataFrame(...)
+...     .flag_jumps(
+...         scale=dict(col1="absolute", col2="percentage"),
+...         direction=dict(col1="increasing", col2="any"),
+...         threshold=dict(col1=1, col2=0.5),
+...     )
+... )  # doctest: +SKIP
+
+Applies specific criteria to certain DataFrame columns,
+applies default criteria to columns *not* specifically listed and
+appends a flag column for only those columns found in specified
+criteria
+
+>>> df = (
+...     pd.DataFrame(...)
+...     .flag_jumps(
+...         scale=dict(col1="absolute"),
+...         threshold=dict(col2=1),
+...         strict=True,
+...     )
+... )  # doctest: +SKIP
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

DataFrame which needs to be flagged for changes between +consecutive rows above a certain threshold.

+
+
+ required +
+ scale + + Union[str, Dict[str, str]] + +
+

Type of scaling approach to use. +Acceptable arguments are

+
    +
  • 'absolute' (consider the difference between rows)
  • +
  • 'percentage' (consider the percentage change between rows).
  • +
+
+
+ 'percentage' +
+ direction + + Union[str, Dict[str, str]] + +
+

Type of method used to handle the sign change when +comparing consecutive rows. +Acceptable arguments are

+
    +
  • 'increasing' (only consider rows that are increasing in value)
  • +
  • 'decreasing' (only consider rows that are decreasing in value)
  • +
  • 'any' (consider rows that are either increasing or decreasing; + sign is ignored).
  • +
+
+
+ 'any' +
+ threshold + + Union[int, float, Dict[str, Union[int, float]]] + +
+

The value to check if consecutive row comparisons +exceed. Always uses a greater than comparison. Must be >= 0.0.

+
+
+ 0.0 +
+ strict + + bool + +
+

Flag to enable/disable appending of a flag column for +each column in the provided DataFrame. If set to True, will +only append a flag column for those columns found in at least +one of the input dictionaries. If set to False, will append +a flag column for each column found in the provided DataFrame. +If a criterion is not specified, the default for that criterion +is used.

+
+
+ False +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + + + + + + + + + +
TypeDescription
+ JanitorError + +
+

If strict=True and at least one of +scale, direction, or threshold inputs is not a +dictionary.

+
+
+ JanitorError + +
+

If scale is not one of +("absolute", "percentage").

+
+
+ JanitorError + +
+

If direction is not one of +("increasing", "decreasing", "any").

+
+
+ JanitorError + +
+

If threshold is less than 0.0.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

DataFrame that has flag jump columns.

+
+
+ + +
+ Source code in janitor/timeseries.py +
256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
@pf.register_dataframe_method
+def flag_jumps(
+    df: pd.DataFrame,
+    scale: Union[str, Dict[str, str]] = "percentage",
+    direction: Union[str, Dict[str, str]] = "any",
+    threshold: Union[int, float, Dict[str, Union[int, float]]] = 0.0,
+    strict: bool = False,
+) -> pd.DataFrame:
+    """Create boolean column(s) that flag whether or not the change
+    between consecutive rows exceeds a provided threshold.
+
+    Examples:
+
+        Applies specified criteria across all columns of the DataFrame
+        and appends a flag column for each column in the DataFrame
+
+        >>> df = (
+        ...     pd.DataFrame(...)
+        ...     .flag_jumps(
+        ...         scale="absolute",
+        ...         direction="any",
+        ...         threshold=2
+        ...     )
+        ... )  # doctest: +SKIP
+
+        Applies specific criteria to certain DataFrame columns,
+        applies default criteria to columns *not* specifically listed and
+        appends a flag column for each column in the DataFrame
+
+        >>> df = (
+        ...     pd.DataFrame(...)
+        ...     .flag_jumps(
+        ...         scale=dict(col1="absolute", col2="percentage"),
+        ...         direction=dict(col1="increasing", col2="any"),
+        ...         threshold=dict(col1=1, col2=0.5),
+        ...     )
+        ... )  # doctest: +SKIP
+
+        Applies specific criteria to certain DataFrame columns,
+        applies default criteria to columns *not* specifically listed and
+        appends a flag column for only those columns found in specified
+        criteria
+
+        >>> df = (
+        ...     pd.DataFrame(...)
+        ...     .flag_jumps(
+        ...         scale=dict(col1="absolute"),
+        ...         threshold=dict(col2=1),
+        ...         strict=True,
+        ...     )
+        ... )  # doctest: +SKIP
+
+    Args:
+        df: DataFrame which needs to be flagged for changes between
+            consecutive rows above a certain threshold.
+        scale:
+            Type of scaling approach to use.
+            Acceptable arguments are
+
+            * `'absolute'` (consider the difference between rows)
+            * `'percentage'` (consider the percentage change between rows).
+
+        direction: Type of method used to handle the sign change when
+            comparing consecutive rows.
+            Acceptable arguments are
+
+            * `'increasing'` (only consider rows that are increasing in value)
+            * `'decreasing'` (only consider rows that are decreasing in value)
+            * `'any'` (consider rows that are either increasing or decreasing;
+                sign is ignored).
+        threshold: The value to check if consecutive row comparisons
+            exceed. Always uses a greater than comparison. Must be `>= 0.0`.
+        strict: Flag to enable/disable appending of a flag column for
+            each column in the provided DataFrame. If set to `True`, will
+            only append a flag column for those columns found in at least
+            one of the input dictionaries. If set to `False`, will append
+            a flag column for each column found in the provided DataFrame.
+            If criteria is not specified, the defaults for each criteria
+            is used.
+
+    Raises:
+        JanitorError: If `strict=True` and at least one of
+            `scale`, `direction`, or `threshold` inputs is not a
+            dictionary.
+        JanitorError: If `scale` is not one of
+            `("absolute", "percentage")`.
+        JanitorError: If `direction` is not one of
+            `("increasing", "decreasing", "any")`.
+        JanitorError: If `threshold` is less than `0.0`.
+
+    Returns:
+        DataFrame that has `flag jump` columns.
+
+    <!--
+    # noqa: DAR101
+    -->
+    """
+    df = df.copy()
+
+    if strict:
+        if (
+            any(isinstance(arg, dict) for arg in (scale, direction, threshold))
+            is False
+        ):
+            raise JanitorError(
+                "When enacting 'strict=True', 'scale', 'direction', or "
+                + "'threshold' must be a dictionary."
+            )
+
+        # Only append a flag col for the cols that appear
+        # in at least one of the input dicts
+        arg_keys = [
+            arg.keys()
+            for arg in (scale, direction, threshold)
+            if isinstance(arg, dict)
+        ]
+        cols = set(itertools.chain.from_iterable(arg_keys))
+
+    else:
+        # Append a flag col for each col in the DataFrame
+        cols = df.columns
+
+    columns_to_add = {}
+    for col in sorted(cols):
+        # Allow arguments to be a mix of dict and single instances
+        s = scale.get(col, "percentage") if isinstance(scale, dict) else scale
+        d = (
+            direction.get(col, "any")
+            if isinstance(direction, dict)
+            else direction
+        )
+        t = (
+            threshold.get(col, 0.0)
+            if isinstance(threshold, dict)
+            else threshold
+        )
+
+        columns_to_add[f"{col}_jump_flag"] = _flag_jumps_single_col(
+            df, col, scale=s, direction=d, threshold=t
+        )
+
+    df = df.assign(**columns_to_add)
+
+    return df
+
+
+
+ +
+ +
+ + +

+ sort_timestamps_monotonically(df, direction='increasing', strict=False) + +

+ + +
+ +

Sort DataFrame such that index is monotonic.

+

If timestamps are monotonic, this function will return +the DataFrame unmodified. If timestamps are not monotonic, +then the function will sort the DataFrame.

+ + +

Examples:

+

Functional usage

+
>>> import pandas as pd
+>>> import janitor.timeseries
+>>> df = janitor.timeseries.sort_timestamps_monotonically(
+...     df=pd.DataFrame(...),
+...     direction="increasing",
+... )
+
+

Method chaining example:

+
>>> import pandas as pd
+>>> import janitor.timeseries
+>>> df = (
+...     pd.DataFrame(...)
+...     .sort_timestamps_monotonically(direction="increasing")
+... )
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ df + + DataFrame + +
+

DataFrame which needs to be tested for monotonicity.

+
+
+ required +
+ direction + + str + +
+

Type of monotonicity desired. +Acceptable arguments are 'increasing' or 'decreasing'.

+
+
+ 'increasing' +
+ strict + + bool + +
+

Flag to enable/disable strict monotonicity. +If set to True, will remove duplicates in the index +by retaining first occurrence of value in index. +If set to False, will not test for duplicates in the index.

+
+
+ False +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataFrame + +
+

DataFrame that has monotonically increasing (or decreasing) +timestamps.

+
+
+ +
+ Source code in janitor/timeseries.py +
106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
@pf.register_dataframe_method
+def sort_timestamps_monotonically(
+    df: pd.DataFrame, direction: str = "increasing", strict: bool = False
+) -> pd.DataFrame:
+    """Sort DataFrame such that index is monotonic.
+
+    If timestamps are monotonic, this function will return
+    the DataFrame unmodified. If timestamps are not monotonic,
+    then the function will sort the DataFrame.
+
+    Examples:
+        Functional usage
+
+        >>> import pandas as pd
+        >>> import janitor.timeseries
+        >>> df = janitor.timeseries.sort_timestamps_monotonically(
+        ...     df=pd.DataFrame(...),
+        ...     direction="increasing",
+        ... )  # doctest: +SKIP
+
+        Method chaining example:
+
+        >>> import pandas as pd
+        >>> import janitor.timeseries
+        >>> df = (
+        ...     pd.DataFrame(...)
+        ...     .sort_timestamps_monotonically(direction="increasing")
+        ... )  # doctest: +SKIP
+
+    Args:
+        df: DataFrame which needs to be tested for monotonicity.
+        direction: Type of monotonicity desired.
+            Acceptable arguments are `'increasing'` or `'decreasing'`.
+        strict: Flag to enable/disable strict monotonicity.
+            If set to `True`, will remove duplicates in the index
+            by retaining first occurrence of value in index.
+            If set to `False`, will not test for duplicates in the index.
+
+    Returns:
+        DataFrame that has monotonically increasing (or decreasing)
+            timestamps.
+    """
+    # Check all the inputs are the correct data type
+    check("df", df, [pd.DataFrame])
+    check("direction", direction, [str])
+    check("strict", strict, [bool])
+
+    # Remove duplicates if requested
+    if strict:
+        df = df[~df.index.duplicated(keep="first")]
+
+    # Sort timestamps
+    if direction == "increasing":
+        df = df.sort_index()
+    else:
+        df = df.sort_index(ascending=False)
+
+    # Return the DataFrame
+    return df
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/api/xarray/index.html b/api/xarray/index.html new file mode 100644 index 000000000..9a65d973f --- /dev/null +++ b/api/xarray/index.html @@ -0,0 +1,1430 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + XArray - pyjanitor documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

XArray

+ + +
+ + + + +
+ +

Functions to augment XArray DataArrays and Datasets with additional +functionality.

+ + + + + + + + +
+ + + + + + + + + +
+ + +

+ clone_using(da, np_arr, use_coords=True, use_attrs=False, new_name=None) + +

+ + +
+ +

Given a NumPy array, return an XArray DataArray which contains the same +dimension names and (optionally) coordinates and other properties as the +supplied DataArray.

+

This is similar to xr.DataArray.copy() with more specificity for +the type of cloning you would like to perform - the different properties +that you desire to mirror in the new DataArray.

+

If the coordinates from the source DataArray are not desired, the shape +of the source and new NumPy arrays don't need to match. +The number of dimensions does, however.

+ + +

Examples:

+

Making a new DataArray from a previous one, keeping the +dimension names but dropping the coordinates (the input NumPy array +is of a different size):

+
>>> import xarray as xr
+>>> import janitor.xarray
+>>> da = xr.DataArray(
+...     np.zeros((512, 1024)), dims=["ax_1", "ax_2"],
+...     coords=dict(ax_1=np.linspace(0, 1, 512),
+...                 ax_2=np.logspace(-2, 2, 1024)),
+...     name="original",
+... )
+>>> new_da = da.clone_using(
+...     np.ones((4, 6)), new_name='new_and_improved', use_coords=False,
+... )
+>>> new_da
+<xarray.DataArray 'new_and_improved' (ax_1: 4, ax_2: 6)> Size: 192B
+array([[1., 1., 1., 1., 1., 1.],
+       [1., 1., 1., 1., 1., 1.],
+       [1., 1., 1., 1., 1., 1.],
+       [1., 1., 1., 1., 1., 1.]])
+Dimensions without coordinates: ax_1, ax_2
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ da + + DataArray + +
+

The DataArray supplied by the method itself.

+
+
+ required +
+ np_arr + + array + +
+

The NumPy array which will be wrapped in a new DataArray +given the properties copied over from the source DataArray.

+
+
+ required +
+ use_coords + + bool + +
+

If True, use the coordinates of the source +DataArray for the coordinates of the newly-generated array. +Shapes must match in this case. If False, only the number of +dimensions must match.

+
+
+ True +
+ use_attrs + + bool + +
+

If True, copy over the attrs from the source +DataArray. +The data inside attrs itself is not copied, only the mapping. +Otherwise, no attrs are passed to the new DataArray.

+
+
+ False +
+ new_name + + str + +
+

If set, use as the new name of the returned DataArray. +Otherwise, use the name of da.

+
+
+ None +
+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If number of dimensions in NumPy array and +DataArray do not match.

+
+
+ ValueError + +
+

If use_coords is True and the shapes of the NumPy array and DataArray +do not match.

+
+
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ DataArray + +
+

A DataArray styled like the input DataArray containing the +NumPy array data.

+
+
+ +
+ Source code in janitor/xarray/functions.py +
 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
@pf.register_xarray_dataarray_method
+def clone_using(
+    da: xr.DataArray,
+    np_arr: np.array,
+    use_coords: bool = True,
+    use_attrs: bool = False,
+    new_name: str = None,
+) -> xr.DataArray:
+    """
+    Given a NumPy array, return an XArray `DataArray` which contains the same
+    dimension names and (optionally) coordinates and other properties as the
+    supplied `DataArray`.
+
+    This is similar to `xr.DataArray.copy()` with more specificity for
+    the type of cloning you would like to perform - the different properties
+    that you desire to mirror in the new `DataArray`.
+
+    If the coordinates from the source `DataArray` are not desired, the shape
+    of the source and new NumPy arrays don't need to match.
+    The number of dimensions do, however.
+
+    Examples:
+        Making a new `DataArray` from a previous one, keeping the
+        dimension names but dropping the coordinates (the input NumPy array
+        is of a different size):
+
+        >>> import xarray as xr
+        >>> import janitor.xarray
+        >>> da = xr.DataArray(
+        ...     np.zeros((512, 1024)), dims=["ax_1", "ax_2"],
+        ...     coords=dict(ax_1=np.linspace(0, 1, 512),
+        ...                 ax_2=np.logspace(-2, 2, 1024)),
+        ...     name="original",
+        ... )
+        >>> new_da = da.clone_using(
+        ...     np.ones((4, 6)), new_name='new_and_improved', use_coords=False,
+        ... )
+        >>> new_da
+        <xarray.DataArray 'new_and_improved' (ax_1: 4, ax_2: 6)> Size: 192B
+        array([[1., 1., 1., 1., 1., 1.],
+               [1., 1., 1., 1., 1., 1.],
+               [1., 1., 1., 1., 1., 1.],
+               [1., 1., 1., 1., 1., 1.]])
+        Dimensions without coordinates: ax_1, ax_2
+
+    Args:
+        da: The `DataArray` supplied by the method itself.
+        np_arr: The NumPy array which will be wrapped in a new `DataArray`
+            given the properties copied over from the source `DataArray`.
+        use_coords: If `True`, use the coordinates of the source
+            `DataArray` for the coordinates of the newly-generated array.
+            Shapes must match in this case. If `False`, only the number of
+            dimensions must match.
+        use_attrs: If `True`, copy over the `attrs` from the source
+            `DataArray`.
+            The data inside `attrs` itself is not copied, only the mapping.
+            Otherwise, use the supplied attrs.
+        new_name: If set, use as the new name of the returned `DataArray`.
+            Otherwise, use the name of `da`.
+
+    Raises:
+        ValueError: If number of dimensions in `NumPy` array and
+            `DataArray` do not match.
+        ValueError: If shape of `NumPy` array and `DataArray`
+            do not match.
+
+    Returns:
+        A `DataArray` styled like the input `DataArray` containing the
+            NumPy array data.
+    """
+
+    if np_arr.ndim != da.ndim:
+        raise ValueError(
+            "Number of dims in the NumPy array and the DataArray "
+            "must match."
+        )
+
+    if use_coords and not all(
+        np_ax_len == da_ax_len
+        for np_ax_len, da_ax_len in zip(np_arr.shape, da.shape)
+    ):
+        raise ValueError(
+            "Input NumPy array and DataArray must have the same "
+            "shape if copying over coordinates."
+        )
+
+    return xr.DataArray(
+        np_arr,
+        dims=da.dims,
+        coords=da.coords if use_coords else None,
+        attrs=da.attrs.copy() if use_attrs else None,
+        name=new_name if new_name is not None else da.name,
+    )
+
+
+
+ +
+ +
+ + +

+ convert_datetime_to_number(da_or_ds, time_units, dim='time') + +

+ + +
+ +

Convert the coordinates of a datetime axis to a human-readable float +representation.

+ + +

Examples:

+

Convert a DataArray's time dimension coordinates from +minutes to seconds:

+
>>> import numpy as np
+>>> import xarray as xr
+>>> import janitor.xarray
+>>> timepoints = 5
+>>> da = xr.DataArray(
+...     np.array([2, 8, 0, 1, 7, 7]),
+...     dims="time",
+...     coords=dict(time=np.arange(6) * np.timedelta64(1, "m"))
+... )
+>>> da_minutes = da.convert_datetime_to_number("s", dim="time")
+>>> da_minutes
+<xarray.DataArray (time: 6)> Size: 48B
+array([2, 8, 0, 1, 7, 7])
+Coordinates:
+  * time     (time) float64 48B 0.0 60.0 120.0 180.0 240.0 300.0
+
+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ da_or_ds + + Union[DataArray, Dataset] + +
+

XArray object.

+
+
+ required +
+ time_units + + str + +
+

Numpy timedelta string specification for the unit you +would like to convert the coordinates to.

+
+
+ required +
+ dim + + str + +
+

The time dimension whose coordinates are datetime objects.

+
+
+ 'time' +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Union[DataArray, Dataset] + +
+

The original XArray object with the time dimension reassigned.

+
+
+ +
+ Source code in janitor/xarray/functions.py +
108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
@pf.register_xarray_dataset_method
+@pf.register_xarray_dataarray_method
+def convert_datetime_to_number(
+    da_or_ds: Union[xr.DataArray, xr.Dataset],
+    time_units: str,
+    dim: str = "time",
+) -> Union[xr.DataArray, xr.Dataset]:
+    """Convert the coordinates of a datetime axis to a human-readable float
+    representation.
+
+    Examples:
+        Convert a `DataArray`'s time dimension coordinates from
+        minutes to seconds:
+
+        >>> import numpy as np
+        >>> import xarray as xr
+        >>> import janitor.xarray
+        >>> timepoints = 5
+        >>> da = xr.DataArray(
+        ...     np.array([2, 8, 0, 1, 7, 7]),
+        ...     dims="time",
+        ...     coords=dict(time=np.arange(6) * np.timedelta64(1, "m"))
+        ... )
+        >>> da_minutes = da.convert_datetime_to_number("s", dim="time")
+        >>> da_minutes
+        <xarray.DataArray (time: 6)> Size: 48B
+        array([2, 8, 0, 1, 7, 7])
+        Coordinates:
+          * time     (time) float64 48B 0.0 60.0 120.0 180.0 240.0 300.0
+
+    Args:
+        da_or_ds: XArray object.
+        time_units: Numpy timedelta string specification for the unit you
+            would like to convert the coordinates to.
+        dim: The time dimension whose coordinates are datetime objects.
+
+    Returns:
+        The original XArray object with the time dimension reassigned.
+    """
+
+    times = da_or_ds.coords[dim].data / np.timedelta64(1, time_units)
+
+    return da_or_ds.assign_coords({dim: times})
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/assets/_mkdocstrings.css b/assets/_mkdocstrings.css new file mode 100644 index 000000000..b500381b5 --- /dev/null +++ b/assets/_mkdocstrings.css @@ -0,0 +1,143 @@ + +/* Avoid breaking parameter names, etc. in table cells. */ +.doc-contents td code { + word-break: normal !important; +} + +/* No line break before first paragraph of descriptions. */ +.doc-md-description, +.doc-md-description>p:first-child { + display: inline; +} + +/* Max width for docstring sections tables. */ +.doc .md-typeset__table, +.doc .md-typeset__table table { + display: table !important; + width: 100%; +} + +.doc .md-typeset__table tr { + display: table-row; +} + +/* Defaults in Spacy table style. */ +.doc-param-default { + float: right; +} + +/* Parameter headings must be inline, not blocks. */ +.doc-heading-parameter { + display: inline; +} + +/* Prefer space on the right, not the left of parameter permalinks. */ +.doc-heading-parameter .headerlink { + margin-left: 0 !important; + margin-right: 0.2rem; +} + +/* Backward-compatibility: docstring section titles in bold. */ +.doc-section-title { + font-weight: bold; +} + +/* Symbols in Navigation and ToC. 
*/ +:root, :host, +[data-md-color-scheme="default"] { + --doc-symbol-parameter-fg-color: #df50af; + --doc-symbol-attribute-fg-color: #953800; + --doc-symbol-function-fg-color: #8250df; + --doc-symbol-method-fg-color: #8250df; + --doc-symbol-class-fg-color: #0550ae; + --doc-symbol-module-fg-color: #5cad0f; + + --doc-symbol-parameter-bg-color: #df50af1a; + --doc-symbol-attribute-bg-color: #9538001a; + --doc-symbol-function-bg-color: #8250df1a; + --doc-symbol-method-bg-color: #8250df1a; + --doc-symbol-class-bg-color: #0550ae1a; + --doc-symbol-module-bg-color: #5cad0f1a; +} + +[data-md-color-scheme="slate"] { + --doc-symbol-parameter-fg-color: #ffa8cc; + --doc-symbol-attribute-fg-color: #ffa657; + --doc-symbol-function-fg-color: #d2a8ff; + --doc-symbol-method-fg-color: #d2a8ff; + --doc-symbol-class-fg-color: #79c0ff; + --doc-symbol-module-fg-color: #baff79; + + --doc-symbol-parameter-bg-color: #ffa8cc1a; + --doc-symbol-attribute-bg-color: #ffa6571a; + --doc-symbol-function-bg-color: #d2a8ff1a; + --doc-symbol-method-bg-color: #d2a8ff1a; + --doc-symbol-class-bg-color: #79c0ff1a; + --doc-symbol-module-bg-color: #baff791a; +} + +code.doc-symbol { + border-radius: .1rem; + font-size: .85em; + padding: 0 .3em; + font-weight: bold; +} + +code.doc-symbol-parameter { + color: var(--doc-symbol-parameter-fg-color); + background-color: var(--doc-symbol-parameter-bg-color); +} + +code.doc-symbol-parameter::after { + content: "param"; +} + +code.doc-symbol-attribute { + color: var(--doc-symbol-attribute-fg-color); + background-color: var(--doc-symbol-attribute-bg-color); +} + +code.doc-symbol-attribute::after { + content: "attr"; +} + +code.doc-symbol-function { + color: var(--doc-symbol-function-fg-color); + background-color: var(--doc-symbol-function-bg-color); +} + +code.doc-symbol-function::after { + content: "func"; +} + +code.doc-symbol-method { + color: var(--doc-symbol-method-fg-color); + background-color: var(--doc-symbol-method-bg-color); +} + 
+code.doc-symbol-method::after { + content: "meth"; +} + +code.doc-symbol-class { + color: var(--doc-symbol-class-fg-color); + background-color: var(--doc-symbol-class-bg-color); +} + +code.doc-symbol-class::after { + content: "class"; +} + +code.doc-symbol-module { + color: var(--doc-symbol-module-fg-color); + background-color: var(--doc-symbol-module-bg-color); +} + +code.doc-symbol-module::after { + content: "mod"; +} + +.doc-signature .autorefs { + color: inherit; + border-bottom: 1px dotted currentcolor; +} diff --git a/assets/images/favicon.png b/assets/images/favicon.png new file mode 100644 index 0000000000000000000000000000000000000000..1cf13b9f9d978896599290a74f77d5dbe7d1655c GIT binary patch literal 1870 zcmV-U2eJ5xP)Gc)JR9QMau)O=X#!i9;T z37kk-upj^(fsR36MHs_+1RCI)NNu9}lD0S{B^g8PN?Ww(5|~L#Ng*g{WsqleV}|#l zz8@ri&cTzw_h33bHI+12+kK6WN$h#n5cD8OQt`5kw6p~9H3()bUQ8OS4Q4HTQ=1Ol z_JAocz`fLbT2^{`8n~UAo=#AUOf=SOq4pYkt;XbC&f#7lb$*7=$na!mWCQ`dBQsO0 zLFBSPj*N?#u5&pf2t4XjEGH|=pPQ8xh7tpx;US5Cx_Ju;!O`ya-yF`)b%TEt5>eP1ZX~}sjjA%FJF?h7cX8=b!DZl<6%Cv z*G0uvvU+vmnpLZ2paivG-(cd*y3$hCIcsZcYOGh{$&)A6*XX&kXZd3G8m)G$Zz-LV z^GF3VAW^Mdv!)4OM8EgqRiz~*Cji;uzl2uC9^=8I84vNp;ltJ|q-*uQwGp2ma6cY7 z;`%`!9UXO@fr&Ebapfs34OmS9^u6$)bJxrucutf>`dKPKT%%*d3XlFVKunp9 zasduxjrjs>f8V=D|J=XNZp;_Zy^WgQ$9WDjgY=z@stwiEBm9u5*|34&1Na8BMjjgf3+SHcr`5~>oz1Y?SW^=K z^bTyO6>Gar#P_W2gEMwq)ot3; zREHn~U&Dp0l6YT0&k-wLwYjb?5zGK`W6S2v+K>AM(95m2C20L|3m~rN8dprPr@t)5lsk9Hu*W z?pS990s;Ez=+Rj{x7p``4>+c0G5^pYnB1^!TL=(?HLHZ+HicG{~4F1d^5Awl_2!1jICM-!9eoLhbbT^;yHcefyTAaqRcY zmuctDopPT!%k+}x%lZRKnzykr2}}XfG_ne?nRQO~?%hkzo;@RN{P6o`&mMUWBYMTe z6i8ChtjX&gXl`nvrU>jah)2iNM%JdjqoaeaU%yVn!^70x-flljp6Q5tK}5}&X8&&G zX3fpb3E(!rH=zVI_9Gjl45w@{(ITqngWFe7@9{mX;tO25Z_8 zQHEpI+FkTU#4xu>RkN>b3Tnc3UpWzPXWm#o55GKF09j^Mh~)K7{QqbO_~(@CVq! 
zS<8954|P8mXN2MRs86xZ&Q4EfM@JB94b=(YGuk)s&^jiSF=t3*oNK3`rD{H`yQ?d; ztE=laAUoZx5?RC8*WKOj`%LXEkgDd>&^Q4M^z`%u0rg-It=hLCVsq!Z%^6eB-OvOT zFZ28TN&cRmgU}Elrnk43)!>Z1FCPL2K$7}gwzIc48NX}#!A1BpJP?#v5wkNprhV** z?Cpalt1oH&{r!o3eSKc&ap)iz2BTn_VV`4>9M^b3;(YY}4>#ML6{~(4mH+?%07*qo IM6N<$f(jP3KmY&$ literal 0 HcmV?d00001 diff --git a/assets/javascripts/bundle.88dd0f4e.min.js b/assets/javascripts/bundle.88dd0f4e.min.js new file mode 100644 index 000000000..fb8f31090 --- /dev/null +++ b/assets/javascripts/bundle.88dd0f4e.min.js @@ -0,0 +1,16 @@ +"use strict";(()=>{var Wi=Object.create;var gr=Object.defineProperty;var Di=Object.getOwnPropertyDescriptor;var Vi=Object.getOwnPropertyNames,Vt=Object.getOwnPropertySymbols,Ni=Object.getPrototypeOf,yr=Object.prototype.hasOwnProperty,ao=Object.prototype.propertyIsEnumerable;var io=(e,t,r)=>t in e?gr(e,t,{enumerable:!0,configurable:!0,writable:!0,value:r}):e[t]=r,$=(e,t)=>{for(var r in t||(t={}))yr.call(t,r)&&io(e,r,t[r]);if(Vt)for(var r of Vt(t))ao.call(t,r)&&io(e,r,t[r]);return e};var so=(e,t)=>{var r={};for(var o in e)yr.call(e,o)&&t.indexOf(o)<0&&(r[o]=e[o]);if(e!=null&&Vt)for(var o of Vt(e))t.indexOf(o)<0&&ao.call(e,o)&&(r[o]=e[o]);return r};var xr=(e,t)=>()=>(t||e((t={exports:{}}).exports,t),t.exports);var zi=(e,t,r,o)=>{if(t&&typeof t=="object"||typeof t=="function")for(let n of Vi(t))!yr.call(e,n)&&n!==r&&gr(e,n,{get:()=>t[n],enumerable:!(o=Di(t,n))||o.enumerable});return e};var Mt=(e,t,r)=>(r=e!=null?Wi(Ni(e)):{},zi(t||!e||!e.__esModule?gr(r,"default",{value:e,enumerable:!0}):r,e));var co=(e,t,r)=>new Promise((o,n)=>{var i=p=>{try{s(r.next(p))}catch(c){n(c)}},a=p=>{try{s(r.throw(p))}catch(c){n(c)}},s=p=>p.done?o(p.value):Promise.resolve(p.value).then(i,a);s((r=r.apply(e,t)).next())});var lo=xr((Er,po)=>{(function(e,t){typeof Er=="object"&&typeof po!="undefined"?t():typeof define=="function"&&define.amd?define(t):t()})(Er,function(){"use strict";function e(r){var 
o=!0,n=!1,i=null,a={text:!0,search:!0,url:!0,tel:!0,email:!0,password:!0,number:!0,date:!0,month:!0,week:!0,time:!0,datetime:!0,"datetime-local":!0};function s(k){return!!(k&&k!==document&&k.nodeName!=="HTML"&&k.nodeName!=="BODY"&&"classList"in k&&"contains"in k.classList)}function p(k){var ft=k.type,qe=k.tagName;return!!(qe==="INPUT"&&a[ft]&&!k.readOnly||qe==="TEXTAREA"&&!k.readOnly||k.isContentEditable)}function c(k){k.classList.contains("focus-visible")||(k.classList.add("focus-visible"),k.setAttribute("data-focus-visible-added",""))}function l(k){k.hasAttribute("data-focus-visible-added")&&(k.classList.remove("focus-visible"),k.removeAttribute("data-focus-visible-added"))}function f(k){k.metaKey||k.altKey||k.ctrlKey||(s(r.activeElement)&&c(r.activeElement),o=!0)}function u(k){o=!1}function d(k){s(k.target)&&(o||p(k.target))&&c(k.target)}function y(k){s(k.target)&&(k.target.classList.contains("focus-visible")||k.target.hasAttribute("data-focus-visible-added"))&&(n=!0,window.clearTimeout(i),i=window.setTimeout(function(){n=!1},100),l(k.target))}function L(k){document.visibilityState==="hidden"&&(n&&(o=!0),X())}function X(){document.addEventListener("mousemove",J),document.addEventListener("mousedown",J),document.addEventListener("mouseup",J),document.addEventListener("pointermove",J),document.addEventListener("pointerdown",J),document.addEventListener("pointerup",J),document.addEventListener("touchmove",J),document.addEventListener("touchstart",J),document.addEventListener("touchend",J)}function te(){document.removeEventListener("mousemove",J),document.removeEventListener("mousedown",J),document.removeEventListener("mouseup",J),document.removeEventListener("pointermove",J),document.removeEventListener("pointerdown",J),document.removeEventListener("pointerup",J),document.removeEventListener("touchmove",J),document.removeEventListener("touchstart",J),document.removeEventListener("touchend",J)}function 
J(k){k.target.nodeName&&k.target.nodeName.toLowerCase()==="html"||(o=!1,te())}document.addEventListener("keydown",f,!0),document.addEventListener("mousedown",u,!0),document.addEventListener("pointerdown",u,!0),document.addEventListener("touchstart",u,!0),document.addEventListener("visibilitychange",L,!0),X(),r.addEventListener("focus",d,!0),r.addEventListener("blur",y,!0),r.nodeType===Node.DOCUMENT_FRAGMENT_NODE&&r.host?r.host.setAttribute("data-js-focus-visible",""):r.nodeType===Node.DOCUMENT_NODE&&(document.documentElement.classList.add("js-focus-visible"),document.documentElement.setAttribute("data-js-focus-visible",""))}if(typeof window!="undefined"&&typeof document!="undefined"){window.applyFocusVisiblePolyfill=e;var t;try{t=new CustomEvent("focus-visible-polyfill-ready")}catch(r){t=document.createEvent("CustomEvent"),t.initCustomEvent("focus-visible-polyfill-ready",!1,!1,{})}window.dispatchEvent(t)}typeof document!="undefined"&&e(document)})});var qr=xr((hy,On)=>{"use strict";/*! + * escape-html + * Copyright(c) 2012-2013 TJ Holowaychuk + * Copyright(c) 2015 Andreas Lubbe + * Copyright(c) 2015 Tiancheng "Timothy" Gu + * MIT Licensed + */var $a=/["'&<>]/;On.exports=Pa;function Pa(e){var t=""+e,r=$a.exec(t);if(!r)return t;var o,n="",i=0,a=0;for(i=r.index;i{/*! 
+ * clipboard.js v2.0.11 + * https://clipboardjs.com/ + * + * Licensed MIT © Zeno Rocha + */(function(t,r){typeof It=="object"&&typeof Yr=="object"?Yr.exports=r():typeof define=="function"&&define.amd?define([],r):typeof It=="object"?It.ClipboardJS=r():t.ClipboardJS=r()})(It,function(){return function(){var e={686:function(o,n,i){"use strict";i.d(n,{default:function(){return Ui}});var a=i(279),s=i.n(a),p=i(370),c=i.n(p),l=i(817),f=i.n(l);function u(V){try{return document.execCommand(V)}catch(A){return!1}}var d=function(A){var M=f()(A);return u("cut"),M},y=d;function L(V){var A=document.documentElement.getAttribute("dir")==="rtl",M=document.createElement("textarea");M.style.fontSize="12pt",M.style.border="0",M.style.padding="0",M.style.margin="0",M.style.position="absolute",M.style[A?"right":"left"]="-9999px";var F=window.pageYOffset||document.documentElement.scrollTop;return M.style.top="".concat(F,"px"),M.setAttribute("readonly",""),M.value=V,M}var X=function(A,M){var F=L(A);M.container.appendChild(F);var D=f()(F);return u("copy"),F.remove(),D},te=function(A){var M=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body},F="";return typeof A=="string"?F=X(A,M):A instanceof HTMLInputElement&&!["text","search","url","tel","password"].includes(A==null?void 0:A.type)?F=X(A.value,M):(F=f()(A),u("copy")),F},J=te;function k(V){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?k=function(M){return typeof M}:k=function(M){return M&&typeof Symbol=="function"&&M.constructor===Symbol&&M!==Symbol.prototype?"symbol":typeof M},k(V)}var ft=function(){var A=arguments.length>0&&arguments[0]!==void 0?arguments[0]:{},M=A.action,F=M===void 0?"copy":M,D=A.container,Y=A.target,$e=A.text;if(F!=="copy"&&F!=="cut")throw new Error('Invalid "action" value, use either "copy" or "cut"');if(Y!==void 0)if(Y&&k(Y)==="object"&&Y.nodeType===1){if(F==="copy"&&Y.hasAttribute("disabled"))throw new Error('Invalid "target" attribute. 
Please use "readonly" instead of "disabled" attribute');if(F==="cut"&&(Y.hasAttribute("readonly")||Y.hasAttribute("disabled")))throw new Error(`Invalid "target" attribute. You can't cut text from elements with "readonly" or "disabled" attributes`)}else throw new Error('Invalid "target" value, use a valid Element');if($e)return J($e,{container:D});if(Y)return F==="cut"?y(Y):J(Y,{container:D})},qe=ft;function Fe(V){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?Fe=function(M){return typeof M}:Fe=function(M){return M&&typeof Symbol=="function"&&M.constructor===Symbol&&M!==Symbol.prototype?"symbol":typeof M},Fe(V)}function ki(V,A){if(!(V instanceof A))throw new TypeError("Cannot call a class as a function")}function no(V,A){for(var M=0;M0&&arguments[0]!==void 0?arguments[0]:{};this.action=typeof D.action=="function"?D.action:this.defaultAction,this.target=typeof D.target=="function"?D.target:this.defaultTarget,this.text=typeof D.text=="function"?D.text:this.defaultText,this.container=Fe(D.container)==="object"?D.container:document.body}},{key:"listenClick",value:function(D){var Y=this;this.listener=c()(D,"click",function($e){return Y.onClick($e)})}},{key:"onClick",value:function(D){var Y=D.delegateTarget||D.currentTarget,$e=this.action(Y)||"copy",Dt=qe({action:$e,container:this.container,target:this.target(Y),text:this.text(Y)});this.emit(Dt?"success":"error",{action:$e,text:Dt,trigger:Y,clearSelection:function(){Y&&Y.focus(),window.getSelection().removeAllRanges()}})}},{key:"defaultAction",value:function(D){return vr("action",D)}},{key:"defaultTarget",value:function(D){var Y=vr("target",D);if(Y)return document.querySelector(Y)}},{key:"defaultText",value:function(D){return vr("text",D)}},{key:"destroy",value:function(){this.listener.destroy()}}],[{key:"copy",value:function(D){var Y=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body};return J(D,Y)}},{key:"cut",value:function(D){return 
y(D)}},{key:"isSupported",value:function(){var D=arguments.length>0&&arguments[0]!==void 0?arguments[0]:["copy","cut"],Y=typeof D=="string"?[D]:D,$e=!!document.queryCommandSupported;return Y.forEach(function(Dt){$e=$e&&!!document.queryCommandSupported(Dt)}),$e}}]),M}(s()),Ui=Fi},828:function(o){var n=9;if(typeof Element!="undefined"&&!Element.prototype.matches){var i=Element.prototype;i.matches=i.matchesSelector||i.mozMatchesSelector||i.msMatchesSelector||i.oMatchesSelector||i.webkitMatchesSelector}function a(s,p){for(;s&&s.nodeType!==n;){if(typeof s.matches=="function"&&s.matches(p))return s;s=s.parentNode}}o.exports=a},438:function(o,n,i){var a=i(828);function s(l,f,u,d,y){var L=c.apply(this,arguments);return l.addEventListener(u,L,y),{destroy:function(){l.removeEventListener(u,L,y)}}}function p(l,f,u,d,y){return typeof l.addEventListener=="function"?s.apply(null,arguments):typeof u=="function"?s.bind(null,document).apply(null,arguments):(typeof l=="string"&&(l=document.querySelectorAll(l)),Array.prototype.map.call(l,function(L){return s(L,f,u,d,y)}))}function c(l,f,u,d){return function(y){y.delegateTarget=a(y.target,f),y.delegateTarget&&d.call(l,y)}}o.exports=p},879:function(o,n){n.node=function(i){return i!==void 0&&i instanceof HTMLElement&&i.nodeType===1},n.nodeList=function(i){var a=Object.prototype.toString.call(i);return i!==void 0&&(a==="[object NodeList]"||a==="[object HTMLCollection]")&&"length"in i&&(i.length===0||n.node(i[0]))},n.string=function(i){return typeof i=="string"||i instanceof String},n.fn=function(i){var a=Object.prototype.toString.call(i);return a==="[object Function]"}},370:function(o,n,i){var a=i(879),s=i(438);function p(u,d,y){if(!u&&!d&&!y)throw new Error("Missing required arguments");if(!a.string(d))throw new TypeError("Second argument must be a String");if(!a.fn(y))throw new TypeError("Third argument must be a Function");if(a.node(u))return c(u,d,y);if(a.nodeList(u))return l(u,d,y);if(a.string(u))return f(u,d,y);throw new 
TypeError("First argument must be a String, HTMLElement, HTMLCollection, or NodeList")}function c(u,d,y){return u.addEventListener(d,y),{destroy:function(){u.removeEventListener(d,y)}}}function l(u,d,y){return Array.prototype.forEach.call(u,function(L){L.addEventListener(d,y)}),{destroy:function(){Array.prototype.forEach.call(u,function(L){L.removeEventListener(d,y)})}}}function f(u,d,y){return s(document.body,u,d,y)}o.exports=p},817:function(o){function n(i){var a;if(i.nodeName==="SELECT")i.focus(),a=i.value;else if(i.nodeName==="INPUT"||i.nodeName==="TEXTAREA"){var s=i.hasAttribute("readonly");s||i.setAttribute("readonly",""),i.select(),i.setSelectionRange(0,i.value.length),s||i.removeAttribute("readonly"),a=i.value}else{i.hasAttribute("contenteditable")&&i.focus();var p=window.getSelection(),c=document.createRange();c.selectNodeContents(i),p.removeAllRanges(),p.addRange(c),a=p.toString()}return a}o.exports=n},279:function(o){function n(){}n.prototype={on:function(i,a,s){var p=this.e||(this.e={});return(p[i]||(p[i]=[])).push({fn:a,ctx:s}),this},once:function(i,a,s){var p=this;function c(){p.off(i,c),a.apply(s,arguments)}return c._=a,this.on(i,c,s)},emit:function(i){var a=[].slice.call(arguments,1),s=((this.e||(this.e={}))[i]||[]).slice(),p=0,c=s.length;for(p;p0&&i[i.length-1])&&(c[0]===6||c[0]===2)){r=0;continue}if(c[0]===3&&(!i||c[1]>i[0]&&c[1]=e.length&&(e=void 0),{value:e&&e[o++],done:!e}}};throw new TypeError(t?"Object is not iterable.":"Symbol.iterator is not defined.")}function N(e,t){var r=typeof Symbol=="function"&&e[Symbol.iterator];if(!r)return e;var o=r.call(e),n,i=[],a;try{for(;(t===void 0||t-- >0)&&!(n=o.next()).done;)i.push(n.value)}catch(s){a={error:s}}finally{try{n&&!n.done&&(r=o.return)&&r.call(o)}finally{if(a)throw a.error}}return i}function q(e,t,r){if(r||arguments.length===2)for(var o=0,n=t.length,i;o1||p(d,L)})},y&&(n[d]=y(n[d])))}function p(d,y){try{c(o[d](y))}catch(L){u(i[0][3],L)}}function c(d){d.value instanceof 
nt?Promise.resolve(d.value.v).then(l,f):u(i[0][2],d)}function l(d){p("next",d)}function f(d){p("throw",d)}function u(d,y){d(y),i.shift(),i.length&&p(i[0][0],i[0][1])}}function uo(e){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var t=e[Symbol.asyncIterator],r;return t?t.call(e):(e=typeof he=="function"?he(e):e[Symbol.iterator](),r={},o("next"),o("throw"),o("return"),r[Symbol.asyncIterator]=function(){return this},r);function o(i){r[i]=e[i]&&function(a){return new Promise(function(s,p){a=e[i](a),n(s,p,a.done,a.value)})}}function n(i,a,s,p){Promise.resolve(p).then(function(c){i({value:c,done:s})},a)}}function H(e){return typeof e=="function"}function ut(e){var t=function(o){Error.call(o),o.stack=new Error().stack},r=e(t);return r.prototype=Object.create(Error.prototype),r.prototype.constructor=r,r}var zt=ut(function(e){return function(r){e(this),this.message=r?r.length+` errors occurred during unsubscription: +`+r.map(function(o,n){return n+1+") "+o.toString()}).join(` + `):"",this.name="UnsubscriptionError",this.errors=r}});function Qe(e,t){if(e){var r=e.indexOf(t);0<=r&&e.splice(r,1)}}var Ue=function(){function e(t){this.initialTeardown=t,this.closed=!1,this._parentage=null,this._finalizers=null}return e.prototype.unsubscribe=function(){var t,r,o,n,i;if(!this.closed){this.closed=!0;var a=this._parentage;if(a)if(this._parentage=null,Array.isArray(a))try{for(var s=he(a),p=s.next();!p.done;p=s.next()){var c=p.value;c.remove(this)}}catch(L){t={error:L}}finally{try{p&&!p.done&&(r=s.return)&&r.call(s)}finally{if(t)throw t.error}}else a.remove(this);var l=this.initialTeardown;if(H(l))try{l()}catch(L){i=L instanceof zt?L.errors:[L]}var f=this._finalizers;if(f){this._finalizers=null;try{for(var u=he(f),d=u.next();!d.done;d=u.next()){var y=d.value;try{ho(y)}catch(L){i=i!=null?i:[],L instanceof zt?i=q(q([],N(i)),N(L.errors)):i.push(L)}}}catch(L){o={error:L}}finally{try{d&&!d.done&&(n=u.return)&&n.call(u)}finally{if(o)throw 
o.error}}}if(i)throw new zt(i)}},e.prototype.add=function(t){var r;if(t&&t!==this)if(this.closed)ho(t);else{if(t instanceof e){if(t.closed||t._hasParent(this))return;t._addParent(this)}(this._finalizers=(r=this._finalizers)!==null&&r!==void 0?r:[]).push(t)}},e.prototype._hasParent=function(t){var r=this._parentage;return r===t||Array.isArray(r)&&r.includes(t)},e.prototype._addParent=function(t){var r=this._parentage;this._parentage=Array.isArray(r)?(r.push(t),r):r?[r,t]:t},e.prototype._removeParent=function(t){var r=this._parentage;r===t?this._parentage=null:Array.isArray(r)&&Qe(r,t)},e.prototype.remove=function(t){var r=this._finalizers;r&&Qe(r,t),t instanceof e&&t._removeParent(this)},e.EMPTY=function(){var t=new e;return t.closed=!0,t}(),e}();var Tr=Ue.EMPTY;function qt(e){return e instanceof Ue||e&&"closed"in e&&H(e.remove)&&H(e.add)&&H(e.unsubscribe)}function ho(e){H(e)?e():e.unsubscribe()}var Pe={onUnhandledError:null,onStoppedNotification:null,Promise:void 0,useDeprecatedSynchronousErrorHandling:!1,useDeprecatedNextContext:!1};var dt={setTimeout:function(e,t){for(var r=[],o=2;o0},enumerable:!1,configurable:!0}),t.prototype._trySubscribe=function(r){return this._throwIfClosed(),e.prototype._trySubscribe.call(this,r)},t.prototype._subscribe=function(r){return this._throwIfClosed(),this._checkFinalizedStatuses(r),this._innerSubscribe(r)},t.prototype._innerSubscribe=function(r){var o=this,n=this,i=n.hasError,a=n.isStopped,s=n.observers;return i||a?Tr:(this.currentObservers=null,s.push(r),new Ue(function(){o.currentObservers=null,Qe(s,r)}))},t.prototype._checkFinalizedStatuses=function(r){var o=this,n=o.hasError,i=o.thrownError,a=o.isStopped;n?r.error(i):a&&r.complete()},t.prototype.asObservable=function(){var r=new j;return r.source=this,r},t.create=function(r,o){return new To(r,o)},t}(j);var To=function(e){oe(t,e);function t(r,o){var n=e.call(this)||this;return n.destination=r,n.source=o,n}return t.prototype.next=function(r){var 
o,n;(n=(o=this.destination)===null||o===void 0?void 0:o.next)===null||n===void 0||n.call(o,r)},t.prototype.error=function(r){var o,n;(n=(o=this.destination)===null||o===void 0?void 0:o.error)===null||n===void 0||n.call(o,r)},t.prototype.complete=function(){var r,o;(o=(r=this.destination)===null||r===void 0?void 0:r.complete)===null||o===void 0||o.call(r)},t.prototype._subscribe=function(r){var o,n;return(n=(o=this.source)===null||o===void 0?void 0:o.subscribe(r))!==null&&n!==void 0?n:Tr},t}(g);var _r=function(e){oe(t,e);function t(r){var o=e.call(this)||this;return o._value=r,o}return Object.defineProperty(t.prototype,"value",{get:function(){return this.getValue()},enumerable:!1,configurable:!0}),t.prototype._subscribe=function(r){var o=e.prototype._subscribe.call(this,r);return!o.closed&&r.next(this._value),o},t.prototype.getValue=function(){var r=this,o=r.hasError,n=r.thrownError,i=r._value;if(o)throw n;return this._throwIfClosed(),i},t.prototype.next=function(r){e.prototype.next.call(this,this._value=r)},t}(g);var At={now:function(){return(At.delegate||Date).now()},delegate:void 0};var Ct=function(e){oe(t,e);function t(r,o,n){r===void 0&&(r=1/0),o===void 0&&(o=1/0),n===void 0&&(n=At);var i=e.call(this)||this;return i._bufferSize=r,i._windowTime=o,i._timestampProvider=n,i._buffer=[],i._infiniteTimeWindow=!0,i._infiniteTimeWindow=o===1/0,i._bufferSize=Math.max(1,r),i._windowTime=Math.max(1,o),i}return t.prototype.next=function(r){var o=this,n=o.isStopped,i=o._buffer,a=o._infiniteTimeWindow,s=o._timestampProvider,p=o._windowTime;n||(i.push(r),!a&&i.push(s.now()+p)),this._trimBuffer(),e.prototype.next.call(this,r)},t.prototype._subscribe=function(r){this._throwIfClosed(),this._trimBuffer();for(var o=this._innerSubscribe(r),n=this,i=n._infiniteTimeWindow,a=n._buffer,s=a.slice(),p=0;p0?e.prototype.schedule.call(this,r,o):(this.delay=o,this.state=r,this.scheduler.flush(this),this)},t.prototype.execute=function(r,o){return 
o>0||this.closed?e.prototype.execute.call(this,r,o):this._execute(r,o)},t.prototype.requestAsyncId=function(r,o,n){return n===void 0&&(n=0),n!=null&&n>0||n==null&&this.delay>0?e.prototype.requestAsyncId.call(this,r,o,n):(r.flush(this),0)},t}(gt);var Lo=function(e){oe(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t}(yt);var kr=new Lo(Oo);var Mo=function(e){oe(t,e);function t(r,o){var n=e.call(this,r,o)||this;return n.scheduler=r,n.work=o,n}return t.prototype.requestAsyncId=function(r,o,n){return n===void 0&&(n=0),n!==null&&n>0?e.prototype.requestAsyncId.call(this,r,o,n):(r.actions.push(this),r._scheduled||(r._scheduled=vt.requestAnimationFrame(function(){return r.flush(void 0)})))},t.prototype.recycleAsyncId=function(r,o,n){var i;if(n===void 0&&(n=0),n!=null?n>0:this.delay>0)return e.prototype.recycleAsyncId.call(this,r,o,n);var a=r.actions;o!=null&&((i=a[a.length-1])===null||i===void 0?void 0:i.id)!==o&&(vt.cancelAnimationFrame(o),r._scheduled=void 0)},t}(gt);var _o=function(e){oe(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t.prototype.flush=function(r){this._active=!0;var o=this._scheduled;this._scheduled=void 0;var n=this.actions,i;r=r||n.shift();do if(i=r.execute(r.state,r.delay))break;while((r=n[0])&&r.id===o&&n.shift());if(this._active=!1,i){for(;(r=n[0])&&r.id===o&&n.shift();)r.unsubscribe();throw i}},t}(yt);var me=new _o(Mo);var S=new j(function(e){return e.complete()});function Yt(e){return e&&H(e.schedule)}function Hr(e){return e[e.length-1]}function Xe(e){return H(Hr(e))?e.pop():void 0}function ke(e){return Yt(Hr(e))?e.pop():void 0}function Bt(e,t){return typeof Hr(e)=="number"?e.pop():t}var xt=function(e){return e&&typeof e.length=="number"&&typeof e!="function"};function Gt(e){return H(e==null?void 0:e.then)}function Jt(e){return H(e[bt])}function Xt(e){return Symbol.asyncIterator&&H(e==null?void 0:e[Symbol.asyncIterator])}function Zt(e){return new TypeError("You provided "+(e!==null&&typeof 
e=="object"?"an invalid object":"'"+e+"'")+" where a stream was expected. You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.")}function Zi(){return typeof Symbol!="function"||!Symbol.iterator?"@@iterator":Symbol.iterator}var er=Zi();function tr(e){return H(e==null?void 0:e[er])}function rr(e){return fo(this,arguments,function(){var r,o,n,i;return Nt(this,function(a){switch(a.label){case 0:r=e.getReader(),a.label=1;case 1:a.trys.push([1,,9,10]),a.label=2;case 2:return[4,nt(r.read())];case 3:return o=a.sent(),n=o.value,i=o.done,i?[4,nt(void 0)]:[3,5];case 4:return[2,a.sent()];case 5:return[4,nt(n)];case 6:return[4,a.sent()];case 7:return a.sent(),[3,2];case 8:return[3,10];case 9:return r.releaseLock(),[7];case 10:return[2]}})})}function or(e){return H(e==null?void 0:e.getReader)}function U(e){if(e instanceof j)return e;if(e!=null){if(Jt(e))return ea(e);if(xt(e))return ta(e);if(Gt(e))return ra(e);if(Xt(e))return Ao(e);if(tr(e))return oa(e);if(or(e))return na(e)}throw Zt(e)}function ea(e){return new j(function(t){var r=e[bt]();if(H(r.subscribe))return r.subscribe(t);throw new TypeError("Provided object does not correctly implement Symbol.observable")})}function ta(e){return new j(function(t){for(var r=0;r=2;return function(o){return o.pipe(e?b(function(n,i){return e(n,i,o)}):le,Te(1),r?De(t):Qo(function(){return new ir}))}}function jr(e){return e<=0?function(){return S}:E(function(t,r){var o=[];t.subscribe(T(r,function(n){o.push(n),e=2,!0))}function pe(e){e===void 0&&(e={});var t=e.connector,r=t===void 0?function(){return new g}:t,o=e.resetOnError,n=o===void 0?!0:o,i=e.resetOnComplete,a=i===void 0?!0:i,s=e.resetOnRefCountZero,p=s===void 0?!0:s;return function(c){var l,f,u,d=0,y=!1,L=!1,X=function(){f==null||f.unsubscribe(),f=void 0},te=function(){X(),l=u=void 0,y=L=!1},J=function(){var k=l;te(),k==null||k.unsubscribe()};return E(function(k,ft){d++,!L&&!y&&X();var 
qe=u=u!=null?u:r();ft.add(function(){d--,d===0&&!L&&!y&&(f=Ur(J,p))}),qe.subscribe(ft),!l&&d>0&&(l=new at({next:function(Fe){return qe.next(Fe)},error:function(Fe){L=!0,X(),f=Ur(te,n,Fe),qe.error(Fe)},complete:function(){y=!0,X(),f=Ur(te,a),qe.complete()}}),U(k).subscribe(l))})(c)}}function Ur(e,t){for(var r=[],o=2;oe.next(document)),e}function P(e,t=document){return Array.from(t.querySelectorAll(e))}function R(e,t=document){let r=fe(e,t);if(typeof r=="undefined")throw new ReferenceError(`Missing element: expected "${e}" to be present`);return r}function fe(e,t=document){return t.querySelector(e)||void 0}function Ie(){var e,t,r,o;return(o=(r=(t=(e=document.activeElement)==null?void 0:e.shadowRoot)==null?void 0:t.activeElement)!=null?r:document.activeElement)!=null?o:void 0}var wa=O(h(document.body,"focusin"),h(document.body,"focusout")).pipe(_e(1),Q(void 0),m(()=>Ie()||document.body),G(1));function et(e){return wa.pipe(m(t=>e.contains(t)),K())}function $t(e,t){return C(()=>O(h(e,"mouseenter").pipe(m(()=>!0)),h(e,"mouseleave").pipe(m(()=>!1))).pipe(t?Ht(r=>Le(+!r*t)):le,Q(e.matches(":hover"))))}function Jo(e,t){if(typeof t=="string"||typeof t=="number")e.innerHTML+=t.toString();else if(t instanceof Node)e.appendChild(t);else if(Array.isArray(t))for(let r of t)Jo(e,r)}function x(e,t,...r){let o=document.createElement(e);if(t)for(let n of Object.keys(t))typeof t[n]!="undefined"&&(typeof t[n]!="boolean"?o.setAttribute(n,t[n]):o.setAttribute(n,""));for(let n of r)Jo(o,n);return o}function sr(e){if(e>999){let t=+((e-950)%1e3>99);return`${((e+1e-6)/1e3).toFixed(t)}k`}else return e.toString()}function Tt(e){let t=x("script",{src:e});return C(()=>(document.head.appendChild(t),O(h(t,"load"),h(t,"error").pipe(v(()=>$r(()=>new ReferenceError(`Invalid script: ${e}`))))).pipe(m(()=>{}),_(()=>document.head.removeChild(t)),Te(1))))}var Xo=new g,Ta=C(()=>typeof ResizeObserver=="undefined"?Tt("https://unpkg.com/resize-observer-polyfill"):I(void 0)).pipe(m(()=>new 
ResizeObserver(e=>e.forEach(t=>Xo.next(t)))),v(e=>O(Ye,I(e)).pipe(_(()=>e.disconnect()))),G(1));function ce(e){return{width:e.offsetWidth,height:e.offsetHeight}}function ge(e){let t=e;for(;t.clientWidth===0&&t.parentElement;)t=t.parentElement;return Ta.pipe(w(r=>r.observe(t)),v(r=>Xo.pipe(b(o=>o.target===t),_(()=>r.unobserve(t)))),m(()=>ce(e)),Q(ce(e)))}function St(e){return{width:e.scrollWidth,height:e.scrollHeight}}function cr(e){let t=e.parentElement;for(;t&&(e.scrollWidth<=t.scrollWidth&&e.scrollHeight<=t.scrollHeight);)t=(e=t).parentElement;return t?e:void 0}function Zo(e){let t=[],r=e.parentElement;for(;r;)(e.clientWidth>r.clientWidth||e.clientHeight>r.clientHeight)&&t.push(r),r=(e=r).parentElement;return t.length===0&&t.push(document.documentElement),t}function Ve(e){return{x:e.offsetLeft,y:e.offsetTop}}function en(e){let t=e.getBoundingClientRect();return{x:t.x+window.scrollX,y:t.y+window.scrollY}}function tn(e){return O(h(window,"load"),h(window,"resize")).pipe(Me(0,me),m(()=>Ve(e)),Q(Ve(e)))}function pr(e){return{x:e.scrollLeft,y:e.scrollTop}}function Ne(e){return O(h(e,"scroll"),h(window,"scroll"),h(window,"resize")).pipe(Me(0,me),m(()=>pr(e)),Q(pr(e)))}var rn=new g,Sa=C(()=>I(new IntersectionObserver(e=>{for(let t of e)rn.next(t)},{threshold:0}))).pipe(v(e=>O(Ye,I(e)).pipe(_(()=>e.disconnect()))),G(1));function tt(e){return Sa.pipe(w(t=>t.observe(e)),v(t=>rn.pipe(b(({target:r})=>r===e),_(()=>t.unobserve(e)),m(({isIntersecting:r})=>r))))}function on(e,t=16){return Ne(e).pipe(m(({y:r})=>{let o=ce(e),n=St(e);return r>=n.height-o.height-t}),K())}var lr={drawer:R("[data-md-toggle=drawer]"),search:R("[data-md-toggle=search]")};function nn(e){return lr[e].checked}function Je(e,t){lr[e].checked!==t&&lr[e].click()}function ze(e){let t=lr[e];return h(t,"change").pipe(m(()=>t.checked),Q(t.checked))}function Oa(e,t){switch(e.constructor){case HTMLInputElement:return e.type==="radio"?/^Arrow/.test(t):!0;case HTMLSelectElement:case 
HTMLTextAreaElement:return!0;default:return e.isContentEditable}}function La(){return O(h(window,"compositionstart").pipe(m(()=>!0)),h(window,"compositionend").pipe(m(()=>!1))).pipe(Q(!1))}function an(){let e=h(window,"keydown").pipe(b(t=>!(t.metaKey||t.ctrlKey)),m(t=>({mode:nn("search")?"search":"global",type:t.key,claim(){t.preventDefault(),t.stopPropagation()}})),b(({mode:t,type:r})=>{if(t==="global"){let o=Ie();if(typeof o!="undefined")return!Oa(o,r)}return!0}),pe());return La().pipe(v(t=>t?S:e))}function ye(){return new URL(location.href)}function lt(e,t=!1){if(B("navigation.instant")&&!t){let r=x("a",{href:e.href});document.body.appendChild(r),r.click(),r.remove()}else location.href=e.href}function sn(){return new g}function cn(){return location.hash.slice(1)}function pn(e){let t=x("a",{href:e});t.addEventListener("click",r=>r.stopPropagation()),t.click()}function Ma(e){return O(h(window,"hashchange"),e).pipe(m(cn),Q(cn()),b(t=>t.length>0),G(1))}function ln(e){return Ma(e).pipe(m(t=>fe(`[id="${t}"]`)),b(t=>typeof t!="undefined"))}function Pt(e){let t=matchMedia(e);return ar(r=>t.addListener(()=>r(t.matches))).pipe(Q(t.matches))}function mn(){let e=matchMedia("print");return O(h(window,"beforeprint").pipe(m(()=>!0)),h(window,"afterprint").pipe(m(()=>!1))).pipe(Q(e.matches))}function Nr(e,t){return e.pipe(v(r=>r?t():S))}function zr(e,t){return new j(r=>{let o=new XMLHttpRequest;return o.open("GET",`${e}`),o.responseType="blob",o.addEventListener("load",()=>{o.status>=200&&o.status<300?(r.next(o.response),r.complete()):r.error(new Error(o.statusText))}),o.addEventListener("error",()=>{r.error(new Error("Network error"))}),o.addEventListener("abort",()=>{r.complete()}),typeof(t==null?void 0:t.progress$)!="undefined"&&(o.addEventListener("progress",n=>{var i;if(n.lengthComputable)t.progress$.next(n.loaded/n.total*100);else{let 
a=(i=o.getResponseHeader("Content-Length"))!=null?i:0;t.progress$.next(n.loaded/+a*100)}}),t.progress$.next(5)),o.send(),()=>o.abort()})}function je(e,t){return zr(e,t).pipe(v(r=>r.text()),m(r=>JSON.parse(r)),G(1))}function fn(e,t){let r=new DOMParser;return zr(e,t).pipe(v(o=>o.text()),m(o=>r.parseFromString(o,"text/html")),G(1))}function un(e,t){let r=new DOMParser;return zr(e,t).pipe(v(o=>o.text()),m(o=>r.parseFromString(o,"text/xml")),G(1))}function dn(){return{x:Math.max(0,scrollX),y:Math.max(0,scrollY)}}function hn(){return O(h(window,"scroll",{passive:!0}),h(window,"resize",{passive:!0})).pipe(m(dn),Q(dn()))}function bn(){return{width:innerWidth,height:innerHeight}}function vn(){return h(window,"resize",{passive:!0}).pipe(m(bn),Q(bn()))}function gn(){return z([hn(),vn()]).pipe(m(([e,t])=>({offset:e,size:t})),G(1))}function mr(e,{viewport$:t,header$:r}){let o=t.pipe(ee("size")),n=z([o,r]).pipe(m(()=>Ve(e)));return z([r,t,n]).pipe(m(([{height:i},{offset:a,size:s},{x:p,y:c}])=>({offset:{x:a.x-p,y:a.y-c+i},size:s})))}function _a(e){return h(e,"message",t=>t.data)}function Aa(e){let t=new g;return t.subscribe(r=>e.postMessage(r)),t}function yn(e,t=new Worker(e)){let r=_a(t),o=Aa(t),n=new g;n.subscribe(o);let i=o.pipe(Z(),ie(!0));return n.pipe(Z(),Re(r.pipe(W(i))),pe())}var Ca=R("#__config"),Ot=JSON.parse(Ca.textContent);Ot.base=`${new URL(Ot.base,ye())}`;function xe(){return Ot}function B(e){return Ot.features.includes(e)}function Ee(e,t){return typeof t!="undefined"?Ot.translations[e].replace("#",t.toString()):Ot.translations[e]}function Se(e,t=document){return R(`[data-md-component=${e}]`,t)}function ae(e,t=document){return P(`[data-md-component=${e}]`,t)}function ka(e){let t=R(".md-typeset > :first-child",e);return h(t,"click",{once:!0}).pipe(m(()=>R(".md-typeset",e)),m(r=>({hash:__md_hash(r.innerHTML)})))}function xn(e){if(!B("announce.dismiss")||!e.childElementCount)return S;if(!e.hidden){let 
t=R(".md-typeset",e);__md_hash(t.innerHTML)===__md_get("__announce")&&(e.hidden=!0)}return C(()=>{let t=new g;return t.subscribe(({hash:r})=>{e.hidden=!0,__md_set("__announce",r)}),ka(e).pipe(w(r=>t.next(r)),_(()=>t.complete()),m(r=>$({ref:e},r)))})}function Ha(e,{target$:t}){return t.pipe(m(r=>({hidden:r!==e})))}function En(e,t){let r=new g;return r.subscribe(({hidden:o})=>{e.hidden=o}),Ha(e,t).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))}function Rt(e,t){return t==="inline"?x("div",{class:"md-tooltip md-tooltip--inline",id:e,role:"tooltip"},x("div",{class:"md-tooltip__inner md-typeset"})):x("div",{class:"md-tooltip",id:e,role:"tooltip"},x("div",{class:"md-tooltip__inner md-typeset"}))}function wn(...e){return x("div",{class:"md-tooltip2",role:"tooltip"},x("div",{class:"md-tooltip2__inner md-typeset"},e))}function Tn(e,t){if(t=t?`${t}_annotation_${e}`:void 0,t){let r=t?`#${t}`:void 0;return x("aside",{class:"md-annotation",tabIndex:0},Rt(t),x("a",{href:r,class:"md-annotation__index",tabIndex:-1},x("span",{"data-md-annotation-id":e})))}else return x("aside",{class:"md-annotation",tabIndex:0},Rt(t),x("span",{class:"md-annotation__index",tabIndex:-1},x("span",{"data-md-annotation-id":e})))}function Sn(e){return x("button",{class:"md-clipboard md-icon",title:Ee("clipboard.copy"),"data-clipboard-target":`#${e} > code`})}var Ln=Mt(qr());function Qr(e,t){let r=t&2,o=t&1,n=Object.keys(e.terms).filter(p=>!e.terms[p]).reduce((p,c)=>[...p,x("del",null,(0,Ln.default)(c))," "],[]).slice(0,-1),i=xe(),a=new URL(e.location,i.base);B("search.highlight")&&a.searchParams.set("h",Object.entries(e.terms).filter(([,p])=>p).reduce((p,[c])=>`${p} ${c}`.trim(),""));let{tags:s}=xe();return x("a",{href:`${a}`,class:"md-search-result__link",tabIndex:-1},x("article",{class:"md-search-result__article md-typeset","data-md-score":e.score.toFixed(2)},r>0&&x("div",{class:"md-search-result__icon 
md-icon"}),r>0&&x("h1",null,e.title),r<=0&&x("h2",null,e.title),o>0&&e.text.length>0&&e.text,e.tags&&x("nav",{class:"md-tags"},e.tags.map(p=>{let c=s?p in s?`md-tag-icon md-tag--${s[p]}`:"md-tag-icon":"";return x("span",{class:`md-tag ${c}`},p)})),o>0&&n.length>0&&x("p",{class:"md-search-result__terms"},Ee("search.result.term.missing"),": ",...n)))}function Mn(e){let t=e[0].score,r=[...e],o=xe(),n=r.findIndex(l=>!`${new URL(l.location,o.base)}`.includes("#")),[i]=r.splice(n,1),a=r.findIndex(l=>l.scoreQr(l,1)),...p.length?[x("details",{class:"md-search-result__more"},x("summary",{tabIndex:-1},x("div",null,p.length>0&&p.length===1?Ee("search.result.more.one"):Ee("search.result.more.other",p.length))),...p.map(l=>Qr(l,1)))]:[]];return x("li",{class:"md-search-result__item"},c)}function _n(e){return x("ul",{class:"md-source__facts"},Object.entries(e).map(([t,r])=>x("li",{class:`md-source__fact md-source__fact--${t}`},typeof r=="number"?sr(r):r)))}function Kr(e){let t=`tabbed-control tabbed-control--${e}`;return x("div",{class:t,hidden:!0},x("button",{class:"tabbed-button",tabIndex:-1,"aria-hidden":"true"}))}function An(e){return x("div",{class:"md-typeset__scrollwrap"},x("div",{class:"md-typeset__table"},e))}function Ra(e){var o;let t=xe(),r=new URL(`../${e.version}/`,t.base);return x("li",{class:"md-version__item"},x("a",{href:`${r}`,class:"md-version__link"},e.title,((o=t.version)==null?void 0:o.alias)&&e.aliases.length>0&&x("span",{class:"md-version__alias"},e.aliases[0])))}function Cn(e,t){var o;let r=xe();return e=e.filter(n=>{var i;return!((i=n.properties)!=null&&i.hidden)}),x("div",{class:"md-version"},x("button",{class:"md-version__current","aria-label":Ee("select.version")},t.title,((o=r.version)==null?void 0:o.alias)&&t.aliases.length>0&&x("span",{class:"md-version__alias"},t.aliases[0])),x("ul",{class:"md-version__list"},e.map(Ra)))}var Ia=0;function ja(e){let 
t=z([et(e),$t(e)]).pipe(m(([o,n])=>o||n),K()),r=C(()=>Zo(e)).pipe(ne(Ne),pt(1),He(t),m(()=>en(e)));return t.pipe(Ae(o=>o),v(()=>z([t,r])),m(([o,n])=>({active:o,offset:n})),pe())}function Fa(e,t){let{content$:r,viewport$:o}=t,n=`__tooltip2_${Ia++}`;return C(()=>{let i=new g,a=new _r(!1);i.pipe(Z(),ie(!1)).subscribe(a);let s=a.pipe(Ht(c=>Le(+!c*250,kr)),K(),v(c=>c?r:S),w(c=>c.id=n),pe());z([i.pipe(m(({active:c})=>c)),s.pipe(v(c=>$t(c,250)),Q(!1))]).pipe(m(c=>c.some(l=>l))).subscribe(a);let p=a.pipe(b(c=>c),re(s,o),m(([c,l,{size:f}])=>{let u=e.getBoundingClientRect(),d=u.width/2;if(l.role==="tooltip")return{x:d,y:8+u.height};if(u.y>=f.height/2){let{height:y}=ce(l);return{x:d,y:-16-y}}else return{x:d,y:16+u.height}}));return z([s,i,p]).subscribe(([c,{offset:l},f])=>{c.style.setProperty("--md-tooltip-host-x",`${l.x}px`),c.style.setProperty("--md-tooltip-host-y",`${l.y}px`),c.style.setProperty("--md-tooltip-x",`${f.x}px`),c.style.setProperty("--md-tooltip-y",`${f.y}px`),c.classList.toggle("md-tooltip2--top",f.y<0),c.classList.toggle("md-tooltip2--bottom",f.y>=0)}),a.pipe(b(c=>c),re(s,(c,l)=>l),b(c=>c.role==="tooltip")).subscribe(c=>{let l=ce(R(":scope > *",c));c.style.setProperty("--md-tooltip-width",`${l.width}px`),c.style.setProperty("--md-tooltip-tail","0px")}),a.pipe(K(),ve(me),re(s)).subscribe(([c,l])=>{l.classList.toggle("md-tooltip2--active",c)}),z([a.pipe(b(c=>c)),s]).subscribe(([c,l])=>{l.role==="dialog"?(e.setAttribute("aria-controls",n),e.setAttribute("aria-haspopup","dialog")):e.setAttribute("aria-describedby",n)}),a.pipe(b(c=>!c)).subscribe(()=>{e.removeAttribute("aria-controls"),e.removeAttribute("aria-describedby"),e.removeAttribute("aria-haspopup")}),ja(e).pipe(w(c=>i.next(c)),_(()=>i.complete()),m(c=>$({ref:e},c)))})}function mt(e,{viewport$:t},r=document.body){return Fa(e,{content$:new j(o=>{let n=e.title,i=wn(n);return o.next(i),e.removeAttribute("title"),r.append(i),()=>{i.remove(),e.setAttribute("title",n)}}),viewport$:t})}function Ua(e,t){let 
r=C(()=>z([tn(e),Ne(t)])).pipe(m(([{x:o,y:n},i])=>{let{width:a,height:s}=ce(e);return{x:o-i.x+a/2,y:n-i.y+s/2}}));return et(e).pipe(v(o=>r.pipe(m(n=>({active:o,offset:n})),Te(+!o||1/0))))}function kn(e,t,{target$:r}){let[o,n]=Array.from(e.children);return C(()=>{let i=new g,a=i.pipe(Z(),ie(!0));return i.subscribe({next({offset:s}){e.style.setProperty("--md-tooltip-x",`${s.x}px`),e.style.setProperty("--md-tooltip-y",`${s.y}px`)},complete(){e.style.removeProperty("--md-tooltip-x"),e.style.removeProperty("--md-tooltip-y")}}),tt(e).pipe(W(a)).subscribe(s=>{e.toggleAttribute("data-md-visible",s)}),O(i.pipe(b(({active:s})=>s)),i.pipe(_e(250),b(({active:s})=>!s))).subscribe({next({active:s}){s?e.prepend(o):o.remove()},complete(){e.prepend(o)}}),i.pipe(Me(16,me)).subscribe(({active:s})=>{o.classList.toggle("md-tooltip--active",s)}),i.pipe(pt(125,me),b(()=>!!e.offsetParent),m(()=>e.offsetParent.getBoundingClientRect()),m(({x:s})=>s)).subscribe({next(s){s?e.style.setProperty("--md-tooltip-0",`${-s}px`):e.style.removeProperty("--md-tooltip-0")},complete(){e.style.removeProperty("--md-tooltip-0")}}),h(n,"click").pipe(W(a),b(s=>!(s.metaKey||s.ctrlKey))).subscribe(s=>{s.stopPropagation(),s.preventDefault()}),h(n,"mousedown").pipe(W(a),re(i)).subscribe(([s,{active:p}])=>{var c;if(s.button!==0||s.metaKey||s.ctrlKey)s.preventDefault();else if(p){s.preventDefault();let l=e.parentElement.closest(".md-annotation");l instanceof HTMLElement?l.focus():(c=Ie())==null||c.blur()}}),r.pipe(W(a),b(s=>s===o),Ge(125)).subscribe(()=>e.focus()),Ua(e,t).pipe(w(s=>i.next(s)),_(()=>i.complete()),m(s=>$({ref:e},s)))})}function Wa(e){return e.tagName==="CODE"?P(".c, .c1, .cm",e):[e]}function Da(e){let t=[];for(let r of Wa(e)){let o=[],n=document.createNodeIterator(r,NodeFilter.SHOW_TEXT);for(let i=n.nextNode();i;i=n.nextNode())o.push(i);for(let i of o){let a;for(;a=/(\(\d+\))(!)?/.exec(i.textContent);){let[,s,p]=a;if(typeof p=="undefined"){let 
c=i.splitText(a.index);i=c.splitText(s.length),t.push(c)}else{i.textContent=s,t.push(i);break}}}}return t}function Hn(e,t){t.append(...Array.from(e.childNodes))}function fr(e,t,{target$:r,print$:o}){let n=t.closest("[id]"),i=n==null?void 0:n.id,a=new Map;for(let s of Da(t)){let[,p]=s.textContent.match(/\((\d+)\)/);fe(`:scope > li:nth-child(${p})`,e)&&(a.set(p,Tn(p,i)),s.replaceWith(a.get(p)))}return a.size===0?S:C(()=>{let s=new g,p=s.pipe(Z(),ie(!0)),c=[];for(let[l,f]of a)c.push([R(".md-typeset",f),R(`:scope > li:nth-child(${l})`,e)]);return o.pipe(W(p)).subscribe(l=>{e.hidden=!l,e.classList.toggle("md-annotation-list",l);for(let[f,u]of c)l?Hn(f,u):Hn(u,f)}),O(...[...a].map(([,l])=>kn(l,t,{target$:r}))).pipe(_(()=>s.complete()),pe())})}function $n(e){if(e.nextElementSibling){let t=e.nextElementSibling;if(t.tagName==="OL")return t;if(t.tagName==="P"&&!t.children.length)return $n(t)}}function Pn(e,t){return C(()=>{let r=$n(e);return typeof r!="undefined"?fr(r,e,t):S})}var Rn=Mt(Br());var Va=0;function In(e){if(e.nextElementSibling){let t=e.nextElementSibling;if(t.tagName==="OL")return t;if(t.tagName==="P"&&!t.children.length)return In(t)}}function Na(e){return ge(e).pipe(m(({width:t})=>({scrollable:St(e).width>t})),ee("scrollable"))}function jn(e,t){let{matches:r}=matchMedia("(hover)"),o=C(()=>{let n=new g,i=n.pipe(jr(1));n.subscribe(({scrollable:c})=>{c&&r?e.setAttribute("tabindex","0"):e.removeAttribute("tabindex")});let a=[];if(Rn.default.isSupported()&&(e.closest(".copy")||B("content.code.copy")&&!e.closest(".no-copy"))){let c=e.closest("pre");c.id=`__code_${Va++}`;let l=Sn(c.id);c.insertBefore(l,e),B("content.tooltips")&&a.push(mt(l,{viewport$}))}let s=e.closest(".highlight");if(s instanceof HTMLElement){let c=In(s);if(typeof c!="undefined"&&(s.classList.contains("annotate")||B("content.code.annotate"))){let l=fr(c,e,t);a.push(ge(s).pipe(W(i),m(({width:f,height:u})=>f&&u),K(),v(f=>f?l:S)))}}return P(":scope > 
span[id]",e).length&&e.classList.add("md-code__content"),Na(e).pipe(w(c=>n.next(c)),_(()=>n.complete()),m(c=>$({ref:e},c)),Re(...a))});return B("content.lazy")?tt(e).pipe(b(n=>n),Te(1),v(()=>o)):o}function za(e,{target$:t,print$:r}){let o=!0;return O(t.pipe(m(n=>n.closest("details:not([open])")),b(n=>e===n),m(()=>({action:"open",reveal:!0}))),r.pipe(b(n=>n||!o),w(()=>o=e.open),m(n=>({action:n?"open":"close"}))))}function Fn(e,t){return C(()=>{let r=new g;return r.subscribe(({action:o,reveal:n})=>{e.toggleAttribute("open",o==="open"),n&&e.scrollIntoView()}),za(e,t).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))})}var Un=".node circle,.node ellipse,.node path,.node polygon,.node rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}marker{fill:var(--md-mermaid-edge-color)!important}.edgeLabel .label rect{fill:#0000}.flowchartTitleText{fill:var(--md-mermaid-label-fg-color)}.label{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.label foreignObject{line-height:normal;overflow:visible}.label div .edgeLabel{color:var(--md-mermaid-label-fg-color)}.edgeLabel,.edgeLabel p,.label div .edgeLabel{background-color:var(--md-mermaid-label-bg-color)}.edgeLabel,.edgeLabel p{fill:var(--md-mermaid-label-bg-color);color:var(--md-mermaid-edge-color)}.edgePath .path,.flowchart-link{stroke:var(--md-mermaid-edge-color);stroke-width:.05rem}.edgePath .arrowheadPath{fill:var(--md-mermaid-edge-color);stroke:none}.cluster rect{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}.cluster span{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}g #flowchart-circleEnd,g #flowchart-circleStart,g #flowchart-crossEnd,g #flowchart-crossStart,g #flowchart-pointEnd,g #flowchart-pointStart{stroke:none}.classDiagramTitleText{fill:var(--md-mermaid-label-fg-color)}g.classGroup line,g.classGroup 
rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.classGroup text{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.classLabel .box{fill:var(--md-mermaid-label-bg-color);background-color:var(--md-mermaid-label-bg-color);opacity:1}.classLabel .label{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.node .divider{stroke:var(--md-mermaid-node-fg-color)}.relation{stroke:var(--md-mermaid-edge-color)}.cardinality{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.cardinality text{fill:inherit!important}defs #classDiagram-compositionEnd,defs #classDiagram-compositionStart,defs #classDiagram-dependencyEnd,defs #classDiagram-dependencyStart,defs #classDiagram-extensionEnd,defs #classDiagram-extensionStart{fill:var(--md-mermaid-edge-color)!important;stroke:var(--md-mermaid-edge-color)!important}defs #classDiagram-aggregationEnd,defs #classDiagram-aggregationStart{fill:var(--md-mermaid-label-bg-color)!important;stroke:var(--md-mermaid-edge-color)!important}.statediagramTitleText{fill:var(--md-mermaid-label-fg-color)}g.stateGroup rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.stateGroup .state-title{fill:var(--md-mermaid-label-fg-color)!important;font-family:var(--md-mermaid-font-family)}g.stateGroup .composit{fill:var(--md-mermaid-label-bg-color)}.nodeLabel,.nodeLabel p{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}a .nodeLabel{text-decoration:underline}.node circle.state-end,.node circle.state-start,.start-state{fill:var(--md-mermaid-edge-color);stroke:none}.end-state-inner,.end-state-outer{fill:var(--md-mermaid-edge-color)}.end-state-inner,.node circle.state-end{stroke:var(--md-mermaid-label-bg-color)}.transition{stroke:var(--md-mermaid-edge-color)}[id^=state-fork] rect,[id^=state-join] 
rect{fill:var(--md-mermaid-edge-color)!important;stroke:none!important}.statediagram-cluster.statediagram-cluster .inner{fill:var(--md-default-bg-color)}.statediagram-cluster rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.statediagram-state rect.divider{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}defs #statediagram-barbEnd{stroke:var(--md-mermaid-edge-color)}.entityTitleText{fill:var(--md-mermaid-label-fg-color)}.attributeBoxEven,.attributeBoxOdd{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityBox{fill:var(--md-mermaid-label-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityLabel{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.relationshipLabelBox{fill:var(--md-mermaid-label-bg-color);fill-opacity:1;background-color:var(--md-mermaid-label-bg-color);opacity:1}.relationshipLabel{fill:var(--md-mermaid-label-fg-color)}.relationshipLine{stroke:var(--md-mermaid-edge-color)}defs #ONE_OR_MORE_END *,defs #ONE_OR_MORE_START *,defs #ONLY_ONE_END *,defs #ONLY_ONE_START *,defs #ZERO_OR_MORE_END *,defs #ZERO_OR_MORE_START *,defs #ZERO_OR_ONE_END *,defs #ZERO_OR_ONE_START *{stroke:var(--md-mermaid-edge-color)!important}defs #ZERO_OR_MORE_END circle,defs #ZERO_OR_MORE_START circle{fill:var(--md-mermaid-label-bg-color)}text:not([class]):last-child{fill:var(--md-mermaid-label-fg-color)}.actor{fill:var(--md-mermaid-sequence-actor-bg-color);stroke:var(--md-mermaid-sequence-actor-border-color)}text.actor>tspan{fill:var(--md-mermaid-sequence-actor-fg-color);font-family:var(--md-mermaid-font-family)}line{stroke:var(--md-mermaid-sequence-actor-line-color)}.actor-man circle,.actor-man 
line{fill:var(--md-mermaid-sequence-actorman-bg-color);stroke:var(--md-mermaid-sequence-actorman-line-color)}.messageLine0,.messageLine1{stroke:var(--md-mermaid-sequence-message-line-color)}.note{fill:var(--md-mermaid-sequence-note-bg-color);stroke:var(--md-mermaid-sequence-note-border-color)}.loopText,.loopText>tspan,.messageText,.noteText>tspan{stroke:none;font-family:var(--md-mermaid-font-family)!important}.messageText{fill:var(--md-mermaid-sequence-message-fg-color)}.loopText,.loopText>tspan{fill:var(--md-mermaid-sequence-loop-fg-color)}.noteText>tspan{fill:var(--md-mermaid-sequence-note-fg-color)}#arrowhead path{fill:var(--md-mermaid-sequence-message-line-color);stroke:none}.loopLine{fill:var(--md-mermaid-sequence-loop-bg-color);stroke:var(--md-mermaid-sequence-loop-border-color)}.labelBox{fill:var(--md-mermaid-sequence-label-bg-color);stroke:none}.labelText,.labelText>span{fill:var(--md-mermaid-sequence-label-fg-color);font-family:var(--md-mermaid-font-family)}.sequenceNumber{fill:var(--md-mermaid-sequence-number-fg-color)}rect.rect{fill:var(--md-mermaid-sequence-box-bg-color);stroke:none}rect.rect+text.text{fill:var(--md-mermaid-sequence-box-fg-color)}defs #sequencenumber{fill:var(--md-mermaid-sequence-number-bg-color)!important}";var Gr,Qa=0;function Ka(){return typeof mermaid=="undefined"||mermaid instanceof Element?Tt("https://unpkg.com/mermaid@11/dist/mermaid.min.js"):I(void 0)}function Wn(e){return e.classList.remove("mermaid"),Gr||(Gr=Ka().pipe(w(()=>mermaid.initialize({startOnLoad:!1,themeCSS:Un,sequence:{actorFontSize:"16px",messageFontSize:"16px",noteFontSize:"16px"}})),m(()=>{}),G(1))),Gr.subscribe(()=>co(this,null,function*(){e.classList.add("mermaid");let t=`__mermaid_${Qa++}`,r=x("div",{class:"mermaid"}),o=e.textContent,{svg:n,fn:i}=yield mermaid.render(t,o),a=r.attachShadow({mode:"closed"});a.innerHTML=n,e.replaceWith(r),i==null||i(a)})),Gr.pipe(m(()=>({ref:e})))}var Dn=x("table");function Vn(e){return 
e.replaceWith(Dn),Dn.replaceWith(An(e)),I({ref:e})}function Ya(e){let t=e.find(r=>r.checked)||e[0];return O(...e.map(r=>h(r,"change").pipe(m(()=>R(`label[for="${r.id}"]`))))).pipe(Q(R(`label[for="${t.id}"]`)),m(r=>({active:r})))}function Nn(e,{viewport$:t,target$:r}){let o=R(".tabbed-labels",e),n=P(":scope > input",e),i=Kr("prev");e.append(i);let a=Kr("next");return e.append(a),C(()=>{let s=new g,p=s.pipe(Z(),ie(!0));z([s,ge(e),tt(e)]).pipe(W(p),Me(1,me)).subscribe({next([{active:c},l]){let f=Ve(c),{width:u}=ce(c);e.style.setProperty("--md-indicator-x",`${f.x}px`),e.style.setProperty("--md-indicator-width",`${u}px`);let d=pr(o);(f.xd.x+l.width)&&o.scrollTo({left:Math.max(0,f.x-16),behavior:"smooth"})},complete(){e.style.removeProperty("--md-indicator-x"),e.style.removeProperty("--md-indicator-width")}}),z([Ne(o),ge(o)]).pipe(W(p)).subscribe(([c,l])=>{let f=St(o);i.hidden=c.x<16,a.hidden=c.x>f.width-l.width-16}),O(h(i,"click").pipe(m(()=>-1)),h(a,"click").pipe(m(()=>1))).pipe(W(p)).subscribe(c=>{let{width:l}=ce(o);o.scrollBy({left:l*c,behavior:"smooth"})}),r.pipe(W(p),b(c=>n.includes(c))).subscribe(c=>c.click()),o.classList.add("tabbed-labels--linked");for(let c of n){let l=R(`label[for="${c.id}"]`);l.replaceChildren(x("a",{href:`#${l.htmlFor}`,tabIndex:-1},...Array.from(l.childNodes))),h(l.firstElementChild,"click").pipe(W(p),b(f=>!(f.metaKey||f.ctrlKey)),w(f=>{f.preventDefault(),f.stopPropagation()})).subscribe(()=>{history.replaceState({},"",`#${l.htmlFor}`),l.click()})}return B("content.tabs.link")&&s.pipe(Ce(1),re(t)).subscribe(([{active:c},{offset:l}])=>{let f=c.innerText.trim();if(c.hasAttribute("data-md-switching"))c.removeAttribute("data-md-switching");else{let u=e.offsetTop-l.y;for(let y of P("[data-tabs]"))for(let L of P(":scope > input",y)){let X=R(`label[for="${L.id}"]`);if(X!==c&&X.innerText.trim()===f){X.setAttribute("data-md-switching",""),L.click();break}}window.scrollTo({top:e.offsetTop-u});let d=__md_get("__tabs")||[];__md_set("__tabs",[...new 
Set([f,...d])])}}),s.pipe(W(p)).subscribe(()=>{for(let c of P("audio, video",e))c.pause()}),Ya(n).pipe(w(c=>s.next(c)),_(()=>s.complete()),m(c=>$({ref:e},c)))}).pipe(Ke(se))}function zn(e,{viewport$:t,target$:r,print$:o}){return O(...P(".annotate:not(.highlight)",e).map(n=>Pn(n,{target$:r,print$:o})),...P("pre:not(.mermaid) > code",e).map(n=>jn(n,{target$:r,print$:o})),...P("pre.mermaid",e).map(n=>Wn(n)),...P("table:not([class])",e).map(n=>Vn(n)),...P("details",e).map(n=>Fn(n,{target$:r,print$:o})),...P("[data-tabs]",e).map(n=>Nn(n,{viewport$:t,target$:r})),...P("[title]",e).filter(()=>B("content.tooltips")).map(n=>mt(n,{viewport$:t})))}function Ba(e,{alert$:t}){return t.pipe(v(r=>O(I(!0),I(!1).pipe(Ge(2e3))).pipe(m(o=>({message:r,active:o})))))}function qn(e,t){let r=R(".md-typeset",e);return C(()=>{let o=new g;return o.subscribe(({message:n,active:i})=>{e.classList.toggle("md-dialog--active",i),r.textContent=n}),Ba(e,t).pipe(w(n=>o.next(n)),_(()=>o.complete()),m(n=>$({ref:e},n)))})}var Ga=0;function Ja(e,t){document.body.append(e);let{width:r}=ce(e);e.style.setProperty("--md-tooltip-width",`${r}px`),e.remove();let o=cr(t),n=typeof o!="undefined"?Ne(o):I({x:0,y:0}),i=O(et(t),$t(t)).pipe(K());return z([i,n]).pipe(m(([a,s])=>{let{x:p,y:c}=Ve(t),l=ce(t),f=t.closest("table");return f&&t.parentElement&&(p+=f.offsetLeft+t.parentElement.offsetLeft,c+=f.offsetTop+t.parentElement.offsetTop),{active:a,offset:{x:p-s.x+l.width/2-r/2,y:c-s.y+l.height+8}}}))}function Qn(e){let t=e.title;if(!t.length)return S;let r=`__tooltip_${Ga++}`,o=Rt(r,"inline"),n=R(".md-typeset",o);return n.innerHTML=t,C(()=>{let i=new g;return 
i.subscribe({next({offset:a}){o.style.setProperty("--md-tooltip-x",`${a.x}px`),o.style.setProperty("--md-tooltip-y",`${a.y}px`)},complete(){o.style.removeProperty("--md-tooltip-x"),o.style.removeProperty("--md-tooltip-y")}}),O(i.pipe(b(({active:a})=>a)),i.pipe(_e(250),b(({active:a})=>!a))).subscribe({next({active:a}){a?(e.insertAdjacentElement("afterend",o),e.setAttribute("aria-describedby",r),e.removeAttribute("title")):(o.remove(),e.removeAttribute("aria-describedby"),e.setAttribute("title",t))},complete(){o.remove(),e.removeAttribute("aria-describedby"),e.setAttribute("title",t)}}),i.pipe(Me(16,me)).subscribe(({active:a})=>{o.classList.toggle("md-tooltip--active",a)}),i.pipe(pt(125,me),b(()=>!!e.offsetParent),m(()=>e.offsetParent.getBoundingClientRect()),m(({x:a})=>a)).subscribe({next(a){a?o.style.setProperty("--md-tooltip-0",`${-a}px`):o.style.removeProperty("--md-tooltip-0")},complete(){o.style.removeProperty("--md-tooltip-0")}}),Ja(o,e).pipe(w(a=>i.next(a)),_(()=>i.complete()),m(a=>$({ref:e},a)))}).pipe(Ke(se))}function Xa({viewport$:e}){if(!B("header.autohide"))return I(!1);let t=e.pipe(m(({offset:{y:n}})=>n),Be(2,1),m(([n,i])=>[nMath.abs(i-n.y)>100),m(([,[n]])=>n),K()),o=ze("search");return z([e,o]).pipe(m(([{offset:n},i])=>n.y>400&&!i),K(),v(n=>n?r:I(!1)),Q(!1))}function Kn(e,t){return C(()=>z([ge(e),Xa(t)])).pipe(m(([{height:r},o])=>({height:r,hidden:o})),K((r,o)=>r.height===o.height&&r.hidden===o.hidden),G(1))}function Yn(e,{header$:t,main$:r}){return C(()=>{let o=new g,n=o.pipe(Z(),ie(!0));o.pipe(ee("active"),He(t)).subscribe(([{active:a},{hidden:s}])=>{e.classList.toggle("md-header--shadow",a&&!s),e.hidden=s});let i=ue(P("[title]",e)).pipe(b(()=>B("content.tooltips")),ne(a=>Qn(a)));return r.subscribe(o),t.pipe(W(n),m(a=>$({ref:e},a)),Re(i.pipe(W(n))))})}function Za(e,{viewport$:t,header$:r}){return mr(e,{viewport$:t,header$:r}).pipe(m(({offset:{y:o}})=>{let{height:n}=ce(e);return{active:o>=n}}),ee("active"))}function Bn(e,t){return C(()=>{let r=new 
g;r.subscribe({next({active:n}){e.classList.toggle("md-header__title--active",n)},complete(){e.classList.remove("md-header__title--active")}});let o=fe(".md-content h1");return typeof o=="undefined"?S:Za(o,t).pipe(w(n=>r.next(n)),_(()=>r.complete()),m(n=>$({ref:e},n)))})}function Gn(e,{viewport$:t,header$:r}){let o=r.pipe(m(({height:i})=>i),K()),n=o.pipe(v(()=>ge(e).pipe(m(({height:i})=>({top:e.offsetTop,bottom:e.offsetTop+i})),ee("bottom"))));return z([o,n,t]).pipe(m(([i,{top:a,bottom:s},{offset:{y:p},size:{height:c}}])=>(c=Math.max(0,c-Math.max(0,a-p,i)-Math.max(0,c+p-s)),{offset:a-i,height:c,active:a-i<=p})),K((i,a)=>i.offset===a.offset&&i.height===a.height&&i.active===a.active))}function es(e){let t=__md_get("__palette")||{index:e.findIndex(o=>matchMedia(o.getAttribute("data-md-color-media")).matches)},r=Math.max(0,Math.min(t.index,e.length-1));return I(...e).pipe(ne(o=>h(o,"change").pipe(m(()=>o))),Q(e[r]),m(o=>({index:e.indexOf(o),color:{media:o.getAttribute("data-md-color-media"),scheme:o.getAttribute("data-md-color-scheme"),primary:o.getAttribute("data-md-color-primary"),accent:o.getAttribute("data-md-color-accent")}})),G(1))}function Jn(e){let t=P("input",e),r=x("meta",{name:"theme-color"});document.head.appendChild(r);let o=x("meta",{name:"color-scheme"});document.head.appendChild(o);let n=Pt("(prefers-color-scheme: light)");return C(()=>{let i=new g;return i.subscribe(a=>{if(document.body.setAttribute("data-md-color-switching",""),a.color.media==="(prefers-color-scheme)"){let s=matchMedia("(prefers-color-scheme: light)"),p=document.querySelector(s.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");a.color.scheme=p.getAttribute("data-md-color-scheme"),a.color.primary=p.getAttribute("data-md-color-primary"),a.color.accent=p.getAttribute("data-md-color-accent")}for(let[s,p]of Object.entries(a.color))document.body.setAttribute(`data-md-color-${s}`,p);for(let 
s=0;sa.key==="Enter"),re(i,(a,s)=>s)).subscribe(({index:a})=>{a=(a+1)%t.length,t[a].click(),t[a].focus()}),i.pipe(m(()=>{let a=Se("header"),s=window.getComputedStyle(a);return o.content=s.colorScheme,s.backgroundColor.match(/\d+/g).map(p=>(+p).toString(16).padStart(2,"0")).join("")})).subscribe(a=>r.content=`#${a}`),i.pipe(ve(se)).subscribe(()=>{document.body.removeAttribute("data-md-color-switching")}),es(t).pipe(W(n.pipe(Ce(1))),ct(),w(a=>i.next(a)),_(()=>i.complete()),m(a=>$({ref:e},a)))})}function Xn(e,{progress$:t}){return C(()=>{let r=new g;return r.subscribe(({value:o})=>{e.style.setProperty("--md-progress-value",`${o}`)}),t.pipe(w(o=>r.next({value:o})),_(()=>r.complete()),m(o=>({ref:e,value:o})))})}var Jr=Mt(Br());function ts(e){e.setAttribute("data-md-copying","");let t=e.closest("[data-copy]"),r=t?t.getAttribute("data-copy"):e.innerText;return e.removeAttribute("data-md-copying"),r.trimEnd()}function Zn({alert$:e}){Jr.default.isSupported()&&new j(t=>{new Jr.default("[data-clipboard-target], [data-clipboard-text]",{text:r=>r.getAttribute("data-clipboard-text")||ts(R(r.getAttribute("data-clipboard-target")))}).on("success",r=>t.next(r))}).pipe(w(t=>{t.trigger.focus()}),m(()=>Ee("clipboard.copied"))).subscribe(e)}function ei(e,t){return e.protocol=t.protocol,e.hostname=t.hostname,e}function rs(e,t){let r=new Map;for(let o of P("url",e)){let n=R("loc",o),i=[ei(new URL(n.textContent),t)];r.set(`${i[0]}`,i);for(let a of P("[rel=alternate]",o)){let s=a.getAttribute("href");s!=null&&i.push(ei(new URL(s),t))}}return r}function ur(e){return un(new URL("sitemap.xml",e)).pipe(m(t=>rs(t,new URL(e))),de(()=>I(new Map)))}function os(e,t){if(!(e.target instanceof Element))return S;let r=e.target.closest("a");if(r===null)return S;if(r.target||e.metaKey||e.ctrlKey)return S;let o=new URL(r.href);return o.search=o.hash="",t.has(`${o}`)?(e.preventDefault(),I(new URL(r.href))):S}function ti(e){let t=new Map;for(let r of P(":scope > *",e.head))t.set(r.outerHTML,r);return 
t}function ri(e){for(let t of P("[href], [src]",e))for(let r of["href","src"]){let o=t.getAttribute(r);if(o&&!/^(?:[a-z]+:)?\/\//i.test(o)){t[r]=t[r];break}}return I(e)}function ns(e){for(let o of["[data-md-component=announce]","[data-md-component=container]","[data-md-component=header-topic]","[data-md-component=outdated]","[data-md-component=logo]","[data-md-component=skip]",...B("navigation.tabs.sticky")?["[data-md-component=tabs]"]:[]]){let n=fe(o),i=fe(o,e);typeof n!="undefined"&&typeof i!="undefined"&&n.replaceWith(i)}let t=ti(document);for(let[o,n]of ti(e))t.has(o)?t.delete(o):document.head.appendChild(n);for(let o of t.values()){let n=o.getAttribute("name");n!=="theme-color"&&n!=="color-scheme"&&o.remove()}let r=Se("container");return We(P("script",r)).pipe(v(o=>{let n=e.createElement("script");if(o.src){for(let i of o.getAttributeNames())n.setAttribute(i,o.getAttribute(i));return o.replaceWith(n),new j(i=>{n.onload=()=>i.complete()})}else return n.textContent=o.textContent,o.replaceWith(n),S}),Z(),ie(document))}function oi({location$:e,viewport$:t,progress$:r}){let o=xe();if(location.protocol==="file:")return S;let n=ur(o.base);I(document).subscribe(ri);let i=h(document.body,"click").pipe(He(n),v(([p,c])=>os(p,c)),pe()),a=h(window,"popstate").pipe(m(ye),pe());i.pipe(re(t)).subscribe(([p,{offset:c}])=>{history.replaceState(c,""),history.pushState(null,"",p)}),O(i,a).subscribe(e);let s=e.pipe(ee("pathname"),v(p=>fn(p,{progress$:r}).pipe(de(()=>(lt(p,!0),S)))),v(ri),v(ns),pe());return O(s.pipe(re(e,(p,c)=>c)),s.pipe(v(()=>e),ee("pathname"),v(()=>e),ee("hash")),e.pipe(K((p,c)=>p.pathname===c.pathname&&p.hash===c.hash),v(()=>i),w(()=>history.back()))).subscribe(p=>{var c,l;history.state!==null||!p.hash?window.scrollTo(0,(l=(c=history.state)==null?void 
0:c.y)!=null?l:0):(history.scrollRestoration="auto",pn(p.hash),history.scrollRestoration="manual")}),e.subscribe(()=>{history.scrollRestoration="manual"}),h(window,"beforeunload").subscribe(()=>{history.scrollRestoration="auto"}),t.pipe(ee("offset"),_e(100)).subscribe(({offset:p})=>{history.replaceState(p,"")}),s}var ni=Mt(qr());function ii(e){let t=e.separator.split("|").map(n=>n.replace(/(\(\?[!=<][^)]+\))/g,"").length===0?"\uFFFD":n).join("|"),r=new RegExp(t,"img"),o=(n,i,a)=>`${i}${a}`;return n=>{n=n.replace(/[\s*+\-:~^]+/g," ").trim();let i=new RegExp(`(^|${e.separator}|)(${n.replace(/[|\\{}()[\]^$+*?.-]/g,"\\$&").replace(r,"|")})`,"img");return a=>(0,ni.default)(a).replace(i,o).replace(/<\/mark>(\s+)]*>/img,"$1")}}function jt(e){return e.type===1}function dr(e){return e.type===3}function ai(e,t){let r=yn(e);return O(I(location.protocol!=="file:"),ze("search")).pipe(Ae(o=>o),v(()=>t)).subscribe(({config:o,docs:n})=>r.next({type:0,data:{config:o,docs:n,options:{suggest:B("search.suggest")}}})),r}function si(e){var l;let{selectedVersionSitemap:t,selectedVersionBaseURL:r,currentLocation:o,currentBaseURL:n}=e,i=(l=Xr(n))==null?void 0:l.pathname;if(i===void 0)return;let a=ss(o.pathname,i);if(a===void 0)return;let s=ps(t.keys());if(!t.has(s))return;let p=Xr(a,s);if(!p||!t.has(p.href))return;let c=Xr(a,r);if(c)return c.hash=o.hash,c.search=o.search,c}function Xr(e,t){try{return new URL(e,t)}catch(r){return}}function ss(e,t){if(e.startsWith(t))return e.slice(t.length)}function cs(e,t){let r=Math.min(e.length,t.length),o;for(o=0;oS)),o=r.pipe(m(n=>{let[,i]=t.base.match(/([^/]+)\/?$/);return n.find(({version:a,aliases:s})=>a===i||s.includes(i))||n[0]}));r.pipe(m(n=>new Map(n.map(i=>[`${new URL(`../${i.version}/`,t.base)}`,i]))),v(n=>h(document.body,"click").pipe(b(i=>!i.metaKey&&!i.ctrlKey),re(o),v(([i,a])=>{if(i.target instanceof Element){let s=i.target.closest("a");if(s&&!s.target&&n.has(s.href)){let 
p=s.href;return!i.target.closest(".md-version")&&n.get(p)===a?S:(i.preventDefault(),I(new URL(p)))}}return S}),v(i=>ur(i).pipe(m(a=>{var s;return(s=si({selectedVersionSitemap:a,selectedVersionBaseURL:i,currentLocation:ye(),currentBaseURL:t.base}))!=null?s:i})))))).subscribe(n=>lt(n,!0)),z([r,o]).subscribe(([n,i])=>{R(".md-header__topic").appendChild(Cn(n,i))}),e.pipe(v(()=>o)).subscribe(n=>{var a;let i=__md_get("__outdated",sessionStorage);if(i===null){i=!0;let s=((a=t.version)==null?void 0:a.default)||"latest";Array.isArray(s)||(s=[s]);e:for(let p of s)for(let c of n.aliases.concat(n.version))if(new RegExp(p,"i").test(c)){i=!1;break e}__md_set("__outdated",i,sessionStorage)}if(i)for(let s of ae("outdated"))s.hidden=!1})}function ls(e,{worker$:t}){let{searchParams:r}=ye();r.has("q")&&(Je("search",!0),e.value=r.get("q"),e.focus(),ze("search").pipe(Ae(i=>!i)).subscribe(()=>{let i=ye();i.searchParams.delete("q"),history.replaceState({},"",`${i}`)}));let o=et(e),n=O(t.pipe(Ae(jt)),h(e,"keyup"),o).pipe(m(()=>e.value),K());return z([n,o]).pipe(m(([i,a])=>({value:i,focus:a})),G(1))}function pi(e,{worker$:t}){let r=new g,o=r.pipe(Z(),ie(!0));z([t.pipe(Ae(jt)),r],(i,a)=>a).pipe(ee("value")).subscribe(({value:i})=>t.next({type:2,data:i})),r.pipe(ee("focus")).subscribe(({focus:i})=>{i&&Je("search",i)}),h(e.form,"reset").pipe(W(o)).subscribe(()=>e.focus());let n=R("header [for=__search]");return h(n,"click").subscribe(()=>e.focus()),ls(e,{worker$:t}).pipe(w(i=>r.next(i)),_(()=>r.complete()),m(i=>$({ref:e},i)),G(1))}function li(e,{worker$:t,query$:r}){let o=new g,n=on(e.parentElement).pipe(b(Boolean)),i=e.parentElement,a=R(":scope > :first-child",e),s=R(":scope > :last-child",e);ze("search").subscribe(l=>s.setAttribute("role",l?"list":"presentation")),o.pipe(re(r),Wr(t.pipe(Ae(jt)))).subscribe(([{items:l},{value:f}])=>{switch(l.length){case 0:a.textContent=f.length?Ee("search.result.none"):Ee("search.result.placeholder");break;case 
1:a.textContent=Ee("search.result.one");break;default:let u=sr(l.length);a.textContent=Ee("search.result.other",u)}});let p=o.pipe(w(()=>s.innerHTML=""),v(({items:l})=>O(I(...l.slice(0,10)),I(...l.slice(10)).pipe(Be(4),Vr(n),v(([f])=>f)))),m(Mn),pe());return p.subscribe(l=>s.appendChild(l)),p.pipe(ne(l=>{let f=fe("details",l);return typeof f=="undefined"?S:h(f,"toggle").pipe(W(o),m(()=>f))})).subscribe(l=>{l.open===!1&&l.offsetTop<=i.scrollTop&&i.scrollTo({top:l.offsetTop})}),t.pipe(b(dr),m(({data:l})=>l)).pipe(w(l=>o.next(l)),_(()=>o.complete()),m(l=>$({ref:e},l)))}function ms(e,{query$:t}){return t.pipe(m(({value:r})=>{let o=ye();return o.hash="",r=r.replace(/\s+/g,"+").replace(/&/g,"%26").replace(/=/g,"%3D"),o.search=`q=${r}`,{url:o}}))}function mi(e,t){let r=new g,o=r.pipe(Z(),ie(!0));return r.subscribe(({url:n})=>{e.setAttribute("data-clipboard-text",e.href),e.href=`${n}`}),h(e,"click").pipe(W(o)).subscribe(n=>n.preventDefault()),ms(e,t).pipe(w(n=>r.next(n)),_(()=>r.complete()),m(n=>$({ref:e},n)))}function fi(e,{worker$:t,keyboard$:r}){let o=new g,n=Se("search-query"),i=O(h(n,"keydown"),h(n,"focus")).pipe(ve(se),m(()=>n.value),K());return o.pipe(He(i),m(([{suggest:s},p])=>{let c=p.split(/([\s-]+)/);if(s!=null&&s.length&&c[c.length-1]){let l=s[s.length-1];l.startsWith(c[c.length-1])&&(c[c.length-1]=l)}else c.length=0;return c})).subscribe(s=>e.innerHTML=s.join("").replace(/\s/g," ")),r.pipe(b(({mode:s})=>s==="search")).subscribe(s=>{switch(s.type){case"ArrowRight":e.innerText.length&&n.selectionStart===n.value.length&&(n.value=e.innerText);break}}),t.pipe(b(dr),m(({data:s})=>s)).pipe(w(s=>o.next(s)),_(()=>o.complete()),m(()=>({ref:e})))}function ui(e,{index$:t,keyboard$:r}){let o=xe();try{let n=ai(o.search,t),i=Se("search-query",e),a=Se("search-result",e);h(e,"click").pipe(b(({target:p})=>p instanceof Element&&!!p.closest("a"))).subscribe(()=>Je("search",!1)),r.pipe(b(({mode:p})=>p==="search")).subscribe(p=>{let c=Ie();switch(p.type){case"Enter":if(c===i){let 
l=new Map;for(let f of P(":first-child [href]",a)){let u=f.firstElementChild;l.set(f,parseFloat(u.getAttribute("data-md-score")))}if(l.size){let[[f]]=[...l].sort(([,u],[,d])=>d-u);f.click()}p.claim()}break;case"Escape":case"Tab":Je("search",!1),i.blur();break;case"ArrowUp":case"ArrowDown":if(typeof c=="undefined")i.focus();else{let l=[i,...P(":not(details) > [href], summary, details[open] [href]",a)],f=Math.max(0,(Math.max(0,l.indexOf(c))+l.length+(p.type==="ArrowUp"?-1:1))%l.length);l[f].focus()}p.claim();break;default:i!==Ie()&&i.focus()}}),r.pipe(b(({mode:p})=>p==="global")).subscribe(p=>{switch(p.type){case"f":case"s":case"/":i.focus(),i.select(),p.claim();break}});let s=pi(i,{worker$:n});return O(s,li(a,{worker$:n,query$:s})).pipe(Re(...ae("search-share",e).map(p=>mi(p,{query$:s})),...ae("search-suggest",e).map(p=>fi(p,{worker$:n,keyboard$:r}))))}catch(n){return e.hidden=!0,Ye}}function di(e,{index$:t,location$:r}){return z([t,r.pipe(Q(ye()),b(o=>!!o.searchParams.get("h")))]).pipe(m(([o,n])=>ii(o.config)(n.searchParams.get("h"))),m(o=>{var a;let n=new Map,i=document.createNodeIterator(e,NodeFilter.SHOW_TEXT);for(let s=i.nextNode();s;s=i.nextNode())if((a=s.parentElement)!=null&&a.offsetHeight){let p=s.textContent,c=o(p);c.length>p.length&&n.set(s,c)}for(let[s,p]of n){let{childNodes:c}=x("span",null,p);s.replaceWith(...Array.from(c))}return{ref:e,nodes:n}}))}function fs(e,{viewport$:t,main$:r}){let o=e.closest(".md-grid"),n=o.offsetTop-o.parentElement.offsetTop;return z([r,t]).pipe(m(([{offset:i,height:a},{offset:{y:s}}])=>(a=a+Math.min(n,Math.max(0,s-i))-n,{height:a,locked:s>=i+n})),K((i,a)=>i.height===a.height&&i.locked===a.locked))}function Zr(e,o){var n=o,{header$:t}=n,r=so(n,["header$"]);let i=R(".md-sidebar__scrollwrap",e),{y:a}=Ve(i);return C(()=>{let s=new g,p=s.pipe(Z(),ie(!0)),c=s.pipe(Me(0,me));return 
c.pipe(re(t)).subscribe({next([{height:l},{height:f}]){i.style.height=`${l-2*a}px`,e.style.top=`${f}px`},complete(){i.style.height="",e.style.top=""}}),c.pipe(Ae()).subscribe(()=>{for(let l of P(".md-nav__link--active[href]",e)){if(!l.clientHeight)continue;let f=l.closest(".md-sidebar__scrollwrap");if(typeof f!="undefined"){let u=l.offsetTop-f.offsetTop,{height:d}=ce(f);f.scrollTo({top:u-d/2})}}}),ue(P("label[tabindex]",e)).pipe(ne(l=>h(l,"click").pipe(ve(se),m(()=>l),W(p)))).subscribe(l=>{let f=R(`[id="${l.htmlFor}"]`);R(`[aria-labelledby="${l.id}"]`).setAttribute("aria-expanded",`${f.checked}`)}),fs(e,r).pipe(w(l=>s.next(l)),_(()=>s.complete()),m(l=>$({ref:e},l)))})}function hi(e,t){if(typeof t!="undefined"){let r=`https://api.github.com/repos/${e}/${t}`;return st(je(`${r}/releases/latest`).pipe(de(()=>S),m(o=>({version:o.tag_name})),De({})),je(r).pipe(de(()=>S),m(o=>({stars:o.stargazers_count,forks:o.forks_count})),De({}))).pipe(m(([o,n])=>$($({},o),n)))}else{let r=`https://api.github.com/users/${e}`;return je(r).pipe(m(o=>({repositories:o.public_repos})),De({}))}}function bi(e,t){let r=`https://${e}/api/v4/projects/${encodeURIComponent(t)}`;return st(je(`${r}/releases/permalink/latest`).pipe(de(()=>S),m(({tag_name:o})=>({version:o})),De({})),je(r).pipe(de(()=>S),m(({star_count:o,forks_count:n})=>({stars:o,forks:n})),De({}))).pipe(m(([o,n])=>$($({},o),n)))}function vi(e){let t=e.match(/^.+github\.com\/([^/]+)\/?([^/]+)?/i);if(t){let[,r,o]=t;return hi(r,o)}if(t=e.match(/^.+?([^/]*gitlab[^/]+)\/(.+?)\/?$/i),t){let[,r,o]=t;return bi(r,o)}return S}var us;function ds(e){return us||(us=C(()=>{let t=__md_get("__source",sessionStorage);if(t)return I(t);if(ae("consent").length){let o=__md_get("__consent");if(!(o&&o.github))return S}return vi(e.href).pipe(w(o=>__md_set("__source",o,sessionStorage)))}).pipe(de(()=>S),b(t=>Object.keys(t).length>0),m(t=>({facts:t})),G(1)))}function gi(e){let t=R(":scope > :last-child",e);return C(()=>{let r=new g;return 
r.subscribe(({facts:o})=>{t.appendChild(_n(o)),t.classList.add("md-source__repository--active")}),ds(e).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))})}function hs(e,{viewport$:t,header$:r}){return ge(document.body).pipe(v(()=>mr(e,{header$:r,viewport$:t})),m(({offset:{y:o}})=>({hidden:o>=10})),ee("hidden"))}function yi(e,t){return C(()=>{let r=new g;return r.subscribe({next({hidden:o}){e.hidden=o},complete(){e.hidden=!1}}),(B("navigation.tabs.sticky")?I({hidden:!1}):hs(e,t)).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))})}function bs(e,{viewport$:t,header$:r}){let o=new Map,n=P(".md-nav__link",e);for(let s of n){let p=decodeURIComponent(s.hash.substring(1)),c=fe(`[id="${p}"]`);typeof c!="undefined"&&o.set(s,c)}let i=r.pipe(ee("height"),m(({height:s})=>{let p=Se("main"),c=R(":scope > :first-child",p);return s+.8*(c.offsetTop-p.offsetTop)}),pe());return ge(document.body).pipe(ee("height"),v(s=>C(()=>{let p=[];return I([...o].reduce((c,[l,f])=>{for(;p.length&&o.get(p[p.length-1]).tagName>=f.tagName;)p.pop();let u=f.offsetTop;for(;!u&&f.parentElement;)f=f.parentElement,u=f.offsetTop;let d=f.offsetParent;for(;d;d=d.offsetParent)u+=d.offsetTop;return c.set([...p=[...p,l]].reverse(),u)},new Map))}).pipe(m(p=>new Map([...p].sort(([,c],[,l])=>c-l))),He(i),v(([p,c])=>t.pipe(Fr(([l,f],{offset:{y:u},size:d})=>{let y=u+d.height>=Math.floor(s.height);for(;f.length;){let[,L]=f[0];if(L-c=u&&!y)f=[l.pop(),...f];else break}return[l,f]},[[],[...p]]),K((l,f)=>l[0]===f[0]&&l[1]===f[1])))))).pipe(m(([s,p])=>({prev:s.map(([c])=>c),next:p.map(([c])=>c)})),Q({prev:[],next:[]}),Be(2,1),m(([s,p])=>s.prev.length{let i=new g,a=i.pipe(Z(),ie(!0));if(i.subscribe(({prev:s,next:p})=>{for(let[c]of p)c.classList.remove("md-nav__link--passed"),c.classList.remove("md-nav__link--active");for(let[c,[l]]of s.entries())l.classList.add("md-nav__link--passed"),l.classList.toggle("md-nav__link--active",c===s.length-1)}),B("toc.follow")){let 
s=O(t.pipe(_e(1),m(()=>{})),t.pipe(_e(250),m(()=>"smooth")));i.pipe(b(({prev:p})=>p.length>0),He(o.pipe(ve(se))),re(s)).subscribe(([[{prev:p}],c])=>{let[l]=p[p.length-1];if(l.offsetHeight){let f=cr(l);if(typeof f!="undefined"){let u=l.offsetTop-f.offsetTop,{height:d}=ce(f);f.scrollTo({top:u-d/2,behavior:c})}}})}return B("navigation.tracking")&&t.pipe(W(a),ee("offset"),_e(250),Ce(1),W(n.pipe(Ce(1))),ct({delay:250}),re(i)).subscribe(([,{prev:s}])=>{let p=ye(),c=s[s.length-1];if(c&&c.length){let[l]=c,{hash:f}=new URL(l.href);p.hash!==f&&(p.hash=f,history.replaceState({},"",`${p}`))}else p.hash="",history.replaceState({},"",`${p}`)}),bs(e,{viewport$:t,header$:r}).pipe(w(s=>i.next(s)),_(()=>i.complete()),m(s=>$({ref:e},s)))})}function vs(e,{viewport$:t,main$:r,target$:o}){let n=t.pipe(m(({offset:{y:a}})=>a),Be(2,1),m(([a,s])=>a>s&&s>0),K()),i=r.pipe(m(({active:a})=>a));return z([i,n]).pipe(m(([a,s])=>!(a&&s)),K(),W(o.pipe(Ce(1))),ie(!0),ct({delay:250}),m(a=>({hidden:a})))}function Ei(e,{viewport$:t,header$:r,main$:o,target$:n}){let i=new g,a=i.pipe(Z(),ie(!0));return i.subscribe({next({hidden:s}){e.hidden=s,s?(e.setAttribute("tabindex","-1"),e.blur()):e.removeAttribute("tabindex")},complete(){e.style.top="",e.hidden=!0,e.removeAttribute("tabindex")}}),r.pipe(W(a),ee("height")).subscribe(({height:s})=>{e.style.top=`${s+16}px`}),h(e,"click").subscribe(s=>{s.preventDefault(),window.scrollTo({top:0})}),vs(e,{viewport$:t,main$:o,target$:n}).pipe(w(s=>i.next(s)),_(()=>i.complete()),m(s=>$({ref:e},s)))}function wi({document$:e,viewport$:t}){e.pipe(v(()=>P(".md-ellipsis")),ne(r=>tt(r).pipe(W(e.pipe(Ce(1))),b(o=>o),m(()=>r),Te(1))),b(r=>r.offsetWidth{let o=r.innerText,n=r.closest("a")||r;return n.title=o,B("content.tooltips")?mt(n,{viewport$:t}).pipe(W(e.pipe(Ce(1))),_(()=>n.removeAttribute("title"))):S})).subscribe(),B("content.tooltips")&&e.pipe(v(()=>P(".md-status")),ne(r=>mt(r,{viewport$:t}))).subscribe()}function 
Ti({document$:e,tablet$:t}){e.pipe(v(()=>P(".md-toggle--indeterminate")),w(r=>{r.indeterminate=!0,r.checked=!1}),ne(r=>h(r,"change").pipe(Dr(()=>r.classList.contains("md-toggle--indeterminate")),m(()=>r))),re(t)).subscribe(([r,o])=>{r.classList.remove("md-toggle--indeterminate"),o&&(r.checked=!1)})}function gs(){return/(iPad|iPhone|iPod)/.test(navigator.userAgent)}function Si({document$:e}){e.pipe(v(()=>P("[data-md-scrollfix]")),w(t=>t.removeAttribute("data-md-scrollfix")),b(gs),ne(t=>h(t,"touchstart").pipe(m(()=>t)))).subscribe(t=>{let r=t.scrollTop;r===0?t.scrollTop=1:r+t.offsetHeight===t.scrollHeight&&(t.scrollTop=r-1)})}function Oi({viewport$:e,tablet$:t}){z([ze("search"),t]).pipe(m(([r,o])=>r&&!o),v(r=>I(r).pipe(Ge(r?400:100))),re(e)).subscribe(([r,{offset:{y:o}}])=>{if(r)document.body.setAttribute("data-md-scrolllock",""),document.body.style.top=`-${o}px`;else{let n=-1*parseInt(document.body.style.top,10);document.body.removeAttribute("data-md-scrolllock"),document.body.style.top="",n&&window.scrollTo(0,n)}})}Object.entries||(Object.entries=function(e){let t=[];for(let r of Object.keys(e))t.push([r,e[r]]);return t});Object.values||(Object.values=function(e){let t=[];for(let r of Object.keys(e))t.push(e[r]);return t});typeof Element!="undefined"&&(Element.prototype.scrollTo||(Element.prototype.scrollTo=function(e,t){typeof e=="object"?(this.scrollLeft=e.left,this.scrollTop=e.top):(this.scrollLeft=e,this.scrollTop=t)}),Element.prototype.replaceWith||(Element.prototype.replaceWith=function(...e){let t=this.parentNode;if(t){e.length===0&&t.removeChild(this);for(let r=e.length-1;r>=0;r--){let o=e[r];typeof o=="string"?o=document.createTextNode(o):o.parentNode&&o.parentNode.removeChild(o),r?t.insertBefore(this.previousSibling,o):t.replaceChild(o,this)}}}));function ys(){return location.protocol==="file:"?Tt(`${new URL("search/search_index.js",eo.base)}`).pipe(m(()=>__index),G(1)):je(new 
URL("search/search_index.json",eo.base))}document.documentElement.classList.remove("no-js");document.documentElement.classList.add("js");var ot=Go(),Ut=sn(),Lt=ln(Ut),to=an(),Oe=gn(),hr=Pt("(min-width: 960px)"),Mi=Pt("(min-width: 1220px)"),_i=mn(),eo=xe(),Ai=document.forms.namedItem("search")?ys():Ye,ro=new g;Zn({alert$:ro});var oo=new g;B("navigation.instant")&&oi({location$:Ut,viewport$:Oe,progress$:oo}).subscribe(ot);var Li;((Li=eo.version)==null?void 0:Li.provider)==="mike"&&ci({document$:ot});O(Ut,Lt).pipe(Ge(125)).subscribe(()=>{Je("drawer",!1),Je("search",!1)});to.pipe(b(({mode:e})=>e==="global")).subscribe(e=>{switch(e.type){case"p":case",":let t=fe("link[rel=prev]");typeof t!="undefined"&<(t);break;case"n":case".":let r=fe("link[rel=next]");typeof r!="undefined"&<(r);break;case"Enter":let o=Ie();o instanceof HTMLLabelElement&&o.click()}});wi({viewport$:Oe,document$:ot});Ti({document$:ot,tablet$:hr});Si({document$:ot});Oi({viewport$:Oe,tablet$:hr});var rt=Kn(Se("header"),{viewport$:Oe}),Ft=ot.pipe(m(()=>Se("main")),v(e=>Gn(e,{viewport$:Oe,header$:rt})),G(1)),xs=O(...ae("consent").map(e=>En(e,{target$:Lt})),...ae("dialog").map(e=>qn(e,{alert$:ro})),...ae("palette").map(e=>Jn(e)),...ae("progress").map(e=>Xn(e,{progress$:oo})),...ae("search").map(e=>ui(e,{index$:Ai,keyboard$:to})),...ae("source").map(e=>gi(e))),Es=C(()=>O(...ae("announce").map(e=>xn(e)),...ae("content").map(e=>zn(e,{viewport$:Oe,target$:Lt,print$:_i})),...ae("content").map(e=>B("search.highlight")?di(e,{index$:Ai,location$:Ut}):S),...ae("header").map(e=>Yn(e,{viewport$:Oe,header$:rt,main$:Ft})),...ae("header-title").map(e=>Bn(e,{viewport$:Oe,header$:rt})),...ae("sidebar").map(e=>e.getAttribute("data-md-type")==="navigation"?Nr(Mi,()=>Zr(e,{viewport$:Oe,header$:rt,main$:Ft})):Nr(hr,()=>Zr(e,{viewport$:Oe,header$:rt,main$:Ft}))),...ae("tabs").map(e=>yi(e,{viewport$:Oe,header$:rt})),...ae("toc").map(e=>xi(e,{viewport$:Oe,header$:rt,main$:Ft,target$:Lt})),...ae("top").map(e=>Ei(e,{viewport$:Oe,head
er$:rt,main$:Ft,target$:Lt})))),Ci=ot.pipe(v(()=>Es),Re(xs),G(1));Ci.subscribe();window.document$=ot;window.location$=Ut;window.target$=Lt;window.keyboard$=to;window.viewport$=Oe;window.tablet$=hr;window.screen$=Mi;window.print$=_i;window.alert$=ro;window.progress$=oo;window.component$=Ci;})(); +//# sourceMappingURL=bundle.88dd0f4e.min.js.map + diff --git a/assets/javascripts/bundle.88dd0f4e.min.js.map b/assets/javascripts/bundle.88dd0f4e.min.js.map new file mode 100644 index 000000000..dab2a8754 --- /dev/null +++ b/assets/javascripts/bundle.88dd0f4e.min.js.map @@ -0,0 +1,7 @@ +{ + "version": 3, + "sources": ["node_modules/focus-visible/dist/focus-visible.js", "node_modules/escape-html/index.js", "node_modules/clipboard/dist/clipboard.js", "src/templates/assets/javascripts/bundle.ts", "node_modules/tslib/tslib.es6.mjs", "node_modules/rxjs/src/internal/util/isFunction.ts", "node_modules/rxjs/src/internal/util/createErrorClass.ts", "node_modules/rxjs/src/internal/util/UnsubscriptionError.ts", "node_modules/rxjs/src/internal/util/arrRemove.ts", "node_modules/rxjs/src/internal/Subscription.ts", "node_modules/rxjs/src/internal/config.ts", "node_modules/rxjs/src/internal/scheduler/timeoutProvider.ts", "node_modules/rxjs/src/internal/util/reportUnhandledError.ts", "node_modules/rxjs/src/internal/util/noop.ts", "node_modules/rxjs/src/internal/NotificationFactories.ts", "node_modules/rxjs/src/internal/util/errorContext.ts", "node_modules/rxjs/src/internal/Subscriber.ts", "node_modules/rxjs/src/internal/symbol/observable.ts", "node_modules/rxjs/src/internal/util/identity.ts", "node_modules/rxjs/src/internal/util/pipe.ts", "node_modules/rxjs/src/internal/Observable.ts", "node_modules/rxjs/src/internal/util/lift.ts", "node_modules/rxjs/src/internal/operators/OperatorSubscriber.ts", "node_modules/rxjs/src/internal/scheduler/animationFrameProvider.ts", "node_modules/rxjs/src/internal/util/ObjectUnsubscribedError.ts", "node_modules/rxjs/src/internal/Subject.ts", 
"node_modules/rxjs/src/internal/BehaviorSubject.ts", "node_modules/rxjs/src/internal/scheduler/dateTimestampProvider.ts", "node_modules/rxjs/src/internal/ReplaySubject.ts", "node_modules/rxjs/src/internal/scheduler/Action.ts", "node_modules/rxjs/src/internal/scheduler/intervalProvider.ts", "node_modules/rxjs/src/internal/scheduler/AsyncAction.ts", "node_modules/rxjs/src/internal/Scheduler.ts", "node_modules/rxjs/src/internal/scheduler/AsyncScheduler.ts", "node_modules/rxjs/src/internal/scheduler/async.ts", "node_modules/rxjs/src/internal/scheduler/QueueAction.ts", "node_modules/rxjs/src/internal/scheduler/QueueScheduler.ts", "node_modules/rxjs/src/internal/scheduler/queue.ts", "node_modules/rxjs/src/internal/scheduler/AnimationFrameAction.ts", "node_modules/rxjs/src/internal/scheduler/AnimationFrameScheduler.ts", "node_modules/rxjs/src/internal/scheduler/animationFrame.ts", "node_modules/rxjs/src/internal/observable/empty.ts", "node_modules/rxjs/src/internal/util/isScheduler.ts", "node_modules/rxjs/src/internal/util/args.ts", "node_modules/rxjs/src/internal/util/isArrayLike.ts", "node_modules/rxjs/src/internal/util/isPromise.ts", "node_modules/rxjs/src/internal/util/isInteropObservable.ts", "node_modules/rxjs/src/internal/util/isAsyncIterable.ts", "node_modules/rxjs/src/internal/util/throwUnobservableError.ts", "node_modules/rxjs/src/internal/symbol/iterator.ts", "node_modules/rxjs/src/internal/util/isIterable.ts", "node_modules/rxjs/src/internal/util/isReadableStreamLike.ts", "node_modules/rxjs/src/internal/observable/innerFrom.ts", "node_modules/rxjs/src/internal/util/executeSchedule.ts", "node_modules/rxjs/src/internal/operators/observeOn.ts", "node_modules/rxjs/src/internal/operators/subscribeOn.ts", "node_modules/rxjs/src/internal/scheduled/scheduleObservable.ts", "node_modules/rxjs/src/internal/scheduled/schedulePromise.ts", "node_modules/rxjs/src/internal/scheduled/scheduleArray.ts", "node_modules/rxjs/src/internal/scheduled/scheduleIterable.ts", 
"node_modules/rxjs/src/internal/scheduled/scheduleAsyncIterable.ts", "node_modules/rxjs/src/internal/scheduled/scheduleReadableStreamLike.ts", "node_modules/rxjs/src/internal/scheduled/scheduled.ts", "node_modules/rxjs/src/internal/observable/from.ts", "node_modules/rxjs/src/internal/observable/of.ts", "node_modules/rxjs/src/internal/observable/throwError.ts", "node_modules/rxjs/src/internal/util/EmptyError.ts", "node_modules/rxjs/src/internal/util/isDate.ts", "node_modules/rxjs/src/internal/operators/map.ts", "node_modules/rxjs/src/internal/util/mapOneOrManyArgs.ts", "node_modules/rxjs/src/internal/util/argsArgArrayOrObject.ts", "node_modules/rxjs/src/internal/util/createObject.ts", "node_modules/rxjs/src/internal/observable/combineLatest.ts", "node_modules/rxjs/src/internal/operators/mergeInternals.ts", "node_modules/rxjs/src/internal/operators/mergeMap.ts", "node_modules/rxjs/src/internal/operators/mergeAll.ts", "node_modules/rxjs/src/internal/operators/concatAll.ts", "node_modules/rxjs/src/internal/observable/concat.ts", "node_modules/rxjs/src/internal/observable/defer.ts", "node_modules/rxjs/src/internal/observable/fromEvent.ts", "node_modules/rxjs/src/internal/observable/fromEventPattern.ts", "node_modules/rxjs/src/internal/observable/timer.ts", "node_modules/rxjs/src/internal/observable/merge.ts", "node_modules/rxjs/src/internal/observable/never.ts", "node_modules/rxjs/src/internal/util/argsOrArgArray.ts", "node_modules/rxjs/src/internal/operators/filter.ts", "node_modules/rxjs/src/internal/observable/zip.ts", "node_modules/rxjs/src/internal/operators/audit.ts", "node_modules/rxjs/src/internal/operators/auditTime.ts", "node_modules/rxjs/src/internal/operators/bufferCount.ts", "node_modules/rxjs/src/internal/operators/catchError.ts", "node_modules/rxjs/src/internal/operators/scanInternals.ts", "node_modules/rxjs/src/internal/operators/combineLatest.ts", "node_modules/rxjs/src/internal/operators/combineLatestWith.ts", 
"node_modules/rxjs/src/internal/operators/debounce.ts", "node_modules/rxjs/src/internal/operators/debounceTime.ts", "node_modules/rxjs/src/internal/operators/defaultIfEmpty.ts", "node_modules/rxjs/src/internal/operators/take.ts", "node_modules/rxjs/src/internal/operators/ignoreElements.ts", "node_modules/rxjs/src/internal/operators/mapTo.ts", "node_modules/rxjs/src/internal/operators/delayWhen.ts", "node_modules/rxjs/src/internal/operators/delay.ts", "node_modules/rxjs/src/internal/operators/distinctUntilChanged.ts", "node_modules/rxjs/src/internal/operators/distinctUntilKeyChanged.ts", "node_modules/rxjs/src/internal/operators/throwIfEmpty.ts", "node_modules/rxjs/src/internal/operators/endWith.ts", "node_modules/rxjs/src/internal/operators/finalize.ts", "node_modules/rxjs/src/internal/operators/first.ts", "node_modules/rxjs/src/internal/operators/takeLast.ts", "node_modules/rxjs/src/internal/operators/merge.ts", "node_modules/rxjs/src/internal/operators/mergeWith.ts", "node_modules/rxjs/src/internal/operators/repeat.ts", "node_modules/rxjs/src/internal/operators/scan.ts", "node_modules/rxjs/src/internal/operators/share.ts", "node_modules/rxjs/src/internal/operators/shareReplay.ts", "node_modules/rxjs/src/internal/operators/skip.ts", "node_modules/rxjs/src/internal/operators/skipUntil.ts", "node_modules/rxjs/src/internal/operators/startWith.ts", "node_modules/rxjs/src/internal/operators/switchMap.ts", "node_modules/rxjs/src/internal/operators/takeUntil.ts", "node_modules/rxjs/src/internal/operators/takeWhile.ts", "node_modules/rxjs/src/internal/operators/tap.ts", "node_modules/rxjs/src/internal/operators/throttle.ts", "node_modules/rxjs/src/internal/operators/throttleTime.ts", "node_modules/rxjs/src/internal/operators/withLatestFrom.ts", "node_modules/rxjs/src/internal/operators/zip.ts", "node_modules/rxjs/src/internal/operators/zipWith.ts", "src/templates/assets/javascripts/browser/document/index.ts", "src/templates/assets/javascripts/browser/element/_/index.ts", 
"src/templates/assets/javascripts/browser/element/focus/index.ts", "src/templates/assets/javascripts/browser/element/hover/index.ts", "src/templates/assets/javascripts/utilities/h/index.ts", "src/templates/assets/javascripts/utilities/round/index.ts", "src/templates/assets/javascripts/browser/script/index.ts", "src/templates/assets/javascripts/browser/element/size/_/index.ts", "src/templates/assets/javascripts/browser/element/size/content/index.ts", "src/templates/assets/javascripts/browser/element/offset/_/index.ts", "src/templates/assets/javascripts/browser/element/offset/content/index.ts", "src/templates/assets/javascripts/browser/element/visibility/index.ts", "src/templates/assets/javascripts/browser/toggle/index.ts", "src/templates/assets/javascripts/browser/keyboard/index.ts", "src/templates/assets/javascripts/browser/location/_/index.ts", "src/templates/assets/javascripts/browser/location/hash/index.ts", "src/templates/assets/javascripts/browser/media/index.ts", "src/templates/assets/javascripts/browser/request/index.ts", "src/templates/assets/javascripts/browser/viewport/offset/index.ts", "src/templates/assets/javascripts/browser/viewport/size/index.ts", "src/templates/assets/javascripts/browser/viewport/_/index.ts", "src/templates/assets/javascripts/browser/viewport/at/index.ts", "src/templates/assets/javascripts/browser/worker/index.ts", "src/templates/assets/javascripts/_/index.ts", "src/templates/assets/javascripts/components/_/index.ts", "src/templates/assets/javascripts/components/announce/index.ts", "src/templates/assets/javascripts/components/consent/index.ts", "src/templates/assets/javascripts/templates/tooltip/index.tsx", "src/templates/assets/javascripts/templates/annotation/index.tsx", "src/templates/assets/javascripts/templates/clipboard/index.tsx", "src/templates/assets/javascripts/templates/search/index.tsx", "src/templates/assets/javascripts/templates/source/index.tsx", "src/templates/assets/javascripts/templates/tabbed/index.tsx", 
"src/templates/assets/javascripts/templates/table/index.tsx", "src/templates/assets/javascripts/templates/version/index.tsx", "src/templates/assets/javascripts/components/tooltip2/index.ts", "src/templates/assets/javascripts/components/content/annotation/_/index.ts", "src/templates/assets/javascripts/components/content/annotation/list/index.ts", "src/templates/assets/javascripts/components/content/annotation/block/index.ts", "src/templates/assets/javascripts/components/content/code/_/index.ts", "src/templates/assets/javascripts/components/content/details/index.ts", "src/templates/assets/javascripts/components/content/mermaid/index.css", "src/templates/assets/javascripts/components/content/mermaid/index.ts", "src/templates/assets/javascripts/components/content/table/index.ts", "src/templates/assets/javascripts/components/content/tabs/index.ts", "src/templates/assets/javascripts/components/content/_/index.ts", "src/templates/assets/javascripts/components/dialog/index.ts", "src/templates/assets/javascripts/components/tooltip/index.ts", "src/templates/assets/javascripts/components/header/_/index.ts", "src/templates/assets/javascripts/components/header/title/index.ts", "src/templates/assets/javascripts/components/main/index.ts", "src/templates/assets/javascripts/components/palette/index.ts", "src/templates/assets/javascripts/components/progress/index.ts", "src/templates/assets/javascripts/integrations/clipboard/index.ts", "src/templates/assets/javascripts/integrations/sitemap/index.ts", "src/templates/assets/javascripts/integrations/instant/index.ts", "src/templates/assets/javascripts/integrations/search/highlighter/index.ts", "src/templates/assets/javascripts/integrations/search/worker/message/index.ts", "src/templates/assets/javascripts/integrations/search/worker/_/index.ts", "src/templates/assets/javascripts/integrations/version/findurl/index.ts", "src/templates/assets/javascripts/integrations/version/index.ts", 
"src/templates/assets/javascripts/components/search/query/index.ts", "src/templates/assets/javascripts/components/search/result/index.ts", "src/templates/assets/javascripts/components/search/share/index.ts", "src/templates/assets/javascripts/components/search/suggest/index.ts", "src/templates/assets/javascripts/components/search/_/index.ts", "src/templates/assets/javascripts/components/search/highlight/index.ts", "src/templates/assets/javascripts/components/sidebar/index.ts", "src/templates/assets/javascripts/components/source/facts/github/index.ts", "src/templates/assets/javascripts/components/source/facts/gitlab/index.ts", "src/templates/assets/javascripts/components/source/facts/_/index.ts", "src/templates/assets/javascripts/components/source/_/index.ts", "src/templates/assets/javascripts/components/tabs/index.ts", "src/templates/assets/javascripts/components/toc/index.ts", "src/templates/assets/javascripts/components/top/index.ts", "src/templates/assets/javascripts/patches/ellipsis/index.ts", "src/templates/assets/javascripts/patches/indeterminate/index.ts", "src/templates/assets/javascripts/patches/scrollfix/index.ts", "src/templates/assets/javascripts/patches/scrolllock/index.ts", "src/templates/assets/javascripts/polyfills/index.ts"], + "sourcesContent": ["(function (global, factory) {\n typeof exports === 'object' && typeof module !== 'undefined' ? factory() :\n typeof define === 'function' && define.amd ? 
define(factory) :\n (factory());\n}(this, (function () { 'use strict';\n\n /**\n * Applies the :focus-visible polyfill at the given scope.\n * A scope in this case is either the top-level Document or a Shadow Root.\n *\n * @param {(Document|ShadowRoot)} scope\n * @see https://github.com/WICG/focus-visible\n */\n function applyFocusVisiblePolyfill(scope) {\n var hadKeyboardEvent = true;\n var hadFocusVisibleRecently = false;\n var hadFocusVisibleRecentlyTimeout = null;\n\n var inputTypesAllowlist = {\n text: true,\n search: true,\n url: true,\n tel: true,\n email: true,\n password: true,\n number: true,\n date: true,\n month: true,\n week: true,\n time: true,\n datetime: true,\n 'datetime-local': true\n };\n\n /**\n * Helper function for legacy browsers and iframes which sometimes focus\n * elements like document, body, and non-interactive SVG.\n * @param {Element} el\n */\n function isValidFocusTarget(el) {\n if (\n el &&\n el !== document &&\n el.nodeName !== 'HTML' &&\n el.nodeName !== 'BODY' &&\n 'classList' in el &&\n 'contains' in el.classList\n ) {\n return true;\n }\n return false;\n }\n\n /**\n * Computes whether the given element should automatically trigger the\n * `focus-visible` class being added, i.e. 
whether it should always match\n * `:focus-visible` when focused.\n * @param {Element} el\n * @return {boolean}\n */\n function focusTriggersKeyboardModality(el) {\n var type = el.type;\n var tagName = el.tagName;\n\n if (tagName === 'INPUT' && inputTypesAllowlist[type] && !el.readOnly) {\n return true;\n }\n\n if (tagName === 'TEXTAREA' && !el.readOnly) {\n return true;\n }\n\n if (el.isContentEditable) {\n return true;\n }\n\n return false;\n }\n\n /**\n * Add the `focus-visible` class to the given element if it was not added by\n * the author.\n * @param {Element} el\n */\n function addFocusVisibleClass(el) {\n if (el.classList.contains('focus-visible')) {\n return;\n }\n el.classList.add('focus-visible');\n el.setAttribute('data-focus-visible-added', '');\n }\n\n /**\n * Remove the `focus-visible` class from the given element if it was not\n * originally added by the author.\n * @param {Element} el\n */\n function removeFocusVisibleClass(el) {\n if (!el.hasAttribute('data-focus-visible-added')) {\n return;\n }\n el.classList.remove('focus-visible');\n el.removeAttribute('data-focus-visible-added');\n }\n\n /**\n * If the most recent user interaction was via the keyboard;\n * and the key press did not include a meta, alt/option, or control key;\n * then the modality is keyboard. 
Otherwise, the modality is not keyboard.\n * Apply `focus-visible` to any current active element and keep track\n * of our keyboard modality state with `hadKeyboardEvent`.\n * @param {KeyboardEvent} e\n */\n function onKeyDown(e) {\n if (e.metaKey || e.altKey || e.ctrlKey) {\n return;\n }\n\n if (isValidFocusTarget(scope.activeElement)) {\n addFocusVisibleClass(scope.activeElement);\n }\n\n hadKeyboardEvent = true;\n }\n\n /**\n * If at any point a user clicks with a pointing device, ensure that we change\n * the modality away from keyboard.\n * This avoids the situation where a user presses a key on an already focused\n * element, and then clicks on a different element, focusing it with a\n * pointing device, while we still think we're in keyboard modality.\n * @param {Event} e\n */\n function onPointerDown(e) {\n hadKeyboardEvent = false;\n }\n\n /**\n * On `focus`, add the `focus-visible` class to the target if:\n * - the target received focus as a result of keyboard navigation, or\n * - the event target is an element that will likely require interaction\n * via the keyboard (e.g. 
a text box)\n * @param {Event} e\n */\n function onFocus(e) {\n // Prevent IE from focusing the document or HTML element.\n if (!isValidFocusTarget(e.target)) {\n return;\n }\n\n if (hadKeyboardEvent || focusTriggersKeyboardModality(e.target)) {\n addFocusVisibleClass(e.target);\n }\n }\n\n /**\n * On `blur`, remove the `focus-visible` class from the target.\n * @param {Event} e\n */\n function onBlur(e) {\n if (!isValidFocusTarget(e.target)) {\n return;\n }\n\n if (\n e.target.classList.contains('focus-visible') ||\n e.target.hasAttribute('data-focus-visible-added')\n ) {\n // To detect a tab/window switch, we look for a blur event followed\n // rapidly by a visibility change.\n // If we don't see a visibility change within 100ms, it's probably a\n // regular focus change.\n hadFocusVisibleRecently = true;\n window.clearTimeout(hadFocusVisibleRecentlyTimeout);\n hadFocusVisibleRecentlyTimeout = window.setTimeout(function() {\n hadFocusVisibleRecently = false;\n }, 100);\n removeFocusVisibleClass(e.target);\n }\n }\n\n /**\n * If the user changes tabs, keep track of whether or not the previously\n * focused element had .focus-visible.\n * @param {Event} e\n */\n function onVisibilityChange(e) {\n if (document.visibilityState === 'hidden') {\n // If the tab becomes active again, the browser will handle calling focus\n // on the element (Safari actually calls it twice).\n // If this tab change caused a blur on an element with focus-visible,\n // re-apply the class when the user switches back to the tab.\n if (hadFocusVisibleRecently) {\n hadKeyboardEvent = true;\n }\n addInitialPointerMoveListeners();\n }\n }\n\n /**\n * Add a group of listeners to detect usage of any pointing devices.\n * These listeners will be added when the polyfill first loads, and anytime\n * the window is blurred, so that they are active when the window regains\n * focus.\n */\n function addInitialPointerMoveListeners() {\n document.addEventListener('mousemove', onInitialPointerMove);\n 
document.addEventListener('mousedown', onInitialPointerMove);\n document.addEventListener('mouseup', onInitialPointerMove);\n document.addEventListener('pointermove', onInitialPointerMove);\n document.addEventListener('pointerdown', onInitialPointerMove);\n document.addEventListener('pointerup', onInitialPointerMove);\n document.addEventListener('touchmove', onInitialPointerMove);\n document.addEventListener('touchstart', onInitialPointerMove);\n document.addEventListener('touchend', onInitialPointerMove);\n }\n\n function removeInitialPointerMoveListeners() {\n document.removeEventListener('mousemove', onInitialPointerMove);\n document.removeEventListener('mousedown', onInitialPointerMove);\n document.removeEventListener('mouseup', onInitialPointerMove);\n document.removeEventListener('pointermove', onInitialPointerMove);\n document.removeEventListener('pointerdown', onInitialPointerMove);\n document.removeEventListener('pointerup', onInitialPointerMove);\n document.removeEventListener('touchmove', onInitialPointerMove);\n document.removeEventListener('touchstart', onInitialPointerMove);\n document.removeEventListener('touchend', onInitialPointerMove);\n }\n\n /**\n * When the polfyill first loads, assume the user is in keyboard modality.\n * If any event is received from a pointing device (e.g. mouse, pointer,\n * touch), turn off keyboard modality.\n * This accounts for situations where focus enters the page from the URL bar.\n * @param {Event} e\n */\n function onInitialPointerMove(e) {\n // Work around a Safari quirk that fires a mousemove on whenever the\n // window blurs, even if you're tabbing out of the page. \u00AF\\_(\u30C4)_/\u00AF\n if (e.target.nodeName && e.target.nodeName.toLowerCase() === 'html') {\n return;\n }\n\n hadKeyboardEvent = false;\n removeInitialPointerMoveListeners();\n }\n\n // For some kinds of state, we are interested in changes at the global scope\n // only. 
For example, global pointer input, global key presses and global\n // visibility change should affect the state at every scope:\n document.addEventListener('keydown', onKeyDown, true);\n document.addEventListener('mousedown', onPointerDown, true);\n document.addEventListener('pointerdown', onPointerDown, true);\n document.addEventListener('touchstart', onPointerDown, true);\n document.addEventListener('visibilitychange', onVisibilityChange, true);\n\n addInitialPointerMoveListeners();\n\n // For focus and blur, we specifically care about state changes in the local\n // scope. This is because focus / blur events that originate from within a\n // shadow root are not re-dispatched from the host element if it was already\n // the active element in its own scope:\n scope.addEventListener('focus', onFocus, true);\n scope.addEventListener('blur', onBlur, true);\n\n // We detect that a node is a ShadowRoot by ensuring that it is a\n // DocumentFragment and also has a host property. This check covers native\n // implementation and polyfill implementation transparently. If we only cared\n // about the native implementation, we could just check if the scope was\n // an instance of a ShadowRoot.\n if (scope.nodeType === Node.DOCUMENT_FRAGMENT_NODE && scope.host) {\n // Since a ShadowRoot is a special kind of DocumentFragment, it does not\n // have a root element to add a class to. 
So, we add this attribute to the\n // host element instead:\n scope.host.setAttribute('data-js-focus-visible', '');\n } else if (scope.nodeType === Node.DOCUMENT_NODE) {\n document.documentElement.classList.add('js-focus-visible');\n document.documentElement.setAttribute('data-js-focus-visible', '');\n }\n }\n\n // It is important to wrap all references to global window and document in\n // these checks to support server-side rendering use cases\n // @see https://github.com/WICG/focus-visible/issues/199\n if (typeof window !== 'undefined' && typeof document !== 'undefined') {\n // Make the polyfill helper globally available. This can be used as a signal\n // to interested libraries that wish to coordinate with the polyfill for e.g.,\n // applying the polyfill to a shadow root:\n window.applyFocusVisiblePolyfill = applyFocusVisiblePolyfill;\n\n // Notify interested libraries of the polyfill's presence, in case the\n // polyfill was loaded lazily:\n var event;\n\n try {\n event = new CustomEvent('focus-visible-polyfill-ready');\n } catch (error) {\n // IE11 does not support using CustomEvent as a constructor directly:\n event = document.createEvent('CustomEvent');\n event.initCustomEvent('focus-visible-polyfill-ready', false, false, {});\n }\n\n window.dispatchEvent(event);\n }\n\n if (typeof document !== 'undefined') {\n // Apply the polyfill to the global document, so that no JavaScript\n // coordination is required to use the polyfill in the top-level document:\n applyFocusVisiblePolyfill(document);\n }\n\n})));\n", "/*!\n * escape-html\n * Copyright(c) 2012-2013 TJ Holowaychuk\n * Copyright(c) 2015 Andreas Lubbe\n * Copyright(c) 2015 Tiancheng \"Timothy\" Gu\n * MIT Licensed\n */\n\n'use strict';\n\n/**\n * Module variables.\n * @private\n */\n\nvar matchHtmlRegExp = /[\"'&<>]/;\n\n/**\n * Module exports.\n * @public\n */\n\nmodule.exports = escapeHtml;\n\n/**\n * Escape special characters in the given string of html.\n *\n * @param {string} string The string to 
escape for inserting into HTML\n * @return {string}\n * @public\n */\n\nfunction escapeHtml(string) {\n var str = '' + string;\n var match = matchHtmlRegExp.exec(str);\n\n if (!match) {\n return str;\n }\n\n var escape;\n var html = '';\n var index = 0;\n var lastIndex = 0;\n\n for (index = match.index; index < str.length; index++) {\n switch (str.charCodeAt(index)) {\n case 34: // \"\n escape = '"';\n break;\n case 38: // &\n escape = '&';\n break;\n case 39: // '\n escape = ''';\n break;\n case 60: // <\n escape = '<';\n break;\n case 62: // >\n escape = '>';\n break;\n default:\n continue;\n }\n\n if (lastIndex !== index) {\n html += str.substring(lastIndex, index);\n }\n\n lastIndex = index + 1;\n html += escape;\n }\n\n return lastIndex !== index\n ? html + str.substring(lastIndex, index)\n : html;\n}\n", "/*!\n * clipboard.js v2.0.11\n * https://clipboardjs.com/\n *\n * Licensed MIT \u00A9 Zeno Rocha\n */\n(function webpackUniversalModuleDefinition(root, factory) {\n\tif(typeof exports === 'object' && typeof module === 'object')\n\t\tmodule.exports = factory();\n\telse if(typeof define === 'function' && define.amd)\n\t\tdefine([], factory);\n\telse if(typeof exports === 'object')\n\t\texports[\"ClipboardJS\"] = factory();\n\telse\n\t\troot[\"ClipboardJS\"] = factory();\n})(this, function() {\nreturn /******/ (function() { // webpackBootstrap\n/******/ \tvar __webpack_modules__ = ({\n\n/***/ 686:\n/***/ (function(__unused_webpack_module, __webpack_exports__, __webpack_require__) {\n\n\"use strict\";\n\n// EXPORTS\n__webpack_require__.d(__webpack_exports__, {\n \"default\": function() { return /* binding */ clipboard; }\n});\n\n// EXTERNAL MODULE: ./node_modules/tiny-emitter/index.js\nvar tiny_emitter = __webpack_require__(279);\nvar tiny_emitter_default = /*#__PURE__*/__webpack_require__.n(tiny_emitter);\n// EXTERNAL MODULE: ./node_modules/good-listener/src/listen.js\nvar listen = __webpack_require__(370);\nvar listen_default = 
/*#__PURE__*/__webpack_require__.n(listen);\n// EXTERNAL MODULE: ./node_modules/select/src/select.js\nvar src_select = __webpack_require__(817);\nvar select_default = /*#__PURE__*/__webpack_require__.n(src_select);\n;// CONCATENATED MODULE: ./src/common/command.js\n/**\n * Executes a given operation type.\n * @param {String} type\n * @return {Boolean}\n */\nfunction command(type) {\n try {\n return document.execCommand(type);\n } catch (err) {\n return false;\n }\n}\n;// CONCATENATED MODULE: ./src/actions/cut.js\n\n\n/**\n * Cut action wrapper.\n * @param {String|HTMLElement} target\n * @return {String}\n */\n\nvar ClipboardActionCut = function ClipboardActionCut(target) {\n var selectedText = select_default()(target);\n command('cut');\n return selectedText;\n};\n\n/* harmony default export */ var actions_cut = (ClipboardActionCut);\n;// CONCATENATED MODULE: ./src/common/create-fake-element.js\n/**\n * Creates a fake textarea element with a value.\n * @param {String} value\n * @return {HTMLElement}\n */\nfunction createFakeElement(value) {\n var isRTL = document.documentElement.getAttribute('dir') === 'rtl';\n var fakeElement = document.createElement('textarea'); // Prevent zooming on iOS\n\n fakeElement.style.fontSize = '12pt'; // Reset box model\n\n fakeElement.style.border = '0';\n fakeElement.style.padding = '0';\n fakeElement.style.margin = '0'; // Move element out of screen horizontally\n\n fakeElement.style.position = 'absolute';\n fakeElement.style[isRTL ? 
'right' : 'left'] = '-9999px'; // Move element to the same position vertically\n\n var yPosition = window.pageYOffset || document.documentElement.scrollTop;\n fakeElement.style.top = \"\".concat(yPosition, \"px\");\n fakeElement.setAttribute('readonly', '');\n fakeElement.value = value;\n return fakeElement;\n}\n;// CONCATENATED MODULE: ./src/actions/copy.js\n\n\n\n/**\n * Create fake copy action wrapper using a fake element.\n * @param {String} target\n * @param {Object} options\n * @return {String}\n */\n\nvar fakeCopyAction = function fakeCopyAction(value, options) {\n var fakeElement = createFakeElement(value);\n options.container.appendChild(fakeElement);\n var selectedText = select_default()(fakeElement);\n command('copy');\n fakeElement.remove();\n return selectedText;\n};\n/**\n * Copy action wrapper.\n * @param {String|HTMLElement} target\n * @param {Object} options\n * @return {String}\n */\n\n\nvar ClipboardActionCopy = function ClipboardActionCopy(target) {\n var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {\n container: document.body\n };\n var selectedText = '';\n\n if (typeof target === 'string') {\n selectedText = fakeCopyAction(target, options);\n } else if (target instanceof HTMLInputElement && !['text', 'search', 'url', 'tel', 'password'].includes(target === null || target === void 0 ? void 0 : target.type)) {\n // If input type doesn't support `setSelectionRange`. Simulate it. 
https://developer.mozilla.org/en-US/docs/Web/API/HTMLInputElement/setSelectionRange\n selectedText = fakeCopyAction(target.value, options);\n } else {\n selectedText = select_default()(target);\n command('copy');\n }\n\n return selectedText;\n};\n\n/* harmony default export */ var actions_copy = (ClipboardActionCopy);\n;// CONCATENATED MODULE: ./src/actions/default.js\nfunction _typeof(obj) { \"@babel/helpers - typeof\"; if (typeof Symbol === \"function\" && typeof Symbol.iterator === \"symbol\") { _typeof = function _typeof(obj) { return typeof obj; }; } else { _typeof = function _typeof(obj) { return obj && typeof Symbol === \"function\" && obj.constructor === Symbol && obj !== Symbol.prototype ? \"symbol\" : typeof obj; }; } return _typeof(obj); }\n\n\n\n/**\n * Inner function which performs selection from either `text` or `target`\n * properties and then executes copy or cut operations.\n * @param {Object} options\n */\n\nvar ClipboardActionDefault = function ClipboardActionDefault() {\n var options = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {};\n // Defines base properties passed from constructor.\n var _options$action = options.action,\n action = _options$action === void 0 ? 'copy' : _options$action,\n container = options.container,\n target = options.target,\n text = options.text; // Sets the `action` to be performed which can be either 'copy' or 'cut'.\n\n if (action !== 'copy' && action !== 'cut') {\n throw new Error('Invalid \"action\" value, use either \"copy\" or \"cut\"');\n } // Sets the `target` property using an element that will be have its content copied.\n\n\n if (target !== undefined) {\n if (target && _typeof(target) === 'object' && target.nodeType === 1) {\n if (action === 'copy' && target.hasAttribute('disabled')) {\n throw new Error('Invalid \"target\" attribute. 
Please use \"readonly\" instead of \"disabled\" attribute');\n }\n\n if (action === 'cut' && (target.hasAttribute('readonly') || target.hasAttribute('disabled'))) {\n throw new Error('Invalid \"target\" attribute. You can\\'t cut text from elements with \"readonly\" or \"disabled\" attributes');\n }\n } else {\n throw new Error('Invalid \"target\" value, use a valid Element');\n }\n } // Define selection strategy based on `text` property.\n\n\n if (text) {\n return actions_copy(text, {\n container: container\n });\n } // Defines which selection strategy based on `target` property.\n\n\n if (target) {\n return action === 'cut' ? actions_cut(target) : actions_copy(target, {\n container: container\n });\n }\n};\n\n/* harmony default export */ var actions_default = (ClipboardActionDefault);\n;// CONCATENATED MODULE: ./src/clipboard.js\nfunction clipboard_typeof(obj) { \"@babel/helpers - typeof\"; if (typeof Symbol === \"function\" && typeof Symbol.iterator === \"symbol\") { clipboard_typeof = function _typeof(obj) { return typeof obj; }; } else { clipboard_typeof = function _typeof(obj) { return obj && typeof Symbol === \"function\" && obj.constructor === Symbol && obj !== Symbol.prototype ? 
\"symbol\" : typeof obj; }; } return clipboard_typeof(obj); }\n\nfunction _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError(\"Cannot call a class as a function\"); } }\n\nfunction _defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if (\"value\" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } }\n\nfunction _createClass(Constructor, protoProps, staticProps) { if (protoProps) _defineProperties(Constructor.prototype, protoProps); if (staticProps) _defineProperties(Constructor, staticProps); return Constructor; }\n\nfunction _inherits(subClass, superClass) { if (typeof superClass !== \"function\" && superClass !== null) { throw new TypeError(\"Super expression must either be null or a function\"); } subClass.prototype = Object.create(superClass && superClass.prototype, { constructor: { value: subClass, writable: true, configurable: true } }); if (superClass) _setPrototypeOf(subClass, superClass); }\n\nfunction _setPrototypeOf(o, p) { _setPrototypeOf = Object.setPrototypeOf || function _setPrototypeOf(o, p) { o.__proto__ = p; return o; }; return _setPrototypeOf(o, p); }\n\nfunction _createSuper(Derived) { var hasNativeReflectConstruct = _isNativeReflectConstruct(); return function _createSuperInternal() { var Super = _getPrototypeOf(Derived), result; if (hasNativeReflectConstruct) { var NewTarget = _getPrototypeOf(this).constructor; result = Reflect.construct(Super, arguments, NewTarget); } else { result = Super.apply(this, arguments); } return _possibleConstructorReturn(this, result); }; }\n\nfunction _possibleConstructorReturn(self, call) { if (call && (clipboard_typeof(call) === \"object\" || typeof call === \"function\")) { return call; } return _assertThisInitialized(self); }\n\nfunction _assertThisInitialized(self) { if 
(self === void 0) { throw new ReferenceError(\"this hasn't been initialised - super() hasn't been called\"); } return self; }\n\nfunction _isNativeReflectConstruct() { if (typeof Reflect === \"undefined\" || !Reflect.construct) return false; if (Reflect.construct.sham) return false; if (typeof Proxy === \"function\") return true; try { Date.prototype.toString.call(Reflect.construct(Date, [], function () {})); return true; } catch (e) { return false; } }\n\nfunction _getPrototypeOf(o) { _getPrototypeOf = Object.setPrototypeOf ? Object.getPrototypeOf : function _getPrototypeOf(o) { return o.__proto__ || Object.getPrototypeOf(o); }; return _getPrototypeOf(o); }\n\n\n\n\n\n\n/**\n * Helper function to retrieve attribute value.\n * @param {String} suffix\n * @param {Element} element\n */\n\nfunction getAttributeValue(suffix, element) {\n var attribute = \"data-clipboard-\".concat(suffix);\n\n if (!element.hasAttribute(attribute)) {\n return;\n }\n\n return element.getAttribute(attribute);\n}\n/**\n * Base class which takes one or more elements, adds event listeners to them,\n * and instantiates a new `ClipboardAction` on each click.\n */\n\n\nvar Clipboard = /*#__PURE__*/function (_Emitter) {\n _inherits(Clipboard, _Emitter);\n\n var _super = _createSuper(Clipboard);\n\n /**\n * @param {String|HTMLElement|HTMLCollection|NodeList} trigger\n * @param {Object} options\n */\n function Clipboard(trigger, options) {\n var _this;\n\n _classCallCheck(this, Clipboard);\n\n _this = _super.call(this);\n\n _this.resolveOptions(options);\n\n _this.listenClick(trigger);\n\n return _this;\n }\n /**\n * Defines if attributes would be resolved using internal setter functions\n * or custom functions that were passed in the constructor.\n * @param {Object} options\n */\n\n\n _createClass(Clipboard, [{\n key: \"resolveOptions\",\n value: function resolveOptions() {\n var options = arguments.length > 0 && arguments[0] !== undefined ? 
arguments[0] : {};\n this.action = typeof options.action === 'function' ? options.action : this.defaultAction;\n this.target = typeof options.target === 'function' ? options.target : this.defaultTarget;\n this.text = typeof options.text === 'function' ? options.text : this.defaultText;\n this.container = clipboard_typeof(options.container) === 'object' ? options.container : document.body;\n }\n /**\n * Adds a click event listener to the passed trigger.\n * @param {String|HTMLElement|HTMLCollection|NodeList} trigger\n */\n\n }, {\n key: \"listenClick\",\n value: function listenClick(trigger) {\n var _this2 = this;\n\n this.listener = listen_default()(trigger, 'click', function (e) {\n return _this2.onClick(e);\n });\n }\n /**\n * Defines a new `ClipboardAction` on each click event.\n * @param {Event} e\n */\n\n }, {\n key: \"onClick\",\n value: function onClick(e) {\n var trigger = e.delegateTarget || e.currentTarget;\n var action = this.action(trigger) || 'copy';\n var text = actions_default({\n action: action,\n container: this.container,\n target: this.target(trigger),\n text: this.text(trigger)\n }); // Fires an event based on the copy operation result.\n\n this.emit(text ? 
'success' : 'error', {\n action: action,\n text: text,\n trigger: trigger,\n clearSelection: function clearSelection() {\n if (trigger) {\n trigger.focus();\n }\n\n window.getSelection().removeAllRanges();\n }\n });\n }\n /**\n * Default `action` lookup function.\n * @param {Element} trigger\n */\n\n }, {\n key: \"defaultAction\",\n value: function defaultAction(trigger) {\n return getAttributeValue('action', trigger);\n }\n /**\n * Default `target` lookup function.\n * @param {Element} trigger\n */\n\n }, {\n key: \"defaultTarget\",\n value: function defaultTarget(trigger) {\n var selector = getAttributeValue('target', trigger);\n\n if (selector) {\n return document.querySelector(selector);\n }\n }\n /**\n * Allow fire programmatically a copy action\n * @param {String|HTMLElement} target\n * @param {Object} options\n * @returns Text copied.\n */\n\n }, {\n key: \"defaultText\",\n\n /**\n * Default `text` lookup function.\n * @param {Element} trigger\n */\n value: function defaultText(trigger) {\n return getAttributeValue('text', trigger);\n }\n /**\n * Destroy lifecycle.\n */\n\n }, {\n key: \"destroy\",\n value: function destroy() {\n this.listener.destroy();\n }\n }], [{\n key: \"copy\",\n value: function copy(target) {\n var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {\n container: document.body\n };\n return actions_copy(target, options);\n }\n /**\n * Allow fire programmatically a cut action\n * @param {String|HTMLElement} target\n * @returns Text cutted.\n */\n\n }, {\n key: \"cut\",\n value: function cut(target) {\n return actions_cut(target);\n }\n /**\n * Returns the support of the given action, or all actions if no action is\n * given.\n * @param {String} [action]\n */\n\n }, {\n key: \"isSupported\",\n value: function isSupported() {\n var action = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : ['copy', 'cut'];\n var actions = typeof action === 'string' ? 
[action] : action;\n var support = !!document.queryCommandSupported;\n actions.forEach(function (action) {\n support = support && !!document.queryCommandSupported(action);\n });\n return support;\n }\n }]);\n\n return Clipboard;\n}((tiny_emitter_default()));\n\n/* harmony default export */ var clipboard = (Clipboard);\n\n/***/ }),\n\n/***/ 828:\n/***/ (function(module) {\n\nvar DOCUMENT_NODE_TYPE = 9;\n\n/**\n * A polyfill for Element.matches()\n */\nif (typeof Element !== 'undefined' && !Element.prototype.matches) {\n var proto = Element.prototype;\n\n proto.matches = proto.matchesSelector ||\n proto.mozMatchesSelector ||\n proto.msMatchesSelector ||\n proto.oMatchesSelector ||\n proto.webkitMatchesSelector;\n}\n\n/**\n * Finds the closest parent that matches a selector.\n *\n * @param {Element} element\n * @param {String} selector\n * @return {Function}\n */\nfunction closest (element, selector) {\n while (element && element.nodeType !== DOCUMENT_NODE_TYPE) {\n if (typeof element.matches === 'function' &&\n element.matches(selector)) {\n return element;\n }\n element = element.parentNode;\n }\n}\n\nmodule.exports = closest;\n\n\n/***/ }),\n\n/***/ 438:\n/***/ (function(module, __unused_webpack_exports, __webpack_require__) {\n\nvar closest = __webpack_require__(828);\n\n/**\n * Delegates event to a selector.\n *\n * @param {Element} element\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @param {Boolean} useCapture\n * @return {Object}\n */\nfunction _delegate(element, selector, type, callback, useCapture) {\n var listenerFn = listener.apply(this, arguments);\n\n element.addEventListener(type, listenerFn, useCapture);\n\n return {\n destroy: function() {\n element.removeEventListener(type, listenerFn, useCapture);\n }\n }\n}\n\n/**\n * Delegates event to a selector.\n *\n * @param {Element|String|Array} [elements]\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @param {Boolean} 
useCapture\n * @return {Object}\n */\nfunction delegate(elements, selector, type, callback, useCapture) {\n // Handle the regular Element usage\n if (typeof elements.addEventListener === 'function') {\n return _delegate.apply(null, arguments);\n }\n\n // Handle Element-less usage, it defaults to global delegation\n if (typeof type === 'function') {\n // Use `document` as the first parameter, then apply arguments\n // This is a short way to .unshift `arguments` without running into deoptimizations\n return _delegate.bind(null, document).apply(null, arguments);\n }\n\n // Handle Selector-based usage\n if (typeof elements === 'string') {\n elements = document.querySelectorAll(elements);\n }\n\n // Handle Array-like based usage\n return Array.prototype.map.call(elements, function (element) {\n return _delegate(element, selector, type, callback, useCapture);\n });\n}\n\n/**\n * Finds closest match and invokes callback.\n *\n * @param {Element} element\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @return {Function}\n */\nfunction listener(element, selector, type, callback) {\n return function(e) {\n e.delegateTarget = closest(e.target, selector);\n\n if (e.delegateTarget) {\n callback.call(element, e);\n }\n }\n}\n\nmodule.exports = delegate;\n\n\n/***/ }),\n\n/***/ 879:\n/***/ (function(__unused_webpack_module, exports) {\n\n/**\n * Check if argument is a HTML element.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.node = function(value) {\n return value !== undefined\n && value instanceof HTMLElement\n && value.nodeType === 1;\n};\n\n/**\n * Check if argument is a list of HTML elements.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.nodeList = function(value) {\n var type = Object.prototype.toString.call(value);\n\n return value !== undefined\n && (type === '[object NodeList]' || type === '[object HTMLCollection]')\n && ('length' in value)\n && (value.length === 0 || 
exports.node(value[0]));\n};\n\n/**\n * Check if argument is a string.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.string = function(value) {\n return typeof value === 'string'\n || value instanceof String;\n};\n\n/**\n * Check if argument is a function.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.fn = function(value) {\n var type = Object.prototype.toString.call(value);\n\n return type === '[object Function]';\n};\n\n\n/***/ }),\n\n/***/ 370:\n/***/ (function(module, __unused_webpack_exports, __webpack_require__) {\n\nvar is = __webpack_require__(879);\nvar delegate = __webpack_require__(438);\n\n/**\n * Validates all params and calls the right\n * listener function based on its target type.\n *\n * @param {String|HTMLElement|HTMLCollection|NodeList} target\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listen(target, type, callback) {\n if (!target && !type && !callback) {\n throw new Error('Missing required arguments');\n }\n\n if (!is.string(type)) {\n throw new TypeError('Second argument must be a String');\n }\n\n if (!is.fn(callback)) {\n throw new TypeError('Third argument must be a Function');\n }\n\n if (is.node(target)) {\n return listenNode(target, type, callback);\n }\n else if (is.nodeList(target)) {\n return listenNodeList(target, type, callback);\n }\n else if (is.string(target)) {\n return listenSelector(target, type, callback);\n }\n else {\n throw new TypeError('First argument must be a String, HTMLElement, HTMLCollection, or NodeList');\n }\n}\n\n/**\n * Adds an event listener to a HTML element\n * and returns a remove listener function.\n *\n * @param {HTMLElement} node\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenNode(node, type, callback) {\n node.addEventListener(type, callback);\n\n return {\n destroy: function() {\n node.removeEventListener(type, callback);\n }\n }\n}\n\n/**\n * Add an event listener 
to a list of HTML elements\n * and returns a remove listener function.\n *\n * @param {NodeList|HTMLCollection} nodeList\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenNodeList(nodeList, type, callback) {\n Array.prototype.forEach.call(nodeList, function(node) {\n node.addEventListener(type, callback);\n });\n\n return {\n destroy: function() {\n Array.prototype.forEach.call(nodeList, function(node) {\n node.removeEventListener(type, callback);\n });\n }\n }\n}\n\n/**\n * Add an event listener to a selector\n * and returns a remove listener function.\n *\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenSelector(selector, type, callback) {\n return delegate(document.body, selector, type, callback);\n}\n\nmodule.exports = listen;\n\n\n/***/ }),\n\n/***/ 817:\n/***/ (function(module) {\n\nfunction select(element) {\n var selectedText;\n\n if (element.nodeName === 'SELECT') {\n element.focus();\n\n selectedText = element.value;\n }\n else if (element.nodeName === 'INPUT' || element.nodeName === 'TEXTAREA') {\n var isReadOnly = element.hasAttribute('readonly');\n\n if (!isReadOnly) {\n element.setAttribute('readonly', '');\n }\n\n element.select();\n element.setSelectionRange(0, element.value.length);\n\n if (!isReadOnly) {\n element.removeAttribute('readonly');\n }\n\n selectedText = element.value;\n }\n else {\n if (element.hasAttribute('contenteditable')) {\n element.focus();\n }\n\n var selection = window.getSelection();\n var range = document.createRange();\n\n range.selectNodeContents(element);\n selection.removeAllRanges();\n selection.addRange(range);\n\n selectedText = selection.toString();\n }\n\n return selectedText;\n}\n\nmodule.exports = select;\n\n\n/***/ }),\n\n/***/ 279:\n/***/ (function(module) {\n\nfunction E () {\n // Keep this empty so it's easier to inherit from\n // (via https://github.com/lipsmack from 
https://github.com/scottcorgan/tiny-emitter/issues/3)\n}\n\nE.prototype = {\n on: function (name, callback, ctx) {\n var e = this.e || (this.e = {});\n\n (e[name] || (e[name] = [])).push({\n fn: callback,\n ctx: ctx\n });\n\n return this;\n },\n\n once: function (name, callback, ctx) {\n var self = this;\n function listener () {\n self.off(name, listener);\n callback.apply(ctx, arguments);\n };\n\n listener._ = callback\n return this.on(name, listener, ctx);\n },\n\n emit: function (name) {\n var data = [].slice.call(arguments, 1);\n var evtArr = ((this.e || (this.e = {}))[name] || []).slice();\n var i = 0;\n var len = evtArr.length;\n\n for (i; i < len; i++) {\n evtArr[i].fn.apply(evtArr[i].ctx, data);\n }\n\n return this;\n },\n\n off: function (name, callback) {\n var e = this.e || (this.e = {});\n var evts = e[name];\n var liveEvents = [];\n\n if (evts && callback) {\n for (var i = 0, len = evts.length; i < len; i++) {\n if (evts[i].fn !== callback && evts[i].fn._ !== callback)\n liveEvents.push(evts[i]);\n }\n }\n\n // Remove event from queue to prevent memory leak\n // Suggested by https://github.com/lazd\n // Ref: https://github.com/scottcorgan/tiny-emitter/commit/c6ebfaa9bc973b33d110a84a307742b7cf94c953#commitcomment-5024910\n\n (liveEvents.length)\n ? 
e[name] = liveEvents\n : delete e[name];\n\n return this;\n }\n};\n\nmodule.exports = E;\nmodule.exports.TinyEmitter = E;\n\n\n/***/ })\n\n/******/ \t});\n/************************************************************************/\n/******/ \t// The module cache\n/******/ \tvar __webpack_module_cache__ = {};\n/******/ \t\n/******/ \t// The require function\n/******/ \tfunction __webpack_require__(moduleId) {\n/******/ \t\t// Check if module is in cache\n/******/ \t\tif(__webpack_module_cache__[moduleId]) {\n/******/ \t\t\treturn __webpack_module_cache__[moduleId].exports;\n/******/ \t\t}\n/******/ \t\t// Create a new module (and put it into the cache)\n/******/ \t\tvar module = __webpack_module_cache__[moduleId] = {\n/******/ \t\t\t// no module.id needed\n/******/ \t\t\t// no module.loaded needed\n/******/ \t\t\texports: {}\n/******/ \t\t};\n/******/ \t\n/******/ \t\t// Execute the module function\n/******/ \t\t__webpack_modules__[moduleId](module, module.exports, __webpack_require__);\n/******/ \t\n/******/ \t\t// Return the exports of the module\n/******/ \t\treturn module.exports;\n/******/ \t}\n/******/ \t\n/************************************************************************/\n/******/ \t/* webpack/runtime/compat get default export */\n/******/ \t!function() {\n/******/ \t\t// getDefaultExport function for compatibility with non-harmony modules\n/******/ \t\t__webpack_require__.n = function(module) {\n/******/ \t\t\tvar getter = module && module.__esModule ?\n/******/ \t\t\t\tfunction() { return module['default']; } :\n/******/ \t\t\t\tfunction() { return module; };\n/******/ \t\t\t__webpack_require__.d(getter, { a: getter });\n/******/ \t\t\treturn getter;\n/******/ \t\t};\n/******/ \t}();\n/******/ \t\n/******/ \t/* webpack/runtime/define property getters */\n/******/ \t!function() {\n/******/ \t\t// define getter functions for harmony exports\n/******/ \t\t__webpack_require__.d = function(exports, definition) {\n/******/ \t\t\tfor(var key in definition) 
{\n/******/ \t\t\t\tif(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n/******/ \t\t\t\t\tObject.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n/******/ \t\t\t\t}\n/******/ \t\t\t}\n/******/ \t\t};\n/******/ \t}();\n/******/ \t\n/******/ \t/* webpack/runtime/hasOwnProperty shorthand */\n/******/ \t!function() {\n/******/ \t\t__webpack_require__.o = function(obj, prop) { return Object.prototype.hasOwnProperty.call(obj, prop); }\n/******/ \t}();\n/******/ \t\n/************************************************************************/\n/******/ \t// module exports must be returned from runtime so entry inlining is disabled\n/******/ \t// startup\n/******/ \t// Load entry module and return exports\n/******/ \treturn __webpack_require__(686);\n/******/ })()\n.default;\n});", "/*\n * Copyright (c) 2016-2024 Martin Donath \n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport \"focus-visible\"\n\nimport {\n EMPTY,\n NEVER,\n Observable,\n Subject,\n defer,\n delay,\n filter,\n map,\n merge,\n mergeWith,\n shareReplay,\n switchMap\n} from \"rxjs\"\n\nimport { configuration, feature } from \"./_\"\nimport {\n at,\n getActiveElement,\n getOptionalElement,\n requestJSON,\n setLocation,\n setToggle,\n watchDocument,\n watchKeyboard,\n watchLocation,\n watchLocationTarget,\n watchMedia,\n watchPrint,\n watchScript,\n watchViewport\n} from \"./browser\"\nimport {\n getComponentElement,\n getComponentElements,\n mountAnnounce,\n mountBackToTop,\n mountConsent,\n mountContent,\n mountDialog,\n mountHeader,\n mountHeaderTitle,\n mountPalette,\n mountProgress,\n mountSearch,\n mountSearchHiglight,\n mountSidebar,\n mountSource,\n mountTableOfContents,\n mountTabs,\n watchHeader,\n watchMain\n} from \"./components\"\nimport {\n SearchIndex,\n setupClipboardJS,\n setupInstantNavigation,\n setupVersionSelector\n} from \"./integrations\"\nimport {\n patchEllipsis,\n patchIndeterminate,\n patchScrollfix,\n patchScrolllock\n} from \"./patches\"\nimport \"./polyfills\"\n\n/* ----------------------------------------------------------------------------\n * Functions - @todo refactor\n * ------------------------------------------------------------------------- */\n\n/**\n * Fetch search index\n *\n * @returns Search index observable\n */\nfunction fetchSearchIndex(): Observable {\n if (location.protocol === \"file:\") {\n return watchScript(\n `${new URL(\"search/search_index.js\", config.base)}`\n )\n .pipe(\n // @ts-ignore - @todo fix typings\n map(() => __index),\n shareReplay(1)\n )\n } else {\n return requestJSON(\n new URL(\"search/search_index.json\", config.base)\n 
)\n }\n}\n\n/* ----------------------------------------------------------------------------\n * Application\n * ------------------------------------------------------------------------- */\n\n/* Yay, JavaScript is available */\ndocument.documentElement.classList.remove(\"no-js\")\ndocument.documentElement.classList.add(\"js\")\n\n/* Set up navigation observables and subjects */\nconst document$ = watchDocument()\nconst location$ = watchLocation()\nconst target$ = watchLocationTarget(location$)\nconst keyboard$ = watchKeyboard()\n\n/* Set up media observables */\nconst viewport$ = watchViewport()\nconst tablet$ = watchMedia(\"(min-width: 960px)\")\nconst screen$ = watchMedia(\"(min-width: 1220px)\")\nconst print$ = watchPrint()\n\n/* Retrieve search index, if search is enabled */\nconst config = configuration()\nconst index$ = document.forms.namedItem(\"search\")\n ? fetchSearchIndex()\n : NEVER\n\n/* Set up Clipboard.js integration */\nconst alert$ = new Subject()\nsetupClipboardJS({ alert$ })\n\n/* Set up progress indicator */\nconst progress$ = new Subject()\n\n/* Set up instant navigation, if enabled */\nif (feature(\"navigation.instant\"))\n setupInstantNavigation({ location$, viewport$, progress$ })\n .subscribe(document$)\n\n/* Set up version selector */\nif (config.version?.provider === \"mike\")\n setupVersionSelector({ document$ })\n\n/* Always close drawer and search on navigation */\nmerge(location$, target$)\n .pipe(\n delay(125)\n )\n .subscribe(() => {\n setToggle(\"drawer\", false)\n setToggle(\"search\", false)\n })\n\n/* Set up global keyboard handlers */\nkeyboard$\n .pipe(\n filter(({ mode }) => mode === \"global\")\n )\n .subscribe(key => {\n switch (key.type) {\n\n /* Go to previous page */\n case \"p\":\n case \",\":\n const prev = getOptionalElement(\"link[rel=prev]\")\n if (typeof prev !== \"undefined\")\n setLocation(prev)\n break\n\n /* Go to next page */\n case \"n\":\n case \".\":\n const next = getOptionalElement(\"link[rel=next]\")\n 
if (typeof next !== \"undefined\")\n setLocation(next)\n break\n\n /* Expand navigation, see https://bit.ly/3ZjG5io */\n case \"Enter\":\n const active = getActiveElement()\n if (active instanceof HTMLLabelElement)\n active.click()\n }\n })\n\n/* Set up patches */\npatchEllipsis({ viewport$, document$ })\npatchIndeterminate({ document$, tablet$ })\npatchScrollfix({ document$ })\npatchScrolllock({ viewport$, tablet$ })\n\n/* Set up header and main area observable */\nconst header$ = watchHeader(getComponentElement(\"header\"), { viewport$ })\nconst main$ = document$\n .pipe(\n map(() => getComponentElement(\"main\")),\n switchMap(el => watchMain(el, { viewport$, header$ })),\n shareReplay(1)\n )\n\n/* Set up control component observables */\nconst control$ = merge(\n\n /* Consent */\n ...getComponentElements(\"consent\")\n .map(el => mountConsent(el, { target$ })),\n\n /* Dialog */\n ...getComponentElements(\"dialog\")\n .map(el => mountDialog(el, { alert$ })),\n\n /* Color palette */\n ...getComponentElements(\"palette\")\n .map(el => mountPalette(el)),\n\n /* Progress bar */\n ...getComponentElements(\"progress\")\n .map(el => mountProgress(el, { progress$ })),\n\n /* Search */\n ...getComponentElements(\"search\")\n .map(el => mountSearch(el, { index$, keyboard$ })),\n\n /* Repository information */\n ...getComponentElements(\"source\")\n .map(el => mountSource(el))\n)\n\n/* Set up content component observables */\nconst content$ = defer(() => merge(\n\n /* Announcement bar */\n ...getComponentElements(\"announce\")\n .map(el => mountAnnounce(el)),\n\n /* Content */\n ...getComponentElements(\"content\")\n .map(el => mountContent(el, { viewport$, target$, print$ })),\n\n /* Search highlighting */\n ...getComponentElements(\"content\")\n .map(el => feature(\"search.highlight\")\n ? 
mountSearchHiglight(el, { index$, location$ })\n : EMPTY\n ),\n\n /* Header */\n ...getComponentElements(\"header\")\n .map(el => mountHeader(el, { viewport$, header$, main$ })),\n\n /* Header title */\n ...getComponentElements(\"header-title\")\n .map(el => mountHeaderTitle(el, { viewport$, header$ })),\n\n /* Sidebar */\n ...getComponentElements(\"sidebar\")\n .map(el => el.getAttribute(\"data-md-type\") === \"navigation\"\n ? at(screen$, () => mountSidebar(el, { viewport$, header$, main$ }))\n : at(tablet$, () => mountSidebar(el, { viewport$, header$, main$ }))\n ),\n\n /* Navigation tabs */\n ...getComponentElements(\"tabs\")\n .map(el => mountTabs(el, { viewport$, header$ })),\n\n /* Table of contents */\n ...getComponentElements(\"toc\")\n .map(el => mountTableOfContents(el, {\n viewport$, header$, main$, target$\n })),\n\n /* Back-to-top button */\n ...getComponentElements(\"top\")\n .map(el => mountBackToTop(el, { viewport$, header$, main$, target$ }))\n))\n\n/* Set up component observables */\nconst component$ = document$\n .pipe(\n switchMap(() => content$),\n mergeWith(control$),\n shareReplay(1)\n )\n\n/* Subscribe to all components */\ncomponent$.subscribe()\n\n/* ----------------------------------------------------------------------------\n * Exports\n * ------------------------------------------------------------------------- */\n\nwindow.document$ = document$ /* Document observable */\nwindow.location$ = location$ /* Location subject */\nwindow.target$ = target$ /* Location target observable */\nwindow.keyboard$ = keyboard$ /* Keyboard observable */\nwindow.viewport$ = viewport$ /* Viewport observable */\nwindow.tablet$ = tablet$ /* Media tablet observable */\nwindow.screen$ = screen$ /* Media screen observable */\nwindow.print$ = print$ /* Media print observable */\nwindow.alert$ = alert$ /* Alert subject */\nwindow.progress$ = progress$ /* Progress indicator subject */\nwindow.component$ = component$ /* Component observable */\n", 
"/******************************************************************************\nCopyright (c) Microsoft Corporation.\n\nPermission to use, copy, modify, and/or distribute this software for any\npurpose with or without fee is hereby granted.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH\nREGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY\nAND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,\nINDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM\nLOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR\nOTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR\nPERFORMANCE OF THIS SOFTWARE.\n***************************************************************************** */\n/* global Reflect, Promise, SuppressedError, Symbol, Iterator */\n\nvar extendStatics = function(d, b) {\n extendStatics = Object.setPrototypeOf ||\n ({ __proto__: [] } instanceof Array && function (d, b) { d.__proto__ = b; }) ||\n function (d, b) { for (var p in b) if (Object.prototype.hasOwnProperty.call(b, p)) d[p] = b[p]; };\n return extendStatics(d, b);\n};\n\nexport function __extends(d, b) {\n if (typeof b !== \"function\" && b !== null)\n throw new TypeError(\"Class extends value \" + String(b) + \" is not a constructor or null\");\n extendStatics(d, b);\n function __() { this.constructor = d; }\n d.prototype = b === null ? 
Object.create(b) : (__.prototype = b.prototype, new __());\n}\n\nexport var __assign = function() {\n __assign = Object.assign || function __assign(t) {\n for (var s, i = 1, n = arguments.length; i < n; i++) {\n s = arguments[i];\n for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p)) t[p] = s[p];\n }\n return t;\n }\n return __assign.apply(this, arguments);\n}\n\nexport function __rest(s, e) {\n var t = {};\n for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p) && e.indexOf(p) < 0)\n t[p] = s[p];\n if (s != null && typeof Object.getOwnPropertySymbols === \"function\")\n for (var i = 0, p = Object.getOwnPropertySymbols(s); i < p.length; i++) {\n if (e.indexOf(p[i]) < 0 && Object.prototype.propertyIsEnumerable.call(s, p[i]))\n t[p[i]] = s[p[i]];\n }\n return t;\n}\n\nexport function __decorate(decorators, target, key, desc) {\n var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;\n if (typeof Reflect === \"object\" && typeof Reflect.decorate === \"function\") r = Reflect.decorate(decorators, target, key, desc);\n else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;\n return c > 3 && r && Object.defineProperty(target, key, r), r;\n}\n\nexport function __param(paramIndex, decorator) {\n return function (target, key) { decorator(target, key, paramIndex); }\n}\n\nexport function __esDecorate(ctor, descriptorIn, decorators, contextIn, initializers, extraInitializers) {\n function accept(f) { if (f !== void 0 && typeof f !== \"function\") throw new TypeError(\"Function expected\"); return f; }\n var kind = contextIn.kind, key = kind === \"getter\" ? \"get\" : kind === \"setter\" ? \"set\" : \"value\";\n var target = !descriptorIn && ctor ? contextIn[\"static\"] ? ctor : ctor.prototype : null;\n var descriptor = descriptorIn || (target ? 
Object.getOwnPropertyDescriptor(target, contextIn.name) : {});\n var _, done = false;\n for (var i = decorators.length - 1; i >= 0; i--) {\n var context = {};\n for (var p in contextIn) context[p] = p === \"access\" ? {} : contextIn[p];\n for (var p in contextIn.access) context.access[p] = contextIn.access[p];\n context.addInitializer = function (f) { if (done) throw new TypeError(\"Cannot add initializers after decoration has completed\"); extraInitializers.push(accept(f || null)); };\n var result = (0, decorators[i])(kind === \"accessor\" ? { get: descriptor.get, set: descriptor.set } : descriptor[key], context);\n if (kind === \"accessor\") {\n if (result === void 0) continue;\n if (result === null || typeof result !== \"object\") throw new TypeError(\"Object expected\");\n if (_ = accept(result.get)) descriptor.get = _;\n if (_ = accept(result.set)) descriptor.set = _;\n if (_ = accept(result.init)) initializers.unshift(_);\n }\n else if (_ = accept(result)) {\n if (kind === \"field\") initializers.unshift(_);\n else descriptor[key] = _;\n }\n }\n if (target) Object.defineProperty(target, contextIn.name, descriptor);\n done = true;\n};\n\nexport function __runInitializers(thisArg, initializers, value) {\n var useValue = arguments.length > 2;\n for (var i = 0; i < initializers.length; i++) {\n value = useValue ? initializers[i].call(thisArg, value) : initializers[i].call(thisArg);\n }\n return useValue ? value : void 0;\n};\n\nexport function __propKey(x) {\n return typeof x === \"symbol\" ? x : \"\".concat(x);\n};\n\nexport function __setFunctionName(f, name, prefix) {\n if (typeof name === \"symbol\") name = name.description ? \"[\".concat(name.description, \"]\") : \"\";\n return Object.defineProperty(f, \"name\", { configurable: true, value: prefix ? 
\"\".concat(prefix, \" \", name) : name });\n};\n\nexport function __metadata(metadataKey, metadataValue) {\n if (typeof Reflect === \"object\" && typeof Reflect.metadata === \"function\") return Reflect.metadata(metadataKey, metadataValue);\n}\n\nexport function __awaiter(thisArg, _arguments, P, generator) {\n function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }\n return new (P || (P = Promise))(function (resolve, reject) {\n function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }\n function rejected(value) { try { step(generator[\"throw\"](value)); } catch (e) { reject(e); } }\n function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }\n step((generator = generator.apply(thisArg, _arguments || [])).next());\n });\n}\n\nexport function __generator(thisArg, body) {\n var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === \"function\" ? Iterator : Object).prototype);\n return g.next = verb(0), g[\"throw\"] = verb(1), g[\"return\"] = verb(2), typeof Symbol === \"function\" && (g[Symbol.iterator] = function() { return this; }), g;\n function verb(n) { return function (v) { return step([n, v]); }; }\n function step(op) {\n if (f) throw new TypeError(\"Generator is already executing.\");\n while (g && (g = 0, op[0] && (_ = 0)), _) try {\n if (f = 1, y && (t = op[0] & 2 ? y[\"return\"] : op[0] ? 
y[\"throw\"] || ((t = y[\"return\"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;\n if (y = 0, t) op = [op[0] & 2, t.value];\n switch (op[0]) {\n case 0: case 1: t = op; break;\n case 4: _.label++; return { value: op[1], done: false };\n case 5: _.label++; y = op[1]; op = [0]; continue;\n case 7: op = _.ops.pop(); _.trys.pop(); continue;\n default:\n if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }\n if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }\n if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }\n if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }\n if (t[2]) _.ops.pop();\n _.trys.pop(); continue;\n }\n op = body.call(thisArg, _);\n } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }\n if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };\n }\n}\n\nexport var __createBinding = Object.create ? (function(o, m, k, k2) {\n if (k2 === undefined) k2 = k;\n var desc = Object.getOwnPropertyDescriptor(m, k);\n if (!desc || (\"get\" in desc ? !m.__esModule : desc.writable || desc.configurable)) {\n desc = { enumerable: true, get: function() { return m[k]; } };\n }\n Object.defineProperty(o, k2, desc);\n}) : (function(o, m, k, k2) {\n if (k2 === undefined) k2 = k;\n o[k2] = m[k];\n});\n\nexport function __exportStar(m, o) {\n for (var p in m) if (p !== \"default\" && !Object.prototype.hasOwnProperty.call(o, p)) __createBinding(o, m, p);\n}\n\nexport function __values(o) {\n var s = typeof Symbol === \"function\" && Symbol.iterator, m = s && o[s], i = 0;\n if (m) return m.call(o);\n if (o && typeof o.length === \"number\") return {\n next: function () {\n if (o && i >= o.length) o = void 0;\n return { value: o && o[i++], done: !o };\n }\n };\n throw new TypeError(s ? 
\"Object is not iterable.\" : \"Symbol.iterator is not defined.\");\n}\n\nexport function __read(o, n) {\n var m = typeof Symbol === \"function\" && o[Symbol.iterator];\n if (!m) return o;\n var i = m.call(o), r, ar = [], e;\n try {\n while ((n === void 0 || n-- > 0) && !(r = i.next()).done) ar.push(r.value);\n }\n catch (error) { e = { error: error }; }\n finally {\n try {\n if (r && !r.done && (m = i[\"return\"])) m.call(i);\n }\n finally { if (e) throw e.error; }\n }\n return ar;\n}\n\n/** @deprecated */\nexport function __spread() {\n for (var ar = [], i = 0; i < arguments.length; i++)\n ar = ar.concat(__read(arguments[i]));\n return ar;\n}\n\n/** @deprecated */\nexport function __spreadArrays() {\n for (var s = 0, i = 0, il = arguments.length; i < il; i++) s += arguments[i].length;\n for (var r = Array(s), k = 0, i = 0; i < il; i++)\n for (var a = arguments[i], j = 0, jl = a.length; j < jl; j++, k++)\n r[k] = a[j];\n return r;\n}\n\nexport function __spreadArray(to, from, pack) {\n if (pack || arguments.length === 2) for (var i = 0, l = from.length, ar; i < l; i++) {\n if (ar || !(i in from)) {\n if (!ar) ar = Array.prototype.slice.call(from, 0, i);\n ar[i] = from[i];\n }\n }\n return to.concat(ar || Array.prototype.slice.call(from));\n}\n\nexport function __await(v) {\n return this instanceof __await ? (this.v = v, this) : new __await(v);\n}\n\nexport function __asyncGenerator(thisArg, _arguments, generator) {\n if (!Symbol.asyncIterator) throw new TypeError(\"Symbol.asyncIterator is not defined.\");\n var g = generator.apply(thisArg, _arguments || []), i, q = [];\n return i = Object.create((typeof AsyncIterator === \"function\" ? 
AsyncIterator : Object).prototype), verb(\"next\"), verb(\"throw\"), verb(\"return\", awaitReturn), i[Symbol.asyncIterator] = function () { return this; }, i;\n function awaitReturn(f) { return function (v) { return Promise.resolve(v).then(f, reject); }; }\n function verb(n, f) { if (g[n]) { i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; if (f) i[n] = f(i[n]); } }\n function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } }\n function step(r) { r.value instanceof __await ? Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); }\n function fulfill(value) { resume(\"next\", value); }\n function reject(value) { resume(\"throw\", value); }\n function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); }\n}\n\nexport function __asyncDelegator(o) {\n var i, p;\n return i = {}, verb(\"next\"), verb(\"throw\", function (e) { throw e; }), verb(\"return\"), i[Symbol.iterator] = function () { return this; }, i;\n function verb(n, f) { i[n] = o[n] ? function (v) { return (p = !p) ? { value: __await(o[n](v)), done: false } : f ? f(v) : v; } : f; }\n}\n\nexport function __asyncValues(o) {\n if (!Symbol.asyncIterator) throw new TypeError(\"Symbol.asyncIterator is not defined.\");\n var m = o[Symbol.asyncIterator], i;\n return m ? m.call(o) : (o = typeof __values === \"function\" ? 
__values(o) : o[Symbol.iterator](), i = {}, verb(\"next\"), verb(\"throw\"), verb(\"return\"), i[Symbol.asyncIterator] = function () { return this; }, i);\n function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }\n function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }\n}\n\nexport function __makeTemplateObject(cooked, raw) {\n if (Object.defineProperty) { Object.defineProperty(cooked, \"raw\", { value: raw }); } else { cooked.raw = raw; }\n return cooked;\n};\n\nvar __setModuleDefault = Object.create ? (function(o, v) {\n Object.defineProperty(o, \"default\", { enumerable: true, value: v });\n}) : function(o, v) {\n o[\"default\"] = v;\n};\n\nexport function __importStar(mod) {\n if (mod && mod.__esModule) return mod;\n var result = {};\n if (mod != null) for (var k in mod) if (k !== \"default\" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);\n __setModuleDefault(result, mod);\n return result;\n}\n\nexport function __importDefault(mod) {\n return (mod && mod.__esModule) ? mod : { default: mod };\n}\n\nexport function __classPrivateFieldGet(receiver, state, kind, f) {\n if (kind === \"a\" && !f) throw new TypeError(\"Private accessor was defined without a getter\");\n if (typeof state === \"function\" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError(\"Cannot read private member from an object whose class did not declare it\");\n return kind === \"m\" ? f : kind === \"a\" ? f.call(receiver) : f ? f.value : state.get(receiver);\n}\n\nexport function __classPrivateFieldSet(receiver, state, value, kind, f) {\n if (kind === \"m\") throw new TypeError(\"Private method is not writable\");\n if (kind === \"a\" && !f) throw new TypeError(\"Private accessor was defined without a setter\");\n if (typeof state === \"function\" ? 
receiver !== state || !f : !state.has(receiver)) throw new TypeError(\"Cannot write private member to an object whose class did not declare it\");\n return (kind === \"a\" ? f.call(receiver, value) : f ? f.value = value : state.set(receiver, value)), value;\n}\n\nexport function __classPrivateFieldIn(state, receiver) {\n if (receiver === null || (typeof receiver !== \"object\" && typeof receiver !== \"function\")) throw new TypeError(\"Cannot use 'in' operator on non-object\");\n return typeof state === \"function\" ? receiver === state : state.has(receiver);\n}\n\nexport function __addDisposableResource(env, value, async) {\n if (value !== null && value !== void 0) {\n if (typeof value !== \"object\" && typeof value !== \"function\") throw new TypeError(\"Object expected.\");\n var dispose, inner;\n if (async) {\n if (!Symbol.asyncDispose) throw new TypeError(\"Symbol.asyncDispose is not defined.\");\n dispose = value[Symbol.asyncDispose];\n }\n if (dispose === void 0) {\n if (!Symbol.dispose) throw new TypeError(\"Symbol.dispose is not defined.\");\n dispose = value[Symbol.dispose];\n if (async) inner = dispose;\n }\n if (typeof dispose !== \"function\") throw new TypeError(\"Object not disposable.\");\n if (inner) dispose = function() { try { inner.call(this); } catch (e) { return Promise.reject(e); } };\n env.stack.push({ value: value, dispose: dispose, async: async });\n }\n else if (async) {\n env.stack.push({ async: true });\n }\n return value;\n}\n\nvar _SuppressedError = typeof SuppressedError === \"function\" ? SuppressedError : function (error, suppressed, message) {\n var e = new Error(message);\n return e.name = \"SuppressedError\", e.error = error, e.suppressed = suppressed, e;\n};\n\nexport function __disposeResources(env) {\n function fail(e) {\n env.error = env.hasError ? 
new _SuppressedError(e, env.error, \"An error was suppressed during disposal.\") : e;\n env.hasError = true;\n }\n var r, s = 0;\n function next() {\n while (r = env.stack.pop()) {\n try {\n if (!r.async && s === 1) return s = 0, env.stack.push(r), Promise.resolve().then(next);\n if (r.dispose) {\n var result = r.dispose.call(r.value);\n if (r.async) return s |= 2, Promise.resolve(result).then(next, function(e) { fail(e); return next(); });\n }\n else s |= 1;\n }\n catch (e) {\n fail(e);\n }\n }\n if (s === 1) return env.hasError ? Promise.reject(env.error) : Promise.resolve();\n if (env.hasError) throw env.error;\n }\n return next();\n}\n\nexport default {\n __extends,\n __assign,\n __rest,\n __decorate,\n __param,\n __metadata,\n __awaiter,\n __generator,\n __createBinding,\n __exportStar,\n __values,\n __read,\n __spread,\n __spreadArrays,\n __spreadArray,\n __await,\n __asyncGenerator,\n __asyncDelegator,\n __asyncValues,\n __makeTemplateObject,\n __importStar,\n __importDefault,\n __classPrivateFieldGet,\n __classPrivateFieldSet,\n __classPrivateFieldIn,\n __addDisposableResource,\n __disposeResources,\n};\n", "/**\n * Returns true if the object is a function.\n * @param value The value to check\n */\nexport function isFunction(value: any): value is (...args: any[]) => any {\n return typeof value === 'function';\n}\n", "/**\n * Used to create Error subclasses until the community moves away from ES5.\n *\n * This is because compiling from TypeScript down to ES5 has issues with subclassing Errors\n * as well as other built-in types: https://github.com/Microsoft/TypeScript/issues/12123\n *\n * @param createImpl A factory function to create the actual constructor implementation. 
The returned\n * function should be a named function that calls `_super` internally.\n */\nexport function createErrorClass(createImpl: (_super: any) => any): T {\n const _super = (instance: any) => {\n Error.call(instance);\n instance.stack = new Error().stack;\n };\n\n const ctorFunc = createImpl(_super);\n ctorFunc.prototype = Object.create(Error.prototype);\n ctorFunc.prototype.constructor = ctorFunc;\n return ctorFunc;\n}\n", "import { createErrorClass } from './createErrorClass';\n\nexport interface UnsubscriptionError extends Error {\n readonly errors: any[];\n}\n\nexport interface UnsubscriptionErrorCtor {\n /**\n * @deprecated Internal implementation detail. Do not construct error instances.\n * Cannot be tagged as internal: https://github.com/ReactiveX/rxjs/issues/6269\n */\n new (errors: any[]): UnsubscriptionError;\n}\n\n/**\n * An error thrown when one or more errors have occurred during the\n * `unsubscribe` of a {@link Subscription}.\n */\nexport const UnsubscriptionError: UnsubscriptionErrorCtor = createErrorClass(\n (_super) =>\n function UnsubscriptionErrorImpl(this: any, errors: (Error | string)[]) {\n _super(this);\n this.message = errors\n ? 
`${errors.length} errors occurred during unsubscription:\n${errors.map((err, i) => `${i + 1}) ${err.toString()}`).join('\\n ')}`\n : '';\n this.name = 'UnsubscriptionError';\n this.errors = errors;\n }\n);\n", "/**\n * Removes an item from an array, mutating it.\n * @param arr The array to remove the item from\n * @param item The item to remove\n */\nexport function arrRemove(arr: T[] | undefined | null, item: T) {\n if (arr) {\n const index = arr.indexOf(item);\n 0 <= index && arr.splice(index, 1);\n }\n}\n", "import { isFunction } from './util/isFunction';\nimport { UnsubscriptionError } from './util/UnsubscriptionError';\nimport { SubscriptionLike, TeardownLogic, Unsubscribable } from './types';\nimport { arrRemove } from './util/arrRemove';\n\n/**\n * Represents a disposable resource, such as the execution of an Observable. A\n * Subscription has one important method, `unsubscribe`, that takes no argument\n * and just disposes the resource held by the subscription.\n *\n * Additionally, subscriptions may be grouped together through the `add()`\n * method, which will attach a child Subscription to the current Subscription.\n * When a Subscription is unsubscribed, all its children (and its grandchildren)\n * will be unsubscribed as well.\n *\n * @class Subscription\n */\nexport class Subscription implements SubscriptionLike {\n /** @nocollapse */\n public static EMPTY = (() => {\n const empty = new Subscription();\n empty.closed = true;\n return empty;\n })();\n\n /**\n * A flag to indicate whether this Subscription has already been unsubscribed.\n */\n public closed = false;\n\n private _parentage: Subscription[] | Subscription | null = null;\n\n /**\n * The list of registered finalizers to execute upon unsubscription. 
Adding and removing from this\n * list occurs in the {@link #add} and {@link #remove} methods.\n */\n private _finalizers: Exclude[] | null = null;\n\n /**\n * @param initialTeardown A function executed first as part of the finalization\n * process that is kicked off when {@link #unsubscribe} is called.\n */\n constructor(private initialTeardown?: () => void) {}\n\n /**\n * Disposes the resources held by the subscription. May, for instance, cancel\n * an ongoing Observable execution or cancel any other type of work that\n * started when the Subscription was created.\n * @return {void}\n */\n unsubscribe(): void {\n let errors: any[] | undefined;\n\n if (!this.closed) {\n this.closed = true;\n\n // Remove this from it's parents.\n const { _parentage } = this;\n if (_parentage) {\n this._parentage = null;\n if (Array.isArray(_parentage)) {\n for (const parent of _parentage) {\n parent.remove(this);\n }\n } else {\n _parentage.remove(this);\n }\n }\n\n const { initialTeardown: initialFinalizer } = this;\n if (isFunction(initialFinalizer)) {\n try {\n initialFinalizer();\n } catch (e) {\n errors = e instanceof UnsubscriptionError ? e.errors : [e];\n }\n }\n\n const { _finalizers } = this;\n if (_finalizers) {\n this._finalizers = null;\n for (const finalizer of _finalizers) {\n try {\n execFinalizer(finalizer);\n } catch (err) {\n errors = errors ?? [];\n if (err instanceof UnsubscriptionError) {\n errors = [...errors, ...err.errors];\n } else {\n errors.push(err);\n }\n }\n }\n }\n\n if (errors) {\n throw new UnsubscriptionError(errors);\n }\n }\n }\n\n /**\n * Adds a finalizer to this subscription, so that finalization will be unsubscribed/called\n * when this subscription is unsubscribed. 
If this subscription is already {@link #closed},\n * because it has already been unsubscribed, then whatever finalizer is passed to it\n * will automatically be executed (unless the finalizer itself is also a closed subscription).\n *\n * Closed Subscriptions cannot be added as finalizers to any subscription. Adding a closed\n * subscription to a any subscription will result in no operation. (A noop).\n *\n * Adding a subscription to itself, or adding `null` or `undefined` will not perform any\n * operation at all. (A noop).\n *\n * `Subscription` instances that are added to this instance will automatically remove themselves\n * if they are unsubscribed. Functions and {@link Unsubscribable} objects that you wish to remove\n * will need to be removed manually with {@link #remove}\n *\n * @param teardown The finalization logic to add to this subscription.\n */\n add(teardown: TeardownLogic): void {\n // Only add the finalizer if it's not undefined\n // and don't add a subscription to itself.\n if (teardown && teardown !== this) {\n if (this.closed) {\n // If this subscription is already closed,\n // execute whatever finalizer is handed to it automatically.\n execFinalizer(teardown);\n } else {\n if (teardown instanceof Subscription) {\n // We don't add closed subscriptions, and we don't add the same subscription\n // twice. Subscription unsubscribe is idempotent.\n if (teardown.closed || teardown._hasParent(this)) {\n return;\n }\n teardown._addParent(this);\n }\n (this._finalizers = this._finalizers ?? 
[]).push(teardown);\n }\n }\n }\n\n /**\n * Checks to see if a this subscription already has a particular parent.\n * This will signal that this subscription has already been added to the parent in question.\n * @param parent the parent to check for\n */\n private _hasParent(parent: Subscription) {\n const { _parentage } = this;\n return _parentage === parent || (Array.isArray(_parentage) && _parentage.includes(parent));\n }\n\n /**\n * Adds a parent to this subscription so it can be removed from the parent if it\n * unsubscribes on it's own.\n *\n * NOTE: THIS ASSUMES THAT {@link _hasParent} HAS ALREADY BEEN CHECKED.\n * @param parent The parent subscription to add\n */\n private _addParent(parent: Subscription) {\n const { _parentage } = this;\n this._parentage = Array.isArray(_parentage) ? (_parentage.push(parent), _parentage) : _parentage ? [_parentage, parent] : parent;\n }\n\n /**\n * Called on a child when it is removed via {@link #remove}.\n * @param parent The parent to remove\n */\n private _removeParent(parent: Subscription) {\n const { _parentage } = this;\n if (_parentage === parent) {\n this._parentage = null;\n } else if (Array.isArray(_parentage)) {\n arrRemove(_parentage, parent);\n }\n }\n\n /**\n * Removes a finalizer from this subscription that was previously added with the {@link #add} method.\n *\n * Note that `Subscription` instances, when unsubscribed, will automatically remove themselves\n * from every other `Subscription` they have been added to. 
This means that using the `remove` method\n * is not a common thing and should be used thoughtfully.\n *\n * If you add the same finalizer instance of a function or an unsubscribable object to a `Subscription` instance\n * more than once, you will need to call `remove` the same number of times to remove all instances.\n *\n * All finalizer instances are removed to free up memory upon unsubscription.\n *\n * @param teardown The finalizer to remove from this subscription\n */\n remove(teardown: Exclude): void {\n const { _finalizers } = this;\n _finalizers && arrRemove(_finalizers, teardown);\n\n if (teardown instanceof Subscription) {\n teardown._removeParent(this);\n }\n }\n}\n\nexport const EMPTY_SUBSCRIPTION = Subscription.EMPTY;\n\nexport function isSubscription(value: any): value is Subscription {\n return (\n value instanceof Subscription ||\n (value && 'closed' in value && isFunction(value.remove) && isFunction(value.add) && isFunction(value.unsubscribe))\n );\n}\n\nfunction execFinalizer(finalizer: Unsubscribable | (() => void)) {\n if (isFunction(finalizer)) {\n finalizer();\n } else {\n finalizer.unsubscribe();\n }\n}\n", "import { Subscriber } from './Subscriber';\nimport { ObservableNotification } from './types';\n\n/**\n * The {@link GlobalConfig} object for RxJS. It is used to configure things\n * like how to react on unhandled errors.\n */\nexport const config: GlobalConfig = {\n onUnhandledError: null,\n onStoppedNotification: null,\n Promise: undefined,\n useDeprecatedSynchronousErrorHandling: false,\n useDeprecatedNextContext: false,\n};\n\n/**\n * The global configuration object for RxJS, used to configure things\n * like how to react on unhandled errors. Accessible via {@link config}\n * object.\n */\nexport interface GlobalConfig {\n /**\n * A registration point for unhandled errors from RxJS. These are errors that\n * cannot were not handled by consuming code in the usual subscription path. 
For\n * example, if you have this configured, and you subscribe to an observable without\n * providing an error handler, errors from that subscription will end up here. This\n * will _always_ be called asynchronously on another job in the runtime. This is because\n * we do not want errors thrown in this user-configured handler to interfere with the\n * behavior of the library.\n */\n onUnhandledError: ((err: any) => void) | null;\n\n /**\n * A registration point for notifications that cannot be sent to subscribers because they\n * have completed, errored or have been explicitly unsubscribed. By default, next, complete\n * and error notifications sent to stopped subscribers are noops. However, sometimes callers\n * might want a different behavior. For example, with sources that attempt to report errors\n * to stopped subscribers, a caller can configure RxJS to throw an unhandled error instead.\n * This will _always_ be called asynchronously on another job in the runtime. This is because\n * we do not want errors thrown in this user-configured handler to interfere with the\n * behavior of the library.\n */\n onStoppedNotification: ((notification: ObservableNotification, subscriber: Subscriber) => void) | null;\n\n /**\n * The promise constructor used by default for {@link Observable#toPromise toPromise} and {@link Observable#forEach forEach}\n * methods.\n *\n * @deprecated As of version 8, RxJS will no longer support this sort of injection of a\n * Promise constructor. If you need a Promise implementation other than native promises,\n * please polyfill/patch Promise as you see appropriate. Will be removed in v8.\n */\n Promise?: PromiseConstructorLike;\n\n /**\n * If true, turns on synchronous error rethrowing, which is a deprecated behavior\n * in v6 and higher. This behavior enables bad patterns like wrapping a subscribe\n * call in a try/catch block. 
It also enables producer interference, a nasty bug\n * where a multicast can be broken for all observers by a downstream consumer with\n * an unhandled error. DO NOT USE THIS FLAG UNLESS IT'S NEEDED TO BUY TIME\n * FOR MIGRATION REASONS.\n *\n * @deprecated As of version 8, RxJS will no longer support synchronous throwing\n * of unhandled errors. All errors will be thrown on a separate call stack to prevent bad\n * behaviors described above. Will be removed in v8.\n */\n useDeprecatedSynchronousErrorHandling: boolean;\n\n /**\n * If true, enables an as-of-yet undocumented feature from v5: The ability to access\n * `unsubscribe()` via `this` context in `next` functions created in observers passed\n * to `subscribe`.\n *\n * This is being removed because the performance was severely problematic, and it could also cause\n * issues when types other than POJOs are passed to subscribe as subscribers, as they will likely have\n * their `this` context overwritten.\n *\n * @deprecated As of version 8, RxJS will no longer support altering the\n * context of next functions provided as part of an observer to Subscribe. Instead,\n * you will have access to a subscription or a signal or token that will allow you to do things like\n * unsubscribe and test closed status. 
Will be removed in v8.\n */\n useDeprecatedNextContext: boolean;\n}\n", "import type { TimerHandle } from './timerHandle';\ntype SetTimeoutFunction = (handler: () => void, timeout?: number, ...args: any[]) => TimerHandle;\ntype ClearTimeoutFunction = (handle: TimerHandle) => void;\n\ninterface TimeoutProvider {\n setTimeout: SetTimeoutFunction;\n clearTimeout: ClearTimeoutFunction;\n delegate:\n | {\n setTimeout: SetTimeoutFunction;\n clearTimeout: ClearTimeoutFunction;\n }\n | undefined;\n}\n\nexport const timeoutProvider: TimeoutProvider = {\n // When accessing the delegate, use the variable rather than `this` so that\n // the functions can be called without being bound to the provider.\n setTimeout(handler: () => void, timeout?: number, ...args) {\n const { delegate } = timeoutProvider;\n if (delegate?.setTimeout) {\n return delegate.setTimeout(handler, timeout, ...args);\n }\n return setTimeout(handler, timeout, ...args);\n },\n clearTimeout(handle) {\n const { delegate } = timeoutProvider;\n return (delegate?.clearTimeout || clearTimeout)(handle as any);\n },\n delegate: undefined,\n};\n", "import { config } from '../config';\nimport { timeoutProvider } from '../scheduler/timeoutProvider';\n\n/**\n * Handles an error on another job either with the user-configured {@link onUnhandledError},\n * or by throwing it on that new job so it can be picked up by `window.onerror`, `process.on('error')`, etc.\n *\n * This should be called whenever there is an error that is out-of-band with the subscription\n * or when an error hits a terminal boundary of the subscription and no error handler was provided.\n *\n * @param err the error to report\n */\nexport function reportUnhandledError(err: any) {\n timeoutProvider.setTimeout(() => {\n const { onUnhandledError } = config;\n if (onUnhandledError) {\n // Execute the user-configured error handler.\n onUnhandledError(err);\n } else {\n // Throw so it is picked up by the runtime's uncaught error mechanism.\n throw err;\n }\n 
});\n}\n", "/* tslint:disable:no-empty */\nexport function noop() { }\n", "import { CompleteNotification, NextNotification, ErrorNotification } from './types';\n\n/**\n * A completion object optimized for memory use and created to be the\n * same \"shape\" as other notifications in v8.\n * @internal\n */\nexport const COMPLETE_NOTIFICATION = (() => createNotification('C', undefined, undefined) as CompleteNotification)();\n\n/**\n * Internal use only. Creates an optimized error notification that is the same \"shape\"\n * as other notifications.\n * @internal\n */\nexport function errorNotification(error: any): ErrorNotification {\n return createNotification('E', undefined, error) as any;\n}\n\n/**\n * Internal use only. Creates an optimized next notification that is the same \"shape\"\n * as other notifications.\n * @internal\n */\nexport function nextNotification(value: T) {\n return createNotification('N', value, undefined) as NextNotification;\n}\n\n/**\n * Ensures that all notifications created internally have the same \"shape\" in v8.\n *\n * TODO: This is only exported to support a crazy legacy test in `groupBy`.\n * @internal\n */\nexport function createNotification(kind: 'N' | 'E' | 'C', value: any, error: any) {\n return {\n kind,\n value,\n error,\n };\n}\n", "import { config } from '../config';\n\nlet context: { errorThrown: boolean; error: any } | null = null;\n\n/**\n * Handles dealing with errors for super-gross mode. Creates a context, in which\n * any synchronously thrown errors will be passed to {@link captureError}. 
Which\n * will record the error such that it will be rethrown after the call back is complete.\n * TODO: Remove in v8\n * @param cb An immediately executed function.\n */\nexport function errorContext(cb: () => void) {\n if (config.useDeprecatedSynchronousErrorHandling) {\n const isRoot = !context;\n if (isRoot) {\n context = { errorThrown: false, error: null };\n }\n cb();\n if (isRoot) {\n const { errorThrown, error } = context!;\n context = null;\n if (errorThrown) {\n throw error;\n }\n }\n } else {\n // This is the general non-deprecated path for everyone that\n // isn't crazy enough to use super-gross mode (useDeprecatedSynchronousErrorHandling)\n cb();\n }\n}\n\n/**\n * Captures errors only in super-gross mode.\n * @param err the error to capture\n */\nexport function captureError(err: any) {\n if (config.useDeprecatedSynchronousErrorHandling && context) {\n context.errorThrown = true;\n context.error = err;\n }\n}\n", "import { isFunction } from './util/isFunction';\nimport { Observer, ObservableNotification } from './types';\nimport { isSubscription, Subscription } from './Subscription';\nimport { config } from './config';\nimport { reportUnhandledError } from './util/reportUnhandledError';\nimport { noop } from './util/noop';\nimport { nextNotification, errorNotification, COMPLETE_NOTIFICATION } from './NotificationFactories';\nimport { timeoutProvider } from './scheduler/timeoutProvider';\nimport { captureError } from './util/errorContext';\n\n/**\n * Implements the {@link Observer} interface and extends the\n * {@link Subscription} class. While the {@link Observer} is the public API for\n * consuming the values of an {@link Observable}, all Observers get converted to\n * a Subscriber, in order to provide Subscription-like capabilities such as\n * `unsubscribe`. 
Subscriber is a common type in RxJS, and crucial for\n * implementing operators, but it is rarely used as a public API.\n *\n * @class Subscriber\n */\nexport class Subscriber extends Subscription implements Observer {\n /**\n * A static factory for a Subscriber, given a (potentially partial) definition\n * of an Observer.\n * @param next The `next` callback of an Observer.\n * @param error The `error` callback of an\n * Observer.\n * @param complete The `complete` callback of an\n * Observer.\n * @return A Subscriber wrapping the (partially defined)\n * Observer represented by the given arguments.\n * @nocollapse\n * @deprecated Do not use. Will be removed in v8. There is no replacement for this\n * method, and there is no reason to be creating instances of `Subscriber` directly.\n * If you have a specific use case, please file an issue.\n */\n static create(next?: (x?: T) => void, error?: (e?: any) => void, complete?: () => void): Subscriber {\n return new SafeSubscriber(next, error, complete);\n }\n\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n protected isStopped: boolean = false;\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n protected destination: Subscriber | Observer; // this `any` is the escape hatch to erase extra type param (e.g. R)\n\n /**\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n * There is no reason to directly create an instance of Subscriber. 
This type is exported for typings reasons.\n */\n constructor(destination?: Subscriber | Observer) {\n super();\n if (destination) {\n this.destination = destination;\n // Automatically chain subscriptions together here.\n // if destination is a Subscription, then it is a Subscriber.\n if (isSubscription(destination)) {\n destination.add(this);\n }\n } else {\n this.destination = EMPTY_OBSERVER;\n }\n }\n\n /**\n * The {@link Observer} callback to receive notifications of type `next` from\n * the Observable, with a value. The Observable may call this method 0 or more\n * times.\n * @param {T} [value] The `next` value.\n * @return {void}\n */\n next(value?: T): void {\n if (this.isStopped) {\n handleStoppedNotification(nextNotification(value), this);\n } else {\n this._next(value!);\n }\n }\n\n /**\n * The {@link Observer} callback to receive notifications of type `error` from\n * the Observable, with an attached `Error`. Notifies the Observer that\n * the Observable has experienced an error condition.\n * @param {any} [err] The `error` exception.\n * @return {void}\n */\n error(err?: any): void {\n if (this.isStopped) {\n handleStoppedNotification(errorNotification(err), this);\n } else {\n this.isStopped = true;\n this._error(err);\n }\n }\n\n /**\n * The {@link Observer} callback to receive a valueless notification of type\n * `complete` from the Observable. 
Notifies the Observer that the Observable\n * has finished sending push-based notifications.\n * @return {void}\n */\n complete(): void {\n if (this.isStopped) {\n handleStoppedNotification(COMPLETE_NOTIFICATION, this);\n } else {\n this.isStopped = true;\n this._complete();\n }\n }\n\n unsubscribe(): void {\n if (!this.closed) {\n this.isStopped = true;\n super.unsubscribe();\n this.destination = null!;\n }\n }\n\n protected _next(value: T): void {\n this.destination.next(value);\n }\n\n protected _error(err: any): void {\n try {\n this.destination.error(err);\n } finally {\n this.unsubscribe();\n }\n }\n\n protected _complete(): void {\n try {\n this.destination.complete();\n } finally {\n this.unsubscribe();\n }\n }\n}\n\n/**\n * This bind is captured here because we want to be able to have\n * compatibility with monoid libraries that tend to use a method named\n * `bind`. In particular, a library called Monio requires this.\n */\nconst _bind = Function.prototype.bind;\n\nfunction bind any>(fn: Fn, thisArg: any): Fn {\n return _bind.call(fn, thisArg);\n}\n\n/**\n * Internal optimization only, DO NOT EXPOSE.\n * @internal\n */\nclass ConsumerObserver implements Observer {\n constructor(private partialObserver: Partial>) {}\n\n next(value: T): void {\n const { partialObserver } = this;\n if (partialObserver.next) {\n try {\n partialObserver.next(value);\n } catch (error) {\n handleUnhandledError(error);\n }\n }\n }\n\n error(err: any): void {\n const { partialObserver } = this;\n if (partialObserver.error) {\n try {\n partialObserver.error(err);\n } catch (error) {\n handleUnhandledError(error);\n }\n } else {\n handleUnhandledError(err);\n }\n }\n\n complete(): void {\n const { partialObserver } = this;\n if (partialObserver.complete) {\n try {\n partialObserver.complete();\n } catch (error) {\n handleUnhandledError(error);\n }\n }\n }\n}\n\nexport class SafeSubscriber extends Subscriber {\n constructor(\n observerOrNext?: Partial> | ((value: T) => void) | 
null,\n error?: ((e?: any) => void) | null,\n complete?: (() => void) | null\n ) {\n super();\n\n let partialObserver: Partial>;\n if (isFunction(observerOrNext) || !observerOrNext) {\n // The first argument is a function, not an observer. The next\n // two arguments *could* be observers, or they could be empty.\n partialObserver = {\n next: (observerOrNext ?? undefined) as (((value: T) => void) | undefined),\n error: error ?? undefined,\n complete: complete ?? undefined,\n };\n } else {\n // The first argument is a partial observer.\n let context: any;\n if (this && config.useDeprecatedNextContext) {\n // This is a deprecated path that made `this.unsubscribe()` available in\n // next handler functions passed to subscribe. This only exists behind a flag\n // now, as it is *very* slow.\n context = Object.create(observerOrNext);\n context.unsubscribe = () => this.unsubscribe();\n partialObserver = {\n next: observerOrNext.next && bind(observerOrNext.next, context),\n error: observerOrNext.error && bind(observerOrNext.error, context),\n complete: observerOrNext.complete && bind(observerOrNext.complete, context),\n };\n } else {\n // The \"normal\" path. 
Just use the partial observer directly.\n partialObserver = observerOrNext;\n }\n }\n\n // Wrap the partial observer to ensure it's a full observer, and\n // make sure proper error handling is accounted for.\n this.destination = new ConsumerObserver(partialObserver);\n }\n}\n\nfunction handleUnhandledError(error: any) {\n if (config.useDeprecatedSynchronousErrorHandling) {\n captureError(error);\n } else {\n // Ideal path, we report this as an unhandled error,\n // which is thrown on a new call stack.\n reportUnhandledError(error);\n }\n}\n\n/**\n * An error handler used when no error handler was supplied\n * to the SafeSubscriber -- meaning no error handler was supplied\n * do the `subscribe` call on our observable.\n * @param err The error to handle\n */\nfunction defaultErrorHandler(err: any) {\n throw err;\n}\n\n/**\n * A handler for notifications that cannot be sent to a stopped subscriber.\n * @param notification The notification being sent\n * @param subscriber The stopped subscriber\n */\nfunction handleStoppedNotification(notification: ObservableNotification, subscriber: Subscriber) {\n const { onStoppedNotification } = config;\n onStoppedNotification && timeoutProvider.setTimeout(() => onStoppedNotification(notification, subscriber));\n}\n\n/**\n * The observer used as a stub for subscriptions where the user did not\n * pass any arguments to `subscribe`. Comes with the default error handling\n * behavior.\n */\nexport const EMPTY_OBSERVER: Readonly> & { closed: true } = {\n closed: true,\n next: noop,\n error: defaultErrorHandler,\n complete: noop,\n};\n", "/**\n * Symbol.observable or a string \"@@observable\". 
Used for interop\n *\n * @deprecated We will no longer be exporting this symbol in upcoming versions of RxJS.\n * Instead polyfill and use Symbol.observable directly *or* use https://www.npmjs.com/package/symbol-observable\n */\nexport const observable: string | symbol = (() => (typeof Symbol === 'function' && Symbol.observable) || '@@observable')();\n", "/**\n * This function takes one parameter and just returns it. Simply put,\n * this is like `(x: T): T => x`.\n *\n * ## Examples\n *\n * This is useful in some cases when using things like `mergeMap`\n *\n * ```ts\n * import { interval, take, map, range, mergeMap, identity } from 'rxjs';\n *\n * const source$ = interval(1000).pipe(take(5));\n *\n * const result$ = source$.pipe(\n * map(i => range(i)),\n * mergeMap(identity) // same as mergeMap(x => x)\n * );\n *\n * result$.subscribe({\n * next: console.log\n * });\n * ```\n *\n * Or when you want to selectively apply an operator\n *\n * ```ts\n * import { interval, take, identity } from 'rxjs';\n *\n * const shouldLimit = () => Math.random() < 0.5;\n *\n * const source$ = interval(1000);\n *\n * const result$ = source$.pipe(shouldLimit() ? 
take(5) : identity);\n *\n * result$.subscribe({\n * next: console.log\n * });\n * ```\n *\n * @param x Any value that is returned by this function\n * @returns The value passed as the first parameter to this function\n */\nexport function identity(x: T): T {\n return x;\n}\n", "import { identity } from './identity';\nimport { UnaryFunction } from '../types';\n\nexport function pipe(): typeof identity;\nexport function pipe(fn1: UnaryFunction): UnaryFunction;\nexport function pipe(fn1: UnaryFunction, fn2: UnaryFunction): UnaryFunction;\nexport function pipe(fn1: UnaryFunction, fn2: UnaryFunction, fn3: UnaryFunction): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction,\n fn8: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction,\n fn8: UnaryFunction,\n fn9: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction,\n fn8: UnaryFunction,\n fn9: UnaryFunction,\n ...fns: UnaryFunction[]\n): 
UnaryFunction;\n\n/**\n * pipe() can be called on one or more functions, each of which can take one argument (\"UnaryFunction\")\n * and uses it to return a value.\n * It returns a function that takes one argument, passes it to the first UnaryFunction, and then\n * passes the result to the next one, passes that result to the next one, and so on. \n */\nexport function pipe(...fns: Array>): UnaryFunction {\n return pipeFromArray(fns);\n}\n\n/** @internal */\nexport function pipeFromArray(fns: Array>): UnaryFunction {\n if (fns.length === 0) {\n return identity as UnaryFunction;\n }\n\n if (fns.length === 1) {\n return fns[0];\n }\n\n return function piped(input: T): R {\n return fns.reduce((prev: any, fn: UnaryFunction) => fn(prev), input as any);\n };\n}\n", "import { Operator } from './Operator';\nimport { SafeSubscriber, Subscriber } from './Subscriber';\nimport { isSubscription, Subscription } from './Subscription';\nimport { TeardownLogic, OperatorFunction, Subscribable, Observer } from './types';\nimport { observable as Symbol_observable } from './symbol/observable';\nimport { pipeFromArray } from './util/pipe';\nimport { config } from './config';\nimport { isFunction } from './util/isFunction';\nimport { errorContext } from './util/errorContext';\n\n/**\n * A representation of any set of values over any amount of time. This is the most basic building block\n * of RxJS.\n *\n * @class Observable\n */\nexport class Observable implements Subscribable {\n /**\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n */\n source: Observable | undefined;\n\n /**\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n */\n operator: Operator | undefined;\n\n /**\n * @constructor\n * @param {Function} subscribe the function that is called when the Observable is\n * initially subscribed to. 
This function is given a Subscriber, to which new values\n * can be `next`ed, or an `error` method can be called to raise an error, or\n * `complete` can be called to notify of a successful completion.\n */\n constructor(subscribe?: (this: Observable, subscriber: Subscriber) => TeardownLogic) {\n if (subscribe) {\n this._subscribe = subscribe;\n }\n }\n\n // HACK: Since TypeScript inherits static properties too, we have to\n // fight against TypeScript here so Subject can have a different static create signature\n /**\n * Creates a new Observable by calling the Observable constructor\n * @owner Observable\n * @method create\n * @param {Function} subscribe? the subscriber function to be passed to the Observable constructor\n * @return {Observable} a new observable\n * @nocollapse\n * @deprecated Use `new Observable()` instead. Will be removed in v8.\n */\n static create: (...args: any[]) => any = (subscribe?: (subscriber: Subscriber) => TeardownLogic) => {\n return new Observable(subscribe);\n };\n\n /**\n * Creates a new Observable, with this Observable instance as the source, and the passed\n * operator defined as the new observable's operator.\n * @method lift\n * @param operator the operator defining the operation to take on the observable\n * @return a new observable with the Operator applied\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n * If you have implemented an operator using `lift`, it is recommended that you create an\n * operator by simply returning `new Observable()` directly. 
See \"Creating new operators from\n * scratch\" section here: https://rxjs.dev/guide/operators\n */\n lift(operator?: Operator): Observable {\n const observable = new Observable();\n observable.source = this;\n observable.operator = operator;\n return observable;\n }\n\n subscribe(observerOrNext?: Partial> | ((value: T) => void)): Subscription;\n /** @deprecated Instead of passing separate callback arguments, use an observer argument. Signatures taking separate callback arguments will be removed in v8. Details: https://rxjs.dev/deprecations/subscribe-arguments */\n subscribe(next?: ((value: T) => void) | null, error?: ((error: any) => void) | null, complete?: (() => void) | null): Subscription;\n /**\n * Invokes an execution of an Observable and registers Observer handlers for notifications it will emit.\n *\n * Use it when you have all these Observables, but still nothing is happening.\n *\n * `subscribe` is not a regular operator, but a method that calls Observable's internal `subscribe` function. It\n * might be for example a function that you passed to Observable's constructor, but most of the time it is\n * a library implementation, which defines what will be emitted by an Observable, and when it be will emitted. This means\n * that calling `subscribe` is actually the moment when Observable starts its work, not when it is created, as it is often\n * the thought.\n *\n * Apart from starting the execution of an Observable, this method allows you to listen for values\n * that an Observable emits, as well as for when it completes or errors. You can achieve this in two\n * of the following ways.\n *\n * The first way is creating an object that implements {@link Observer} interface. It should have methods\n * defined by that interface, but note that it should be just a regular JavaScript object, which you can create\n * yourself in any way you want (ES6 class, classic function constructor, object literal etc.). 
In particular, do\n * not attempt to use any RxJS implementation details to create Observers - you don't need them. Remember also\n * that your object does not have to implement all methods. If you find yourself creating a method that doesn't\n * do anything, you can simply omit it. Note however, if the `error` method is not provided and an error happens,\n * it will be thrown asynchronously. Errors thrown asynchronously cannot be caught using `try`/`catch`. Instead,\n * use the {@link onUnhandledError} configuration option or use a runtime handler (like `window.onerror` or\n * `process.on('error)`) to be notified of unhandled errors. Because of this, it's recommended that you provide\n * an `error` method to avoid missing thrown errors.\n *\n * The second way is to give up on Observer object altogether and simply provide callback functions in place of its methods.\n * This means you can provide three functions as arguments to `subscribe`, where the first function is equivalent\n * of a `next` method, the second of an `error` method and the third of a `complete` method. Just as in case of an Observer,\n * if you do not need to listen for something, you can omit a function by passing `undefined` or `null`,\n * since `subscribe` recognizes these functions by where they were placed in function call. When it comes\n * to the `error` function, as with an Observer, if not provided, errors emitted by an Observable will be thrown asynchronously.\n *\n * You can, however, subscribe with no parameters at all. This may be the case where you're not interested in terminal events\n * and you also handled emissions internally by using operators (e.g. using `tap`).\n *\n * Whichever style of calling `subscribe` you use, in both cases it returns a Subscription object.\n * This object allows you to call `unsubscribe` on it, which in turn will stop the work that an Observable does and will clean\n * up all resources that an Observable used. 
Note that cancelling a subscription will not call `complete` callback\n * provided to `subscribe` function, which is reserved for a regular completion signal that comes from an Observable.\n *\n * Remember that callbacks provided to `subscribe` are not guaranteed to be called asynchronously.\n * It is an Observable itself that decides when these functions will be called. For example {@link of}\n * by default emits all its values synchronously. Always check documentation for how given Observable\n * will behave when subscribed and if its default behavior can be modified with a `scheduler`.\n *\n * #### Examples\n *\n * Subscribe with an {@link guide/observer Observer}\n *\n * ```ts\n * import { of } from 'rxjs';\n *\n * const sumObserver = {\n * sum: 0,\n * next(value) {\n * console.log('Adding: ' + value);\n * this.sum = this.sum + value;\n * },\n * error() {\n * // We actually could just remove this method,\n * // since we do not really care about errors right now.\n * },\n * complete() {\n * console.log('Sum equals: ' + this.sum);\n * }\n * };\n *\n * of(1, 2, 3) // Synchronously emits 1, 2, 3 and then completes.\n * .subscribe(sumObserver);\n *\n * // Logs:\n * // 'Adding: 1'\n * // 'Adding: 2'\n * // 'Adding: 3'\n * // 'Sum equals: 6'\n * ```\n *\n * Subscribe with functions ({@link deprecations/subscribe-arguments deprecated})\n *\n * ```ts\n * import { of } from 'rxjs'\n *\n * let sum = 0;\n *\n * of(1, 2, 3).subscribe(\n * value => {\n * console.log('Adding: ' + value);\n * sum = sum + value;\n * },\n * undefined,\n * () => console.log('Sum equals: ' + sum)\n * );\n *\n * // Logs:\n * // 'Adding: 1'\n * // 'Adding: 2'\n * // 'Adding: 3'\n * // 'Sum equals: 6'\n * ```\n *\n * Cancel a subscription\n *\n * ```ts\n * import { interval } from 'rxjs';\n *\n * const subscription = interval(1000).subscribe({\n * next(num) {\n * console.log(num)\n * },\n * complete() {\n * // Will not be called, even when cancelling subscription.\n * console.log('completed!');\n * 
}\n * });\n *\n * setTimeout(() => {\n * subscription.unsubscribe();\n * console.log('unsubscribed!');\n * }, 2500);\n *\n * // Logs:\n * // 0 after 1s\n * // 1 after 2s\n * // 'unsubscribed!' after 2.5s\n * ```\n *\n * @param {Observer|Function} observerOrNext (optional) Either an observer with methods to be called,\n * or the first of three possible handlers, which is the handler for each value emitted from the subscribed\n * Observable.\n * @param {Function} error (optional) A handler for a terminal event resulting from an error. If no error handler is provided,\n * the error will be thrown asynchronously as unhandled.\n * @param {Function} complete (optional) A handler for a terminal event resulting from successful completion.\n * @return {Subscription} a subscription reference to the registered handlers\n * @method subscribe\n */\n subscribe(\n observerOrNext?: Partial> | ((value: T) => void) | null,\n error?: ((error: any) => void) | null,\n complete?: (() => void) | null\n ): Subscription {\n const subscriber = isSubscriber(observerOrNext) ? observerOrNext : new SafeSubscriber(observerOrNext, error, complete);\n\n errorContext(() => {\n const { operator, source } = this;\n subscriber.add(\n operator\n ? // We're dealing with a subscription in the\n // operator chain to one of our lifted operators.\n operator.call(subscriber, source)\n : source\n ? // If `source` has a value, but `operator` does not, something that\n // had intimate knowledge of our API, like our `Subject`, must have\n // set it. 
We're going to just call `_subscribe` directly.\n this._subscribe(subscriber)\n : // In all other cases, we're likely wrapping a user-provided initializer\n // function, so we need to catch errors and handle them appropriately.\n this._trySubscribe(subscriber)\n );\n });\n\n return subscriber;\n }\n\n /** @internal */\n protected _trySubscribe(sink: Subscriber): TeardownLogic {\n try {\n return this._subscribe(sink);\n } catch (err) {\n // We don't need to return anything in this case,\n // because it's just going to try to `add()` to a subscription\n // above.\n sink.error(err);\n }\n }\n\n /**\n * Used as a NON-CANCELLABLE means of subscribing to an observable, for use with\n * APIs that expect promises, like `async/await`. You cannot unsubscribe from this.\n *\n * **WARNING**: Only use this with observables you *know* will complete. If the source\n * observable does not complete, you will end up with a promise that is hung up, and\n * potentially all of the state of an async function hanging out in memory. 
To avoid\n * this situation, look into adding something like {@link timeout}, {@link take},\n * {@link takeWhile}, or {@link takeUntil} amongst others.\n *\n * #### Example\n *\n * ```ts\n * import { interval, take } from 'rxjs';\n *\n * const source$ = interval(1000).pipe(take(4));\n *\n * async function getTotal() {\n * let total = 0;\n *\n * await source$.forEach(value => {\n * total += value;\n * console.log('observable -> ' + value);\n * });\n *\n * return total;\n * }\n *\n * getTotal().then(\n * total => console.log('Total: ' + total)\n * );\n *\n * // Expected:\n * // 'observable -> 0'\n * // 'observable -> 1'\n * // 'observable -> 2'\n * // 'observable -> 3'\n * // 'Total: 6'\n * ```\n *\n * @param next a handler for each value emitted by the observable\n * @return a promise that either resolves on observable completion or\n * rejects with the handled error\n */\n forEach(next: (value: T) => void): Promise;\n\n /**\n * @param next a handler for each value emitted by the observable\n * @param promiseCtor a constructor function used to instantiate the Promise\n * @return a promise that either resolves on observable completion or\n * rejects with the handled error\n * @deprecated Passing a Promise constructor will no longer be available\n * in upcoming versions of RxJS. This is because it adds weight to the library, for very\n * little benefit. If you need this functionality, it is recommended that you either\n * polyfill Promise, or you create an adapter to convert the returned native promise\n * to whatever promise implementation you wanted. 
Will be removed in v8.\n */\n forEach(next: (value: T) => void, promiseCtor: PromiseConstructorLike): Promise;\n\n forEach(next: (value: T) => void, promiseCtor?: PromiseConstructorLike): Promise {\n promiseCtor = getPromiseCtor(promiseCtor);\n\n return new promiseCtor((resolve, reject) => {\n const subscriber = new SafeSubscriber({\n next: (value) => {\n try {\n next(value);\n } catch (err) {\n reject(err);\n subscriber.unsubscribe();\n }\n },\n error: reject,\n complete: resolve,\n });\n this.subscribe(subscriber);\n }) as Promise;\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): TeardownLogic {\n return this.source?.subscribe(subscriber);\n }\n\n /**\n * An interop point defined by the es7-observable spec https://github.com/zenparsing/es-observable\n * @method Symbol.observable\n * @return {Observable} this instance of the observable\n */\n [Symbol_observable]() {\n return this;\n }\n\n /* tslint:disable:max-line-length */\n pipe(): Observable;\n pipe(op1: OperatorFunction): Observable;\n pipe(op1: OperatorFunction, op2: OperatorFunction): Observable;\n pipe(op1: OperatorFunction, op2: OperatorFunction, op3: OperatorFunction): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: 
OperatorFunction,\n op8: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: OperatorFunction,\n op8: OperatorFunction,\n op9: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: OperatorFunction,\n op8: OperatorFunction,\n op9: OperatorFunction,\n ...operations: OperatorFunction[]\n ): Observable;\n /* tslint:enable:max-line-length */\n\n /**\n * Used to stitch together functional operators into a chain.\n * @method pipe\n * @return {Observable} the Observable result of all of the operators having\n * been called in the order they were passed in.\n *\n * ## Example\n *\n * ```ts\n * import { interval, filter, map, scan } from 'rxjs';\n *\n * interval(1000)\n * .pipe(\n * filter(x => x % 2 === 0),\n * map(x => x + x),\n * scan((acc, x) => acc + x)\n * )\n * .subscribe(x => console.log(x));\n * ```\n */\n pipe(...operations: OperatorFunction[]): Observable {\n return pipeFromArray(operations)(this);\n }\n\n /* tslint:disable:max-line-length */\n /** @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise */\n toPromise(): Promise;\n /** @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise */\n toPromise(PromiseCtor: typeof Promise): Promise;\n /** @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. 
Details: https://rxjs.dev/deprecations/to-promise */\n toPromise(PromiseCtor: PromiseConstructorLike): Promise;\n /* tslint:enable:max-line-length */\n\n /**\n * Subscribe to this Observable and get a Promise resolving on\n * `complete` with the last emission (if any).\n *\n * **WARNING**: Only use this with observables you *know* will complete. If the source\n * observable does not complete, you will end up with a promise that is hung up, and\n * potentially all of the state of an async function hanging out in memory. To avoid\n * this situation, look into adding something like {@link timeout}, {@link take},\n * {@link takeWhile}, or {@link takeUntil} amongst others.\n *\n * @method toPromise\n * @param [promiseCtor] a constructor function used to instantiate\n * the Promise\n * @return A Promise that resolves with the last value emit, or\n * rejects on an error. If there were no emissions, Promise\n * resolves with undefined.\n * @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise\n */\n toPromise(promiseCtor?: PromiseConstructorLike): Promise {\n promiseCtor = getPromiseCtor(promiseCtor);\n\n return new promiseCtor((resolve, reject) => {\n let value: T | undefined;\n this.subscribe(\n (x: T) => (value = x),\n (err: any) => reject(err),\n () => resolve(value)\n );\n }) as Promise;\n }\n}\n\n/**\n * Decides between a passed promise constructor from consuming code,\n * A default configured promise constructor, and the native promise\n * constructor and returns it. If nothing can be found, it will throw\n * an error.\n * @param promiseCtor The optional promise constructor to passed by consuming code\n */\nfunction getPromiseCtor(promiseCtor: PromiseConstructorLike | undefined) {\n return promiseCtor ?? config.Promise ?? 
Promise;\n}\n\nfunction isObserver(value: any): value is Observer {\n return value && isFunction(value.next) && isFunction(value.error) && isFunction(value.complete);\n}\n\nfunction isSubscriber(value: any): value is Subscriber {\n return (value && value instanceof Subscriber) || (isObserver(value) && isSubscription(value));\n}\n", "import { Observable } from '../Observable';\nimport { Subscriber } from '../Subscriber';\nimport { OperatorFunction } from '../types';\nimport { isFunction } from './isFunction';\n\n/**\n * Used to determine if an object is an Observable with a lift function.\n */\nexport function hasLift(source: any): source is { lift: InstanceType['lift'] } {\n return isFunction(source?.lift);\n}\n\n/**\n * Creates an `OperatorFunction`. Used to define operators throughout the library in a concise way.\n * @param init The logic to connect the liftedSource to the subscriber at the moment of subscription.\n */\nexport function operate(\n init: (liftedSource: Observable, subscriber: Subscriber) => (() => void) | void\n): OperatorFunction {\n return (source: Observable) => {\n if (hasLift(source)) {\n return source.lift(function (this: Subscriber, liftedSource: Observable) {\n try {\n return init(liftedSource, this);\n } catch (err) {\n this.error(err);\n }\n });\n }\n throw new TypeError('Unable to lift unknown Observable type');\n };\n}\n", "import { Subscriber } from '../Subscriber';\n\n/**\n * Creates an instance of an `OperatorSubscriber`.\n * @param destination The downstream subscriber.\n * @param onNext Handles next values, only called if this subscriber is not stopped or closed. Any\n * error that occurs in this function is caught and sent to the `error` method of this subscriber.\n * @param onError Handles errors from the subscription, any errors that occur in this handler are caught\n * and send to the `destination` error handler.\n * @param onComplete Handles completion notification from the subscription. 
Any errors that occur in\n * this handler are sent to the `destination` error handler.\n * @param onFinalize Additional teardown logic here. This will only be called on teardown if the\n * subscriber itself is not already closed. This is called after all other teardown logic is executed.\n */\nexport function createOperatorSubscriber(\n destination: Subscriber,\n onNext?: (value: T) => void,\n onComplete?: () => void,\n onError?: (err: any) => void,\n onFinalize?: () => void\n): Subscriber {\n return new OperatorSubscriber(destination, onNext, onComplete, onError, onFinalize);\n}\n\n/**\n * A generic helper for allowing operators to be created with a Subscriber and\n * use closures to capture necessary state from the operator function itself.\n */\nexport class OperatorSubscriber extends Subscriber {\n /**\n * Creates an instance of an `OperatorSubscriber`.\n * @param destination The downstream subscriber.\n * @param onNext Handles next values, only called if this subscriber is not stopped or closed. Any\n * error that occurs in this function is caught and sent to the `error` method of this subscriber.\n * @param onError Handles errors from the subscription, any errors that occur in this handler are caught\n * and send to the `destination` error handler.\n * @param onComplete Handles completion notification from the subscription. Any errors that occur in\n * this handler are sent to the `destination` error handler.\n * @param onFinalize Additional finalization logic here. This will only be called on finalization if the\n * subscriber itself is not already closed. 
This is called after all other finalization logic is executed.\n * @param shouldUnsubscribe An optional check to see if an unsubscribe call should truly unsubscribe.\n * NOTE: This currently **ONLY** exists to support the strange behavior of {@link groupBy}, where unsubscription\n * to the resulting observable does not actually disconnect from the source if there are active subscriptions\n * to any grouped observable. (DO NOT EXPOSE OR USE EXTERNALLY!!!)\n */\n constructor(\n destination: Subscriber,\n onNext?: (value: T) => void,\n onComplete?: () => void,\n onError?: (err: any) => void,\n private onFinalize?: () => void,\n private shouldUnsubscribe?: () => boolean\n ) {\n // It's important - for performance reasons - that all of this class's\n // members are initialized and that they are always initialized in the same\n // order. This will ensure that all OperatorSubscriber instances have the\n // same hidden class in V8. This, in turn, will help keep the number of\n // hidden classes involved in property accesses within the base class as\n // low as possible. If the number of hidden classes involved exceeds four,\n // the property accesses will become megamorphic and performance penalties\n // will be incurred - i.e. inline caches won't be used.\n //\n // The reasons for ensuring all instances have the same hidden class are\n // further discussed in this blog post from Benedikt Meurer:\n // https://benediktmeurer.de/2018/03/23/impact-of-polymorphism-on-component-based-frameworks-like-react/\n super(destination);\n this._next = onNext\n ? function (this: OperatorSubscriber, value: T) {\n try {\n onNext(value);\n } catch (err) {\n destination.error(err);\n }\n }\n : super._next;\n this._error = onError\n ? 
function (this: OperatorSubscriber, err: any) {\n try {\n onError(err);\n } catch (err) {\n // Send any errors that occur down stream.\n destination.error(err);\n } finally {\n // Ensure finalization.\n this.unsubscribe();\n }\n }\n : super._error;\n this._complete = onComplete\n ? function (this: OperatorSubscriber) {\n try {\n onComplete();\n } catch (err) {\n // Send any errors that occur down stream.\n destination.error(err);\n } finally {\n // Ensure finalization.\n this.unsubscribe();\n }\n }\n : super._complete;\n }\n\n unsubscribe() {\n if (!this.shouldUnsubscribe || this.shouldUnsubscribe()) {\n const { closed } = this;\n super.unsubscribe();\n // Execute additional teardown if we have any and we didn't already do so.\n !closed && this.onFinalize?.();\n }\n }\n}\n", "import { Subscription } from '../Subscription';\n\ninterface AnimationFrameProvider {\n schedule(callback: FrameRequestCallback): Subscription;\n requestAnimationFrame: typeof requestAnimationFrame;\n cancelAnimationFrame: typeof cancelAnimationFrame;\n delegate:\n | {\n requestAnimationFrame: typeof requestAnimationFrame;\n cancelAnimationFrame: typeof cancelAnimationFrame;\n }\n | undefined;\n}\n\nexport const animationFrameProvider: AnimationFrameProvider = {\n // When accessing the delegate, use the variable rather than `this` so that\n // the functions can be called without being bound to the provider.\n schedule(callback) {\n let request = requestAnimationFrame;\n let cancel: typeof cancelAnimationFrame | undefined = cancelAnimationFrame;\n const { delegate } = animationFrameProvider;\n if (delegate) {\n request = delegate.requestAnimationFrame;\n cancel = delegate.cancelAnimationFrame;\n }\n const handle = request((timestamp) => {\n // Clear the cancel function. 
The request has been fulfilled, so\n // attempting to cancel the request upon unsubscription would be\n // pointless.\n cancel = undefined;\n callback(timestamp);\n });\n return new Subscription(() => cancel?.(handle));\n },\n requestAnimationFrame(...args) {\n const { delegate } = animationFrameProvider;\n return (delegate?.requestAnimationFrame || requestAnimationFrame)(...args);\n },\n cancelAnimationFrame(...args) {\n const { delegate } = animationFrameProvider;\n return (delegate?.cancelAnimationFrame || cancelAnimationFrame)(...args);\n },\n delegate: undefined,\n};\n", "import { createErrorClass } from './createErrorClass';\n\nexport interface ObjectUnsubscribedError extends Error {}\n\nexport interface ObjectUnsubscribedErrorCtor {\n /**\n * @deprecated Internal implementation detail. Do not construct error instances.\n * Cannot be tagged as internal: https://github.com/ReactiveX/rxjs/issues/6269\n */\n new (): ObjectUnsubscribedError;\n}\n\n/**\n * An error thrown when an action is invalid because the object has been\n * unsubscribed.\n *\n * @see {@link Subject}\n * @see {@link BehaviorSubject}\n *\n * @class ObjectUnsubscribedError\n */\nexport const ObjectUnsubscribedError: ObjectUnsubscribedErrorCtor = createErrorClass(\n (_super) =>\n function ObjectUnsubscribedErrorImpl(this: any) {\n _super(this);\n this.name = 'ObjectUnsubscribedError';\n this.message = 'object unsubscribed';\n }\n);\n", "import { Operator } from './Operator';\nimport { Observable } from './Observable';\nimport { Subscriber } from './Subscriber';\nimport { Subscription, EMPTY_SUBSCRIPTION } from './Subscription';\nimport { Observer, SubscriptionLike, TeardownLogic } from './types';\nimport { ObjectUnsubscribedError } from './util/ObjectUnsubscribedError';\nimport { arrRemove } from './util/arrRemove';\nimport { errorContext } from './util/errorContext';\n\n/**\n * A Subject is a special type of Observable that allows values to be\n * multicasted to many Observers. 
Subjects are like EventEmitters.\n *\n * Every Subject is an Observable and an Observer. You can subscribe to a\n * Subject, and you can call next to feed values as well as error and complete.\n */\nexport class Subject extends Observable implements SubscriptionLike {\n closed = false;\n\n private currentObservers: Observer[] | null = null;\n\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n observers: Observer[] = [];\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n isStopped = false;\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n hasError = false;\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n thrownError: any = null;\n\n /**\n * Creates a \"subject\" by basically gluing an observer to an observable.\n *\n * @nocollapse\n * @deprecated Recommended you do not use. Will be removed at some point in the future. Plans for replacement still under discussion.\n */\n static create: (...args: any[]) => any = (destination: Observer, source: Observable): AnonymousSubject => {\n return new AnonymousSubject(destination, source);\n };\n\n constructor() {\n // NOTE: This must be here to obscure Observable's constructor.\n super();\n }\n\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. 
*/\n lift(operator: Operator): Observable {\n const subject = new AnonymousSubject(this, this);\n subject.operator = operator as any;\n return subject as any;\n }\n\n /** @internal */\n protected _throwIfClosed() {\n if (this.closed) {\n throw new ObjectUnsubscribedError();\n }\n }\n\n next(value: T) {\n errorContext(() => {\n this._throwIfClosed();\n if (!this.isStopped) {\n if (!this.currentObservers) {\n this.currentObservers = Array.from(this.observers);\n }\n for (const observer of this.currentObservers) {\n observer.next(value);\n }\n }\n });\n }\n\n error(err: any) {\n errorContext(() => {\n this._throwIfClosed();\n if (!this.isStopped) {\n this.hasError = this.isStopped = true;\n this.thrownError = err;\n const { observers } = this;\n while (observers.length) {\n observers.shift()!.error(err);\n }\n }\n });\n }\n\n complete() {\n errorContext(() => {\n this._throwIfClosed();\n if (!this.isStopped) {\n this.isStopped = true;\n const { observers } = this;\n while (observers.length) {\n observers.shift()!.complete();\n }\n }\n });\n }\n\n unsubscribe() {\n this.isStopped = this.closed = true;\n this.observers = this.currentObservers = null!;\n }\n\n get observed() {\n return this.observers?.length > 0;\n }\n\n /** @internal */\n protected _trySubscribe(subscriber: Subscriber): TeardownLogic {\n this._throwIfClosed();\n return super._trySubscribe(subscriber);\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): Subscription {\n this._throwIfClosed();\n this._checkFinalizedStatuses(subscriber);\n return this._innerSubscribe(subscriber);\n }\n\n /** @internal */\n protected _innerSubscribe(subscriber: Subscriber) {\n const { hasError, isStopped, observers } = this;\n if (hasError || isStopped) {\n return EMPTY_SUBSCRIPTION;\n }\n this.currentObservers = null;\n observers.push(subscriber);\n return new Subscription(() => {\n this.currentObservers = null;\n arrRemove(observers, subscriber);\n });\n }\n\n /** @internal */\n protected 
_checkFinalizedStatuses(subscriber: Subscriber) {\n const { hasError, thrownError, isStopped } = this;\n if (hasError) {\n subscriber.error(thrownError);\n } else if (isStopped) {\n subscriber.complete();\n }\n }\n\n /**\n * Creates a new Observable with this Subject as the source. You can do this\n * to create custom Observer-side logic of the Subject and conceal it from\n * code that uses the Observable.\n * @return {Observable} Observable that the Subject casts to\n */\n asObservable(): Observable {\n const observable: any = new Observable();\n observable.source = this;\n return observable;\n }\n}\n\n/**\n * @class AnonymousSubject\n */\nexport class AnonymousSubject extends Subject {\n constructor(\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n public destination?: Observer,\n source?: Observable\n ) {\n super();\n this.source = source;\n }\n\n next(value: T) {\n this.destination?.next?.(value);\n }\n\n error(err: any) {\n this.destination?.error?.(err);\n }\n\n complete() {\n this.destination?.complete?.();\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): Subscription {\n return this.source?.subscribe(subscriber) ?? 
EMPTY_SUBSCRIPTION;\n }\n}\n", "import { Subject } from './Subject';\nimport { Subscriber } from './Subscriber';\nimport { Subscription } from './Subscription';\n\n/**\n * A variant of Subject that requires an initial value and emits its current\n * value whenever it is subscribed to.\n *\n * @class BehaviorSubject\n */\nexport class BehaviorSubject extends Subject {\n constructor(private _value: T) {\n super();\n }\n\n get value(): T {\n return this.getValue();\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): Subscription {\n const subscription = super._subscribe(subscriber);\n !subscription.closed && subscriber.next(this._value);\n return subscription;\n }\n\n getValue(): T {\n const { hasError, thrownError, _value } = this;\n if (hasError) {\n throw thrownError;\n }\n this._throwIfClosed();\n return _value;\n }\n\n next(value: T): void {\n super.next((this._value = value));\n }\n}\n", "import { TimestampProvider } from '../types';\n\ninterface DateTimestampProvider extends TimestampProvider {\n delegate: TimestampProvider | undefined;\n}\n\nexport const dateTimestampProvider: DateTimestampProvider = {\n now() {\n // Use the variable rather than `this` so that the function can be called\n // without being bound to the provider.\n return (dateTimestampProvider.delegate || Date).now();\n },\n delegate: undefined,\n};\n", "import { Subject } from './Subject';\nimport { TimestampProvider } from './types';\nimport { Subscriber } from './Subscriber';\nimport { Subscription } from './Subscription';\nimport { dateTimestampProvider } from './scheduler/dateTimestampProvider';\n\n/**\n * A variant of {@link Subject} that \"replays\" old values to new subscribers by emitting them when they first subscribe.\n *\n * `ReplaySubject` has an internal buffer that will store a specified number of values that it has observed. Like `Subject`,\n * `ReplaySubject` \"observes\" values by having them passed to its `next` method. 
When it observes a value, it will store that\n * value for a time determined by the configuration of the `ReplaySubject`, as passed to its constructor.\n *\n * When a new subscriber subscribes to the `ReplaySubject` instance, it will synchronously emit all values in its buffer in\n * a First-In-First-Out (FIFO) manner. The `ReplaySubject` will also complete, if it has observed completion; and it will\n * error if it has observed an error.\n *\n * There are two main configuration items to be concerned with:\n *\n * 1. `bufferSize` - This will determine how many items are stored in the buffer, defaults to infinite.\n * 2. `windowTime` - The amount of time to hold a value in the buffer before removing it from the buffer.\n *\n * Both configurations may exist simultaneously. So if you would like to buffer a maximum of 3 values, as long as the values\n * are less than 2 seconds old, you could do so with a `new ReplaySubject(3, 2000)`.\n *\n * ### Differences with BehaviorSubject\n *\n * `BehaviorSubject` is similar to `new ReplaySubject(1)`, with a couple of exceptions:\n *\n * 1. `BehaviorSubject` comes \"primed\" with a single value upon construction.\n * 2. `ReplaySubject` will replay values, even after observing an error, where `BehaviorSubject` will not.\n *\n * @see {@link Subject}\n * @see {@link BehaviorSubject}\n * @see {@link shareReplay}\n */\nexport class ReplaySubject extends Subject {\n private _buffer: (T | number)[] = [];\n private _infiniteTimeWindow = true;\n\n /**\n * @param bufferSize The size of the buffer to replay on subscription\n * @param windowTime The amount of time the buffered items will stay buffered\n * @param timestampProvider An object with a `now()` method that provides the current timestamp. 
This is used to\n * calculate the amount of time something has been buffered.\n */\n constructor(\n private _bufferSize = Infinity,\n private _windowTime = Infinity,\n private _timestampProvider: TimestampProvider = dateTimestampProvider\n ) {\n super();\n this._infiniteTimeWindow = _windowTime === Infinity;\n this._bufferSize = Math.max(1, _bufferSize);\n this._windowTime = Math.max(1, _windowTime);\n }\n\n next(value: T): void {\n const { isStopped, _buffer, _infiniteTimeWindow, _timestampProvider, _windowTime } = this;\n if (!isStopped) {\n _buffer.push(value);\n !_infiniteTimeWindow && _buffer.push(_timestampProvider.now() + _windowTime);\n }\n this._trimBuffer();\n super.next(value);\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): Subscription {\n this._throwIfClosed();\n this._trimBuffer();\n\n const subscription = this._innerSubscribe(subscriber);\n\n const { _infiniteTimeWindow, _buffer } = this;\n // We use a copy here, so reentrant code does not mutate our array while we're\n // emitting it to a new subscriber.\n const copy = _buffer.slice();\n for (let i = 0; i < copy.length && !subscriber.closed; i += _infiniteTimeWindow ? 1 : 2) {\n subscriber.next(copy[i] as T);\n }\n\n this._checkFinalizedStatuses(subscriber);\n\n return subscription;\n }\n\n private _trimBuffer() {\n const { _bufferSize, _timestampProvider, _buffer, _infiniteTimeWindow } = this;\n // If we don't have an infinite buffer size, and we're over the length,\n // use splice to truncate the old buffer values off. Note that we have to\n // double the size for instances where we're not using an infinite time window\n // because we're storing the values and the timestamps in the same array.\n const adjustedBufferSize = (_infiniteTimeWindow ? 
1 : 2) * _bufferSize;\n _bufferSize < Infinity && adjustedBufferSize < _buffer.length && _buffer.splice(0, _buffer.length - adjustedBufferSize);\n\n // Now, if we're not in an infinite time window, remove all values where the time is\n // older than what is allowed.\n if (!_infiniteTimeWindow) {\n const now = _timestampProvider.now();\n let last = 0;\n // Search the array for the first timestamp that isn't expired and\n // truncate the buffer up to that point.\n for (let i = 1; i < _buffer.length && (_buffer[i] as number) <= now; i += 2) {\n last = i;\n }\n last && _buffer.splice(0, last + 1);\n }\n }\n}\n", "import { Scheduler } from '../Scheduler';\nimport { Subscription } from '../Subscription';\nimport { SchedulerAction } from '../types';\n\n/**\n * A unit of work to be executed in a `scheduler`. An action is typically\n * created from within a {@link SchedulerLike} and an RxJS user does not need to concern\n * themselves about creating and manipulating an Action.\n *\n * ```ts\n * class Action extends Subscription {\n * new (scheduler: Scheduler, work: (state?: T) => void);\n * schedule(state?: T, delay: number = 0): Subscription;\n * }\n * ```\n *\n * @class Action\n */\nexport class Action extends Subscription {\n constructor(scheduler: Scheduler, work: (this: SchedulerAction, state?: T) => void) {\n super();\n }\n /**\n * Schedules this action on its parent {@link SchedulerLike} for execution. May be passed\n * some context object, `state`. 
May happen at some point in the future,\n * according to the `delay` parameter, if specified.\n * @param {T} [state] Some contextual data that the `work` function uses when\n * called by the Scheduler.\n * @param {number} [delay] Time to wait before executing the work, where the\n * time unit is implicit and defined by the Scheduler.\n * @return {void}\n */\n public schedule(state?: T, delay: number = 0): Subscription {\n return this;\n }\n}\n", "import type { TimerHandle } from './timerHandle';\ntype SetIntervalFunction = (handler: () => void, timeout?: number, ...args: any[]) => TimerHandle;\ntype ClearIntervalFunction = (handle: TimerHandle) => void;\n\ninterface IntervalProvider {\n setInterval: SetIntervalFunction;\n clearInterval: ClearIntervalFunction;\n delegate:\n | {\n setInterval: SetIntervalFunction;\n clearInterval: ClearIntervalFunction;\n }\n | undefined;\n}\n\nexport const intervalProvider: IntervalProvider = {\n // When accessing the delegate, use the variable rather than `this` so that\n // the functions can be called without being bound to the provider.\n setInterval(handler: () => void, timeout?: number, ...args) {\n const { delegate } = intervalProvider;\n if (delegate?.setInterval) {\n return delegate.setInterval(handler, timeout, ...args);\n }\n return setInterval(handler, timeout, ...args);\n },\n clearInterval(handle) {\n const { delegate } = intervalProvider;\n return (delegate?.clearInterval || clearInterval)(handle as any);\n },\n delegate: undefined,\n};\n", "import { Action } from './Action';\nimport { SchedulerAction } from '../types';\nimport { Subscription } from '../Subscription';\nimport { AsyncScheduler } from './AsyncScheduler';\nimport { intervalProvider } from './intervalProvider';\nimport { arrRemove } from '../util/arrRemove';\nimport { TimerHandle } from './timerHandle';\n\nexport class AsyncAction extends Action {\n public id: TimerHandle | undefined;\n public state?: T;\n // @ts-ignore: Property has no initializer and is 
not definitely assigned\n public delay: number;\n protected pending: boolean = false;\n\n constructor(protected scheduler: AsyncScheduler, protected work: (this: SchedulerAction, state?: T) => void) {\n super(scheduler, work);\n }\n\n public schedule(state?: T, delay: number = 0): Subscription {\n if (this.closed) {\n return this;\n }\n\n // Always replace the current state with the new state.\n this.state = state;\n\n const id = this.id;\n const scheduler = this.scheduler;\n\n //\n // Important implementation note:\n //\n // Actions only execute once by default, unless rescheduled from within the\n // scheduled callback. This allows us to implement single and repeat\n // actions via the same code path, without adding API surface area, as well\n // as mimic traditional recursion but across asynchronous boundaries.\n //\n // However, JS runtimes and timers distinguish between intervals achieved by\n // serial `setTimeout` calls vs. a single `setInterval` call. An interval of\n // serial `setTimeout` calls can be individually delayed, which delays\n // scheduling the next `setTimeout`, and so on. `setInterval` attempts to\n // guarantee the interval callback will be invoked more precisely to the\n // interval period, regardless of load.\n //\n // Therefore, we use `setInterval` to schedule single and repeat actions.\n // If the action reschedules itself with the same delay, the interval is not\n // canceled. If the action doesn't reschedule, or reschedules with a\n // different delay, the interval will be canceled after scheduled callback\n // execution.\n //\n if (id != null) {\n this.id = this.recycleAsyncId(scheduler, id, delay);\n }\n\n // Set the pending flag indicating that this action has been scheduled, or\n // has recursively rescheduled itself.\n this.pending = true;\n\n this.delay = delay;\n // If this action has already an async Id, don't request a new one.\n this.id = this.id ?? 
this.requestAsyncId(scheduler, this.id, delay);\n\n return this;\n }\n\n protected requestAsyncId(scheduler: AsyncScheduler, _id?: TimerHandle, delay: number = 0): TimerHandle {\n return intervalProvider.setInterval(scheduler.flush.bind(scheduler, this), delay);\n }\n\n protected recycleAsyncId(_scheduler: AsyncScheduler, id?: TimerHandle, delay: number | null = 0): TimerHandle | undefined {\n // If this action is rescheduled with the same delay time, don't clear the interval id.\n if (delay != null && this.delay === delay && this.pending === false) {\n return id;\n }\n // Otherwise, if the action's delay time is different from the current delay,\n // or the action has been rescheduled before it's executed, clear the interval id\n if (id != null) {\n intervalProvider.clearInterval(id);\n }\n\n return undefined;\n }\n\n /**\n * Immediately executes this action and the `work` it contains.\n * @return {any}\n */\n public execute(state: T, delay: number): any {\n if (this.closed) {\n return new Error('executing a cancelled action');\n }\n\n this.pending = false;\n const error = this._execute(state, delay);\n if (error) {\n return error;\n } else if (this.pending === false && this.id != null) {\n // Dequeue if the action didn't reschedule itself. Don't call\n // unsubscribe(), because the action could reschedule later.\n // For example:\n // ```\n // scheduler.schedule(function doWork(counter) {\n // /* ... I'm a busy worker bee ... 
*/\n // var originalAction = this;\n // /* wait 100ms before rescheduling the action */\n // setTimeout(function () {\n // originalAction.schedule(counter + 1);\n // }, 100);\n // }, 1000);\n // ```\n this.id = this.recycleAsyncId(this.scheduler, this.id, null);\n }\n }\n\n protected _execute(state: T, _delay: number): any {\n let errored: boolean = false;\n let errorValue: any;\n try {\n this.work(state);\n } catch (e) {\n errored = true;\n // HACK: Since code elsewhere is relying on the \"truthiness\" of the\n // return here, we can't have it return \"\" or 0 or false.\n // TODO: Clean this up when we refactor schedulers mid-version-8 or so.\n errorValue = e ? e : new Error('Scheduled action threw falsy error');\n }\n if (errored) {\n this.unsubscribe();\n return errorValue;\n }\n }\n\n unsubscribe() {\n if (!this.closed) {\n const { id, scheduler } = this;\n const { actions } = scheduler;\n\n this.work = this.state = this.scheduler = null!;\n this.pending = false;\n\n arrRemove(actions, this);\n if (id != null) {\n this.id = this.recycleAsyncId(scheduler, id, null);\n }\n\n this.delay = null!;\n super.unsubscribe();\n }\n }\n}\n", "import { Action } from './scheduler/Action';\nimport { Subscription } from './Subscription';\nimport { SchedulerLike, SchedulerAction } from './types';\nimport { dateTimestampProvider } from './scheduler/dateTimestampProvider';\n\n/**\n * An execution context and a data structure to order tasks and schedule their\n * execution. Provides a notion of (potentially virtual) time, through the\n * `now()` getter method.\n *\n * Each unit of work in a Scheduler is called an `Action`.\n *\n * ```ts\n * class Scheduler {\n * now(): number;\n * schedule(work, delay?, state?): Subscription;\n * }\n * ```\n *\n * @class Scheduler\n * @deprecated Scheduler is an internal implementation detail of RxJS, and\n * should not be used directly. Rather, create your own class and implement\n * {@link SchedulerLike}. 
Will be made internal in v8.\n */\nexport class Scheduler implements SchedulerLike {\n public static now: () => number = dateTimestampProvider.now;\n\n constructor(private schedulerActionCtor: typeof Action, now: () => number = Scheduler.now) {\n this.now = now;\n }\n\n /**\n * A getter method that returns a number representing the current time\n * (at the time this function was called) according to the scheduler's own\n * internal clock.\n * @return {number} A number that represents the current time. May or may not\n * have a relation to wall-clock time. May or may not refer to a time unit\n * (e.g. milliseconds).\n */\n public now: () => number;\n\n /**\n * Schedules a function, `work`, for execution. May happen at some point in\n * the future, according to the `delay` parameter, if specified. May be passed\n * some context object, `state`, which will be passed to the `work` function.\n *\n * The given arguments will be processed an stored as an Action object in a\n * queue of actions.\n *\n * @param {function(state: ?T): ?Subscription} work A function representing a\n * task, or some unit of work to be executed by the Scheduler.\n * @param {number} [delay] Time to wait before executing the work, where the\n * time unit is implicit and defined by the Scheduler itself.\n * @param {T} [state] Some contextual data that the `work` function uses when\n * called by the Scheduler.\n * @return {Subscription} A subscription in order to be able to unsubscribe\n * the scheduled work.\n */\n public schedule(work: (this: SchedulerAction, state?: T) => void, delay: number = 0, state?: T): Subscription {\n return new this.schedulerActionCtor(this, work).schedule(state, delay);\n }\n}\n", "import { Scheduler } from '../Scheduler';\nimport { Action } from './Action';\nimport { AsyncAction } from './AsyncAction';\nimport { TimerHandle } from './timerHandle';\n\nexport class AsyncScheduler extends Scheduler {\n public actions: Array> = [];\n /**\n * A flag to indicate whether the 
Scheduler is currently executing a batch of\n * queued actions.\n * @type {boolean}\n * @internal\n */\n public _active: boolean = false;\n /**\n * An internal ID used to track the latest asynchronous task such as those\n * coming from `setTimeout`, `setInterval`, `requestAnimationFrame`, and\n * others.\n * @type {any}\n * @internal\n */\n public _scheduled: TimerHandle | undefined;\n\n constructor(SchedulerAction: typeof Action, now: () => number = Scheduler.now) {\n super(SchedulerAction, now);\n }\n\n public flush(action: AsyncAction): void {\n const { actions } = this;\n\n if (this._active) {\n actions.push(action);\n return;\n }\n\n let error: any;\n this._active = true;\n\n do {\n if ((error = action.execute(action.state, action.delay))) {\n break;\n }\n } while ((action = actions.shift()!)); // exhaust the scheduler queue\n\n this._active = false;\n\n if (error) {\n while ((action = actions.shift()!)) {\n action.unsubscribe();\n }\n throw error;\n }\n }\n}\n", "import { AsyncAction } from './AsyncAction';\nimport { AsyncScheduler } from './AsyncScheduler';\n\n/**\n *\n * Async Scheduler\n *\n * Schedule task as if you used setTimeout(task, duration)\n *\n * `async` scheduler schedules tasks asynchronously, by putting them on the JavaScript\n * event loop queue. 
It is best used to delay tasks in time or to schedule tasks repeating\n * in intervals.\n *\n * If you just want to \"defer\" task, that is to perform it right after currently\n * executing synchronous code ends (commonly achieved by `setTimeout(deferredTask, 0)`),\n * better choice will be the {@link asapScheduler} scheduler.\n *\n * ## Examples\n * Use async scheduler to delay task\n * ```ts\n * import { asyncScheduler } from 'rxjs';\n *\n * const task = () => console.log('it works!');\n *\n * asyncScheduler.schedule(task, 2000);\n *\n * // After 2 seconds logs:\n * // \"it works!\"\n * ```\n *\n * Use async scheduler to repeat task in intervals\n * ```ts\n * import { asyncScheduler } from 'rxjs';\n *\n * function task(state) {\n * console.log(state);\n * this.schedule(state + 1, 1000); // `this` references currently executing Action,\n * // which we reschedule with new state and delay\n * }\n *\n * asyncScheduler.schedule(task, 3000, 0);\n *\n * // Logs:\n * // 0 after 3s\n * // 1 after 4s\n * // 2 after 5s\n * // 3 after 6s\n * ```\n */\n\nexport const asyncScheduler = new AsyncScheduler(AsyncAction);\n\n/**\n * @deprecated Renamed to {@link asyncScheduler}. Will be removed in v8.\n */\nexport const async = asyncScheduler;\n", "import { AsyncAction } from './AsyncAction';\nimport { Subscription } from '../Subscription';\nimport { QueueScheduler } from './QueueScheduler';\nimport { SchedulerAction } from '../types';\nimport { TimerHandle } from './timerHandle';\n\nexport class QueueAction extends AsyncAction {\n constructor(protected scheduler: QueueScheduler, protected work: (this: SchedulerAction, state?: T) => void) {\n super(scheduler, work);\n }\n\n public schedule(state?: T, delay: number = 0): Subscription {\n if (delay > 0) {\n return super.schedule(state, delay);\n }\n this.delay = delay;\n this.state = state;\n this.scheduler.flush(this);\n return this;\n }\n\n public execute(state: T, delay: number): any {\n return delay > 0 || this.closed ? 
super.execute(state, delay) : this._execute(state, delay);\n }\n\n protected requestAsyncId(scheduler: QueueScheduler, id?: TimerHandle, delay: number = 0): TimerHandle {\n // If delay exists and is greater than 0, or if the delay is null (the\n // action wasn't rescheduled) but was originally scheduled as an async\n // action, then recycle as an async action.\n\n if ((delay != null && delay > 0) || (delay == null && this.delay > 0)) {\n return super.requestAsyncId(scheduler, id, delay);\n }\n\n // Otherwise flush the scheduler starting with this action.\n scheduler.flush(this);\n\n // HACK: In the past, this was returning `void`. However, `void` isn't a valid\n // `TimerHandle`, and generally the return value here isn't really used. So the\n // compromise is to return `0` which is both \"falsy\" and a valid `TimerHandle`,\n // as opposed to refactoring every other instanceo of `requestAsyncId`.\n return 0;\n }\n}\n", "import { AsyncScheduler } from './AsyncScheduler';\n\nexport class QueueScheduler extends AsyncScheduler {\n}\n", "import { QueueAction } from './QueueAction';\nimport { QueueScheduler } from './QueueScheduler';\n\n/**\n *\n * Queue Scheduler\n *\n * Put every next task on a queue, instead of executing it immediately\n *\n * `queue` scheduler, when used with delay, behaves the same as {@link asyncScheduler} scheduler.\n *\n * When used without delay, it schedules given task synchronously - executes it right when\n * it is scheduled. 
However when called recursively, that is when inside the scheduled task,\n * another task is scheduled with queue scheduler, instead of executing immediately as well,\n * that task will be put on a queue and wait for current one to finish.\n *\n * This means that when you execute task with `queue` scheduler, you are sure it will end\n * before any other task scheduled with that scheduler will start.\n *\n * ## Examples\n * Schedule recursively first, then do something\n * ```ts\n * import { queueScheduler } from 'rxjs';\n *\n * queueScheduler.schedule(() => {\n * queueScheduler.schedule(() => console.log('second')); // will not happen now, but will be put on a queue\n *\n * console.log('first');\n * });\n *\n * // Logs:\n * // \"first\"\n * // \"second\"\n * ```\n *\n * Reschedule itself recursively\n * ```ts\n * import { queueScheduler } from 'rxjs';\n *\n * queueScheduler.schedule(function(state) {\n * if (state !== 0) {\n * console.log('before', state);\n * this.schedule(state - 1); // `this` references currently executing Action,\n * // which we reschedule with new state\n * console.log('after', state);\n * }\n * }, 0, 3);\n *\n * // In scheduler that runs recursively, you would expect:\n * // \"before\", 3\n * // \"before\", 2\n * // \"before\", 1\n * // \"after\", 1\n * // \"after\", 2\n * // \"after\", 3\n *\n * // But with queue it logs:\n * // \"before\", 3\n * // \"after\", 3\n * // \"before\", 2\n * // \"after\", 2\n * // \"before\", 1\n * // \"after\", 1\n * ```\n */\n\nexport const queueScheduler = new QueueScheduler(QueueAction);\n\n/**\n * @deprecated Renamed to {@link queueScheduler}. 
Will be removed in v8.\n */\nexport const queue = queueScheduler;\n", "import { AsyncAction } from './AsyncAction';\nimport { AnimationFrameScheduler } from './AnimationFrameScheduler';\nimport { SchedulerAction } from '../types';\nimport { animationFrameProvider } from './animationFrameProvider';\nimport { TimerHandle } from './timerHandle';\n\nexport class AnimationFrameAction extends AsyncAction {\n constructor(protected scheduler: AnimationFrameScheduler, protected work: (this: SchedulerAction, state?: T) => void) {\n super(scheduler, work);\n }\n\n protected requestAsyncId(scheduler: AnimationFrameScheduler, id?: TimerHandle, delay: number = 0): TimerHandle {\n // If delay is greater than 0, request as an async action.\n if (delay !== null && delay > 0) {\n return super.requestAsyncId(scheduler, id, delay);\n }\n // Push the action to the end of the scheduler queue.\n scheduler.actions.push(this);\n // If an animation frame has already been requested, don't request another\n // one. If an animation frame hasn't been requested yet, request one. Return\n // the current animation frame request id.\n return scheduler._scheduled || (scheduler._scheduled = animationFrameProvider.requestAnimationFrame(() => scheduler.flush(undefined)));\n }\n\n protected recycleAsyncId(scheduler: AnimationFrameScheduler, id?: TimerHandle, delay: number = 0): TimerHandle | undefined {\n // If delay exists and is greater than 0, or if the delay is null (the\n // action wasn't rescheduled) but was originally scheduled as an async\n // action, then recycle as an async action.\n if (delay != null ? 
delay > 0 : this.delay > 0) {\n return super.recycleAsyncId(scheduler, id, delay);\n }\n // If the scheduler queue has no remaining actions with the same async id,\n // cancel the requested animation frame and set the scheduled flag to\n // undefined so the next AnimationFrameAction will request its own.\n const { actions } = scheduler;\n if (id != null && actions[actions.length - 1]?.id !== id) {\n animationFrameProvider.cancelAnimationFrame(id as number);\n scheduler._scheduled = undefined;\n }\n // Return undefined so the action knows to request a new async id if it's rescheduled.\n return undefined;\n }\n}\n", "import { AsyncAction } from './AsyncAction';\nimport { AsyncScheduler } from './AsyncScheduler';\n\nexport class AnimationFrameScheduler extends AsyncScheduler {\n public flush(action?: AsyncAction): void {\n this._active = true;\n // The async id that effects a call to flush is stored in _scheduled.\n // Before executing an action, it's necessary to check the action's async\n // id to determine whether it's supposed to be executed in the current\n // flush.\n // Previous implementations of this method used a count to determine this,\n // but that was unsound, as actions that are unsubscribed - i.e. 
cancelled -\n // are removed from the actions array and that can shift actions that are\n // scheduled to be executed in a subsequent flush into positions at which\n // they are executed within the current flush.\n const flushId = this._scheduled;\n this._scheduled = undefined;\n\n const { actions } = this;\n let error: any;\n action = action || actions.shift()!;\n\n do {\n if ((error = action.execute(action.state, action.delay))) {\n break;\n }\n } while ((action = actions[0]) && action.id === flushId && actions.shift());\n\n this._active = false;\n\n if (error) {\n while ((action = actions[0]) && action.id === flushId && actions.shift()) {\n action.unsubscribe();\n }\n throw error;\n }\n }\n}\n", "import { AnimationFrameAction } from './AnimationFrameAction';\nimport { AnimationFrameScheduler } from './AnimationFrameScheduler';\n\n/**\n *\n * Animation Frame Scheduler\n *\n * Perform task when `window.requestAnimationFrame` would fire\n *\n * When `animationFrame` scheduler is used with delay, it will fall back to {@link asyncScheduler} scheduler\n * behaviour.\n *\n * Without delay, `animationFrame` scheduler can be used to create smooth browser animations.\n * It makes sure scheduled task will happen just before next browser content repaint,\n * thus performing animations as efficiently as possible.\n *\n * ## Example\n * Schedule div height animation\n * ```ts\n * // html:
\n * import { animationFrameScheduler } from 'rxjs';\n *\n * const div = document.querySelector('div');\n *\n * animationFrameScheduler.schedule(function(height) {\n * div.style.height = height + \"px\";\n *\n * this.schedule(height + 1); // `this` references currently executing Action,\n * // which we reschedule with new state\n * }, 0, 0);\n *\n * // You will see a div element growing in height\n * ```\n */\n\nexport const animationFrameScheduler = new AnimationFrameScheduler(AnimationFrameAction);\n\n/**\n * @deprecated Renamed to {@link animationFrameScheduler}. Will be removed in v8.\n */\nexport const animationFrame = animationFrameScheduler;\n", "import { Observable } from '../Observable';\nimport { SchedulerLike } from '../types';\n\n/**\n * A simple Observable that emits no items to the Observer and immediately\n * emits a complete notification.\n *\n * Just emits 'complete', and nothing else.\n *\n * ![](empty.png)\n *\n * A simple Observable that only emits the complete notification. It can be used\n * for composing with other Observables, such as in a {@link mergeMap}.\n *\n * ## Examples\n *\n * Log complete notification\n *\n * ```ts\n * import { EMPTY } from 'rxjs';\n *\n * EMPTY.subscribe({\n * next: () => console.log('Next'),\n * complete: () => console.log('Complete!')\n * });\n *\n * // Outputs\n * // Complete!\n * ```\n *\n * Emit the number 7, then complete\n *\n * ```ts\n * import { EMPTY, startWith } from 'rxjs';\n *\n * const result = EMPTY.pipe(startWith(7));\n * result.subscribe(x => console.log(x));\n *\n * // Outputs\n * // 7\n * ```\n *\n * Map and flatten only odd numbers to the sequence `'a'`, `'b'`, `'c'`\n *\n * ```ts\n * import { interval, mergeMap, of, EMPTY } from 'rxjs';\n *\n * const interval$ = interval(1000);\n * const result = interval$.pipe(\n * mergeMap(x => x % 2 === 1 ? 
of('a', 'b', 'c') : EMPTY),\n * );\n * result.subscribe(x => console.log(x));\n *\n * // Results in the following to the console:\n * // x is equal to the count on the interval, e.g. (0, 1, 2, 3, ...)\n * // x will occur every 1000ms\n * // if x % 2 is equal to 1, print a, b, c (each on its own)\n * // if x % 2 is not equal to 1, nothing will be output\n * ```\n *\n * @see {@link Observable}\n * @see {@link NEVER}\n * @see {@link of}\n * @see {@link throwError}\n */\nexport const EMPTY = new Observable((subscriber) => subscriber.complete());\n\n/**\n * @param scheduler A {@link SchedulerLike} to use for scheduling\n * the emission of the complete notification.\n * @deprecated Replaced with the {@link EMPTY} constant or {@link scheduled} (e.g. `scheduled([], scheduler)`). Will be removed in v8.\n */\nexport function empty(scheduler?: SchedulerLike) {\n return scheduler ? emptyScheduled(scheduler) : EMPTY;\n}\n\nfunction emptyScheduled(scheduler: SchedulerLike) {\n return new Observable((subscriber) => scheduler.schedule(() => subscriber.complete()));\n}\n", "import { SchedulerLike } from '../types';\nimport { isFunction } from './isFunction';\n\nexport function isScheduler(value: any): value is SchedulerLike {\n return value && isFunction(value.schedule);\n}\n", "import { SchedulerLike } from '../types';\nimport { isFunction } from './isFunction';\nimport { isScheduler } from './isScheduler';\n\nfunction last(arr: T[]): T | undefined {\n return arr[arr.length - 1];\n}\n\nexport function popResultSelector(args: any[]): ((...args: unknown[]) => unknown) | undefined {\n return isFunction(last(args)) ? args.pop() : undefined;\n}\n\nexport function popScheduler(args: any[]): SchedulerLike | undefined {\n return isScheduler(last(args)) ? args.pop() : undefined;\n}\n\nexport function popNumber(args: any[], defaultValue: number): number {\n return typeof last(args) === 'number' ? args.pop()! 
: defaultValue;\n}\n", "export const isArrayLike = ((x: any): x is ArrayLike => x && typeof x.length === 'number' && typeof x !== 'function');", "import { isFunction } from \"./isFunction\";\n\n/**\n * Tests to see if the object is \"thennable\".\n * @param value the object to test\n */\nexport function isPromise(value: any): value is PromiseLike {\n return isFunction(value?.then);\n}\n", "import { InteropObservable } from '../types';\nimport { observable as Symbol_observable } from '../symbol/observable';\nimport { isFunction } from './isFunction';\n\n/** Identifies an input as being Observable (but not necessary an Rx Observable) */\nexport function isInteropObservable(input: any): input is InteropObservable {\n return isFunction(input[Symbol_observable]);\n}\n", "import { isFunction } from './isFunction';\n\nexport function isAsyncIterable(obj: any): obj is AsyncIterable {\n return Symbol.asyncIterator && isFunction(obj?.[Symbol.asyncIterator]);\n}\n", "/**\n * Creates the TypeError to throw if an invalid object is passed to `from` or `scheduled`.\n * @param input The object that was passed.\n */\nexport function createInvalidObservableTypeError(input: any) {\n // TODO: We should create error codes that can be looked up, so this can be less verbose.\n return new TypeError(\n `You provided ${\n input !== null && typeof input === 'object' ? 'an invalid object' : `'${input}'`\n } where a stream was expected. 
You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.`\n );\n}\n", "export function getSymbolIterator(): symbol {\n if (typeof Symbol !== 'function' || !Symbol.iterator) {\n return '@@iterator' as any;\n }\n\n return Symbol.iterator;\n}\n\nexport const iterator = getSymbolIterator();\n", "import { iterator as Symbol_iterator } from '../symbol/iterator';\nimport { isFunction } from './isFunction';\n\n/** Identifies an input as being an Iterable */\nexport function isIterable(input: any): input is Iterable {\n return isFunction(input?.[Symbol_iterator]);\n}\n", "import { ReadableStreamLike } from '../types';\nimport { isFunction } from './isFunction';\n\nexport async function* readableStreamLikeToAsyncGenerator(readableStream: ReadableStreamLike): AsyncGenerator {\n const reader = readableStream.getReader();\n try {\n while (true) {\n const { value, done } = await reader.read();\n if (done) {\n return;\n }\n yield value!;\n }\n } finally {\n reader.releaseLock();\n }\n}\n\nexport function isReadableStreamLike(obj: any): obj is ReadableStreamLike {\n // We don't want to use instanceof checks because they would return\n // false for instances from another Realm, like an