From 6acd40c022b377ddf70574f40d3dd10113174be4 Mon Sep 17 00:00:00 2001
From: David Dotson
Date: Tue, 28 Jan 2020 22:03:34 -0700
Subject: [PATCH 1/9] Refactor of subsampling module

All three of our subsampling functions should now be able to consume any
dataframe in our two standard u_nk and dHdl forms and produce meaningful
results. Each of these functions now performs a groupby with all non-time
indexes, and separately performs its operation on each group.

Some notes on the implementation:

1. The `series` kwarg is now the `column` arg for `statistical_inefficiency`
   and `equilibrium_detection`. We now use a column name, since requiring a
   column of the dataframe gives us the index matching and ordering that we
   previously had to do backflips to guarantee. It is still possible to use
   another series as the basis for either operation, but it should be grafted
   onto the dataframe as a column to do so.
2. Since `column` is an arg, it is now required for the above functions.
   Therefore, those functions no longer behave like `slicing` when `column`
   isn't specified. My rationale for breaking this is to favor explicitness
   and to avoid surprising users with behavior that throws no warnings but
   silently may not be doing what they want.
3. I've added `force` kwargs for ignoring exceptions. The only exception
   these functions now throw is one due to duplicate time values within a
   grouping, as this tends to indicate that two or more timeseries have been
   dumped together, which is unlikely to be what one wants from subsampling
   functions that assume a single correlated timeseries in order to work as
   intended.

Some other nice things:

1. I will be adding the minimal assumptions the standard forms make to their
   doc page as part of this change's PR. One of the assumptions I want to
   propagate is that the outermost index is *always* time, or at least
   something implying time (frame, sample, whatever). It needn't be called
   'time', however, for all the tooling to work.
2. These subsamplers now explicitly call out that they *can't be expected*
   to preserve any custom ordering of rows. They will always be doing sorting
   of some kind, so the time index values are meaningful.
---
 src/alchemlyb/preprocessing/subsampling.py | 241 +++++++++++++--------
 src/alchemlyb/tests/test_preprocessing.py  |  56 ++---
 2 files changed, 175 insertions(+), 122 deletions(-)

diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py
index 0b1a91a1..ebcb2405 100644
--- a/src/alchemlyb/preprocessing/subsampling.py
+++ b/src/alchemlyb/preprocessing/subsampling.py
@@ -1,22 +1,23 @@
 """Functions for subsampling datasets.
 
 """
+
+from collections import defaultdict
 import numpy as np
-from pymbar.timeseries import (statisticalInefficiency,
-                               detectEquilibration,
-                               subsampleCorrelatedData, )
+import pandas as pd
+from pymbar import timeseries
 
 
 def _check_multiple_times(df):
     return df.sort_index(0).reset_index(0).duplicated('time').any()
 
 
-def _check_sorted(df):
-    return df.reset_index(0)['time'].is_monotonic_increasing
-
-
 def slicing(df, lower=None, upper=None, step=None, force=False):
-    """Subsample a DataFrame using simple slicing.
+    """Subsample an alchemlyb DataFrame using slicing on the outermost index (time).
+
+    Slicing will be performed separately on groups of rows corresponding to
+    each set of lambda values present in the DataFrame's index. Each group will
+    be sorted on the outermost (time) index prior to slicing.
     Parameters
     ----------
@@ -37,60 +38,62 @@ def slicing(df, lower=None, upper=None, step=None, force=False):
         `df` subsampled.
 
     """
-    try:
-        df = df.loc[lower:upper:step]
-    except KeyError:
-        raise KeyError("DataFrame rows must be sorted by time, increasing.")
+    # we always start with a full index sort on the whole dataframe
+    df = df.sort_index()
+
+    index_names = list(df.index.names[1:])
+    resdf = list()
+
+    for name, group in df.groupby(level=index_names):
+        group_s = group.loc[lower:upper:step]
 
-    if not force and _check_multiple_times(df):
-        raise KeyError("Duplicate time values found; it's generally advised "
-                       "to use slicing on DataFrames with unique time values "
-                       "for each row. Use `force=True` to ignore this error.")
+        if not force and _check_multiple_times(group_s):
+            raise KeyError("Duplicate time values found; it's generally advised "
+                           "to use slicing on DataFrames with unique time values "
+                           "for each row. Use `force=True` to ignore this error.")
 
-    # drop any rows that have missing values
-    df = df.dropna()
+        resdf.append(group_s)
 
-    return df
+    return pd.concat(resdf)
 
 
-def statistical_inefficiency(df, series=None, lower=None, upper=None, step=None,
-                             conservative=True):
-    """Subsample a DataFrame based on the calculated statistical inefficiency
-    of a timeseries.
+def statistical_inefficiency(df, column, lower=None, upper=None, step=None,
+                             conservative=True, return_calculated=False, force=False):
+    """Subsample an alchemlyb DataFrame based on the calculated statistical inefficiency
+    of one of its columns.
 
-    If `series` is ``None``, then this function will behave the same as
-    :func:`slicing`.
+    Calculation of statistical inefficiency and subsequent subsampling will be
+    performed separately on groups of rows corresponding to each set of lambda
+    values present in the DataFrame's index. Each group will be sorted on the
+    outermost (time) index prior to any calculation.
 
     Parameters
     ----------
     df : DataFrame
-        DataFrame to subsample according statistical inefficiency of `series`.
-    series : Series
-        Series to use for calculating statistical inefficiency. If ``None``,
-        no statistical inefficiency-based subsampling will be performed.
+        DataFrame to subsample according to the statistical inefficiency of `column`.
+    column : label
+        Label of column to use for calculating statistical inefficiency.
     lower : float
-        Lower bound to pre-slice `series` data from.
+        Lower time to pre-slice data from.
     upper : float
-        Upper bound to pre-slice `series` to (inclusive).
+        Upper time to pre-slice data to (inclusive).
     step : int
-        Step between `series` items to pre-slice by.
+        Step between rows to pre-slice by.
     conservative : bool
         ``True`` use ``ceil(statistical_inefficiency)`` to slice the data in uniform
         intervals (the default). ``False`` will sample at non-uniform intervals to
        closely match the (fractional) statistical_inefficiency, as implemented
         in :func:`pymbar.timeseries.subsampleCorrelatedData`.
+    return_calculated : bool
+        ``True`` return a tuple, with the second item a dict giving, e.g. `statinef`
+        for each group.
+    force : bool
+        Ignore checks that DataFrame is in proper form for expected behavior.
 
     Returns
     -------
     DataFrame
-        `df` subsampled according to subsampled `series`.
-
-    Warning
-    -------
-    The `series` and the data to be sliced, `df`, need to have the same number
-    of elements because the statistical inefficiency is calculated based on
-    the index of the series (and not an associated time). At the moment there is
-    no automatic conversion from a time to an index.
+        `df` subsampled according to subsampled `column`.
     Note
     ----
@@ -108,95 +111,155 @@ def statistical_inefficiency(df, series=None, lower=None, upper=None, step=None,
 
     pymbar.timeseries.subsampleCorrelatedData : used for subsampling
 
 
+    .. versionchanged:: 0.4.0
+       The ``series`` keyword was replaced with the ``column`` keyword.
+       The function no longer takes an arbitrary series as input for
+       calculating statistical inefficiency.
+
     .. versionchanged:: 0.2.0
        The ``conservative`` keyword was added and the method is now using
-       ``pymbar.timeseries.statisticalInefficiency()``; previously, the statistical
+       ``pymbar.timeseries.subsampleCorrelatedData()``; previously, the statistical
        inefficiency was _rounded_ (instead of ``ceil()``) and thus one could
        end up with correlated data.
 
     """
-    if _check_multiple_times(df):
-        raise KeyError("Duplicate time values found; statistical inefficiency "
-                       "only works on a single, contiguous, "
-                       "and sorted timeseries.")
+    # we always start with a full index sort on the whole dataframe
+    df = df.sort_index()
+
+    index_names = list(df.index.names[1:])
+    resdf = list()
 
-    if not _check_sorted(df):
-        raise KeyError("Statistical inefficiency only works as expected if "
-                       "values are sorted by time, increasing.")
+    if return_calculated:
+        calculated = defaultdict(dict)
 
-    if series is not None:
-        series = slicing(series, lower=lower, upper=upper, step=step)
+    for name, group in df.groupby(level=index_names):
+
+        group_s = slicing(group, lower=lower, upper=upper, step=step)
 
-        if (len(series) != len(df) or
-            not all(series.reset_index()['time'] == df.reset_index()['time'])):
-            raise ValueError("series and data must be sampled at the same times")
+        if not force and _check_multiple_times(group):
+            raise KeyError("Duplicate time values found; statistical inefficiency "
+                           "is only meaningful for a single, contiguous, "
+                           "and sorted timeseries.")
 
-        # calculate statistical inefficiency of series (could use fft=True but needs test)
-        statinef = statisticalInefficiency(series, fast=False)
+        # calculate statistical inefficiency of column (could use fft=True but needs test)
+        statinef = timeseries.statisticalInefficiency(group_s[column], fast=False)
 
         # use the subsampleCorrelatedData function to get the subsample index
-        indices = subsampleCorrelatedData(series, g=statinef,
-                                          conservative=conservative)
-        df = df.iloc[indices]
-    else:
-        df = slicing(df, lower=lower, upper=upper, step=step)
+        indices = timeseries.subsampleCorrelatedData(group_s[column], g=statinef,
+                                                     conservative=conservative)
+
+        resdf.append(group_s.iloc[indices])
 
-    return df
+        if return_calculated:
+            calculated['statinef'][name] = statinef
+
+    if return_calculated:
+        return pd.concat(resdf), calculated
+    else:
+        return pd.concat(resdf)
 
 
-def equilibrium_detection(df, series=None, lower=None, upper=None, step=None):
-    """Subsample a DataFrame using automated equilibrium detection on a timeseries.
+def equilibrium_detection(df, column, lower=None, upper=None, step=None,
+                          conservative=True, return_calculated=False, force=False):
+    """Subsample a DataFrame using automated equilibrium detection on one of
+    its columns.
 
-    If `series` is ``None``, then this function will behave the same as
-    :func:`slicing`.
+    Equilibrium detection and subsequent subsampling will be performed
+    separately on groups of rows corresponding to each set of lambda values
+    present in the DataFrame's index. Each group will be sorted on the
+    outermost (time) index prior to any calculation.
 
     Parameters
     ----------
     df : DataFrame
-        DataFrame to subsample according to equilibrium detection on `series`.
+        DataFrame to subsample according to equilibrium detection on `column`.
-    series : Series
-        Series to detect equilibration on. If ``None``, no equilibrium
-        detection-based subsampling will be performed.
+    column : label
+        Label of column to use for equilibrium detection.
     lower : float
-        Lower bound to pre-slice `series` data from.
+        Lower time to pre-slice data from.
     upper : float
-        Upper bound to pre-slice `series` to (inclusive).
+        Upper time to pre-slice data to (inclusive).
     step : int
-        Step between `series` items to pre-slice by.
+        Step between rows to pre-slice by.
+    conservative : bool
+        ``True`` use ``ceil(statistical_inefficiency)`` to slice the data in uniform
+        intervals (the default). ``False`` will sample at non-uniform intervals to
+        closely match the (fractional) statistical_inefficiency, as implemented
+        in :func:`pymbar.timeseries.subsampleCorrelatedData`.
+    return_calculated : bool
+        ``True`` return a tuple, with the second item a dict giving, e.g. `statinef`
+        for each group.
+    force : bool
+        Ignore checks that DataFrame is in proper form for expected behavior.
 
     Returns
     -------
     DataFrame
-        `df` subsampled according to subsampled `series`.
+        `df` subsampled according to subsampled `column`.
+
+    Note
+    ----
+    For a non-integer statistical inefficiency :math:`g`, the default value
+    ``conservative=True`` will provide _fewer_ data points than allowed by
+    :math:`g` and thus error estimates will be _higher_. For large numbers of
+    data points and converged free energies, the choice should not make a
+    difference. For small numbers of data points, ``conservative=True``
+    decreases a false sense of accuracy and is deemed the more careful and
+    conservative approach.
 
     See Also
     --------
     pymbar.timeseries.detectEquilibration : detailed background
+    pymbar.timeseries.subsampleCorrelatedData : used for subsampling
+
+
+    .. versionchanged:: 0.4.0
+       The ``series`` keyword was replaced with the ``column`` keyword.
+       The function no longer takes an arbitrary series as input for
+       calculating statistical inefficiency.
+
+    .. versionchanged:: 0.4.0
+       The ``conservative`` keyword was added and the method is now using
+       ``pymbar.timeseries.subsampleCorrelatedData()``; previously, the statistical
+       inefficiency was _rounded_ (instead of ``ceil()``) and thus one could
+       end up with correlated data.
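+
+    Example
+    -------
+    A minimal sketch of intended usage (hypothetical; ``dHdl.xvg`` is a
+    placeholder path, and the ``'fep'`` column label is specific to the
+    GROMACS benzene test data; substitute a column from your own data)::
+
+        from alchemlyb.parsing.gmx import extract_dHdl
+        from alchemlyb.preprocessing import equilibrium_detection
+
+        dHdl = extract_dHdl('dHdl.xvg', T=300)
+        dHdl_sub = equilibrium_detection(dHdl, column='fep', lower=200)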
""" - if _check_multiple_times(df): - raise KeyError("Duplicate time values found; equilibrium detection " - "is only meaningful for a single, contiguous, " - "and sorted timeseries.") + # we always start with a full index sort on the whole dataframe + df = df.sort_index() + + index_names = list(df.index.names[1:]) + resdf = list() - if not _check_sorted(df): - raise KeyError("Equilibrium detection only works as expected if " - "values are sorted by time, increasing.") + if return_calculated: + calculated = defaultdict(dict) - if series is not None: - series = slicing(series, lower=lower, upper=upper, step=step) + for name, group in df.groupby(level=index_names): + group_s = slicing(group, lower=lower, upper=upper, step=step) + + if not force and _check_multiple_times(group): + raise KeyError("Duplicate time values found; equilibrium detection " + "is only meaningful for a single, contiguous, " + "and sorted timeseries.") # calculate statistical inefficiency of series, with equilibrium detection - t, statinef, Neff_max = detectEquilibration(series.values) + t, statinef, Neff_max = timeseries.detectEquilibration(group_s[column]) - # we round up - statinef = int(np.rint(statinef)) + # only keep values past `t` + group_s = group_s.iloc[t:] - # subsample according to statistical inefficiency - series = series.iloc[t::statinef] + # use the subsampleCorrelatedData function to get the subsample index + indices = timeseries.subsampleCorrelatedData(group_s[column], g=statinef, + conservative=conservative) - df = df.loc[series.index] - else: - df = slicing(df, lower=lower, upper=upper, step=step) + resdf.append(group_s.iloc[indices]) - return df + if return_calculated: + calculated['t'][name] = statinef + calculated['statinef'][name] = statinef + calculated['Neff_max'][name] = statinef + + if return_calculated: + return pd.concat(resdf), calculated + else: + return pd.concat(resdf) diff --git a/src/alchemlyb/tests/test_preprocessing.py b/src/alchemlyb/tests/test_preprocessing.py index 4e895c73..69141607 100644 --- a/src/alchemlyb/tests/test_preprocessing.py +++ b/src/alchemlyb/tests/test_preprocessing.py @@ -18,11 +18,23 @@ def gmx_benzene_dHdl(): return gmx.extract_dHdl(dataset['data']['Coulomb'][0], T=300) +def gmx_benzene_dHdl_duplicated(): + dataset = alchemtest.gmx.load_benzene() + df = gmx.extract_dHdl(dataset['data']['Coulomb'][0], T=300) + return pd.concat([df, df]) + + def gmx_benzene_u_nk(): dataset = alchemtest.gmx.load_benzene() return gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300) +def gmx_benzene_u_nk_duplicated(): + dataset = alchemtest.gmx.load_benzene() + df = gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300) + return pd.concat([df, df]) + + def gmx_benzene_dHdl_full(): dataset = alchemtest.gmx.load_benzene() return pd.concat([gmx.extract_dHdl(i, T=300) for i in dataset['data']['Coulomb']]) @@ -32,6 +44,7 @@ def gmx_benzene_u_nk_full(): dataset = alchemtest.gmx.load_benzene() return pd.concat([gmx.extract_u_nk(i, T=300) for i in dataset['data']['Coulomb']]) + class TestSlicing: """Test slicing functionality. @@ -46,8 +59,8 @@ def test_basic_slicing(self, data, size): @pytest.mark.parametrize('data', [gmx_benzene_dHdl(), gmx_benzene_u_nk()]) - def test_disordered_exception(self, data): - """Test that a shuffled DataFrame yields a KeyError. + def test_disordered(self, data): + """Test that a shuffled DataFrame yields same result as unshuffled. 
""" indices = np.arange(len(data)) @@ -55,18 +68,16 @@ def test_disordered_exception(self, data): df = data.iloc[indices] - with pytest.raises(KeyError): - self.slicer(df, lower=200) + assert (self.slicer(df, lower=200) == self.slicer(data, lower=200)).all().all() - @pytest.mark.parametrize('data', [gmx_benzene_dHdl_full(), - gmx_benzene_u_nk_full()]) + @pytest.mark.parametrize('data', [gmx_benzene_dHdl_duplicated(), + gmx_benzene_u_nk_duplicated()]) def test_duplicated_exception(self, data): """Test that a DataFrame with duplicate times yields a KeyError. """ with pytest.raises(KeyError): - self.slicer(data.sort_index(0), lower=200) - + self.slicer(data, lower=200) class CorrelatedPreprocessors: @@ -76,21 +87,10 @@ def test_subsampling(self, data, size): """Basic test for execution; resulting size of dataset sensitive to machine and depends on algorithm. """ - assert len(self.slicer(data, series=data.iloc[:, 0])) <= size + assert len(self.slicer(data, data.columns[0])) <= size - @pytest.mark.parametrize('data', [gmx_benzene_dHdl(), - gmx_benzene_u_nk()]) - def test_no_series(self, data): - """Check that we get the same result as simple slicing with no Series. - - """ - df_sub = self.slicer(data, lower=200, upper=5000, step=2) - df_sliced = slicing(data, lower=200, upper=5000, step=2) - assert np.all((df_sub == df_sliced)) - - -class TestStatisticalInefficiency(TestSlicing, CorrelatedPreprocessors): +class TestStatisticalInefficiency(CorrelatedPreprocessors): def slicer(self, *args, **kwargs): return statistical_inefficiency(*args, **kwargs) @@ -103,24 +103,14 @@ def slicer(self, *args, **kwargs): (False, gmx_benzene_u_nk(), 3571), ]) def test_conservative(self, data, size, conservative): - sliced = self.slicer(data, series=data.iloc[:, 0], conservative=conservative) + sliced = self.slicer(data, data.columns[0], conservative=conservative) # results can vary slightly with different machines # so possibly do # delta = 10 # assert size - delta < len(sliced) < size + delta assert len(sliced) == size - @pytest.mark.parametrize('series', [ - gmx_benzene_dHdl()['fep'][:20], # wrong length - gmx_benzene_dHdl()['fep'][::-1], # wrong time stamps (reversed) - ]) - def test_raise_ValueError_for_mismatched_data(self, series): - data = gmx_benzene_dHdl() - with pytest.raises(ValueError): - self.slicer(data, series=series) - - -class TestEquilibriumDetection(TestSlicing, CorrelatedPreprocessors): +class TestEquilibriumDetection(CorrelatedPreprocessors): def slicer(self, *args, **kwargs): return equilibrium_detection(*args, **kwargs) From 166cace35a7628bf30fbc614ec6fd01ee034298a Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 5 Feb 2020 22:04:03 -0700 Subject: [PATCH 2/9] Added tests for subsampling on most of our datasets Not getting all passes yet, and not sure I'm happy with how these work. There are clear shortcomings with the static column approach, requiring perhaps manual effort on the part of the user to work around. This is counter to alchemlyb's philosophy of making the 'most right' thing easy to do. --- src/alchemlyb/tests/test_preprocessing.py | 116 ++++++++++++++++++++-- 1 file changed, 107 insertions(+), 9 deletions(-) diff --git a/src/alchemlyb/tests/test_preprocessing.py b/src/alchemlyb/tests/test_preprocessing.py index 69141607..7424a7cd 100644 --- a/src/alchemlyb/tests/test_preprocessing.py +++ b/src/alchemlyb/tests/test_preprocessing.py @@ -10,8 +10,66 @@ from alchemlyb.preprocessing import (slicing, statistical_inefficiency, equilibrium_detection,) +from . 
import test_ti_estimators as tti +from . import test_fep_estimators as tfep + import alchemtest.gmx +@pytest.fixture(scope="module", + params = [(tti.gmx_benzene_coul_dHdl, "single", 0), + (tti.gmx_benzene_vdw_dHdl, "single", 0), + (tti.gmx_expanded_ensemble_case_1_dHdl, "single", 1), + (tti.gmx_expanded_ensemble_case_2_dHdl, "repeat", 1), + (tti.gmx_expanded_ensemble_case_3_dHdl, "repeat", 1), + (tti.gmx_water_particle_with_total_energy_dHdl, "single", 0), + (tti.gmx_water_particle_with_potential_energy_dHdl, "single", 0), + (tti.gmx_water_particle_without_energy_dHdl, "single", 0), + (tti.amber_simplesolvated_charge_dHdl, "single", 0), + (tti.amber_simplesolvated_vdw_dHdl, "single", 0) + ], + ids = ["tti.gmx_benzene_coul_dHdl", + "tti.gmx_benzene_vdw_dHdl", + "tti.gmx_expanded_ensemble_case_1_dHdl", + "tti.gmx_expanded_ensemble_case_2_dHdl", + "tti.gmx_expanded_ensemble_case_3_dHdl", + "tti.gmx_water_particle_with_total_energy_dHdl", + "tti.gmx_water_particle_with_potential_energy_dHdl", + "tti.gmx_water_particle_without_energy_dHdl", + "tti.amber_simplesolvated_charge_dHdl", + "tti.amber_simplesolvated_vdw_dHdl", + ]) +def dHdl(request): + get_dHdl, nsims, column_index = request.param + return get_dHdl(), nsims, column_index + + +@pytest.fixture(scope="class", + params=[(tfep.gmx_benzene_coul_u_nk, "single", 0), + (tfep.gmx_benzene_vdw_u_nk, "single", 0), + (tfep.gmx_expanded_ensemble_case_1, "single", 0), + (tfep.gmx_expanded_ensemble_case_2, "repeat", 0), + (tfep.gmx_expanded_ensemble_case_3, "repeat", 0), + (tfep.gmx_water_particle_with_total_energy, "single", 0), + (tfep.gmx_water_particle_with_potential_energy, "single", 0), + (tfep.gmx_water_particle_without_energy, "single", -1), + (tfep.amber_bace_example_complex_vdw, "single", -1), + (tfep.gomc_benzene_u_nk, "single", 0), + ], + ids = ["tfep.gmx_benzene_coul_u_nk", + "tfep.gmx_benzene_vdw_u_nk", + "tfep.gmx_expanded_ensemble_case_1", + "tfep.gmx_expanded_ensemble_case_2", + "tfep.gmx_expanded_ensemble_case_3", + "tfep.gmx_water_particle_with_total_energy", + "tfep.gmx_water_particle_with_potential_energy", + "tfep.gmx_water_particle_without_energy", + "tfep.amber_bace_example_complex_vdw", + "tfep.gomc_benzene_u_nk", + ]) +def u_nk(request): + get_unk, nsims, column_index = request.param + return get_unk(), nsims, column_index + def gmx_benzene_dHdl(): dataset = alchemtest.gmx.load_benzene() @@ -49,13 +107,13 @@ class TestSlicing: """Test slicing functionality. """ - def slicer(self, *args, **kwargs): + def subsampler(self, *args, **kwargs): return slicing(*args, **kwargs) @pytest.mark.parametrize(('data', 'size'), [(gmx_benzene_dHdl(), 661), (gmx_benzene_u_nk(), 661)]) def test_basic_slicing(self, data, size): - assert len(self.slicer(data, lower=1000, upper=34000, step=5)) == size + assert len(self.subsampler(data, lower=1000, upper=34000, step=5)) == size @pytest.mark.parametrize('data', [gmx_benzene_dHdl(), gmx_benzene_u_nk()]) @@ -68,16 +126,36 @@ def test_disordered(self, data): df = data.iloc[indices] - assert (self.slicer(df, lower=200) == self.slicer(data, lower=200)).all().all() + assert (self.subsampler(df, lower=200) == self.subsampler(data, lower=200)).all().all() @pytest.mark.parametrize('data', [gmx_benzene_dHdl_duplicated(), gmx_benzene_u_nk_duplicated()]) def test_duplicated_exception(self, data): - """Test that a DataFrame with duplicate times yields a KeyError. + """Test that a DataFrame with duplicate times for a lambda combination + yields a KeyError. 
""" with pytest.raises(KeyError): - self.slicer(data, lower=200) + self.subsampler(data, lower=200) + + def test_slicing_dHdl(self, dHdl): + data, nsims, column_index = dHdl + + if nsims == "single": + dHdl_s = self.subsampler(data) + elif nsims == "repeat": + with pytest.raises(KeyError): + dHdl_s = self.subsampler(data) + + def test_slicing_u_nk(self, u_nk): + data, nsims, column_index = u_nk + + if nsims == "single": + u_nk_s = self.subsampler(data) + elif nsims == "repeat": + with pytest.raises(KeyError): + u_nk_s = self.subsampler(data) + class CorrelatedPreprocessors: @@ -87,12 +165,32 @@ def test_subsampling(self, data, size): """Basic test for execution; resulting size of dataset sensitive to machine and depends on algorithm. """ - assert len(self.slicer(data, data.columns[0])) <= size + assert len(self.subsampler(data, data.columns[0])) <= size + + def test_subsampling_dHdl(self, dHdl): + data, nsims, column_index = dHdl + + if nsims == "single": + dHdl_s = self.subsampler(data, data.columns[column_index]) + assert len(dHdl_s) < data + elif nsims == "repeat": + with pytest.raises(KeyError): + dHdl_s = self.subsampler(data, data.columns[column_index]) + + def test_subsampling_u_nk(self, u_nk): + data, nsims, column_index = u_nk + + if nsims == "single": + u_nk_s = self.subsampler(data, data.columns[column_index]) + assert len(u_nk_s) < data + elif nsims == "repeat": + with pytest.raises(KeyError): + u_nk_s = self.subsampler(data, data.columns[column_index]) class TestStatisticalInefficiency(CorrelatedPreprocessors): - def slicer(self, *args, **kwargs): + def subsampler(self, *args, **kwargs): return statistical_inefficiency(*args, **kwargs) @pytest.mark.parametrize(('conservative', 'data', 'size'), @@ -103,7 +201,7 @@ def slicer(self, *args, **kwargs): (False, gmx_benzene_u_nk(), 3571), ]) def test_conservative(self, data, size, conservative): - sliced = self.slicer(data, data.columns[0], conservative=conservative) + sliced = self.subsampler(data, data.columns[0], conservative=conservative) # results can vary slightly with different machines # so possibly do # delta = 10 @@ -112,5 +210,5 @@ def test_conservative(self, data, size, conservative): class TestEquilibriumDetection(CorrelatedPreprocessors): - def slicer(self, *args, **kwargs): + def subsampler(self, *args, **kwargs): return equilibrium_detection(*args, **kwargs) From 0043af931a6d6e3e423f11a7fab009799b3e3925 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 5 Feb 2020 22:06:36 -0700 Subject: [PATCH 3/9] Docs for subsamplers reflect a change in direction In the spirit of wanting to make it easy for users to do the 'most right' thing, I have looked at the details of what `alchemical-analysis` currently does for subsampling with statisticalInefficiency, and it uses an approach similar to the one documented in each of our `statistical_inefficiency` and `equilbrium_detection` functions now. Next steps are to *actually* implement this functionality. We are going to make our lives easier by adding an `alchemform` attribute to dataframes our parsers produce, and add this to the standard form spec. This attribute will have 'dHdl' or 'u_nk' as its value, depending on the form of data we're dealing with. Our estimators can also use this as a check or warning. 
--- src/alchemlyb/preprocessing/subsampling.py | 144 ++++++++++++++++----- 1 file changed, 110 insertions(+), 34 deletions(-) diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py index ebcb2405..882747b0 100644 --- a/src/alchemlyb/preprocessing/subsampling.py +++ b/src/alchemlyb/preprocessing/subsampling.py @@ -8,11 +8,11 @@ from pymbar import timeseries -def _check_multiple_times(df): - return df.sort_index(0).reset_index(0).duplicated('time').any() +def _check_multiple_times(data): + return data.sort_index(0).reset_index(0).duplicated('time').any() -def slicing(df, lower=None, upper=None, step=None, force=False): +def slicing(data, lower=None, upper=None, step=None, force=False): """Subsample an alchemlyb DataFrame using slicing on the outermost index (time). Slicing will be performed separately on groups of rows corresponding to @@ -21,7 +21,7 @@ def slicing(df, lower=None, upper=None, step=None, force=False): Parameters ---------- - df : DataFrame + data : DataFrame DataFrame to subsample. lower : float Lower time to slice from. @@ -35,16 +35,16 @@ def slicing(df, lower=None, upper=None, step=None, force=False): Returns ------- DataFrame - `df` subsampled. + `data` subsampled. """ # we always start with a full index sort on the whole dataframe - df = df.sort_index() + data = data.sort_index() - index_names = list(df.index.names[1:]) - resdf = list() + index_names = list(data.index.names[1:]) + resdata = list() - for name, group in df.groupby(level=index_names): + for name, group in data.groupby(level=index_names): group_s = group.loc[lower:upper:step] if not force and _check_multiple_times(group_s): @@ -52,12 +52,12 @@ def slicing(df, lower=None, upper=None, step=None, force=False): "to use slicing on DataFrames with unique time values " "for each row. Use `force=True` to ignore this error.") - resdf.append(group_s) + resdata.append(group_s) - return pd.concat(resdf) + return pd.concat(resdata) -def statistical_inefficiency(df, column, lower=None, upper=None, step=None, +def statistical_inefficiency(data, how='auto', column=None, lower=None, upper=None, step=None, conservative=True, return_calculated=False, force=False): """Subsample an alchemlyb DataFrame based on the calculated statistical inefficiency of one of its columns. @@ -67,12 +67,49 @@ def statistical_inefficiency(df, column, lower=None, upper=None, step=None, values present in the DataFrame's index. Each group will be sorted on the outermost (time) index prior to any calculation. + The `how` parameter determines the observable used for calculating the + correlations within each group of samples. The options are as follows: + + 'auto' + The default; the approach is chosen from the below approaches based + on the `alchemform` of the data (either 'dHdl' or 'u_nk'). Use this + if in doubt. + 'right' + The default for 'u_nk' datasets; the column immediately to the + right of the column corresponding to the group's lambda index value + is used. If there is no column to the right, then the column to the left is used. + If there is no column corresponding to the group's lambda index + value, then 'random' is used (see below). + 'left' + The opposite of the 'right' approach; the column immediately to the + left of the column corresponding to the group's lambda index value + is used. If there is no column to the left, then the column to the + right is used. If there is no column corresponding to the group's + lambda index value, then 'random' is used for that group (see below). 
+    'random'
+        A column is chosen at random from the set of columns available in
+        the group. If the correlation calculation fails, then another
+        column is tried. This process continues until success or until all
+        columns have been attempted without success.
+    'sum'
+        The default for 'dHdl' datasets; the columns are simply summed, and
+        the resulting `Series` is used.
+
+    Specifying the 'column' parameter overrides the behavior of 'how'. This
+    allows the user to use a particular column or a specially-crafted `Series`
+    for correlation calculation.
+
     Parameters
     ----------
-    df : DataFrame
-        DataFrame to subsample according to the statistical inefficiency of `column`.
+    data : DataFrame
+        DataFrame to subsample according to the statistical inefficiency of
+        the chosen observable.
-    column : label
+    how : {'auto', 'right', 'left', 'random', 'sum'}
+        The approach used to choose the observable on which correlations are
+        calculated. See explanation above.
+    column : label or `pandas.Series`
         Label of column to use for calculating statistical inefficiency.
+        Overrides `how`; can also take a `Series` object, but the index of the
+        `Series` *must* match that of `data` exactly.
     lower : float
         Lower time to pre-slice data from.
     upper : float
@@ -93,7 +130,7 @@ def statistical_inefficiency(df, column, lower=None, upper=None, step=None,
     Returns
     -------
     DataFrame
-        `df` subsampled according to subsampled `column`.
+        `data` subsampled according to subsampled `column`.
 
     Note
     ----
@@ -124,15 +161,17 @@ def statistical_inefficiency(df, column, lower=None, upper=None, step=None,
     """
     # we always start with a full index sort on the whole dataframe
-    df = df.sort_index()
+    data = data.sort_index()
 
-    index_names = list(df.index.names[1:])
-    resdf = list()
+    index_names = list(data.index.names[1:])
+    resdata = list()
 
     if return_calculated:
         calculated = defaultdict(dict)
 
-    for name, group in df.groupby(level=index_names):
+    if column is not None:
+
+    for name, group in data.groupby(level=index_names):
 
         group_s = slicing(group, lower=lower, upper=upper, step=step)
 
@@ -148,18 +187,18 @@ def statistical_inefficiency(df, column, lower=None, upper=None, step=None,
         indices = timeseries.subsampleCorrelatedData(group_s[column], g=statinef,
                                                      conservative=conservative)
 
-        resdf.append(group_s.iloc[indices])
+        resdata.append(group_s.iloc[indices])
 
         if return_calculated:
             calculated['statinef'][name] = statinef
 
     if return_calculated:
-        return pd.concat(resdf), calculated
+        return pd.concat(resdata), calculated
     else:
-        return pd.concat(resdf)
+        return pd.concat(resdata)
 
 
-def equilibrium_detection(df, column, lower=None, upper=None, step=None,
+def equilibrium_detection(data, how='auto', column=None, lower=None, upper=None, step=None,
                           conservative=True, return_calculated=False, force=False):
     """Subsample a DataFrame using automated equilibrium detection on one of
     its columns.
 
@@ -169,12 +208,49 @@ def equilibrium_detection(df, column, lower=None, upper=None, step=None,
     present in the DataFrame's index. Each group will be sorted on the
     outermost (time) index prior to any calculation.
 
+    The `how` parameter determines the observable used for calculating the
+    correlations within each group of samples. The options are as follows:
+
+    'auto'
+        The default; the approach is chosen from the below approaches based
+        on the `alchemform` of the data (either 'dHdl' or 'u_nk'). Use this
+        if in doubt.
+    'right'
+        The default for 'u_nk' datasets; the column immediately to the
+        right of the column corresponding to the group's lambda index value
+        is used. If there is no column to the right, then the column to the left is used.
+        If there is no column corresponding to the group's lambda index
+        value, then 'random' is used (see below).
+    'left'
+        The opposite of the 'right' approach; the column immediately to the
+        left of the column corresponding to the group's lambda index value
+        is used. If there is no column to the left, then the column to the
+        right is used. If there is no column corresponding to the group's
+        lambda index value, then 'random' is used for that group (see below).
+    'random'
+        A column is chosen at random from the set of columns available in
+        the group. If the correlation calculation fails, then another
+        column is tried. This process continues until success or until all
+        columns have been attempted without success.
+    'sum'
+        The default for 'dHdl' datasets; the columns are simply summed, and
+        the resulting `Series` is used.
+
+    Specifying the 'column' parameter overrides the behavior of 'how'. This
+    allows the user to use a particular column or a specially-crafted `Series`
+    for correlation calculation.
+
     Parameters
     ----------
-    df : DataFrame
-        DataFrame to subsample according to equilibrium detection on `column`.
-    column : label
-        Label of column to use for equilibrium detection.
+    data : DataFrame
+        DataFrame to subsample according to equilibrium detection on the
+        chosen observable.
+    how : {'auto', 'right', 'left', 'random', 'sum'}
+        The approach used to choose the observable on which correlations are
+        calculated. See explanation above.
+    column : label or `pandas.Series`
+        Label of column to use for equilibrium detection.
+        Overrides `how`; can also take a `Series` object, but the index of the
+        `Series` *must* match that of `data` exactly.
     lower : float
         Lower time to pre-slice data from.
     upper : float
@@ -195,7 +271,7 @@ def equilibrium_detection(df, column, lower=None, upper=None, step=None,
     Returns
     -------
     DataFrame
-        `df` subsampled according to subsampled `column`.
+        `data` subsampled according to subsampled `column`.
 
     Note
     ----
@@ -226,15 +302,15 @@ def equilibrium_detection(df, column, lower=None, upper=None, step=None,
     """
     # we always start with a full index sort on the whole dataframe
-    df = df.sort_index()
+    data = data.sort_index()
 
-    index_names = list(df.index.names[1:])
-    resdf = list()
+    index_names = list(data.index.names[1:])
+    resdata = list()
 
     if return_calculated:
         calculated = defaultdict(dict)
 
-    for name, group in df.groupby(level=index_names):
+    for name, group in data.groupby(level=index_names):
         group_s = slicing(group, lower=lower, upper=upper, step=step)
 
         if not force and _check_multiple_times(group):
@@ -252,7 +328,7 @@ def equilibrium_detection(df, column, lower=None, upper=None, step=None,
         indices = timeseries.subsampleCorrelatedData(group_s[column], g=statinef,
                                                      conservative=conservative)
 
-        resdf.append(group_s.iloc[indices])
+        resdata.append(group_s.iloc[indices])
 
         if return_calculated:
             calculated['t'][name] = t
@@ -260,6 +336,6 @@ def equilibrium_detection(df, column, lower=None, upper=None, step=None,
             calculated['Neff_max'][name] = Neff_max
 
     if return_calculated:
-        return pd.concat(resdf), calculated
+        return pd.concat(resdata), calculated
     else:
-        return pd.concat(resdf)
+        return pd.concat(resdata)

From ec591ef673f323779d7eeda89ff8b99dfe41de32 Mon Sep 17 00:00:00 2001
From: David Dotson
Date: Mon, 10 Feb 2020 23:11:12 -0700
Subject: [PATCH 4/9] statistical_inefficiency should now behave according to
 its doc

`how` methods are implemented; these still need explicit tests. We should
also add logging to each of these functions, using a module-level logger.
This would allow for auditability later in case downstream issues are
discovered.
---
 src/alchemlyb/preprocessing/subsampling.py | 143 ++++++++++++++++++---
 1 file changed, 127 insertions(+), 16 deletions(-)

diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py
index 882747b0..e71b0c02 100644
--- a/src/alchemlyb/preprocessing/subsampling.py
+++ b/src/alchemlyb/preprocessing/subsampling.py
@@ -1,15 +1,67 @@
 """Functions for subsampling datasets.
 
 """
+import random
 from collections import defaultdict
 
 import numpy as np
 import pandas as pd
 from pymbar import timeseries
+from pymbar.utils import ParameterError
+
+
+class CorrelationError(Exception):
+    pass
 
 
 def _check_multiple_times(data):
-    return data.sort_index(0).reset_index(0).duplicated('time').any()
+    return data.index.levels[0].duplicated().any()
+
+
+def _how_lr(name, group, direction):
+
+    # first, get column index of column `name`, if it exists
+    # if it does exist, increment or decrement position by one based on
+    # `direction`
+    try:
+        pos = group.columns.get_loc(name)
+    except KeyError:
+        raise KeyError("No column with label '{}'".format(name))
+    else:
+        if direction == 'right':
+            pos += 1
+        elif direction == 'left':
+            pos -= 1
+        else:
+            raise ValueError("`direction` must be either 'right' or 'left'")
+
+    # handle cases where we are dealing with the leftmost column or rightmost
+    # column
+    if pos == -1:
+        pos += 2
+    elif pos == len(group.columns):
+        pos -= 2
+    elif (pos < -1) or (pos > len(group.columns)):
+        raise IndexError("Position of selected column is outside of all expected bounds")
+
+    return group[group.columns[pos]]
+
+
+def _how_random(name, group, tried=None):
+
+    tried = set(tried) if tried is not None else set()
+    candidates = set(group.columns) - (tried | {name})
+
+    if not candidates:
+        raise CorrelationError("No column in the dataset could be used"
+                               " successfully for decorrelation")
+    else:
+        selection = random.choice(list(candidates))
+
+    return group[selection], selection
+
+
+def _how_sum(name, group):
+    return group.sum(axis=1)
 
 
 def slicing(data, lower=None, upper=None, step=None, force=False):
@@ -87,10 +139,12 @@ def statistical_inefficiency(data, how='auto', column=None, lower=None, upper=No
         right is used. If there is no column corresponding to the group's
         lambda index value, then 'random' is used for that group (see below).
     'random'
-        A column is chosen at random from the set of columns available in
-        the group. If the correlation calculation fails, then another
-        column is tried. This process continues until success or until all
-        columns have been attempted without success.
+        A column is chosen uniformly at random from the set of columns
+        available in the group, with the column corresponding to the
+        group's lambda index value excluded, if present. If the correlation
+        calculation fails, then another column is tried. This process
+        continues until success or until all columns have been attempted
+        without success (in which case, ``CorrelationError`` is raised).
     'sum'
         The default for 'dHdl' datasets; the columns are simply summed, and
         the resulting `Series` is used.
@@ -132,6 +186,11 @@ def statistical_inefficiency(data, how='auto', column=None, lower=None, upper=No
     DataFrame
         `data` subsampled according to subsampled `column`.
 
+    Raises
+    ------
+    CorrelationError
+        If correlation removal fails irrecoverably.
+
     Note
     ----
     For a non-integer statistical inefficiency :math:`g`, the default value
     ``conservative=True`` will provide _fewer_ data points than allowed by
     :math:`g` and thus error estimates will be _higher_. For large numbers of
     data points and converged free energies, the choice should not make a
     difference. For small numbers of data points, ``conservative=True``
     decreases a false sense of accuracy and is deemed the more careful and
     conservative approach.
 
     """
     # we always start with a full index sort on the whole dataframe
+    # should produce a copy
     data = data.sort_index()
 
     index_names = list(data.index.names[1:])
     resdata = list()
 
     if return_calculated:
         calculated = defaultdict(dict)
 
     if column is not None:
+        if isinstance(column, pd.Series):
+            #TODO: check equality of index between Series, data
+            pass
 
-    for name, group in data.groupby(level=index_names):
-
-        group_s = slicing(group, lower=lower, upper=upper, step=step)
+    # assign specific `how` settings if ``how == 'auto'``
+    if how == 'auto':
+        if data.attrs['alchemform'] == 'u_nk':
+            how = 'right'
+        if data.attrs['alchemform'] == 'dHdl':
+            how = 'sum'
 
-        if not force and _check_multiple_times(group):
-            raise KeyError("Duplicate time values found; statistical inefficiency "
-                           "is only meaningful for a single, contiguous, "
-                           "and sorted timeseries.")
+    def subsample(group_c, group):
+        group_cs = slicing(group_c, lower=lower, upper=upper, step=step)
 
         # calculate statistical inefficiency of column (could use fft=True but needs test)
-        statinef = timeseries.statisticalInefficiency(group_s[column], fast=False)
+        statinef = timeseries.statisticalInefficiency(group_cs, fast=False)
 
         # use the subsampleCorrelatedData function to get the subsample index
-        indices = timeseries.subsampleCorrelatedData(group_s[column], g=statinef,
+        indices = timeseries.subsampleCorrelatedData(group_cs, g=statinef,
                                                      conservative=conservative)
 
+        return indices
+
+    def random_selection(name, group):
+        tried = set()
+        while True:
+            group_c, selection = _how_random(name, group, tried=tried)
+            try:
+                indices = subsample(group_c, group)
+            except ParameterError:
+                tried.add(selection)
+            else:
+                break
+
+        return indices
+
+    for name, group in data.groupby(level=index_names):
+
+        if not force and _check_multiple_times(group):
+            raise KeyError("Duplicate time values found; statistical inefficiency "
+                           "is only meaningful for a single, contiguous, "
+                           "and sorted timeseries")
+
+        if column is not None:
+            if isinstance(column, pd.Series):
+                group_c = column.groupby(level=index_names).get_group(name)
+                indices = subsample(group_c, group)
+            elif isinstance(column, str):
+                group_c = group[column]
+                indices = subsample(group_c, group)
+        else:
+            if (how == 'right') or (how == 'left'):
+                try:
+                    group_c = _how_lr(name, group, how)
+                except KeyError:
+                    indices = random_selection(name, group)
+                else:
+                    indices = subsample(group_c, group)
+            elif how == 'random':
+                indices = random_selection(name, group)
+            elif how == 'sum':
+                group_c = _how_sum(name, group)
+                indices = subsample(group_c, group)
+            else:
+                raise ValueError("`how` cannot be '{}';"
+                                 " see docstring for available options".format(how))
+
+        group_s = slicing(group, lower=lower, upper=upper, step=step)
         resdata.append(group_s.iloc[indices])
 
         if return_calculated:
@@ -313,10 +424,10 @@ def equilibrium_detection(data, how='auto', column=None, lower=None, upper=None,
     for name, group in data.groupby(level=index_names):
         group_s = slicing(group, lower=lower, upper=upper, step=step)
 
-        if not force and _check_multiple_times(group):
+        if not force and _check_multiple_times(group_s):
             raise KeyError("Duplicate time values found; equilibrium detection "
                            "is only meaningful for a single, contiguous, "
-                           "and sorted timeseries.")
+                           "and sorted timeseries")
 
         # calculate statistical inefficiency of series, with equilibrium detection
         t, statinef, Neff_max = timeseries.detectEquilibration(group_s[column])

From 2a1d27f5ebea3dbb6e6e0dc1aaf21e278a7a1e0a Mon Sep 17 00:00:00 2001
From: David Dotson
Date: Tue, 11 Feb 2020 14:25:04 -0700
Subject: [PATCH 5/9] Tests passing for statistical_inefficiency

About to break out the generic bits to make the same changes to
equilibrium_detection.
---
 src/alchemlyb/preprocessing/subsampling.py |  37 ++++---
 src/alchemlyb/tests/test_fep_estimators.py |  23 +++++
 src/alchemlyb/tests/test_preprocessing.py  | 111 +++++++++++++--------
 src/alchemlyb/tests/test_ti_estimators.py  |  22 ++++
 4 files changed, 135 insertions(+), 58 deletions(-)

diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py
index e71b0c02..73b7f6db 100644
--- a/src/alchemlyb/preprocessing/subsampling.py
+++ b/src/alchemlyb/preprocessing/subsampling.py
@@ -15,7 +15,7 @@ class CorrelationError(Exception):
 
 
 def _check_multiple_times(data):
-    return data.index.levels[0].duplicated().any()
+    return data.index.get_level_values(0).duplicated().any()
 
 
 def _how_lr(name, group, direction):
@@ -109,10 +109,11 @@ def slicing(data, lower=None, upper=None, step=None, force=False):
     return pd.concat(resdata)
 
 
-def statistical_inefficiency(data, how='auto', column=None, lower=None, upper=None, step=None,
-                             conservative=True, return_calculated=False, force=False):
-    """Subsample an alchemlyb DataFrame based on the calculated statistical inefficiency
-    of one of its columns.
+def statistical_inefficiency(data, how='auto', column=None, lower=None,
+        upper=None, step=None, conservative=True, return_calculated=False,
+        force=False, random_state=None):
+    """Subsample an alchemlyb DataFrame based on the calculated statistical
+    inefficiency of one of its columns.
 
     Calculation of statistical inefficiency and subsequent subsampling will be
     performed separately on groups of rows corresponding to each set of lambda
     values present in the DataFrame's index. Each group will be sorted on the
     outermost (time) index prior to any calculation.
@@ -129,9 +130,9 @@ def statistical_inefficiency(data, how='auto', column=None, lower=None,
     'right'
         The default for 'u_nk' datasets; the column immediately to the
         right of the column corresponding to the group's lambda index value
-        is used. If there is no column to the right, then the column to the left is used.
-        If there is no column corresponding to the group's lambda index
-        value, then 'random' is used (see below).
+        is used. If there is no column to the right, then the column to the
+        left is used. If there is no column corresponding to the group's
+        lambda index value, then 'random' is used (see below).
     'left'
         The opposite of the 'right' approach; the column immediately to the
         left of the column corresponding to the group's lambda index value
@@ -171,15 +172,19 @@ def statistical_inefficiency(data, how='auto', column=None, lower=None,
     step : int
         Step between rows to pre-slice by.
     conservative : bool
-        ``True`` use ``ceil(statistical_inefficiency)`` to slice the data in uniform
-        intervals (the default). ``False`` will sample at non-uniform intervals to
-        closely match the (fractional) statistical_inefficiency, as implemented
-        in :func:`pymbar.timeseries.subsampleCorrelatedData`.
+        ``True`` use ``ceil(statistical_inefficiency)`` to slice the data in
+        uniform intervals (the default). ``False`` will sample at non-uniform
+        intervals to closely match the (fractional) statistical_inefficiency, as
+        implemented in :func:`pymbar.timeseries.subsampleCorrelatedData`.
return_calculated : bool ``True`` return a tuple, with the second item a dict giving, e.g. `statinef` for each group. force : bool Ignore checks that DataFrame is in proper form for expected behavior. + random_state : int, optional + Integer between 0 and 2**32 -1 inclusive; fed to `numpy.random.seed`. + Running this function on the same data with a specific random seed will + produce the same result each time. Returns ------- @@ -214,11 +219,13 @@ def statistical_inefficiency(data, how='auto', column=None, lower=None, upper=No .. versionchanged:: 0.2.0 The ``conservative`` keyword was added and the method is now using - ``pymbar.timeseries.subsampleCorrelatedData()``; previously, the statistical - inefficiency was _rounded_ (instead of ``ceil()``) and thus one could - end up with correlated data. + ``pymbar.timeseries.subsampleCorrelatedData()``; previously, the + statistical inefficiency was _rounded_ (instead of ``ceil()``) and thus + one could end up with correlated data. """ + np.random.seed(random_state) + # we always start with a full index sort on the whole dataframe # should produce a copy data = data.sort_index() diff --git a/src/alchemlyb/tests/test_fep_estimators.py b/src/alchemlyb/tests/test_fep_estimators.py index f33a72fd..a3bcb796 100644 --- a/src/alchemlyb/tests/test_fep_estimators.py +++ b/src/alchemlyb/tests/test_fep_estimators.py @@ -19,6 +19,8 @@ def gmx_benzene_coul_u_nk(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['Coulomb']]) + u_nk.attrs['alchemform'] = 'u_nk' + return u_nk def gmx_benzene_vdw_u_nk(): @@ -27,6 +29,8 @@ def gmx_benzene_vdw_u_nk(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['VDW']]) + u_nk.attrs['alchemform'] = 'u_nk' + return u_nk def gmx_expanded_ensemble_case_1(): @@ -35,6 +39,8 @@ def gmx_expanded_ensemble_case_1(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['AllStates']]) + u_nk.attrs['alchemform'] = 'u_nk' + return u_nk def gmx_expanded_ensemble_case_2(): @@ -43,6 +49,8 @@ def gmx_expanded_ensemble_case_2(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['AllStates']]) + u_nk.attrs['alchemform'] = 'u_nk' + return u_nk def gmx_expanded_ensemble_case_3(): @@ -51,6 +59,8 @@ def gmx_expanded_ensemble_case_3(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['AllStates']]) + u_nk.attrs['alchemform'] = 'u_nk' + return u_nk def gmx_water_particle_with_total_energy(): @@ -59,6 +69,8 @@ def gmx_water_particle_with_total_energy(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['AllStates']]) + u_nk.attrs['alchemform'] = 'u_nk' + return u_nk def gmx_water_particle_with_potential_energy(): @@ -67,6 +79,8 @@ def gmx_water_particle_with_potential_energy(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['AllStates']]) + u_nk.attrs['alchemform'] = 'u_nk' + return u_nk def gmx_water_particle_without_energy(): @@ -75,6 +89,8 @@ def gmx_water_particle_without_energy(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['AllStates']]) + u_nk.attrs['alchemform'] = 'u_nk' + return u_nk def amber_bace_example_complex_vdw(): @@ -82,6 +98,8 @@ def amber_bace_example_complex_vdw(): u_nk = pd.concat([amber.extract_u_nk(filename, T=300) for filename in dataset['data']['complex']['vdw']]) + u_nk.attrs['alchemform'] = 'u_nk' + return u_nk def gomc_benzene_u_nk(): @@ -90,6 
+108,8 @@ def gomc_benzene_u_nk(): u_nk = pd.concat([gomc.extract_u_nk(filename, T=298) for filename in dataset['data']]) + u_nk.attrs['alchemform'] = 'u_nk' + return u_nk def namd_tyr2ala(): @@ -101,8 +121,11 @@ def namd_tyr2ala(): u_nk1[u_nk1.isna()] = u_nk2 u_nk = u_nk1.sort_index(level=u_nk1.index.names[1:]) + u_nk.attrs['alchemform'] = 'u_nk' + return u_nk + class FEPestimatorMixin: """Mixin for all FEP Estimator test classes. diff --git a/src/alchemlyb/tests/test_preprocessing.py b/src/alchemlyb/tests/test_preprocessing.py index 7424a7cd..9d375dcc 100644 --- a/src/alchemlyb/tests/test_preprocessing.py +++ b/src/alchemlyb/tests/test_preprocessing.py @@ -16,16 +16,16 @@ import alchemtest.gmx @pytest.fixture(scope="module", - params = [(tti.gmx_benzene_coul_dHdl, "single", 0), - (tti.gmx_benzene_vdw_dHdl, "single", 0), - (tti.gmx_expanded_ensemble_case_1_dHdl, "single", 1), - (tti.gmx_expanded_ensemble_case_2_dHdl, "repeat", 1), - (tti.gmx_expanded_ensemble_case_3_dHdl, "repeat", 1), - (tti.gmx_water_particle_with_total_energy_dHdl, "single", 0), - (tti.gmx_water_particle_with_potential_energy_dHdl, "single", 0), - (tti.gmx_water_particle_without_energy_dHdl, "single", 0), - (tti.amber_simplesolvated_charge_dHdl, "single", 0), - (tti.amber_simplesolvated_vdw_dHdl, "single", 0) + params = [(tti.gmx_benzene_coul_dHdl, "single"), + (tti.gmx_benzene_vdw_dHdl, "single"), + (tti.gmx_expanded_ensemble_case_1_dHdl, "single"), + (tti.gmx_expanded_ensemble_case_2_dHdl, "repeat"), + (tti.gmx_expanded_ensemble_case_3_dHdl, "repeat"), + (tti.gmx_water_particle_with_total_energy_dHdl, "single"), + (tti.gmx_water_particle_with_potential_energy_dHdl, "single"), + (tti.gmx_water_particle_without_energy_dHdl, "single"), + (tti.amber_simplesolvated_charge_dHdl, "single"), + (tti.amber_simplesolvated_vdw_dHdl, "single") ], ids = ["tti.gmx_benzene_coul_dHdl", "tti.gmx_benzene_vdw_dHdl", @@ -39,21 +39,21 @@ "tti.amber_simplesolvated_vdw_dHdl", ]) def dHdl(request): - get_dHdl, nsims, column_index = request.param - return get_dHdl(), nsims, column_index + get_dHdl, nsims = request.param + return get_dHdl(), nsims @pytest.fixture(scope="class", - params=[(tfep.gmx_benzene_coul_u_nk, "single", 0), - (tfep.gmx_benzene_vdw_u_nk, "single", 0), - (tfep.gmx_expanded_ensemble_case_1, "single", 0), - (tfep.gmx_expanded_ensemble_case_2, "repeat", 0), - (tfep.gmx_expanded_ensemble_case_3, "repeat", 0), - (tfep.gmx_water_particle_with_total_energy, "single", 0), - (tfep.gmx_water_particle_with_potential_energy, "single", 0), - (tfep.gmx_water_particle_without_energy, "single", -1), - (tfep.amber_bace_example_complex_vdw, "single", -1), - (tfep.gomc_benzene_u_nk, "single", 0), + params=[(tfep.gmx_benzene_coul_u_nk, "single"), + (tfep.gmx_benzene_vdw_u_nk, "single"), + (tfep.gmx_expanded_ensemble_case_1, "single"), + (tfep.gmx_expanded_ensemble_case_2, "repeat"), + (tfep.gmx_expanded_ensemble_case_3, "repeat"), + (tfep.gmx_water_particle_with_total_energy, "single"), + (tfep.gmx_water_particle_with_potential_energy, "single"), + (tfep.gmx_water_particle_without_energy, "single"), + (tfep.amber_bace_example_complex_vdw, "single"), + (tfep.gomc_benzene_u_nk, "single"), ], ids = ["tfep.gmx_benzene_coul_u_nk", "tfep.gmx_benzene_vdw_u_nk", @@ -67,41 +67,65 @@ def dHdl(request): "tfep.gomc_benzene_u_nk", ]) def u_nk(request): - get_unk, nsims, column_index = request.param - return get_unk(), nsims, column_index + get_unk, nsims = request.param + return get_unk(), nsims def gmx_benzene_dHdl(): dataset = 
alchemtest.gmx.load_benzene()
-    return gmx.extract_dHdl(dataset['data']['Coulomb'][0], T=300)
+    dHdl = gmx.extract_dHdl(dataset['data']['Coulomb'][0], T=300)
+
+    dHdl.attrs['alchemform'] = 'dHdl'
+
+    return dHdl
 
 
 def gmx_benzene_dHdl_duplicated():
     dataset = alchemtest.gmx.load_benzene()
     df = gmx.extract_dHdl(dataset['data']['Coulomb'][0], T=300)
-    return pd.concat([df, df])
+    dHdl = pd.concat([df, df])
+
+    dHdl.attrs['alchemform'] = 'dHdl'
+
+    return dHdl
 
 
 def gmx_benzene_u_nk():
     dataset = alchemtest.gmx.load_benzene()
-    return gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300)
+    u_nk = gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300)
+
+    u_nk.attrs['alchemform'] = 'u_nk'
+
+    return u_nk
 
 
 def gmx_benzene_u_nk_duplicated():
     dataset = alchemtest.gmx.load_benzene()
     df = gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300)
-    return pd.concat([df, df])
+    u_nk = pd.concat([df, df])
+
+    u_nk.attrs['alchemform'] = 'u_nk'
+
+    return u_nk
 
 
 def gmx_benzene_dHdl_full():
     dataset = alchemtest.gmx.load_benzene()
-    return pd.concat([gmx.extract_dHdl(i, T=300) for i in dataset['data']['Coulomb']])
+    dHdl = pd.concat([gmx.extract_dHdl(i, T=300) for i in dataset['data']['Coulomb']])
+
+    dHdl.attrs['alchemform'] = 'dHdl'
+
+    return dHdl
 
 
 def gmx_benzene_u_nk_full():
     dataset = alchemtest.gmx.load_benzene()
-    return pd.concat([gmx.extract_u_nk(i, T=300) for i in dataset['data']['Coulomb']])
+    u_nk = pd.concat([gmx.extract_u_nk(i, T=300) for i in dataset['data']['Coulomb']])
+
+    u_nk.attrs['alchemform'] = 'u_nk'
+
+    return u_nk
+
 
 class TestSlicing:
     """Test slicing functionality.
 
@@ -139,7 +163,7 @@ def test_duplicated_exception(self, data):
             self.subsampler(data, lower=200)
 
     def test_slicing_dHdl(self, dHdl):
-        data, nsims, column_index = dHdl
+        data, nsims = dHdl
 
         if nsims == "single":
             dHdl_s = self.subsampler(data)
@@ -148,7 +172,7 @@ def test_slicing_dHdl(self, dHdl):
 
     def test_slicing_u_nk(self, u_nk):
-        data, nsims, column_index = u_nk
+        data, nsims = u_nk
 
         if nsims == "single":
             u_nk_s = self.subsampler(data)
@@ -165,27 +189,27 @@ def test_subsampling(self, data, size):
         """Basic test for execution; resulting size of dataset sensitive to
         machine and depends on algorithm.
""" - assert len(self.subsampler(data, data.columns[0])) <= size + assert len(self.subsampler(data)) <= size def test_subsampling_dHdl(self, dHdl): - data, nsims, column_index = dHdl + data, nsims = dHdl if nsims == "single": - dHdl_s = self.subsampler(data, data.columns[column_index]) - assert len(dHdl_s) < data + dHdl_s = self.subsampler(data) + assert len(dHdl_s) < len(data) elif nsims == "repeat": with pytest.raises(KeyError): - dHdl_s = self.subsampler(data, data.columns[column_index]) + dHdl_s = self.subsampler(data) def test_subsampling_u_nk(self, u_nk): - data, nsims, column_index = u_nk + data, nsims = u_nk if nsims == "single": - u_nk_s = self.subsampler(data, data.columns[column_index]) - assert len(u_nk_s) < data + u_nk_s = self.subsampler(data) + assert len(u_nk_s) < len(data) elif nsims == "repeat": with pytest.raises(KeyError): - u_nk_s = self.subsampler(data, data.columns[column_index]) + u_nk_s = self.subsampler(data) class TestStatisticalInefficiency(CorrelatedPreprocessors): @@ -198,17 +222,18 @@ def subsampler(self, *args, **kwargs): (True, gmx_benzene_dHdl(), 2001), # 0.00: g = 1.0559445620585415 (True, gmx_benzene_u_nk(), 2001), # 'fep': g = 1.0560203916559594 (False, gmx_benzene_dHdl(), 3789), - (False, gmx_benzene_u_nk(), 3571), + (False, gmx_benzene_u_nk(), 3788), ]) def test_conservative(self, data, size, conservative): - sliced = self.subsampler(data, data.columns[0], conservative=conservative) + sliced = self.subsampler(data, conservative=conservative) # results can vary slightly with different machines # so possibly do # delta = 10 # assert size - delta < len(sliced) < size + delta assert len(sliced) == size -class TestEquilibriumDetection(CorrelatedPreprocessors): + +class TestEquilibriumDetection(): def subsampler(self, *args, **kwargs): return equilibrium_detection(*args, **kwargs) diff --git a/src/alchemlyb/tests/test_ti_estimators.py b/src/alchemlyb/tests/test_ti_estimators.py index 208e27a9..d143010f 100644 --- a/src/alchemlyb/tests/test_ti_estimators.py +++ b/src/alchemlyb/tests/test_ti_estimators.py @@ -18,6 +18,8 @@ def gmx_benzene_coul_dHdl(): dHdl = pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['Coulomb']]) + dHdl.attrs['alchemform'] = 'dHdl' + return dHdl def gmx_benzene_vdw_dHdl(): @@ -26,6 +28,8 @@ def gmx_benzene_vdw_dHdl(): dHdl = pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['VDW']]) + dHdl.attrs['alchemform'] = 'dHdl' + return dHdl def gmx_expanded_ensemble_case_1_dHdl(): @@ -34,6 +38,8 @@ def gmx_expanded_ensemble_case_1_dHdl(): dHdl = pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['AllStates']]) + dHdl.attrs['alchemform'] = 'dHdl' + return dHdl def gmx_expanded_ensemble_case_2_dHdl(): @@ -42,6 +48,8 @@ def gmx_expanded_ensemble_case_2_dHdl(): dHdl = pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['AllStates']]) + dHdl.attrs['alchemform'] = 'dHdl' + return dHdl def gmx_expanded_ensemble_case_3_dHdl(): @@ -50,6 +58,8 @@ def gmx_expanded_ensemble_case_3_dHdl(): dHdl = pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['AllStates']]) + dHdl.attrs['alchemform'] = 'dHdl' + return dHdl def gmx_water_particle_with_total_energy_dHdl(): @@ -58,6 +68,8 @@ def gmx_water_particle_with_total_energy_dHdl(): dHdl = pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['AllStates']]) + dHdl.attrs['alchemform'] = 'dHdl' + return dHdl def gmx_water_particle_with_potential_energy_dHdl(): @@ 
-66,6 +78,8 @@ def gmx_water_particle_with_potential_energy_dHdl(): dHdl = pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['AllStates']]) + dHdl.attrs['alchemform'] = 'dHdl' + return dHdl def gmx_water_particle_without_energy_dHdl(): @@ -74,6 +88,8 @@ def gmx_water_particle_without_energy_dHdl(): dHdl = pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['AllStates']]) + dHdl.attrs['alchemform'] = 'dHdl' + return dHdl def amber_simplesolvated_charge_dHdl(): @@ -82,6 +98,8 @@ def amber_simplesolvated_charge_dHdl(): dHdl = pd.concat([amber.extract_dHdl(filename, T=300) for filename in dataset['data']['charge']]) + dHdl.attrs['alchemform'] = 'dHdl' + return dHdl def amber_simplesolvated_vdw_dHdl(): @@ -90,6 +108,8 @@ def amber_simplesolvated_vdw_dHdl(): dHdl = pd.concat([amber.extract_dHdl(filename, T=300) for filename in dataset['data']['vdw']]) + dHdl.attrs['alchemform'] = 'dHdl' + return dHdl def gomc_benzene_dHdl(): @@ -98,6 +118,8 @@ def gomc_benzene_dHdl(): dHdl = pd.concat([gomc.extract_dHdl(filename, T=298) for filename in dataset['data']]) + dHdl.attrs['alchemform'] = 'dHdl' + return dHdl From 1b387cc8bbfc9cd7311f3fcc0e4fdd2a5f6e9874 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Thu, 13 Feb 2020 12:00:46 -0700 Subject: [PATCH 6/9] equilibrium_detection now shares same basic code as statistical_inefficiency All tests pass, but we should try and think carefully about tests that we really want to add to ensure these are working as expected. --- src/alchemlyb/preprocessing/subsampling.py | 244 +++++++++++---------- src/alchemlyb/tests/test_preprocessing.py | 2 +- 2 files changed, 132 insertions(+), 114 deletions(-) diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py index 73b7f6db..28eb2ac6 100644 --- a/src/alchemlyb/preprocessing/subsampling.py +++ b/src/alchemlyb/preprocessing/subsampling.py @@ -109,7 +109,91 @@ def slicing(data, lower=None, upper=None, step=None, force=False): return pd.concat(resdata) -def statistical_inefficiency(data, how='auto', column=None, lower=None, +def _decorrelator(subsample, data, how='auto', column=None, lower=None, + upper=None, step=None, conservative=True, return_calculated=False, + force=False, random_state=None): + + np.random.seed(random_state) + + # we always start with a full index sort on the whole dataframe + # should produce a copy + data = data.sort_index() + + index_names = list(data.index.names[1:]) + resdata = list() + + if return_calculated: + calculated = defaultdict(dict) + + if column: + if isinstance(column, pd.Series): + #TODO: check equality of index between Series, data + pass + + # assign specific `how` settings if ``how == 'auto'`` + if how == 'auto': + if data.attrs['alchemform'] == 'u_nk': + how = 'right' + if data.attrs['alchemform'] == 'dHdl': + how = 'sum' + + def random_selection(name, group): + tried = set() + while True: + group_c, selection = _how_random(name, group, tried=tried) + try: + indices, calculated = subsample(group_c, group) + except: + tried.add(selection) + else: + break + + return indices, calculated + + for name, group in data.groupby(level=index_names): + + if not force and _check_multiple_times(group): + raise KeyError("Duplicate time values found; decorrelation " + "is only meaningful for a single, contiguous, " + "and sorted timeseries") + + if column: + if isinstance(column, pd.Series): + group_c = column.groupby(level=index_names).get_group(name) + indices, calc = subsample(group_c, group) + elif 
isinstance(column, basestring): + group_c = group[column] + indices, calc = subsample(group_c, group) + else: + if (how == 'right') or (how == 'left'): + try: + group_c = _how_lr(name, group, how) + except KeyError: + indices, calc = random_selection(name, group) + else: + indices, calc = subsample(group_c, group) + elif how == 'random': + indices, calc = random_selection(name, group) + elif how == 'sum': + group_c = _how_sum(name, group) + indices, calc = subsample(group_c, group) + else: + raise ValueError("`how` cannot be '{}';" + " see docstring for available options".format(how)) + + group_s = slicing(group, lower=lower, upper=upper, step=step) + resdata.append(group_s.iloc[indices]) + + if return_calculated: + calculated[name] = calc + + if return_calculated: + return pd.concat(resdata), pd.DataFrame(calculated) + else: + return pd.concat(resdata) + + +def statistical_inefficiency(data, column=None, how='auto', lower=None, upper=None, step=None, conservative=True, return_calculated=False, force=False, random_state=None): """Subsample an alchemlyb DataFrame based on the calculated statistical @@ -224,29 +308,6 @@ def statistical_inefficiency(data, how='auto', column=None, lower=None, one could end up with correlated data. """ - np.random.seed(random_state) - - # we always start with a full index sort on the whole dataframe - # should produce a copy - data = data.sort_index() - - index_names = list(data.index.names[1:]) - resdata = list() - - if return_calculated: - calculated = defaultdict(dict) - - if column: - if isinstance(column, pd.Series): - #TODO: check equality of index between Series, data - pass - - # assign specific `how` settings if ``how == 'auto'`` - if how == 'auto': - if data.attrs['alchemform'] == 'u_nk': - how = 'right' - if data.attrs['alchemform'] == 'dHdl': - how = 'sum' def subsample(group_c, group): group_cs = slicing(group_c, lower=lower, upper=upper, step=step) @@ -258,66 +319,28 @@ def subsample(group_c, group): indices = timeseries.subsampleCorrelatedData(group_cs, g=statinef, conservative=conservative) - return indices - - def random_selection(name, group): - tried = set() - while True: - group_c, selection = _how_random(name, group, tried=tried) - try: - indices = subsample(group_c, group) - except: - tried.add(selection) - else: - break - - return indices - - for name, group in data.groupby(level=index_names): - - if not force and _check_multiple_times(group): - raise KeyError("Duplicate time values found; statistical inefficiency" - "is only meaningful for a single, contiguous, " - "and sorted timeseries") - - if column: - if isinstance(column, pd.Series): - group_c = column.groupby(level=index_names).get_group(name) - indices = subsample(group_c, group) - elif isinstance(column, basestring): - group_c = group[column] - indices = subsample(group_c, group) - else: - if (how == 'right') or (how == 'left'): - try: - group_c = _how_lr(name, group, how) - except KeyError: - indices = random_selection(name, group) - else: - indices = subsample(group_c, group) - elif how == 'random': - indices = random_selection(name, group) - elif how == 'sum': - group_c = _how_sum(name, group) - indices = subsample(group_c, group) - else: - raise ValueError("`how` cannot be '{}';" - " see docstring for available options".format(how)) - - group_s = slicing(group, lower=lower, upper=upper, step=step) - resdata.append(group_s.iloc[indices]) - - if return_calculated: - calculated['statinef'][name] = statinef - - if return_calculated: - return pd.concat(resdata), calculated - 
else: - return pd.concat(resdata) - - -def equilibrium_detection(data, how='auto', column=None, lower=None, upper=None, step=None, - conservative=True, return_calculated=False, force=False): + calculated = dict() + calculated['statinef'] = statinef + + return indices, calculated + + return _decorrelator( + subsample=subsample, + data=data, + how=how, + column=column, + lower=lower, + upper=upper, + step=step, + conservative=conservative, + return_calculated=return_calculated, + force=force, + random_state=random_state) + + +def equilibrium_detection(data, column=None, how='auto', lower=None, + upper=None, step=None, conservative=True, return_calculated=False, + force=False, random_state=None): """Subsample a DataFrame using automated equilibrium detection on one of its columns. @@ -419,41 +442,36 @@ def equilibrium_detection(data, how='auto', column=None, lower=None, upper=None, end up with correlated data. """ - # we always start with a full index sort on the whole dataframe - data = data.sort_index() - index_names = list(data.index.names[1:]) - resdata = list() - - if return_calculated: - calculated = defaultdict(dict) - - for name, group in data.groupby(level=index_names): - group_s = slicing(group, lower=lower, upper=upper, step=step) - - if not force and _check_multiple_times(group_s): - raise KeyError("Duplicate time values found; equilibrium detection " - "is only meaningful for a single, contiguous, " - "and sorted timeseries") + def subsample(group_c, group): + group_cs = slicing(group_c, lower=lower, upper=upper, step=step) # calculate statistical inefficiency of series, with equilibrium detection - t, statinef, Neff_max = timeseries.detectEquilibration(group_s[column]) + t, statinef, Neff_max = timeseries.detectEquilibration(group_cs) # only keep values past `t` - group_s = group_s.iloc[t:] + group_cs = group_cs.iloc[t:] # use the subsampleCorrelatedData function to get the subsample index - indices = timeseries.subsampleCorrelatedData(group_s[column], g=statinef, + indices = timeseries.subsampleCorrelatedData(group_cs, g=statinef, conservative=conservative) - resdata.append(group_s.iloc[indices]) - - if return_calculated: - calculated['t'][name] = statinef - calculated['statinef'][name] = statinef - calculated['Neff_max'][name] = statinef - - if return_calculated: - return pd.concat(resdata), calculated - else: - return pd.concat(resdata) + calculated = dict() + calculated['t'] = t + calculated['statinef'] = statinef + calculated['Neff_max'] = Neff_max + + return indices, calculated + + return _decorrelator( + subsample=subsample, + data=data, + how=how, + column=column, + lower=lower, + upper=upper, + step=step, + conservative=conservative, + return_calculated=return_calculated, + force=force, + random_state=random_state) diff --git a/src/alchemlyb/tests/test_preprocessing.py b/src/alchemlyb/tests/test_preprocessing.py index 9d375dcc..83d215ea 100644 --- a/src/alchemlyb/tests/test_preprocessing.py +++ b/src/alchemlyb/tests/test_preprocessing.py @@ -233,7 +233,7 @@ def test_conservative(self, data, size, conservative): assert len(sliced) == size -class TestEquilibriumDetection(): +class TestEquilibriumDetection(CorrelatedPreprocessors): def subsampler(self, *args, **kwargs): return equilibrium_detection(*args, **kwargs) From 523cdd35c64e50f83907a3571104cb93bb593f69 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Tue, 18 Feb 2020 09:38:47 -0700 Subject: [PATCH 7/9] alchemform hits a stumbling block Using something like `pd.concat` on a set of DataFrames doesn't propagate 
forward the `.attrs` dictionary, so having an `alchemform` key here created by the parsers is not sufficient for signalling to preprocessors the type of data. We will likely remove the 'auto' option from the subsamplers and insist on explicit values for `how`. This is not *necessarily* bad, as it promotes some understanding for the user of what they are choosing to do here. --- src/alchemlyb/preprocessing/subsampling.py | 31 +++++++++++++--------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py index 28eb2ac6..e5f1065f 100644 --- a/src/alchemlyb/preprocessing/subsampling.py +++ b/src/alchemlyb/preprocessing/subsampling.py @@ -109,7 +109,7 @@ def slicing(data, lower=None, upper=None, step=None, force=False): return pd.concat(resdata) -def _decorrelator(subsample, data, how='auto', column=None, lower=None, +def _decorrelator(subsample, data, column=None, how='auto', lower=None, upper=None, step=None, conservative=True, return_calculated=False, force=False, random_state=None): @@ -125,13 +125,20 @@ def _decorrelator(subsample, data, how='auto', column=None, lower=None, if return_calculated: calculated = defaultdict(dict) + # input checks if column: if isinstance(column, pd.Series): - #TODO: check equality of index between Series, data + if ~(data.index == column.index).all(): + raise ValueError("The index of `column` must match the index of `data` exactly" + " to be used for decorrelation.") + elif column in group.columns: pass + else: + raise ValueError("`column` must give either an existing column label in `data`" + " or a Series object with a matching index.") + elif how == 'auto': # assign specific `how` settings if ``how == 'auto'`` - if how == 'auto': if data.attrs['alchemform'] == 'u_nk': how = 'right' if data.attrs['alchemform'] == 'dHdl': @@ -161,7 +168,7 @@ def random_selection(name, group): if isinstance(column, pd.Series): group_c = column.groupby(level=index_names).get_group(name) indices, calc = subsample(group_c, group) - elif isinstance(column, basestring): + elif column in group.columns: group_c = group[column] indices, calc = subsample(group_c, group) else: @@ -242,13 +249,13 @@ def statistical_inefficiency(data, column=None, how='auto', lower=None, ---------- data : DataFrame DataFrame to subsample according statistical inefficiency of `series`. - how : {'auto', 'right', 'left', 'random', 'sum'} - The approach used to choose the observable on which correlations are - calculated. See explanation above. column : label or `pandas.Series` Label of column to use for calculating statistical inefficiency. Overrides `how`; can also take a `Series` object, but the index of the `Series` *must* match that of `data` exactly. + how : {'auto', 'right', 'left', 'random', 'sum'} + The approach used to choose the observable on which correlations are + calculated. See explanation above. lower : float Lower time to pre-slice data from. upper : float @@ -327,8 +334,8 @@ def subsample(group_c, group): return _decorrelator( subsample=subsample, data=data, - how=how, column=column, + how=how, lower=lower, upper=upper, step=step, @@ -385,13 +392,13 @@ def equilibrium_detection(data, column=None, how='auto', lower=None, ---------- data : DataFrame DataFrame to subsample according to equilibrium detection on `series`. - how : {'auto', 'right', 'left', 'random', 'sum'} - The approach used to choose the observable on which correlations are - calculated. See explanation above. 
column : label or `pandas.Series`
         Label of column to use for calculating statistical inefficiency.
         Overrides `how`; can also take a `Series` object, but the index of
         the `Series` *must* match that of `data` exactly.
+    how : {'auto', 'right', 'left', 'random', 'sum'}
+        The approach used to choose the observable on which correlations are
+        calculated. See explanation above.
     lower : float
         Lower time to pre-slice data from.
     upper : float
@@ -466,8 +473,8 @@ def subsample(group_c, group):
     return _decorrelator(
         subsample=subsample,
         data=data,
-        how=how,
         column=column,
+        how=how,
         lower=lower,
         upper=upper,
         step=step,

From 166f3df51465ba8239fb8c422a1291a98e959d0b Mon Sep 17 00:00:00 2001
From: David Dotson
Date: Tue, 10 Mar 2020 22:11:10 -0700
Subject: [PATCH 8/9] Removed use of `alchemform`; users must explicitly
 choose their `how`

Alternatively, they can choose a `column`. I think this is appropriate,
as it is still important that users are consciously aware of some of the
high-level choices the software is making. There will always be
touchpoints like this, though we should try to minimize them where
possible.
---
 .../alchemlyb.preprocessing.subsampling.rst | 259 +++++++++++++++++-
 src/alchemlyb/preprocessing/subsampling.py  | 119 ++++----
 src/alchemlyb/tests/test_fep_estimators.py  |  22 --
 src/alchemlyb/tests/test_preprocessing.py   |  86 ++++--
 src/alchemlyb/tests/test_ti_estimators.py   |  22 --
 5 files changed, 382 insertions(+), 126 deletions(-)

diff --git a/docs/preprocessing/alchemlyb.preprocessing.subsampling.rst b/docs/preprocessing/alchemlyb.preprocessing.subsampling.rst
index bbc41dd1..b36b581c 100644
--- a/docs/preprocessing/alchemlyb.preprocessing.subsampling.rst
+++ b/docs/preprocessing/alchemlyb.preprocessing.subsampling.rst
@@ -1,3 +1,5 @@
+.. currentmodule:: alchemlyb.preprocessing.subsampling
+
 .. _subsampling:

 Subsampling
@@ -5,7 +7,262 @@ Subsampling

 .. automodule:: alchemlyb.preprocessing.subsampling

-The functions featured in this module can be used to easily subsample either :ref:`dHdl <dHdl>` or :ref:`u_nk <u_nk>` datasets to give less correlated timeseries.
+The functions featured in this module can be used to easily subsample either :math:`\frac{dH}{d\lambda}` (:ref:`dHdl <dHdl>`) or :math:`u_{nk}` (:ref:`u_nk <u_nk>`) datasets to give uncorrelated timeseries.
+
+Each of these functions splits the dataset into groups based on :math:`\lambda` index values, with each group featuring all samples with the same :math:`\lambda` values.
+The function then performs its core operation (described below) on each group individually, with samples sorted according to the outermost ``time`` index beforehand.
+Each of these groups therefore functions as an individual *timeseries* being subsampled.
+The resulting :py:class:`pandas.DataFrame` is the concatenation of all groups after they have been subsampled.
+
+
+Slicing
+-------
+The :func:`~alchemlyb.preprocessing.subsampling.slicing` function only performs slicing of the dataset on the basis of ``time``.
+The ``lower`` and ``upper`` keywords specify the lower and upper bounds, *inclusive*, while ``step`` indicates the degree to which rows are skipped (e.g. ``step=2`` means to keep every other sample within the bounds).
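+
+The import path and a typical call look like this (a sketch only; ``u_nk`` here stands for the example dataset shown below)::
+
+    >>> from alchemlyb.preprocessing.subsampling import slicing
+    >>> u_nk_s = slicing(u_nk, lower=35.0, upper=200.0, step=1)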
+
+For example, if we have::
+
+    >>> u_nk
+                         0.00      0.25      0.50       0.75       1.00
+    time    fep-lambda
+    0.0     0.0      0.309323  3.656838  7.004353  10.351868  13.699383
+    10.0    0.0      0.308844  2.616688  4.924532   7.232376   9.540220
+    20.0    0.0      0.300940  1.626739  2.952538   4.278337   5.604136
+    30.0    0.0      0.309712  1.579647  2.849583   4.119518   5.389454
+    40.0    0.0      0.299979  2.255387  4.210794   6.166202   8.121610
+    ...                   ...       ...       ...        ...        ...
+    39960.0 1.0      3.913594  3.011197  2.108801   1.206405   0.304009
+    39970.0 1.0     -0.365724 -0.197390 -0.029055   0.139279   0.307614
+    39980.0 1.0      1.495407  1.199280  0.903152   0.607024   0.310897
+    39990.0 1.0      1.578606  1.260235  0.941863   0.623492   0.305121
+    40000.0 1.0      0.715197  0.611187  0.507178   0.403169   0.299160
+
+    [20005 rows x 5 columns]
+
+Then we can grab all samples between :math:`t = 35.0` and :math:`t = 200.0`, inclusive::
+
+    >>> slicing(u_nk, lower=35.0, upper=200.0)
+                         0.00      0.25      0.50      0.75       1.00
+    time    fep-lambda
+    40.0    0.0      0.299979  2.255387  4.210794  6.166202   8.121610
+    50.0    0.0      0.301968  3.209507  6.117047  9.024587  11.932126
+    60.0    0.0      0.308315  2.284146  4.259977  6.235809   8.211640
+    70.0    0.0      0.311610  2.773057  5.234504  7.695950  10.157397
+    80.0    0.0      0.301432  1.397817  2.494203  3.590589   4.686975
+    ...                   ...       ...       ...       ...        ...
+    160.0   1.0      1.396968  1.122475  0.847982  0.573489   0.298995
+    170.0   1.0     -1.812027 -1.283715 -0.755404 -0.227092   0.301219
+    180.0   1.0      0.979355  0.810205  0.641054  0.471904   0.302754
+    190.0   1.0     -2.455231 -1.766201 -1.077171 -0.388141   0.300889
+    200.0   1.0      2.419113  1.890386  1.361659  0.832932   0.304205
+
+    [85 rows x 5 columns]
+
+
+In practice, this approach is not much different from directly using :py:attr:`pandas.DataFrame.loc`::
+
+    >>> lower, upper, step = (35.0, 200.0, 1)
+    >>> u_nk.loc[lower:upper:step]
+
+The :func:`~alchemlyb.preprocessing.subsampling.slicing` function, however, performs some additional checks,
+such as ensuring there are no duplicate time values in the dataset, which can happen if data from repeat simulations were concatenated together prior to use.
+It's generally advisable to apply subsampling functions to data from repeat simulations with overlapping ``time`` values *individually*, only concatenating afterward, just before use with an :ref:`estimator `.
+
+
+.. _subsampling_statinef:
+
+Statistical Inefficiency
+------------------------
+The :func:`~alchemlyb.preprocessing.subsampling.statistical_inefficiency` function subsamples each timeseries in the dataset by its calculated *statistical inefficiency*, :math:`g`, defined as:
+
+.. math::
+
+    g \equiv (1 + 2\tau) > 1
+
+where :math:`\tau` is the autocorrelation time of the timeseries.
+:math:`g` therefore functions as the spacing between uncorrelated samples in the timeseries.
+The timeseries is sliced with :math:`\text{ceil}(g)` as its step (in the conservative case; see the API docs below).
+
+For example, if we have::
+
+    >>> u_nk
+                         0.00      0.25      0.50       0.75       1.00
+    time    fep-lambda
+    0.0     0.0      0.309323  3.656838  7.004353  10.351868  13.699383
+    10.0    0.0      0.308844  2.616688  4.924532   7.232376   9.540220
+    20.0    0.0      0.300940  1.626739  2.952538   4.278337   5.604136
+    30.0    0.0      0.309712  1.579647  2.849583   4.119518   5.389454
+    40.0    0.0      0.299979  2.255387  4.210794   6.166202   8.121610
+    ...                   ...       ...       ...        ...        ...
+    39960.0 1.0      3.913594  3.011197  2.108801   1.206405   0.304009
+    39970.0 1.0     -0.365724 -0.197390 -0.029055   0.139279   0.307614
+    39980.0 1.0      1.495407  1.199280  0.903152   0.607024   0.310897
+    39990.0 1.0      1.578606  1.260235  0.941863   0.623492   0.305121
+    40000.0 1.0      0.715197  0.611187  0.507178   0.403169   0.299160
+
+    [20005 rows x 5 columns]
+
+We can extract uncorrelated samples for each ``fep-lambda`` with::
+
+    >>> statistical_inefficiency(u_nk, how='right')
+                         0.00      0.25      0.50       0.75       1.00
+    time    fep-lambda
+    0.0     0.0      0.309323  3.656838  7.004353  10.351868  13.699383
+    20.0    0.0      0.300940  1.626739  2.952538   4.278337   5.604136
+    40.0    0.0      0.299979  2.255387  4.210794   6.166202   8.121610
+    60.0    0.0      0.308315  2.284146  4.259977   6.235809   8.211640
+    80.0    0.0      0.301432  1.397817  2.494203   3.590589   4.686975
+    ...                   ...       ...       ...        ...        ...
+    39920.0 1.0      3.175234  2.457369  1.739504   1.021640   0.303775
+    39940.0 1.0     -1.480193 -1.034104 -0.588015  -0.141925   0.304164
+    39960.0 1.0      3.913594  3.011197  2.108801   1.206405   0.304009
+    39980.0 1.0      1.495407  1.199280  0.903152   0.607024   0.310897
+    40000.0 1.0      0.715197  0.611187  0.507178   0.403169   0.299160
+
+    [12005 rows x 5 columns]
+
+The ``how`` parameter indicates the choice of observable for performing the calculation of :math:`g`.
+
+* For :math:`u_{nk}` datasets, the choice of ``'right'`` is recommended: the column immediately to the right of the column corresponding to the group's lambda index value is used as the observable.
+* For :math:`\frac{dH}{d\lambda}` datasets, the choice of ``'sum'`` is recommended: the columns are simply summed, and the resulting :py:class:`pandas.Series` is used as the observable.
+
+See the API documentation below on the possible values for ``how``, as well as more detailed explanations for each choice.
+
+It is also possible to choose a specific column, or an arbitrary :py:class:`pandas.Series` (with an *exactly* matching index to the dataset), as the basis for calculating :math:`g`::
+
+    >>> statistical_inefficiency(u_nk, column=0.75)
+                         0.00      0.25      0.50       0.75       1.00
+    time    fep-lambda
+    0.0     0.0      0.309323  3.656838  7.004353  10.351868  13.699383
+    20.0    0.0      0.300940  1.626739  2.952538   4.278337   5.604136
+    40.0    0.0      0.299979  2.255387  4.210794   6.166202   8.121610
+    60.0    0.0      0.308315  2.284146  4.259977   6.235809   8.211640
+    80.0    0.0      0.301432  1.397817  2.494203   3.590589   4.686975
+    ...                   ...       ...       ...        ...        ...
+    39920.0 1.0      3.175234  2.457369  1.739504   1.021640   0.303775
+    39940.0 1.0     -1.480193 -1.034104 -0.588015  -0.141925   0.304164
+    39960.0 1.0      3.913594  3.011197  2.108801   1.206405   0.304009
+    39980.0 1.0      1.495407  1.199280  0.903152   0.607024   0.310897
+    40000.0 1.0      0.715197  0.611187  0.507178   0.403169   0.299160
+
+    [12005 rows x 5 columns]
+
+    >>> statistical_inefficiency(u_nk, column=u_nk[0.75])
+                         0.00      0.25      0.50       0.75       1.00
+    time    fep-lambda
+    0.0     0.0      0.309323  3.656838  7.004353  10.351868  13.699383
+    20.0    0.0      0.300940  1.626739  2.952538   4.278337   5.604136
+    40.0    0.0      0.299979  2.255387  4.210794   6.166202   8.121610
+    60.0    0.0      0.308315  2.284146  4.259977   6.235809   8.211640
+    80.0    0.0      0.301432  1.397817  2.494203   3.590589   4.686975
+    ...                   ...       ...       ...        ...        ...
+    39920.0 1.0      3.175234  2.457369  1.739504   1.021640   0.303775
+    39940.0 1.0     -1.480193 -1.034104 -0.588015  -0.141925   0.304164
+    39960.0 1.0      3.913594  3.011197  2.108801   1.206405   0.304009
+    39980.0 1.0      1.495407  1.199280  0.903152   0.607024   0.310897
+    40000.0 1.0      0.715197  0.611187  0.507178   0.403169   0.299160
+
+    [12005 rows x 5 columns]
+
+See :py:func:`pymbar.timeseries.statisticalInefficiency` for more details; this is used internally by this subsampler.
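+
+As a rough sketch of what this subsampler does for each group (an illustration only; ``series`` stands for whatever observable ``how``/``column`` selects, and ``group`` for the group of samples being subsampled)::
+
+    >>> import numpy as np
+    >>> from pymbar import timeseries
+
+    >>> # statistical inefficiency of the chosen observable
+    >>> g = timeseries.statisticalInefficiency(series)
+    >>> # conservative=True: slice at uniform intervals of ceil(g)
+    >>> uncorrelated = group.iloc[::int(np.ceil(g))]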
+Please reference the following if you use this function in your research:
+
+    [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted histogram analysis method for the analysis of simulated and parallel tempering simulations. JCTC 3(1):26-41, 2007.
+
+
+Equilibrium Detection
+---------------------
+The :func:`~alchemlyb.preprocessing.subsampling.equilibrium_detection` function subsamples each timeseries in the dataset using the equilibration detection approach developed by John Chodera (see references below).
+
+For each sorted timeseries in the dataset, each sample is treated in turn as the starting point of the trajectory, and the *statistical inefficiency*, :math:`g`, is calculated for the samples from that point onward.
+Each of these starting points yields an *effective* number of uncorrelated samples, :math:`N_{\text{eff}}`, and the starting point with the greatest :math:`N_{\text{eff}}` is chosen as the start of the *equilibrated* region of the trajectory.
+
+The sorted timeseries is then subsampled by dropping the samples prior to the chosen starting point, then slicing the remaining samples with :math:`\text{ceil}(g)` as the step (in the conservative case; see the API docs below).
+
+For example, if we have::
+
+    >>> u_nk
+                         0.00      0.25      0.50       0.75       1.00
+    time    fep-lambda
+    0.0     0.0      0.309323  3.656838  7.004353  10.351868  13.699383
+    10.0    0.0      0.308844  2.616688  4.924532   7.232376   9.540220
+    20.0    0.0      0.300940  1.626739  2.952538   4.278337   5.604136
+    30.0    0.0      0.309712  1.579647  2.849583   4.119518   5.389454
+    40.0    0.0      0.299979  2.255387  4.210794   6.166202   8.121610
+    ...                   ...       ...       ...        ...        ...
+    39960.0 1.0      3.913594  3.011197  2.108801   1.206405   0.304009
+    39970.0 1.0     -0.365724 -0.197390 -0.029055   0.139279   0.307614
+    39980.0 1.0      1.495407  1.199280  0.903152   0.607024   0.310897
+    39990.0 1.0      1.578606  1.260235  0.941863   0.623492   0.305121
+    40000.0 1.0      0.715197  0.611187  0.507178   0.403169   0.299160
+
+    [20005 rows x 5 columns]
+
+We can extract uncorrelated samples for each ``fep-lambda`` with::
+
+    >>> equilibrium_detection(u_nk, how='right')
+                         0.00      0.25      0.50       0.75       1.00
+    time    fep-lambda
+    0.0     0.0      0.309323  3.656838  7.004353  10.351868  13.699383
+    20.0    0.0      0.300940  1.626739  2.952538   4.278337   5.604136
+    40.0    0.0      0.299979  2.255387  4.210794   6.166202   8.121610
+    60.0    0.0      0.308315  2.284146  4.259977   6.235809   8.211640
+    80.0    0.0      0.301432  1.397817  2.494203   3.590589   4.686975
+    ...                   ...       ...       ...        ...        ...
+    39820.0 1.0      5.238549  4.004569  2.770589   1.536609   0.302630
+    39840.0 1.0     -1.611068 -1.133603 -0.656139  -0.178674   0.298791
+    39860.0 1.0      0.052599  0.116262  0.179924   0.243587   0.307249
+    39880.0 1.0      1.312812  1.060874  0.808936   0.556998   0.305060
+    39900.0 1.0      6.940932  5.280870  3.620806   1.960743   0.300680
+
+    [11968 rows x 5 columns]
+
+The ``how`` parameter indicates the choice of observable for performing the calculation of :math:`g`.
+See the brief recommendations for ``how`` in the :ref:`subsampling_statinef` section above, or the API documentation below on the possible values for ``how`` as well as more detailed explanations for each choice.
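+
+A rough sketch of the per-group operation (an illustration only; ``series`` stands for whatever observable ``how``/``column`` selects, and ``group`` for the group of samples being subsampled)::
+
+    >>> import numpy as np
+    >>> from pymbar import timeseries
+
+    >>> # t: start of the equilibrated region; g: statistical inefficiency there
+    >>> t, g, Neff_max = timeseries.detectEquilibration(series)
+    >>> equilibrated = group.iloc[t:]
+    >>> # conservative=True: slice at uniform intervals of ceil(g)
+    >>> uncorrelated = equilibrated.iloc[::int(np.ceil(g))]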
+ +It is also possible to choose a specific column, or an arbitrary :py:class:`pandas.Series` (with an *exactly* matching index to the dataset), as the basis for subsampling with equilibrium detection:: + + >>> equilibrium_detection(u_nk, column=0.75) + 0.00 0.25 0.50 0.75 1.00 + time fep-lambda + 0.0 0.0 0.309323 3.656838 7.004353 10.351868 13.699383 + 20.0 0.0 0.300940 1.626739 2.952538 4.278337 5.604136 + 40.0 0.0 0.299979 2.255387 4.210794 6.166202 8.121610 + 60.0 0.0 0.308315 2.284146 4.259977 6.235809 8.211640 + 80.0 0.0 0.301432 1.397817 2.494203 3.590589 4.686975 + ... ... ... ... ... ... + 39820.0 1.0 5.238549 4.004569 2.770589 1.536609 0.302630 + 39840.0 1.0 -1.611068 -1.133603 -0.656139 -0.178674 0.298791 + 39860.0 1.0 0.052599 0.116262 0.179924 0.243587 0.307249 + 39880.0 1.0 1.312812 1.060874 0.808936 0.556998 0.305060 + 39900.0 1.0 6.940932 5.280870 3.620806 1.960743 0.300680 + + [11961 rows x 5 columns] + + >>> equilibrium_detection(u_nk, column=u_nk[0.75]) + 0.00 0.25 0.50 0.75 1.00 + time fep-lambda + 0.0 0.0 0.309323 3.656838 7.004353 10.351868 13.699383 + 20.0 0.0 0.300940 1.626739 2.952538 4.278337 5.604136 + 40.0 0.0 0.299979 2.255387 4.210794 6.166202 8.121610 + 60.0 0.0 0.308315 2.284146 4.259977 6.235809 8.211640 + 80.0 0.0 0.301432 1.397817 2.494203 3.590589 4.686975 + ... ... ... ... ... ... + 39820.0 1.0 5.238549 4.004569 2.770589 1.536609 0.302630 + 39840.0 1.0 -1.611068 -1.133603 -0.656139 -0.178674 0.298791 + 39860.0 1.0 0.052599 0.116262 0.179924 0.243587 0.307249 + 39880.0 1.0 1.312812 1.060874 0.808936 0.556998 0.305060 + 39900.0 1.0 6.940932 5.280870 3.620806 1.960743 0.300680 + + [11961 rows x 5 columns] + +See :py:func:`pymbar.timeseries.detectEquilibration` for more details; this is used internally by this subsampler. +Please reference the following if you use this function in your research: + + [1] John D. Chodera. A simple method for automated equilibration detection in molecular simulations. Journal of Chemical Theory and Computation, 12:1799, 2016. + + [2] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted histogram analysis method for the analysis of simulated and parallel tempering simulations. JCTC 3(1):26-41, 2007. 
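+
+Putting it together, a typical workflow (a sketch only; ``u_nk`` is a standard u_nk DataFrame as above, and the estimator choice is just one possibility) decorrelates each timeseries before handing the result to an estimator::
+
+    >>> from alchemlyb.preprocessing.subsampling import statistical_inefficiency
+    >>> from alchemlyb.estimators import MBAR
+
+    >>> u_nk_s = statistical_inefficiency(u_nk, how='right')
+    >>> mbar = MBAR().fit(u_nk_s)
+    >>> mbar.delta_f_
+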
+ API Reference ------------- diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py index e5f1065f..ea0230f5 100644 --- a/src/alchemlyb/preprocessing/subsampling.py +++ b/src/alchemlyb/preprocessing/subsampling.py @@ -49,13 +49,13 @@ def _how_lr(name, group, direction): def _how_random(name, group, tried=None): - candidates = set(group.columns) - (set(tried) + {name}) + candidates = set(group.columns) - (set(tried) | {name}) if not candidates: raise CorrelationError("No column in the dataset could be used" " successfully for decorrelation") else: - selection = random.choice(candidates) + selection = random.choice(list(candidates)) return group[selection], selection @@ -109,10 +109,26 @@ def slicing(data, lower=None, upper=None, step=None, force=False): return pd.concat(resdata) -def _decorrelator(subsample, data, column=None, how='auto', lower=None, +def _decorrelator(subsample, data, column=None, how=None, lower=None, upper=None, step=None, conservative=True, return_calculated=False, force=False, random_state=None): + # input checks + if column is not None: + if isinstance(column, pd.Series): + if not (data.index == column.index).all(): + raise ValueError("The index of `column` must match the index of `data` exactly" + " to be used for decorrelation.") + elif column in data.columns: + pass + else: + raise ValueError("`column` must give either an existing column label in `data`" + " or a Series object with a matching index.") + elif how is not None: + pass + else: + raise ValueError("One of `column` or `how` must be set") + np.random.seed(random_state) # we always start with a full index sort on the whole dataframe @@ -125,25 +141,6 @@ def _decorrelator(subsample, data, column=None, how='auto', lower=None, if return_calculated: calculated = defaultdict(dict) - # input checks - if column: - if isinstance(column, pd.Series): - if ~(data.index == column.index).all(): - raise ValueError("The index of `column` must match the index of `data` exactly" - " to be used for decorrelation.") - elif column in group.columns: - pass - else: - raise ValueError("`column` must give either an existing column label in `data`" - " or a Series object with a matching index.") - - elif how == 'auto': - # assign specific `how` settings if ``how == 'auto'`` - if data.attrs['alchemform'] == 'u_nk': - how = 'right' - if data.attrs['alchemform'] == 'dHdl': - how = 'sum' - def random_selection(name, group): tried = set() while True: @@ -164,14 +161,14 @@ def random_selection(name, group): "is only meaningful for a single, contiguous, " "and sorted timeseries") - if column: + if column is not None: if isinstance(column, pd.Series): group_c = column.groupby(level=index_names).get_group(name) indices, calc = subsample(group_c, group) elif column in group.columns: group_c = group[column] indices, calc = subsample(group_c, group) - else: + elif how is not None: if (how == 'right') or (how == 'left'): try: group_c = _how_lr(name, group, how) @@ -200,7 +197,7 @@ def random_selection(name, group): return pd.concat(resdata) -def statistical_inefficiency(data, column=None, how='auto', lower=None, +def statistical_inefficiency(data, column=None, how=None, lower=None, upper=None, step=None, conservative=True, return_calculated=False, force=False, random_state=None): """Subsample an alchemlyb DataFrame based on the calculated statistical @@ -214,16 +211,12 @@ def statistical_inefficiency(data, column=None, how='auto', lower=None, The `how` parameter determines the observable used for 
calculating the correlations within each group of samples. The options are as follows: - 'auto' - The default; the approach is chosen from the below approaches based - on the `alchemform` of the data (either 'dHdl' or 'u_nk'). Use this - if in doubt. 'right' - The default for 'u_nk' datasets; the column immediately to the - right of the column corresponding to the group's lambda index value - is used. If there is no column to the right, then the column to the - left is used. If there is no column corresponding to the group's - lambda index value, then 'random' is used (see below). + The recommended setting for 'u_nk' datasets; the column immediately + to the right of the column corresponding to the group's lambda + index value is used. If there is no column to the right, then the + column to the left is used. If there is no column corresponding to + the group's lambda index value, then 'random' is used (see below). 'left' The opposite of the 'right' approach; the column immediately to the left of the column corresponding to the group's lambda index value @@ -238,8 +231,8 @@ def statistical_inefficiency(data, column=None, how='auto', lower=None, continues until success or until all columns have been attempted without success (in which case, ``CorrelationError`` is raised). 'sum' - The default for 'dHdl' datasets; the columns are simply summed, and - the resulting `Series` is used. + The recommended setting for 'dHdl' datasets; the columns are simply + summed, and the resulting `Series` is used. Specifying the 'column' parameter overrides the behavior of 'how'. This allows the user to use a particular column or a specially-crafted `Series` @@ -253,7 +246,7 @@ def statistical_inefficiency(data, column=None, how='auto', lower=None, Label of column to use for calculating statistical inefficiency. Overrides `how`; can also take a `Series` object, but the index of the `Series` *must* match that of `data` exactly. - how : {'auto', 'right', 'left', 'random', 'sum'} + how : {'right', 'left', 'random', 'sum'} The approach used to choose the observable on which correlations are calculated. See explanation above. lower : float @@ -272,7 +265,7 @@ def statistical_inefficiency(data, column=None, how='auto', lower=None, for each group. force : bool Ignore checks that DataFrame is in proper form for expected behavior. - random_state : int, optional + random_state : int Integer between 0 and 2**32 -1 inclusive; fed to `numpy.random.seed`. Running this function on the same data with a specific random seed will produce the same result each time. @@ -297,6 +290,12 @@ def statistical_inefficiency(data, column=None, how='auto', lower=None, decreases a false sense of accuracy and is deemed the more careful and conservative approach. + References + ---------- + [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted + histogram analysis method for the analysis of simulated and parallel tempering simulations. + JCTC 3(1):26-41, 2007. + See Also -------- pymbar.timeseries.statisticalInefficiency : detailed background @@ -345,9 +344,9 @@ def subsample(group_c, group): random_state=random_state) -def equilibrium_detection(data, column=None, how='auto', lower=None, +def equilibrium_detection(data, column=None, how=None, lower=None, upper=None, step=None, conservative=True, return_calculated=False, - force=False, random_state=None): + force=False, random_state=None, nskip=1): """Subsample a DataFrame using automated equilibrium detection on one of its columns. 
@@ -359,16 +358,12 @@ def equilibrium_detection(data, column=None, how='auto', lower=None, The `how` parameter determines the observable used for calculating the correlations within each group of samples. The options are as follows: - 'auto' - The default; the approach is chosen from the below approaches based - on the `alchemform` of the data (either 'dHdl' or 'u_nk'). Use this - if in doubt. 'right' - The default for 'u_nk' datasets; the column immediately to the - right of the column corresponding to the group's lambda index value - is used. If there is no column to the right, then the column to the left is used. - If there is no column corresponding to the group's lambda index - value, then 'random' is used (see below). + The recommended setting for 'u_nk' datasets; the column immediately + to the right of the column corresponding to the group's lambda + index value is used. If there is no column to the right, then the + column to the left is used. If there is no column corresponding to + the group's lambda index value, then 'random' is used (see below). 'left' The opposite of the 'right' approach; the column immediately to the left of the column corresponding to the group's lambda index value @@ -381,8 +376,8 @@ def equilibrium_detection(data, column=None, how='auto', lower=None, column is tried. This process continues until success or until all columns have been attempted without success. 'sum' - The default for 'dHdl' datasets; the columns are simply summed, and - the resulting `Series` is used. + The recommended setting for 'dHdl' datasets; the columns are simply + summed, and the resulting `Series` is used. Specifying the 'column' parameter overrides the behavior of 'how'. This allows the user to use a particular column or a specially-crafted `Series` @@ -396,7 +391,7 @@ def equilibrium_detection(data, column=None, how='auto', lower=None, Label of column to use for calculating statistical inefficiency. Overrides `how`; can also take a `Series` object, but the index of the `Series` *must* match that of `data` exactly. - how : {'auto', 'right', 'left', 'random', 'sum'} + how : {'right', 'left', 'random', 'sum'} The approach used to choose the observable on which correlations are calculated. See explanation above. lower : float @@ -415,6 +410,14 @@ def equilibrium_detection(data, column=None, how='auto', lower=None, for each group. force : bool Ignore checks that DataFrame is in proper form for expected behavior. + random_state : int + Integer between 0 and 2**32 -1 inclusive; fed to `numpy.random.seed`. + Running this function on the same data with a specific random seed will + produce the same result each time. + nskip : int + Number of samples to skip when walking through dataset; for long + trajectories, can offer substantial speedups at the cost of possibly + discarding slightly more data. Returns ------- @@ -431,6 +434,16 @@ def equilibrium_detection(data, column=None, how='auto', lower=None, decreases a false sense of accuracy and is deemed the more careful and conservative approach. + References + ---------- + [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted + histogram analysis method for the analysis of simulated and parallel tempering simulations. + JCTC 3(1):26-41, 2007. + [2] John D. Chodera. A simple method for automated + equilibration detection in molecular simulations. + Journal of Chemical Theory and Computation, + 12:1799, 2016. 
+ See Also -------- pymbar.timeseries.detectEquilibration : detailed background @@ -454,7 +467,7 @@ def subsample(group_c, group): group_cs = slicing(group_c, lower=lower, upper=upper, step=step) # calculate statistical inefficiency of series, with equilibrium detection - t, statinef, Neff_max = timeseries.detectEquilibration(group_cs) + t, statinef, Neff_max = timeseries.detectEquilibration(group_cs, nskip=nskip) # only keep values past `t` group_cs = group_cs.iloc[t:] diff --git a/src/alchemlyb/tests/test_fep_estimators.py b/src/alchemlyb/tests/test_fep_estimators.py index a3bcb796..d1a704bc 100644 --- a/src/alchemlyb/tests/test_fep_estimators.py +++ b/src/alchemlyb/tests/test_fep_estimators.py @@ -19,8 +19,6 @@ def gmx_benzene_coul_u_nk(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['Coulomb']]) - u_nk.attrs['alchemform'] = 'u_nk' - return u_nk def gmx_benzene_vdw_u_nk(): @@ -29,8 +27,6 @@ def gmx_benzene_vdw_u_nk(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['VDW']]) - u_nk.attrs['alchemform'] = 'u_nk' - return u_nk def gmx_expanded_ensemble_case_1(): @@ -39,8 +35,6 @@ def gmx_expanded_ensemble_case_1(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['AllStates']]) - u_nk.attrs['alchemform'] = 'u_nk' - return u_nk def gmx_expanded_ensemble_case_2(): @@ -49,8 +43,6 @@ def gmx_expanded_ensemble_case_2(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['AllStates']]) - u_nk.attrs['alchemform'] = 'u_nk' - return u_nk def gmx_expanded_ensemble_case_3(): @@ -59,8 +51,6 @@ def gmx_expanded_ensemble_case_3(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['AllStates']]) - u_nk.attrs['alchemform'] = 'u_nk' - return u_nk def gmx_water_particle_with_total_energy(): @@ -69,8 +59,6 @@ def gmx_water_particle_with_total_energy(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['AllStates']]) - u_nk.attrs['alchemform'] = 'u_nk' - return u_nk def gmx_water_particle_with_potential_energy(): @@ -79,8 +67,6 @@ def gmx_water_particle_with_potential_energy(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['AllStates']]) - u_nk.attrs['alchemform'] = 'u_nk' - return u_nk def gmx_water_particle_without_energy(): @@ -89,8 +75,6 @@ def gmx_water_particle_without_energy(): u_nk = pd.concat([gmx.extract_u_nk(filename, T=300) for filename in dataset['data']['AllStates']]) - u_nk.attrs['alchemform'] = 'u_nk' - return u_nk def amber_bace_example_complex_vdw(): @@ -98,8 +82,6 @@ def amber_bace_example_complex_vdw(): u_nk = pd.concat([amber.extract_u_nk(filename, T=300) for filename in dataset['data']['complex']['vdw']]) - u_nk.attrs['alchemform'] = 'u_nk' - return u_nk def gomc_benzene_u_nk(): @@ -108,8 +90,6 @@ def gomc_benzene_u_nk(): u_nk = pd.concat([gomc.extract_u_nk(filename, T=298) for filename in dataset['data']]) - u_nk.attrs['alchemform'] = 'u_nk' - return u_nk def namd_tyr2ala(): @@ -121,8 +101,6 @@ def namd_tyr2ala(): u_nk1[u_nk1.isna()] = u_nk2 u_nk = u_nk1.sort_index(level=u_nk1.index.names[1:]) - u_nk.attrs['alchemform'] = 'u_nk' - return u_nk diff --git a/src/alchemlyb/tests/test_preprocessing.py b/src/alchemlyb/tests/test_preprocessing.py index 83d215ea..0c9a563b 100644 --- a/src/alchemlyb/tests/test_preprocessing.py +++ b/src/alchemlyb/tests/test_preprocessing.py @@ -75,8 +75,6 @@ def gmx_benzene_dHdl(): dataset = 
alchemtest.gmx.load_benzene() dHdl = gmx.extract_dHdl(dataset['data']['Coulomb'][0], T=300) - dHdl.attrs['alchemform'] = 'dHdl' - return dHdl @@ -85,8 +83,6 @@ def gmx_benzene_dHdl_duplicated(): df = gmx.extract_dHdl(dataset['data']['Coulomb'][0], T=300) dHdl = pd.concat([df, df]) - dHdl.attrs['alchemform'] = 'dHdl' - return dHdl @@ -94,8 +90,6 @@ def gmx_benzene_u_nk(): dataset = alchemtest.gmx.load_benzene() u_nk = gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300) - u_nk.attrs['alchemform'] = 'u_nk' - return u_nk @@ -104,8 +98,6 @@ def gmx_benzene_u_nk_duplicated(): df = gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300) u_nk = pd.concat([df, df]) - u_nk.attrs['alchemform'] = 'u_nk' - return u_nk @@ -113,16 +105,12 @@ def gmx_benzene_dHdl_full(): dataset = alchemtest.gmx.load_benzene() dHdl = pd.concat([gmx.extract_dHdl(i, T=300) for i in dataset['data']['Coulomb']]) - dHdl.attrs['alchemform'] = 'dHdl' - return dHdl def gmx_benzene_u_nk_full(): dataset = alchemtest.gmx.load_benzene() - return pd.concat([gmx.extract_u_nk(i, T=300) for i in dataset['data']['Coulomb']]) - - u_nk.attrs['alchemform'] = 'u_nk' + u_nk = pd.concat([gmx.extract_u_nk(i, T=300) for i in dataset['data']['Coulomb']]) return u_nk @@ -183,33 +171,60 @@ def test_slicing_u_nk(self, u_nk): class CorrelatedPreprocessors: - @pytest.mark.parametrize(('data', 'size'), [(gmx_benzene_dHdl(), 4001), - (gmx_benzene_u_nk(), 4001)]) - def test_subsampling(self, data, size): + @pytest.mark.parametrize(('data', 'size', 'how'), + [(gmx_benzene_dHdl(), 4001, 'sum',), + (gmx_benzene_u_nk(), 4001, 'right')]) + def test_subsampling(self, data, size, how): """Basic test for execution; resulting size of dataset sensitive to machine and depends on algorithm. """ - assert len(self.subsampler(data)) <= size + assert len(self.subsampler(data, how=how)) <= size + + @pytest.mark.parametrize(('data', 'size', 'column'), + [(gmx_benzene_dHdl(), 20005, 0), + (gmx_benzene_u_nk(), 20005, 0)]) + def test_subsampling_column(self, data, size, column): + assert len(self.subsampler(data, column=data.columns[column])) <= size def test_subsampling_dHdl(self, dHdl): data, nsims = dHdl if nsims == "single": - dHdl_s = self.subsampler(data) + dHdl_s = self.subsampler(data, how='sum') assert len(dHdl_s) < len(data) elif nsims == "repeat": with pytest.raises(KeyError): - dHdl_s = self.subsampler(data) + dHdl_s = self.subsampler(data, how='sum') def test_subsampling_u_nk(self, u_nk): data, nsims = u_nk if nsims == "single": - u_nk_s = self.subsampler(data) + u_nk_s = self.subsampler(data, how='right') assert len(u_nk_s) < len(data) elif nsims == "repeat": with pytest.raises(KeyError): - u_nk_s = self.subsampler(data) + u_nk_s = self.subsampler(data, how='right') + + def test_subsampling_u_nk_left(self, u_nk): + data, nsims = u_nk + + if nsims == "single": + u_nk_s = self.subsampler(data, how='left') + assert len(u_nk_s) < len(data) + elif nsims == "repeat": + with pytest.raises(KeyError): + u_nk_s = self.subsampler(data, how='left') + + def test_subsampling_u_nk_random(self, u_nk): + data, nsims = u_nk + + if nsims == "single": + u_nk_s = self.subsampler(data, how='random', random_state=42) + assert len(u_nk_s) < len(data) + elif nsims == "repeat": + with pytest.raises(KeyError): + u_nk_s = self.subsampler(data, how='random', random_state=42) class TestStatisticalInefficiency(CorrelatedPreprocessors): @@ -217,15 +232,15 @@ class TestStatisticalInefficiency(CorrelatedPreprocessors): def subsampler(self, *args, **kwargs): return 
statistical_inefficiency(*args, **kwargs) - @pytest.mark.parametrize(('conservative', 'data', 'size'), + @pytest.mark.parametrize(('conservative', 'data', 'size', 'how'), [ - (True, gmx_benzene_dHdl(), 2001), # 0.00: g = 1.0559445620585415 - (True, gmx_benzene_u_nk(), 2001), # 'fep': g = 1.0560203916559594 - (False, gmx_benzene_dHdl(), 3789), - (False, gmx_benzene_u_nk(), 3788), + (True, gmx_benzene_dHdl(), 2001, 'sum'), # 0.00: g = 1.0559445620585415 + (True, gmx_benzene_u_nk(), 2001, 'right'), # 'fep': g = 1.0560203916559594 + (False, gmx_benzene_dHdl(), 3789, 'sum'), + (False, gmx_benzene_u_nk(), 3788, 'right'), ]) - def test_conservative(self, data, size, conservative): - sliced = self.subsampler(data, conservative=conservative) + def test_conservative(self, data, size, conservative, how): + sliced = self.subsampler(data, how=how, conservative=conservative) # results can vary slightly with different machines # so possibly do # delta = 10 @@ -237,3 +252,18 @@ class TestEquilibriumDetection(CorrelatedPreprocessors): def subsampler(self, *args, **kwargs): return equilibrium_detection(*args, **kwargs) + + @pytest.mark.parametrize(('conservative', 'data', 'size', 'how'), + [ + (True, gmx_benzene_dHdl(), 1979, 'sum'), # 0.00: g = 1.0559445620585415 + (True, gmx_benzene_u_nk(), 1979, 'right'), # 'fep': g = 1.0560203916559594 + (False, gmx_benzene_dHdl(), 2848, 'sum'), + (False, gmx_benzene_u_nk(), 2849, 'right'), + ]) + def test_conservative(self, data, size, conservative, how): + sliced = self.subsampler(data, how=how, conservative=conservative) + # results can vary slightly with different machines + # so possibly do + # delta = 10 + # assert size - delta < len(sliced) < size + delta + assert len(sliced) == size diff --git a/src/alchemlyb/tests/test_ti_estimators.py b/src/alchemlyb/tests/test_ti_estimators.py index d143010f..208e27a9 100644 --- a/src/alchemlyb/tests/test_ti_estimators.py +++ b/src/alchemlyb/tests/test_ti_estimators.py @@ -18,8 +18,6 @@ def gmx_benzene_coul_dHdl(): dHdl = pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['Coulomb']]) - dHdl.attrs['alchemform'] = 'dHdl' - return dHdl def gmx_benzene_vdw_dHdl(): @@ -28,8 +26,6 @@ def gmx_benzene_vdw_dHdl(): dHdl = pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['VDW']]) - dHdl.attrs['alchemform'] = 'dHdl' - return dHdl def gmx_expanded_ensemble_case_1_dHdl(): @@ -38,8 +34,6 @@ def gmx_expanded_ensemble_case_1_dHdl(): dHdl = pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['AllStates']]) - dHdl.attrs['alchemform'] = 'dHdl' - return dHdl def gmx_expanded_ensemble_case_2_dHdl(): @@ -48,8 +42,6 @@ def gmx_expanded_ensemble_case_2_dHdl(): dHdl = pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['AllStates']]) - dHdl.attrs['alchemform'] = 'dHdl' - return dHdl def gmx_expanded_ensemble_case_3_dHdl(): @@ -58,8 +50,6 @@ def gmx_expanded_ensemble_case_3_dHdl(): dHdl = pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['AllStates']]) - dHdl.attrs['alchemform'] = 'dHdl' - return dHdl def gmx_water_particle_with_total_energy_dHdl(): @@ -68,8 +58,6 @@ def gmx_water_particle_with_total_energy_dHdl(): dHdl = pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['AllStates']]) - dHdl.attrs['alchemform'] = 'dHdl' - return dHdl def gmx_water_particle_with_potential_energy_dHdl(): @@ -78,8 +66,6 @@ def gmx_water_particle_with_potential_energy_dHdl(): dHdl = 
pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['AllStates']]) - dHdl.attrs['alchemform'] = 'dHdl' - return dHdl def gmx_water_particle_without_energy_dHdl(): @@ -88,8 +74,6 @@ def gmx_water_particle_without_energy_dHdl(): dHdl = pd.concat([gmx.extract_dHdl(filename, T=300) for filename in dataset['data']['AllStates']]) - dHdl.attrs['alchemform'] = 'dHdl' - return dHdl def amber_simplesolvated_charge_dHdl(): @@ -98,8 +82,6 @@ def amber_simplesolvated_charge_dHdl(): dHdl = pd.concat([amber.extract_dHdl(filename, T=300) for filename in dataset['data']['charge']]) - dHdl.attrs['alchemform'] = 'dHdl' - return dHdl def amber_simplesolvated_vdw_dHdl(): @@ -108,8 +90,6 @@ def amber_simplesolvated_vdw_dHdl(): dHdl = pd.concat([amber.extract_dHdl(filename, T=300) for filename in dataset['data']['vdw']]) - dHdl.attrs['alchemform'] = 'dHdl' - return dHdl def gomc_benzene_dHdl(): @@ -118,8 +98,6 @@ def gomc_benzene_dHdl(): dHdl = pd.concat([gomc.extract_dHdl(filename, T=298) for filename in dataset['data']]) - dHdl.attrs['alchemform'] = 'dHdl' - return dHdl From dc1b3ff106eb1d66f1b13555c6d1abe3d65c0e8d Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 11 Mar 2020 16:29:19 -0700 Subject: [PATCH 9/9] Added some test coverage for return_calculated, using a Series These changes apply to tests of `statistical_inefficiency`, `equilibrium_detection`. I also set `nskip=5` for `equilibrium_detection` cases, which required some changes to test parameters. This change speeds up the tests, however, at no real coverage cost. --- src/alchemlyb/tests/test_preprocessing.py | 34 +++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/src/alchemlyb/tests/test_preprocessing.py b/src/alchemlyb/tests/test_preprocessing.py index 0c9a563b..6c889691 100644 --- a/src/alchemlyb/tests/test_preprocessing.py +++ b/src/alchemlyb/tests/test_preprocessing.py @@ -186,6 +186,12 @@ def test_subsampling(self, data, size, how): def test_subsampling_column(self, data, size, column): assert len(self.subsampler(data, column=data.columns[column])) <= size + @pytest.mark.parametrize(('data', 'size', 'column'), + [(gmx_benzene_dHdl(), 20005, 0), + (gmx_benzene_u_nk(), 20005, 0)]) + def test_subsampling_series(self, data, size, column): + assert len(self.subsampler(data, column=data[data.columns[column]])) <= size + def test_subsampling_dHdl(self, dHdl): data, nsims = dHdl @@ -247,18 +253,27 @@ def test_conservative(self, data, size, conservative, how): # assert size - delta < len(sliced) < size + delta assert len(sliced) == size + @pytest.mark.parametrize(('data', 'size', 'how'), + [(gmx_benzene_dHdl(), 4001, 'sum',), + (gmx_benzene_u_nk(), 4001, 'right')]) + def test_subsampling_return_calc(self, data, size, how): + data_s, calculated = self.subsampler(data, how=how, return_calculated=True) + + assert all(i in calculated.index for i in ['statinef']) + assert len(calculated.columns) == len(data.groupby(level=list(data.index.names[1:]))) + class TestEquilibriumDetection(CorrelatedPreprocessors): def subsampler(self, *args, **kwargs): - return equilibrium_detection(*args, **kwargs) + return equilibrium_detection(*args, nskip=5, **kwargs) @pytest.mark.parametrize(('conservative', 'data', 'size', 'how'), [ - (True, gmx_benzene_dHdl(), 1979, 'sum'), # 0.00: g = 1.0559445620585415 - (True, gmx_benzene_u_nk(), 1979, 'right'), # 'fep': g = 1.0560203916559594 - (False, gmx_benzene_dHdl(), 2848, 'sum'), - (False, gmx_benzene_u_nk(), 2849, 'right'), + (True, gmx_benzene_dHdl(), 2001, 'sum'), 
# 0.00: g = 1.0559445620585415 + (True, gmx_benzene_u_nk(), 2001, 'right'), # 'fep': g = 1.0560203916559594 + (False, gmx_benzene_dHdl(), 2844, 'sum'), + (False, gmx_benzene_u_nk(), 2845, 'right'), ]) def test_conservative(self, data, size, conservative, how): sliced = self.subsampler(data, how=how, conservative=conservative) @@ -267,3 +282,12 @@ def test_conservative(self, data, size, conservative, how): # delta = 10 # assert size - delta < len(sliced) < size + delta assert len(sliced) == size + + @pytest.mark.parametrize(('data', 'size', 'how'), + [(gmx_benzene_dHdl(), 4001, 'sum',), + (gmx_benzene_u_nk(), 4001, 'right')]) + def test_subsampling_return_calc(self, data, size, how): + data_s, calculated = self.subsampler(data, how=how, return_calculated=True) + + assert all(i in calculated.index for i in ['t', 'statinef', 'Neff_max']) + assert len(calculated.columns) == len(data.groupby(level=list(data.index.names[1:])))