diff --git a/.buildinfo b/.buildinfo
new file mode 100644
index 0000000..abdc149
--- /dev/null
+++ b/.buildinfo
@@ -0,0 +1,4 @@
+# Sphinx build info version 1
+# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: 53b96ff69ff94ca92a8cf411fc29606d
+tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 0000000..e69de29
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2668354
--- /dev/null
+++ b/README.md
@@ -0,0 +1,10 @@
+# GitHub Pages Cache
+
+Nothing to see here. The contents of this branch are essentially a cache that's not intended to be viewed on github.com.
+
+
+If you're looking to update our documentation, check the relevant development branch's 'docs/' dir.
+
+For more information on how this documentation is built using Sphinx, Read the Docs, and GitHub Actions/Pages, see:
+
+ * https://tech.michaelaltfield.net/2020/07/18/sphinx-rtd-github-pages-1
diff --git a/_images/dagt.svg b/_images/dagt.svg
new file mode 100644
index 0000000..d047a79
--- /dev/null
+++ b/_images/dagt.svg
@@ -0,0 +1,598 @@
[... 598 lines of SVG markup omitted ...]
diff --git a/_images/esc_prof_compare1.png b/_images/esc_prof_compare1.png
new file mode 100644
index 0000000..a4d9559
Binary files /dev/null and b/_images/esc_prof_compare1.png differ
diff --git a/_images/esc_prof_compare2.png b/_images/esc_prof_compare2.png
new file mode 100644
index 0000000..5c0ddb2
Binary files /dev/null and b/_images/esc_prof_compare2.png differ
diff --git a/_images/esc_prof_ot_diagram.png b/_images/esc_prof_ot_diagram.png
new file mode 100644
index 0000000..c573e84
Binary files /dev/null and b/_images/esc_prof_ot_diagram.png differ
diff --git a/_images/example-counts-R.svg b/_images/example-counts-R.svg
new file mode 100644
index 0000000..465d681
--- /dev/null
+++ b/_images/example-counts-R.svg
@@ -0,0 +1,86 @@
[... 86 lines of SVG markup omitted ...]
diff --git a/_images/example-heatmap-Py-2.svg b/_images/example-heatmap-Py-2.svg
new file mode 100644
index 0000000..03cf4f4
--- /dev/null
+++ b/_images/example-heatmap-Py-2.svg
@@ -0,0 +1,3564 @@
[... 3564 lines of SVG markup omitted ...]
diff --git a/_images/heatmap_viz_app.png b/_images/heatmap_viz_app.png
new file mode 100644
index 0000000..e84cc94
Binary files /dev/null and b/_images/heatmap_viz_app.png differ
diff --git a/_images/launch_viz_app.png b/_images/launch_viz_app.png
new file mode 100644
index 0000000..039cfda
Binary files /dev/null and b/_images/launch_viz_app.png differ
diff --git a/_images/logo_to_distr.png b/_images/logo_to_distr.png
new file mode 100644
index 0000000..eb4f262
Binary files /dev/null and b/_images/logo_to_distr.png differ
diff --git a/_images/logoplot_example.png b/_images/logoplot_example.png
new file mode 100644
index 0000000..e5aaa56
Binary files /dev/null and b/_images/logoplot_example.png differ
diff --git a/_images/phippery-suite-6.svg b/_images/phippery-suite-6.svg
new file mode 100644
index 0000000..9728522
--- /dev/null
+++ b/_images/phippery-suite-6.svg
@@ -0,0 +1 @@
[... single-line SVG markup omitted ...]
\ No newline at end of file
diff --git a/_images/query_peptides_viz_app.png b/_images/query_peptides_viz_app.png
new file mode 100644
index 0000000..ce21556
Binary files /dev/null and b/_images/query_peptides_viz_app.png differ
diff --git a/_images/xarray-format.svg b/_images/xarray-format.svg
new file mode 100644
index 0000000..4f7c682
--- /dev/null
+++ b/_images/xarray-format.svg
@@ -0,0 +1 @@
[... single-line SVG markup omitted ...]
\ No newline at end of file
diff --git a/_modules/index.html b/_modules/index.html
new file mode 100644
index 0000000..4ad6a85
--- /dev/null
+++ b/_modules/index.html
@@ -0,0 +1,145 @@
[... HTML page markup omitted; the extracted Python module sources follow ...]
+r"""
+=================
+Eigen
+=================
+
+A method for MDS via Singular Value Decomposition
+on PhIP-Seq datasets.
+"""
+
+import numpy as np
+from numpy.linalg import svd
+import xarray as xr
+import pandas as pd
+import scipy.stats as st
+import itertools
+import copy
+
+def eigenassay_projections(
+ ds,
+ data_table="counts",
+ compute_correlations=False,
+ return_raw_decomposition=False,
+ n_eigenvectors=None,
+):
+
+ r"""Compute the Singular Value Decomposition
+ of the enrichment data before projecting each
+ sample into the first n eigenvectors ("eigenassays")
+ in the dataset.
+
+    Concretely, given a matrix :math:`X` of enrichments in the
+    `phippery` dataset with shape (peptides, samples), we compute
+    the decomposition :math:`X = USV^{T}`.
+
+    The principal axes in feature space are given by the columns of
+    :math:`V` and represent the directions of maximum variance in the data.
+ The sample projections into this space are then computed and tied to the
+ sample annotation in the returned dictionary.
+
+ Parameters
+ ----------
+ ds : xarray.DataSet
+ The dataset you would like to perform eigen-decomposition on.
+
+ data_table : str
+ The name of the enrichment layer in the phippery dataset to perform
+ the operation on.
+
+ compute_correlations : bool
+ If true, compute the correlation of a sample's true binding enrichments to
+ the n'th principal axes. These will be added in the same fashion as the sample
+ scores that are appended to the sample table.
+
+ return_raw_decomposition : bool
+ If true, include the raw :math:`USV^{T}` decomposition of the enrichment
+ matrix specified
+
+ n_eigenvectors : int
+        The number of eigenassay dimensions to include in the projection.
+
+
+ Returns
+ -------
+ dict :
+        1. The eigenassay projections appended to the sample annotations
+        included in `ds`.
+ 2. (optional) The raw "economy" SVD decomposition matrices.
+
+ """
+
+ if data_table not in ds:
+ avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
+ raise KeyError(
+ f"{data_table} is not included in dataset. \n available datasets: {avail}"
+ )
+
+ n_eigenvectors = len(ds.sample_id) if n_eigenvectors is None else n_eigenvectors
+ if n_eigenvectors > len(ds.sample_id):
+ raise ValueError(
+ f"n_eigenvectors must be less than or equal to the number of samples in the dataset"
+ )
+
+ a = ds[f"{data_table}"].values
+ U, S, V = svd(a, full_matrices=False)
+ sam_meta = ds.sample_table.to_pandas()
+
+    # project every sample onto the first n eigenvectors; the projection
+    # (and correlation) matrices carry one row per sample in the dataset
+    n_samples = len(ds.sample_id)
+    z_jk = np.zeros([n_samples, n_eigenvectors])
+    if compute_correlations:
+        p_jk = np.zeros([n_samples, n_eigenvectors])
+    for j in range(n_samples):
+        for k in range(n_eigenvectors):
+            z_jk[j, k] = np.dot(a[:, j], U[:, k])
+            if compute_correlations:
+                p_jk[j, k] = st.pearsonr(a[:, j], U[:, k])[0]
+
+ for k in range(n_eigenvectors):
+ sam_meta[f"Eigenassay-{k}-projection"] = z_jk[:, k]
+ if compute_correlations:
+ sam_meta[f"Eigenassay-{k}-correlation"] = p_jk[:, k]
+
+ ret = {"sample_eigenassay_projections": sam_meta}
+ if return_raw_decomposition:
+ ret["raw_decomposition"] = (U, S, V)
+
+ return ret
+
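+# --- usage sketch (illustrative only, not part of the module source) ---
+# A hedged example of projecting samples onto the first two eigenassays;
+# it assumes `ds` is a phippery dataset with a "counts" enrichment layer
+# loaded elsewhere:
+#
+#   proj = eigenassay_projections(ds, data_table="counts", n_eigenvectors=2)
+#   proj["sample_eigenassay_projections"][
+#       ["Eigenassay-0-projection", "Eigenassay-1-projection"]
+#   ]
+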
+r"""
+=================
+Escape Profile
+=================
+
+Use
+`optimal transport method <https://en.wikipedia.org/wiki/Transportation_theory_(mathematics)>`_
+to compare
+`phage-dms <https://www.sciencedirect.com/science/article/pii/S2589004220308142>`_
+escape profiles.
+See the escape profile :ref:`description and examples <sec_escape_profile_comparisons>`.
+"""
+
+# dependencies
+import pandas as pd
+import numpy as np
+import xarray as xr
+import ot
+from phippery.utils import id_query
+from Bio.Align import substitution_matrices
+
+
+def _get_aa_ordered_list():
+ """
+    Return the ordered list of amino acids.
+    This convention is based on the BLOSUM
+    matrix in biopython and is assumed for the
+    binned distribution presenting amino
+    acid contributions to differential
+    selection at a site.
+ """
+ return list("ARNDCQEGHILKMFPSTWYV")
+
+
+def get_cost_matrix():
+ """
+ Returns the default 40x40 cost matrix based on BLOSUM62
+ and assigns maximum cost to transport between
+ opposite signed differential selection contributions.
+
+ Returns
+ -------
+ list :
+ 40x40 matrix (as a list of lists)
+ """
+
+ substitution_matrix = substitution_matrices.load("BLOSUM62")
+ alphabet_list = _get_aa_ordered_list()
+ Naa = len(alphabet_list)
+
+ # chosen so that range of costs in the
+ # matrix is within an order of magnitude
+ nthroot = 7.0
+
+ # maximum cost assigned by the cost matrix
+ maxMij = np.exp(np.max(-substitution_matrix) / nthroot)
+
+ cost_matrix = []
+
+ # first 20 rows
+ for aa in alphabet_list:
+ row = [-x / nthroot for x in substitution_matrix[aa, :][:Naa]]
+ cost_row = (np.exp(row)).tolist() + [maxMij for i in range(Naa)]
+ cost_matrix.append(cost_row)
+
+ # last 20 rows
+ for aa in alphabet_list:
+ row = [-x / nthroot for x in substitution_matrix[aa, :][:Naa]]
+ cost_row = [maxMij for i in range(Naa)] + (np.exp(row)).tolist()
+ cost_matrix.append(cost_row)
+
+ return cost_matrix
+
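+# --- usage sketch (illustrative only) ---
+# The cost matrix is 40x40: indices 0-19 are the negative-selection bins and
+# 20-39 the positive-selection bins, both in _get_aa_ordered_list() order.
+#
+#   M = get_cost_matrix()
+#   assert len(M) == 40 and len(M[0]) == 40
+#   M[0][20]  # maximum cost: opposite-signed contributions for alanine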
+
+def get_loc_esc_distr(ds, metric, sample_factor, sfact_val, loc):
+ """
+ Returns the normalized distribution represented as a list
+ for the amino acid pattern of scaled differential
+ selection for a specified individual and amino acid site.
+
+ Parameters
+ ----------
+ ds : xarray.DataSet
+ The dataset containing the sample of interest.
+
+ metric : str
+ The name of the scaled differential selection data in ds.
+
+ sample_factor : str
+ The sample annotation label to identify the individual sample (e.g. 'sample_ID').
+
+ sfact_val : str
+ The sample_factor value to identify the sample of interest.
+
+ loc : int
+ The location number for the amino acid site of interest.
+
+ Returns
+ -------
+ list :
+ The relative contributions to the total absolute scaled differential selection at the site.
+ The first 20 entries are contributions to negative selection (binding loss).
+ The last 20 entries are contributions to positive selection (binding gain).
+ """
+
+ # Code assumes peptide annotation for location is called 'Loc',
+ # and for amino acid substitution is called 'aa_sub'
+ my_ds = ds.loc[
+ dict(
+ peptide_id=id_query(ds, f"Loc == '{loc}'", "peptide"),
+ sample_id=id_query(ds, f"{sample_factor} == '{sfact_val}'")
+ )
+ ]
+
+ diff_sel = my_ds[metric].to_pandas().to_numpy().flatten()
+
+ my_df = my_ds.peptide_table.loc[:, ["aa_sub"]].to_pandas()
+ my_df["diff_sel"] = diff_sel
+
+ esc_data_neg = []
+ esc_data_pos = []
+ for aa in _get_aa_ordered_list():
+ val = my_df[my_df["aa_sub"] == aa]["diff_sel"].item()
+ if val > 0:
+ esc_data_neg.append(0)
+ esc_data_pos.append(val)
+ else:
+ esc_data_neg.append(-val)
+ esc_data_pos.append(0)
+
+ esc_data = esc_data_neg + esc_data_pos
+
+ if np.sum(esc_data) == 0:
+ return esc_data
+ else:
+ return esc_data / np.sum(esc_data)
+
+
+def _get_weights(ds, metric, sample_factor, sfact_val1, sfact_val2, loc_start, loc_end):
+ """
+ return the list of weights for computing the
+ weighted sum of similarity scores in region
+ [loc_start, loc_end]
+ """
+
+ # Code assumes peptide annotation for location is called 'Loc'
+
+ loc_sums1 = []
+ loc_sums2 = []
+ for loc in range(loc_start, loc_end + 1):
+ ds1 = ds.loc[
+ dict(
+ peptide_id=id_query(ds, f"Loc == '{loc}'", "peptide"),
+ sample_id=id_query(ds, f"{sample_factor} == '{sfact_val1}'")
+ )
+ ]
+
+ diff_sel1 = ds1[metric].to_pandas().to_numpy().flatten()
+ loc_sums1.append(0)
+ for val in diff_sel1:
+ loc_sums1[-1] = loc_sums1[-1] + abs(val)
+
+ ds2 = ds.loc[
+ dict(
+ peptide_id=id_query(ds, f"Loc == '{loc}'", "peptide"),
+ sample_id=id_query(ds, f"{sample_factor} == '{sfact_val2}'")
+ )
+ ]
+
+ diff_sel2 = ds2[metric].to_pandas().to_numpy().flatten()
+ loc_sums2.append(0)
+ for val in diff_sel2:
+ loc_sums2[-1] = loc_sums2[-1] + abs(val)
+
+ loc_sums1 = loc_sums1 / np.sum(loc_sums1)
+ loc_sums2 = loc_sums2 / np.sum(loc_sums2)
+
+ weights = {}
+ total = 0
+    for i, loc in enumerate(range(loc_start, loc_end + 1)):
+ val = min(loc_sums1[i], loc_sums2[i])
+ total = total + val
+ weights[loc] = val
+
+ weights = {k: v / total for k, v in weights.items()}
+
+ return weights
+
+
+def compute_sim_score(a, b, cost_matrix):
+ """
+ Returns the similarity score given
+ two distributions and the cost matrix.
+
+ Parameters
+ ----------
+ a : list
+ A distribution of relative contribution for each amino acid to scaled differential selection.
+
+ b : list
+ Another distribution of relative contribution for each amino acid to scaled differential selection.
+
+ cost_matrix : list
+ The cost matrix to evaluate optimal transport from a to b.
+
+ Returns
+ -------
+ float :
+ The similarity score (reciprocal of the optimal transport cost).
+ """
+
+ if np.sum(a) == 0 or np.sum(b) == 0:
+ return 0
+
+ cost = ot.emd2(a, b, cost_matrix)
+ return 1.0 / cost
+
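+# --- numeric sketch (illustrative only; the toy distributions are made up) ---
+# Two profiles with all mass on alanine, but with opposite signs of
+# differential selection, transport all mass at the maximum matrix cost and
+# therefore receive the smallest nonzero similarity score:
+#
+#   cost = get_cost_matrix()
+#   a = [0.0] * 40; a[0] = 1.0   # negative-selection alanine bin
+#   b = [0.0] * 40; b[20] = 1.0  # positive-selection alanine bin
+#   compute_sim_score(a, b, cost)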
+
+def loc_sim_score(ds, metric, cost_matrix, sample_factor, sfact_val1, sfact_val2, loc):
+ """
+ Returns the similarity score for comparison at a site between two samples.
+
+ Parameters
+ ----------
+ ds : xarray.DataSet
+ The dataset containing the sample of interest.
+
+ metric : str
+ The name of the scaled differential selection data in ds.
+
+ cost_matrix : list
+ The cost matrix to evaluate optimal transport between two distributions.
+
+ sample_factor : str
+ The sample annotation label to identify the samples (e.g. 'sample_ID').
+
+ sfact_val1 : str
+ The sample_factor value to identify sample 1.
+
+ sfact_val2 : str
+ The sample_factor value to identify sample 2.
+
+ loc : int
+ The location number for the amino acid site of interest.
+
+ Returns
+ -------
+ float :
+ The similarity score at the amino acid site.
+ """
+
+ a = get_loc_esc_distr(ds, metric, sample_factor, sfact_val1, loc)
+ b = get_loc_esc_distr(ds, metric, sample_factor, sfact_val2, loc)
+
+ return compute_sim_score(a, b, cost_matrix)
+
+
+def region_sim_score(
+ ds, metric, cost_matrix, sample_factor, sfact_val1, sfact_val2, loc_start, loc_end
+):
+ """
+ Returns the similarity score for comparison in the
+ region [loc_start, loc_end].
+
+ Parameters
+ ----------
+ ds : xarray.DataSet
+ The dataset containing the sample of interest
+
+ metric : str
+ The name of the scaled differential selection data in ds.
+
+ cost_matrix : list
+ The cost matrix to evaluate optimal transport between two distributions.
+
+ sample_factor : str
+ The sample annotation label to identify the samples (e.g. 'sample_ID').
+
+ sfact_val1 : str
+ The sample_factor value to identify sample 1.
+
+ sfact_val2 : str
+ The sample_factor value to identify sample 2.
+
+ loc_start : int
+ The location number for the first amino acid site in the region of interest.
+
+ loc_end : int
+ The location number for the last amino acid site in the region of interest.
+
+ Returns
+ -------
+ float :
+ The similarity score for the region.
+ """
+
+ weights = _get_weights(
+ ds, metric, sample_factor, sfact_val1, sfact_val2, loc_start, loc_end
+ )
+ region_sim = 0
+ for loc in range(loc_start, loc_end + 1):
+ sim = weights[loc] * loc_sim_score(
+ ds, metric, cost_matrix, sample_factor, sfact_val1, sfact_val2, loc
+ )
+ region_sim = region_sim + sim
+
+ return region_sim
+
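+# --- end-to-end usage sketch (illustrative only) ---
+# Assumes a phage-dms dataset `ds` with a scaled differential selection layer
+# (the layer name below is hypothetical) and a 'sample_ID' sample annotation:
+#
+#   cost = get_cost_matrix()
+#   sim = region_sim_score(
+#       ds, "scaled_diff_sel", cost,
+#       "sample_ID", "subject_1", "subject_2",
+#       loc_start=540, loc_end=690,
+#   )
+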
+r"""
+=================
+Modeling
+=================
+
+A collection of interfaces for modeling binding enrichments,
+and estimating significance.
+"""
+
+import numpy as np
+import xarray as xr
+import copy
+import scipy.stats as st
+from phippery.gampois import fit_gamma
+from phippery.gampois import gamma_poisson_posterior_rates
+from phippery.gampois import mlxp_gamma_poisson
+from phippery.zscore import zscore_pids_binning
+from phippery.zscore import compute_zscore
+
+
+def gamma_poisson_model(
+ ds,
+ starting_alpha=0.8,
+ starting_beta=0.1,
+ trim_percentile=99.9,
+ data_table="size_factors",
+ inplace=True,
+ new_table_name="gamma_poisson_mlxp",
+):
+ r"""Fit a Gamma distribution to determine Poisson rates
+ per peptide for the non-specific binding background and estimate the
+ :math:`-\log_{10}(p)` value, or *mlxp*,
+ for each sample-peptide enrichment in the dataset provided.
+ We use the following parameterization of the Gamma distribution:
+
+ .. math::
+ f(x) = \frac{\beta^\alpha}{\Gamma(\alpha)}x^{\alpha-1}e^{-\beta x}
+
+ The fit is performed on the distribution of peptide average counts to
+ obtain :math:`\alpha` and :math:`\beta`. If there are :math:`n`
+ samples involved in the fit, and for a given peptide with counts
+ :math:`x_1, x_2, \ldots, x_n`, the background Poisson distribution
+ is determined by the rate,
+
+ .. math::
+ \lambda = \frac{\alpha + \sum^n_{k=1} x_k}{\beta + n}
+
+ Note
+ ----
+    Much of this source code is derived from
+ https://github.com/lasersonlab/phip-stat
+ and written by Uri Laserson.
+
+
+ Parameters
+ ----------
+ ds : xarray.DataSet
+ The dataset you would like to fit to.
+
+ starting_alpha : float
+ Initial value for the shape parameter of the Gamma distribution.
+
+ starting_beta : float
+ Initial value for the rate parameter of the Gamma distribution.
+
+ trim_percentile : float
+ The percentile cutoff for removing peptides with very high counts.
+ (e.g. a value of 98 means peptides in the highest 2% in counts
+ would be removed from the fit)
+ This parameter is used to remove potential signal peptides that
+ would bias the fit.
+
+ data_table : str
+ The name of the enrichment layer you would like to fit mlxp to.
+
+ new_table_name : str
+ The name of the new layer you would like to append to the dataset.
+
+ inplace : bool
+ If True, then this function
+ appends a dataArray to ds which is indexed with the same coordinate dimensions as
+ 'data_table'. If False, a copy of ds is returned with the appended dataArray
+
+ Returns
+ -------
+ tuple :
+        The alpha and beta values of the fit. If inplace is False, a copy of the new dataset
+        is returned as the second element.
+ """
+
+ print(
+ f"Running new modeling with starting alpha = {starting_alpha}, beta = {starting_beta}"
+ )
+
+ if data_table not in ds:
+ raise KeyError(f"{data_table} is not included in dataset.")
+
+ counts = copy.deepcopy(ds[f"{data_table}"].to_pandas())
+ counts = counts.round(2)
+
+ upper_bound = st.scoreatpercentile(counts.values, trim_percentile)
+ trimmed_means = np.ma.mean(
+ np.ma.masked_greater(counts.values, upper_bound), axis=1
+ ).data
+ alpha, beta = fit_gamma(
+ trimmed_means, starting_alpha=starting_alpha, starting_beta=starting_beta
+ )
+ background_rates = gamma_poisson_posterior_rates(counts, alpha, beta, upper_bound)
+ counts.loc[:, :] = mlxp_gamma_poisson(counts, background_rates)
+
+ if inplace:
+ ds[new_table_name] = xr.DataArray(counts, dims=ds[data_table].dims)
+ return (alpha, beta)
+ else:
+ ds_copy = copy.deepcopy(ds)
+ ds_copy[new_table_name] = xr.DataArray(counts, dims=ds[data_table].dims)
+ return (alpha, beta), ds_copy
+
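+# --- usage sketch (illustrative only) ---
+# Assumes `ds` carries a "size_factors" layer, e.g. from
+# phippery.normalize.size_factors:
+#
+#   alpha, beta = gamma_poisson_model(ds, trim_percentile=99.9)
+#   ds["gamma_poisson_mlxp"]  # appended -log10(p) values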
+
+def zscore(
+ ds,
+ beads_ds,
+ data_table="cpm",
+ min_Npeptides_per_bin=300,
+ lower_quantile_limit=0.05,
+ upper_quantile_limit=0.95,
+ inplace=True,
+ new_table_name="zscore",
+):
+ r"""Calculate a Z-score of empirical enrichment relative to
+ expected background mean CPM (:math:`\mu`) and stddev CPM (:math:`\sigma`)
+ from beads-only samples,
+ for each sample-peptide enrichment in the dataset provided.
+ For a peptide with CPM :math:`n`, the Z-score is,
+
+ .. math::
+ z = \frac{n - \mu}{\sigma}
+
+ Note
+ ----
+ This implementation follows the method described in the
+ supplement to DOI:10.1126/science.aay6485.
+
+ Parameters
+ ----------
+ ds : xarray.DataSet
+ The dataset containing samples to estimate significance on.
+
+ beads_ds : xarray.DataSet
+ The dataset containing beads only control samples to estimate
+ background means and stddevs.
+
+ min_Npeptides_per_bin : int
+ Minimum number of peptides per bin.
+
+ lower_quantile_limit : float
+ Counts below this quantile are ignored for computing background mean and stddev.
+
+ upper_quantile_limit : float
+ Counts above this quantile are ignored for computing background mean and stddev.
+
+ data_table : str
+ The name of the enrichment layer from which you would like to compute Z-scores.
+
+ new_table_name : str
+ The name of the new layer you would like to append to the dataset.
+
+ inplace : bool
+ If True, then this function
+ appends a dataArray to ds which is indexed with the same coordinate dimensions as
+ 'data_table'. If False, a copy of ds is returned with the appended dataArray
+
+ Returns
+ -------
+ None, xarray.DataSet :
+ If inplace is false, a copy of the new dataset is returned.
+ """
+
+ binning = zscore_pids_binning(beads_ds, data_table, min_Npeptides_per_bin)
+
+ zscore_table = copy.deepcopy(ds[f"{data_table}"].to_pandas())
+ zs_df, mu_df, sigma_df = compute_zscore(
+ ds, data_table, binning, lower_quantile_limit, upper_quantile_limit
+ )
+ zscore_table.loc[:, :] = zs_df
+
+ if inplace:
+ ds[new_table_name] = xr.DataArray(zscore_table, dims=ds[data_table].dims)
+ return None
+ else:
+ ds_copy = copy.deepcopy(ds)
+ ds_copy[new_table_name] = xr.DataArray(zscore_table, dims=ds[data_table].dims)
+ return ds_copy
+
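+# --- usage sketch (illustrative only) ---
+# Assumes `ds` has a "cpm" layer (see phippery.normalize.counts_per_million)
+# and a sample annotation (here hypothetically 'control_status') marking the
+# beads-only samples:
+#
+#   from phippery.utils import ds_query
+#   beads_ds = ds_query(ds, "control_status == 'beads_only'")
+#   zscore(ds, beads_ds, data_table="cpm")
+#   ds["zscore"]
+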
+"""
+=================
+Normalize
+=================
+
+A set of useful functions for normalizing enrichments
+in a phippery dataset.
+"""
+
+import numpy as np
+from numpy.linalg import svd
+import xarray as xr
+import pandas as pd
+import copy
+
+from phippery.utils import iter_groups
+from phippery.utils import id_query
+from phippery.utils import to_tall
+
+
+def standardized_enrichment(
+ ds,
+ lib_ds,
+ beads_ds,
+ data_table="counts",
+ inplace=True,
+ new_table_name="std_enrichment",
+):
+ """Compute standardized enrichment of sample counts.
+ This is the *fold enrichment* of each sample's frequency
+ compared to the library average frequency, minus the mock IP
+ (beads only control) frequency.
+
+ Note
+ ----
+    Pseudo counts and the exact calculation are
+    derived from the Bloom Lab's normalization
+    heuristic formulated for differential selection. See
+ https://jbloomlab.github.io/dms_tools2/diffsel.html#id5
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset you would like to fit to
+
+ lib_ds : xarray.DataSet
+ The dataset of phage library control samples used in normalization
+
+ beads_ds : xarray.DataSet
+ The dataset of beads only control samples used in normalization
+
+ data_table : str
+        The name of the enrichment layer you would like to normalize.
+
+ new_table_name : str
+ The name of the new layer you would like to append to the dataset.
+
+ inplace : bool
+ If True, then this function
+ appends a dataArray to ds which is indexed with the same coordinate dimensions as
+ 'data_table'. If False, a copy of ds is returned with the appended dataArray
+
+
+ Returns
+ -------
+
+ xarray.DataSet :
+ If inplace is False, return a new DataSet object which has
+ the enrichment values appended
+
+ Example
+ -------
+ Given a dataset, ``ds``, with the following samples and
+ counts across 10 peptides:
+
+ >>> phippery.get_annotation_table(ds, "sample")
+ sample_metadata fastq_filename reference seq_dir sample_type
+ sample_id
+ 0 sample_0.fastq refa expa beads_only
+ 1 sample_1.fastq refa expa beads_only
+ 2 sample_2.fastq refa expa library
+ 3 sample_3.fastq refa expa library
+ 4 sample_4.fastq refa expa IP
+ 5 sample_5.fastq refa expa IP
+ 6 sample_6.fastq refa expa IP
+ 7 sample_7.fastq refa expa IP
+ 8 sample_8.fastq refa expa IP
+ 9 sample_9.fastq refa expa IP
+ >>> ds.counts
+ <xarray.DataArray 'counts' (peptide_id: 5, sample_id: 10)>
+ array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
+ [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
+ [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
+ [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
+ [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])
+ Coordinates:
+ * sample_id (sample_id) int64 0 1 2 3 4 5 6 7 8 9
+ * peptide_id (peptide_id) int64 0 1 2 3 4
+
+    First, separate the dataset into its various sample types using
+ :func:`phippery.utils.ds_query`
+
+ >>> ip_ds = ds_query(ds, "sample_type == 'IP'")
+ >>> lib_ds = ds_query(ds, "sample_type == 'library'")
+ >>> mockip_ds = ds_query(ds, "sample_type == 'beads_only'")
+
+ We can then compute standardized fold enrichment like so:
+
+ >>> phippery.normalize.standardized_enrichment(ip_ds, lib_ds, mockip_ds)
+
+ which will modify the ``ip_ds`` dataset inplace to include a new table
+
+ >>> ip_ds.std_enrichment
+ <xarray.DataArray 'std_enrichment' (peptide_id: 5, sample_id: 6)>
+ array([[0., 0., 0., 0., 0., 0.],
+ [0., 0., 0., 0., 0., 0.],
+ [0., 0., 0., 0., 0., 0.],
+ [0., 0., 0., 0., 0., 0.],
+ [0., 0., 0., 0., 0., 0.]])
+ Coordinates:
+ * sample_id (sample_id) int64 4 5 6 7 8 9
+ * peptide_id (peptide_id) int64 0 1 2 3 4
+
+    Note that we expect the result to be all zeros because
+    a 1-to-1 fold enrichment of IP's to library samples, minus a
+    1-to-1 fold enrichment of beads to library, is zero.
+ """
+
+ if data_table not in ds:
+ avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
+ raise KeyError(
+ f"{data_table} is not included in dataset. \n available datasets: {avail}"
+ )
+
+ for control in [lib_ds, beads_ds]:
+ if not isinstance(control, xr.Dataset):
+ raise ValueError(
+ "ds_lib_control_indices must be of type list, even if there is only a single value"
+ )
+
+ std_enrichments = xr.DataArray(
+ _comp_std_enr(
+ counts=ds[data_table].to_pandas(),
+ lib_counts=lib_ds[data_table].to_pandas(),
+ mock_ip_counts=beads_ds[data_table].to_pandas(),
+ ),
+ dims=ds[data_table].dims,
+ )
+
+ if inplace:
+ ds[new_table_name] = std_enrichments
+ return None
+ else:
+ ds_copy = copy.deepcopy(ds)
+ ds_copy[new_table_name] = std_enrichments
+ return ds_copy
+
+
+def _comp_std_enr(counts, lib_counts, mock_ip_counts):
+ """Computes standardized enrichment."""
+
+ normalized_ds_counts = copy.deepcopy(counts)
+
+ # find controls and average all
+ ds_lib_counts_mean = lib_counts.mean(axis=1)
+ ds_bead_counts_mean = mock_ip_counts.mean(axis=1)
+ ds_lib_counts_mean_sum = sum(ds_lib_counts_mean)
+
+ # compute beads control std enrichment
+ pseudo_sample = ds_bead_counts_mean + max(
+ 1, sum(ds_bead_counts_mean) / ds_lib_counts_mean_sum
+ )
+ pseudo_lib_counts = ds_lib_counts_mean + max(
+ 1, ds_lib_counts_mean_sum / sum(ds_bead_counts_mean)
+ )
+ pseudo_sample_freq = pseudo_sample / sum(pseudo_sample)
+ pseudo_lib_counts_freq = pseudo_lib_counts / sum(pseudo_lib_counts)
+ pseudo_bead_enrichment = pseudo_sample_freq / pseudo_lib_counts_freq
+
+ # compute all sample standardized enrichment
+ for sample_id, sample in counts.items():
+ pseudo_sample = sample + max(1, sum(sample) / ds_lib_counts_mean_sum)
+ pseudo_lib_counts = ds_lib_counts_mean + max(
+ 1, ds_lib_counts_mean_sum / sum(sample)
+ )
+ pseudo_sample_freq = pseudo_sample / sum(pseudo_sample)
+ pseudo_lib_counts_freq = pseudo_lib_counts / sum(pseudo_lib_counts)
+ sample_enrichment = pseudo_sample_freq / pseudo_lib_counts_freq
+ normalized_ds_counts.loc[:, sample_id] = (
+ sample_enrichment - pseudo_bead_enrichment
+ )
+
+ return normalized_ds_counts
+
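+# --- numeric sketch (illustrative only; toy counts) ---
+# With identical 1-to-1 counts in samples, library, and beads controls, the
+# standardized enrichment is zero everywhere, matching the docstring example:
+#
+#   import pandas as pd
+#   counts = pd.DataFrame({4: [1.0, 1.0], 5: [1.0, 1.0]})
+#   lib = pd.DataFrame({2: [1.0, 1.0], 3: [1.0, 1.0]})
+#   beads = pd.DataFrame({0: [1.0, 1.0], 1: [1.0, 1.0]})
+#   _comp_std_enr(counts, lib, beads)  # all zeros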
+
+def enrichment(
+ ds,
+ lib_ds,
+ data_table="counts",
+ inplace=True,
+ new_table_name="enrichment",
+):
+ """This function computes fold enrichment in the same fashion as
+    the **standardized_enrichment**, but does *not* subtract the beads only controls.
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset you would like to fit to
+
+ lib_ds : xarray.DataSet
+ The dataset of phage library control samples used in normalization
+
+ data_table : str
+        The name of the enrichment layer you would like to normalize.
+
+ new_table_name : str
+ The name of the new layer you would like to append to the dataset.
+
+ inplace : bool
+ If True, then this function
+ appends a dataArray to ds which is indexed with the same coordinate dimensions as
+ 'data_table'. If False, a copy of ds is returned with the appended dataArray
+
+ Returns
+ -------
+
+ xarray.DataSet :
+ If inplace is False, return a new DataSet object which has
+ the enrichment values appended
+ """
+
+ if data_table not in ds:
+ avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
+ raise KeyError(
+ f"{data_table} is not included in dataset. \n available datasets: {avail}"
+ )
+
+ if not isinstance(lib_ds, xr.Dataset):
+ raise ValueError(
+ "ds_lib_control_indices must be of type list, even if there is only a single value"
+ )
+
+ enrichments = xr.DataArray(
+ _comp_enr(
+ counts=ds[data_table].to_pandas(), lib_counts=lib_ds[data_table].to_pandas()
+ ),
+ dims=ds[data_table].dims,
+ )
+
+ if inplace:
+ ds[new_table_name] = enrichments
+ return None
+ else:
+ ds_copy = copy.deepcopy(ds)
+ ds_copy[new_table_name] = enrichments
+ return ds_copy
+
+
+def _comp_enr(counts, lib_counts):
+ """Compute enrichment values."""
+
+ # we are going to add an augmented counts matrix
+ enrichments = copy.deepcopy(counts)
+
+ # find controls and average all
+ lib_counts_mean = lib_counts.mean(axis=1)
+ lib_counts_mean_sum = sum(lib_counts_mean)
+
+    # compute fold enrichment for all samples
+ for sample_id, sample in enrichments.items():
+
+ pseudo_sample = sample + max(1, sum(sample) / lib_counts_mean_sum)
+ pseudo_lib_control = lib_counts_mean + max(1, lib_counts_mean_sum / sum(sample))
+ pseudo_sample_freq = pseudo_sample / sum(pseudo_sample)
+ pseudo_lib_control_freq = pseudo_lib_control / sum(pseudo_lib_control)
+ sample_enrichment = pseudo_sample_freq / pseudo_lib_control_freq
+ enrichments.loc[:, sample_id] = sample_enrichment
+
+ return enrichments
+
+
+def svd_rank_reduction(
+ ds,
+ rank=1,
+ data_table="enrichment",
+ inplace=True,
+ new_table_name="svd_rr",
+):
+ r"""
+ This function computes the singular value decomposition,
+ then recomputes the enrichment matrix up to the rank specified.
+
+    Concretely, given a matrix :math:`X` of enrichments in the
+    `phippery` dataset with shape (peptides, samples), we compute
+    the decomposition :math:`X = USV^{T}`, then return the
+ recomposition using the first
+ ``rank`` eigenvectors and eigenvalues.
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset you would like to fit to
+
+ rank : int
+ The number of ranks to include in the reconstruction.
+
+ data_table : str
+        The name of the enrichment layer you would like to reduce.
+
+ new_table_name : str
+ The name of the new layer you would like to append to the dataset.
+
+ inplace : bool
+ If True, then this function
+ appends a dataArray to ds which is indexed with the same coordinate dimensions as
+ 'data_table'. If False, a copy of ds is returned with the appended dataArray
+
+
+ Returns
+ -------
+
+ xarray.DataSet :
+ If inplace is False, return a new DataSet object which has
+ the enrichment values appended
+ """
+
+ if data_table not in ds:
+ avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
+ raise KeyError(
+ f"{data_table} is not included in dataset. \n available datasets: {avail}"
+ )
+
+ low_rank_dt = copy.deepcopy(ds[data_table].to_pandas())
+
+ # compute rank reduction decomposition matrices
+ U, S, V = svd(low_rank_dt.values)
+
+ # Grab the first X outer products in the finite summation of rank layers.
+ low_rank = U[:, :rank] @ np.diag(S[:rank]) @ V[:rank, :]
+ low_rank_dt.loc[:, :] = low_rank
+ svd_rr_approx = xr.DataArray(low_rank_dt, dims=ds[data_table].dims)
+
+ if inplace:
+ ds[new_table_name] = svd_rr_approx
+ return None
+ else:
+ ds_copy = copy.deepcopy(ds)
+ ds_copy[new_table_name] = svd_rr_approx
+ return ds_copy
+
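+# --- usage sketch (illustrative only) ---
+# A rank-2 reconstruction of an "enrichment" layer:
+#
+#   svd_rank_reduction(ds, rank=2, data_table="enrichment")
+#   ds["svd_rr"]  # low-rank approximation with the shape of the input layer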
+
+def svd_aa_loc(
+ ds,
+ rank=1,
+ data_table="enrichment",
+ protein_name_column="Protein",
+ location_col="Loc",
+ aa_sub_col="aa_sub",
+ inplace=True,
+ new_table_name="svd_rr",
+):
+ """
+ Compute singular value decomposition rank reduction
+ on the aa / loc matrix by pivoting before computing decomposition
+ and re-shaping to add to the dataset.
+
+ Note
+ ----
+
+ This function is meant to be used specifically with phage-dms data
+ where the peptide table includes a "loc" column, and an "aa_sub_col"
+ which specifies the amino acid at that location.
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset you would like to fit to
+
+ data_table : str
+        The name of the enrichment layer you would like to reduce.
+
+ protein_name_column : str
+ The peptide table feature which specifies which protein a specific peptide
+ derives from.
+
+ location_col : str
+ The peptide table feature which specifies the site that a particular peptide
+ is centered at.
+
+ aa_sub_col : str
+ The peptide table feature which specifies the amino acid at a given site.
+
+
+ new_table_name : str
+ The name of the new layer you would like to append to the dataset.
+
+ inplace : bool
+ If True, then this function
+ appends a dataArray to ds which is indexed with the same coordinate dimensions as
+ 'data_table'. If False, a copy of ds is returned with the appended dataArray
+
+ Returns
+ -------
+
+ xarray.DataSet :
+ If inplace is False, return a new DataSet object which has
+ the enrichment values appended
+ """
+
+ if data_table not in ds:
+ avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
+ raise KeyError(
+ f"{data_table} is not included in dataset. \n available datasets: {avail}"
+ )
+
+ low_rank_dt = copy.deepcopy(ds[data_table].to_pandas())
+
+ for prot, prot_ds in iter_groups(ds, protein_name_column, "peptide"):
+
+ for sid in prot_ds.sample_id.values:
+
+ # grab the single sample ds
+ rep_ds = prot_ds.loc[
+ dict(
+ sample_id=[sid],
+ sample_metadata=["sample_ID"],
+ peptide_metadata=[aa_sub_col, location_col],
+ )
+ ]
+
+ # melt
+ tall = to_tall(rep_ds)
+
+ # Pivot so that we get the (aa X Loc)
+ piv = tall.pivot(index=aa_sub_col, columns=location_col, values=data_table)
+
+ # Preserve the indices for population of new enrichment table
+ piv_index = tall.pivot(
+ index=aa_sub_col, columns=location_col, values="peptide_id"
+ )
+
+ # compute rank reduction decomposition matrices
+ U, S, V = svd(piv)
+
+ # Grab the first X outer products in the finite summation of rank layers.
+ low_rank = U[:, :rank] @ np.diag(S[:rank]) @ V[:rank, :]
+
+ low_rank_piv = pd.DataFrame(low_rank, index=piv.index, columns=piv.columns)
+ melted_values = pd.melt(low_rank_piv.reset_index(), id_vars=[aa_sub_col])
+ melted_index = pd.melt(piv_index.reset_index(), id_vars=[aa_sub_col])
+ melted_values["peptide_id"] = melted_index["value"]
+ low_rank_dt.loc[melted_values["peptide_id"], sid] = melted_values[
+ "value"
+ ].values
+
+ svd_rr_approx = xr.DataArray(low_rank_dt, dims=ds[data_table].dims)
+
+ if inplace:
+ ds[new_table_name] = svd_rr_approx
+ return None
+ else:
+ ds_copy = copy.deepcopy(ds)
+ ds_copy[new_table_name] = svd_rr_approx
+ return ds_copy
+
+
+def differential_selection_wt_mut(
+ ds,
+ data_table="enrichment",
+ scaled_by_wt=False,
+ smoothing_flank_size=0,
+ groupby=["Protein"],
+ loc_column="Loc",
+ is_wt_column="is_wt",
+ inplace=True,
+ new_table_name="wt_mutant_differential_selection",
+ relu_bias=None,
+ skip_samples=set(),
+):
+ r"""A generalized function to compute differential selection
+ of amino acid variants in relation to the wildtype sequence.
+    The function computes the log fold change between enrichments
+    of the wildtype and a mutation at any given site.
+
+ Concretely, given some site, :math:`s` (defined by `loc_column`
+ feature in the peptide table) in the enrichment
+ matrix, :math:`X`,
+ the differential selection of any mutation with enrichment, :math:`m`,
+ at a site with wildtype enrichment, :math:`wt`, is :math:`\log_{2}(m/wt)`.
+
+ Note
+ ----
+
+ This function is meant to be used specifically with phage-dms data
+ where the peptide table includes a "loc" column, and an "aa_sub_col"
+ which specifies the amino acid at that location.
+
+ Note
+ ----
+ This calculation of differential selection
+    is derived from the Bloom Lab's formulation described at
+ https://jbloomlab.github.io/dms_tools2/diffsel.html#id5
+
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset you would like to fit to
+
+ data_table : str
+        The name of the enrichment layer to compute differential selection from.
+
+ scaled_by_wt : bool
+ A boolean flag indicating whether or not you would like to multiply the
+ differential selection value by the wildtype enrichment
+
+ smoothing_flank_size : int
+ This parameter should only be used if **scaled_by_wt** is true.
+        By specifying an integer greater than 0, you are scaling the differential
+        selection value by the enrichment values of the wildtype peptides
+        surrounding a given site, in both directions. The integer specified here
+        determines how many peptides are used for the scaling in each direction.
+
+ groupby: list[str]
+        The peptide features to group by, chosen such that site-mutation
+        combinations are unique within each group.
+
+ loc_column : str
+ The peptide table feature which specifies the site that a particular peptide
+ is centered at.
+
+ is_wt_column : str
+ The column specifying which peptides are wildtype.
+
+ relu_bias : int
+ If an integer is specified, then enrichment values less than
+ 1 are replaced by the specified value before computing differential
+ selection.
+
+ skip_samples : set
+ sample id's which you do not want to calculate the differential selection on,
+ such as controls. This function has many nested loops, so avoid computing
+ on unnecessary samples.
+
+ new_table_name : str
+ The name of the new layer you would like to append to the dataset.
+
+ inplace : bool
+ If True, then this function
+ appends a dataArray to ds which is indexed with the same coordinate dimensions as
+ 'data_table'. If False, a copy of ds is returned with the appended dataArray
+
+ Returns
+ -------
+
+ xarray.DataSet :
+ If inplace is False, return a new DataSet object which has
+ the enrichment values appended
+
+ """
+
+ if data_table not in ds:
+ avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
+ raise KeyError(
+ f"{data_table} is not included in dataset. \n available datasets: {avail}"
+ )
+
+ diff_sel = copy.deepcopy(ds[data_table])
+ sw = smoothing_flank_size
+
+ for group, group_ds in iter_groups(ds, groupby, "peptide"):
+
+ wt_pep_id = id_query(group_ds, f"{is_wt_column} == True", "peptide")
+
+ group_loc = group_ds.peptide_table.loc[wt_pep_id, loc_column].values
+ for i, loc in enumerate(group_loc):
+
+ loc_pid = id_query(group_ds, f"{loc_column} == {loc}", "peptide")
+ loc_ds = group_ds.loc[dict(peptide_id=loc_pid)]
+
+            # exclude any samples we were asked to skip
+ sams = set(loc_ds.sample_id.values) - set(skip_samples)
+ for sam_id in sams:
+
+ wt_seq_enr = group_ds[data_table].loc[wt_pep_id, sam_id].values
+ wt_enrichment = float(wt_seq_enr[i])
+ scalar = (
+ _wt_window_scalar(list(wt_seq_enr), i, sw) if scaled_by_wt else 1
+ )
+ values = loc_ds[data_table].loc[:, sam_id].values
+
+ if relu_bias is not None:
+ values[values < 1] = relu_bias
+ dsel = _comp_diff_sel(wt_enrichment, values, scalar)
+
+ diff_sel.loc[list(loc_ds.peptide_id.values), sam_id] = dsel
+ assert diff_sel.loc[wt_pep_id[0], sam_id] == 0.0
+
+ if inplace:
+ ds[new_table_name] = diff_sel
+ return None
+ else:
+ ds_copy = copy.deepcopy(ds)
+ ds_copy[new_table_name] = diff_sel
+ return ds_copy
+
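+# --- usage sketch (illustrative only) ---
+# Assumes phage-dms peptide annotations "Loc" and "is_wt" as described above:
+#
+#   differential_selection_wt_mut(
+#       ds, data_table="enrichment", scaled_by_wt=True,
+#       smoothing_flank_size=1, relu_bias=1,
+#   )
+#   ds["wt_mutant_differential_selection"]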
+
+def _wt_window_scalar(wt_enr, i, flank_size):
+ """
+ Get a scalar from a wt sequence with a certain flank size.
+ """
+
+ if flank_size == 0:
+ return wt_enr[i]
+
+ lcase = i - flank_size < 0
+ rcase = i + flank_size + 1 > len(wt_enr)
+ lflank = wt_enr[i - flank_size : i] if not lcase else wt_enr[:i]
+ rflank = wt_enr[i : i + flank_size + 1] if not rcase else wt_enr[i:]
+ window_enr = lflank + rflank
+ return sum(window_enr) / len(window_enr)
+
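+# --- numeric sketch (illustrative only) ---
+# The window scalar averages wildtype enrichments around site i, truncating
+# at the sequence edges:
+#
+#   wt = [1.0, 2.0, 3.0, 4.0]
+#   _wt_window_scalar(wt, 1, 1)  # (1.0 + 2.0 + 3.0) / 3 = 2.0
+#   _wt_window_scalar(wt, 0, 1)  # left edge: (1.0 + 2.0) / 2 = 1.5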
+
+def differential_selection_sample_groups(
+ ds,
+ sample_feature="library_batch",
+ is_equal_to="batch_a",
+ data_table="counts",
+ aggregate_function=np.mean,
+ inplace=True,
+ new_table_name="sample_group_differential_selection",
+):
+ """This function computes differential selection
+ between groups of samples.
+
+ Note
+ ----
+ This function is still experimental.
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset you would like to fit to
+
+ data_table : str
+        The name of the enrichment layer to compute differential selection from.
+
+ new_table_name : str
+ The name of the new layer you would like to append to the dataset.
+
+ inplace : bool
+ If True, then this function
+ appends a dataArray to ds which is indexed with the same coordinate dimensions as
+ 'data_table'. If False, a copy of ds is returned with the appended dataArray
+
+
+ Returns
+ -------
+
+ xarray.DataSet :
+ If inplace is False, return a new DataSet object which has
+ the enrichment values appended
+
+
+ """
+
+ if data_table not in ds:
+ avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
+ raise KeyError(
+ f"{data_table} is not included in dataset. \n available datasets: {avail}"
+ )
+
+ diff_sel = copy.deepcopy(ds[data_table])
+ group_id = id_query(ds, f"{sample_feature} == '{is_equal_to}'")
+ group_enrichments = ds[data_table].loc[:, group_id].values
+ group_agg = np.apply_along_axis(aggregate_function, 1, group_enrichments)
+ for agg_enrich, peptide_id in zip(group_agg, ds.peptide_id.values):
+ all_other_sample_values = ds[data_table].loc[peptide_id, :].values
+ peptide_diff_sel = _comp_diff_sel(agg_enrich, all_other_sample_values)
+ diff_sel.loc[peptide_id, :] = peptide_diff_sel
+
+ if inplace:
+ ds[new_table_name] = diff_sel
+ return None
+ else:
+ ds_copy = copy.deepcopy(ds)
+ ds_copy[new_table_name] = diff_sel
+ return ds_copy
+
+
+def _comp_diff_sel(base, all_other_values, scalar=1):
+ """
+    A private function to compute the differential selection of a list of values
+    relative to a base value. Optionally, each differential selection value is
+    multiplied by the given scalar.
+ """
+
+ if np.any(np.array(all_other_values) == 0):
+ raise ZeroDivisionError(
+ "All values for which we are computing differential selection must be non-zero"
+ )
+ diff_sel = np.array([np.log2(v / base) for v in all_other_values])
+ return diff_sel * scalar
+
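+# --- numeric sketch (illustrative only) ---
+# Differential selection is the log2 fold change relative to the base value:
+#
+#   _comp_diff_sel(2.0, [1.0, 2.0, 8.0])  # -> array([-1., 0., 2.])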
+
+def size_factors(ds, inplace=True, data_table="counts", new_table_name="size_factors"):
+ r"""
+ Warning
+ -------
+ This method is deprecated. It is currently maintained only for reproducibility of previous results.
+
+
+ Compute size factors from
+ `Anders and Huber 2010
+ <https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-10-r106>`_.
+
+ Concretely, given a Matrix of enrichments, :math:`X_{i,j}`, in the
+ `phippery` dataset with shape (peptides, samples). Consider a *pseudo-reference sample*
+ where the count of each peptide is the geometric mean of counts for that peptide across
+ the samples in the dataset. For a sample to be normalized, calculate for each peptide
+ the ratio of peptide count in the sample to the peptide count in the pseudo-reference
+ sample. The median of this ratio, :math:`\hat S_{j}`, is the size factor to normalize
+ all counts in sample :math:`j`,
+
+ .. math::
+        \hat S_{j} = \underset{i}{\mathrm{median}} \; \frac{X_{i,j}}{(\prod_{v=1}^{m}{X_{i,v}})^{1/m}}
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset you would like to fit to
+
+ data_table : str
+        The name of the enrichment layer to compute size factors from.
+
+ new_table_name : str
+ The name of the new layer you would like to append to the dataset.
+
+ inplace : bool
+ If True, then this function
+ appends a dataArray to ds which is indexed with the same coordinate dimensions as
+ 'data_table'. If False, a copy of ds is returned with the appended dataArray
+
+
+ Returns
+ -------
+
+ xarray.DataSet :
+ If inplace is False, return a new DataSet object which has
+ the enrichment values appended
+ """
+
+ if data_table not in ds:
+ avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
+ raise KeyError(
+ f"{data_table} is not included in dataset. \n available datasets: {avail}"
+ )
+
+ size_factors = _comp_size_factors(ds[data_table].to_pandas().values)
+ sf_da = xr.DataArray(size_factors, dims=ds[data_table].dims)
+
+ if inplace:
+ ds[new_table_name] = sf_da
+ return None
+ else:
+ ds_copy = copy.deepcopy(ds)
+ ds_copy[new_table_name] = sf_da
+ return ds_copy
+
+
+def _comp_size_factors(counts):
+
+ size_factors = copy.deepcopy(counts)
+ masked = np.ma.masked_equal(counts, 0)
+
+    # when nothing is masked (no zero counts), masked.mask is the scalar
+    # np.ma.nomask, and every entry should count toward the geometric mean
+    if not isinstance(masked.mask, np.ndarray):
+        bool_mask = np.full(counts.shape, True, dtype=bool)
+    else:
+        bool_mask = ~masked.mask
+
+ geom_means = (
+ np.ma.exp(np.ma.log(masked).sum(axis=1) / (bool_mask).sum(axis=1))
+ .data[np.newaxis]
+ .T
+ )
+
+ size_factors = (
+ size_factors / np.ma.median(masked / geom_means, axis=0).data
+ ).round(2)
+
+ return size_factors
+
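+# --- numeric sketch (illustrative only; toy counts) ---
+# The second sample is exactly 2x the first, so dividing by the per-sample
+# size factors makes the two columns identical:
+#
+#   import numpy as np
+#   _comp_size_factors(np.array([[2.0, 4.0], [4.0, 8.0]]))
+#   # -> array([[2.83, 2.83], [5.66, 5.66]])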
+
+def counts_per_million(
+ ds, inplace=True, new_table_name="cpm", per_sample=True, data_table="counts"
+):
+ r"""Compute counts per million.
+
+ Concretely, given a Matrix of enrichments, :math:`X`, in the
+ `phippery` dataset with shape P peptides and S samples,
+ we compute the :math:`i^{th}` peptide and :math:`j^{th}` sample position like so:
+
+ .. math::
+
+ \text{cpm}(X,i,j) = \left\{
+ \begin{array}{@{}ll@{}}
+ X_{i,j}/\sum_{p\in P} X_{p,j} \times 10^6, & \text{if per_sample is True} \\[10pt]
+ X_{i,j}/\sum_{p\in P}\sum_{s\in S} X_{p,s} \times 10^6, & \text{if per_sample is False}
+ \end{array}
+ \right.
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset you would like to fit to
+
+ per_sample : bool
+ If True, compute counts per million separately for each sample.
+ Otherwise, frequencies are computed as a ratio of each count to the sum of
+ all counts in the dataset.
+
+ data_table : str
+        The name of the enrichment layer to compute counts per million from.
+
+ new_table_name : str
+ The name of the new layer you would like to append to the dataset.
+
+ inplace : bool
+ If True, then this function
+ appends a dataArray to ds which is indexed with the same coordinate dimensions as
+ 'data_table'. If False, a copy of ds is returned with the appended dataArray
+
+
+ Returns
+ -------
+
+ xarray.DataSet :
+ If inplace is False, return a new DataSet object which has
+ the enrichment values appended
+
+ Example
+ -------
+
+ >>> ds.counts.values
+ array([[8, 8, 3, 1],
+ [1, 3, 7, 4],
+ [9, 0, 5, 1],
+ [5, 2, 4, 4],
+ [1, 4, 1, 3],
+ [3, 4, 4, 0],
+ [4, 8, 2, 7],
+ [4, 3, 6, 5],
+ [0, 2, 9, 1],
+ [0, 5, 6, 5]])
+ >>> from phippery.normalize import counts_per_million
+ >>> counts_per_million(ds, new_table_name="cpm", inplace=True)
+ >>> ds.cpm.values
+ array([[228571.43, 205128.21, 63829.79, 32258.06],
+ [ 28571.43, 76923.08, 148936.17, 129032.26],
+ [257142.86, 0. , 106382.98, 32258.06],
+ [142857.14, 51282.05, 85106.38, 129032.26],
+ [ 28571.43, 102564.1 , 21276.6 , 96774.19],
+ [ 85714.29, 102564.1 , 85106.38, 0. ],
+ [114285.71, 205128.21, 42553.19, 225806.45],
+ [114285.71, 76923.08, 127659.57, 161290.32],
+ [ 0. , 51282.05, 191489.36, 32258.06],
+ [ 0. , 128205.13, 127659.57, 161290.32]])
+ """
+
+ if data_table not in ds:
+ avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
+ raise KeyError(
+ f"{data_table} is not included in dataset. \n available datasets: {avail}"
+ )
+
+ counts = ds[data_table].to_pandas().values
+ cpm = _comp_cpm_per_sample(counts) if per_sample else _comp_cpm(counts)
+ cpm_da = xr.DataArray(cpm, dims=ds[data_table].dims)
+
+ if inplace:
+ ds[new_table_name] = cpm_da
+ return None
+ else:
+ ds_copy = copy.deepcopy(ds)
+ ds_copy[new_table_name] = cpm_da
+ return ds_copy
+
+
+def _comp_cpm(counts):
+
+ ret = copy.deepcopy(counts)
+ return (ret / (ret.sum() / 1e6)).round(2)
+
+
+def _comp_cpm_per_sample(counts):
+
+ ret = copy.deepcopy(counts)
+ return (ret / (ret.sum(axis=0) / 1e6)).round(2)
+
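+# --- numeric sketch (illustrative only) ---
+# With per-sample normalization each column sums to one million:
+#
+#   import numpy as np
+#   _comp_cpm_per_sample(np.array([[8.0, 2.0], [2.0, 8.0]]))
+#   # -> array([[800000., 200000.], [200000., 800000.]])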
+
+def rank_data(
+ ds,
+ data_table="counts",
+ inplace=True,
+ per_sample=False,
+ new_table_name="rank",
+):
+ """Compute the rank of specified enrichment layer.
+ The rank is descending
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset you would like to fit to
+
+ per_sample : bool
+ If True, compute rank separately for each sample.
+        Otherwise, ranks are computed across all values in the layer.
+
+ data_table : str
+        The name of the enrichment layer you would like to rank.
+
+ new_table_name : str
+ The name of the new layer you would like to append to the dataset.
+
+ inplace : bool
+ If True, then this function
+ appends a dataArray to ds which is indexed with the same coordinate dimensions as
+ 'data_table'. If False, a copy of ds is returned with the appended dataArray
+
+
+ Returns
+ -------
+
+ xarray.DataSet :
+ If inplace is False, return a new DataSet object which has
+ the enrichment values appended
+
+ Example
+ -------
+
+ >>> ds["counts"].values
+ array([[533, 734, 399, 588],
+ [563, 947, 814, 156],
+ [ 47, 705, 750, 685],
+ [675, 118, 897, 290],
+ [405, 880, 772, 570],
+ [629, 961, 530, 63],
+ [633, 931, 268, 115],
+ [833, 290, 164, 184],
+ [ 18, 704, 359, 33],
+ [143, 486, 371, 415]])
+ >>> rank_data(ds, data_table="counts", per_sample=True)
+ >>> ds["rank"].values
+ array([[4, 5, 4, 8],
+ [5, 8, 8, 3],
+ [1, 4, 6, 9],
+ [8, 0, 9, 5],
+ [3, 6, 7, 7],
+ [6, 9, 5, 1],
+ [7, 7, 1, 2],
+ [9, 1, 0, 4],
+ [0, 3, 2, 0],
+ [2, 2, 3, 6]])
+ """
+
+ if data_table not in ds:
+ avail = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
+ raise KeyError(
+ f"{data_table} is not included in dataset. \n available datasets: {avail}"
+ )
+
+ counts = ds[data_table].to_pandas().values
+    rank = _comp_rank_per_sample(counts) if per_sample else _comp_rank(counts)
+    rank_da = xr.DataArray(rank, dims=ds[data_table].dims)
+
+    if inplace:
+        ds[new_table_name] = rank_da
+        return None
+    else:
+        ds_copy = copy.deepcopy(ds)
+        ds_copy[new_table_name] = rank_da
+        return ds_copy
+
+
+def _comp_rank(enrichment):
+
+ ret = np.ones(enrichment.shape)
+ sample_data_sh = enrichment.shape
+ sample_data = enrichment.flatten()
+ temp = sample_data.argsort()
+ ranks = np.empty_like(temp)
+ ranks[temp] = np.arange(len(sample_data))
+ ret[:, :] = ranks.reshape(sample_data_sh)
+
+ return ret.astype(int)
+
+
+def _comp_rank_per_sample(enrichment):
+
+ ret = np.ones(enrichment.shape)
+ for sid in range(enrichment.shape[1]):
+ sample_data = enrichment[:, sid]
+ temp = sample_data.argsort()
+ ranks = np.empty_like(temp)
+ ranks[temp] = np.arange(len(sample_data))
+ ret[:, sid] = ranks.flatten()
+
+ return ret.astype(int)
+
+r"""
+=================
+Utils
+=================
+
+Utilities for building, indexing, and manipulating
+an xarray dataset topology
+specific to most **phippery** functions provided in this package
+"""
+
+# dependencies
+import pandas as pd
+import numpy as np
+import xarray as xr
+import scipy.stats as st
+
+# built-in python3
+import os
+import copy
+import itertools
+import pickle
+from functools import reduce
+from collections import defaultdict
+
+
+def iter_groups(ds, by, dim="sample"):
+ """This function returns an iterator
+ yielding subsets of the provided dataset,
+ grouped by items in the metadata of either of the
+ dimensions specified.
+
+ Parameters
+ ----------
+
+    ds : xarray.DataSet
+        The dataset to iterate over.
+
+    by : str or list[str]
+        The annotation feature(s) to group by.
+
+    dim : str
+        The dimension whose annotation table is grouped: "sample" or "peptide".
+
+ Returns
+ -------
+
+ generator :
+ Returns subsets of the original dataset sliced by
+ either sample or peptide table groups.
+
+ Example
+ -------
+
+ >>> phippery.get_annotation_table(ds, "sample")
+ sample_metadata fastq_filename reference seq_dir sample_type
+ sample_id
+ 0 sample_0.fastq refa expa beads_only
+ 1 sample_1.fastq refa expa beads_only
+ 2 sample_2.fastq refa expa library
+ 3 sample_3.fastq refa expa library
+ >>> ds["counts"].values
+ array([[458, 204, 897, 419],
+ [599, 292, 436, 186],
+ [ 75, 90, 978, 471],
+ [872, 33, 108, 505],
+ [206, 107, 981, 208]])
+ >>> sample_groups = iter_groups(ds, by="sample_type")
+ >>> for group, phip_dataset in sample_groups:
+ ... print(group)
+ ... print(phip_dataset["counts"].values)
+ ...
+ beads_only
+ [[458 204]
+ [599 292]
+ [ 75 90]
+ [872 33]
+ [206 107]]
+ library
+ [[897 419]
+ [436 186]
+ [978 471]
+ [108 505]
+ [981 208]]
+ """
+
+ table = get_annotation_table(ds, dim=dim)
+ for group, group_df in table.groupby(by):
+ group_ds = ds.loc[{f"{dim}_id": list(group_df.index.values)}]
+ yield group, group_ds
+
+
+def get_annotation_table(ds, dim="sample"):
+ """
+    Return a copy of the sample or peptide annotation table after converting
+    all the data types, applying the pandas NaN heuristic.
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset to extract an annotation from.
+
+ dim : str
+ The annotation table to grab: "sample" or "peptide".
+
+ Returns
+ -------
+
+ pd.DataFrame :
+ The annotation table.
+
+ """
+
+ st = ds[f"{dim}_table"].to_pandas().convert_dtypes()
+ st.index.name = f"{dim}_id"
+ return st
+
+
+def stitch_dataset(
+ counts,
+ peptide_table,
+ sample_table,
+):
+ """Build an phippery xarray dataset from individual
+ tables.
+
+ Note
+ ----
+    If the counts matrix that you're passing has the shape
+    (M x N) for M peptides and N samples, the
+    sample table should have a length of N,
+    and the peptide table should have a length of M.
+
+ Parameters
+ ----------
+
+ counts : numpy.ndarray
+ The counts matrix for sample peptide enrichments.
+
+ sample_table : pd.DataFrame
+ The sample annotations corresponding to the columns of
+ the counts matrix.
+
+ peptide_table : pd.DataFrame
+ The peptide annotations corresponding to the rows of
+ the counts matrix.
+
+ Returns
+ -------
+
+ xarray.DataSet :
+ The formatted phippery xarray dataset used by most of the phippery functionality.
+
+ """
+
+ # make sure the coordinated match up.
+ assert np.all(counts.columns == sample_table.index)
+ assert np.all(counts.index == peptide_table.index)
+
+ # we are returning the xarray dataset organized by four coordinates seen below.
+ pds = xr.Dataset(
+ {
+ "counts": (["peptide_id", "sample_id"], counts),
+ "sample_table": (["sample_id", "sample_metadata"], sample_table),
+ "peptide_table": (["peptide_id", "peptide_metadata"], peptide_table),
+ },
+ coords={
+ "sample_id": counts.columns.values,
+ "peptide_id": counts.index.values,
+ "sample_metadata": sample_table.columns.values,
+ "peptide_metadata": peptide_table.columns.values,
+ },
+ )
+ return pds
+
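+# --- usage sketch (illustrative only; toy tables) ---
+# Building a minimal 2-peptide x 2-sample dataset; the annotation column
+# names here are hypothetical:
+#
+#   import pandas as pd
+#   counts = pd.DataFrame([[5, 1], [3, 2]])
+#   sample_table = pd.DataFrame({"sample_type": ["IP", "library"]})
+#   peptide_table = pd.DataFrame({"Oligo": ["ATCG", "GCTA"]})
+#   ds = stitch_dataset(counts, peptide_table, sample_table)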
+
+def collect_counts(counts):
+ r"""merge individual tsv files from individual samples alignments
+ into a counts matrix.
+
+ Parameters
+ ----------
+
+ counts : list[str]
+        A list of filepaths, relative to the current working directory, to read in.
+ The filepaths should point to tab-separated files for each sample
+ which contains two columns (without headers):
+
+ 1. peptide ids - the integer peptide identifiers
+ 2. enrichments - the respective enrichments for any peptide id
+
+ Returns
+ -------
+
+ pd.DataFrame :
+ The merged enrichments with peptides as the index, and filenames as column names.
+
+ """
+
+ load = lambda path, sample: pd.read_csv( # noqa
+ path, index_col=0, sep="\t", names=["peptide_id", sample]
+ )
+
+ sample_dataframes = [
+ load(path, int(os.path.basename(path).split(".")[0])) for path in counts
+ ]
+
+ merged_counts_df = reduce(
+ lambda l, r: pd.merge(
+ l, r, how="outer", left_index=True, right_index=True
+ ), # noqa: E741
+ sample_dataframes,
+ ).fillna(0)
+
+ merged_counts_df.columns = merged_counts_df.columns.astype(int)
+ merged_counts_df.index = merged_counts_df.index.astype(int)
+ merged_counts_df.sort_index(inplace=True)
+ merged_counts_df.sort_index(axis=1, inplace=True)
+
+ return merged_counts_df
+
+
+def to_tall(ds: xr.Dataset):
+ """Melt a phippery xarray dataset into a single long-formatted
+ dataframe that has a unique sample peptide interaction on each
+ row. Ideal for ggplotting.
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset to extract an annotation from.
+
+ Returns
+ -------
+
+ pd.DataFrame :
+ The tall formatted dataset.
+
+ Example
+ -------
+ >>> ds["counts"].to_pandas()
+ sample_id 0 1
+ peptide_id
+ 0 453 393
+ 1 456 532
+ 2 609 145
+ >>> to_tall(ds)[["sample_id", "peptide_id", "counts"]]
+ sample_id peptide_id counts
+ 0 0 0 453
+ 1 0 1 456
+ 2 0 2 609
+ 3 1 0 393
+ 4 1 1 532
+ 5 1 2 145
+ """
+
+ return pd.concat([sample_df for sample_df in yield_tall(ds)])
+
+
+def yield_tall(ds: xr.Dataset):
+ """For each sample, yield a tall DataFrame."""
+
+ # Get the table of samples
+ sample_table = ds.sample_table.to_pandas().reset_index().infer_objects()
+
+ # Keep track of the order of columns in all emitted items
+ cnames = None
+
+ # For each sample
+ for sample_id in sample_table["sample_id"].values:
+
+ # Make sure that values for this sample are present in all data tables
+ for dt in set(list(ds.data_vars)) - set(["sample_table", "peptide_table"]):
+ assert (
+ sample_id in ds[f"{dt}"].to_pandas().columns.values
+ ), f"Could not find sample '{sample_id}' in table for {dt}"
+
+ # Make a wide table
+ sample_df = pd.DataFrame(
+ {
+ f"{dt}": ds[f"{dt}"].to_pandas()[sample_id]
+ for dt in set(list(ds.data_vars))
+ - set(["sample_table", "peptide_table"])
+ }
+ ).assign(sample_id=sample_id)
+
+ # Get the table of peptides
+ peptide_table = ds.peptide_table.to_pandas().reset_index().infer_objects()
+
+ # merge the metadata into the melted datatables
+ sample_df = sample_df.merge(peptide_table, on="peptide_id")
+
+ # Merge the sample table
+ sample_df = sample_df.merge(sample_table, on="sample_id")
+
+ # If the column names have not yet been set
+ if cnames is None:
+
+ # Set them based on the first table
+ cnames = sample_df.columns.values
+
+ # If the column names were set in a previous iteration
+ else:
+
+ # Make sure that this table conforms to the same column order
+ sample_df = sample_df.reindex(columns=cnames)
+
+ yield sample_df
+
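+# A minimal sketch using yield_tall to stream per-sample tall tables to a
+# single csv without concatenating them in memory (the path is hypothetical):
+#
+#   for i, sample_df in enumerate(yield_tall(ds)):
+#       sample_df.to_csv("tall.csv", mode="a", header=(i == 0), index=False)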
+
+[docs]def to_wide(ds):
+ """Take a phippery dataset and split it into
+ its separate components in a dictionary.
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset to separate.
+
+ Returns
+ -------
+ dict :
+ The dictionary of annotation tables and enrichment matrices.
+
+ """
+
+ ret = {}
+ enr_layers = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
+ for dt in enr_layers:
+ layer = copy.deepcopy(ds[f"{dt}"].to_pandas())
+ layer.index.name = ""
+ ret[dt] = layer
+
+ for at in ["sample", "peptide"]:
+ ret[at] = get_annotation_table(ds, dim=at)
+
+ return ret
+
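+# A minimal usage sketch for to_wide; with a single "counts" layer the
+# returned dictionary holds three dataframes:
+#
+#   tables = to_wide(ds)
+#   tables["counts"]   # enrichment matrix
+#   tables["sample"]   # sample annotation table
+#   tables["peptide"]  # peptide annotation table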
+
+[docs]def to_wide_csv(ds, file_prefix):
+ """Take a phippery dataset and split it into
+ its separate components at writes each into a
+ comma separated file.
+
+ Note
+ ----
+ This is the inverse operation of the
+ `dataset_from_csv()` utility function.
+ Generally speaking these functions are used for
+ long term storage in common formats when pickle
+ dumped binaries are not ideal.
+
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset to split and write to disk.
+
+ file_prefix : str
+ The file prefix relative to the current working directory
+ where the files should be written.
+
+ Returns
+ -------
+ None
+ """
+
+ enr_layers = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
+ for dt in enr_layers:
+ layer = ds[f"{dt}"].to_pandas()
+ layer.index.name = ""
+ layer.to_csv(f"{file_prefix}_{dt}.csv", na_rep="NA", index_label=None)
+
+ for at in ["sample", "peptide"]:
+ get_annotation_table(ds, dim=at).to_csv(
+ f"{file_prefix}_{at}_annotation_table.csv"
+ )
+
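+# A minimal usage sketch for to_wide_csv; with a single "counts" layer this
+# writes three files (the prefix is hypothetical):
+#
+#   to_wide_csv(ds, "my_experiment")
+#   # -> my_experiment_counts.csv
+#   # -> my_experiment_sample_annotation_table.csv
+#   # -> my_experiment_peptide_annotation_table.csv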
+
+[docs]def id_coordinate_from_query_df(ds, query_df):
+ """Given a dataframe with pandas query statements
+ for both samples and peptides, return the relevant sample
+ and peptide ids after applying the logical AND of all queries.
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset you would like to query.
+
+ query_df : pd.DataFrame
+ A dataframe which must have two columns (with headers):
+
+ 1. "dimension" - either "sample" or "peptide" to specify expression dimension
+ 2. "expression" - The pandas query expression to apply.
+
+ Returns
+ -------
+ tuple : list, list
+ A tuple of sample ids and peptide ids.
+
+ """
+
+ sq = list(query_df.loc[query_df["dimension"] == "sample", "expression"].values)
+ sid = id_query(ds, "sample", " & ".join(sq))
+
+ pq = list(query_df.loc[query_df["dimension"] == "peptide", "expression"].values)
+ pid = id_query(ds, "peptide", " & ".join(pq))
+
+ return sid, pid
+
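+# A minimal usage sketch; the annotation columns queried here ("sample_type",
+# "virus") are hypothetical and must exist in your own annotation tables:
+#
+#   query_df = pd.DataFrame(
+#       {
+#           "dimension": ["sample", "peptide"],
+#           "expression": ["sample_type == 'IP'", "virus == 'zika'"],
+#       }
+#   )
+#   sample_ids, peptide_ids = id_coordinate_from_query_df(ds, query_df)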
+
+[docs]def ds_query(ds, query, dim="sample"):
+ """
+ Apply a sample or peptide query statement to the
+ entire dataset.
+
+ Note
+ ----
+ For more on pandas queries, see
+ `the pandas documentation <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html>`_
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset you would like to query.
+
+ query : str
+ pandas query expression
+
+ dim : str
+ The dimension to apply the expression to; either "sample" or "peptide".
+
+ Returns
+ -------
+ xarray.DataSet :
+ reference to the dataset slice from the given expression.
+
+ Example
+ -------
+
+ >>> phippery.get_annotation_table(ds, "peptide")
+ peptide_metadata Oligo virus
+ peptide_id
+ 0 ATCG zika
+ 1 ATCG zika
+ 2 ATCG zika
+ 3 ATCG zika
+ 4 ATCG dengue
+ 5 ATCG dengue
+ 6 ATCG dengue
+ 7 ATCG dengue
+ >>> zka_ds = ds_query(ds, "virus == 'zika'", dim="peptide")
+ >>> zka_ds["counts"].to_pandas()
+ sample_id 0 1 2 3 4 5 6 7 8 9
+ peptide_id
+ 0 110 829 872 475 716 815 308 647 216 791
+ 1 604 987 776 923 858 985 396 539 32 600
+ 2 865 161 413 760 422 297 639 786 857 878
+ 3 992 354 825 535 440 416 572 988 763 841
+ """
+
+ idx = id_query(ds, query, dim)
+ return ds.loc[{f"{dim}_id": idx}]
+
+
+[docs]def id_query(ds, query, dim="sample"):
+ """
+ Apply a sample or peptide query statement to the
+ entire dataset and retrieve the respective indices.
+
+ Note
+ ----
+ For more on pandas queries, see
+ https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset you would like to query.
+
+ query : str
+ pandas query expression
+
+ dim : str
+ The dimension to apply the expression to; either "sample" or "peptide".
+
+ Returns
+ -------
+ list[int] :
+ The list of integer identifiers that apply to a given expression
+ for the respective dimension.
+ """
+
+ return get_annotation_table(ds, dim).query(query).index.values
+
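+# A minimal usage sketch; slicing the dataset with the returned ids is
+# equivalent to calling ds_query directly ("virus" is a hypothetical column):
+#
+#   pids = id_query(ds, "virus == 'zika'", dim="peptide")
+#   zika_ds = ds.loc[{"peptide_id": pids}]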
+
+[docs]def load(path):
+ """simple wrapper for loading
+ xarray datasets from pickle binary
+
+ Parameters
+ ----------
+
+ path : str
+ Relative path of binary phippery dataset
+
+ Returns
+ -------
+
+ xarray.DataSet :
+ phippery dataset
+ """
+
+ return pickle.load(open(path, "rb"))
+
+
+[docs]def dump(ds, path):
+ """
+ Simple wrapper for dumping an xarray dataset to a pickled binary.
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The dataset you would like to write to disk.
+
+ path : str
+ The relative path you would like to write to.
+
+ Returns
+ -------
+ None
+ """
+
+ pickle.dump(ds, open(path, "wb"))
+
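+# A minimal round-trip sketch with dump and load (the path is hypothetical):
+#
+#   dump(ds, "dataset.phip")
+#   ds_again = load("dataset.phip")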
+
+[docs]def dataset_from_csv(
+ peptide_table_filename, sample_table_filename, counts_table_filename
+):
+ r"""Load a dataset from individual comma separated
+ files containing the counts matrix, as well as
+ sample and peptide annotation tables.
+
+ Note
+ ----
+ This is the inverse operation of the
+ `to_wide_csv()` utility function.
+ Generally speaking these functions are used for
+ long term storage in common formats when pickle
+ dumped binaries are not ideal.
+ For now, this function only supports
+ a single enrichment table, added with the
+ variable name "counts" to the dataset.
+ If you would like to add other transformations of the
+ enrichment table (e.g. cpm, mlxp, etc.), you can
+ load the csv's via pandas and add them to the dataset
+ using the `add_enrichment_layer_from_array` function.
+
+
+ Parameters
+ ----------
+ counts_table_filename : str
+ The filepath to the csv file containing the enrichments.
+ The first column should contain indices which match the
+ given peptide table index column.
+ The first row should contain column headers that match the
+ index of the sample table.
+
+ peptide_table_filename : str
+ The relative filepath to the peptide annotation table.
+
+ sample_table_filename : str
+ The relative filepath to the sample annotation table.
+
+ Returns
+ -------
+ xarray.DataSet :
+ The combined tables in a phippery dataset.
+ """
+
+ counts_df = _collect_counts_matrix(counts_table_filename)
+ peptide_df = _collect_peptide_table(peptide_table_filename)
+ sample_df = _collect_sample_table(sample_table_filename)
+
+ return stitch_dataset(
+ counts=counts_df,
+ peptide_table=peptide_df,
+ sample_table=sample_df,
+ )
+
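+# A minimal usage sketch, assuming files written earlier by to_wide_csv with
+# the (hypothetical) prefix "my_experiment":
+#
+#   ds = dataset_from_csv(
+#       "my_experiment_peptide_annotation_table.csv",
+#       "my_experiment_sample_annotation_table.csv",
+#       "my_experiment_counts.csv",
+#   )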
+
+def _collect_sample_table(sample_table_filename: str):
+ """Read and verify a sample table."""
+
+ sample_table = pd.read_csv(sample_table_filename, sep=",", index_col=0, header=0)
+
+ if sample_table.index.name != "sample_id":
+ raise ValueError("The name of the index must be 'sample_id'")
+
+ if sample_table.index.dtype != "int64":
+ raise ValueError("The index values for sample_id must be inferred as integers")
+
+ sample_table.sort_index(inplace=True)
+ return sample_table
+
+
+def _collect_peptide_table(peptide_table_filename: str):
+ """Read and verify a peptide table."""
+
+ peptide_table = pd.read_csv(peptide_table_filename, sep=",", index_col=0, header=0)
+
+ if peptide_table.index.name != "peptide_id":
+ raise ValueError("The name of the index must be 'peptide_id'")
+
+ if peptide_table.index.dtype != "int64":
+ raise ValueError("The index values for peptide_id must be inferred as integers")
+
+ peptide_table.sort_index(inplace=True)
+ return peptide_table
+
+
+def _collect_counts_matrix(counts_matrix_filename: str):
+ """Read and verify a counts matrix file."""
+
+ counts_matrix = pd.read_csv(counts_matrix_filename, sep=",", index_col=0, header=0)
+
+ try:
+ counts_matrix.columns = counts_matrix.columns.astype(int)
+ except ValueError:
+ raise ValueError(
+ "column header values must be castable to type 'int' to match the sample table index"
+ )
+
+ try:
+ counts_matrix.index = counts_matrix.index.astype(int)
+ except ValueError:
+ raise ValueError(
+ "row index values must be castable to type 'int' to match the peptide table index"
+ )
+
+ counts_matrix.sort_index(inplace=True)
+ counts_matrix.sort_index(axis=1, inplace=True)
+
+ return counts_matrix
+
+
+[docs]def add_enrichment_layer_from_array(ds, enrichment, new_table_name=None, inplace=True):
+ """Append an enrichment layer to the dataset.
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The phippery dataset to append to.
+
+ enrichment : np.array
+ The enrichment matrix to append to the phippery dataset.
+ The number of rows should match the length of ds.peptide_id,
+ and the number of columns should match the length of ds.sample_id.
+
+ new_table_name : str
+ What you would like to name the enrichment layer.
+
+ inplace : bool
+ Determines whether to modify the passed dataset, or return an augmented
+ copy
+
+ Returns
+ -------
+ None | xarray.DataSet
+ The augmented phippery dataset copy is returned if inplace is ``False``
+ """
+
+ if enrichment.shape != ds.counts.shape:
+ ins = enrichment.shape
+ cur = ds.counts.shape
+ pri = f"provided enrichment layer shape: {ins},"
+ pri += f"current working dataset counts shape: {cur}"
+ raise ValueError(
+ f"Enrichments must have the same shape as enrichments in dataset. {pri}"
+ )
+ enr_layers = set(list(ds.data_vars)) - set(["sample_table", "peptide_table"])
+ if new_table_name is None:
+ new_table_name = f"enrichment_layer_{len(enr_layers)+1}"
+ enrichment = xr.DataArray(enrichment, dims=ds.counts.dims)
+ if inplace:
+ ds[new_table_name] = enrichment
+ return None
+ else:
+ ds_copy = copy.deepcopy(ds)
+ ds_copy[new_table_name] = enrichment
+ return ds_copy
+
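+# A minimal sketch adding a counts-per-million style layer; the normalization
+# below is illustrative only, not phippery's own cpm implementation:
+#
+#   raw = ds.counts.values
+#   cpm = raw / raw.sum(axis=0, keepdims=True) * 1e6
+#   add_enrichment_layer_from_array(ds, cpm, new_table_name="cpm")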
+
+def _throw_mismatched_features(df, by):
+ """
+ When you collapse by some set of columns in the dataframe,
+ keep only features which are homogeneous within groups.
+
+ This is similar to the 'DataFrameGroupby.first()' method,
+ but instead of keeping the first factor level appearing for each group
+ category, we throw out any features which are not homogeneous within groups.
+ You could achieve the same functionality by listing features you know to be
+ homogeneous in the 'by' parameter.
+ """
+
+ # Find out which collapse features are shared within groups
+ collapsed_sample_metadata = defaultdict(list)
+ for i, (group, group_df) in enumerate(df.groupby(by)):
+ for column, value in group_df.items():
+ v = value.values
+ # keep the feature if all values are equal, or all are NaN (NaN != NaN)
+ if np.all(v == v[0]) or np.all([n != n for n in v]):
+ collapsed_sample_metadata[column].append(v[0])
+
+ # Throw out features that are not shared between groups
+ to_throw = [
+ key for key, value in collapsed_sample_metadata.items() if len(value) < i + 1
+ ]
+ [collapsed_sample_metadata.pop(key) for key in to_throw]
+ return pd.DataFrame(collapsed_sample_metadata)
+
+
+def _mean_pw_cc_by_multiple_tables(ds, by, dim="sample", data_tables="all"):
+ """
+ A wrapper for computing pairwise correlations within groups defined
+ by the 'by' parameter. Merges the correlations into a
+ single table.
+ """
+
+ # Compute mean pw cc on all possible data tables
+ if data_tables == "all":
+ data_tables = list(set(ds.data_vars) - set(["sample_table", "peptide_table"]))
+
+ # Some error handling
+ if dim not in ["sample", "peptide"]:
+ raise ValueError("parameter 'dim' must be either 'sample' or 'peptide'")
+
+ groups_avail = ds[f"{dim}_metadata"].values
+ for data_table in data_tables:
+ if data_table not in ds:
+ raise KeyError(f"{data_table} is not included in dataset.")
+
+ for group in by:
+ if group not in groups_avail:
+ raise KeyError(
+ f"{group} is not included as a column in the {dim} table. The available groups are {groups_avail}"
+ )
+
+ # Compute mean pw cc on all data layers - resulting in a df for each
+ corr_dfs = [
+ _mean_pw_cc_by(ds, by, data_table=data_table, dim=dim)
+ for data_table in data_tables
+ ]
+
+ # return a single merged df containing info for all data layer pw cc
+ return reduce(
+ lambda l, r: pd.merge(
+ l, r, how="outer", left_index=True, right_index=True
+ ), # noqa: E741
+ corr_dfs,
+ )
+
+
+def _mean_pw_cc_by(ds, by, data_table="counts", dim="sample"):
+ """
+ Computes the pairwise correlation coefficient among all
+ dims within each group defined by the 'by' columns.
+
+ Returns a dataframe with each group, its
+ respective pw_cc, and the number of dims
+ in the group.
+ """
+
+ data = ds[f"{data_table}"].to_pandas()
+
+ groups, pw_cc, n = [], [], []
+
+ for s_group, group_ds in iter_groups(ds, by, dim):
+ groups.append(s_group)
+ n.append(len(group_ds[f"{dim}_id"].values))
+
+ if len(group_ds[f"{dim}_id"].values) < 2:
+ pw_cc.append(1.0)
+ continue
+
+ correlations = []
+ for dim_ids in itertools.combinations(group_ds[f"{dim}_id"].values, 2):
+ dim_0_enrichment = data.loc[:, dim_ids[0]]
+ dim_1_enrichment = data.loc[:, dim_ids[1]]
+ correlation = (
+ st.pearsonr(dim_0_enrichment, dim_1_enrichment)[0]
+ if np.any(dim_0_enrichment != dim_1_enrichment)
+ else 1.0
+ )
+ correlations.append(correlation)
+ pw_cc.append(round(sum(correlations) / len(correlations), 5))
+
+ name = "_".join(by)
+ column_prefix = f"{name}_{data_table}"
+
+ ret = pd.DataFrame(
+ {
+ f"{name}": groups,
+ f"{column_prefix}_pw_cc": np.array(pw_cc).astype(np.float64),
+ f"{column_prefix}_n_reps": np.array(n).astype(int),
+ }
+ ).set_index(name)
+
+ return ret
+
+
+[docs]def collapse_sample_groups(*args, **kwargs):
+ """wrap for sample collapse"""
+ return collapse_groups(*args, **kwargs, collapse_dim="sample")
+
+
+[docs]def collapse_peptide_groups(*args, **kwargs):
+ """wrap for peptide collapse"""
+ return collapse_groups(*args, **kwargs, collapse_dim="peptide")
+
+
+[docs]def collapse_groups(
+ ds, by, collapse_dim="sample", agg_func=np.mean, compute_pw_cc=False, **kwargs
+):
+ """Collapse an xarray dataset along one of the annotation axis
+ by applying the agg_function to annotation groups of 'by'.
+
+ Parameters
+ ----------
+
+ ds : xarray.DataSet
+ The phippery dataset to collapse.
+
+ by : list
+ The names of the annotation features you would like to collapse by.
+
+ collapse_dim : str
+ The dimension you'd like to collapse: "sample" or "peptide"
+
+ compute_pw_cc : bool
+ Whether or not to compute the mean pairwise correlation
+ of all values within any feature group that is being
+ collapsed.
+
+ agg_func : *function*
+ This function must take a one-dimensional array and aggregate
+ all values to a single number, agg_func(list[float | int]) -> float | int
+
+
+ Returns
+ -------
+
+ xarray.DataSet :
+ The collapsed phippery dataset.
+
+ Example
+ -------
+ >>> get_annotation_table(ds, dim="sample")
+ sample_metadata fastq_filename reference seq_dir sample_type
+ sample_id
+ 0 sample_0.fastq refa expa beads_only
+ 1 sample_1.fastq refa expa beads_only
+ 2 sample_2.fastq refa expa library
+ 3 sample_3.fastq refa expa library
+ 4 sample_4.fastq refa expa IP
+ 5 sample_5.fastq refa expa IP
+ >>> ds["counts"].to_pandas()
+ sample_id 0 1 2 3 4 5
+ peptide_id
+ 0 7 0 3 2 3 2
+ 1 6 3 1 0 7 5
+ 2 9 1 7 8 4 7
+ >>> mean_sample_type_ds = collapse_groups(ds, by=["sample_type"])
+ >>> get_annotation_table(mean_sample_type_ds, dim="sample")
+ sample_metadata reference seq_dir sample_type
+ sample_id
+ 0 refa expa IP
+ 1 refa expa beads_only
+ 2 refa expa library
+ >>> mean_sample_type_ds["counts"].to_pandas()
+ sample_id 0 1 2
+ peptide_id
+ 0 2.5 3.5 2.5
+ 1 6.0 4.5 0.5
+ 2 5.5 5.0 7.5
+ """
+
+ # Check to see if the group(s) is/are available
+ groups_avail = ds[f"{collapse_dim}_metadata"].values
+
+ for group in by:
+ if group not in groups_avail:
+ raise KeyError(
+ f"{group} is not included as a column in the {collapse_dim} table. The available groups are {groups_avail}"
+ )
+
+ # define collapse and fixed df
+ dims = set(["sample", "peptide"])
+ fixed_dim = list(dims - set([collapse_dim]))[0]
+
+ # grab relevant annotation tables
+ collapse_df = ds[f"{collapse_dim}_table"].to_pandas()
+ fixed_df = ds[f"{fixed_dim}_table"].to_pandas()
+
+ # Create group-able dataset by assigning table columns to a coordinate
+ if len(by) == 1:
+ coord = collapse_df[by[0]]
+ coord_ds = ds.assign_coords({f"{by[0]}": (f"{collapse_dim}_id", coord)})
+ else:
+ print("WARNING: Nothing available, here")
+ return None
+
+ # if we're grouping by multiple things, we need to zip them into a tuple coord
+ # pseudo-code
+ # else:
+ # common_dim = f"{collapse_dim}_id"
+ # coor_arr = np.empty(len(ds[common_dim]), dtype=object)
+ # coor_arr[:] = list(zip(*(collapse_df[f].values for f in by)))
+ # coord_ds = ds.assign_coords(
+ # coord=xr.DataArray(coor_arr, collapse_dims=common_dim)
+ # )
+
+ del coord_ds["sample_table"]
+ del coord_ds["peptide_table"]
+ del coord_ds["sample_metadata"]
+ del coord_ds["peptide_metadata"]
+
+ # Perform the reduction on all data tables.
+ collapsed_enrichments = coord_ds.groupby(f"{by[0]}", squeeze=False).reduce(agg_func)
+
+ if collapse_dim == "sample":
+ collapsed_enrichments = collapsed_enrichments.transpose()
+
+ # Once the data tables are grouped we have no use for first copy.
+ del coord_ds
+
+ # Compile each of the collapsed xarray variables.
+ collapsed_xr_dfs = {
+ f"{dt}": (
+ ["peptide_id", "sample_id"],
+ collapsed_enrichments[f"{dt}"].to_pandas(),
+ )
+ for dt in set(list(collapsed_enrichments.data_vars))
+ }
+
+ cat = _throw_mismatched_features(collapse_df, by)
+
+ # Compute mean pairwise correlation for all groups,
+ # for all enrichment layers - and add it to the
+ # resulting collapsed sample table
+ if compute_pw_cc:
+ mean_pw_cc = _mean_pw_cc_by(ds, by, **kwargs)
+ cat = cat.merge(mean_pw_cc, left_index=True, right_index=True)
+
+ # Insert the correct annotation tables into our dictionary of variables
+ collapsed_xr_dfs[f"{collapse_dim}_table"] = (
+ [f"{collapse_dim}_id", f"{collapse_dim}_metadata"],
+ cat,
+ )
+
+ collapsed_xr_dfs[f"{fixed_dim}_table"] = (
+ [f"{fixed_dim}_id", f"{fixed_dim}_metadata"],
+ fixed_df,
+ )
+
+ pds = xr.Dataset(
+ collapsed_xr_dfs,
+ coords={
+ f"{collapse_dim}_id": collapsed_xr_dfs[f"{collapse_dim}_table"][
+ 1
+ ].index.values,
+ f"{fixed_dim}_id": collapsed_xr_dfs[f"{fixed_dim}_table"][1].index.values,
+ f"{collapse_dim}_metadata": collapsed_xr_dfs[f"{collapse_dim}_table"][
+ 1
+ ].columns.values,
+ f"{fixed_dim}_metadata": collapsed_xr_dfs[f"{fixed_dim}_table"][
+ 1
+ ].columns.values,
+ },
+ )
+ return pds
+