Skip to content

Commit

Permalink
tests and fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
sanjaynagi committed Jan 8, 2024
1 parent a2f08c2 commit da5beea
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 17 deletions.
42 changes: 28 additions & 14 deletions anoexpress/anoexpress.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,17 +98,17 @@ def resolve_gene_id(gene_id, analysis):

return gene_id

def filter_low_counts(data, data_type, analysis, gene_id, count_threshold=5, func=np.nanmedian):
def filter_low_counts(data_df, data_type, analysis, gene_id, count_threshold=5, func=np.nanmedian):
if data_type != 'log2counts':
count_data = data(data_type='log2counts', analysis=analysis, gene_id=gene_id)
mask = 2**count_data.apply(func=func, axis=1) > count_threshold
else:
mask = 2**data.apply(func=func, axis=1) > count_threshold
mask = 2**data_df.apply(func=func, axis=1) > count_threshold

print(f"Removing {(mask == 0).sum()} genes with median counts below the threshold ({count_threshold})")
mask = mask[mask].index.to_list()

return data.query("GeneID in @mask")
return data_df.query("GeneID in @mask")

def data(data_type, analysis, microarray=False, gene_id=None, sort_by=None, annotations=False, pvalue_filter=None, low_count_filter=None, fraction_na_allowed=None):
"""
Expand Down Expand Up @@ -180,7 +180,7 @@ def data(data_type, analysis, microarray=False, gene_id=None, sort_by=None, anno

# remove low count genes
if low_count_filter is not None:
df = filter_low_counts(data=df, data_type=data_type, analysis=analysis, gene_id=gene_id, low_count_filter=low_count_filter, func=np.nanmedian)
df = filter_low_counts(data_df=df, data_type=data_type, analysis=analysis, gene_id=gene_id, count_threshold=low_count_filter, func=np.nanmedian)

# remove genes with lots of NA
if fraction_na_allowed:
Expand Down Expand Up @@ -411,7 +411,7 @@ def filter_nas(df, fraction_na_allowed):
return df.loc[~na_mask, :]


def load_candidates(analysis, name='median', func=np.nanmedian, query_annotation=None, query_fc=None, microarray=False, fraction_na_allowed=None):
def load_candidates(analysis, name='median', func=np.nanmedian, query_annotation=None, query_fc=None, microarray=False, low_count_filter=None, fraction_na_allowed=None):
"""
Load the candidate genes for a given analysis. Optionally, filter by annotation or fold change data.
Expand All @@ -430,6 +430,8 @@ def load_candidates(analysis, name='median', func=np.nanmedian, query_annotation
filter genes by fold change. Defaults to None
microarray: bool, optional
whether to include the IR-Tex microarray data in the requested data. Default is False.
low_count_filter: int, optional
if provided, genes with a median count below the threshold will be removed before gene ranking. Default is None.
fraction_nas_allowed: float, optional
fraction of missing values allowed in the data. Defaults to 0.5
Expand All @@ -438,7 +440,7 @@ def load_candidates(analysis, name='median', func=np.nanmedian, query_annotation
fc_ranked: pd.DataFrame
"""

fc_data = data(data_type='fcs', analysis=analysis, microarray=microarray, annotations=True, sort_by=None, fraction_na_allowed=fraction_na_allowed)
fc_data = data(data_type='fcs', analysis=analysis, microarray=microarray, annotations=True, sort_by=None, low_count_filter=low_count_filter, fraction_na_allowed=fraction_na_allowed)

if query_annotation is not None:
gene_annot_df = load_annotations()
Expand All @@ -457,7 +459,7 @@ def load_candidates(analysis, name='median', func=np.nanmedian, query_annotation
return(fc_ranked)


def load_genes_for_enrichment(analysis, func, gene_ids, percentile):
def load_genes_for_enrichment(analysis, func, gene_ids, percentile, microarray, low_count_filter=None):

assert func is not None or gene_ids is not None, "either a ranking function (func) or gene_ids must be provided"
assert func is None or gene_ids is None, "Only a ranking function (func) or gene_ids must be provided, not both"
Expand All @@ -468,15 +470,15 @@ def load_genes_for_enrichment(analysis, func, gene_ids, percentile):

if func:
# get top % percentile genes ranked by func
fc_ranked = load_candidates(analysis=analysis, name='enrich', func=func)
fc_ranked = load_candidates(analysis=analysis, name='enrich', func=func, microarray=microarray, low_count_filter=low_count_filter)
percentile_idx = fc_ranked.reset_index()['GeneID'].unique().shape[0] * percentile
top_geneIDs = fc_ranked.reset_index().loc[:, 'GeneID'][:int(percentile_idx)]
elif gene_ids:
top_geneIDs = resolve_gene_id(gene_id=gene_ids, analysis=analysis)

return top_geneIDs, fc_genes

def go_hypergeometric(analysis, func=None, gene_ids=None, percentile=0.05):
def go_hypergeometric(analysis, func=None, gene_ids=None, percentile=0.05, microarray=False, low_count_filter=None):
"""
Perform a hypergeometric test on GO terms of the the top % percentile genes ranked by user input function, or on
a user inputted gene_id list
Expand All @@ -492,13 +494,17 @@ def go_hypergeometric(analysis, func=None, gene_ids=None, percentile=0.05):
list of gene ids to perform hypergeometric test on. Defaults to None
percentile: float, optional
percentile of genes to use for the enriched set in hypergeometric test. Defaults to 0.05
microarray: bool, optional
whether to include the IR-Tex microarray data in the gene ranking. Default is False.
low_count_filter: int, optional
if provided, genes with a median count below the threshold will be removed before gene ranking. Default is None.
Returns
-------
go_hypergeo_results: pd.DataFrame
"""

top_geneIDs, fc_genes = load_genes_for_enrichment(analysis=analysis, func=func, gene_ids=gene_ids, percentile=percentile)
top_geneIDs, fc_genes = load_genes_for_enrichment(analysis=analysis, func=func, gene_ids=gene_ids, percentile=percentile, microarray=microarray, low_count_filter=low_count_filter)

# load gene annotation file
gaf_df = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/AnoExpress/main/resources/AgamP4.gaf", sep="\t")
Expand All @@ -518,7 +524,7 @@ def go_hypergeometric(analysis, func=None, gene_ids=None, percentile=0.05):
return(hyper_geo)


def pfam_hypergeometric(analysis, func=None, gene_ids=None, percentile=0.05):
def pfam_hypergeometric(analysis, func=None, gene_ids=None, percentile=0.05, microarray=False, low_count_filter=None):
"""
Perform a hypergeometric test on PFAM domains of the the top % percentile genes ranked by user input function,
or on a user inputted gene_id list
Expand All @@ -536,13 +542,17 @@ def pfam_hypergeometric(analysis, func=None, gene_ids=None, percentile=0.05):
list of gene ids to perform hypergeometric test on. Defaults to None
percentile: float, optional
percentile of genes to use for the enriched set in hypergeometric test. Defaults to 0.05
microarray: bool, optional
whether to include the IR-Tex microarray data in the gene ranking. Default is False.
low_count_filter: int, optional
if provided, genes with a median count below the threshold will be removed before gene ranking. Default is None.
Returns
-------
pfam_hypergeo_results: pd.DataFrame
"""

top_geneIDs, fc_genes = load_genes_for_enrichment(analysis=analysis, func=func, gene_ids=gene_ids, percentile=percentile)
top_geneIDs, fc_genes = load_genes_for_enrichment(analysis=analysis, func=func, gene_ids=gene_ids, percentile=percentile, microarray=microarray, low_count_filter=low_count_filter)

# load gene annotation file
pfam_df = pd.read_csv("https://github.com/sanjaynagi/AnoExpress/blob/main/resources/Anogam_long.pep_Pfamscan.seqs.gz?raw=true", sep="\s+", header=None, compression='gzip').iloc[:, [0,4]]
Expand All @@ -562,7 +572,7 @@ def pfam_hypergeometric(analysis, func=None, gene_ids=None, percentile=0.05):

return(hyper_geo)

def kegg_hypergeometric(analysis, func=None, gene_ids=None, percentile=0.05):
def kegg_hypergeometric(analysis, func=None, gene_ids=None, percentile=0.05, microarray=False, low_count_filter=None):
"""
Perform a hypergeometric test on GO terms of the the top % percentile genes ranked by user input function.
Expand All @@ -579,13 +589,17 @@ def kegg_hypergeometric(analysis, func=None, gene_ids=None, percentile=0.05):
list of gene ids to perform hypergeometric test on. Defaults to None
percentile: float, optional
percentile of genes to use for the enriched set in hypergeometric test. Defaults to 0.05
microarray: bool, optional
whether to include the IR-Tex microarray data in the gene ranking. Default is False.
low_count_filter: int, optional
if provided, genes with a median count below the threshold will be removed before gene ranking. Default is None.
Returns
-------
go_hypergeo_results: pd.DataFrame
"""

top_geneIDs, fc_genes = load_genes_for_enrichment(analysis=analysis, func=func, gene_ids=gene_ids, percentile=percentile)
top_geneIDs, fc_genes = load_genes_for_enrichment(analysis=analysis, func=func, gene_ids=gene_ids, percentile=percentile, microarray=microarray, low_count_filter=low_count_filter)

# load gene annotation file
kegg_df = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/AnoExpress/main/resources/AgamP4.kegg", sep="\t")
Expand Down
32 changes: 29 additions & 3 deletions tests/test_anoexpress.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ def test_load_results_arrays(data_type, analysis):
assert not df.empty





@pytest.mark.parametrize(
"data_type",
["fcs", "pvals"],
Expand Down Expand Up @@ -59,7 +62,7 @@ def test_data_analyses(analysis):
assert isinstance(data_df, pd.DataFrame)

@pytest.mark.parametrize('data_type', ["fcs", "pvals", "log2counts"])
@pytest.mark.parametrize("gene_id", [None, gene, gene_ids])
@pytest.mark.parametrize("gene_id", [None, gene, gene_ids, "2RL:28,500,500-28,530,000"])
def test_data_types_genes(data_type, gene_id):

data_df = xpress.data(
Expand Down Expand Up @@ -100,6 +103,26 @@ def test_data_pvalue_filter():
assert not data_df.empty
assert isinstance(data_df, pd.DataFrame)

def test_data_low_count_filter(low_count_filter=10):

data_df = xpress.data(
data_type="fcs",
analysis="gamb_colu",
microarray=False,
sort_by=None,
)

data_low_df = xpress.data(
data_type="fcs",
analysis="gamb_colu",
microarray=False,
low_count_filter=low_count_filter,
sort_by=None)

assert data_low_df is not None
assert not data_low_df.empty
assert isinstance(data_low_df, pd.DataFrame)
assert data_df.shape[0] > data_low_df.shape[0]

@pytest.mark.parametrize(
"gene_ids",
Expand Down Expand Up @@ -160,14 +183,17 @@ def test_contig_expression():
)


def test_load_candidates():

df = xpress.load_candidates(analysis="gamb_colu_arab_fun", name="median", func=np.nanmedian, fraction_na_allowed=0.5, low_count_filter=5)

assert isinstance(df, pd.DataFrame)
assert not df.empty




### hypergeometric functions ###


def test_go_hypergeometric():
go = xpress.go_hypergeometric(analysis='gamb_colu_arab_fun', func=np.nanmedian
)
Expand Down

0 comments on commit da5beea

Please sign in to comment.