Skip to content

Commit

Permalink
enable download of gene symbol pathways
Browse files Browse the repository at this point in the history
  • Loading branch information
cwieder committed Nov 7, 2024
1 parent ec53627 commit 1e4e167
Show file tree
Hide file tree
Showing 4 changed files with 171 additions and 91 deletions.
29 changes: 22 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ sspa provides a Python interface for metabolomics pathway analysis. In addition
- Metabolite set enrichment analysis (based on GSEA)
- Single-sample pathway analysis
- Compound identifier conversion
- Pathway database download (KEGG, Reactome, and MetExplore metabolic networks)
- Pathway database download (KEGG, Reactome, and PathBank)

Although this package is designed to provide a user-friendly interface for metabolomics pathway analysis, the methods are also applicable to other datatypes such as normalised RNA-seq data.
Although this package is designed to provide a user-friendly interface for metabolomics pathway analysis, the methods are also applicable to other datatypes such as normalised RNA-seq and proteomics data.

## Documentation and tutorials
This README provides a quickstart guide to the package and its functions. For new users we **highly recommend following our full walkthrough notebook tutorial** available on Google Colab which provides a step-by-step guide to using the package.
Expand Down Expand Up @@ -92,14 +92,29 @@ custom_pathways = sspa.process_gmt("wikipathways-20220310-gmt-Homo_sapiens.gmt")

Download latest version of pathways
```python
# download KEGG latest
# download KEGG latest metabolomics pathways
kegg_mouse_latest = sspa.process_kegg("mmu", download_latest=True, filepath=".")

# download Reactome latest
reactome_mouse_latest = sspa.process_reactome("Mus musculus", download_latest=True, filepath=".", omicstype='metabolomics')
# download Reactome latest metabolomics pathways
reactome_mouse_latest = sspa.process_reactome("Mus musculus", download_latest=True, filepath=".", omics_type='metabolomics')

# download Pathbank latest
pathbank_human_latest = sspa.process_pathbank("Homo sapiens", download_latest=True, filepath=".", omicstype='metabolomics')
# download Pathbank latest metabolomics pathways
pathbank_human_latest = sspa.process_pathbank("Homo sapiens", download_latest=True, filepath=".", omics_type='metabolomics')
```

Download latest version of multi-omics pathways
- For Reactome, users can specify the omics types required via the 'identifiers' argument. Leaving this to None downloads all omics (ChEBI, UniProt, Gene Symbol). Users can specify any combination of `['chebi', 'uniprot', 'gene_symbol']`.
- For KEGG, multi-omics pathways are represented by KEGG gene and KEGG compound identifiers.

```python
# download multi-omics pathways from Reactome (ChEBI, UniProt, Gene Symbol)
reactome_human_mo = sspa.process_reactome('Homo sapiens', download_latest=True, filepath=".", omics_type='multiomics', identifiers=['chebi', 'uniprot', 'gene_symbol'])

# download multi-omics pathways from Reactome (ChEBI and UniProt)
reactome_human_mo = sspa.process_reactome('Homo sapiens', download_latest=True, filepath=".", omics_type='multiomics', identifiers=['chebi', 'uniprot'])

# download multi-omics pathways from KEGG (KEGG gene and KEGG compound)
kegg_mouse_latest = sspa.process_kegg("mmu", download_latest=True, filepath=".", omics_type='multiomics')
```

## Identifier harmonization
Expand Down
2 changes: 1 addition & 1 deletion src/sspa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@
from .sspa_ora import sspa_ora
from .sspa_gsea import sspa_gsea
from .sspa_ssGSEA import sspa_ssGSEA
from .download_pathways import download_KEGG, download_reactome, MetExplorePaths
from .download_pathways import download_KEGG, download_reactome
from .identifier_conversion import identifier_conversion, map_identifiers
222 changes: 142 additions & 80 deletions src/sspa/download_pathways.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,18 @@
import re
import pandas as pd
import warnings
import json
from tqdm import tqdm
import zipfile
import requests
import io

def download_KEGG(organism, filepath=None, omics_type='metabolomics'):
'''
Function for KEGG pathway download
Args:
organism (str): KEGG 3 letter organism code
filepath (str): filepath to save pathway file to, default is None - save to variable
omics_type(str): type of omics pathways to download (metabolomics or multiomics)
Returns:
GMT-like pd.DataFrame containing KEGG pathways
'''
Expand Down Expand Up @@ -125,14 +128,15 @@ def download_KEGG(organism, filepath=None, omics_type='metabolomics'):

return df

def download_reactome(organism, filepath=None, omics_type='metabolomics'):
def download_reactome(organism, filepath=None, omics_type='metabolomics', identifiers=None):
'''
Function for Reactome pathway download
Args:
organism (str): Reactome organism name
filepath (str): filepath (str): filepath to save pathway file to, default is None - save to variable
omics_type(str): type of omics pathways to download.
Options are 'metabolomics' (ChEBI identifiers), 'proteomics' (UniProt identifiers), or 'multiomics' (ChEBI and UniProt identifiers)
Options are 'metabolomics' (ChEBI identifiers), 'proteomics' (UniProt identifiers), 'transcriptomics' (Gene Symbol), or 'multiomics' (ChEBI, UniProt and Gene Symbol identifiers)
identifiers (list): list of identifiers to download for multi-omics pathways, default is None (download all). Options are 'chebi', 'uniprot', 'gene_symbol'
Returns:
GMT-like pd.DataFrame containing Reactome pathways
'''
Expand Down Expand Up @@ -188,32 +192,93 @@ def download_reactome(organism, filepath=None, omics_type='metabolomics'):

print("Complete!")
return pathways_df

if omics_type == 'transcriptomics':
url_gmt = 'https://reactome.org/download/current/ReactomePathways.gmt.zip'
resp_gmt = requests.get(url_gmt)

input_gmt = []
with zipfile.ZipFile(io.BytesIO(resp_gmt.content)) as zip_ref:
gmt_filename = [f for f in zip_ref.namelist() if f.endswith('.gmt')][0]

# Extract the GMT file contents
with zip_ref.open(gmt_filename) as gmt_file:
for i in gmt_file:
i = i.decode('utf-8').strip()
input_gmt.append(i.strip("\n").split("\t"))
pathways_df = pd.DataFrame(input_gmt)
pathways_df = pathways_df.rename({0:"Pathway_name", 1:"Pathway_ID"}, axis=1)
pathways_df.index = pathways_df["Pathway_ID"]
pathways_df = pathways_df.drop("Pathway_ID", axis=1)
pathways_df = pathways_df.dropna(axis=0, how='all', subset=pathways_df.columns.tolist()[1:])
pathways_df = pathways_df.dropna(axis=1, how='all')

if filepath:
fpath = filepath + "/Reactome_" + "_".join(organism.split())+ "_pathways_GeneSymbol_R" + str(version_no) + ".gmt"
pathways_df.to_csv(fpath, sep="\t", header=False)
print("Reactome DB file saved to " + fpath)

print("Complete!")
return pathways_df

if omics_type == 'multiomics':

if organism != 'Homo sapiens' and ('gene_symbol' in identifiers):
print('WARNING: Reactome does not provide gene_symbols for a non-Human organism, use UniProt instead')

name_df = pd.read_csv('https://reactome.org/download/current/ReactomePathways.txt', sep="\t", header=None)
url_prot = 'https://reactome.org/download/current/UniProt2Reactome_All_Levels.txt'
url_metab = 'https://reactome.org/download/current/ChEBI2Reactome_All_Levels.txt'
url_gmt = 'https://reactome.org/download/current/ReactomePathways.gmt.zip'

pathway_dfs = []
for url in [url_prot, url_metab]:
# read the chebi and uniprot files
pathway_dicts = {'chebi': None, 'uniprot': None, 'gene_symbol': None}
for k, url in {'uniprot': url_prot, 'chebi': url_metab}.items():
f = pd.read_csv(url, sep="\t", header=None)

f.columns = ['molecule_ID', 'pathway_ID', 'link', 'pathway_name', 'evidence_code', 'species']
f_filt = f[f.species == organism]
name_dict = dict(zip(f_filt['pathway_ID'], f_filt['pathway_name']))
groups = f_filt.groupby(['pathway_ID'])['molecule_ID'].apply(list).to_dict()
groups = {k: list(set(v)) for k, v in groups.items()}
df = pd.DataFrame.from_dict(groups, orient='index', dtype="object")
pathways_df = df.dropna(axis=0, how='all', subset=df.columns.tolist()[1:])
pathways_df = df.dropna(axis=1, how='all')
pathways_df["Pathway_name"] = pathways_df.index.map(name_dict)
pathways_df.insert(0, 'Pathway_name', pathways_df.pop('Pathway_name'))
pathways_df.set_index([pathways_df.index, pathways_df['Pathway_name']], inplace=True)
pathways_df.drop(['Pathway_name'], axis=1, inplace=True)
pathway_dfs.append(pathways_df)

# merge pathways on index
reactome_mo = pathway_dfs[0].merge(pathway_dfs[1], how='outer', left_index=True, right_index=True)
reactome_mo = reactome_mo.reset_index(level=[1])
pathway_dicts[k] = groups

# read the GMT with gene symbols
url_gmt = 'https://reactome.org/download/current/ReactomePathways.gmt.zip'
resp_gmt = requests.get(url_gmt)

input_gmt = []
gene_dict = {}
with zipfile.ZipFile(io.BytesIO(resp_gmt.content)) as zip_ref:
gmt_filename = [f for f in zip_ref.namelist() if f.endswith('.gmt')][0]

# Extract the GMT file contents
with zip_ref.open(gmt_filename) as gmt_file:
for i in gmt_file:
i = i.decode('utf-8').strip()
line = i.strip("\n").split("\t")
gene_dict[line[1]] = line[2:]
pathway_dicts['gene_symbol'] = gene_dict

# combine all dicts by key
# combine the required dictionaries based on the identifiers
if identifiers:
required_dicts = [pathway_dicts[k] for k in identifiers]
else:
required_dicts = list(pathway_dicts.values())

combined_dict = {}
for k in set().union(*required_dicts):
combined_dict[k] = list(d.get(k) for d in required_dicts)
# remove none values
combined_dict = {k: [x for x in v if x is not None] for k, v in combined_dict.items()}
# flatten the list of lists
combined_dict = {k: [item for sublist in v for item in sublist] for k, v in combined_dict.items()}

# create the dataframe
reactome_mo = pd.DataFrame.from_dict(combined_dict, orient='index', dtype="object")

# add the pathway names
reactome_mo["Pathway_name"] = reactome_mo.index.map(dict(zip(name_df[0], name_df[1])))
reactome_mo.insert(0, 'Pathway_name', reactome_mo.pop('Pathway_name'))

if filepath:
fpath = filepath + "/Reactome_" + "_".join(organism.split())+ "_pathways_multiomics_R" + str(version_no) + ".gmt"
Expand All @@ -222,73 +287,71 @@ def download_reactome(organism, filepath=None, omics_type='metabolomics'):

print("Complete!")
return reactome_mo




class MetExplorePaths:
'''
Class for downloading metexplore metabolic models in the form of pathways with mapped identifiers
Attributes:
model (str): identifier of genome scale metabolic model available on MetExplore
id_type (str): identifier type for the model pathways
filepath (str): filepath to save the pathway file to, default is None (save to variable)
nMappedID (int): Number of metabolites mapping to the selected identifier type
nMetab (int): Number of metabolites in the model
pathways (pd.DataFrame): GMT-like format pathway DataFrame
'''
def __init__(self, model, id_type, filepath=None):
self.model = model
self.id_type = id_type
self.filepath = filepath
self.nMappedID = None
self.nMetab = None
self.pathways = None
# downloads pathways on object instantiation
self.download_metexplore()

def download_metexplore(self):
'''
Function to download MetExplore pathways
'''
warnings.filterwarnings("ignore")
metexploreURL = "https://metexplore.toulouse.inrae.fr/metexplore-api/"+str(self.model)+"/pathwaymetabolite/"+str(self.id_type)+"/"
stats_nmapped_url = "https://metexplore.toulouse.inrae.fr/metexplore-api/stat/"+str(self.model)+"/"+str(self.id_type)+"/"
stats_nmetab_url = "https://metexplore.toulouse.inrae.fr/metexplore-api/stat/"+str(self.model)+"/nbMetab/"
# class MetExplorePaths:
# '''
# Class for downloading metexplore metabolic models in the form of pathways with mapped identifiers (DEPRECATED)

# Attributes:
# model (str): identifier of genome scale metabolic model available on MetExplore
# id_type (str): identifier type for the model pathways
# filepath (str): filepath to save the pathway file to, default is None (save to variable)
# nMappedID (int): Number of metabolites mapping to the selected identifier type
# nMetab (int): Number of metabolites in the model
# pathways (pd.DataFrame): GMT-like format pathway DataFrame

# '''
# def __init__(self, model, id_type, filepath=None):
# self.model = model
# self.id_type = id_type
# self.filepath = filepath
# self.nMappedID = None
# self.nMetab = None
# self.pathways = None
# # downloads pathways on object instantiation
# self.download_metexplore()

# def download_metexplore(self):
# '''
# Function to download MetExplore pathways
# '''
# warnings.filterwarnings("ignore")
# metexploreURL = "https://metexplore.toulouse.inrae.fr/metexplore-api/"+str(self.model)+"/pathwaymetabolite/"+str(self.id_type)+"/"
# stats_nmapped_url = "https://metexplore.toulouse.inrae.fr/metexplore-api/stat/"+str(self.model)+"/"+str(self.id_type)+"/"
# stats_nmetab_url = "https://metexplore.toulouse.inrae.fr/metexplore-api/stat/"+str(self.model)+"/nbMetab/"

stats_nmetab = requests.get(stats_nmetab_url, verify=False)
stats_nmapped = requests.get(stats_nmapped_url, verify=False)
data_api = requests.get(metexploreURL, verify=False)
pathways_json = data_api.json()
# stats_nmetab = requests.get(stats_nmetab_url, verify=False)
# stats_nmapped = requests.get(stats_nmapped_url, verify=False)
# data_api = requests.get(metexploreURL, verify=False)
# pathways_json = data_api.json()

pathways_df = pd.DataFrame.from_dict(pathways_json, orient='columns')
del(pathways_df["id"])
# pathways_df = pd.DataFrame.from_dict(pathways_json, orient='columns')
# del(pathways_df["id"])

pathways = pathways_df.merge(pathways_df.Metabolites.apply(pd.Series), right_index=True, left_index=True)
del(pathways["Metabolites"])
# pathways = pathways_df.merge(pathways_df.Metabolites.apply(pd.Series), right_index=True, left_index=True)
# del(pathways["Metabolites"])

pathways = pathways.fillna("None")
pathways.rename(columns={'name':"Pathway_name"}, inplace = True)
# pathways = pathways.fillna("None")
# pathways.rename(columns={'name':"Pathway_name"}, inplace = True)

pathways = pathways.fillna("None")
pathways = pathways.set_index('dbIdentifier')
pathways = pathways.drop(columns=['len'])
pathways.rename(columns={'name':"Pathway_name"}, inplace = True)

#if file path provided save gmt to drive
if self.filepath:
fpath = self.filepath + "/MetExplorePathways_" + str(self.model) + "_" + str(self.id_type) + ".gmt"
pathways.to_csv(fpath, sep="\t", header=False)
print("MetExplore metabolic network pathways file saved to " + fpath)

self.pathways = pathways
self.nMappedID = stats_nmapped.text.split("\n")[2]
self.nMetab = stats_nmetab.text.split("\n")[2]
# pathways = pathways.fillna("None")
# pathways = pathways.set_index('dbIdentifier')
# pathways = pathways.drop(columns=['len'])
# pathways.rename(columns={'name':"Pathway_name"}, inplace = True)

# #if file path provided save gmt to drive
# if self.filepath:
# fpath = self.filepath + "/MetExplorePathways_" + str(self.model) + "_" + str(self.id_type) + ".gmt"
# pathways.to_csv(fpath, sep="\t", header=False)
# print("MetExplore metabolic network pathways file saved to " + fpath)

# self.pathways = pathways
# self.nMappedID = stats_nmapped.text.split("\n")[2]
# self.nMetab = stats_nmetab.text.split("\n")[2]

print("Complete!")
return pathways
# print("Complete!")
# return pathways


def download_pathbank(organism, filepath=None, omicstype='metabolomics'):
Expand Down Expand Up @@ -378,5 +441,4 @@ def download_pathbank(organism, filepath=None, omicstype='metabolomics'):
print("Complete!")

return multiomics_pathways_gmt

download_pathbank()

9 changes: 6 additions & 3 deletions src/sspa/process_pathways.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,25 @@
import pkg_resources
import sspa.download_pathways

def process_reactome(organism, infile=None, download_latest=False, filepath=None, omics_type='metabolomics'):
def process_reactome(organism, infile=None, download_latest=False, filepath=None, omics_type='metabolomics', identifiers=None):
'''
Function to load Reactome pathways
Args:
organism (str): Reactome organism name
infile (str): default None, provide a Reactome pathway file to process into the GMT-style dataframe
download_latest (Bool): Downloads the latest version of Reactome metabolic pathways
filepath (str): filepath to save pathway file to, default is None - save to variable
omics_type(str): If using download_latest, specify type of omics pathways to download. Options are 'metabolomics', 'proteomics', or 'multiomics'
omics_type(str): If using download_latest, specify type of omics pathways to download. Options are 'metabolomics', 'proteomics', 'transcriptomics', or 'multiomics'
identifiers (list): list of identifiers to download for multi-omics pathways, default is None (download all). Options are 'chebi', 'uniprot', 'gene_symbol'
Returns:
GMT-like pd.DataFrame containing Reactome pathways
'''

# Process CHEBI to reactome data

if download_latest:
pathways_df = sspa.download_pathways.download_reactome(organism, filepath, omics_type)
pathways_df = sspa.download_pathways.download_reactome(organism, filepath, omics_type, identifiers)
return pathways_df

else:
Expand All @@ -29,6 +31,7 @@ def process_reactome(organism, infile=None, download_latest=False, filepath=None
f = pd.read_csv(stream, sep="\t", header=None, encoding='latin-1')
else:
f = pd.read_csv(infile, sep="\t", header=None)

f.columns = ['CHEBI', 'pathway_ID', 'link', 'pathway_name', 'evidence_code', 'species']
f_filt = f[f.species == organism]
name_dict = dict(zip(f_filt['pathway_ID'], f_filt['pathway_name']))
Expand Down

0 comments on commit 1e4e167

Please sign in to comment.