diff --git a/README.md b/README.md index 815f124..a4f326b 100644 --- a/README.md +++ b/README.md @@ -15,9 +15,9 @@ sspa provides a Python interface for metabolomics pathway analysis. In addition - Metabolite set enrichment analysis (based on GSEA) - Single-sample pathway analysis - Compound identifier conversion -- Pathway database download (KEGG, Reactome, and MetExplore metabolic networks) +- Pathway database download (KEGG, Reactome, and PathBank) -Although this package is designed to provide a user-friendly interface for metabolomics pathway analysis, the methods are also applicable to other datatypes such as normalised RNA-seq data. +Although this package is designed to provide a user-friendly interface for metabolomics pathway analysis, the methods are also applicable to other datatypes such as normalised RNA-seq and proteomics data. ## Documentation and tutorials This README provides a quickstart guide to the package and its functions. For new users we **highly recommend following our full walkthrough notebook tutorial** available on Google Colab which provides a step-by-step guide to using the package. @@ -92,14 +92,29 @@ custom_pathways = sspa.process_gmt("wikipathways-20220310-gmt-Homo_sapiens.gmt") Download latest version of pathways ```python -# download KEGG latest +# download KEGG latest metabolomics pathways kegg_mouse_latest = sspa.process_kegg("mmu", download_latest=True, filepath=".") -# download Reactome latest -reactome_mouse_latest = sspa.process_reactome("Mus musculus", download_latest=True, filepath=".", omicstype='metabolomics') +# download Reactome latest metabolomics pathways +reactome_mouse_latest = sspa.process_reactome("Mus musculus", download_latest=True, filepath=".", omics_type='metabolomics') -# download Pathbank latest -pathbank_human_latest = sspa.process_pathbank("Homo sapiens", download_latest=True, filepath=".", omicstype='metabolomics') +# download Pathbank latest metabolomics pathways +pathbank_human_latest = sspa.process_pathbank("Homo sapiens", download_latest=True, filepath=".", omics_type='metabolomics') +``` + +Download latest version of multi-omics pathways +- For Reactome, users can specify the omics types required via the 'identifiers' argument. Leaving this to None downloads all omics (ChEBI, UniProt, Gene Symbol). Users can specify any combination of `['chebi', 'uniprot', 'gene_symbol']`. +- For KEGG, multi-omics pathways are represented by KEGG gene and KEGG compound identifiers. + +```python +# download multi-omics pathways from Reactome (ChEBI, UniProt, Gene Symbol) +reactome_human_mo = sspa.process_reactome('Homo sapiens', download_latest=True, filepath=".", omics_type='multiomics', identifiers=['chebi', 'uniprot', 'gene_symbol']) + +# download multi-omics pathways from Reactome (ChEBI and UniProt) +reactome_human_mo = sspa.process_reactome('Homo sapiens', download_latest=True, filepath=".", omics_type='multiomics', identifiers=['chebi', 'uniprot']) + +# download multi-omics pathways from KEGG (KEGG gene and KEGG compound) +kegg_mouse_latest = sspa.process_kegg("mmu", download_latest=True, filepath=".", omics_type='multiomics') ``` ## Identifier harmonization diff --git a/src/sspa/__init__.py b/src/sspa/__init__.py index d0703be..edbb0ad 100644 --- a/src/sspa/__init__.py +++ b/src/sspa/__init__.py @@ -10,5 +10,5 @@ from .sspa_ora import sspa_ora from .sspa_gsea import sspa_gsea from .sspa_ssGSEA import sspa_ssGSEA -from .download_pathways import download_KEGG, download_reactome, MetExplorePaths +from .download_pathways import download_KEGG, download_reactome from .identifier_conversion import identifier_conversion, map_identifiers \ No newline at end of file diff --git a/src/sspa/download_pathways.py b/src/sspa/download_pathways.py index d1bba1c..0aab603 100644 --- a/src/sspa/download_pathways.py +++ b/src/sspa/download_pathways.py @@ -5,8 +5,10 @@ import re import pandas as pd import warnings -import json from tqdm import tqdm +import zipfile +import requests +import io def download_KEGG(organism, filepath=None, omics_type='metabolomics'): ''' @@ -14,6 +16,7 @@ def download_KEGG(organism, filepath=None, omics_type='metabolomics'): Args: organism (str): KEGG 3 letter organism code filepath (str): filepath to save pathway file to, default is None - save to variable + omics_type(str): type of omics pathways to download (metabolomics or multiomics) Returns: GMT-like pd.DataFrame containing KEGG pathways ''' @@ -125,14 +128,15 @@ def download_KEGG(organism, filepath=None, omics_type='metabolomics'): return df -def download_reactome(organism, filepath=None, omics_type='metabolomics'): +def download_reactome(organism, filepath=None, omics_type='metabolomics', identifiers=None): ''' Function for Reactome pathway download Args: organism (str): Reactome organism name filepath (str): filepath (str): filepath to save pathway file to, default is None - save to variable omics_type(str): type of omics pathways to download. - Options are 'metabolomics' (ChEBI identifiers), 'proteomics' (UniProt identifiers), or 'multiomics' (ChEBI and UniProt identifiers) + Options are 'metabolomics' (ChEBI identifiers), 'proteomics' (UniProt identifiers), 'transcriptomics' (Gene Symbol), or 'multiomics' (ChEBI, UniProt and Gene Symbol identifiers) + identifiers (list): list of identifiers to download for multi-omics pathways, default is None (download all). Options are 'chebi', 'uniprot', 'gene_symbol' Returns: GMT-like pd.DataFrame containing Reactome pathways ''' @@ -188,32 +192,93 @@ def download_reactome(organism, filepath=None, omics_type='metabolomics'): print("Complete!") return pathways_df + + if omics_type == 'transcriptomics': + url_gmt = 'https://reactome.org/download/current/ReactomePathways.gmt.zip' + resp_gmt = requests.get(url_gmt) + + input_gmt = [] + with zipfile.ZipFile(io.BytesIO(resp_gmt.content)) as zip_ref: + gmt_filename = [f for f in zip_ref.namelist() if f.endswith('.gmt')][0] + + # Extract the GMT file contents + with zip_ref.open(gmt_filename) as gmt_file: + for i in gmt_file: + i = i.decode('utf-8').strip() + input_gmt.append(i.strip("\n").split("\t")) + pathways_df = pd.DataFrame(input_gmt) + pathways_df = pathways_df.rename({0:"Pathway_name", 1:"Pathway_ID"}, axis=1) + pathways_df.index = pathways_df["Pathway_ID"] + pathways_df = pathways_df.drop("Pathway_ID", axis=1) + pathways_df = pathways_df.dropna(axis=0, how='all', subset=pathways_df.columns.tolist()[1:]) + pathways_df = pathways_df.dropna(axis=1, how='all') + + if filepath: + fpath = filepath + "/Reactome_" + "_".join(organism.split())+ "_pathways_GeneSymbol_R" + str(version_no) + ".gmt" + pathways_df.to_csv(fpath, sep="\t", header=False) + print("Reactome DB file saved to " + fpath) + + print("Complete!") + return pathways_df if omics_type == 'multiomics': + + if organism != 'Homo sapiens' and ('gene_symbol' in identifiers): + print('WARNING: Reactome does not provide gene_symbols for a non-Human organism, use UniProt instead') + + name_df = pd.read_csv('https://reactome.org/download/current/ReactomePathways.txt', sep="\t", header=None) url_prot = 'https://reactome.org/download/current/UniProt2Reactome_All_Levels.txt' url_metab = 'https://reactome.org/download/current/ChEBI2Reactome_All_Levels.txt' + url_gmt = 'https://reactome.org/download/current/ReactomePathways.gmt.zip' - pathway_dfs = [] - for url in [url_prot, url_metab]: + # read the chebi and uniprot files + pathway_dicts = {'chebi': None, 'uniprot': None, 'gene_symbol': None} + for k, url in {'uniprot': url_prot, 'chebi': url_metab}.items(): f = pd.read_csv(url, sep="\t", header=None) - f.columns = ['molecule_ID', 'pathway_ID', 'link', 'pathway_name', 'evidence_code', 'species'] f_filt = f[f.species == organism] - name_dict = dict(zip(f_filt['pathway_ID'], f_filt['pathway_name'])) groups = f_filt.groupby(['pathway_ID'])['molecule_ID'].apply(list).to_dict() groups = {k: list(set(v)) for k, v in groups.items()} - df = pd.DataFrame.from_dict(groups, orient='index', dtype="object") - pathways_df = df.dropna(axis=0, how='all', subset=df.columns.tolist()[1:]) - pathways_df = df.dropna(axis=1, how='all') - pathways_df["Pathway_name"] = pathways_df.index.map(name_dict) - pathways_df.insert(0, 'Pathway_name', pathways_df.pop('Pathway_name')) - pathways_df.set_index([pathways_df.index, pathways_df['Pathway_name']], inplace=True) - pathways_df.drop(['Pathway_name'], axis=1, inplace=True) - pathway_dfs.append(pathways_df) - - # merge pathways on index - reactome_mo = pathway_dfs[0].merge(pathway_dfs[1], how='outer', left_index=True, right_index=True) - reactome_mo = reactome_mo.reset_index(level=[1]) + pathway_dicts[k] = groups + + # read the GMT with gene symbols + url_gmt = 'https://reactome.org/download/current/ReactomePathways.gmt.zip' + resp_gmt = requests.get(url_gmt) + + input_gmt = [] + gene_dict = {} + with zipfile.ZipFile(io.BytesIO(resp_gmt.content)) as zip_ref: + gmt_filename = [f for f in zip_ref.namelist() if f.endswith('.gmt')][0] + + # Extract the GMT file contents + with zip_ref.open(gmt_filename) as gmt_file: + for i in gmt_file: + i = i.decode('utf-8').strip() + line = i.strip("\n").split("\t") + gene_dict[line[1]] = line[2:] + pathway_dicts['gene_symbol'] = gene_dict + + # combine all dicts by key + # combine the required dictionaries based on the identifiers + if identifiers: + required_dicts = [pathway_dicts[k] for k in identifiers] + else: + required_dicts = list(pathway_dicts.values()) + + combined_dict = {} + for k in set().union(*required_dicts): + combined_dict[k] = list(d.get(k) for d in required_dicts) + # remove none values + combined_dict = {k: [x for x in v if x is not None] for k, v in combined_dict.items()} + # flatten the list of lists + combined_dict = {k: [item for sublist in v for item in sublist] for k, v in combined_dict.items()} + + # create the dataframe + reactome_mo = pd.DataFrame.from_dict(combined_dict, orient='index', dtype="object") + + # add the pathway names + reactome_mo["Pathway_name"] = reactome_mo.index.map(dict(zip(name_df[0], name_df[1]))) + reactome_mo.insert(0, 'Pathway_name', reactome_mo.pop('Pathway_name')) if filepath: fpath = filepath + "/Reactome_" + "_".join(organism.split())+ "_pathways_multiomics_R" + str(version_no) + ".gmt" @@ -222,73 +287,71 @@ def download_reactome(organism, filepath=None, omics_type='metabolomics'): print("Complete!") return reactome_mo - - -class MetExplorePaths: - ''' - Class for downloading metexplore metabolic models in the form of pathways with mapped identifiers - - Attributes: - model (str): identifier of genome scale metabolic model available on MetExplore - id_type (str): identifier type for the model pathways - filepath (str): filepath to save the pathway file to, default is None (save to variable) - nMappedID (int): Number of metabolites mapping to the selected identifier type - nMetab (int): Number of metabolites in the model - pathways (pd.DataFrame): GMT-like format pathway DataFrame - - ''' - def __init__(self, model, id_type, filepath=None): - self.model = model - self.id_type = id_type - self.filepath = filepath - self.nMappedID = None - self.nMetab = None - self.pathways = None - # downloads pathways on object instantiation - self.download_metexplore() - - def download_metexplore(self): - ''' - Function to download MetExplore pathways - ''' - warnings.filterwarnings("ignore") - metexploreURL = "https://metexplore.toulouse.inrae.fr/metexplore-api/"+str(self.model)+"/pathwaymetabolite/"+str(self.id_type)+"/" - stats_nmapped_url = "https://metexplore.toulouse.inrae.fr/metexplore-api/stat/"+str(self.model)+"/"+str(self.id_type)+"/" - stats_nmetab_url = "https://metexplore.toulouse.inrae.fr/metexplore-api/stat/"+str(self.model)+"/nbMetab/" +# class MetExplorePaths: +# ''' +# Class for downloading metexplore metabolic models in the form of pathways with mapped identifiers (DEPRECATED) + +# Attributes: +# model (str): identifier of genome scale metabolic model available on MetExplore +# id_type (str): identifier type for the model pathways +# filepath (str): filepath to save the pathway file to, default is None (save to variable) +# nMappedID (int): Number of metabolites mapping to the selected identifier type +# nMetab (int): Number of metabolites in the model +# pathways (pd.DataFrame): GMT-like format pathway DataFrame + +# ''' +# def __init__(self, model, id_type, filepath=None): +# self.model = model +# self.id_type = id_type +# self.filepath = filepath +# self.nMappedID = None +# self.nMetab = None +# self.pathways = None +# # downloads pathways on object instantiation +# self.download_metexplore() + +# def download_metexplore(self): +# ''' +# Function to download MetExplore pathways +# ''' +# warnings.filterwarnings("ignore") +# metexploreURL = "https://metexplore.toulouse.inrae.fr/metexplore-api/"+str(self.model)+"/pathwaymetabolite/"+str(self.id_type)+"/" +# stats_nmapped_url = "https://metexplore.toulouse.inrae.fr/metexplore-api/stat/"+str(self.model)+"/"+str(self.id_type)+"/" +# stats_nmetab_url = "https://metexplore.toulouse.inrae.fr/metexplore-api/stat/"+str(self.model)+"/nbMetab/" - stats_nmetab = requests.get(stats_nmetab_url, verify=False) - stats_nmapped = requests.get(stats_nmapped_url, verify=False) - data_api = requests.get(metexploreURL, verify=False) - pathways_json = data_api.json() +# stats_nmetab = requests.get(stats_nmetab_url, verify=False) +# stats_nmapped = requests.get(stats_nmapped_url, verify=False) +# data_api = requests.get(metexploreURL, verify=False) +# pathways_json = data_api.json() - pathways_df = pd.DataFrame.from_dict(pathways_json, orient='columns') - del(pathways_df["id"]) +# pathways_df = pd.DataFrame.from_dict(pathways_json, orient='columns') +# del(pathways_df["id"]) - pathways = pathways_df.merge(pathways_df.Metabolites.apply(pd.Series), right_index=True, left_index=True) - del(pathways["Metabolites"]) +# pathways = pathways_df.merge(pathways_df.Metabolites.apply(pd.Series), right_index=True, left_index=True) +# del(pathways["Metabolites"]) - pathways = pathways.fillna("None") - pathways.rename(columns={'name':"Pathway_name"}, inplace = True) +# pathways = pathways.fillna("None") +# pathways.rename(columns={'name':"Pathway_name"}, inplace = True) - pathways = pathways.fillna("None") - pathways = pathways.set_index('dbIdentifier') - pathways = pathways.drop(columns=['len']) - pathways.rename(columns={'name':"Pathway_name"}, inplace = True) - - #if file path provided save gmt to drive - if self.filepath: - fpath = self.filepath + "/MetExplorePathways_" + str(self.model) + "_" + str(self.id_type) + ".gmt" - pathways.to_csv(fpath, sep="\t", header=False) - print("MetExplore metabolic network pathways file saved to " + fpath) - - self.pathways = pathways - self.nMappedID = stats_nmapped.text.split("\n")[2] - self.nMetab = stats_nmetab.text.split("\n")[2] +# pathways = pathways.fillna("None") +# pathways = pathways.set_index('dbIdentifier') +# pathways = pathways.drop(columns=['len']) +# pathways.rename(columns={'name':"Pathway_name"}, inplace = True) + +# #if file path provided save gmt to drive +# if self.filepath: +# fpath = self.filepath + "/MetExplorePathways_" + str(self.model) + "_" + str(self.id_type) + ".gmt" +# pathways.to_csv(fpath, sep="\t", header=False) +# print("MetExplore metabolic network pathways file saved to " + fpath) + +# self.pathways = pathways +# self.nMappedID = stats_nmapped.text.split("\n")[2] +# self.nMetab = stats_nmetab.text.split("\n")[2] - print("Complete!") - return pathways +# print("Complete!") +# return pathways def download_pathbank(organism, filepath=None, omicstype='metabolomics'): @@ -378,5 +441,4 @@ def download_pathbank(organism, filepath=None, omicstype='metabolomics'): print("Complete!") return multiomics_pathways_gmt - - download_pathbank() \ No newline at end of file + \ No newline at end of file diff --git a/src/sspa/process_pathways.py b/src/sspa/process_pathways.py index 68b2f6b..24e0858 100644 --- a/src/sspa/process_pathways.py +++ b/src/sspa/process_pathways.py @@ -2,7 +2,7 @@ import pkg_resources import sspa.download_pathways -def process_reactome(organism, infile=None, download_latest=False, filepath=None, omics_type='metabolomics'): +def process_reactome(organism, infile=None, download_latest=False, filepath=None, omics_type='metabolomics', identifiers=None): ''' Function to load Reactome pathways Args: @@ -10,7 +10,9 @@ def process_reactome(organism, infile=None, download_latest=False, filepath=None infile (str): default None, provide a Reactome pathway file to process into the GMT-style dataframe download_latest (Bool): Downloads the latest version of Reactome metabolic pathways filepath (str): filepath to save pathway file to, default is None - save to variable - omics_type(str): If using download_latest, specify type of omics pathways to download. Options are 'metabolomics', 'proteomics', or 'multiomics' + omics_type(str): If using download_latest, specify type of omics pathways to download. Options are 'metabolomics', 'proteomics', 'transcriptomics', or 'multiomics' + identifiers (list): list of identifiers to download for multi-omics pathways, default is None (download all). Options are 'chebi', 'uniprot', 'gene_symbol' + Returns: GMT-like pd.DataFrame containing Reactome pathways ''' @@ -18,7 +20,7 @@ def process_reactome(organism, infile=None, download_latest=False, filepath=None # Process CHEBI to reactome data if download_latest: - pathways_df = sspa.download_pathways.download_reactome(organism, filepath, omics_type) + pathways_df = sspa.download_pathways.download_reactome(organism, filepath, omics_type, identifiers) return pathways_df else: @@ -29,6 +31,7 @@ def process_reactome(organism, infile=None, download_latest=False, filepath=None f = pd.read_csv(stream, sep="\t", header=None, encoding='latin-1') else: f = pd.read_csv(infile, sep="\t", header=None) + f.columns = ['CHEBI', 'pathway_ID', 'link', 'pathway_name', 'evidence_code', 'species'] f_filt = f[f.species == organism] name_dict = dict(zip(f_filt['pathway_ID'], f_filt['pathway_name']))