From d3c0dfc25a30af01878ba85d4507d88466f1d14f Mon Sep 17 00:00:00 2001
From: cwieder <cw2019@ic.ac.uk>
Date: Wed, 27 Mar 2024 13:53:08 +0000
Subject: [PATCH] enable KEGG multi-omics download

---
 src/sspa/download_pathways.py | 111 +++++++++++++++++++++++++---------
 src/sspa/process_pathways.py  |   4 +-
 2 files changed, 84 insertions(+), 31 deletions(-)

diff --git a/src/sspa/download_pathways.py b/src/sspa/download_pathways.py
index 95674be..d1bba1c 100644
--- a/src/sspa/download_pathways.py
+++ b/src/sspa/download_pathways.py
@@ -8,7 +8,7 @@
 import json
 from tqdm import tqdm
 
-def download_KEGG(organism, filepath=None):
+def download_KEGG(organism, filepath=None, omics_type='metabolomics'):
     '''
     Function for KEGG pathway download
     Args:
@@ -26,7 +26,7 @@ def download_KEGG(organism, filepath=None):
     pathways = pathways.split("\n")
     pathways = filter(None, pathways)
     pathway_dict = dict()
-    
+
     for path in pathways:
         path = path.split("\t")
         name = path[1]
@@ -38,41 +38,92 @@ def download_KEGG(organism, filepath=None):
 
     pathway_ids = [*pathway_dict]
     pathway_names = list(pathway_dict.values())
-    pathway_compound_mapping = dict()
-
-    for index,i in enumerate(tqdm(pathway_ids)):
-        complist = []
-        current_url = base_url + "pathway:" +i
-        # parse the pathway description page
-        page = requests.get(current_url)
-        lines = page.text.split("\n")
-
-        try:
-            cpds_start = [lines.index(i) for i in lines if i.startswith("COMPOUND")][0]
-            reference_start = [lines.index(i) for i in lines if i.startswith("REFERENCE") or i.startswith("REL_PATHWAY")][0]
-            cpds_lines = lines[cpds_start:reference_start]
-            first_cpd = cpds_lines.pop(0).split()[1]
-            complist.append(first_cpd)
-            complist = complist + [i.split()[0] for i in cpds_lines]
-            pathway_compound_mapping[i] = list(set(complist))
-        except IndexError:
-            pathway_compound_mapping[i] = []
 
     # get release details
     release_data = requests.get('http://rest.kegg.jp/info/kegg')
     version_no = release_data.text.split()[9][0:3]
 
-    # create GMT style file
-    df = pd.DataFrame.from_dict(pathway_compound_mapping, orient='index')
-    df.insert(0, 'Pathway_name', pathway_names)
+    if omics_type == 'metabolomics':
+        pathway_compound_mapping = dict()
+
+        for index,i in enumerate(tqdm(pathway_ids)):
+            complist = []
+            current_url = base_url + "pathway:" +i
+            # parse the pathway description page
+            page = requests.get(current_url)
+            lines = page.text.split("\n")
+
+            try:
+                cpds_start = [lines.index(i) for i in lines if i.startswith("COMPOUND")][0]
+                reference_start = [lines.index(i) for i in lines if i.startswith("REFERENCE") or i.startswith("REL_PATHWAY")][0]
+                cpds_lines = lines[cpds_start:reference_start]
+                first_cpd = cpds_lines.pop(0).split()[1]
+                complist.append(first_cpd)
+                complist = complist + [i.split()[0] for i in cpds_lines]
+                pathway_compound_mapping[i] = list(set(complist))
+            except IndexError:
+                pathway_compound_mapping[i] = []
+
+        # remove empty pathway entries
+        pathway_compound_mapping = {k: v for k, v in pathway_compound_mapping.items() if v}
+
+        # create GMT style file
+        df = pd.DataFrame.from_dict(pathway_compound_mapping, orient='index')
+        # map pathway names onto first column
+        df.insert(0, 'Pathway_name', df.index.map(pathway_dict.get))
+
+        if filepath:
+            fpath = filepath + "/KEGG_" + organism + "_pathways_compounds_R" + str(version_no) + ".gmt"
+            df.to_csv(fpath, sep="\t", header=False)
+            print("KEGG DB file saved to " + fpath)
+        print("Complete!")
+
+        return df
+        
+
+    if omics_type == 'multiomics':
+        pathway_mapping = dict()
+
+        for index,i in enumerate(tqdm(pathway_ids)):
+            complist = []
+            genelist = []
+            current_url = base_url + "pathway:" +i
+            # parse the pathway description page
+            page = requests.get(current_url)
+            lines = page.text.split("\n")
+
+            try:
+                genes_start = [lines.index(i) for i in lines if i.startswith("GENE")][0]
+                cpds_start = [lines.index(i) for i in lines if i.startswith("COMPOUND")][0]
+                reference_start = [lines.index(i) for i in lines if i.startswith("REFERENCE") or i.startswith("REL_PATHWAY")][0]
+                genes_lines = lines[genes_start:cpds_start]
+                cpds_lines = lines[cpds_start:reference_start]
+
+                first_cpd = cpds_lines.pop(0).split()[1]
+                complist.append(first_cpd)
+                complist = complist + [i.split()[0] for i in cpds_lines]
+                first_gene = genes_lines.pop(0).split()[1]
+                genelist.append(first_gene)
+                genelist = genelist + [i.split()[0] for i in genes_lines]
+                pathway_mapping[i] = list(set(complist)) + list(set(genelist))
+            except IndexError:
+                pathway_mapping[i] = []
+
+        # remove empty pathway entries
+        pathway_mapping = {k: v for k, v in pathway_mapping.items() if v}
+
+        # create GMT style file
+        df = pd.DataFrame.from_dict(pathway_mapping, orient='index')
+        # map pathway names onto first column
+        df.insert(0, 'Pathway_name', df.index.map(pathway_dict.get))
 
-    if filepath:
-        fpath = filepath + "/KEGG_" + organism + "_pathways_compounds_R" + str(version_no) + ".gmt"
-        df.to_csv(fpath, sep="\t", header=False)
-        print("KEGG DB file saved to " + fpath)
-    print("Complete!")
+        if filepath:
+            fpath = filepath + "/KEGG_" + organism + "_pathways_multiomics_R" + str(version_no) + ".gmt"
+            df.to_csv(fpath, sep="\t", header=False)
+            print("KEGG DB file saved to " + fpath)
+        print("Complete!")
 
-    return df
+        return df
 
 def download_reactome(organism, filepath=None, omics_type='metabolomics'):
     '''
diff --git a/src/sspa/process_pathways.py b/src/sspa/process_pathways.py
index a756421..b201129 100644
--- a/src/sspa/process_pathways.py
+++ b/src/sspa/process_pathways.py
@@ -44,7 +44,7 @@ def process_reactome(organism, infile=None, download_latest=False, filepath=None
 
         return pathways_df
 
-def process_kegg(organism, infile=None, download_latest=False, filepath=None):
+def process_kegg(organism, infile=None, download_latest=False, filepath=None, omics_type='metabolomics'):
     '''
     Function to load KEGG pathways 
     Args:
@@ -60,6 +60,8 @@ def process_kegg(organism, infile=None, download_latest=False, filepath=None):
         return pathways_df
 
     else:
+        if omics_type != 'metabolomics':
+            raise ValueError('Proteomics/multi-omics pathways only accessible when download_latest=True')
         if infile == None or infile == "R98":
             stream = pkg_resources.resource_stream(__name__, 'pathway_databases/KEGG_human_pathways_compounds_R98.csv')
             pathways_df = pd.read_csv(stream, index_col=0, encoding='latin-1')