From fd819d3be09595907eb4675acef04e0daeb1a0c6 Mon Sep 17 00:00:00 2001
From: Vasudha Jha <reachvasudha27@gmail.com>
Date: Mon, 24 Jul 2023 09:18:38 -0700
Subject: [PATCH 1/4] add genoannotate + bump minor version

---
 .github/workflows/release.yml        |   2 +-
 genomap/genoAnnotate/__init__.py     |   1 +
 genomap/genoAnnotate/genoAnnotate.py |  92 +++++++++++
 genomap/genotype/__init__.py         |   1 +
 genomap/genotype/genotype.py         | 237 +++++++++++++++++++++++++++
 setup.py                             |   2 +-
 6 files changed, 333 insertions(+), 2 deletions(-)
 create mode 100644 genomap/genoAnnotate/__init__.py
 create mode 100644 genomap/genoAnnotate/genoAnnotate.py
 create mode 100644 genomap/genotype/__init__.py
 create mode 100644 genomap/genotype/genotype.py

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 8ce388b..409c80f 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -14,5 +14,5 @@ jobs:
     steps:
       - uses: rymndhng/release-on-push-action@master
         with:
-          bump_version_scheme: patch # can be either "major", "minor", "patch" or "norelease"
+          bump_version_scheme: minor # can be either "major", "minor", "patch" or "norelease"
           tag_prefix: v
\ No newline at end of file
diff --git a/genomap/genoAnnotate/__init__.py b/genomap/genoAnnotate/__init__.py
new file mode 100644
index 0000000..79ae6c4
--- /dev/null
+++ b/genomap/genoAnnotate/__init__.py
@@ -0,0 +1 @@
+from .genoAnnotate import *
\ No newline at end of file
diff --git a/genomap/genoAnnotate/genoAnnotate.py b/genomap/genoAnnotate/genoAnnotate.py
new file mode 100644
index 0000000..95cd99c
--- /dev/null
+++ b/genomap/genoAnnotate/genoAnnotate.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Jul 23 15:18:43 2023
+
+@author: Md Tauhidul Islam
+# This code is inspired by scType (https://github.com/IanevskiAleksandr/sc-type)
+# We are in the process of using image matching techique for further enhancement
+# of the cell annotation
+"""
+
+from genomap.genotype import *
+import scanpy as sc
+
+def genoAnnotate(adata,tissue_type,database=None):
+    # Input: adata: annData containing the raw gene counts
+    # tissue type: e.g. Immune system,Pancreas,Liver,Eye,Kidney,Brain,Lung,Adrenal,Heart,Intestine,Muscle,Placenta,Spleen,Stomach,Thymus 
+    # database: User can select his/her own database in excel format
+
+    # Database file
+    if database==None:
+        database = "https://raw.githubusercontent.com/xinglab-ai/self-consistent-expression-recovery-machine/master/demo/data/genoANN_db.xlsx";        
+    
+    # Filter cells
+    sc.pp.filter_cells(adata, min_genes=200)
+    # Normalize data
+    adata.raw = adata
+    sc.pp.normalize_total(adata, target_sum=10000)
+    sc.pp.log1p(adata)
+    sc.pp.highly_variable_genes(adata, n_top_genes=2000)
+    adata = adata[:, adata.var['highly_variable']]
+    # Scale data and run PCA
+    sc.pp.scale(adata, max_value=10)
+    sc.tl.pca(adata)
+    
+    # Prepare positive and negative gene sets
+    result = gene_sets_prepare(database, tissue_type)
+    gs = result['gs_positive']
+    gs2 = result['gs_negative']
+    cell_types = result['cell_types']
+
+
+    data=adata.raw.X.toarray()
+    # Get cell-type by cell matrix
+    scRNAseqData = pd.DataFrame(data, index=adata.raw.obs_names, columns=adata.raw.var_names)
+
+    # Compute cell-type score fro each cell
+    es_max = sctype_score(scRNAseqData=scRNAseqData, scaled=True, gs=gs, gs2=gs2, cell_types=cell_types)
+    es_max.columns = cell_types
+    es_max.index = scRNAseqData.index
+
+    # Calculate neighborhood graph of cells (replace 'adata' with your actual AnnData object)
+    sc.pp.neighbors(adata, n_neighbors=10, use_rep='X_pca')
+    # Perform clustering so that cell-type can be assigned to each cluster
+    sc.tl.leiden(adata)
+    # The cluster labels are stored in `adata.obs['louvain']`
+    results = []
+    for cl in adata.obs['leiden'].unique():
+        cells_in_cluster = adata.obs_names[adata.obs['leiden'] == cl]
+        es_max_cl = es_max.loc[cells_in_cluster].sum().sort_values(ascending=False)
+        results.append(pd.DataFrame({
+            'cluster': cl,
+            'type': es_max_cl.index[:1],
+            'scores': es_max_cl.values[:1],
+            'ncells': len(cells_in_cluster)
+            }))
+
+    results = pd.concat(results)
+    results.loc[results['scores'] < results['ncells'] / 4, 'type'] = 'Unknown'
+    results.set_index('cluster', inplace=True)
+    # Assign the cell type labels to the cells in the AnnData object
+    adata.obs['cell_type'] = results.loc[adata.obs['leiden'], 'type'].values
+    return adata
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/genomap/genotype/__init__.py b/genomap/genotype/__init__.py
new file mode 100644
index 0000000..55769b9
--- /dev/null
+++ b/genomap/genotype/__init__.py
@@ -0,0 +1 @@
+from .genotype import *
\ No newline at end of file
diff --git a/genomap/genotype/genotype.py b/genomap/genotype/genotype.py
new file mode 100644
index 0000000..e723bd5
--- /dev/null
+++ b/genomap/genotype/genotype.py
@@ -0,0 +1,237 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jul 20 16:57:36 2023
+
+@author: Md Tauhidul Islam
+# This code is inspired by scType (https://github.com/IanevskiAleksandr/sc-type)
+# We are in the process of using image matching techique for further enhancement
+# of the cell annotation
+"""
+import pandas as pd
+import numpy as np
+import scipy
+
+
+
+def sctype_score(scRNAseqData, scaled=True, gs=None, gs2=None, gene_names_to_uppercase=True, cell_types=None):
+    # Ensure input is a pandas DataFrame
+    if not isinstance(scRNAseqData, pd.DataFrame):
+        print("scRNAseqData doesn't seem to be a DataFrame")
+        return None
+    
+    # Check if DataFrame is empty
+    if scRNAseqData.empty:
+        print("The input scRNAseqData DataFrame is empty")
+        return None
+
+    # Marker sensitivity
+    marker_stat = pd.Series([gene for sublist in gs for gene in sublist]).value_counts().sort_values(ascending=False)
+    marker_sensitivity = pd.DataFrame({
+        "score_marker_sensitivity": (len(gs)-marker_stat.values) / (len(gs) - 1),
+        "gene_": marker_stat.index
+    }) 
+
+    # Convert gene names to uppercase
+    if gene_names_to_uppercase:
+        scRNAseqData.columns = scRNAseqData.columns.astype(str).str.upper()
+
+    # Subselect genes found in data
+    gs = [[gene for gene in sublist if gene in scRNAseqData.columns] for sublist in gs]
+    gs2 = [[gene for gene in sublist if gene in scRNAseqData.columns] for sublist in gs2]
+
+    cell_markers_genes_score = marker_sensitivity[marker_sensitivity['gene_'].isin(np.unique(np.concatenate(gs)))]
+
+    Z = scRNAseqData.copy()
+    
+    #scanpy.pp.scale(Z)
+    Z= scipy.stats.zscore(Z, axis=0, ddof=1)  
+    Z = Z.fillna(0)
+
+    # Multiply by marker sensitivity
+    Z = Z.T
+    for idx, row in cell_markers_genes_score.iterrows():
+        Z.loc[row["gene_"]] *= row["score_marker_sensitivity"]
+    Z = Z.T
+    # Subselect only with marker genes
+    Z = Z[np.unique(np.concatenate(gs + gs2))]    
+    Z=Z.T
+    score_series_list = []
+    # Loop over each gene symbol (gss_) in gs
+    
+    for i in range(0,len(gs)):
+        gss_=np.array(gs[i])
+        gss2_=np.array(gs2[i])
+        scores = []    
+        # Loop over each column (j) in Z
+        for j in range(Z.shape[1]):
+            gs_z = Z.loc[gss_].iloc[:, j]  # Using iloc for integer-based indexing  
+            gz_2 = Z.loc[gss2_].iloc[:, j] * -1  # Using iloc for integer-based indexing
+            sum_t1 = np.sum(gs_z) / np.sqrt(len(gs_z))
+            sum_t2 = np.sum(gz_2) / np.sqrt(len(gz_2))        
+            if np.isnan(sum_t2):
+                sum_t2 = 0        
+            scores.append(sum_t1 + sum_t2)    
+        # Convert the scores list to a pandas Series and add it to the score_series_list
+        score_series_list.append(pd.Series(scores))
+    # Concatenate the score series in score_series_list along axis 0 to create a DataFrame
+    es = pd.concat(score_series_list, axis=1)
+
+    # Remove rows with all NA or empty values
+    es = es.dropna(how='all')
+    es = es[~(es == "").all(axis=1)]
+
+    return es
+
+
+
+
+import openpyxl
+
+def gene_sets_prepare(path_to_db_file, cell_type):
+    # Read the Excel file
+    cell_markers = pd.read_excel(path_to_db_file, engine='openpyxl')
+
+    # Select rows where 'tissueType' matches 'cell_type'
+    cell_markers = cell_markers[cell_markers['tissueType'] == cell_type]
+    # Apply the function to each row in the DataFrame 'cell_markers'
+    # cell_markers['geneSymbolmore1'] = cell_markers['geneSymbolmore1'].apply(correct_gene_symbols)
+
+
+    # Convert to string and clean 'geneSymbolmore1' and 'geneSymbolmore2' columns
+    cell_markers['geneSymbolmore1'] = cell_markers['geneSymbolmore1'].astype(str).str.replace(" ", "")
+    cell_markers['geneSymbolmore2'] = cell_markers['geneSymbolmore2'].astype(str).str.replace(" ", "")
+    
+ 
+    cell_markers["geneSymbolmore1"] = cell_markers["geneSymbolmore1"].apply(lambda x: correct_gene_symbols(x))
+    cell_markers["geneSymbolmore2"] = cell_markers["geneSymbolmore2"].apply(lambda x: correct_gene_symbols(x))
+    
+
+    # Define a helper function to handle potential NaN values
+    def process_genes(x):
+        if x == 'nan':
+            return []
+        else:
+            return sorted([i for i in x.split(",") if i not in ["NA", ""]])
+
+    # Split 'geneSymbolmore1' and 'geneSymbolmore2' into lists of genes
+    cell_markers['geneSymbolmore1'] = cell_markers['geneSymbolmore1'].apply(process_genes)
+    cell_markers['geneSymbolmore2'] = cell_markers['geneSymbolmore2'].apply(process_genes)
+
+    # Replace '///' with ',' and remove spaces
+    cell_markers['geneSymbolmore1'] = cell_markers['geneSymbolmore1'].apply(lambda x: ",".join(x).replace("///", ",").replace(" ", ""))
+    cell_markers['geneSymbolmore2'] = cell_markers['geneSymbolmore2'].apply(lambda x: ",".join(x).replace("///", ",").replace(" ", ""))
+
+    # Split 'geneSymbolmore1' and 'geneSymbolmore2' into lists of genes again
+    cell_markers['geneSymbolmore1'] = cell_markers['geneSymbolmore1'].str.split(",")
+    cell_markers['geneSymbolmore2'] = cell_markers['geneSymbolmore2'].str.split(",")
+
+    # Prepare 'gs' and 'gs2' lists
+    gs = [genes for genes in cell_markers['geneSymbolmore1']]
+    gs2 = [genes for genes in cell_markers['geneSymbolmore2']]
+    
+    # Prepare cell types list
+    cell_types = list(cell_markers['cellName'])
+
+    return {'gs_positive': gs, 'gs_negative': gs2, 'cell_types': cell_types}
+
+
+
+def correct_gene_symbols(gene_symbols, species="human"):
+    if isinstance(gene_symbols, str):
+        gene_symbols = gene_symbols.split(",")
+    gene_symbols = [gs.strip().replace(" ", "").upper() for gs in gene_symbols if gs.strip().upper() not in ["NA", ""]]
+    gene_symbols = sorted(set(gene_symbols))
+
+    if len(gene_symbols) > 0:
+        # Assuming you have defined and imported the gene_symbol_mapping function
+        # This function should perform the gene symbol mapping using pybiomart or any other method you have implemented.
+        # gene_symbol_mapping(markers_all) should return a DataFrame with the "Suggested.Symbol" column.
+        # For simplicity, I'll use a dummy function that returns an empty DataFrame here.
+        #def gene_symbol_mapping(gene_symbols):
+            #return pd.DataFrame(columns=["Suggested.Symbol"])
+
+        suppressMessages = lambda x: x  # Suppressing messages in Python is not necessary
+
+        markers_all = check_gene_symbols(gene_symbols)
+        markers_all = markers_all.dropna(subset=["Suggested.Symbol"])
+        markers_all = sorted(set(markers_all["Suggested.Symbol"]))
+
+        return ",".join(markers_all)
+    else:
+        return ""
+
+
+
+
+from pybiomart import Dataset
+import re
+
+def check_gene_symbols(x, unmapped_as_na=True, map=None, species="human"):
+    if species == "human":
+        dataset_name = 'hsapiens_gene_ensembl'
+    elif species == "mouse":
+        dataset_name = 'mmusculus_gene_ensembl'
+    else:
+        raise ValueError("Species must be 'human' or 'mouse'")
+
+    biomart = Dataset(name=dataset_name, host='http://www.ensembl.org')
+    biomart_df = biomart.query(attributes=['external_gene_name', 'gene_biotype'])
+
+
+    casecorrection = False
+    if species == "human":
+        casecorrection = True
+        if map is None:
+            #lastupdate = biomart_df["date"].max()
+            #print("Maps last updated on:", lastupdate)
+            map = biomart_df[['Gene name', 'Gene type']].drop_duplicates()
+    elif species == "mouse" and map is None:
+        #lastupdate = biomart_df["date"].max()
+        #print("Maps last updated on:", lastupdate)
+        map = biomart_df[['Gene name', 'Gene type']].drop_duplicates()
+    else:
+        if map is None:
+            raise ValueError("If species is not 'human' or 'mouse', then map argument must be specified")
+
+    if not isinstance(map, pd.DataFrame) or not set(map.columns) == {"Gene name", "Gene type"}:
+        raise ValueError("If map is specified, it must be a dataframe with two columns named 'external_gene_name' and 'gene_biotype'")
+
+    approved = [sym in map["Gene name"].values for sym in x]
+
+    if casecorrection:
+        x_casecorrected = [sym.upper() for sym in x]
+        x_casecorrected = [re.sub(r"(.*C[0-9XY]+)ORF(.+)", r"\1orf\2", sym) for sym in x_casecorrected]
+    else:
+        x_casecorrected = x.split()
+
+    approvedaftercasecorrection = [sym in map["Gene name"].values for sym in x_casecorrected]
+
+    if x != " ".join(x_casecorrected):
+        print("Human gene symbols should be all upper-case except for the 'orf' in open reading frames. The case of some letters was corrected.")
+
+    alias = [sym in map["Gene name"].values for sym in x_casecorrected]
+
+    suggested_symbols = []
+    for i in range(len(x_casecorrected)):
+        if approved[i]:
+            suggested_symbols.append(x_casecorrected[i])
+        else:
+            if alias[i]:
+                suggested_symbols.append(" /// ".join(map.loc[map["Gene name"] == x_casecorrected[i], "Gene name"]))
+            elif approvedaftercasecorrection[i]:
+                suggested_symbols.append(x_casecorrected[i])
+            else:
+                suggested_symbols.append(None)
+
+    df = pd.DataFrame({"x": x, "Approved": approved, "Suggested.Symbol": suggested_symbols})
+    df["Approved"][df["x"].isnull()] = False
+
+    if not unmapped_as_na:
+        df.loc[df["Suggested.Symbol"].isnull(), "Suggested.Symbol"] = df.loc[df["Suggested.Symbol"].isnull(), "x"]
+
+    if df["Approved"].sum() != df.shape[0]:
+        print("x contains non-approved gene symbols")
+
+    return df
+
+
diff --git a/setup.py b/setup.py
index a7d0bf3..fe955d2 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="genomap",
-    version="1.2.9",
+    version="1.3.0",
     author="Md Tauhidul Islam",
     author_email="tauhid@stanford.edu",
     description="Genomap converts tabular gene expression data into spatially meaningful images.",

From 044c71ce89ccd76947760eb6be4661d1bdcffc7e Mon Sep 17 00:00:00 2001
From: Vasudha Jha <reachvasudha27@gmail.com>
Date: Mon, 24 Jul 2023 10:14:05 -0700
Subject: [PATCH 2/4] update code for genomoi to most recent one, add examples
 in readme

---
 README.md                  |  81 ++++++++++++++---
 genomap/genoMOI/genoMOI.py | 175 ++++++++++++++++++++++++++++++++++---
 requirements.txt           |   1 +
 3 files changed, 232 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 0e51ad1..7e8f33d 100644
--- a/README.md
+++ b/README.md
@@ -171,12 +171,10 @@ plt.show()
 ```python
 import scanpy as sc
 import matplotlib.pyplot as plt
-import genomap.genoMOI as gp
 import scipy.io as sio
 import numpy as np
 import pandas as pd
-import umap
-
+from genomap.genoMOI import genoMOIvis, genoMOItraj
 
 # Load five different pancreatic datasets
 dx = sio.loadmat('dataBaronX.mat')
@@ -196,23 +194,79 @@ y = np.squeeze(dx['classLabel'])
 dx = sio.loadmat('batchLabel.mat')
 ybatch = np.squeeze(dx['batchLabel'])
 
-# Apply genoMOI with genomap size of 44x44 and dimension of 32 for the returned integrated data
-resVis=gp.genoMOI(data, data2, data3, data4, data5, colNum=44, rowNum=44, n_dim=32)
+# Apply genomap-based multi omic integration and visualize the integrated data with local structure for cluster analysis
+# returns 2D visualization, cluster labels, and intgerated data
+resVis,cli,int_data=genoMOIvis(data, data2, data3, data4, data5, colNum=12, rowNum=12, n_dim=32, epoch=10, prealign_method='scanorama')
 
-# Visualize the integrated data using UMAP
-embedding = umap.UMAP(n_neighbors=30,min_dist=0.3,n_epochs=200).fit_transform(resVis)
 
 plt.figure(figsize=(15, 10))
 plt.rcParams.update({'font.size': 28})    
-h1=plt.scatter(embedding[:, 0], embedding[:, 1], c=y,cmap='jet', marker='o', s=18)      #  ax = plt.subplot(3, n, i + 1*10+1)
-plt.xlabel('UMAP1')
-plt.ylabel('UMAP2')
+h1=plt.scatter(resVis[:, 0], resVis[:, 1], c=y,cmap='jet', marker='o', s=18)      #  ax = plt.subplot(3, n, i + 1*10+1)
+plt.xlabel('genoVis1')
+plt.ylabel('genoVis2')
+plt.tight_layout()
+plt.colorbar(h1)
+plt.show()
+
+plt.figure(figsize=(15, 10))
+plt.rcParams.update({'font.size': 28})    
+h1=plt.scatter(resVis[:, 0], resVis[:, 1], c=ybatch,cmap='jet', marker='o', s=18)      #  ax = plt.subplot(3, n, i + 1*10+1)
+plt.xlabel('genoVis1')
+plt.ylabel('genoVis2')
 plt.tight_layout()
 plt.colorbar(h1)
 plt.show()
 ```
 
-### Example 6 - Try genoSig for finding gene signatures for cell/data classes
+```python
+# Apply genomap-based multi omic integration and visualize the integrated data with global structure for trajectory analysis
+
+# returns 2D embedding, cluster labels, and intgerated data
+resTraj,cli,int_data=genoMOItraj(data, data2, data3, data4, data5, colNum=12, rowNum=12, n_dim=32, epoch=10, prealign_method='scanorama')
+
+
+plt.figure(figsize=(15, 10))
+plt.rcParams.update({'font.size': 28})    
+h1=plt.scatter(resTraj[:, 0], resTraj[:, 1], c=y,cmap='jet', marker='o', s=18)      #  ax = plt.subplot(3, n, i + 1*10+1)
+plt.xlabel('genoTraj1')
+plt.ylabel('genoTraj2')
+plt.tight_layout()
+plt.colorbar(h1)
+plt.show()
+
+plt.figure(figsize=(15, 10))
+plt.rcParams.update({'font.size': 28})    
+h1=plt.scatter(resTraj[:, 0], resTraj[:, 1], c=ybatch,cmap='jet', marker='o', s=18)      #  ax = plt.subplot(3, n, i + 1*10+1)
+plt.xlabel('genoTraj1')
+plt.ylabel('genoTraj2')
+plt.tight_layout()
+plt.colorbar(h1)
+plt.show()
+```
+
+### Example 6 - Try genoAnnotate for cell annotation
+
+```python
+import scanpy as sc
+import pandas as pd
+import genomap.genoAnnotate as gp
+#Load the PBMC dataset
+adata = sc.read_10x_mtx("pbmc3k_filtered_gene_bc_matrices/")
+
+# Input: adata: annData containing the raw gene counts
+# tissue type: e.g. Immune system,Pancreas,Liver,Eye,Kidney,Brain,Lung,Adrenal,Heart,Intestine,Muscle,Placenta,Spleen,Stomach,Thymus 
+ 
+adataP = gp.genoAnnotate(adata,tissue_type="Immune system")
+
+
+# Compute UMAP (requires neighborhood graph, see the previous code for Louvain clustering)
+sc.tl.umap(adataP)
+# Create a UMAP plot colored by cell type labels
+cell_annotations=adataP.obs['cell_type']
+sc.pl.umap(adataP, color='cell_type')
+```
+
+### Example 7 - Try genoSig for finding gene signatures for cell/data classes
 
 ```python
 import numpy as np
@@ -244,7 +298,7 @@ result=gp.genoSig(genoMaps,T,label,userPD,gene_namesRe, epochs=50)
 print(result.head())
 ```
 
-### Example 7 - Try genoClassification for tabular data classification
+### Example 8 - Try genoClassification for tabular data classification
 
 ```python
 import pandas as pd
@@ -281,8 +335,7 @@ est=gp.genoClassification(training_data, training_labels, test_data, rowNum=rowN
 print('Classification accuracy of genomap approach:'+str(np.sum(est==groundTruthTest) / est.shape[0]))  
 ```
 
-
-### Example 8 - Try genoRegression for tabular data regression
+### Example 9 - Try genoRegression for tabular data regression
 
 ```python
 import pandas as pd
diff --git a/genomap/genoMOI/genoMOI.py b/genomap/genoMOI/genoMOI.py
index 7d28b1b..691168d 100644
--- a/genomap/genoMOI/genoMOI.py
+++ b/genomap/genoMOI/genoMOI.py
@@ -6,25 +6,79 @@
 """
 
 from tensorflow.keras.optimizers import Adam
-from genomap.utils.ConvIDEC import ConvIDEC
+import umap
+import numpy as np
+import scanorama
+import phate
+import scanpy as sc
+import scipy.io
+import pandas as pd
+import anndata as ad
 from sklearn.feature_selection import VarianceThreshold
+from genomap.utils.ConvIDEC import ConvIDEC
 from genomap.genomap import construct_genomap
-import umap
-from genomap.utils.gTraj_utils import nearest_divisible_by_four
+from genomap.utils.class_discriminative_opt import ClassDiscriminative_OPT
+from genomap.utils.gTraj_utils import nearest_divisible_by_four, compute_cluster_distances
 from genomap.utils.utils_MOI import * 
 from genomap.utils.util_Sig import select_n_features
 
-def genoMOI(*arrays, n_clusters=None, n_dim=32, colNum=32, rowNum=32):
+def selectNdimIDX(*args, N):
+    data_selected_list = []
+
+    for data in args:
+        gvar = np.var(data, axis=0)
+        varidx = np.argsort(gvar)[::-1]  # Sort indices in descending order
+        var500idx = varidx[:N]
+
+        var500idxS = np.sort(var500idx)
+        data_selected = data[:, var500idxS]
+        data_selected_list.append(data_selected)
+
+    return data_selected_list, var500idxS
+
+def genoMOIvis(*arrays, n_clusters=None, n_dim=32, colNum=32, rowNum=32, epoch=100, prealign_method='scanorama'):
 
 # arrays: a number of arrays such as array1, array2 from different sources
 # n_clusters: number of data classes
 # n_dim: number of the dimension in returned integrated data
 # colNum and rowNum: column and row number of genomaps
-#
-# Pre-align data with bbknn
-    batch_corrected_data=apply_bbknn_and_return_batch_corrected(*arrays)
+# # Returns the embedding, estimated cluster labels, and intgerated data
+    # Pre-align data with bbknn/scanorama
+    if (prealign_method=='scanorama'):
+        batch_corrected_data=apply_scanorama_and_return_batch_corrected(*arrays,n_dim=colNum*rowNum)
+    else:
+        batch_corrected_data=apply_bbknn_and_return_batch_corrected(*arrays)
     dataDX=scipy.stats.zscore(batch_corrected_data, axis=0, ddof=1) 
+    #dataDX=batch_corrected_data
+    if n_clusters==None:
+        array1=arrays[0]
+        adata = convertToAnnData(array1)
+        # Perform Louvain clustering
+        sc.pp.neighbors(adata)
+        sc.tl.louvain(adata, resolution=1.0)
+        # Access the cluster assignments
+        cluster_labels = adata.obs['louvain']
+        n_clusters = len(np.unique(cluster_labels))        
     
+    embedding, y_pred, feat_DNN=extract_genoVis_features(dataDX, n_clusters=n_clusters, n_dim=n_dim, colNum=colNum,rowNum=rowNum, 
+                                               pretrain_epochs=epoch)    
+
+    return embedding, y_pred, feat_DNN
+
+def genoMOItraj(*arrays, n_clusters=None, n_dim=32, colNum=32, rowNum=32, epoch=100, prealign_method='scanorama'):
+
+# arrays: a number of arrays such as array1, array2 from different sources
+# n_clusters: number of data classes
+# n_dim: number of the dimension in returned integrated data
+# colNum and rowNum: column and row number of genomaps
+# # Returns the embedding, estimated cluster labels, and intgerated data
+    # Pre-align data with bbknn/scanorama
+    if (prealign_method=='scanorama'):
+        batch_corrected_data=apply_scanorama_and_return_batch_corrected(*arrays,n_dim=colNum*rowNum)
+    else:
+        batch_corrected_data=apply_bbknn_and_return_batch_corrected(*arrays)
+    dataDX=scipy.stats.zscore(batch_corrected_data, axis=0, ddof=1) 
+    #dataDX=batch_corrected_data
     if n_clusters==None:
         array1=arrays[0]
         adata = convertToAnnData(array1)
@@ -35,9 +89,30 @@ def genoMOI(*arrays, n_clusters=None, n_dim=32, colNum=32, rowNum=32):
         cluster_labels = adata.obs['louvain']
         n_clusters = len(np.unique(cluster_labels))        
     
-    resVis=extract_genoVis_features(dataDX, n_clusters=n_clusters, n_dim=n_dim, colNum=colNum,rowNum=rowNum)
-    return resVis
+    embedding, y_pred, feat_DNN=extract_genoVis_features(dataDX, n_clusters=n_clusters, n_dim=n_dim, colNum=colNum,rowNum=rowNum, 
+                                               pretrain_epochs=epoch) 
+    
+    outGenoTraj=apply_genoTraj(dataDX,y_pred)
 
+    return embedding, y_pred, feat_DNN
+
+def apply_genoTraj(data, y_pred):
+    Kdis=compute_cluster_distances(data,y_pred);
+
+# Kdis = normalize(Kdis, axis=1, norm='l1')
+# Kdis=1-Kdis
+    n_clusters=len(np.unique(y_pred))
+    num_comp=n_clusters-1
+    # Second optimization (class-discriminative optimization)
+    clf = ClassDiscriminative_OPT(n_components=num_comp)
+    clf.fit(Kdis, y_pred)
+    ccifOut = clf.transform(Kdis)
+    ccifOutzc = scipy.stats.zscore(ccifOut, axis=0, ddof=1) # Z-score of the CCIF 
+
+
+    phate_op = phate.PHATE()
+    data_phate = phate_op.fit_transform(ccifOutzc)
+    return data_phate
 
 def extract_genoVis_features(data,n_clusters=20, n_dim=32, colNum=32, rowNum=32, batch_size=64, verbose=1,
                     pretrain_epochs=100, maxiter=300):
@@ -47,6 +122,7 @@ def extract_genoVis_features(data,n_clusters=20, n_dim=32, colNum=32, rowNum=32,
 # verbose: whether training progress will be shown or not
 # pretrain_epochs: number of epoch for pre-training the CNN
 # maxiter: number of epoch of fine-tuning training
+# Returns the embedding, estimated cluster labels, and intgerated data 
 # Construction of genomap    
     colNum=nearest_divisible_by_four(colNum)
     rowNum=nearest_divisible_by_four(rowNum)
@@ -69,9 +145,86 @@ def extract_genoVis_features(data,n_clusters=20, n_dim=32, colNum=32, rowNum=32,
     y_pred = model.predict_labels(genoMaps)
     # Extract DNN features
     feat_DNN=model.extract_features(genoMaps)
-    #embedding2D = umap.UMAP(n_neighbors=30,min_dist=0.3,n_epochs=200).fit_transform(feat_DNN)
-    return feat_DNN
+    feat_DNN=scipy.stats.zscore(feat_DNN, axis=0, ddof=1)
+    embedding2D = umap.UMAP(n_neighbors=30,min_dist=0.3,n_epochs=200).fit_transform(feat_DNN)
+    return embedding2D, y_pred, feat_DNN
 
+def select_highly_variable_genes_top_genes(adata,n_top_genes=2500):
+    # Select_highly_variable_genes of the data
+    sc.pp.highly_variable_genes(adata, n_top_genes=n_top_genes, flavor='seurat_v3')
+    adata = adata[:, adata.var.highly_variable]
+    return adata
 
+def nonrmalize_data(adata,target_sum=1e4):
+    # Normalize the data
+    sc.pp.normalize_total(adata, target_sum)
+    return adata
+
+def apply_scanorama_and_return_batch_corrected(*arrays,n_dim=50):    
+    adatas = []
+    for array in arrays:
+        adata = convertToAnnData(array)
+        sc.pp.filter_genes(adata, min_cells=0)
+        sc.pp.scale(adata)        
+        adatas.append(adata)
+    # Concatenate the AnnData objects
+    merged_adata = ad.concat(adatas, join='outer', label="batch")
+    if array.shape[1]>1000:
+        ngene=1000
+    else:
+        ngene=array.shape[1]            
+    # Apply bbknn for batch correction
+    sc.pp.highly_variable_genes(merged_adata,n_top_genes=ngene)
+    sc.pp.pca(merged_adata, n_comps=30, use_highly_variable=True, svd_solver='arpack')   
+
+        
+    scanorama.integrate_scanpy(adatas, dimred = n_dim)    
+    # Get all the integrated matrices.
+    scanorama_int = [ad.obsm['X_scanorama'] for ad in adatas]
+
+    # make into one matrix.
+    all_s = np.concatenate(scanorama_int)
+    
+    # Extract the batch-corrected data
+    batch_corrected_data = all_s
+    # Return the batch-corrected data as a NumPy array
+    return np.array(batch_corrected_data)    
 
+def apply_bbknn_and_return_batch_corrected(*arrays):    
+    adatas = []
+    for array in arrays:
+        adata = convertToAnnData(array)
+        sc.pp.filter_genes(adata, min_cells=0)
+        sc.pp.scale(adata)        
+        adatas.append(adata)
+    # Concatenate the AnnData objects
+    merged_adata = ad.concat(adatas, join='outer', label="batch")
+    if array.shape[1]>1000:
+        ngene=1000
+    else:
+        ngene=array.shape[1]            
+    # Apply bbknn for batch correction
+    sc.pp.highly_variable_genes(merged_adata,n_top_genes=ngene)
+    sc.pp.pca(merged_adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
+    sc.external.pp.bbknn(merged_adata, batch_key='batch', neighbors_within_batch=5, n_pcs=50)    
+    #sc.pp.combat(merged_adata, key='batch')
+    # Extract the batch-corrected data
+    batch_corrected_data = merged_adata.X
+    # Return the batch-corrected data as a NumPy array
+    return np.array(batch_corrected_data)
+       
+def convertToAnnData(data):
+    # Create pseudo cell names
+    cell_names = ['Cell_' + str(i) for i in range(1, data.shape[0]+1)]
+    # Create pseudo gene names
+    gene_names = ['Gene_' + str(i) for i in range(1, data.shape[1]+1)]
+    # Create a pandas DataFrame
+    df = pd.DataFrame(data, index=cell_names, columns=gene_names)
+    adataMy=sc.AnnData(df)
+    return adataMy
 
+def write_numpy_array_to_mat_file(array, filename):
+    # Create a dictionary with the array data
+    data_dict = {"array": array}
+    # Save the dictionary as a .mat file
+    scipy.io.savemat(filename, data_dict)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 397ad68..407aff0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,6 +9,7 @@ pandas
 phate
 pot
 scanpy
+scanorama
 scipy
 scikit-learn
 tensorflow

From 20aac82e6a67296870de0d7a75e6b611fb040d80 Mon Sep 17 00:00:00 2001
From: Vasudha Jha <reachvasudha27@gmail.com>
Date: Mon, 24 Jul 2023 10:21:02 -0700
Subject: [PATCH 3/4] fix data link

---
 data/readme.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/data/readme.txt b/data/readme.txt
index 053d537..f4acccd 100644
--- a/data/readme.txt
+++ b/data/readme.txt
@@ -1,2 +1 @@
-Please download the data from https://drive.google.com/drive/u/3/folders/1QNJdPdXf1lfq0Mu5p5JrzMDhwJJCwgO7
- and put it in this folder (data/)
\ No newline at end of file
+Please download the data from https://drive.google.com/drive/folders/1xq3bBgVP0NCMD7bGTXit0qRkL8fbutZ6
\ No newline at end of file

From b984ca39470e7352ac79e28d62753757f9247581 Mon Sep 17 00:00:00 2001
From: Vasudha Jha <reachvasudha27@gmail.com>
Date: Mon, 24 Jul 2023 10:26:54 -0700
Subject: [PATCH 4/4] add additional requirements from genotype

---
 genomap/genotype/genotype.py | 17 +++--------------
 requirements.txt             |  2 ++
 2 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/genomap/genotype/genotype.py b/genomap/genotype/genotype.py
index e723bd5..3442b3f 100644
--- a/genomap/genotype/genotype.py
+++ b/genomap/genotype/genotype.py
@@ -10,8 +10,9 @@
 import pandas as pd
 import numpy as np
 import scipy
-
-
+import openpyxl
+from pybiomart import Dataset
+import re
 
 def sctype_score(scRNAseqData, scaled=True, gs=None, gs2=None, gene_names_to_uppercase=True, cell_types=None):
     # Ensure input is a pandas DataFrame
@@ -82,11 +83,6 @@ def sctype_score(scRNAseqData, scaled=True, gs=None, gs2=None, gene_names_to_upp
 
     return es
 
-
-
-
-import openpyxl
-
 def gene_sets_prepare(path_to_db_file, cell_type):
     # Read the Excel file
     cell_markers = pd.read_excel(path_to_db_file, engine='openpyxl')
@@ -134,8 +130,6 @@ def process_genes(x):
 
     return {'gs_positive': gs, 'gs_negative': gs2, 'cell_types': cell_types}
 
-
-
 def correct_gene_symbols(gene_symbols, species="human"):
     if isinstance(gene_symbols, str):
         gene_symbols = gene_symbols.split(",")
@@ -161,11 +155,6 @@ def correct_gene_symbols(gene_symbols, species="human"):
         return ""
 
 
-
-
-from pybiomart import Dataset
-import re
-
 def check_gene_symbols(x, unmapped_as_na=True, map=None, species="human"):
     if species == "human":
         dataset_name = 'hsapiens_gene_ensembl'
diff --git a/requirements.txt b/requirements.txt
index 407aff0..6e6e41d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,9 +5,11 @@ keras
 lime
 louvain
 numpy
+openpyxl
 pandas
 phate
 pot
+pybiomart
 scanpy
 scanorama
 scipy