diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8ce388b..409c80f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,5 +14,5 @@ jobs: steps: - uses: rymndhng/release-on-push-action@master with: - bump_version_scheme: patch # can be either "major", "minor", "patch" or "norelease" + bump_version_scheme: minor # can be either "major", "minor", "patch" or "norelease" tag_prefix: v \ No newline at end of file diff --git a/README.md b/README.md index 0e51ad1..7e8f33d 100644 --- a/README.md +++ b/README.md @@ -171,12 +171,10 @@ plt.show() ```python import scanpy as sc import matplotlib.pyplot as plt -import genomap.genoMOI as gp import scipy.io as sio import numpy as np import pandas as pd -import umap - +from genomap.genoMOI import genoMOIvis, genoMOItraj # Load five different pancreatic datasets dx = sio.loadmat('dataBaronX.mat') @@ -196,23 +194,79 @@ y = np.squeeze(dx['classLabel']) dx = sio.loadmat('batchLabel.mat') ybatch = np.squeeze(dx['batchLabel']) -# Apply genoMOI with genomap size of 44x44 and dimension of 32 for the returned integrated data -resVis=gp.genoMOI(data, data2, data3, data4, data5, colNum=44, rowNum=44, n_dim=32) +# Apply genomap-based multi omic integration and visualize the integrated data with local structure for cluster analysis +# returns 2D visualization, cluster labels, and intgerated data +resVis,cli,int_data=genoMOIvis(data, data2, data3, data4, data5, colNum=12, rowNum=12, n_dim=32, epoch=10, prealign_method='scanorama') -# Visualize the integrated data using UMAP -embedding = umap.UMAP(n_neighbors=30,min_dist=0.3,n_epochs=200).fit_transform(resVis) plt.figure(figsize=(15, 10)) plt.rcParams.update({'font.size': 28}) -h1=plt.scatter(embedding[:, 0], embedding[:, 1], c=y,cmap='jet', marker='o', s=18) # ax = plt.subplot(3, n, i + 1*10+1) -plt.xlabel('UMAP1') -plt.ylabel('UMAP2') +h1=plt.scatter(resVis[:, 0], resVis[:, 1], c=y,cmap='jet', marker='o', s=18) # ax = plt.subplot(3, n, i + 1*10+1) +plt.xlabel('genoVis1') +plt.ylabel('genoVis2') +plt.tight_layout() +plt.colorbar(h1) +plt.show() + +plt.figure(figsize=(15, 10)) +plt.rcParams.update({'font.size': 28}) +h1=plt.scatter(resVis[:, 0], resVis[:, 1], c=ybatch,cmap='jet', marker='o', s=18) # ax = plt.subplot(3, n, i + 1*10+1) +plt.xlabel('genoVis1') +plt.ylabel('genoVis2') plt.tight_layout() plt.colorbar(h1) plt.show() ``` -### Example 6 - Try genoSig for finding gene signatures for cell/data classes +```python +# Apply genomap-based multi omic integration and visualize the integrated data with global structure for trajectory analysis + +# returns 2D embedding, cluster labels, and intgerated data +resTraj,cli,int_data=genoMOItraj(data, data2, data3, data4, data5, colNum=12, rowNum=12, n_dim=32, epoch=10, prealign_method='scanorama') + + +plt.figure(figsize=(15, 10)) +plt.rcParams.update({'font.size': 28}) +h1=plt.scatter(resTraj[:, 0], resTraj[:, 1], c=y,cmap='jet', marker='o', s=18) # ax = plt.subplot(3, n, i + 1*10+1) +plt.xlabel('genoTraj1') +plt.ylabel('genoTraj2') +plt.tight_layout() +plt.colorbar(h1) +plt.show() + +plt.figure(figsize=(15, 10)) +plt.rcParams.update({'font.size': 28}) +h1=plt.scatter(resTraj[:, 0], resTraj[:, 1], c=ybatch,cmap='jet', marker='o', s=18) # ax = plt.subplot(3, n, i + 1*10+1) +plt.xlabel('genoTraj1') +plt.ylabel('genoTraj2') +plt.tight_layout() +plt.colorbar(h1) +plt.show() +``` + +### Example 6 - Try genoAnnotate for cell annotation + +```python +import scanpy as sc +import pandas as pd +import genomap.genoAnnotate as gp +#Load the PBMC dataset +adata = sc.read_10x_mtx("pbmc3k_filtered_gene_bc_matrices/") + +# Input: adata: annData containing the raw gene counts +# tissue type: e.g. Immune system,Pancreas,Liver,Eye,Kidney,Brain,Lung,Adrenal,Heart,Intestine,Muscle,Placenta,Spleen,Stomach,Thymus + +adataP = gp.genoAnnotate(adata,tissue_type="Immune system") + + +# Compute UMAP (requires neighborhood graph, see the previous code for Louvain clustering) +sc.tl.umap(adataP) +# Create a UMAP plot colored by cell type labels +cell_annotations=adataP.obs['cell_type'] +sc.pl.umap(adataP, color='cell_type') +``` + +### Example 7 - Try genoSig for finding gene signatures for cell/data classes ```python import numpy as np @@ -244,7 +298,7 @@ result=gp.genoSig(genoMaps,T,label,userPD,gene_namesRe, epochs=50) print(result.head()) ``` -### Example 7 - Try genoClassification for tabular data classification +### Example 8 - Try genoClassification for tabular data classification ```python import pandas as pd @@ -281,8 +335,7 @@ est=gp.genoClassification(training_data, training_labels, test_data, rowNum=rowN print('Classification accuracy of genomap approach:'+str(np.sum(est==groundTruthTest) / est.shape[0])) ``` - -### Example 8 - Try genoRegression for tabular data regression +### Example 9 - Try genoRegression for tabular data regression ```python import pandas as pd diff --git a/data/readme.txt b/data/readme.txt index 053d537..f4acccd 100644 --- a/data/readme.txt +++ b/data/readme.txt @@ -1,2 +1 @@ -Please download the data from https://drive.google.com/drive/u/3/folders/1QNJdPdXf1lfq0Mu5p5JrzMDhwJJCwgO7 - and put it in this folder (data/) \ No newline at end of file +Please download the data from https://drive.google.com/drive/folders/1xq3bBgVP0NCMD7bGTXit0qRkL8fbutZ6 \ No newline at end of file diff --git a/genomap/genoAnnotate/__init__.py b/genomap/genoAnnotate/__init__.py new file mode 100644 index 0000000..79ae6c4 --- /dev/null +++ b/genomap/genoAnnotate/__init__.py @@ -0,0 +1 @@ +from .genoAnnotate import * \ No newline at end of file diff --git a/genomap/genoAnnotate/genoAnnotate.py b/genomap/genoAnnotate/genoAnnotate.py new file mode 100644 index 0000000..95cd99c --- /dev/null +++ b/genomap/genoAnnotate/genoAnnotate.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Jul 23 15:18:43 2023 + +@author: Md Tauhidul Islam +# This code is inspired by scType (https://github.com/IanevskiAleksandr/sc-type) +# We are in the process of using image matching techique for further enhancement +# of the cell annotation +""" + +from genomap.genotype import * +import scanpy as sc + +def genoAnnotate(adata,tissue_type,database=None): + # Input: adata: annData containing the raw gene counts + # tissue type: e.g. Immune system,Pancreas,Liver,Eye,Kidney,Brain,Lung,Adrenal,Heart,Intestine,Muscle,Placenta,Spleen,Stomach,Thymus + # database: User can select his/her own database in excel format + + # Database file + if database==None: + database = "https://raw.githubusercontent.com/xinglab-ai/self-consistent-expression-recovery-machine/master/demo/data/genoANN_db.xlsx"; + + # Filter cells + sc.pp.filter_cells(adata, min_genes=200) + # Normalize data + adata.raw = adata + sc.pp.normalize_total(adata, target_sum=10000) + sc.pp.log1p(adata) + sc.pp.highly_variable_genes(adata, n_top_genes=2000) + adata = adata[:, adata.var['highly_variable']] + # Scale data and run PCA + sc.pp.scale(adata, max_value=10) + sc.tl.pca(adata) + + # Prepare positive and negative gene sets + result = gene_sets_prepare(database, tissue_type) + gs = result['gs_positive'] + gs2 = result['gs_negative'] + cell_types = result['cell_types'] + + + data=adata.raw.X.toarray() + # Get cell-type by cell matrix + scRNAseqData = pd.DataFrame(data, index=adata.raw.obs_names, columns=adata.raw.var_names) + + # Compute cell-type score fro each cell + es_max = sctype_score(scRNAseqData=scRNAseqData, scaled=True, gs=gs, gs2=gs2, cell_types=cell_types) + es_max.columns = cell_types + es_max.index = scRNAseqData.index + + # Calculate neighborhood graph of cells (replace 'adata' with your actual AnnData object) + sc.pp.neighbors(adata, n_neighbors=10, use_rep='X_pca') + # Perform clustering so that cell-type can be assigned to each cluster + sc.tl.leiden(adata) + # The cluster labels are stored in `adata.obs['louvain']` + results = [] + for cl in adata.obs['leiden'].unique(): + cells_in_cluster = adata.obs_names[adata.obs['leiden'] == cl] + es_max_cl = es_max.loc[cells_in_cluster].sum().sort_values(ascending=False) + results.append(pd.DataFrame({ + 'cluster': cl, + 'type': es_max_cl.index[:1], + 'scores': es_max_cl.values[:1], + 'ncells': len(cells_in_cluster) + })) + + results = pd.concat(results) + results.loc[results['scores'] < results['ncells'] / 4, 'type'] = 'Unknown' + results.set_index('cluster', inplace=True) + # Assign the cell type labels to the cells in the AnnData object + adata.obs['cell_type'] = results.loc[adata.obs['leiden'], 'type'].values + return adata + + + + + + + + + + + + + + + + + + + + diff --git a/genomap/genoMOI/genoMOI.py b/genomap/genoMOI/genoMOI.py index 7d28b1b..691168d 100644 --- a/genomap/genoMOI/genoMOI.py +++ b/genomap/genoMOI/genoMOI.py @@ -6,25 +6,79 @@ """ from tensorflow.keras.optimizers import Adam -from genomap.utils.ConvIDEC import ConvIDEC +import umap +import numpy as np +import scanorama +import phate +import scanpy as sc +import scipy.io +import pandas as pd +import anndata as ad from sklearn.feature_selection import VarianceThreshold +from genomap.utils.ConvIDEC import ConvIDEC from genomap.genomap import construct_genomap -import umap -from genomap.utils.gTraj_utils import nearest_divisible_by_four +from genomap.utils.class_discriminative_opt import ClassDiscriminative_OPT +from genomap.utils.gTraj_utils import nearest_divisible_by_four, compute_cluster_distances from genomap.utils.utils_MOI import * from genomap.utils.util_Sig import select_n_features -def genoMOI(*arrays, n_clusters=None, n_dim=32, colNum=32, rowNum=32): +def selectNdimIDX(*args, N): + data_selected_list = [] + + for data in args: + gvar = np.var(data, axis=0) + varidx = np.argsort(gvar)[::-1] # Sort indices in descending order + var500idx = varidx[:N] + + var500idxS = np.sort(var500idx) + data_selected = data[:, var500idxS] + data_selected_list.append(data_selected) + + return data_selected_list, var500idxS + +def genoMOIvis(*arrays, n_clusters=None, n_dim=32, colNum=32, rowNum=32, epoch=100, prealign_method='scanorama'): # arrays: a number of arrays such as array1, array2 from different sources # n_clusters: number of data classes # n_dim: number of the dimension in returned integrated data # colNum and rowNum: column and row number of genomaps -# -# Pre-align data with bbknn - batch_corrected_data=apply_bbknn_and_return_batch_corrected(*arrays) +# # Returns the embedding, estimated cluster labels, and intgerated data + # Pre-align data with bbknn/scanorama + if (prealign_method=='scanorama'): + batch_corrected_data=apply_scanorama_and_return_batch_corrected(*arrays,n_dim=colNum*rowNum) + else: + batch_corrected_data=apply_bbknn_and_return_batch_corrected(*arrays) dataDX=scipy.stats.zscore(batch_corrected_data, axis=0, ddof=1) + #dataDX=batch_corrected_data + if n_clusters==None: + array1=arrays[0] + adata = convertToAnnData(array1) + # Perform Louvain clustering + sc.pp.neighbors(adata) + sc.tl.louvain(adata, resolution=1.0) + # Access the cluster assignments + cluster_labels = adata.obs['louvain'] + n_clusters = len(np.unique(cluster_labels)) + embedding, y_pred, feat_DNN=extract_genoVis_features(dataDX, n_clusters=n_clusters, n_dim=n_dim, colNum=colNum,rowNum=rowNum, + pretrain_epochs=epoch) + + return embedding, y_pred, feat_DNN + +def genoMOItraj(*arrays, n_clusters=None, n_dim=32, colNum=32, rowNum=32, epoch=100, prealign_method='scanorama'): + +# arrays: a number of arrays such as array1, array2 from different sources +# n_clusters: number of data classes +# n_dim: number of the dimension in returned integrated data +# colNum and rowNum: column and row number of genomaps +# # Returns the embedding, estimated cluster labels, and intgerated data + # Pre-align data with bbknn/scanorama + if (prealign_method=='scanorama'): + batch_corrected_data=apply_scanorama_and_return_batch_corrected(*arrays,n_dim=colNum*rowNum) + else: + batch_corrected_data=apply_bbknn_and_return_batch_corrected(*arrays) + dataDX=scipy.stats.zscore(batch_corrected_data, axis=0, ddof=1) + #dataDX=batch_corrected_data if n_clusters==None: array1=arrays[0] adata = convertToAnnData(array1) @@ -35,9 +89,30 @@ def genoMOI(*arrays, n_clusters=None, n_dim=32, colNum=32, rowNum=32): cluster_labels = adata.obs['louvain'] n_clusters = len(np.unique(cluster_labels)) - resVis=extract_genoVis_features(dataDX, n_clusters=n_clusters, n_dim=n_dim, colNum=colNum,rowNum=rowNum) - return resVis + embedding, y_pred, feat_DNN=extract_genoVis_features(dataDX, n_clusters=n_clusters, n_dim=n_dim, colNum=colNum,rowNum=rowNum, + pretrain_epochs=epoch) + + outGenoTraj=apply_genoTraj(dataDX,y_pred) + return embedding, y_pred, feat_DNN + +def apply_genoTraj(data, y_pred): + Kdis=compute_cluster_distances(data,y_pred); + +# Kdis = normalize(Kdis, axis=1, norm='l1') +# Kdis=1-Kdis + n_clusters=len(np.unique(y_pred)) + num_comp=n_clusters-1 + # Second optimization (class-discriminative optimization) + clf = ClassDiscriminative_OPT(n_components=num_comp) + clf.fit(Kdis, y_pred) + ccifOut = clf.transform(Kdis) + ccifOutzc = scipy.stats.zscore(ccifOut, axis=0, ddof=1) # Z-score of the CCIF + + + phate_op = phate.PHATE() + data_phate = phate_op.fit_transform(ccifOutzc) + return data_phate def extract_genoVis_features(data,n_clusters=20, n_dim=32, colNum=32, rowNum=32, batch_size=64, verbose=1, pretrain_epochs=100, maxiter=300): @@ -47,6 +122,7 @@ def extract_genoVis_features(data,n_clusters=20, n_dim=32, colNum=32, rowNum=32, # verbose: whether training progress will be shown or not # pretrain_epochs: number of epoch for pre-training the CNN # maxiter: number of epoch of fine-tuning training +# Returns the embedding, estimated cluster labels, and intgerated data # Construction of genomap colNum=nearest_divisible_by_four(colNum) rowNum=nearest_divisible_by_four(rowNum) @@ -69,9 +145,86 @@ def extract_genoVis_features(data,n_clusters=20, n_dim=32, colNum=32, rowNum=32, y_pred = model.predict_labels(genoMaps) # Extract DNN features feat_DNN=model.extract_features(genoMaps) - #embedding2D = umap.UMAP(n_neighbors=30,min_dist=0.3,n_epochs=200).fit_transform(feat_DNN) - return feat_DNN + feat_DNN=scipy.stats.zscore(feat_DNN, axis=0, ddof=1) + embedding2D = umap.UMAP(n_neighbors=30,min_dist=0.3,n_epochs=200).fit_transform(feat_DNN) + return embedding2D, y_pred, feat_DNN +def select_highly_variable_genes_top_genes(adata,n_top_genes=2500): + # Select_highly_variable_genes of the data + sc.pp.highly_variable_genes(adata, n_top_genes=n_top_genes, flavor='seurat_v3') + adata = adata[:, adata.var.highly_variable] + return adata +def nonrmalize_data(adata,target_sum=1e4): + # Normalize the data + sc.pp.normalize_total(adata, target_sum) + return adata + +def apply_scanorama_and_return_batch_corrected(*arrays,n_dim=50): + adatas = [] + for array in arrays: + adata = convertToAnnData(array) + sc.pp.filter_genes(adata, min_cells=0) + sc.pp.scale(adata) + adatas.append(adata) + # Concatenate the AnnData objects + merged_adata = ad.concat(adatas, join='outer', label="batch") + if array.shape[1]>1000: + ngene=1000 + else: + ngene=array.shape[1] + # Apply bbknn for batch correction + sc.pp.highly_variable_genes(merged_adata,n_top_genes=ngene) + sc.pp.pca(merged_adata, n_comps=30, use_highly_variable=True, svd_solver='arpack') + + + scanorama.integrate_scanpy(adatas, dimred = n_dim) + # Get all the integrated matrices. + scanorama_int = [ad.obsm['X_scanorama'] for ad in adatas] + + # make into one matrix. + all_s = np.concatenate(scanorama_int) + + # Extract the batch-corrected data + batch_corrected_data = all_s + # Return the batch-corrected data as a NumPy array + return np.array(batch_corrected_data) +def apply_bbknn_and_return_batch_corrected(*arrays): + adatas = [] + for array in arrays: + adata = convertToAnnData(array) + sc.pp.filter_genes(adata, min_cells=0) + sc.pp.scale(adata) + adatas.append(adata) + # Concatenate the AnnData objects + merged_adata = ad.concat(adatas, join='outer', label="batch") + if array.shape[1]>1000: + ngene=1000 + else: + ngene=array.shape[1] + # Apply bbknn for batch correction + sc.pp.highly_variable_genes(merged_adata,n_top_genes=ngene) + sc.pp.pca(merged_adata, n_comps=50, use_highly_variable=True, svd_solver='arpack') + sc.external.pp.bbknn(merged_adata, batch_key='batch', neighbors_within_batch=5, n_pcs=50) + #sc.pp.combat(merged_adata, key='batch') + # Extract the batch-corrected data + batch_corrected_data = merged_adata.X + # Return the batch-corrected data as a NumPy array + return np.array(batch_corrected_data) + +def convertToAnnData(data): + # Create pseudo cell names + cell_names = ['Cell_' + str(i) for i in range(1, data.shape[0]+1)] + # Create pseudo gene names + gene_names = ['Gene_' + str(i) for i in range(1, data.shape[1]+1)] + # Create a pandas DataFrame + df = pd.DataFrame(data, index=cell_names, columns=gene_names) + adataMy=sc.AnnData(df) + return adataMy +def write_numpy_array_to_mat_file(array, filename): + # Create a dictionary with the array data + data_dict = {"array": array} + # Save the dictionary as a .mat file + scipy.io.savemat(filename, data_dict) \ No newline at end of file diff --git a/genomap/genotype/__init__.py b/genomap/genotype/__init__.py new file mode 100644 index 0000000..55769b9 --- /dev/null +++ b/genomap/genotype/__init__.py @@ -0,0 +1 @@ +from .genotype import * \ No newline at end of file diff --git a/genomap/genotype/genotype.py b/genomap/genotype/genotype.py new file mode 100644 index 0000000..3442b3f --- /dev/null +++ b/genomap/genotype/genotype.py @@ -0,0 +1,226 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Jul 20 16:57:36 2023 + +@author: Md Tauhidul Islam +# This code is inspired by scType (https://github.com/IanevskiAleksandr/sc-type) +# We are in the process of using image matching techique for further enhancement +# of the cell annotation +""" +import pandas as pd +import numpy as np +import scipy +import openpyxl +from pybiomart import Dataset +import re + +def sctype_score(scRNAseqData, scaled=True, gs=None, gs2=None, gene_names_to_uppercase=True, cell_types=None): + # Ensure input is a pandas DataFrame + if not isinstance(scRNAseqData, pd.DataFrame): + print("scRNAseqData doesn't seem to be a DataFrame") + return None + + # Check if DataFrame is empty + if scRNAseqData.empty: + print("The input scRNAseqData DataFrame is empty") + return None + + # Marker sensitivity + marker_stat = pd.Series([gene for sublist in gs for gene in sublist]).value_counts().sort_values(ascending=False) + marker_sensitivity = pd.DataFrame({ + "score_marker_sensitivity": (len(gs)-marker_stat.values) / (len(gs) - 1), + "gene_": marker_stat.index + }) + + # Convert gene names to uppercase + if gene_names_to_uppercase: + scRNAseqData.columns = scRNAseqData.columns.astype(str).str.upper() + + # Subselect genes found in data + gs = [[gene for gene in sublist if gene in scRNAseqData.columns] for sublist in gs] + gs2 = [[gene for gene in sublist if gene in scRNAseqData.columns] for sublist in gs2] + + cell_markers_genes_score = marker_sensitivity[marker_sensitivity['gene_'].isin(np.unique(np.concatenate(gs)))] + + Z = scRNAseqData.copy() + + #scanpy.pp.scale(Z) + Z= scipy.stats.zscore(Z, axis=0, ddof=1) + Z = Z.fillna(0) + + # Multiply by marker sensitivity + Z = Z.T + for idx, row in cell_markers_genes_score.iterrows(): + Z.loc[row["gene_"]] *= row["score_marker_sensitivity"] + Z = Z.T + # Subselect only with marker genes + Z = Z[np.unique(np.concatenate(gs + gs2))] + Z=Z.T + score_series_list = [] + # Loop over each gene symbol (gss_) in gs + + for i in range(0,len(gs)): + gss_=np.array(gs[i]) + gss2_=np.array(gs2[i]) + scores = [] + # Loop over each column (j) in Z + for j in range(Z.shape[1]): + gs_z = Z.loc[gss_].iloc[:, j] # Using iloc for integer-based indexing + gz_2 = Z.loc[gss2_].iloc[:, j] * -1 # Using iloc for integer-based indexing + sum_t1 = np.sum(gs_z) / np.sqrt(len(gs_z)) + sum_t2 = np.sum(gz_2) / np.sqrt(len(gz_2)) + if np.isnan(sum_t2): + sum_t2 = 0 + scores.append(sum_t1 + sum_t2) + # Convert the scores list to a pandas Series and add it to the score_series_list + score_series_list.append(pd.Series(scores)) + # Concatenate the score series in score_series_list along axis 0 to create a DataFrame + es = pd.concat(score_series_list, axis=1) + + # Remove rows with all NA or empty values + es = es.dropna(how='all') + es = es[~(es == "").all(axis=1)] + + return es + +def gene_sets_prepare(path_to_db_file, cell_type): + # Read the Excel file + cell_markers = pd.read_excel(path_to_db_file, engine='openpyxl') + + # Select rows where 'tissueType' matches 'cell_type' + cell_markers = cell_markers[cell_markers['tissueType'] == cell_type] + # Apply the function to each row in the DataFrame 'cell_markers' + # cell_markers['geneSymbolmore1'] = cell_markers['geneSymbolmore1'].apply(correct_gene_symbols) + + + # Convert to string and clean 'geneSymbolmore1' and 'geneSymbolmore2' columns + cell_markers['geneSymbolmore1'] = cell_markers['geneSymbolmore1'].astype(str).str.replace(" ", "") + cell_markers['geneSymbolmore2'] = cell_markers['geneSymbolmore2'].astype(str).str.replace(" ", "") + + + cell_markers["geneSymbolmore1"] = cell_markers["geneSymbolmore1"].apply(lambda x: correct_gene_symbols(x)) + cell_markers["geneSymbolmore2"] = cell_markers["geneSymbolmore2"].apply(lambda x: correct_gene_symbols(x)) + + + # Define a helper function to handle potential NaN values + def process_genes(x): + if x == 'nan': + return [] + else: + return sorted([i for i in x.split(",") if i not in ["NA", ""]]) + + # Split 'geneSymbolmore1' and 'geneSymbolmore2' into lists of genes + cell_markers['geneSymbolmore1'] = cell_markers['geneSymbolmore1'].apply(process_genes) + cell_markers['geneSymbolmore2'] = cell_markers['geneSymbolmore2'].apply(process_genes) + + # Replace '///' with ',' and remove spaces + cell_markers['geneSymbolmore1'] = cell_markers['geneSymbolmore1'].apply(lambda x: ",".join(x).replace("///", ",").replace(" ", "")) + cell_markers['geneSymbolmore2'] = cell_markers['geneSymbolmore2'].apply(lambda x: ",".join(x).replace("///", ",").replace(" ", "")) + + # Split 'geneSymbolmore1' and 'geneSymbolmore2' into lists of genes again + cell_markers['geneSymbolmore1'] = cell_markers['geneSymbolmore1'].str.split(",") + cell_markers['geneSymbolmore2'] = cell_markers['geneSymbolmore2'].str.split(",") + + # Prepare 'gs' and 'gs2' lists + gs = [genes for genes in cell_markers['geneSymbolmore1']] + gs2 = [genes for genes in cell_markers['geneSymbolmore2']] + + # Prepare cell types list + cell_types = list(cell_markers['cellName']) + + return {'gs_positive': gs, 'gs_negative': gs2, 'cell_types': cell_types} + +def correct_gene_symbols(gene_symbols, species="human"): + if isinstance(gene_symbols, str): + gene_symbols = gene_symbols.split(",") + gene_symbols = [gs.strip().replace(" ", "").upper() for gs in gene_symbols if gs.strip().upper() not in ["NA", ""]] + gene_symbols = sorted(set(gene_symbols)) + + if len(gene_symbols) > 0: + # Assuming you have defined and imported the gene_symbol_mapping function + # This function should perform the gene symbol mapping using pybiomart or any other method you have implemented. + # gene_symbol_mapping(markers_all) should return a DataFrame with the "Suggested.Symbol" column. + # For simplicity, I'll use a dummy function that returns an empty DataFrame here. + #def gene_symbol_mapping(gene_symbols): + #return pd.DataFrame(columns=["Suggested.Symbol"]) + + suppressMessages = lambda x: x # Suppressing messages in Python is not necessary + + markers_all = check_gene_symbols(gene_symbols) + markers_all = markers_all.dropna(subset=["Suggested.Symbol"]) + markers_all = sorted(set(markers_all["Suggested.Symbol"])) + + return ",".join(markers_all) + else: + return "" + + +def check_gene_symbols(x, unmapped_as_na=True, map=None, species="human"): + if species == "human": + dataset_name = 'hsapiens_gene_ensembl' + elif species == "mouse": + dataset_name = 'mmusculus_gene_ensembl' + else: + raise ValueError("Species must be 'human' or 'mouse'") + + biomart = Dataset(name=dataset_name, host='http://www.ensembl.org') + biomart_df = biomart.query(attributes=['external_gene_name', 'gene_biotype']) + + + casecorrection = False + if species == "human": + casecorrection = True + if map is None: + #lastupdate = biomart_df["date"].max() + #print("Maps last updated on:", lastupdate) + map = biomart_df[['Gene name', 'Gene type']].drop_duplicates() + elif species == "mouse" and map is None: + #lastupdate = biomart_df["date"].max() + #print("Maps last updated on:", lastupdate) + map = biomart_df[['Gene name', 'Gene type']].drop_duplicates() + else: + if map is None: + raise ValueError("If species is not 'human' or 'mouse', then map argument must be specified") + + if not isinstance(map, pd.DataFrame) or not set(map.columns) == {"Gene name", "Gene type"}: + raise ValueError("If map is specified, it must be a dataframe with two columns named 'external_gene_name' and 'gene_biotype'") + + approved = [sym in map["Gene name"].values for sym in x] + + if casecorrection: + x_casecorrected = [sym.upper() for sym in x] + x_casecorrected = [re.sub(r"(.*C[0-9XY]+)ORF(.+)", r"\1orf\2", sym) for sym in x_casecorrected] + else: + x_casecorrected = x.split() + + approvedaftercasecorrection = [sym in map["Gene name"].values for sym in x_casecorrected] + + if x != " ".join(x_casecorrected): + print("Human gene symbols should be all upper-case except for the 'orf' in open reading frames. The case of some letters was corrected.") + + alias = [sym in map["Gene name"].values for sym in x_casecorrected] + + suggested_symbols = [] + for i in range(len(x_casecorrected)): + if approved[i]: + suggested_symbols.append(x_casecorrected[i]) + else: + if alias[i]: + suggested_symbols.append(" /// ".join(map.loc[map["Gene name"] == x_casecorrected[i], "Gene name"])) + elif approvedaftercasecorrection[i]: + suggested_symbols.append(x_casecorrected[i]) + else: + suggested_symbols.append(None) + + df = pd.DataFrame({"x": x, "Approved": approved, "Suggested.Symbol": suggested_symbols}) + df["Approved"][df["x"].isnull()] = False + + if not unmapped_as_na: + df.loc[df["Suggested.Symbol"].isnull(), "Suggested.Symbol"] = df.loc[df["Suggested.Symbol"].isnull(), "x"] + + if df["Approved"].sum() != df.shape[0]: + print("x contains non-approved gene symbols") + + return df + + diff --git a/requirements.txt b/requirements.txt index 397ad68..6e6e41d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,10 +5,13 @@ keras lime louvain numpy +openpyxl pandas phate pot +pybiomart scanpy +scanorama scipy scikit-learn tensorflow diff --git a/setup.py b/setup.py index f2e7362..fe955d2 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="genomap", - version="1.2.11", + version="1.3.0", author="Md Tauhidul Islam", author_email="tauhid@stanford.edu", description="Genomap converts tabular gene expression data into spatially meaningful images.",