From 871d3bba272ea6b795c5821f2a8d091c15665270 Mon Sep 17 00:00:00 2001 From: YifanLu2000 Date: Wed, 3 Jul 2024 07:47:03 +0000 Subject: [PATCH 01/10] update spateo alignment --- spateo/alignment/methods/morpho.py | 16 ++++- spateo/alignment/methods/morpho_sparse.py | 10 +++ .../alignment/methods/morpho_sparse_utils.py | 61 ++++++++++++++----- spateo/alignment/methods/utils.py | 30 ++++++++- 4 files changed, 99 insertions(+), 18 deletions(-) diff --git a/spateo/alignment/methods/morpho.py b/spateo/alignment/methods/morpho.py index 286a674f..4e584b62 100755 --- a/spateo/alignment/methods/morpho.py +++ b/spateo/alignment/methods/morpho.py @@ -350,6 +350,17 @@ def BA_align( inlier_A = _data(nx, inlier_A, type_as) inlier_B = _data(nx, inlier_B, type_as) inlier_P = _data(nx, inlier_P, type_as) + else: + init_R = np.eye(D) + init_t = np.zeros((D,)) + inlier_A = np.zeros((4,D)) + inlier_B = np.zeros((4,D)) + inlier_P = np.ones((4,1)) + init_R = _data(nx, init_R, type_as) + init_t = _data(nx, init_t, type_as) + inlier_A = _data(nx, inlier_A, type_as) + inlier_B = _data(nx, inlier_B, type_as) + inlier_P = _data(nx, inlier_P, type_as) coarse_alignment = coordsA # Random select control points @@ -529,7 +540,10 @@ def BA_align( ) # Update R() - lambdaReg = 1e0 * Sp / nx.sum(inlier_P) + if nn_init: + lambdaReg = partial_robust_level * 1e0 * Sp / nx.sum(inlier_P) + else: + lambdaReg = 0 if SVI_mode: PXA, PVA, PXB = ( _dot(nx)(K_NA, coordsA)[None, :], diff --git a/spateo/alignment/methods/morpho_sparse.py b/spateo/alignment/methods/morpho_sparse.py index 723c7853..2e532894 100644 --- a/spateo/alignment/methods/morpho_sparse.py +++ b/spateo/alignment/methods/morpho_sparse.py @@ -90,6 +90,7 @@ def get_P_sparse( labelB: Optional[pd.Series] = None, label_transfer_prior: Optional[dict] = None, top_k: int = 1024, + dissimilarity: str = "kl", ): assert XnAHat.shape[1] == XnB.shape[1], "XnAHat and XnB do not have the same number of features." assert XnAHat.shape[0] == alpha.shape[0], "XnAHat and alpha do not have the same length." 
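Note: the `nn_init` fallback added to `BA_align` above starts the EM loop from a neutral rigid transform and switches off the rigid regularizer. A minimal NumPy sketch of the equivalent setup (the `_data`/`nx` backend casts are omitted, and `Sp`/`partial_robust_level` take illustrative values):

    import numpy as np

    D = 3                                       # spatial dimensionality (2 or 3)
    init_R, init_t = np.eye(D), np.zeros((D,))  # identity rotation, zero translation
    inlier_A = np.zeros((4, D))                 # placeholder inlier coordinates
    inlier_B = np.zeros((4, D))
    inlier_P = np.ones((4, 1))                  # uniform placeholder inlier weights

    nn_init = False
    Sp, partial_robust_level = 1.0, 25.0        # illustrative values for the variables in scope
    # Rigid regularization is only applied when a coarse (nn_init) alignment exists:
    lambdaReg = partial_robust_level * Sp / inlier_P.sum() if nn_init else 0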
@@ -118,6 +119,7 @@ def get_P_sparse( col_mul=(_mul(nx)(alpha, nx.exp(-Sigma / sigma2))), batch_capacity=batch_capacity, top_k=top_k, + dissimilarity=dissimilarity, ) K_NA = P.sum(1).to_dense() @@ -151,6 +153,7 @@ def BA_align_sparse( iter_key_added: Optional[str] = "iter_spatial", vecfld_key_added: Optional[str] = "VecFld_morpho", layer: str = "X", + use_rep: Optional[str] = None, dissimilarity: str = "kl", max_iter: int = 200, lambdaVF: Union[int, float] = 1e2, @@ -196,6 +199,7 @@ def BA_align_sparse( dtype=dtype, device=device, verbose=verbose, + use_rep=use_rep, ) coordsA, coordsB = spatial_coords[1], spatial_coords[0] X_A, X_B = exp_matrices[1], exp_matrices[0] @@ -351,6 +355,8 @@ def BA_align_sparse( Sigma=SigmaDiag, outlier_variance=outlier_variance, label_transfer_prior=label_transfer_prior, + dissimilarity=dissimilarity, + batch_capacity=batch_capacity, ) else: P, assignment_results = get_P_sparse( @@ -367,6 +373,8 @@ def BA_align_sparse( Sigma=SigmaDiag, outlier_variance=outlier_variance, label_transfer_prior=label_transfer_prior, + dissimilarity=dissimilarity, + batch_capacity=batch_capacity, ) # update temperature @@ -539,6 +547,8 @@ def BA_align_sparse( outlier_variance=outlier_variance, label_transfer_prior=label_transfer_prior, top_k=32, + dissimilarity=dissimilarity, + batch_capacity=batch_capacity, ) # Get optimal Rigid transformation optimal_RnA, optimal_R, optimal_t = get_optimal_R_sparse( diff --git a/spateo/alignment/methods/morpho_sparse_utils.py b/spateo/alignment/methods/morpho_sparse_utils.py index c26c1884..322df41a 100644 --- a/spateo/alignment/methods/morpho_sparse_utils.py +++ b/spateo/alignment/methods/morpho_sparse_utils.py @@ -107,11 +107,13 @@ def calc_P_related( label_transfer_prior: Optional[dict] = None, top_k: int = 1024, ): + labelA = None if labelA is None else labelA.values + labelB = None if labelA is None else labelB.values # metric = 'square_euc' NA, NB = XnAHat.shape[0], XnB.shape[0] D = XnAHat.shape[1] - batch_base = 1e7 - split_size = min(int(batch_capacity * batch_base / (NA * D)), NB) + batch_base = 1e9 + split_size = min(int(batch_capacity * batch_base/ (NA * G)), NB) split_size = 1 if split_size == 0 else split_size nx = ot.backend.get_backend(XnAHat, XnB) @@ -120,19 +122,15 @@ def calc_P_related( XnB_chunks = _split(nx, XnB, split_size, dim=0) X_B_chunks = _split(nx, X_B, split_size, dim=0) - label_mask = _construct_label_mask(labelA, labelB, label_transfer_prior).T - - mask_chunks_arr = _split(nx, nx.arange(NB), split_size, dim=0) + labelB_chunks = [None] * len(XnB_chunks) if labelB is None else _torch_like_split(labelB, split_size, dim=0) K_NA_spatial = nx.zeros((NA,), type_as=XnAHat) K_NA_sigma2 = nx.zeros((NA,), type_as=XnAHat) Ps = [] sigma2_temp = 0 cur_row = 0 - for XnB_chunk, X_B_chunk, mask_chunk_arr in zip(XnB_chunks, X_B_chunks, mask_chunks_arr): - label_mask_chunk = ( - None if labelA is None else _data(nx, label_mask[:, np.array(mask_chunk_arr)], type_as=XnB_chunk) - ) + for XnB_chunk, X_B_chunk, labelB_chunk in zip(XnB_chunks, X_B_chunks, labelB_chunks): + label_mask_chunk = (None if labelA is None else _construct_label_mask(nx, labelA, labelB_chunk, label_transfer_prior, XnB_chunk).T) # calculate distance matrix (common step) SpatialMat = _dist(XnAHat, XnB_chunk, "square_euc") @@ -272,12 +270,13 @@ def _init_guess_beta2( return beta2, beta2_end -def _construct_label_mask(labelA, labelB, label_transfer_prior): - label_mask = np.zeros((labelB.shape[0], labelA.shape[0])) +def _construct_label_mask(nx, labelA, labelB, 
label_transfer_prior, type_as): + label_mask = nx.zeros((labelB.shape[0], labelA.shape[0]), type_as=type_as) for k in label_transfer_prior.keys(): - idx = np.where((labelB == k))[0] - cur_P = labelA.map(label_transfer_prior[k]).values - label_mask[idx, :] = cur_P + idxB = np.where((labelB == k))[0] + for j in label_transfer_prior[k].keys(): + idxA = np.where((labelA == j))[0] + label_mask[idxB[:, None],idxA] = label_transfer_prior[k][j] return label_mask @@ -322,6 +321,20 @@ def _SparseTensor(nx, row, col, value, sparse_sizes): else: return coo_array((value, (row, col)), shape=sparse_sizes) +def _cos_similarity( + mat1: Union[np.ndarray, torch.Tensor], + mat2: Union[np.ndarray, torch.Tensor], +): + nx = ot.backend.get_backend(mat1, mat2) + if nx_torch(nx): + torch_cos = torch.nn.CosineSimilarity(dim=1) + mat1_unsqueeze = mat1.unsqueeze(-1) + mat2_unsqueeze = mat2.unsqueeze(-1).transpose(0,2) + distMat = -torch_cos(mat1_unsqueeze, mat2_unsqueeze) * 0.5 + 0.5 + else: + distMat = (ot.dist(mat1, mat2, metric='cosine'))*0.5 + return distMat + def _dist( mat1: Union[np.ndarray, torch.Tensor], @@ -334,6 +347,8 @@ def _dist( "square_euc", "square_euclidean", "kl", + "cos", + "cosine", ], "``metric`` value is wrong. Available ``metric`` are: ``'euc'``, ``'euclidean'``, ``'square_euc'``, ``'square_euclidean'``, and ``'kl'``." nx = ot.backend.get_backend(mat1, mat2) if ( @@ -357,9 +372,12 @@ def _dist( - _dot(nx)(mat1, nx.log(mat2).T) - _dot(nx)(mat2, nx.log(mat1).T).T ) / 2 + elif (metric.lower() == "cos" or metric.lower() == "cosine"): + distMat = _cos_similarity(mat1, mat2) return distMat + # Check if nx is a torch backend nx_torch = lambda nx: True if isinstance(nx, ot.backend.TorchBackend) else False _cat = lambda nx, x, dim: torch.cat(x, dim=dim) if nx_torch(nx) else np.concatenate(x, axis=dim) @@ -370,6 +388,21 @@ def _dist( if nx_torch(nx) else np.array_split(x, chunk_size, axis=dim) ) +def _torch_like_split(arr, size, dim=0): + if dim < 0: + dim += arr.ndim + shape = arr.shape + arr = np.swapaxes(arr, dim, -1) + flat_arr = arr.reshape(-1, shape[dim]) + num_splits = flat_arr.shape[-1] // size + remainder = flat_arr.shape[-1] % size + splits = np.array_split(flat_arr[:, :num_splits * size], num_splits, axis=-1) + if remainder: + splits.append(flat_arr[:, num_splits * size:]) + splits = [np.swapaxes(split.reshape(*shape[:dim], -1, *shape[dim+1:]), dim, -1) for split in splits] + + return splits + _where = lambda nx, condition: torch.where(condition) if nx_torch(nx) else np.where(condition) _repeat_interleave = ( lambda nx, x, repeats, axis: torch.repeat_interleave(x, repeats, dim=axis) diff --git a/spateo/alignment/methods/utils.py b/spateo/alignment/methods/utils.py index 5752323b..3b11e2f0 100755 --- a/spateo/alignment/methods/utils.py +++ b/spateo/alignment/methods/utils.py @@ -200,6 +200,7 @@ def align_preprocess( genes: Optional[Union[list, np.ndarray]] = None, spatial_key: str = "spatial", layer: str = "X", + use_rep: Optional[str] = None, normalize_c: bool = False, normalize_g: bool = False, select_high_exp_genes: Union[bool, float, int] = False, @@ -242,7 +243,10 @@ def align_preprocess( new_samples = [s[:, common_genes] for s in new_samples] # Gene expression matrix of all samples - exp_matrices = [nx.from_numpy(check_exp(sample=s, layer=layer), type_as=type_as) for s in new_samples] + if (use_rep is None) or (not isinstance(use_rep, str)) or (use_rep not in samples[0].obsm.keys()) or (use_rep not in samples[1].obsm.keys()): + exp_matrices = [nx.from_numpy(check_exp(sample=s, 
layer=layer), type_as=type_as) for s in new_samples] + else: + exp_matrices = [nx.from_numpy(s.obsm[use_rep], type_as=type_as) for s in new_samples] + [nx.from_numpy(check_exp(sample=s, layer=layer), type_as=type_as) for s in new_samples] if not (select_high_exp_genes is False): # Select significance genes if select_high_exp_genes is True ExpressionData = _cat(nx=nx, x=exp_matrices, dim=0) @@ -267,7 +271,7 @@ def align_preprocess( spatial_coords, normalize_scale_list, normalize_mean_list = normalize_coords( coords=spatial_coords, nx=nx, verbose=verbose ) - if normalize_g: + if normalize_g and ((use_rep is None) or (not isinstance(use_rep, str)) or (use_rep not in samples[0].obsm.keys()) or (use_rep not in samples[1].obsm.keys())): exp_matrices = normalize_exps(matrices=exp_matrices, nx=nx, verbose=verbose) return ( @@ -503,6 +507,8 @@ def calc_exp_dissimilarity( "kl", "euclidean", "euc", + "cos", + "cosine" ], "``dissimilarity`` value is wrong. Available ``dissimilarity`` are: ``'kl'``, ``'euclidean'`` and ``'euc'``." if dissimilarity.lower() == "kl": X_A = X_A + 0.01 @@ -686,6 +692,20 @@ def get_optimal_R( ############################### +def _cos_similarity( + mat1: Union[np.ndarray, torch.Tensor], + mat2: Union[np.ndarray, torch.Tensor], +): + nx = ot.backend.get_backend(mat1, mat2) + if nx_torch(nx): + torch_cos = torch.nn.CosineSimilarity(dim=1) + mat1_unsqueeze = mat1.unsqueeze(-1) + mat2_unsqueeze = mat2.unsqueeze(-1).transpose(0,2) + distMat = torch_cos(mat1_unsqueeze, mat2_unsqueeze) * 0.5 + 0.5 + else: + distMat = (-ot.dist(mat1, mat2, metric='cosine')+1)*0.5 + 0.5 + return distMat + def _dist( mat1: Union[np.ndarray, torch.Tensor], mat2: Union[np.ndarray, torch.Tensor], @@ -695,17 +715,21 @@ def _dist( "euc", "euclidean", "kl", + "cos", + "cosine" ], "``metric`` value is wrong. Available ``metric`` are: ``'euc'``, ``'euclidean'`` and ``'kl'``." 
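# Note: the "kl" branch below evaluates a symmetrized KL divergence between
# rows a_i of mat1 and b_j of mat2,
#     D_ij = (KL(a_i || b_j) + KL(b_j || a_i)) / 2
#          = (sum_k a_ik*log(a_ik) + sum_k b_jk*log(b_jk)
#             - sum_k a_ik*log(b_jk) - sum_k b_jk*log(a_ik)) / 2,
# which is why the two entropy terms are broadcast along rows/columns and the
# two cross terms are computed as matrix products in the expression below.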
nx = ot.backend.get_backend(mat1, mat2) if metric.lower() == "euc" or metric.lower() == "euclidean": distMat = nx.sum(mat1**2, 1)[:, None] + nx.sum(mat2**2, 1)[None, :] - 2 * _dot(nx)(mat1, mat2.T) - else: + elif metric.lower() == "kl": distMat = ( nx.sum(mat1 * nx.log(mat1), 1)[:, None] + nx.sum(mat2 * nx.log(mat2), 1)[None, :] - _dot(nx)(mat1, nx.log(mat2).T) - _dot(nx)(mat2, nx.log(mat1).T).T ) / 2 + elif (metric.lower() == "cosine") or (metric.lower() == "cos"): + distMat = _cos_similarity(mat1, mat2) return distMat From ebe8b325fb369dc3017b3332c6cead8ba55bdbf9 Mon Sep 17 00:00:00 2001 From: YifanLu2000 Date: Wed, 3 Jul 2024 20:47:52 +0000 Subject: [PATCH 02/10] fix a bug --- spateo/alignment/methods/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spateo/alignment/methods/utils.py b/spateo/alignment/methods/utils.py index 3b11e2f0..03f8bfce 100755 --- a/spateo/alignment/methods/utils.py +++ b/spateo/alignment/methods/utils.py @@ -140,13 +140,14 @@ def normalize_coords( # normalize_scale now becomes to a list normalize_scale_list = [] + normalize_scale_list = nx.zeros((len(coords, )), type_as=coords[0]) normalize_mean_list = [] for i in range(len(coords)): normalize_mean = nx.einsum("ij->j", coords[i]) / coords[i].shape[0] normalize_mean_list.append(normalize_mean) coords[i] -= normalize_mean normalize_scale = nx.sqrt(nx.einsum("ij->", nx.einsum("ij,ij->ij", coords[i], coords[i])) / coords[i].shape[0]) - normalize_scale_list.append(normalize_scale) + normalize_scale_list[i] = normalize_scale if coords[0].shape[1] == 2: normalize_scale = nx.mean(normalize_scale_list) From f8b6b049f53c60c7a0da1ace9bcf258e74730a22 Mon Sep 17 00:00:00 2001 From: YifanLu2000 Date: Sat, 6 Jul 2024 03:09:57 +0000 Subject: [PATCH 03/10] fix bugs for chunk --- spateo/alignment/methods/morpho_sparse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spateo/alignment/methods/morpho_sparse.py b/spateo/alignment/methods/morpho_sparse.py index 2e532894..b5454df8 100644 --- a/spateo/alignment/methods/morpho_sparse.py +++ b/spateo/alignment/methods/morpho_sparse.py @@ -176,6 +176,7 @@ def BA_align_sparse( batch_size: int = 1024, use_sparse: bool = True, pre_compute_dist: bool = False, + batch_capacity: int = 1, ) -> Tuple[Optional[Tuple[AnnData, AnnData]], np.ndarray, np.ndarray]: empty_cache(device=device) # Preprocessing and extract the spatial and expression information From dd0994b7f656f1b844a5794e09786960e5d014d5 Mon Sep 17 00:00:00 2001 From: YifanLu2000 Date: Sat, 6 Jul 2024 03:16:22 +0000 Subject: [PATCH 04/10] add docstrings --- spateo/alignment/methods/morpho.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spateo/alignment/methods/morpho.py b/spateo/alignment/methods/morpho.py index 4e584b62..d3c82283 100755 --- a/spateo/alignment/methods/morpho.py +++ b/spateo/alignment/methods/morpho.py @@ -240,7 +240,7 @@ def BA_align( batch_size: int = 1000, partial_robust_level: float = 25, ) -> Tuple[Optional[Tuple[AnnData, AnnData]], np.ndarray, np.ndarray]: - """_summary_ + """core function for spateo pairwise alignment Args: sampleA: Sample A that acts as reference. 
@@ -353,9 +353,9 @@ def BA_align( else: init_R = np.eye(D) init_t = np.zeros((D,)) - inlier_A = np.zeros((4,D)) - inlier_B = np.zeros((4,D)) - inlier_P = np.ones((4,1)) + inlier_A = np.zeros((4, D)) + inlier_B = np.zeros((4, D)) + inlier_P = np.ones((4, 1)) init_R = _data(nx, init_R, type_as) init_t = _data(nx, init_t, type_as) inlier_A = _data(nx, inlier_A, type_as) From a78beef265278641b48cf1d2df2be9355887021d Mon Sep 17 00:00:00 2001 From: YifanLu2000 Date: Sat, 6 Jul 2024 03:21:52 +0000 Subject: [PATCH 05/10] black format --- .../alignment/methods/morpho_sparse_utils.py | 29 +++++++++------ spateo/alignment/methods/utils.py | 36 ++++++++++++++----- 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/spateo/alignment/methods/morpho_sparse_utils.py b/spateo/alignment/methods/morpho_sparse_utils.py index 322df41a..3e041ff7 100644 --- a/spateo/alignment/methods/morpho_sparse_utils.py +++ b/spateo/alignment/methods/morpho_sparse_utils.py @@ -113,7 +113,7 @@ def calc_P_related( NA, NB = XnAHat.shape[0], XnB.shape[0] D = XnAHat.shape[1] batch_base = 1e9 - split_size = min(int(batch_capacity * batch_base/ (NA * G)), NB) + split_size = min(int(batch_capacity * batch_base / (NA * G)), NB) split_size = 1 if split_size == 0 else split_size nx = ot.backend.get_backend(XnAHat, XnB) @@ -130,7 +130,11 @@ def calc_P_related( sigma2_temp = 0 cur_row = 0 for XnB_chunk, X_B_chunk, labelB_chunk in zip(XnB_chunks, X_B_chunks, labelB_chunks): - label_mask_chunk = (None if labelA is None else _construct_label_mask(nx, labelA, labelB_chunk, label_transfer_prior, XnB_chunk).T) + label_mask_chunk = ( + None + if labelA is None + else _construct_label_mask(nx, labelA, labelB_chunk, label_transfer_prior, XnB_chunk).T + ) # calculate distance matrix (common step) SpatialMat = _dist(XnAHat, XnB_chunk, "square_euc") @@ -276,7 +280,7 @@ def _construct_label_mask(nx, labelA, labelB, label_transfer_prior, type_as): idxB = np.where((labelB == k))[0] for j in label_transfer_prior[k].keys(): idxA = np.where((labelA == j))[0] - label_mask[idxB[:, None],idxA] = label_transfer_prior[k][j] + label_mask[idxB[:, None], idxA] = label_transfer_prior[k][j] return label_mask @@ -321,6 +325,7 @@ def _SparseTensor(nx, row, col, value, sparse_sizes): else: return coo_array((value, (row, col)), shape=sparse_sizes) + def _cos_similarity( mat1: Union[np.ndarray, torch.Tensor], mat2: Union[np.ndarray, torch.Tensor], @@ -329,10 +334,10 @@ def _cos_similarity( if nx_torch(nx): torch_cos = torch.nn.CosineSimilarity(dim=1) mat1_unsqueeze = mat1.unsqueeze(-1) - mat2_unsqueeze = mat2.unsqueeze(-1).transpose(0,2) + mat2_unsqueeze = mat2.unsqueeze(-1).transpose(0, 2) distMat = -torch_cos(mat1_unsqueeze, mat2_unsqueeze) * 0.5 + 0.5 else: - distMat = (ot.dist(mat1, mat2, metric='cosine'))*0.5 + distMat = (ot.dist(mat1, mat2, metric="cosine")) * 0.5 return distMat @@ -372,12 +377,11 @@ def _dist( - _dot(nx)(mat1, nx.log(mat2).T) - _dot(nx)(mat2, nx.log(mat1).T).T ) / 2 - elif (metric.lower() == "cos" or metric.lower() == "cosine"): + elif metric.lower() == "cos" or metric.lower() == "cosine": distMat = _cos_similarity(mat1, mat2) return distMat - # Check if nx is a torch backend nx_torch = lambda nx: True if isinstance(nx, ot.backend.TorchBackend) else False _cat = lambda nx, x, dim: torch.cat(x, dim=dim) if nx_torch(nx) else np.concatenate(x, axis=dim) @@ -388,6 +392,8 @@ def _dist( if nx_torch(nx) else np.array_split(x, chunk_size, axis=dim) ) + + def _torch_like_split(arr, size, dim=0): if dim < 0: dim += arr.ndim @@ -396,13 +402,14 @@ 
def _torch_like_split(arr, size, dim=0): flat_arr = arr.reshape(-1, shape[dim]) num_splits = flat_arr.shape[-1] // size remainder = flat_arr.shape[-1] % size - splits = np.array_split(flat_arr[:, :num_splits * size], num_splits, axis=-1) + splits = np.array_split(flat_arr[:, : num_splits * size], num_splits, axis=-1) if remainder: - splits.append(flat_arr[:, num_splits * size:]) - splits = [np.swapaxes(split.reshape(*shape[:dim], -1, *shape[dim+1:]), dim, -1) for split in splits] - + splits.append(flat_arr[:, num_splits * size :]) + splits = [np.swapaxes(split.reshape(*shape[:dim], -1, *shape[dim + 1 :]), dim, -1) for split in splits] + return splits + _where = lambda nx, condition: torch.where(condition) if nx_torch(nx) else np.where(condition) _repeat_interleave = ( lambda nx, x, repeats, axis: torch.repeat_interleave(x, repeats, dim=axis) diff --git a/spateo/alignment/methods/utils.py b/spateo/alignment/methods/utils.py index 03f8bfce..1a0ccfbe 100755 --- a/spateo/alignment/methods/utils.py +++ b/spateo/alignment/methods/utils.py @@ -140,7 +140,14 @@ def normalize_coords( # normalize_scale now becomes to a list normalize_scale_list = [] - normalize_scale_list = nx.zeros((len(coords, )), type_as=coords[0]) + normalize_scale_list = nx.zeros( + ( + len( + coords, + ) + ), + type_as=coords[0], + ) normalize_mean_list = [] for i in range(len(coords)): normalize_mean = nx.einsum("ij->j", coords[i]) / coords[i].shape[0] @@ -244,10 +251,17 @@ def align_preprocess( new_samples = [s[:, common_genes] for s in new_samples] # Gene expression matrix of all samples - if (use_rep is None) or (not isinstance(use_rep, str)) or (use_rep not in samples[0].obsm.keys()) or (use_rep not in samples[1].obsm.keys()): + if ( + (use_rep is None) + or (not isinstance(use_rep, str)) + or (use_rep not in samples[0].obsm.keys()) + or (use_rep not in samples[1].obsm.keys()) + ): exp_matrices = [nx.from_numpy(check_exp(sample=s, layer=layer), type_as=type_as) for s in new_samples] else: - exp_matrices = [nx.from_numpy(s.obsm[use_rep], type_as=type_as) for s in new_samples] + [nx.from_numpy(check_exp(sample=s, layer=layer), type_as=type_as) for s in new_samples] + exp_matrices = [nx.from_numpy(s.obsm[use_rep], type_as=type_as) for s in new_samples] + [ + nx.from_numpy(check_exp(sample=s, layer=layer), type_as=type_as) for s in new_samples + ] if not (select_high_exp_genes is False): # Select significance genes if select_high_exp_genes is True ExpressionData = _cat(nx=nx, x=exp_matrices, dim=0) @@ -272,7 +286,12 @@ def align_preprocess( spatial_coords, normalize_scale_list, normalize_mean_list = normalize_coords( coords=spatial_coords, nx=nx, verbose=verbose ) - if normalize_g and ((use_rep is None) or (not isinstance(use_rep, str)) or (use_rep not in samples[0].obsm.keys()) or (use_rep not in samples[1].obsm.keys())): + if normalize_g and ( + (use_rep is None) + or (not isinstance(use_rep, str)) + or (use_rep not in samples[0].obsm.keys()) + or (use_rep not in samples[1].obsm.keys()) + ): exp_matrices = normalize_exps(matrices=exp_matrices, nx=nx, verbose=verbose) return ( @@ -509,7 +528,7 @@ def calc_exp_dissimilarity( "euclidean", "euc", "cos", - "cosine" + "cosine", ], "``dissimilarity`` value is wrong. Available ``dissimilarity`` are: ``'kl'``, ``'euclidean'`` and ``'euc'``." 
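# Note on the new "cos"/"cosine" option: cosine similarity s in [-1, 1] is
# rescaled into a bounded dissimilarity before use. A minimal NumPy sketch,
# assuming the d = (1 - s) / 2 convention used in morpho_sparse_utils (so
# identical rows give 0 and opposite rows give 1):
#     Xn = X_A / np.maximum(np.linalg.norm(X_A, axis=1, keepdims=True), 1e-8)
#     Yn = X_B / np.maximum(np.linalg.norm(X_B, axis=1, keepdims=True), 1e-8)
#     GeneDistMat = (1.0 - Xn @ Yn.T) / 2.0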
if dissimilarity.lower() == "kl": X_A = X_A + 0.01 @@ -701,12 +720,13 @@ def _cos_similarity( if nx_torch(nx): torch_cos = torch.nn.CosineSimilarity(dim=1) mat1_unsqueeze = mat1.unsqueeze(-1) - mat2_unsqueeze = mat2.unsqueeze(-1).transpose(0,2) + mat2_unsqueeze = mat2.unsqueeze(-1).transpose(0, 2) distMat = torch_cos(mat1_unsqueeze, mat2_unsqueeze) * 0.5 + 0.5 else: - distMat = (-ot.dist(mat1, mat2, metric='cosine')+1)*0.5 + 0.5 + distMat = (-ot.dist(mat1, mat2, metric="cosine") + 1) * 0.5 + 0.5 return distMat + def _dist( mat1: Union[np.ndarray, torch.Tensor], mat2: Union[np.ndarray, torch.Tensor], @@ -717,7 +737,7 @@ def _dist( "euclidean", "kl", "cos", - "cosine" + "cosine", ], "``metric`` value is wrong. Available ``metric`` are: ``'euc'``, ``'euclidean'`` and ``'kl'``." nx = ot.backend.get_backend(mat1, mat2) if metric.lower() == "euc" or metric.lower() == "euclidean": From b5b2acf6a427e032370a563004359995a1db0e98 Mon Sep 17 00:00:00 2001 From: YifanLu2000 Date: Sat, 6 Jul 2024 03:47:50 +0000 Subject: [PATCH 06/10] fix bugs --- spateo/alignment/methods/morpho_sparse_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spateo/alignment/methods/morpho_sparse_utils.py b/spateo/alignment/methods/morpho_sparse_utils.py index 3e041ff7..cea8a2e3 100644 --- a/spateo/alignment/methods/morpho_sparse_utils.py +++ b/spateo/alignment/methods/morpho_sparse_utils.py @@ -112,6 +112,7 @@ def calc_P_related( # metric = 'square_euc' NA, NB = XnAHat.shape[0], XnB.shape[0] D = XnAHat.shape[1] + G = X_A.shape[1] batch_base = 1e9 split_size = min(int(batch_capacity * batch_base / (NA * G)), NB) split_size = 1 if split_size == 0 else split_size From e9aaa785fae76184ed7c90be9dbc3e2806ff5953 Mon Sep 17 00:00:00 2001 From: YifanLu2000 Date: Sun, 7 Jul 2024 04:34:10 +0000 Subject: [PATCH 07/10] update Spateo alignment --- spateo/alignment/methods/morpho.py | 56 ++++++++++++++++-- spateo/alignment/methods/morpho_sparse.py | 2 +- .../alignment/methods/morpho_sparse_utils.py | 59 +++++++++++++++++-- spateo/alignment/methods/utils.py | 53 +++++++++++++++-- 4 files changed, 154 insertions(+), 16 deletions(-) diff --git a/spateo/alignment/methods/morpho.py b/spateo/alignment/methods/morpho.py index d3c82283..3c352cd3 100755 --- a/spateo/alignment/methods/morpho.py +++ b/spateo/alignment/methods/morpho.py @@ -36,6 +36,7 @@ coarse_rigid_alignment, empty_cache, get_optimal_R, + nx_torch, ) @@ -80,6 +81,7 @@ def get_P( SpatialDistMat: Union[np.ndarray, torch.Tensor], samples_s: Optional[List[float]] = None, outlier_variance: float = None, + label_dist: Optional[Union[np.ndarray, torch.Tensor]] = None, ) -> Tuple[Any, Any, Any]: """Calculating the generating probability matrix P. 
@@ -119,6 +121,7 @@ def get_P( exp_SpatialMat, (_mul(nx)(alpha, nx.exp(-Sigma / sigma2))), ) + spatial_term1 = spatial_term1 * label_dist if label_dist is not None else spatial_term1 spatial_outlier = _power(nx)((2 * _pi(nx) * sigma2), _data(nx, D / 2, XnAHat)) * (1 - gamma) / (gamma * outlier_s) spatial_term2 = spatial_outlier + nx.einsum("ij->j", spatial_term1) spatial_P = spatial_term1 / _unsqueeze(nx)(spatial_term2, 0) @@ -128,6 +131,7 @@ def get_P( _mul(nx)(nx.exp(-SpatialDistMat / (2 * sigma2)), nx.exp(-GeneDistMat / (2 * beta2))), (_mul(nx)(alpha, nx.exp(-Sigma / sigma2))), ) + term1 = term1 * label_dist if label_dist is not None else term1 P = term1 / (_unsqueeze(nx)(nx.einsum("ij->j", term1), 0) + 1e-8) P = nx.einsum("j,ij->ij", spatial_inlier, P) @@ -155,6 +159,7 @@ def get_P_chunk( outlier_variance: float = None, chunk_size: int = 1000, dissimilarity: str = "kl", + label_dist: Optional[Union[np.ndarray, torch.Tensor]] = None, ) -> Union[np.ndarray, torch.Tensor]: """Calculating the generating probability matrix P. @@ -183,9 +188,9 @@ def get_P_chunk( # chunk X_Bs = _chunk(nx, X_B, chunk_num, dim=0) XnBs = _chunk(nx, XnB, chunk_num, dim=0) - + label_dists = _chunk(nx, label_dist, chunk_num, dim=1) if label_dist is not None else [None] * len(XnBs) Ps = [] - for x_Bs, xnBs in zip(X_Bs, XnBs): + for x_Bs, xnBs, l_d in zip(X_Bs, XnBs, label_dists): SpatialDistMat = cal_dist(XnAHat, xnBs) GeneDistMat = calc_exp_dissimilarity(X_A=X_A, X_B=x_Bs, dissimilarity=dissimilarity) if outlier_variance is None: @@ -197,6 +202,7 @@ def get_P_chunk( exp_SpatialMat, (_mul(nx)(alpha, nx.exp(-Sigma / sigma2))), ) + spatial_term1 = spatial_term1 * l_d if l_d is not None else spatial_term1 spatial_outlier = ( _power(nx)((2 * _pi(nx) * sigma2), _data(nx, D / 2, XnAHat)) * (1 - gamma) / (gamma * outlier_s) ) @@ -206,6 +212,7 @@ def get_P_chunk( _mul(nx)(nx.exp(-SpatialDistMat / (2 * sigma2)), nx.exp(-GeneDistMat / (2 * beta2))), (_mul(nx)(alpha, nx.exp(-Sigma / sigma2))), ) + term1 = term1 * l_d if l_d is not None else term1 P = term1 / (_unsqueeze(nx)(nx.einsum("ij->j", term1), 0) + 1e-8) P = nx.einsum("j,ij->ij", spatial_inlier, P) Ps.append(P) @@ -239,6 +246,12 @@ def BA_align( SVI_mode: bool = True, batch_size: int = 1000, partial_robust_level: float = 25, + use_rep: Optional[str] = None, + use_label_prior: bool = False, + label_key: Optional[str] = "cluster", + label_transfer_prior: Optional[dict] = None, + beta2: Optional[float] = None, + beta2_end: Optional[float] = None, ) -> Tuple[Optional[Tuple[AnnData, AnnData]], np.ndarray, np.ndarray]: """core function for spateo pairwise alignment @@ -293,8 +306,26 @@ def BA_align( dtype=dtype, device=device, verbose=verbose, + use_rep=use_rep, ) + if use_label_prior: + catB = sampleA.obs[label_key].cat.categories.tolist() + catA = sampleB.obs[label_key].cat.categories.tolist() + label_transfer = np.zeros((len(catA), len(catB)), dtype=np.float32) + + for j, ca in enumerate(catA): + for k, cb in enumerate(catB): + label_transfer[j, k] = label_transfer_prior[ca][cb] + label_transfer = nx.from_numpy(label_transfer, type_as=type_as) + labelA = nx.from_numpy(np.array(sampleB.obs[label_key].cat.codes.values, dtype=np.int64)) + labelB = nx.from_numpy(np.array(sampleA.obs[label_key].cat.codes.values, dtype=np.int64)) + if nx_torch(nx): + labelA = labelA.to(type_as.device) + labelB = labelB.to(type_as.device) + else: + label_transfer, labelA, labelB = None, None, None + coordsA, coordsB = spatial_coords[1], spatial_coords[0] X_A, X_B = exp_matrices[1], 
exp_matrices[0] del spatial_coords, exp_matrices @@ -400,20 +431,29 @@ def BA_align( R = _identity(nx, D, type_as) minGeneDistMat = nx.min(GeneDistMat, 1) # Automatically determine the value of beta2 - beta2 = minGeneDistMat[nx.argsort(minGeneDistMat)[int(GeneDistMat.shape[0] * 0.05)]] / 5 - beta2_end = nx.max(minGeneDistMat) / 5 + beta2 = ( + minGeneDistMat[nx.argsort(minGeneDistMat)[int(GeneDistMat.shape[0] * 0.05)]] / 5 + if beta2 is None + else _data(nx, data=beta2, type_as=type_as) + ) + beta2_end = nx.max(minGeneDistMat) / 5 if beta2_end is None else _data(nx, data=beta2_end, type_as=type_as) del minGeneDistMat if sub_sample: del sub_X_A, sub_X_B, GeneDistMat # The value of beta2 becomes progressively larger beta2 = nx.maximum(beta2, _data(nx, 1e-2, type_as)) beta2_decrease = _power(nx)(beta2_end / beta2, 1 / (50)) - + print("beta2: {} --> {}".format(beta2, beta2_end)) # Use smaller spatial variance to reduce tails outlier_variance = 1 max_outlier_variance = partial_robust_level # 20 outlier_variance_decrease = _power(nx)(_data(nx, max_outlier_variance, type_as), 1 / (max_iter / 2)) + if use_label_prior: + label_dist = label_transfer[labelA, :][:, labelB] + else: + label_dist = None + if SVI_mode: SVI_deacy = _data(nx, 10.0, type_as) # Select a random subset of data @@ -428,6 +468,7 @@ def BA_align( else: randGeneDistMat = GeneDistMat[:, randIdx] # NA x batch_size SpatialDistMat = SpatialDistMat[:, randIdx] # NA x batch_size + randlabel_dist = label_dist[:, randIdx] if use_label_prior else None Sp, Sp_spatial, Sp_sigma2 = 0, 0, 0 SigmaInv = nx.zeros((K, K), type_as=type_as) # K x K PXB_term = nx.zeros((NA, D), type_as=type_as) # NA x D @@ -462,6 +503,7 @@ def BA_align( GeneDistMat=randGeneDistMat, SpatialDistMat=SpatialDistMat, outlier_variance=outlier_variance, + label_dist=randlabel_dist, ) else: P, spatial_P, sigma2_P = get_P( @@ -475,6 +517,7 @@ def BA_align( GeneDistMat=GeneDistMat, SpatialDistMat=SpatialDistMat, outlier_variance=outlier_variance, + label_dist=label_dist, ) if iter > 5: @@ -624,6 +667,7 @@ def BA_align( else: randGeneDistMat = GeneDistMat[:, randIdx] # NA x batch_size SpatialDistMat = cal_dist(XAHat, randcoordsB) + randlabel_dist = label_dist[:, randIdx] if use_label_prior else None empty_cache(device=device) # full data @@ -639,6 +683,8 @@ def BA_align( gamma=gamma, Sigma=SigmaDiag, outlier_variance=outlier_variance, + label_dist=label_dist, + dissimilarity=dissimilarity, ) # Get optimal Rigid transformation optimal_RnA, optimal_R, optimal_t = get_optimal_R( diff --git a/spateo/alignment/methods/morpho_sparse.py b/spateo/alignment/methods/morpho_sparse.py index b5454df8..22467ee6 100644 --- a/spateo/alignment/methods/morpho_sparse.py +++ b/spateo/alignment/methods/morpho_sparse.py @@ -547,7 +547,7 @@ def BA_align_sparse( Sigma=SigmaDiag, outlier_variance=outlier_variance, label_transfer_prior=label_transfer_prior, - top_k=32, + top_k=512, dissimilarity=dissimilarity, batch_capacity=batch_capacity, ) diff --git a/spateo/alignment/methods/morpho_sparse_utils.py b/spateo/alignment/methods/morpho_sparse_utils.py index cea8a2e3..3625e220 100644 --- a/spateo/alignment/methods/morpho_sparse_utils.py +++ b/spateo/alignment/methods/morpho_sparse_utils.py @@ -29,7 +29,9 @@ def calc_distance( "square_euc", "square_euclidean", "kl", - ], "``metric`` value is wrong. Available ``metric`` are: ``'euc'``, ``'euclidean'``, ``'square_euc'``, ``'square_euclidean'``, and ``'kl'``." + "cos", + "cosine", + ], "``metric`` value is wrong. 
Available ``metric`` are: ``'euc'``, ``'euclidean'``, ``'square_euc'``, ``'square_euclidean'``, ``'kl'``, ``'cos'``, and ``'cosine'``." if use_sparse: assert sparse_method in [ @@ -327,18 +329,63 @@ def _SparseTensor(nx, row, col, value, sparse_sizes): return coo_array((value, (row, col)), shape=sparse_sizes) +def _cosine_distance_backend( + X: Union[np.ndarray, torch.Tensor], + Y: Union[np.ndarray, torch.Tensor], + eps: float = 1e-8, +) -> Union[np.ndarray, torch.Tensor]: + """ + Compute the pairwise cosine similarity between all pairs of samples in matrices X and Y. + + Parameters + ---------- + X : np.ndarray or torch.Tensor + Matrix with shape (N, D), where each row represents a sample. + Y : np.ndarray or torch.Tensor + Matrix with shape (M, D), where each row represents a sample. + eps : float, optional + A small value to avoid division by zero. Default is 1e-8. + + Returns + ------- + np.ndarray or torch.Tensor + Pairwise cosine similarity matrix with shape (N, M). + + Raises + ------ + AssertionError + If the number of features in X and Y do not match. + """ + + assert X.shape[1] == Y.shape[1], "X and Y do not have the same number of features." + + # Get the appropriate backend (either NumPy or PyTorch) + nx = ot.backend.get_backend(X, Y) + + # Normalize rows to unit vectors + X_norm = nx.sqrt(nx.sum(X**2, axis=1, keepdims=True)) + Y_norm = nx.sqrt(nx.sum(Y**2, axis=1, keepdims=True)) + X = X / nx.maximum(X_norm, eps) + Y = Y / nx.maximum(Y_norm, eps) + + # Compute cosine similarity + D = nx.dot(X, Y.T) + + return D + + def _cos_similarity( mat1: Union[np.ndarray, torch.Tensor], mat2: Union[np.ndarray, torch.Tensor], ): nx = ot.backend.get_backend(mat1, mat2) if nx_torch(nx): - torch_cos = torch.nn.CosineSimilarity(dim=1) - mat1_unsqueeze = mat1.unsqueeze(-1) - mat2_unsqueeze = mat2.unsqueeze(-1).transpose(0, 2) - distMat = -torch_cos(mat1_unsqueeze, mat2_unsqueeze) * 0.5 + 0.5 + # torch_cos = torch.nn.CosineSimilarity(dim=1) + # mat1_unsqueeze = mat1.unsqueeze(-1) + # mat2_unsqueeze = mat2.unsqueeze(-1).transpose(0, 2) + distMat = -_cosine_distance_backend(mat1, mat2) * 0.5 + 0.5 else: - distMat = (ot.dist(mat1, mat2, metric="cosine")) * 0.5 + distMat = (-ot.dist(mat1, mat2, metric="cosine") + 1) * 0.5 + 0.5 return distMat diff --git a/spateo/alignment/methods/utils.py b/spateo/alignment/methods/utils.py index 1a0ccfbe..ca3816cd 100755 --- a/spateo/alignment/methods/utils.py +++ b/spateo/alignment/methods/utils.py @@ -712,16 +712,61 @@ def get_optimal_R( ############################### +def _cosine_distance_backend( + X: Union[np.ndarray, torch.Tensor], + Y: Union[np.ndarray, torch.Tensor], + eps: float = 1e-8, +) -> Union[np.ndarray, torch.Tensor]: + """ + Compute the pairwise cosine similarity between all pairs of samples in matrices X and Y. + + Parameters + ---------- + X : np.ndarray or torch.Tensor + Matrix with shape (N, D), where each row represents a sample. + Y : np.ndarray or torch.Tensor + Matrix with shape (M, D), where each row represents a sample. + eps : float, optional + A small value to avoid division by zero. Default is 1e-8. + + Returns + ------- + np.ndarray or torch.Tensor + Pairwise cosine similarity matrix with shape (N, M). + + Raises + ------ + AssertionError + If the number of features in X and Y do not match. + """ + + assert X.shape[1] == Y.shape[1], "X and Y do not have the same number of features." 
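# Illustrative usage (hypothetical inputs, not from the patch): for X of shape
# (N, D) and Y of shape (M, D) the return value is the (N, M) matrix of
# pairwise cosine similarities, e.g.
#     X = np.array([[1.0, 0.0], [0.0, 1.0]]); Y = np.array([[1.0, 0.0]])
#     _cosine_distance_backend(X, Y)  # -> [[1.0], [0.0]]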
+ + # Get the appropriate backend (either NumPy or PyTorch) + nx = ot.backend.get_backend(X, Y) + + # Normalize rows to unit vectors + X_norm = nx.sqrt(nx.sum(X**2, axis=1, keepdims=True)) + Y_norm = nx.sqrt(nx.sum(Y**2, axis=1, keepdims=True)) + X = X / nx.maximum(X_norm, eps) + Y = Y / nx.maximum(Y_norm, eps) + + # Compute cosine similarity + D = nx.dot(X, Y.T) + + return D + + def _cos_similarity( mat1: Union[np.ndarray, torch.Tensor], mat2: Union[np.ndarray, torch.Tensor], ): nx = ot.backend.get_backend(mat1, mat2) if nx_torch(nx): - torch_cos = torch.nn.CosineSimilarity(dim=1) - mat1_unsqueeze = mat1.unsqueeze(-1) - mat2_unsqueeze = mat2.unsqueeze(-1).transpose(0, 2) - distMat = torch_cos(mat1_unsqueeze, mat2_unsqueeze) * 0.5 + 0.5 + # torch_cos = torch.nn.CosineSimilarity(dim=1) + # mat1_unsqueeze = mat1.unsqueeze(-1) + # mat2_unsqueeze = mat2.unsqueeze(-1).transpose(0, 2) + distMat = -_cosine_distance_backend(mat1, mat2) * 0.5 + 0.5 else: distMat = (-ot.dist(mat1, mat2, metric="cosine") + 1) * 0.5 + 0.5 return distMat From 007d6d29c90f6d5a4f48772c1930a4e8259ac2fa Mon Sep 17 00:00:00 2001 From: YifanLu2000 Date: Sun, 7 Jul 2024 05:07:22 +0000 Subject: [PATCH 08/10] spateo alignment: add sigma2_end --- spateo/alignment/methods/morpho.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spateo/alignment/methods/morpho.py b/spateo/alignment/methods/morpho.py index 3c352cd3..d0c6e691 100755 --- a/spateo/alignment/methods/morpho.py +++ b/spateo/alignment/methods/morpho.py @@ -252,6 +252,7 @@ def BA_align( label_transfer_prior: Optional[dict] = None, beta2: Optional[float] = None, beta2_end: Optional[float] = None, + sigma2_end: Optional[float] = None, ) -> Tuple[Optional[Tuple[AnnData, AnnData]], np.ndarray, np.ndarray]: """core function for spateo pairwise alignment @@ -677,7 +678,7 @@ def BA_align( XnB=coordsB, X_A=X_A, X_B=X_B, - sigma2=sigma2, + sigma2=sigma2 if sigma2_end is None else _data(nx, sigma2_end, type_as), beta2=beta2, alpha=alpha, gamma=gamma, From 6e8b8822e4ad727d61e0f25533d3749584214354 Mon Sep 17 00:00:00 2001 From: "Daniel Y. Zhu" Date: Fri, 12 Jul 2024 13:26:46 -0400 Subject: [PATCH 09/10] Small database update --- spateo/tools/CCI_effects_modeling/MuSIC.py | 33 ++++++---- .../CCI_effects_modeling/MuSIC_downstream.py | 7 +- spateo/tools/cci_two_cluster.py | 64 ++++++++++++++++++- 3 files changed, 91 insertions(+), 13 deletions(-) diff --git a/spateo/tools/CCI_effects_modeling/MuSIC.py b/spateo/tools/CCI_effects_modeling/MuSIC.py index 81b77eb0..8f7e48df 100644 --- a/spateo/tools/CCI_effects_modeling/MuSIC.py +++ b/spateo/tools/CCI_effects_modeling/MuSIC.py @@ -1030,9 +1030,10 @@ def define_sig_inputs(self, adata: Optional[anndata.AnnData] = None, recompute: ligands = [l for l in ligands if l in database_ligands] # Some ligands in the mouse database are not ligands, but internal factors that interact w/ i.e. 
# the hormone receptors: + ligands_test = l_test = [l[0].upper() + l[1:].lower() for l in ligands] ligands = [ l - for l in ligands + for l in ligands_test if l.title() not in [ "Lta4h", @@ -1056,6 +1057,7 @@ def define_sig_inputs(self, adata: Optional[anndata.AnnData] = None, recompute: "Enho", "Ptgr1", "Agrp", + "Pnmt", "Akr1b3", "Daglb", "Ubash3d", @@ -1075,6 +1077,17 @@ def define_sig_inputs(self, adata: Optional[anndata.AnnData] = None, recompute: "Mmp2", "Ttr", "Alb", + "Sult2a1", + "Hsd17b6", + "Cyp11a1", + "Cyp11b1", + "Cyp11b2", + "Cyp17a1", + "Cyp19a1", + "Cyp21a1", + "Cyp27b1", + "Sult1e1", + "Dio3", ] ] l_complexes = [elem for elem in ligands if "_" in elem] @@ -1448,7 +1461,8 @@ def define_sig_inputs(self, adata: Optional[anndata.AnnData] = None, recompute: targets = np.array(targets)[target_expr_percentage > self.target_expr_threshold] # Filter targets to those that can be found in our prior GRN: - targets = [t for t in targets if t in self.grn.index] + if self.mod_type != "niche": + targets = [t for t in targets if t in self.grn.index] self.targets_expr = pd.DataFrame( adata[:, targets].X.A if scipy.sparse.issparse(adata.X) else adata[:, targets].X, @@ -3430,7 +3444,6 @@ def fit( indices = self.subsampled_indices[target] self.x_chunk = np.array(indices) - if self.mod_type != "niche": # To avoid false negative coefficients due to collinearity, feature mask based on the global correlation correlations = [] for idx in range(X.shape[1]): @@ -3830,10 +3843,9 @@ def return_outputs( # receptor models, do not infer expression in cells that do not express the target because it # is unknown whether the ligand/receptor (the half of the interacting pair that is missing) is # present in the neighborhood of these cells: - if self.mod_type in ["receptor", "ligand", "downstream"]: - mask_matrix = (self.adata[betas.index, target].X != 0).toarray().astype(int) - betas *= mask_matrix - standard_errors *= mask_matrix + mask_matrix = (self.adata[betas.index, target].X != 0).toarray().astype(int) + betas *= mask_matrix + standard_errors *= mask_matrix mask_df = (self.X_df.loc[betas.index] != 0).astype(int) mask_df = mask_df.loc[:, [g for g in mask_df.columns if g in feat_sub]] for col in betas.columns: @@ -3868,10 +3880,9 @@ def return_outputs( else: if not load_for_interpreter: # Same processing as for subsampling, but without the subsampling: - if self.mod_type in ["receptor", "ligand", "downstream"]: - mask_matrix = (self.adata[betas.index, target].X != 0).toarray().astype(int) - betas *= mask_matrix - standard_errors *= mask_matrix + mask_matrix = (self.adata[betas.index, target].X != 0).toarray().astype(int) + betas *= mask_matrix + standard_errors *= mask_matrix mask_df = (self.X_df.loc[betas.index] != 0).astype(int) mask_df = mask_df.loc[:, [g for g in mask_df.columns if g in feat_sub]] for col in betas.columns: diff --git a/spateo/tools/CCI_effects_modeling/MuSIC_downstream.py b/spateo/tools/CCI_effects_modeling/MuSIC_downstream.py index dece7b2e..e4cf59fa 100644 --- a/spateo/tools/CCI_effects_modeling/MuSIC_downstream.py +++ b/spateo/tools/CCI_effects_modeling/MuSIC_downstream.py @@ -6907,7 +6907,8 @@ def CCI_deg_detection_setup( all_TFs.extend(custom_tfs) # Also add all TFs that can bind these TFs: - primary_tf_rows = grn.loc[all_TFs] + check_TFs = [tf for tf in all_TFs if tf in grn.index] + primary_tf_rows = grn.loc[check_TFs] secondary_TFs = primary_tf_rows.columns[(primary_tf_rows == 1).any()].tolist() if scipy.sparse.issparse(adata.X): nnz_counts = np.array(adata[:, 
secondary_TFs].X.getnnz(axis=0)).flatten() @@ -7009,6 +7010,10 @@ def intersection_ratio(signal_col, regulator_col): counts_targets.obs[group_key] = cell_types if self.total_counts_key is not None: + if self.total_counts_key not in self.adata.obs.columns: + self.adata.obs[self.total_counts_key] = ( + self.adata.X.sum() if scipy.sparse.issparse(self.adata.X) else self.adata.X.sum(axis=1) + ) counts_targets.obs[self.total_counts_key] = self.adata.obs.loc[ signal[subset_key].index, self.total_counts_key ] diff --git a/spateo/tools/cci_two_cluster.py b/spateo/tools/cci_two_cluster.py index 5e1e3661..30c6a6ac 100644 --- a/spateo/tools/cci_two_cluster.py +++ b/spateo/tools/cci_two_cluster.py @@ -175,8 +175,70 @@ def find_cci_two_group( raise ValueError(f"No intersected receptor between your adata object and lr_network dataset.") lr_network = lr_network[lr_network["to"].isin(expressed_receptor)] + ligands = list(set(lr_network["from"])) + ligands_test = [l[0].upper() + l[1:].lower() for l in ligands] + ligands = [ + l + for l in ligands_test + if l.title() + not in [ + "Lta4h", + "Fdx1", + "Tfrc", + "Trf", + "Lamc1", + "Aldh1a1", + "Aldh1a2", + "Dhcr24", + "Rnaset2a", + "Ptges3", + "Nampt", + "Trf", + "Fdx1", + "Kdr", + "Apoa1", + "Apoa2", + "Apoe", + "Dhcr7", + "Enho", + "Ptgr1", + "Agrp", + "Pnmt", + "Akr1b3", + "Daglb", + "Ubash3d", + "Psap", + "Lck", + "Lipa", + "Alox5", + "Alox5ap", + "Alox12", + "Cbr1", + "Srd5a3", + "Ddc", + "Ggt1", + "Ggt5", + "Srd5a1", + "Tyr", + "Mmp2", + "Ttr", + "Alb", + "Sult2a1", + "Hsd17b6", + "Cyp11a1", + "Cyp11b1", + "Cyp11b2", + "Cyp17a1", + "Cyp19a1", + "Cyp21a1", + "Cyp27b1", + "Sult1e1", + "Dio3", + ] + ] + # ligand_sender_spec - adata_l = adata[:, list(set(lr_network["from"]))] + adata_l = adata[:, ligands] for g in adata.obs[group_sp].unique(): # Of all cells expressing particular ligand, what proportion are group g: frac = (adata_l[adata_l.obs[group_sp] == g].X > 0).sum(axis=0) / (adata_l.X > 0).sum(axis=0) From 2429a0b8bfdb95ee3ad3926a67ec83be19d718f5 Mon Sep 17 00:00:00 2001 From: Purkinje-cell Date: Tue, 16 Jul 2024 18:08:15 +0000 Subject: [PATCH 10/10] fixed some bugs in CCI model --- spateo/tools/CCI_effects_modeling/MuSIC.py | 23 ++++++++-- .../CCI_effects_modeling/MuSIC_downstream.py | 44 ++++++++++++++----- 2 files changed, 52 insertions(+), 15 deletions(-) diff --git a/spateo/tools/CCI_effects_modeling/MuSIC.py b/spateo/tools/CCI_effects_modeling/MuSIC.py index 81b77eb0..55a8a928 100644 --- a/spateo/tools/CCI_effects_modeling/MuSIC.py +++ b/spateo/tools/CCI_effects_modeling/MuSIC.py @@ -293,6 +293,8 @@ def _set_up_model(self, verbose: bool = True): for file in file_list: if not "predictions" in file: check = pd.read_csv(os.path.join(parent_dir, file), index_col=0) + if isinstance(check.index[0], int): + check.index = check.index.astype(str) if any([name not in check.index for name in self.sample_names]): self.map_new_cells() break @@ -635,6 +637,10 @@ def load_and_process(self, upstream: bool = False): self.adata.X.data += 1 else: self.adata.X += 1 + else: + self.adata.layers[ + "original_counts" + ] = self.adata.X.copy() # add this since gaussian model may also need this in downstream model if "downstream" in self.mod_type: # For finding upstream associations with ligand @@ -2738,7 +2744,10 @@ def local_fit( pred_y = np.dot(X[i], betas) residual = y[i] - pred_y - diagnostic = residual + if isinstance(residual, np.ndarray) or isinstance(residual, list): + diagnostic = residual[0] + else: + diagnostic = residual # Reshape coefficients if necessary: 
         betas = betas.flatten()
         # Effect of deleting sample i from the dataset on the estimated predicted value at sample i:
@@ -3447,7 +3456,11 @@ def fit(
             else:
                 # Compute the Pearson correlation coefficient for the subset
                 if len(X_subset) > 1:  # Ensure there are at least 2 data points to compute correlation
-                    correlation = scipy.stats.pearsonr(X_subset, y_subset)[0]
+                    try:
+                        correlation = scipy.stats.pearsonr(X_subset, y_subset)[0]
+                    except ValueError:
+                        self.logger.info("Some values are missing in the data. Skipping.")
+                        correlation = np.nan
                 else:
                     correlation = np.nan  # Not enough data points to compute correlation

@@ -3831,9 +3844,13 @@ def return_outputs(
             # is unknown whether the ligand/receptor (the half of the interacting pair that is missing) is
             # present in the neighborhood of these cells:
             if self.mod_type in ["receptor", "ligand", "downstream"]:
-                mask_matrix = (self.adata[betas.index, target].X != 0).toarray().astype(int)
+                if isinstance(self.adata[betas.index, target].X, np.ndarray):
+                    mask_matrix = (self.adata[betas.index, target].X != 0).astype(int)
+                else:
+                    mask_matrix = (self.adata[betas.index, target].X != 0).toarray().astype(int)
                 betas *= mask_matrix
                 standard_errors *= mask_matrix
+
                 mask_df = (self.X_df.loc[betas.index] != 0).astype(int)
                 mask_df = mask_df.loc[:, [g for g in mask_df.columns if g in feat_sub]]
                 for col in betas.columns:
diff --git a/spateo/tools/CCI_effects_modeling/MuSIC_downstream.py b/spateo/tools/CCI_effects_modeling/MuSIC_downstream.py
index dece7b2e..a36a3bba 100644
--- a/spateo/tools/CCI_effects_modeling/MuSIC_downstream.py
+++ b/spateo/tools/CCI_effects_modeling/MuSIC_downstream.py
@@ -2002,7 +2002,7 @@ def effect_distribution_heatmap(
         else:
             rounding_base = 1000

-        pos = pos.round(-int(pd.np.log10(rounding_base)))
+        pos = pos.round(-int(np.log10(rounding_base)))

         # If position array is numerical, there may not be an exact match- convert the data type to integer:
         if pos.dtype == float:
@@ -2206,10 +2206,14 @@ def effect_distribution_heatmap(
                     consecutive_counts[feature] += 1
                     if consecutive_counts[feature] >= int(window_size * 1.67):
                         feats_of_interest.add(feature)
-            for feature in combinations:
-                if feature not in top_combinations[pos]:
-                    consecutive_counts[feature] = 0
-
+            # I am not sure what this is for. It reset counts for previously seen genes, but consecutive_counts is no longer used after this point.
+            # for feature in combinations:
+            #     if feature not in top_combinations[pos]:
+            #         consecutive_counts[feature] = 0
+
+            feats_of_interest = list(
+                feats_of_interest
+            )  # fix for set subscripting; sets cannot be indexed directly
             to_plot = all_fc_coord_sorted[feats_of_interest]

         if to_plot.index.is_numeric():
             # Minmax scale to normalize positional context:
@@ -3944,7 +3948,10 @@ def cell_type_specific_interactions(
         if to_plot == "mean":
             mean_effects = []

-            coef_target = self.coeffs[target].loc[adata.obs_names]
+            # coef_target = self.coeffs[target].loc[adata.obs_names]
+            coef_target = self.coeffs[target].loc[
+                cell_in_ct.obs_names
+            ]  # This should be cell_in_ct, since the effect is cell type-specific rather than computed over the entire adata object
             coef_target = coef_target[[c for c in coef_target.columns if "intercept" not in c]]

             if effect_threshold is None:
@@ -6901,7 +6908,10 @@ def CCI_deg_detection_setup(
             if scipy.sparse.issparse(adata.X):
                 nnz_counts = np.array(adata[:, all_TFs].X.getnnz(axis=0)).flatten()
             else:
-                nnz_counts = np.array(adata[:, all_TFs].X.getnnz(axis=0)).flatten()
+                # nnz_counts = np.array(adata[:, all_TFs].X.getnnz(axis=0)).flatten()
+                nnz_counts = np.count_nonzero(
+                    adata[:, all_TFs].X, axis=0
+                ).flatten()  # np.ndarray has no getnnz attribute
             all_TFs = [tf for tf, nnz in zip(all_TFs, nnz_counts) if nnz >= n_cells_threshold]
             if custom_tfs is not None:
                 all_TFs.extend(custom_tfs)
@@ -6912,7 +6922,7 @@ def CCI_deg_detection_setup(
             if scipy.sparse.issparse(adata.X):
                 nnz_counts = np.array(adata[:, secondary_TFs].X.getnnz(axis=0)).flatten()
             else:
-                nnz_counts = np.array(adata[:, secondary_TFs].X.getnnz(axis=0)).flatten()
+                nnz_counts = np.count_nonzero(adata[:, secondary_TFs].X, axis=0).flatten()
             secondary_TFs = [
                 tf for tf, nnz in zip(secondary_TFs, nnz_counts) if nnz >= int(0.5 * n_cells_threshold)
             ]
@@ -6920,9 +6930,14 @@ def CCI_deg_detection_setup(
         regulator_features = all_TFs + secondary_TFs

         # Prioritize those that are most coexpressed with at least one target:
-        regulator_expr = pd.DataFrame(
-            adata[:, regulator_features].X.A, index=signal[subset_key].index, columns=regulator_features
-        )
+        if scipy.sparse.issparse(adata.X):
+            regulator_expr = pd.DataFrame(
+                adata[:, regulator_features].X.A, index=signal[subset_key].index, columns=regulator_features
+            )
+        elif isinstance(adata.X, np.ndarray):  # adata.X might be an np.ndarray
+            regulator_expr = pd.DataFrame(
+                adata[:, regulator_features].X, index=signal[subset_key].index, columns=regulator_features
+            )

         # Dataframe containing target expression:
         ds_targets_df = signal[subset_key]
@@ -6963,7 +6978,10 @@ def intersection_ratio(signal_col, regulator_col):
         counts = adata[:, regulator_features].copy()

         # Convert to dataframe:
-        counts_df = pd.DataFrame(counts.X.toarray(), index=counts.obs_names, columns=counts.var_names)
+        if scipy.sparse.issparse(adata.X):
+            counts_df = pd.DataFrame(counts.X.toarray(), index=counts.obs_names, columns=counts.var_names)
+        elif isinstance(counts.X, np.ndarray):
+            counts_df = pd.DataFrame(counts.X, index=counts.obs_names, columns=counts.var_names)
         # combined_df = pd.concat([counts_df, signal[subset_key]], axis=1)

         # Store the targets (ligands/receptors) to AnnData object, save to file path:
@@ -7129,6 +7147,7 @@ def CCI_deg_detection(
         kwargs["bw_fixed"] = True
         kwargs["total_counts_threshold"] = self.total_counts_threshold
         kwargs["total_counts_key"] = self.total_counts_key
+        kwargs["distr"] = self.distr

         # Use the same output directory as the main model, add folder demarcating results from downstream task:
output_dir = os.path.dirname(self.output_path) @@ -7613,6 +7632,7 @@ def deg_effect_heatmap( all_cells_affected = dm[dm[f"regulator_{interaction}"] > 0] else: all_cells_affected = dm[dm[interaction] > 0] + specificity = (all_coeffs_target.loc[all_cells_affected.index, interaction] != 0).mean() all_plot_values.loc[interaction, target] = specificity all_plot_values.index = [replace_col_with_collagens(f) for f in all_plot_values.index]
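The options introduced across this series can be exercised together roughly as follows. This is a sketch, not code from the patch: the AnnData objects, cluster names, and numeric values are placeholders, and only arguments added or extended by these commits are spelled out.

    from spateo.alignment.methods.morpho import BA_align

    # Hypothetical cluster-to-cluster prior for label-aware alignment.
    label_prior = {
        "TypeA": {"TypeA": 1.0, "TypeB": 0.1},
        "TypeB": {"TypeA": 0.1, "TypeB": 1.0},
    }

    result = BA_align(
        sampleA=slice_ref,             # placeholder AnnData objects
        sampleB=slice_query,
        dissimilarity="cos",           # new cosine expression dissimilarity
        use_rep="X_pca",               # align on a precomputed .obsm embedding instead of layer X
        use_label_prior=True,          # weight the assignment matrix P by the prior defined above
        label_key="cluster",
        label_transfer_prior=label_prior,
        beta2=0.01,                    # manual start/end of the expression kernel-width schedule
        beta2_end=0.1,
        sigma2_end=0.001,              # spatial variance used for the final full P update
    )
    # Per the annotated return type, `result` is
    # Tuple[Optional[Tuple[AnnData, AnnData]], np.ndarray, np.ndarray].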