Skip to content

Commit

Permalink
Merge pull request #37 from maxozo/main
Browse files Browse the repository at this point in the history
Some modifications made to Hadge that were esential for running large scale processing of Onek1k data
  • Loading branch information
wxicu authored Feb 8, 2024
2 parents 2ec7166 + 919079c commit a9286cb
Show file tree
Hide file tree
Showing 72 changed files with 2,887 additions and 3,835 deletions.
13 changes: 13 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
fail_fast: false
default_language_version:
python: python3
default_stages:
- commit
- push
minimum_pre_commit_version: 2.16.0
repos:
- repo: https://github.com/pre-commit/mirrors-prettier
rev: v2.7.0
hooks:
- id: prettier
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.2.0
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix, --unsafe-fixes]
- id: ruff-format
12 changes: 11 additions & 1 deletion bin/bff.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,21 @@ library(DropletUtils)
library(Seurat)
library(ggplot2)
library(cowplot)
if(!require("cellhashR")){
devtools::install_github(repo = 'bimberlab/cellhashR', ref = 'master', dependencies = TRUE, upgrade = 'always')
library("cellhashR")
}

library(cellhashR)
library(here)
library(dplyr)
library(argparse)
library(tidyverse)

if(!require("tidyverse")){
install.packages("tidyverse")
library("tidyverse")
}


# Create a parser
parser <- ArgumentParser("Parameters for BFF")
Expand Down
165 changes: 126 additions & 39 deletions bin/demuxem.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,45 +7,110 @@
import pandas as pd
import pegasusio as io

parser = argparse.ArgumentParser(description='Parser for DemuxEM - Demultiplexing')
parser.add_argument('--rna_matrix_dir', help= 'cellranger output folder which contains raw RNA count matrix in mtx format.')
parser.add_argument('--hto_matrix_dir', help= 'cellranger output folder which contains raw HTO (antibody tag) count matrix in mtx format.')
parser.add_argument('--randomState', help='Random seed set for reproducing results.', type=int, default=0)
parser.add_argument('--min_signal', help='Any cell/nucleus with less than min_signal hashtags from the signal will be marked as unknown.', type=float, default=10.0)
parser.add_argument('--min_num_genes', help='We only demultiplex cells/nuclei with at least <number> of expressed genes.', type=int, default=100)
parser.add_argument('--min_num_umis', help='We only demultiplex cells/nuclei with at least <number> of UMIs.', type=int, default=100)
parser.add_argument('--alpha', help='The Dirichlet prior concentration parameter (alpha) on samples. An alpha value < 1.0 will make the prior sparse.', type=float, default=0.0)
parser.add_argument('--alpha_noise', help='The Dirichlet prior concenration parameter on the background noise.', type=float, default=1.0)
parser.add_argument('--tol', help='Threshold used for the EM convergence.', type=float, default=1e-6)
parser.add_argument('--n_threads', help='Number of threads to use. Must be a positive integer.', type=int, default=1)
parser.add_argument('--filter_demuxem', help='Use the filter for RNA, True or False', default='True')
parser.add_argument('--generateGenderPlot', help='Generate violin plots using gender-specific genes (e.g. Xist). <gene> is a comma-separated list of gene names.', default='')
parser.add_argument('--objectOutDemuxem', help='Output name of demultiplexing results. All outputs will use it as the prefix.', default="demuxem_res")
parser.add_argument('--outputdir', help='Output directory')
parser = argparse.ArgumentParser(description="Parser for DemuxEM - Demultiplexing")
parser.add_argument(
"--rna_matrix_dir",
help="cellranger output folder which contains raw RNA count matrix in mtx format.",
)
parser.add_argument(
"--hto_matrix_dir",
help="cellranger output folder which contains raw HTO (antibody tag) count matrix in mtx format.",
)
parser.add_argument(
"--randomState",
help="Random seed set for reproducing results.",
type=int,
default=0,
)
parser.add_argument(
"--min_signal",
help="Any cell/nucleus with less than min_signal hashtags from the signal will be marked as unknown.",
type=float,
default=10.0,
)
parser.add_argument(
"--min_num_genes",
help="We only demultiplex cells/nuclei with at least <number> of expressed genes.",
type=int,
default=100,
)
parser.add_argument(
"--min_num_umis",
help="We only demultiplex cells/nuclei with at least <number> of UMIs.",
type=int,
default=100,
)
parser.add_argument(
"--alpha",
help="The Dirichlet prior concentration parameter (alpha) on samples. An alpha value < 1.0 will make the prior sparse.",
type=float,
default=0.0,
)
parser.add_argument(
"--alpha_noise",
help="The Dirichlet prior concenration parameter on the background noise.",
type=float,
default=1.0,
)
parser.add_argument(
"--tol", help="Threshold used for the EM convergence.", type=float, default=1e-6
)
parser.add_argument(
"--n_threads",
help="Number of threads to use. Must be a positive integer.",
type=int,
default=1,
)
parser.add_argument(
"--filter_demuxem", help="Use the filter for RNA, True or False", default="True"
)
parser.add_argument(
"--generateGenderPlot",
help="Generate violin plots using gender-specific genes (e.g. Xist). <gene> is a comma-separated list of gene names.",
default="",
)
parser.add_argument(
"--objectOutDemuxem",
help="Output name of demultiplexing results. All outputs will use it as the prefix.",
default="demuxem_res",
)
parser.add_argument("--outputdir", help="Output directory")

args = parser.parse_args()
param_list = [['rna_matrix_dir', args.rna_matrix_dir], ['hto_matrix_dir', args.hto_matrix_dir], ['randomState', args.randomState], ['min_signal', args.min_signal], ['min_num_genes', args.min_num_genes], ['min_num_umis', args.min_num_umis], ['alpha', args.alpha], ['alpha_noise', args.alpha_noise], ['tol', args.tol], ['n_threads', args.n_threads], ['generateGenderPlot', args.generateGenderPlot]]

param_df = pd.DataFrame(param_list, columns=['Argument', 'Value'])
param_list = [
["rna_matrix_dir", args.rna_matrix_dir],
["hto_matrix_dir", args.hto_matrix_dir],
["randomState", args.randomState],
["min_signal", args.min_signal],
["min_num_genes", args.min_num_genes],
["min_num_umis", args.min_num_umis],
["alpha", args.alpha],
["alpha_noise", args.alpha_noise],
["tol", args.tol],
["n_threads", args.n_threads],
["generateGenderPlot", args.generateGenderPlot],
]

if __name__ == '__main__':
param_df = pd.DataFrame(param_list, columns=["Argument", "Value"])

if __name__ == "__main__":
output_name = args.outputdir + "/" + args.objectOutDemuxem
# load input rna data
rna_data = sc.read_10x_mtx(args.rna_matrix_dir)
hashing_data = sc.read_10x_mtx(args.hto_matrix_dir,gex_only=False)
hashing_data = sc.read_10x_mtx(args.hto_matrix_dir, gex_only=False)
rna = args.rna_matrix_dir
filter = ""
if args.filter_demuxem.lower() in ['true', 't', 'yes', 'y', '1']:
if args.filter_demuxem.lower() in ["true", "t", "yes", "y", "1"]:
filter = True
elif args.filter_demuxem.lower() in ['false', 'f', 'no', 'n', '0']:
elif args.filter_demuxem.lower() in ["false", "f", "no", "n", "0"]:
filter = False
else:
raise ValueError("Invalid boolean value: {}".format(value))
raise ValueError(f"Invalid boolean value: {args.filter_demuxem.lower()}")
# Filter the RNA matrix
rna_data.obs["n_genes"] = rna_data.X.getnnz(axis=1)
rna_data.obs["n_counts"] = rna_data.X.sum(axis=1).A1
#data.obs["n_counts"] = rna_data.X.sum(axis=1).A1
if(filter):
# data.obs["n_counts"] = rna_data.X.sum(axis=1).A1
if filter:
print("Filtering RNA matrix")
obs_index = np.logical_and.reduce(
(
Expand All @@ -56,25 +121,42 @@
rna_data._inplace_subset_obs(obs_index)
# run demuxEM
demuxEM.estimate_background_probs(hashing_data, random_state=args.randomState)
demuxEM.demultiplex(rna_data, hashing_data, min_signal=args.min_signal, alpha=args.alpha, alpha_noise=args.alpha_noise, tol=args.tol, n_threads=args.n_threads)
demuxEM.demultiplex(
rna_data,
hashing_data,
min_signal=args.min_signal,
alpha=args.alpha,
alpha_noise=args.alpha_noise,
tol=args.tol,
n_threads=args.n_threads,
)
# annotate raw matrix with demuxEM results
demux_results = demuxEM.attach_demux_results(args.rna_matrix_dir, rna_data)
# generate plots
demuxEM.plot_hto_hist(hashing_data, "hto_type", output_name + ".ambient_hashtag.hist.pdf", alpha=1.0)
demuxEM.plot_bar(hashing_data.uns["background_probs"], hashing_data.var_names, "Sample ID",
"Background probability", output_name + ".background_probabilities.bar.pdf",)
demuxEM.plot_hto_hist(hashing_data, "rna_type", output_name + ".real_content.hist.pdf", alpha=0.5)
demuxEM.plot_hto_hist(
hashing_data, "hto_type", output_name + ".ambient_hashtag.hist.pdf", alpha=1.0
)
demuxEM.plot_bar(
hashing_data.uns["background_probs"],
hashing_data.var_names,
"Sample ID",
"Background probability",
output_name + ".background_probabilities.bar.pdf",
)
demuxEM.plot_hto_hist(
hashing_data, "rna_type", output_name + ".real_content.hist.pdf", alpha=0.5
)
demuxEM.plot_rna_hist(rna_data, hashing_data, output_name + ".rna_demux.hist.pdf")

if len(args.generateGenderPlot) > 0:
rna_data.matrices["raw.X"] = rna_data.X.copy()
rna_data.as_float()
scale = 1e5 / rna_data.X.sum(axis=1).A1
rna_data.X.data *= np.repeat(scale, np.diff(data.X.indptr))
rna_data.X.data *= np.repeat(scale, np.diff(rna_data.X.indptr))
rna_data.X.data = np.log1p(rna_data.X.data)

for gene_name in args.generateGenderPlot:
plot_gene_violin(
pg.violin(
rna_data,
gene_name,
"{output_name}.{gene_name}.violin.pdf".format(
Expand All @@ -91,16 +173,21 @@
print("total\t{}".format(rna_data.shape[0]))
for name, value in rna_data.obs["demux_type"].value_counts().items():
print("{}\t{}".format(name, value))
summary = rna_data.obs["demux_type"].value_counts().rename_axis('classification').reset_index(name='counts')
summary = (
rna_data.obs["demux_type"]
.value_counts()
.rename_axis("classification")
.reset_index(name="counts")
)
total = ["total", rna_data.shape[0]]
summary.loc[len(summary)] = total
summary.to_csv(output_name + "_summary.csv", index=False)
param_df.fillna("None",inplace=True)
param_df.fillna("None", inplace=True)
param_df.to_csv(args.outputdir + "/params.csv", index=False)
rna_data.obs.assignment.replace(np.nan,'negative', inplace=True)

rna_data.obs.assignment.replace(np.nan, "negative", inplace=True)
hashtags = hashing_data.var.index.tolist()
hashtags = hashtags + ["negative"]
toreplace = [ht for ht in rna_data.obs['assignment'].unique() if ht not in hashtags]
rna_data.obs.assignment.replace(toreplace,'doublet', inplace=True)
toreplace = [ht for ht in rna_data.obs["assignment"].unique() if ht not in hashtags]
rna_data.obs.assignment.replace(toreplace, "doublet", inplace=True)
rna_data.obs.to_csv(output_name + "_obs.csv")
Loading

0 comments on commit a9286cb

Please sign in to comment.