Commit 98822f5

add log info

SarahOuologuem committed Apr 22, 2024
1 parent 9cac95c
Showing 5 changed files with 89 additions and 65 deletions.
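The scripts in this commit log through a module-level logger named L. Its actual configuration lives elsewhere in panpipes; a minimal sketch of the kind of setup these L.info/L.warning calls assume (handler and format are illustrative assumptions, not the project's real code):

```python
# Hypothetical logger setup; panpipes' actual configuration may differ.
import logging
import sys

L = logging.getLogger()
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
L.addHandler(handler)
L.setLevel(logging.INFO)
```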
28 changes: 17 additions & 11 deletions panpipes/python_scripts/aggregate_cellranger_summary_metrics.py
@@ -40,7 +40,7 @@
parser.set_defaults(verbose=True)
args = parser.parse_args()

- L.info(args)
+ L.info("Running with params: %s", args)

def check_path(path):
if os.path.isfile(path):
@@ -60,7 +60,7 @@ def get_metrics_summary_path(path,sample_id=None):
"""
# subset path to only go up to 'outs'
if 'outs' not in path:
print("you are parsing a cellranger output but your path to raw data doesn't end wth the outs folder")
L.warning("You are parsing a cellranger output but your path to raw data doesn't end with the outs folder")
path = check_path(path)
else:
path = path.split("outs")[0] + "outs"
@@ -73,10 +73,10 @@ def get_metrics_summary_path(path,sample_id=None):
elif sample_id is not None and os.path.exists(os.path.join(path, 'per_sample_outs', sample_id, 'metrics_summary.csv') ):
outpath = os.path.join(path, 'per_sample_outs', sample_id, 'metrics_summary.csv')
elif sample_id is None and os.path.exists(os.path.join(path, 'per_sample_outs')):
- print('input folder appears to be from cellranger multi but no sample_id is given')
+ L.warning('Input folder appears to be from cellranger multi but no sample_id is given')
else:
# use the alternative path from cellranger_multi outputs
- print('path not found')
+ L.warning('Path not found')
return outpath


@@ -178,37 +178,37 @@ def parse_10x_cellranger_count(path_df, convert_df, path_col='metrics_summary_p
return msums


L.info("Reading in tsv file '%s'" % args.cellranger_column_conversion_df)
convert_df = pd.read_csv(args.cellranger_column_conversion_df, sep='\t')

# get the paths
pipe_df = pd.read_csv(args.pipe_df, sep='\t')
- L.info('finding all the cellranger paths')
+ L.info('Searching for all cellranger paths')
all_paths_df = get_all_unique_paths(pipe_df)
all_paths_df['metrics_summary_path'] = all_paths_df.apply(lambda x: get_metrics_summary_path(path=x.path, sample_id=x.sample_id), axis=1)
# all_paths_df_uniq = all_paths_df.drop(columns=['path', 'path_type']).drop_duplicates().reset_index(drop=True)
all_paths_df['cellranger_type'] = [detect_cellranger_algorithm(x) for x in all_paths_df['metrics_summary_path']]
- L.info('cellranger metrics_summary files found')
- L.info(all_paths_df)
+ L.info("Cellranger metrics_summary files found: %s" % all_paths_df)

# parse and concatenate the 10x metrics summary files into long format
L.info("Parsing and concatenating the 10X metrics summary files into long format")
tenx_metrics = []
if any(all_paths_df['cellranger_type']=='multi'):
tenx_metrics.append(parse_10x_cellranger_multi(all_paths_df[all_paths_df['cellranger_type']=='multi']))

if any(all_paths_df['cellranger_type']=='count'):
tenx_metrics.append(parse_10x_cellranger_count(all_paths_df[all_paths_df['cellranger_type']=='count'], convert_df))



tenx_metrics_full = pd.concat(tenx_metrics)
tenx_metrics_full = tenx_metrics_full.sort_values(['library_type', 'metric_name'])
L.info("Done. Saving to '%s'" % args.output_file)
tenx_metrics_full.to_csv(args.output_file, index=False)
- L.info('done')


# tenx_metrics['metric_value'] = [re.sub(",|%", "", x) for x in tenx_metrics['metric_value']]
# tenx_metrics['metric_value'] = tenx_metrics['metric_value'].astype(float)
# split by library_type and metric_name
L.info("Plotting metrics for each library_type and metric_name")
for idx, row in tenx_metrics_full[['library_type','metric_name']].drop_duplicates().iterrows():
mn = row['metric_name']
lt = row['library_type']
@@ -217,12 +217,14 @@ def parse_10x_cellranger_count(path_df, convert_df, path_col='metrics_summary_p
# if multiple categories i.e. Cells and Libraries, these are likely duplicate rows
plt_df = plt_df[['sample_id', 'metric_name', 'library_type' , 'metric_value']].drop_duplicates()
if len(plt_df['sample_id']) == len(plt_df['sample_id'].unique()):
L.info("Plotting barplot for library_type %s and metric_name %s" % (lt, mn))
fig = sns.barplot(plt_df, x='sample_id', y='metric_value', color='grey')
fig.set_xticklabels(fig.get_xticklabels(), rotation=90)
fig.set_title(lt + ':' + mn)
plt.savefig(os.path.join(args.figdir, lt + '-' + mn + '.png'), bbox_inches='tight')
else:
# do a boxplot instead
L.info("Plotting boxplot for library_type %s and metric_name %s" % (lt, mn))
fig = sns.boxplot(plt_df, x='sample_id', y='metric_value', color='grey')
fig.set_xticklabels(fig.get_xticklabels(), rotation=90)
fig.set_title(lt + ':' + mn)
@@ -244,6 +246,7 @@ def parse_10x_cellranger_count(path_df, convert_df, path_col='metrics_summary_p
# y='Number of reads', size='Mean reads per cell',sizes=(20, 200))

if 'Sequencing saturation' in plt_df.metric_name.unique():
L.info("Plotting scatter plot of sequencing saturation against number of reads")
f, ax = plt.subplots()
points = ax.scatter(x=plt_tab['Sequencing saturation'],
y=plt_tab['Number of reads'],
@@ -263,6 +266,7 @@ def parse_10x_cellranger_count(path_df, convert_df, path_col='metrics_summary_p

# plot 2 - cells vs UMI counts per cell

L.info("Plotting scatter plot of estimated number of cells against median UMI counts per cell")
fig, ax = plt.subplots()
x=list(np.log10(plt_tab['Estimated number of cells']))
y=list(np.log10(plt_tab['Median UMI counts per cell']))
@@ -289,4 +293,6 @@ def parse_10x_cellranger_count(path_df, convert_df, path_col='metrics_summary_p
plt.grid(axis='both', color='0.95')
# save
plt.savefig(os.path.join(args.figdir,'10x_cells_by_UMIs.png'))
- plt.clf()
+ plt.clf()
+
+ L.info("Done")
25 changes: 14 additions & 11 deletions panpipes/python_scripts/concat_adata.py
@@ -62,11 +62,11 @@

parser.set_defaults(verbose=True)
args, opt = parser.parse_known_args()
L.info("running with:")
L.info(args)
L.info("Running with params: %s", args)

if len(args.input_files_str) == 0:
sys.exit("no input files detected")
L.error("No input files detected")
sys.exit("No input files detected")

lf = re.split(',', args.input_files_str)

@@ -75,6 +75,7 @@
# lf = glob.glob("./tmp/*raw.h5mu")
# sfile= "CARTC_bonemarrow_samples.tsv"

L.info("Reading in submission file from '%s'" % sfile)
caf = pd.read_csv(sfile, sep="\t")
# the modality argument is relevant only if use_muon is True.
# mdatas = [read_anndata(i, use_muon, modality="all") for i in lf]
@@ -95,6 +96,7 @@
del temp
elif 'prot' in temp.mod.keys() or 'rna' in temp.mod.keys():
## IF RNA and PROT is ok to concatenate ----------
L.info("Concatenating RNA and prot")
mdata = concat_mdatas(mdatas,
batch_key="sample_id",
join_type=args.join_type)
@@ -106,7 +108,7 @@
L.debug(mdata.obs.columns)
L.debug(mdata.obs.head())

L.info("add metadata")
L.info("Adding metadata")
# add metadata to each object
metadatacols = args.metadatacols
# metadatacols="mdata_cols"
@@ -148,7 +150,7 @@
# which column is missing?
missingcols= " ".join([x for x in metadatacols if x not in caf.columns])
L.error("Required columns missing form samples file: %s" % missingcols)
sys.exit("Exiting because required columns not in samples file, check the log file for details")
sys.exit("Required columns missing form samples file: %s" % missingcols)


# if 'rep' in mdata.mod.keys():
@@ -158,7 +160,7 @@

# add in cell level metadata
if args.barcode_mtd_df is not None :
L.info("Add barcode level metadata")
L.info("Adding barcode level metadata")
# check demult_metadatacols exists and contains antibody
barcode_metadatacols = args.barcode_mtd_metadatacols.split(',')
# load the demultiplexing data
@@ -181,12 +183,13 @@
# update the protein variable to add in extra info like isotype and alternate name for prot
if args.protein_var_table is not None:
try:
L.info("Reading in protein var table from '%s'" % args.protein_var_table)
df = pd.read_csv(args.protein_var_table, sep='\t', index_col=0)
L.info("merging protein table with var")
L.info("Merging protein table with var")
# add_var_mtd(mdata['prot'], df)
var_df = mdata['prot'].var.merge(df, left_index=True, right_index=True)
if args.protein_new_index_col is not None:
L.info("updating prot.var index")
L.info("Updating prot.var index")
# update_var_index(mdata['prot'], args.protein_new_index_col)
var_df =var_df.reset_index().set_index(args.protein_new_index_col)
var_df = var_df.rename(columns={'index':'orig_id'})
@@ -202,7 +205,7 @@
# subset old modality to remove hashing
mdata.mod['prot'] = mdata["prot"][:, ~mdata["prot"].var["hashing_ab"]]
except FileNotFoundError:
warnings.warn("protein metadata table not found")
warnings.warn("Protein metadata table not found")
mdata.update()
# tidy up metadata
# move sample_id to the front
@@ -211,8 +214,8 @@
# mdata.obs = mdata.obs.reindex(columns=cols)
L.debug(mdata.obs.dtypes)

L.info("writing to file {}".format(str(args.output_file)))
L.info("Writing to file '%s'" % args.output_file)

mdata.write(args.output_file)

L.info("done")
L.info("Done")
28 changes: 19 additions & 9 deletions panpipes/python_scripts/make_adata_from_csv.py
@@ -80,7 +80,7 @@

parser.set_defaults(verbose=True)
args, opt = parser.parse_known_args()
- L.info(args)
+ L.info("Running with params: %s", args)
# Make mdata object

# unimodal mu (check if all the modalities)
@@ -99,40 +99,50 @@
all_files = {nm: x for (nm, x) in all_files.items() if nm in permf}
# this will work for any combination of modalities or number of modalities in all_files.
[check_filetype(x[0], x[1]) for x in all_files.values()]
L.info("Creating MuData")
mdata = load_mdata_from_multiple_files(all_files)

# now let's do some extra processing on the different modalities
L.info("Preprocessing each modality")
if 'rna' in mdata.mod.keys():
L.info("Modality: RNA")
L.info("Making var names unique")
mdata['rna'].var_names_make_unique()
# keep only rna with at least 1 count (relevant when loading raw data)
mdata['rna'].obs['total_counts'] = mdata['rna'].X.sum(axis=1)
L.info("Subsetting on cells with total_counts > 1.")
mu.pp.filter_obs(mdata['rna'], 'total_counts', lambda x: (x > 1))
# drop this column as we don't need it anymore, and it will confuse things downstream
mdata['rna'].obs = mdata['rna'].obs.drop(columns="total_counts")
mdata.update()

if 'prot' in mdata.mod.keys():
L.info("Modality: prot")
if 'rna' in mdata.mod.keys():
if check_for_bool(args.subset_prot_barcodes_to_rna):
L.info("Intersecting observations of modalities RNA and prot")
intersect_obs_by_mod(mdata, ['rna', 'prot'])
mdata.update()
L.info(mdata['prot'].var.head())

if 'atac' in mdata.mod.keys():
L.info("Modality: ATAC")
L.info("Making var names unique")
mdata['atac'].var_names_make_unique()
if args.per_barcode_metrics_file is not None:
L.info("Returning mudata with metadata from barcode metrics csv")
L.info("Merging in metadata from barcode metrics csv '%s'" % args.per_barcode_metrics_file)
per_barcode_metrics_file = pd.read_csv(args.per_barcode_metrics_file, index_col=0) #it's a csv by default and goes in atac obs
mdata.mod['atac'].obs = mdata.mod['atac'].obs.merge(per_barcode_metrics_file, left_index=True, right_index=True)
if args.peak_annotation_file is not None:
L.info("Returning mudata with peak annotation in mdata['atac'].uns['atac']['peak_annotation']")
L.info("Saving peak annotation from '%s' to mdata['atac'].uns['atac']['peak_annotation']" % args.peak_annotation_file)
add_peak_annotation(mdata, annotation=args.peak_annotation_file, sep="\t", return_annotation= False)
if args.fragments_file is not None:
L.info("Returning mudata with fragments file in mdata['atac'].uns['files']['fragments']")
L.info("Saving fragments file from '%s' to mdata['atac'].uns['files']['fragments']" % args.fragments_file)
locate_fragments(mdata,fragments=args.fragments_file, return_fragments= False)


if 'rep' in mdata.mod.keys():
L.info("Modality: rep")
L.info("Saving count metadata in .obs as numeric")
col_update = mdata['rep'].obs.columns[mdata['rep'].obs.columns.str.contains("count")]
mdata['rep'].obs[col_update] = mdata['rep'].obs[col_update].apply(pd.to_numeric)
mdata.update()
@@ -142,21 +152,21 @@
for mm in mdata.mod.keys():
mdata[mm].var_names_make_unique()

L.info("Saving sample id to mdata.obs['sample_id']")
mdata.obs['sample_id'] = str(args.sample_id)

# copy the sample_id to each modality
+ print("Saving sample_id to each modality")
for mm in mdata.mod.keys():
- print("saving sample_id to each modality")
# mdata[mm].obs['sample_id'] = mdata.obs['sample_id']
mdata[mm].obs['sample_id'] = mdata.obs.loc[mdata[mm].obs_names,:]['sample_id']

mdata.update()


L.info("saving data")
L.debug(mdata)
L.info("Saving MuData to '%s'" % args.output_file)
# this will write the file as anndata or muon depending on the class of mdata.
mdata.write(args.output_file)

L.info("done")
L.info("Done")
