Commit 98822f5

add log info

SarahOuologuem committed Apr 22, 2024
1 parent 9cac95c
Showing 5 changed files with 89 additions and 65 deletions.
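The scripts in this commit log through a module-level logger named L. Its actual configuration lives elsewhere in panpipes; a minimal sketch of the kind of setup these L.info/L.warning calls assume (handler and format are illustrative assumptions, not the project's real code):

```python
# Hypothetical logger setup; panpipes' actual configuration may differ.
import logging
import sys

L = logging.getLogger()
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
L.addHandler(handler)
L.setLevel(logging.INFO)
```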
28 changes: 17 additions & 11 deletions panpipes/python_scripts/aggregate_cellranger_summary_metrics.py
@@ -40,7 +40,7 @@
parser.set_defaults(verbose=True)
args = parser.parse_args()

- L.info(args)
+ L.info("Running with params: %s", args)

def check_path(path):
if os.path.isfile(path):
@@ -60,7 +60,7 @@ def get_metrics_summary_path(path,sample_id=None):
"""
# subset path to only go up to 'outs'
if 'outs' not in path:
print("you are parsing a cellranger output but your path to raw data doesn't end wth the outs folder")
L.warning("You are parsing a cellranger output but your path to raw data doesn't end with the outs folder")
path = check_path(path)
else:
path = path.split("outs")[0] + "outs"
@@ -73,10 +73,10 @@ def get_metrics_summary_path(path,sample_id=None):
elif sample_id is not None and os.path.exists(os.path.join(path, 'per_sample_outs', sample_id, 'metrics_summary.csv') ):
outpath = os.path.join(path, 'per_sample_outs', sample_id, 'metrics_summary.csv')
elif sample_id is None and os.path.exists(os.path.join(path, 'per_sample_outs')):
- print('input folder appears to be from cellranger multi but no sample_id is given')
+ L.warning('Input folder appears to be from cellranger multi but no sample_id is given')
else:
# use the alternative path from cellranger_multi outputs
- print('path not found')
+ L.warning('Path not found')
return outpath


@@ -178,37 +178,37 @@ def parse_10x_cellranger_count(path_df, convert_df, path_col='metrics_summary_p
return msums


L.info("Reading in tsv file '%s'" % args.cellranger_column_conversion_df)
convert_df = pd.read_csv(args.cellranger_column_conversion_df, sep='\t')

# get the paths
pipe_df = pd.read_csv(args.pipe_df, sep='\t')
- L.info('finding all the cellranger paths')
+ L.info('Searching for all cellranger paths')
all_paths_df = get_all_unique_paths(pipe_df)
all_paths_df['metrics_summary_path'] = all_paths_df.apply(lambda x: get_metrics_summary_path(path=x.path, sample_id=x.sample_id), axis=1)
# all_paths_df_uniq = all_paths_df.drop(columns=['path', 'path_type']).drop_duplicates().reset_index(drop=True)
all_paths_df['cellranger_type'] = [detect_cellranger_algorithm(x) for x in all_paths_df['metrics_summary_path']]
- L.info('cellranger metrics_summary files found')
- L.info(all_paths_df)
+ L.info("Cellranger metrics_summary files found: %s" % all_paths_df)

# parse and concatenate the 10x metrics summary files into long format
L.info("Parsing and concatenating the 10X metrics summary files into long format")
tenx_metrics = []
if any(all_paths_df['cellranger_type']=='multi'):
tenx_metrics.append(parse_10x_cellranger_multi(all_paths_df[all_paths_df['cellranger_type']=='multi']))

if any(all_paths_df['cellranger_type']=='count'):
tenx_metrics.append(parse_10x_cellranger_count(all_paths_df[all_paths_df['cellranger_type']=='count'], convert_df))



tenx_metrics_full = pd.concat(tenx_metrics)
tenx_metrics_full = tenx_metrics_full.sort_values(['library_type', 'metric_name'])
L.info("Done. Saving to '%s'" % args.output_file)
tenx_metrics_full.to_csv(args.output_file, index=False)
- L.info('done')


# tenx_metrics['metric_value'] = [re.sub(",|%", "", x) for x in tenx_metrics['metric_value']]
# tenx_metrics['metric_value'] = tenx_metrics['metric_value'].astype(float)
# split by library_type and metric_name
L.info("Plotting metrics for each library_type and metric_name")
for idx, row in tenx_metrics_full[['library_type','metric_name']].drop_duplicates().iterrows():
mn = row['metric_name']
lt = row['library_type']
@@ -217,12 +217,14 @@ def parse_10x_cellranger_count(path_df, convert_df, path_col='metrics_summary_p
# if multiple categories i.e. Cells and Libraries, these are likely duplicate rows
plt_df = plt_df[['sample_id', 'metric_name', 'library_type' , 'metric_value']].drop_duplicates()
if len(plt_df['sample_id']) == len(plt_df['sample_id'].unique()):
L.info("Plotting barplot for library_type %s and metric_name %s" % (lt, mn))
fig = sns.barplot(plt_df, x='sample_id', y='metric_value', color='grey')
fig.set_xticklabels(fig.get_xticklabels(), rotation=90)
fig.set_title(lt + ':' + mn)
plt.savefig(os.path.join(args.figdir, lt + '-' + mn + '.png'), bbox_inches='tight')
else:
# do a boxplot instead
L.info("Plotting boxplot for library_type %s and metric_name %s" % (lt, mn))
fig = sns.boxplot(plt_df, x='sample_id', y='metric_value', color='grey')
fig.set_xticklabels(fig.get_xticklabels(), rotation=90)
fig.set_title(lt + ':' + mn)
@@ -244,6 +246,7 @@ def parse_10x_cellranger_count(path_df, convert_df, path_col='metrics_summary_p
# y='Number of reads', size='Mean reads per cell',sizes=(20, 200))

if 'Sequencing saturation' in plt_df.metric_name.unique():
L.info("Plotting scatter plot of sequencing saturation against number of reads")
f, ax = plt.subplots()
points = ax.scatter(x=plt_tab['Sequencing saturation'],
y=plt_tab['Number of reads'],
@@ -263,6 +266,7 @@ def parse_10x_cellranger_count(path_df, convert_df, path_col='metrics_summary_p

# plot 2 - cells vs UMI counts per cell

L.info("Plotting scatter plot of estimated number of cells against median UMI counts per cell")
fig, ax = plt.subplots()
x=list(np.log10(plt_tab['Estimated number of cells']))
y=list(np.log10(plt_tab['Median UMI counts per cell']))
@@ -289,4 +293,6 @@ def parse_10x_cellranger_count(path_df, convert_df, path_col='metrics_summary_p
plt.grid(axis='both', color='0.95')
# save
plt.savefig(os.path.join(args.figdir,'10x_cells_by_UMIs.png'))
- plt.clf()
+ plt.clf()
+
+ L.info("Done")
25 changes: 14 additions & 11 deletions panpipes/python_scripts/concat_adata.py
@@ -62,11 +62,11 @@

parser.set_defaults(verbose=True)
args, opt = parser.parse_known_args()
L.info("running with:")
L.info(args)
L.info("Running with params: %s", args)

if len(args.input_files_str) == 0:
sys.exit("no input files detected")
L.error("No input files detected")
sys.exit("No input files detected")

lf = re.split(',', args.input_files_str)

@@ -75,6 +75,7 @@
# lf = glob.glob("./tmp/*raw.h5mu")
# sfile= "CARTC_bonemarrow_samples.tsv"

L.info("Reading in submission file from '%s'" % sfile)
caf = pd.read_csv(sfile, sep="\t")
# the modality argument is relevant only if use_muon is True.
# mdatas = [read_anndata(i, use_muon, modality="all") for i in lf]
@@ -95,6 +96,7 @@
del temp
elif 'prot' in temp.mod.keys() or 'rna' in temp.mod.keys():
## IF RNA and PROT is ok to concatenate ----------
L.info("Concatenating RNA and prot")
mdata = concat_mdatas(mdatas,
batch_key="sample_id",
join_type=args.join_type)
@@ -106,7 +108,7 @@
L.debug(mdata.obs.columns)
L.debug(mdata.obs.head())

L.info("add metadata")
L.info("Adding metadata")
# add metadata to each object
metadatacols = args.metadatacols
# metadatacols="mdata_cols"
@@ -148,7 +150,7 @@
# which column is missing?
missingcols= " ".join([x for x in metadatacols if x not in caf.columns])
L.error("Required columns missing form samples file: %s" % missingcols)
sys.exit("Exiting because required columns not in samples file, check the log file for details")
sys.exit("Required columns missing form samples file: %s" % missingcols)


# if 'rep' in mdata.mod.keys():
@@ -158,7 +160,7 @@

# add in cell level metadata
if args.barcode_mtd_df is not None :
L.info("Add barcode level metadata")
L.info("Adding barcode level metadata")
# check demult_metadatacols exists and contains antibody
barcode_metadatacols = args.barcode_mtd_metadatacols.split(',')
# load the demultiplexing data
@@ -181,12 +183,13 @@
# update the protein variable to add in extra info like isotype and alternate name for prot
if args.protein_var_table is not None:
try:
L.info("Reading in protein var table from '%s'" % args.protein_var_table)
df = pd.read_csv(args.protein_var_table, sep='\t', index_col=0)
L.info("merging protein table with var")
L.info("Merging protein table with var")
# add_var_mtd(mdata['prot'], df)
var_df = mdata['prot'].var.merge(df, left_index=True, right_index=True)
if args.protein_new_index_col is not None:
L.info("updating prot.var index")
L.info("Updating prot.var index")
# update_var_index(mdata['prot'], args.protein_new_index_col)
var_df =var_df.reset_index().set_index(args.protein_new_index_col)
var_df = var_df.rename(columns={'index':'orig_id'})
@@ -202,7 +205,7 @@
# subset old modality to remove hashing
mdata.mod['prot'] = mdata["prot"][:, ~mdata["prot"].var["hashing_ab"]]
except FileNotFoundError:
warnings.warn("protein metadata table not found")
warnings.warn("Protein metadata table not found")
mdata.update()
# tidy up metadata
# move sample_id to the front
@@ -211,8 +214,8 @@
# mdata.obs = mdata.obs.reindex(columns=cols)
L.debug(mdata.obs.dtypes)

L.info("writing to file {}".format(str(args.output_file)))
L.info("Writing to file '%s'" % args.output_file)

mdata.write(args.output_file)

L.info("done")
L.info("Done")
28 changes: 19 additions & 9 deletions panpipes/python_scripts/make_adata_from_csv.py
@@ -80,7 +80,7 @@

parser.set_defaults(verbose=True)
args, opt = parser.parse_known_args()
- L.info(args)
+ L.info("Running with params: %s", args)
# Make mdata object

# unimodal mu (check if all the modalities)
@@ -99,40 +99,50 @@
all_files = {nm: x for (nm, x) in all_files.items() if nm in permf}
# this will work for any combination of modalities or number of modalities in all_files.
[check_filetype(x[0], x[1]) for x in all_files.values()]
L.info("Creating MuData")
mdata = load_mdata_from_multiple_files(all_files)

# now let's do some extra processing on the different modalities
L.info("Preprocessing each modality")
if 'rna' in mdata.mod.keys():
L.info("Modality: RNA")
L.info("Making var names unique")
mdata['rna'].var_names_make_unique()
# keep only rna with at least 1 count (relevant when loading raw data)
mdata['rna'].obs['total_counts'] = mdata['rna'].X.sum(axis=1)
L.info("Subsetting on cells with total_counts > 1.")
mu.pp.filter_obs(mdata['rna'], 'total_counts', lambda x: (x > 1))
# drop this column as we don't need it anymore, and it will confuse things downstream
mdata['rna'].obs = mdata['rna'].obs.drop(columns="total_counts")
mdata.update()

if 'prot' in mdata.mod.keys():
L.info("Modality: prot")
if 'rna' in mdata.mod.keys():
if check_for_bool(args.subset_prot_barcodes_to_rna):
L.info("Intersecting observations of modalities RNA and prot")
intersect_obs_by_mod(mdata, ['rna', 'prot'])
mdata.update()
L.info(mdata['prot'].var.head())

if 'atac' in mdata.mod.keys():
L.info("Modality: ATAC")
L.info("Making var names unique")
mdata['atac'].var_names_make_unique()
if args.per_barcode_metrics_file is not None:
L.info("Returning mudata with metadata from barcode metrics csv")
L.info("Merging in metadata from barcode metrics csv '%s'" % args.per_barcode_metrics_file)
per_barcode_metrics_file = pd.read_csv(args.per_barcode_metrics_file, index_col=0) #it's a csv by default and goes in atac obs
mdata.mod['atac'].obs = mdata.mod['atac'].obs.merge(per_barcode_metrics_file, left_index=True, right_index=True)
if args.peak_annotation_file is not None:
L.info("Returning mudata with peak annotation in mdata['atac'].uns['atac']['peak_annotation']")
L.info("Saving peak annotation from '%s' to mdata['atac'].uns['atac']['peak_annotation']" % args.peak_annotation_file)
add_peak_annotation(mdata, annotation=args.peak_annotation_file, sep="\t", return_annotation= False)
if args.fragments_file is not None:
L.info("Returning mudata with fragments file in mdata['atac'].uns['files']['fragments']")
L.info("Saving fragments file from '%s' to mdata['atac'].uns['files']['fragments']" % args.fragments_file)
locate_fragments(mdata,fragments=args.fragments_file, return_fragments= False)


if 'rep' in mdata.mod.keys():
L.info("Modality: rep")
L.info("Saving count metadata in .obs as numeric")
col_update = mdata['rep'].obs.columns[mdata['rep'].obs.columns.str.contains("count")]
mdata['rep'].obs[col_update] = mdata['rep'].obs[col_update].apply(pd.to_numeric)
mdata.update()
@@ -142,21 +152,21 @@
for mm in mdata.mod.keys():
mdata[mm].var_names_make_unique()

L.info("Saving sample id to mdata.obs['sample_id']")
mdata.obs['sample_id'] = str(args.sample_id)

# copy the sample_id to each modality
+ print("Saving sample_id to each modality")
for mm in mdata.mod.keys():
- print("saving sample_id to each modality")
# mdata[mm].obs['sample_id'] = mdata.obs['sample_id']
mdata[mm].obs['sample_id'] = mdata.obs.loc[mdata[mm].obs_names,:]['sample_id']

mdata.update()


L.info("saving data")
L.debug(mdata)
L.info("Saving MuData to '%s'" % args.output_file)
# this will write the file as anndata or muon depending on the class of mdata.
mdata.write(args.output_file)

L.info("done")
L.info("Done")
