Changed default behaviour of pycoQC during initial file parsing

a-slide · Jun 28, 2017 · 3eee83f · 3eee83f
1 parent afdeefe
commit 3eee83f
Show file tree

Hide file tree

Showing 2 changed files with 284 additions and 194 deletions.
diff --git a/pycoQC/pycoQC.py b/pycoQC/pycoQC.py
@@ -36,18 +36,18 @@ class pycoQC():
     #~~~~~~~FUNDAMENTAL METHODS~~~~~~~#    
     def __init__ (self, seq_summary_file, runid=None, verbose=False):
         """
-        Parse Albacore sequencing_summary.txt file and cleanup the data
+        Parse Albacore sequencing_summary.txt file and clean-up the data
         * seq_summary_file
             Path to the sequencing_summary.txt generated by Albacore
         * runid
-            If you want a specific runid to be analysed. Usually there are 2 runids per minion experiment, the mux run and the sequencing
-            run. By default it will analyse the runid with the most reads, ie the sequencing run. [Default None]
+            If you want a specific runid to be analysed. By default it will analyse all the reads in the file irrespective of their runid
+            [Default None]
         * verbose
             print additional informations. [Default False]
         """
         self.verbose=verbose
 
-        # import in a dataframe
+        # Import the summary file in a dataframe
         self.seq_summary_file = seq_summary_file
         self.df = pd.read_csv(seq_summary_file, sep ="\t")
         self.df.dropna(inplace=True)
@@ -57,32 +57,23 @@ def __init__ (self, seq_summary_file, runid=None, verbose=False):
             assert colname in self.df.columns, "Column {} not found in the provided sequence_summary file".format(colname)
 
         # Find or verify runid
-        runid_counts = self.df['run_id'].value_counts(sort=True)
-
-        if not runid:
-            if self.verbose:
-                print ("Runid found in the datasets")
-                runid_counts.name = "Count"
-                runid_df = pd.DataFrame(runid_counts)
-                runid_df.columns.name = "Run_ID"
-                display(runid_df)
-                print ("Selecting Run_ID {}".format(runid_counts.index[0]))
-
-            self.runid = runid_counts.index[0]
-            self.total_reads = runid_counts.loc[self.runid]
-
-        else:
-            self.runid = runid
-            self.total_reads = runid_counts.loc[self.runid]
+        if verbose:
+            print ("Runid found in the datasets")
+            runid_counts = self.df['run_id'].value_counts(sort=True).to_frame(name="Counts")
+            display(runid_counts)
+
+        # Select Runid if required
+        if runid:
+            if verbose:
+                print ("Selecting reads with Run_ID {}".format(runid))
+            self.df = self.df[(self.df["run_id"] == runid)]
 
         # Extract the runid data from the overall dataframe
-        self.df = self.df[(self.df["run_id"] == self.runid)]
         self.df = self.df.reset_index(drop=True)
         self.df.set_index("read_id", inplace=True)
-        #self.df.drop(['filename', 'run_id'], axis=1, inplace=True)
+        self.total_reads = len(self.df)
 
         if self.verbose:
-            print ("Dataframe head")
             display (self.df.head())
 
     def __str__(self):
@@ -421,7 +412,7 @@ def reads_len_quality (self, figsize=12, kde=True, scatter=True, margin_plot=Tru
         xmax=None, ymin=None, ymax=None, **kwargs):
         """
         Draw a bivariate plot of read length vs mean read quality with marginal univariate plots.
-        The bivariate kde can takes time to calculate depending on the number of datapoints 
+        The bivariate kde can take time to calculate depending on the number of data points
         * figsize
             Size of square ploting area [Default 12]
         * kde