Add new plotting function read len vs qual

a-slide · Jun 22, 2017 · 64781f0 · 64781f0
1 parent 6bca488
commit 64781f0
Show file tree

Hide file tree

Showing 13 changed files with 56,323 additions and 56,134 deletions.
diff --git a/README.ipynb b/README.ipynb
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 # pycoQC 1.0.dev1
 
 ___
-**pycoQC is a Python 3 package for Jupyter Notebook, computing metrics and generating simple QC plots from Oxford Nanopore technologies (ONT) Albacore basecaller**
+**pycoQC is a python3 package for Jupyter Notebook, computing metrics and generating simple QC plots from Oxford Nanopore technologies (ONT) Albacore basecaller**
 ___
 
 pycoQC is a very simple quality control package for Nanopore data written in pure python3, meant to be used directly in a jupyter notebook 4.0.0 +. As opposed to current and more exhaustive QC programs for nanopore data, pycoQC is very fast as it relies entirely on the *sequencing_summary.txt* file generated by ONT Albacore Sequencing Pipeline Software 1.2.1+, during base calling. Consequently, pycoQC will only provide read level metrics (and not at base level)
@@ -137,7 +137,7 @@ pl.style.use('ggplot')
 from pycoQC.pycoQC import pycoQC as pcq
 ```
 
-* A sample test file generated by Albacore can be obtained from the package data, or you can use your own file
+* A sample test file generated by Albacore can be obtained from the package data using *pkg_resources*, or you can use your own file
 
 
 ```python
@@ -157,12 +157,12 @@ print(p)
     	Parameters list
     	runid	ad3de3b63de71c4c6d5ea4470a82782cf51210d9
     	seq_summary_file	/home/aleg/Programming/Python3/pycoQC/pycoQC/data/sequencing_summary.txt
-    	total_reads	126724
+    	total_reads	126583
     	verbose	False
-
 
 
-Plots can be generated by calling the pycoCQ object with one of the 5 available plotting functions.
+
+Plots can be generated by calling the pycoCQ object with one of the 6 available plotting functions.
 
 Each function has specific options that are comprehensively detailed in the testing notebook provided with the package or in html version on nbviewer: [link to test_notebook](https://nbviewer.jupyter.org/github/a-slide/pycoQC/blob/master/pycoQC/test_pycoQC.ipynb?flush_cache=true)
 
@@ -186,7 +186,7 @@ g = p.mean_qual_distribution()
 
 
 ```python
-g = p.output_over_time()
+g = p.extra/output_over_time()
 ```
 
 
@@ -211,6 +211,15 @@ g = p.reads_len_distribution()
 ![png](extra/output_50_0.png)
 
 
+
+```python
+g = p.reads_len_quality()
+```
+
+
+![png](extra/output_51_0.png)
+
+
 ## Authors and Contact
 
 Adrien Leger - 2017

diff --git a/README.rst b/README.rst
@@ -174,7 +174,7 @@ Using pycoQC
     from pycoQC.pycoQC import pycoQC as pcq
 
 -  A sample test file generated by Albacore can be obtained from the
-   package data, or you can use your own file
+   package data using *pkg\_resources*, or you can use your own file
 
 .. code:: python
 
@@ -196,12 +196,12 @@ Using pycoQC
     	Parameters list
     	runid	ad3de3b63de71c4c6d5ea4470a82782cf51210d9
     	seq_summary_file	/home/aleg/Programming/Python3/pycoQC/pycoQC/data/sequencing_summary.txt
-    	total_reads	126724
+    	total_reads	126583
     	verbose	False
     
 
 
-Plots can be generated by calling the pycoCQ object with one of the 5
+Plots can be generated by calling the pycoCQ object with one of the 6
 available plotting functions.
 
 Each function has specific options that are comprehensively detailed in
@@ -254,6 +254,15 @@ test\_notebook <https://nbviewer.jupyter.org/github/a-slide/pycoQC/blob/master/p
 .. image:: extra/output_50_0.png
 
 
+.. code:: python
+
+    g = p.reads_len_quality()
+
+
+
+.. image:: extra/output_51_0.png
+
+
 Authors and Contact
 -------------------
 

diff --git a/extra/output_46_0.png b/extra/output_46_0.png
diff --git a/extra/output_47_0.png b/extra/output_47_0.png
diff --git a/extra/output_48_0.png b/extra/output_48_0.png
diff --git a/extra/output_49_0.png b/extra/output_49_0.png
diff --git a/extra/output_50_0.png b/extra/output_50_0.png
diff --git a/extra/output_51_0.png b/extra/output_51_0.png
diff --git a/pycoQC/data/sequencing_summary.txt b/pycoQC/data/sequencing_summary.txt
diff --git a/pycoQC/pycoQC.py b/pycoQC/pycoQC.py
@@ -41,9 +41,9 @@ def __init__ (self, seq_summary_file, runid=None, verbose=False):
             Path to the sequencing_summary.txt generated by Albacore
         * runid
             If you want a specific runid to be analysed. Usually there are 2 runids per minion experiment, the mux run and the sequencing
-            run. By default it will analyse the runid with the most reads, ie the sequencing run. [Default: None]
+            run. By default it will analyse the runid with the most reads, ie the sequencing run. [Default None]
         * verbose
-            print additional informations. [Default: False]
+            print additional informations. [Default False]
         """
         self.verbose=verbose
 
@@ -53,7 +53,7 @@ def __init__ (self, seq_summary_file, runid=None, verbose=False):
         self.df.dropna(inplace=True)
 
         # Verify the presence of the columns required for pycoQC
-        for colname in ['channel', 'start_time', 'duration', 'num_events','sequence_length_template', 'mean_qscore_template']:
+        for colname in ['run_id', 'channel', 'start_time', 'duration', 'num_events','sequence_length_template', 'mean_qscore_template']:
             assert colname in self.df.columns, "Column {} not found in the provided sequence_summary file".format(colname)
 
         # Find or verify runid
@@ -103,21 +103,21 @@ def channels_activity (self, level="reads", figsize=[24,12], cmap="OrRd", alpha=
         Plot the activity of channels at read, base or event level. Based on Seaborn heatmap function. The layout does not represent the
         physical layout of the flowcell, and   
         * level
-            Aggregate channel output results by "reads", "bases" or "events". [Default: "reads"]
+            Aggregate channel output results by "reads", "bases" or "events". [Default "reads"]
         * figsize 
-            Size of ploting area [Default: [24,12]]
+            Size of ploting area [Default [24,12]]
         * cmap
-            Matplotlib colormap code to color the space [Default: "OrRd"]
+            Matplotlib colormap code to color the space [Default "OrRd"]
         * alpha
-            Opacity of the area from 0 to 1 [Default: 1]
+            Opacity of the area from 0 to 1 [Default 1]
         * robust
-            if True the colormap range is computed with robust quantiles instead of the extreme values [Default: True]
+            if True the colormap range is computed with robust quantiles instead of the extreme values [Default True]
         * annot
-            If True, write the data value in each cell. [Default: True]
+            If True, write the data value in each cell. [Default True]
         * fmt
-            String formatting code to use when adding annotations (see matplotlib documentation) [Default: "d"]
+            String formatting code to use when adding annotations (see matplotlib documentation) [Default "d"]
         * cbar
-            Whether to draw a colorbar scale on the right of the graph [Default: False]
+            Whether to draw a colorbar scale on the right of the graph [Default False]
         => Return
             A matplotlib.axes object for further user customisation (http://matplotlib.org/api/axes_api.html)
         """
@@ -162,11 +162,11 @@ def mean_qual_distribution (self, figsize=[30,7], color="orangered", alpha=0.5,
         """
         Plot the distribution of the mean read PHRED qualities
         * figsize
-            Size of ploting area [Default:figsize=[30,7]
+            Size of ploting area [Default [30,7]
         * color
             Color of the plot. Valid matplotlib color code [Default "orangered"]
         * alpha
-            Opacity of the area from 0 to 1 [Default: 1]
+            Opacity of the area from 0 to 1 [Default 1]
         * normed
             Normalised results. Frenquency rather than counts [Default True]
         * win_size
@@ -196,13 +196,13 @@ def output_over_time (self, level="reads", figsize=[30,7], color="orangered", al
         """
         Plot the output over the time of the experiment at read, base or event level
         * level
-            Aggregate channel output results by "reads", "bases" or "events" [Default: "reads"]
+            Aggregate channel output results by "reads", "bases" or "events" [Default "reads"]
         * figsize
-            Size of ploting area [Default:figsize=[30,7]
+            Size of ploting area [Default [30,7]
         * color
             Color of the plot. Valid matplotlib color code [Default "orangered"]
         * alpha
-            Opacity of the area from 0 to 1 [Default: 0.5]
+            Opacity of the area from 0 to 1 [Default 0.5]
         * win_size
             Size of the bins in hours [Default 0.25]
         * cumulative
@@ -239,11 +239,11 @@ def quality_over_time (self, figsize=[30,7], color="orangered", alpha=0.25, win_
         """
         Plot the evolution of the mean read quality over the time of the experiment at read, base or event level
         * figsize
-            Size of ploting area [Default:figsize=[30,7]
+            Size of ploting area [Default [30,7]
         * color
             Color of the plot. Valid matplotlib color code [Default "orangered"]
         * alpha
-            Opacity of the area from 0 to 1 [Default: 0.25]
+            Opacity of the area from 0 to 1 [Default 0.25]
         * win_size
             Size of the bins in hours [Default 0.25]
         => Return
@@ -274,58 +274,115 @@ def quality_over_time (self, figsize=[30,7], color="orangered", alpha=0.25, win_
 
         return ax
 
-    def reads_len_distribution (self, figsize=[30,7], color="orangered", alpha=0.5, win_size=1000, normed=False,
-                                xlog=False, ylog=False, xmin=1, xmax=None, ymin=1, ymax=None, **kwargs):
+    def reads_len_distribution (self, figsize=[30,7], hist=True, kde=True, kde_color="black", hist_color="orangered", kde_alpha=0.5,
+        hist_alpha=0.5, win_size=500, xmin=None, xmax=None, ymin=None, ymax=None, **kwargs):
+
         """
         Plot the distribution of read length in base pairs
         * figsize
-            Size of ploting area [Default:figsize=[30,7]
+            Size of ploting area [Default [30,7]]
         * color
             Color of the plot. Valid matplotlib color code [Default "orangered"]
         * alpha
-            Opacity of the area from 0 to 1 [Default: 0.5]
+            Opacity of the area from 0 to 1 [Default 0.5]
+        * hist
+            If True plot an histogram of distribution [Default True]
+        * kde
+            If True plot a univariate kernel density estimate [Default True]
+        * kde_color / hist_color
+            Color map or color codes to use for the 3 plots [Default "black" "orangered"]
+        * kde_alpha / hist_alpha
+            Opacity of the area from 0 to 1 for the 3 plots [Default 0.5 0.5]
         * win_size
-            Size of the bins in base pairs [Default 1000]
-        * normed
-            Normalised results. Frenquency rather than counts [Default False]
-        * xlog, ylog
-            Use log10 scale for x and y axis [Default True, True]
-        * xmin, xmax
-            Lower and upper limits on x axis [Default 1, None]
-        * ymin, ymax
-            Lower and upper limits on y axis [Default 1, None]
+            Size of the bins in base pairs for the histogram [Default 500]
+        * xmin, xmax, ymin, ymax
+            Lower and upper limits on x/y axis [Default None]
         => Return
             A matplotlib.axes object for further user customisation (http://matplotlib.org/api/axes_api.html)
         """
+        # Extract length limits
         if not xmax:
             xmax = max(self.df['sequence_length_template'])
+        if not xmin:
+            xmin = 0
 
-        if xlog:
-            bins = np.logspace(int(np.log10(xmin)), int(np.log10(xmax))+1, num=int(xmax/win_size), endpoint=True, base=10)
-        else:
-            bins = np.arange(xmin, xmax, win_size)
-
-        # Plot the graph
+        # Plot
         fig, ax = pl.subplots(figsize=figsize)
-        ax = self.df['sequence_length_template'].plot.hist(bins=bins, ax=ax, normed=normed, color=color, alpha=alpha, histtype='stepfilled')
+        # Plot the kde graph
+
+        if kde:
+            sns.kdeplot(self.df["sequence_length_template"], ax=ax, color=kde_color, alpha=kde_alpha, shade=not hist, gridsize=500, legend=False)
+        # Plot a frequency histogram 
+        if hist:
+            ax = self.df['sequence_length_template'].plot.hist(
+                bins=np.arange(xmin, xmax, win_size), ax=ax, normed=True, color=hist_color, alpha=hist_alpha, histtype='stepfilled')
 
         # Tweak the plot       
         t = ax.set_title ("Distribution of reads length")
         t = ax.set_xlabel("Length in bp")
-        if normed:
-            t = ax.set_ylabel("Frequency")
-        else:
-            t = ax.set_ylabel("Read count")
+        t = ax.set_ylabel("Read Frequency")
+
+        if not ymin:
+            ymin = 0
+        if not ymax:
+            ymax = ax.get_ylim()[1]
+
+        t = ax.set_xlim([xmin, xmax])
+        t = ax.set_ylim([ymin, ymax])
+
+        return ax
 
-        if xlog:
-            ax.set_xscale("log")
-        if ylog:
-            ax.set_yscale("log")
+    def reads_len_quality (self, figsize=12, kde=True, scatter=True, margin_plot=True, kde_cmap="copper", scatter_color="orangered",
+        margin_plot_color="orangered", kde_alpha=1, scatter_alpha=0.01, margin_plot_alpha=0.5, kde_levels=10, kde_shade=False, xmin=None,
+        xmax=None, ymin=None, ymax=None, **kwargs):
+        """
+        Draw a bivariate plot of read length vs mean read quality with marginal univariate plots.
+        * figsize
+            Size of square ploting area [Default 12]
+        * kde
+            If True plot a bivariate kernel density estimate [Default True]
+        * scatter
+            If True plot a scatter plot  [Default true]
+        * margin_plot
+            If True plot marginal univariate distributions [Default True]
+        * kde_cmap / scatter_color / margin_plot_color
+            Color map or color codes to use for the 3 plots [Default "copper", "orangered", "orangered"]
+        * kde_alpha / scatter_alpha / margin_plot_alpha
+            Opacity of the area from 0 to 1 for the 3 plots [Default 1, 0.01, 0.5]
+        * kde_levels
+            Number of levels for the central density plot [Default 10]
+        * kde_shade
+            If True the density curves will be filled [Default False]
+        * xmin, xmax, ymin, ymax
+            Lower and upper limits on x/y axis [Default None]
+        => Return
+            A seaborn JointGrid object with the plot on it. (http://seaborn.pydata.org/generated/seaborn.JointGrid.html)
+        """
 
-        t = ax.set_xlim (xmin, xmax)
-        if ymax:
-            t = ax.set_ylim (ymin, ymax)
-        else:
-            t = ax.set_ylim (ymin, ax.get_ylim()[1])
+        # Plot the graph
+        g = sns.JointGrid("sequence_length_template", "mean_qscore_template", data=self.df, space=0.1, size=figsize)
+        if kde:
+            g = g.plot_joint(sns.kdeplot, cmap=kde_cmap, alpha=kde_alpha, shade=kde_shade, shade_lowest=False, n_levels=kde_levels)
+        if scatter:
+            g = g.plot_joint(pl.scatter, color=scatter_color, alpha=scatter_alpha)
+        if margin_plot:
+            g = g.plot_marginals(sns.kdeplot, shade=True, color=margin_plot_color, alpha=margin_plot_alpha)
+
+        # Tweak the plot
+        t = g.ax_marg_x.set_title ("Mean read quality per sequence length")
+        t = g.ax_joint.set_xlabel("Sequence length (bp)")
+        t = g.ax_joint.set_ylabel("Mean read quality (PHRED)")
 
-        return ax
+        if not xmin:
+            xmin = g.ax_joint.get_xlim()[0]
+        if not xmax:
+            xmax = g.ax_joint.get_xlim()[1]
+        if not ymin:
+            ymin = g.ax_joint.get_ylim()[0]
+        if not ymax:
+            ymax = g.ax_joint.get_ylim()[1]
+
+        t = g.ax_joint.set_xlim([xmin, xmax])
+        t = g.ax_joint.set_ylim([ymin, ymax])
+
+        return g