Skip to content
This repository has been archived by the owner on Oct 18, 2024. It is now read-only.

Commit

Permalink
Changed default behaviour of pycoQC during initial file parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
a-slide committed Jun 28, 2017
1 parent afdeefe commit 3eee83f
Show file tree
Hide file tree
Showing 2 changed files with 284 additions and 194 deletions.
41 changes: 16 additions & 25 deletions pycoQC/pycoQC.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,18 @@ class pycoQC():
#~~~~~~~FUNDAMENTAL METHODS~~~~~~~#
def __init__ (self, seq_summary_file, runid=None, verbose=False):
"""
Parse Albacore sequencing_summary.txt file and cleanup the data
Parse Albacore sequencing_summary.txt file and clean up the data
* seq_summary_file
Path to the sequencing_summary.txt generated by Albacore
* runid
If you want a specific runid to be analysed. Usually there are 2 runids per minion experiment, the mux run and the sequencing
run. By default it will analyse the runid with the most reads, ie the sequencing run. [Default None]
If you want a specific runid to be analysed. By default it will analyse all the reads in the file irrespective of their runid
[Default None]
* verbose
Print additional information. [Default False]
"""
self.verbose=verbose

# import in a dataframe
# Import the summary file in a dataframe
self.seq_summary_file = seq_summary_file
self.df = pd.read_csv(seq_summary_file, sep ="\t")
self.df.dropna(inplace=True)
Expand All @@ -57,32 +57,23 @@ def __init__ (self, seq_summary_file, runid=None, verbose=False):
assert colname in self.df.columns, "Column {} not found in the provided sequence_summary file".format(colname)

# Find or verify runid
runid_counts = self.df['run_id'].value_counts(sort=True)

if not runid:
if self.verbose:
print ("Runid found in the datasets")
runid_counts.name = "Count"
runid_df = pd.DataFrame(runid_counts)
runid_df.columns.name = "Run_ID"
display(runid_df)
print ("Selecting Run_ID {}".format(runid_counts.index[0]))

self.runid = runid_counts.index[0]
self.total_reads = runid_counts.loc[self.runid]

else:
self.runid = runid
self.total_reads = runid_counts.loc[self.runid]
if verbose:
print ("Runid found in the datasets")
runid_counts = self.df['run_id'].value_counts(sort=True).to_frame(name="Counts")
display(runid_counts)

# Select Runid if required
if runid:
if verbose:
print ("Selecting reads with Run_ID {}".format(runid))
self.df = self.df[(self.df["run_id"] == runid)]

# Extract the runid data from the overall dataframe
self.df = self.df[(self.df["run_id"] == self.runid)]
self.df = self.df.reset_index(drop=True)
self.df.set_index("read_id", inplace=True)
#self.df.drop(['filename', 'run_id'], axis=1, inplace=True)
self.total_reads = len(self.df)

if self.verbose:
print ("Dataframe head")
display (self.df.head())

def __str__(self):
Expand Down Expand Up @@ -421,7 +412,7 @@ def reads_len_quality (self, figsize=12, kde=True, scatter=True, margin_plot=Tru
xmax=None, ymin=None, ymax=None, **kwargs):
"""
Draw a bivariate plot of read length vs mean read quality with marginal univariate plots.
The bivariate kde can takes time to calculate depending on the number of datapoints
The bivariate kde can take time to calculate depending on the number of data points
* figsize
Size of square plotting area [Default 12]
* kde
Expand Down
Loading

0 comments on commit 3eee83f

Please sign in to comment.