From 4c41400f3d1470a49d0e6048e6e6212c4b4211f9 Mon Sep 17 00:00:00 2001 From: Ian Fiddes Date: Thu, 31 Aug 2017 11:20:43 +0100 Subject: [PATCH] Bug fix for constructing hints from protein sequences. NOTE: This update requires that you update your Kent repository to at least commit 5b8e436 and rebuild to get the latest version of `pslCheck`. --- cat/__init__.py | 5 +++-- cat/hgm.py | 16 +++++++++++----- cat/hints_db.py | 12 +++++++----- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/cat/__init__.py b/cat/__init__.py index cb528e61..caec28e0 100644 --- a/cat/__init__.py +++ b/cat/__init__.py @@ -1632,8 +1632,9 @@ def validate(self): 'package not in global path.'.format(tool)) if not tools.misc.is_exec('halLiftover'): raise ToolMissingException('halLiftover from the halTools package not in global path.') - if not tools.misc.is_exec('bedtools'): - raise ToolMissingException('bedtools is required for the homGeneMapping module.') + for tool in ['bedtools', 'bedSort']: + if not tools.misc.is_exec(tool): + raise ToolMissingException('{} is required for the homGeneMapping module.'.format(tool)) def requires(self): pipeline_args = self.get_pipeline_args() diff --git a/cat/hgm.py b/cat/hgm.py index e098f6d2..78be3fab 100644 --- a/cat/hgm.py +++ b/cat/hgm.py @@ -159,11 +159,17 @@ def extract_exon_hints(hints_db, in_gtf, genome): cmd = ['bedtools', 'merge', '-i', hints_file, '-c', '4', '-o', 'mean'] tools.procOps.run_proc(cmd, stdout=merged_hints_file, stderr='/dev/null') # overlap the merged exons with the given GTF, producing a final set. - cmd = [['grep', '\texon\t', in_gtf], # exons only - ['cut', '-d', '\t', '-f', '1,4,5'], # slice into BED format - ['sort'], # sort to make unique - ['uniq'], # make unique to remove duplicate exons - ['bedtools', 'intersect', '-a', 'stdin', '-b', merged_hints_file, '-f', '0.8', '-wa', '-wb'], + tmp_bed = tools.fileOps.get_tmp_file() + cmd = [['grep', '-P', '(\texon\t|\tCDS\t)', in_gtf], # exons or CDS only + ['cut', '-d', '\t', '-f', '1,4,5']] # slice into BED-like format with GTF intervals + tools.procOps.run_proc(cmd, stdout=tmp_bed) + # sort the BED + tools.procOps.run_proc(['bedSort', tmp_bed, tmp_bed]) + # merge the CDS and exon intervals + tmp_merged = tools.fileOps.get_tmp_file() + tools.procOps.run_proc(['bedtools', 'merge', '-i', tmp_bed], stdout=tmp_merged) + # intersect with hints and retain scores + cmd = [['bedtools', 'intersect', '-a', tmp_merged, '-b', merged_hints_file, '-f', '0.8', '-wa', '-wb'], # bedtools reports both entire A and entire B if at least 80% of A overlaps a B ['cut', '-d', '\t', '-f', '1,2,3,7']] # retain the A positions with the B score # these BED-like records are actually GFF intervals with 1-based starts and closed intervals diff --git a/cat/hints_db.py b/cat/hints_db.py index cf3f257e..e33f5290 100644 --- a/cat/hints_db.py +++ b/cat/hints_db.py @@ -21,6 +21,7 @@ import tools.transcripts import tools.bio from exceptions import UserException +from tools.pipeline import ProcException logger = logging.getLogger(__name__) @@ -299,8 +300,12 @@ def run_protein_blat(job, protein_subset, genome_fasta_file_id): tools.bio.write_fasta(outf, name, str(seq)) # perform alignment tmp_psl = tools.fileOps.get_tmp_toil_file() - cmd = ['blat', '-t=dnax', '-q=prot', '-noHead', genome_fasta, protein_fasta, tmp_psl] - tools.procOps.run_proc(cmd) + cmd = [['blat', '-t=dnax', '-q=prot', '-noHead', genome_fasta, protein_fasta, '/dev/stdout'], + ['pslCheck', '-skipInsertCounts', '/dev/stdin', '-pass={}'.format(tmp_psl)]] + try: # we expect pslCheck to fail + tools.procOps.run_proc(cmd, stderr='/dev/null') + except ProcException: + pass return job.fileStore.writeGlobalFile(tmp_psl) @@ -320,9 +325,6 @@ def convert_blat_results_to_hints(job, results): ['perl', '-ne', '@f=split; print if ($f[0]>=100)'], ['blat2hints.pl', '--in=/dev/stdin', '--nomult', '--ep_cutoff=5', '--out={}'.format(out_hints)]] tools.procOps.run_proc(cmd) - # fix the names - cmd = ['sed', '-i', 's/exon/CDS/; s/ep/CDSpart/', out_hints] - tools.procOps.run_proc(cmd) return job.fileStore.writeGlobalFile(out_hints)