Skip to content

Commit

Permalink
Bug fix for constructing hints from protein sequences.
Browse files Browse the repository at this point in the history
NOTE: This update requires that you update your Kent repository to at least commit 5b8e436 and rebuild to get the latest version of `pslCheck`.
  • Loading branch information
ifiddes committed Aug 31, 2017
1 parent 9c84e48 commit 4c41400
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 12 deletions.
5 changes: 3 additions & 2 deletions cat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1632,8 +1632,9 @@ def validate(self):
'package not in global path.'.format(tool))
if not tools.misc.is_exec('halLiftover'):
raise ToolMissingException('halLiftover from the halTools package not in global path.')
if not tools.misc.is_exec('bedtools'):
raise ToolMissingException('bedtools is required for the homGeneMapping module.')
for tool in ['bedtools', 'bedSort']:
if not tools.misc.is_exec(tool):
raise ToolMissingException('{} is required for the homGeneMapping module.'.format(tool))

def requires(self):
pipeline_args = self.get_pipeline_args()
Expand Down
16 changes: 11 additions & 5 deletions cat/hgm.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,11 +159,17 @@ def extract_exon_hints(hints_db, in_gtf, genome):
cmd = ['bedtools', 'merge', '-i', hints_file, '-c', '4', '-o', 'mean']
tools.procOps.run_proc(cmd, stdout=merged_hints_file, stderr='/dev/null')
# overlap the merged exons with the given GTF, producing a final set.
cmd = [['grep', '\texon\t', in_gtf], # exons only
['cut', '-d', '\t', '-f', '1,4,5'], # slice into BED format
['sort'], # sort to make unique
['uniq'], # make unique to remove duplicate exons
['bedtools', 'intersect', '-a', 'stdin', '-b', merged_hints_file, '-f', '0.8', '-wa', '-wb'],
tmp_bed = tools.fileOps.get_tmp_file()
cmd = [['grep', '-P', '(\texon\t|\tCDS\t)', in_gtf], # exons or CDS only
['cut', '-d', '\t', '-f', '1,4,5']] # slice into BED-like format with GTF intervals
tools.procOps.run_proc(cmd, stdout=tmp_bed)
# sort the BED
tools.procOps.run_proc(['bedSort', tmp_bed, tmp_bed])
# merge the CDS and exon intervals
tmp_merged = tools.fileOps.get_tmp_file()
tools.procOps.run_proc(['bedtools', 'merge', '-i', tmp_bed], stdout=tmp_merged)
# intersect with hints and retain scores
cmd = [['bedtools', 'intersect', '-a', tmp_merged, '-b', merged_hints_file, '-f', '0.8', '-wa', '-wb'],
# bedtools reports both entire A and entire B if at least 80% of A overlaps a B
['cut', '-d', '\t', '-f', '1,2,3,7']] # retain the A positions with the B score
# these BED-like records are actually GFF intervals with 1-based starts and closed intervals
Expand Down
12 changes: 7 additions & 5 deletions cat/hints_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import tools.transcripts
import tools.bio
from exceptions import UserException
from tools.pipeline import ProcException

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -299,8 +300,12 @@ def run_protein_blat(job, protein_subset, genome_fasta_file_id):
tools.bio.write_fasta(outf, name, str(seq))
# perform alignment
tmp_psl = tools.fileOps.get_tmp_toil_file()
cmd = ['blat', '-t=dnax', '-q=prot', '-noHead', genome_fasta, protein_fasta, tmp_psl]
tools.procOps.run_proc(cmd)
cmd = [['blat', '-t=dnax', '-q=prot', '-noHead', genome_fasta, protein_fasta, '/dev/stdout'],
['pslCheck', '-skipInsertCounts', '/dev/stdin', '-pass={}'.format(tmp_psl)]]
try: # we expect pslCheck to fail
tools.procOps.run_proc(cmd, stderr='/dev/null')
except ProcException:
pass
return job.fileStore.writeGlobalFile(tmp_psl)


Expand All @@ -320,9 +325,6 @@ def convert_blat_results_to_hints(job, results):
['perl', '-ne', '@f=split; print if ($f[0]>=100)'],
['blat2hints.pl', '--in=/dev/stdin', '--nomult', '--ep_cutoff=5', '--out={}'.format(out_hints)]]
tools.procOps.run_proc(cmd)
# fix the names
cmd = ['sed', '-i', 's/exon/CDS/; s/ep/CDSpart/', out_hints]
tools.procOps.run_proc(cmd)
return job.fileStore.writeGlobalFile(out_hints)


Expand Down

0 comments on commit 4c41400

Please sign in to comment.