
Commit

ifiddes committed Jun 27, 2020
2 parents ff86398 + 8de91a8 commit b1e400f
Showing 8 changed files with 69 additions and 30 deletions.
6 changes: 3 additions & 3 deletions Dockerfile
@@ -9,7 +9,7 @@ RUN git clone git://github.com/samtools/htslib.git
RUN cd htslib && make install

# bcftools
RUN git clone git://github.com/samtools/bcftools.git
RUN cd bcftools && make

# samtools
@@ -33,7 +33,7 @@ RUN cd augustus && make
# HDF5
RUN wget -q http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.1/src/hdf5-1.10.1.tar.gz
RUN tar xzf hdf5-1.10.1.tar.gz
RUN cd hdf5-1.10.1 && ./configure --enable-cxx --prefix=/usr
RUN cd hdf5-1.10.1 && make && make install

# sonLib
@@ -62,7 +62,7 @@ RUN tar xvjf sambamba_v0.6.7_linux.tar.bz2

FROM ubuntu:18.04
RUN apt-get update
RUN apt-get install -y wget bedtools bamtools samtools sqlite3 libgsl0-dev libcolamd2 software-properties-common libcurl4-openssl-dev
RUN apt-get install -y wget bedtools bamtools samtools sqlite3 libgsl0-dev libcolamd2 software-properties-common libcurl4-openssl-dev exonerate
RUN add-apt-repository -y ppa:deadsnakes/ppa
RUN apt-get install -y python3.7 python3-pip
# Kent
2 changes: 2 additions & 0 deletions cat/__init__.py
@@ -398,6 +398,8 @@ def load_docker(self):
'Either install it or use a different option for --binary-mode.')
os.environ['SINGULARITY_PULLFOLDER'] = os.path.abspath(self.work_dir)
os.environ['SINGULARITY_CACHEDIR'] = os.path.abspath(self.work_dir)
if os.environ.get('SINGULARITY_IMAGE'):
return
tools.fileOps.ensure_dir(self.work_dir)
if not os.path.isfile(os.path.join(self.work_dir, 'cat.img')):
subprocess.check_call(['singularity', 'pull', '--name', 'cat.img',
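With this change, setting SINGULARITY_IMAGE makes load_docker() return before the singularity pull step, so a pre-built image can be reused across runs. A minimal sketch of driving the short-circuit (the image path is illustrative, not part of the commit):

    import os

    os.environ['CAT_BINARY_MODE'] = 'singularity'
    os.environ['SINGULARITY_IMAGE'] = '/shared/images/cat.img'  # hypothetical pre-pulled image
    # load_docker() now skips the pull; procOps.cmdLists() wraps commands with
    # this image instead of the cat.img it would otherwise pull into work_dir.
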
27 changes: 16 additions & 11 deletions cat/consensus.py
@@ -648,9 +648,11 @@ def add_exref_ids(s):
'source_gene_common_name': s.CommonName}

# bring in extra tags for exRef
if 'exRef' in denovo_tx_modes:
for key, val in tools.misc.parse_gff_attr_line(exref_annot.loc[aln_id].ExtraTags).items():
denovo_tx_dict[aln_id][key] = val
if tools.nameConversions.aln_id_is_exref(aln_id):
tags = exref_annot.loc[aln_id].ExtraTags
if len(tags) > 0:
for key, val in tools.misc.parse_gff_attr_line(tags).items():
denovo_tx_dict[aln_id][key] = val

# record some metrics
metrics['denovo'][tx_mode][s.TranscriptClass.replace('_', ' ').capitalize()] += 1
@@ -907,12 +909,15 @@ def convert_attrs(attrs, id_field):
else:
attrs['Name'] = attrs['gene_id']
# convert empty strings into nan
attrs = {x: y if y != '' else 'nan' for x, y in attrs.items()}
# don't include the support vectors in the string, they will be placed in their respective places
attrs_str = ['='.join([key, str(val).replace('=', '_')]) for key, val in sorted(attrs.items()) if 'support' not in key]
# explicitly escape any semicolons that may exist in the input strings
attrs_str = [x.replace(';', '%3B') for x in attrs_str]
return score, ';'.join(attrs_str)
attrs_str = []
for key, val in attrs.items():
val = str(val)
if len(val) == 0:
val = 'nan'
val = str(val).replace('=', '%3D').replace(';', '%3B')
key = key.replace('=', '%3D').replace(';', '%3B')
attrs_str.append(f"{key}={val}")
return score, ";".join(attrs_str)

def find_feature_support(attrs, feature, i):
"""Extracts the boolean value from the comma delimited string"""
@@ -929,7 +934,7 @@ def find_all_tx_modes(attrs_list):
for attrs in attrs_list:
tx_modes.update(attrs['transcript_modes'].split(','))
return ','.join(tx_modes)

intervals = set()
for tx in tx_objs:
intervals.update(tx.exon_intervals)
@@ -1025,4 +1030,4 @@ def write_consensus_fastas(consensus_gene_dict, consensus_fasta, consensus_prote
for tx_obj, attrs in tx_list:
tools.bio.write_fasta(cfa, attrs['transcript_id'], tx_obj.get_mrna(seq_dict))
if tx_obj.cds_size > 0:
tools.bio.write_fasta(cpfa, attrs['transcript_id'], tx_obj.get_protein_sequence(seq_dict))
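The rewritten convert_attrs loop maps empty values to 'nan' and percent-encodes '=' and ';' in both keys and values, keeping GFF3 column 9 parseable. A standalone sketch of the same escaping pattern (an illustration, not CAT's actual helper):

    def encode_gff3_attrs(attrs):
        # Join a dict into a GFF3 attribute string, escaping '=' and ';'.
        parts = []
        for key, val in sorted(attrs.items()):
            val = str(val)
            if len(val) == 0:
                val = 'nan'
            val = val.replace('=', '%3D').replace(';', '%3B')
            key = key.replace('=', '%3D').replace(';', '%3B')
            parts.append(f"{key}={val}")
        return ';'.join(parts)

    # encode_gff3_attrs({'gene_id': 'G1', 'note': 'a;b=c'}) == 'gene_id=G1;note=a%3Bb%3Dc'
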
2 changes: 1 addition & 1 deletion cat/hints_db.py
@@ -307,7 +307,7 @@ def run_protein_aln(job, protein_subset, genome_fasta_file_id):
# perform alignment
tmp_exonerate = tools.fileOps.get_tmp_toil_file()
cmd = ['exonerate', '--model', 'protein2genome', '--showvulgar', 'no', '--showalignment', 'no',
'--showquerygff', 'yes', genome_fasta, protein_fasta]
'--showquerygff', 'yes', protein_fasta, genome_fasta]
tools.procOps.run_proc(cmd, stdout=tmp_exonerate)
return job.fileStore.writeGlobalFile(tmp_exonerate)

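The argument swap matters because exonerate takes its positional arguments as <query> <target>: with --model protein2genome the protein FASTA is the query and the genome FASTA is the target, so the previous order aligned in the wrong direction. A hedged sketch of the corrected invocation outside of toil (file names are placeholders):

    import subprocess

    protein_fasta = 'proteins.fa'   # query: protein sequences
    genome_fasta = 'genome.fa'      # target: genome sequence
    cmd = ['exonerate', '--model', 'protein2genome', '--showvulgar', 'no',
           '--showalignment', 'no', '--showquerygff', 'yes',
           protein_fasta, genome_fasta]   # query before target
    with open('protein_hints.gff', 'w') as out:
        subprocess.check_call(cmd, stdout=out)
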
22 changes: 17 additions & 5 deletions logging.cfg
@@ -1,21 +1,33 @@
[loggers]
keys=root
keys=root,luigi,cat

[handlers]
keys=consoleHandler

[formatters]
keys=simpleFormatter
keys=consoleFormatter

[logger_root]
level=WARNING
handlers=consoleHandler

[logger_luigi]
level=INFO
handlers=consoleHandler
qualname=luigi-interface
propagate=0

[logger_cat]
level=INFO
handlers=consoleHandler
qualname=cat
propagate=0

[handler_consoleHandler]
class=StreamHandler
level=INFO
formatter=simpleFormatter
formatter=consoleFormatter
args=(sys.stdout,)

[formatter_simpleFormatter]
format=%(levelname)s: %(message)s
[formatter_consoleFormatter]
format=%(levelname)s: %(asctime)-15s - %(message)s
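The new luigi and cat loggers only take effect when this file is loaded through fileConfig; a minimal sketch of loading it (the path is illustrative):

    import logging
    import logging.config

    logging.config.fileConfig('logging.cfg', disable_existing_loggers=False)
    logging.getLogger('cat').info('INFO message with a timestamp')
    logging.getLogger('luigi-interface').info('luigi output is routed the same way')
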
25 changes: 17 additions & 8 deletions tools/gff3.py
@@ -7,6 +7,8 @@

reserved_keys = ['gene_biotype',
'transcript_biotype',
'gene_type',
'transcript_type',
'gene_name',
'gene_id',
'transcript_id',
@@ -29,17 +31,23 @@ def parse_attrs(attrs):
results = []
for tx_id, gene_id in tx_name_map.items():
d = attrs_dict[tx_id]
gene_biotype = d['gene_biotype']
tx_biotype = d['transcript_biotype']
gene_biotype = d.get('gene_biotype', d.get('gene_type', None))
if gene_biotype is None:
raise Exception("Did not find a gene biotype or gene type for {} (attrs={})".format(gene_id, d))
tx_biotype = d.get('transcript_biotype', d.get('transcript_type', None))
if tx_biotype is None:
raise Exception("Did not find a transcript biotype or type for {} (attrs={})".format(tx_id, d))
gene_name = d['gene_name']
gene_id = d['gene_id']
tx_id = d['transcript_id']
tx_name = d['transcript_name']
extra_tags = ';'.join(['{}={}'.format(x, y.replace(';', '%3B')) for x, y in d.items() if x not in reserved_keys])
try:
misc.parse_gff_attr_line(extra_tags)
except:
raise Exception('Error parsing extra tags in input GFF3')
extra_tags = ';'.join(['{}={}'.format(x, y.replace(';', '%3B').replace('=', '%3D'))
for x, y in d.items() if x not in reserved_keys])
if len(extra_tags) > 0:
try:
misc.parse_gff_attr_line(extra_tags)
except:
raise Exception(f'Error parsing extra tags in input GFF3 {extra_tags}')
if is_external_reference is True:
# hack to fix names
gene_id = f'exRef-{gene_id}'
@@ -53,5 +61,6 @@ def parse_attrs(attrs):

def convert_gff3_cmd(annotation_attrs, annotation):
cmd = ['gff3ToGenePred', '-rnaNameAttr=transcript_id', '-geneNameAttr=gene_id', '-honorStartStopCodons',
'-refseqHacks',
'-attrsOut={}'.format(annotation_attrs), annotation, '/dev/stdout']
return cmd
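The biotype lookup in parse_attrs now accepts both the Ensembl-style *_biotype keys and the GENCODE-style *_type keys. A standalone sketch of the fallback, using a hypothetical attribute dict:

    d = {'gene_type': 'protein_coding', 'transcript_type': 'protein_coding'}  # GENCODE-style attrs

    gene_biotype = d.get('gene_biotype', d.get('gene_type', None))
    if gene_biotype is None:
        raise Exception('Did not find a gene biotype or gene type')
    assert gene_biotype == 'protein_coding'
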
4 changes: 4 additions & 0 deletions tools/misc.py
@@ -102,13 +102,17 @@ def sort_gff(input_file, output_file):

def parse_gtf_attr_line(attr_line):
"""parse a GTF attributes line"""
if len(attr_line) == 0:
return {}
attr_line = [x.split(' ') for x in re.split('; +', attr_line.replace('"', ''))]
attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(';', '')
return dict(attr_line)


def parse_gff_attr_line(attr_line):
"""parse a GFF attributes line"""
if len(attr_line) == 0:
return {}
attr_line = [x.split('=') for x in re.split('; *', attr_line.replace('"', ''))]
attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(';', '')
return dict(attr_line)
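With the empty-string guards, records whose attribute column is empty now parse to an empty dict instead of raising. A quick usage sketch (assumes CAT's package layout):

    from tools.misc import parse_gff_attr_line, parse_gtf_attr_line

    assert parse_gff_attr_line('') == {}
    assert parse_gtf_attr_line('') == {}
    assert parse_gff_attr_line('ID=tx1;Parent=gene1') == {'ID': 'tx1', 'Parent': 'gene1'}
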
11 changes: 9 additions & 2 deletions tools/procOps.py
@@ -26,7 +26,10 @@ def cmdLists(cmd):
else:
return getDockerCommand('quay.io/ucsc_cgl/cat', cmd)
elif os.environ.get('CAT_BINARY_MODE') == 'singularity':
img = os.path.join(os.environ['SINGULARITY_PULLFOLDER'], 'cat.img')
if os.environ.get('SINGULARITY_IMAGE'):
img = os.environ['SINGULARITY_IMAGE']
else:
img = os.path.join(os.environ['SINGULARITY_PULLFOLDER'], 'cat.img')
assert os.path.exists(img)
if isinstance(cmd[0], list):
return list([get_singularity_command(img, c) for c in cmd])
@@ -107,7 +110,7 @@ def popen_catch(command, stdin=None):
def mrca_path(path1, path2):
"""
Gives the Most Recent Common Ancestor directory that contains both paths.
>>> mrca_path('/usr/lib/python2.7', '/usr/bin/python')
'/usr'
>>> mrca_path('/usr/', '/usr/')
@@ -165,6 +168,10 @@ def getDockerCommand(image, cmd):
cmd: list of arguments
"""
dockerPreamble = ['docker', 'run', '-i', '--rm', '-u', "%s:%s" % (os.getuid(), os.getgid())]
if "TMPDIR" in os.environ:
tmpdir = os.environ["TMPDIR"]
dockerPreamble.extend(["--env", "TMPDIR={}".format(tmpdir)])
dockerPreamble.extend(['-v', tmpdir + ':' + tmpdir])
work_dirs = []
for i, arg in enumerate(cmd):
if arg.startswith('-') and '=' in arg:
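Forwarding TMPDIR exports the variable into the container and bind-mounts the directory at the same path, so temporary files written by the wrapped tool land on the host's temp volume. A small sketch of the effect (the directory is illustrative):

    import os

    os.environ['TMPDIR'] = '/scratch/tmp'   # hypothetical host temp directory
    # getDockerCommand(...) now adds, alongside the usual flags:
    #   --env TMPDIR=/scratch/tmp   -v /scratch/tmp:/scratch/tmp
    # so the container sees the same TMPDIR path as the host.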
