
Commit

ifiddes committed Jun 27, 2020
2 parents ff86398 + 8de91a8 commit b1e400f
Showing 8 changed files with 69 additions and 30 deletions.
6 changes: 3 additions & 3 deletions Dockerfile
@@ -9,7 +9,7 @@ RUN git clone git://github.com/samtools/htslib.git
RUN cd htslib && make install

# bcftools
RUN git clone git://github.com/samtools/bcftools.git
RUN cd bcftools && make

# samtools
@@ -33,7 +33,7 @@ RUN cd augustus && make
# HDF5
RUN wget -q http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.1/src/hdf5-1.10.1.tar.gz
RUN tar xzf hdf5-1.10.1.tar.gz
RUN cd hdf5-1.10.1 && ./configure --enable-cxx --prefix=/usr
RUN cd hdf5-1.10.1 && make && make install

# sonLib
@@ -62,7 +62,7 @@ RUN tar xvjf sambamba_v0.6.7_linux.tar.bz2

FROM ubuntu:18.04
RUN apt-get update
RUN apt-get install -y wget bedtools bamtools samtools sqlite3 libgsl0-dev libcolamd2 software-properties-common libcurl4-openssl-dev
RUN apt-get install -y wget bedtools bamtools samtools sqlite3 libgsl0-dev libcolamd2 software-properties-common libcurl4-openssl-dev exonerate
RUN add-apt-repository -y ppa:deadsnakes/ppa
RUN apt-get install -y python3.7 python3-pip
# Kent
2 changes: 2 additions & 0 deletions cat/__init__.py
@@ -398,6 +398,8 @@ def load_docker(self):
'Either install it or use a different option for --binary-mode.')
os.environ['SINGULARITY_PULLFOLDER'] = os.path.abspath(self.work_dir)
os.environ['SINGULARITY_CACHEDIR'] = os.path.abspath(self.work_dir)
if os.environ.get('SINGULARITY_IMAGE'):
return
tools.fileOps.ensure_dir(self.work_dir)
if not os.path.isfile(os.path.join(self.work_dir, 'cat.img')):
subprocess.check_call(['singularity', 'pull', '--name', 'cat.img',
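With this change, setting SINGULARITY_IMAGE makes load_docker() return before the singularity pull step, so a pre-built image can be reused across runs. A minimal sketch of driving the short-circuit (the image path is illustrative, not part of the commit):

    import os

    os.environ['CAT_BINARY_MODE'] = 'singularity'
    os.environ['SINGULARITY_IMAGE'] = '/shared/images/cat.img'  # hypothetical pre-pulled image
    # load_docker() now skips the pull; procOps.cmdLists() wraps commands with
    # this image instead of the cat.img it would otherwise pull into work_dir.
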
27 changes: 16 additions & 11 deletions cat/consensus.py
@@ -648,9 +648,11 @@ def add_exref_ids(s):
'source_gene_common_name': s.CommonName}

# bring in extra tags for exRef
if 'exRef' in denovo_tx_modes:
for key, val in tools.misc.parse_gff_attr_line(exref_annot.loc[aln_id].ExtraTags).items():
denovo_tx_dict[aln_id][key] = val
if tools.nameConversions.aln_id_is_exref(aln_id):
tags = exref_annot.loc[aln_id].ExtraTags
if len(tags) > 0:
for key, val in tools.misc.parse_gff_attr_line(tags).items():
denovo_tx_dict[aln_id][key] = val

# record some metrics
metrics['denovo'][tx_mode][s.TranscriptClass.replace('_', ' ').capitalize()] += 1
@@ -907,12 +909,15 @@ def convert_attrs(attrs, id_field):
else:
attrs['Name'] = attrs['gene_id']
# convert empty strings into nan
attrs = {x: y if y != '' else 'nan' for x, y in attrs.items()}
# don't include the support vectors in the string, they will be placed in their respective places
attrs_str = ['='.join([key, str(val).replace('=', '_')]) for key, val in sorted(attrs.items()) if 'support' not in key]
# explicitly escape any semicolons that may exist in the input strings
attrs_str = [x.replace(';', '%3B') for x in attrs_str]
return score, ';'.join(attrs_str)
attrs_str = []
for key, val in attrs.items():
val = str(val)
if len(val) == 0:
val = 'nan'
val = str(val).replace('=', '%3D').replace(';', '%3B')
key = key.replace('=', '%3D').replace(';', '%3B')
attrs_str.append(f"{key}={val}")
return score, ";".join(attrs_str)

def find_feature_support(attrs, feature, i):
"""Extracts the boolean value from the comma delimited string"""
@@ -929,7 +934,7 @@ def find_all_tx_modes(attrs_list):
for attrs in attrs_list:
tx_modes.update(attrs['transcript_modes'].split(','))
return ','.join(tx_modes)

intervals = set()
for tx in tx_objs:
intervals.update(tx.exon_intervals)
@@ -1025,4 +1030,4 @@ def write_consensus_fastas(consensus_gene_dict, consensus_fasta, consensus_prote
for tx_obj, attrs in tx_list:
tools.bio.write_fasta(cfa, attrs['transcript_id'], tx_obj.get_mrna(seq_dict))
if tx_obj.cds_size > 0:
tools.bio.write_fasta(cpfa, attrs['transcript_id'], tx_obj.get_protein_sequence(seq_dict))
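The rewritten convert_attrs loop maps empty values to 'nan' and percent-encodes '=' and ';' in both keys and values, keeping GFF3 column 9 parseable. A standalone sketch of the same escaping pattern (an illustration, not CAT's actual helper):

    def encode_gff3_attrs(attrs):
        # Join a dict into a GFF3 attribute string, escaping '=' and ';'.
        parts = []
        for key, val in sorted(attrs.items()):
            val = str(val)
            if len(val) == 0:
                val = 'nan'
            val = val.replace('=', '%3D').replace(';', '%3B')
            key = key.replace('=', '%3D').replace(';', '%3B')
            parts.append(f"{key}={val}")
        return ';'.join(parts)

    # encode_gff3_attrs({'gene_id': 'G1', 'note': 'a;b=c'}) == 'gene_id=G1;note=a%3Bb%3Dc'
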
2 changes: 1 addition & 1 deletion cat/hints_db.py
@@ -307,7 +307,7 @@ def run_protein_aln(job, protein_subset, genome_fasta_file_id):
# perform alignment
tmp_exonerate = tools.fileOps.get_tmp_toil_file()
cmd = ['exonerate', '--model', 'protein2genome', '--showvulgar', 'no', '--showalignment', 'no',
'--showquerygff', 'yes', genome_fasta, protein_fasta]
'--showquerygff', 'yes', protein_fasta, genome_fasta]
tools.procOps.run_proc(cmd, stdout=tmp_exonerate)
return job.fileStore.writeGlobalFile(tmp_exonerate)

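The argument swap matters because exonerate takes its positional arguments as <query> <target>: with --model protein2genome the protein FASTA is the query and the genome FASTA is the target, so the previous order aligned in the wrong direction. A hedged sketch of the corrected invocation outside of toil (file names are placeholders):

    import subprocess

    protein_fasta = 'proteins.fa'   # query: protein sequences
    genome_fasta = 'genome.fa'      # target: genome sequence
    cmd = ['exonerate', '--model', 'protein2genome', '--showvulgar', 'no',
           '--showalignment', 'no', '--showquerygff', 'yes',
           protein_fasta, genome_fasta]   # query before target
    with open('protein_hints.gff', 'w') as out:
        subprocess.check_call(cmd, stdout=out)
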
22 changes: 17 additions & 5 deletions logging.cfg
@@ -1,21 +1,33 @@
[loggers]
keys=root
keys=root,luigi,cat

[handlers]
keys=consoleHandler

[formatters]
keys=simpleFormatter
keys=consoleFormatter

[logger_root]
level=WARNING
handlers=consoleHandler

[logger_luigi]
level=INFO
handlers=consoleHandler
qualname=luigi-interface
propagate=0

[logger_cat]
level=INFO
handlers=consoleHandler
qualname=cat
propagate=0

[handler_consoleHandler]
class=StreamHandler
level=INFO
formatter=simpleFormatter
formatter=consoleFormatter
args=(sys.stdout,)

[formatter_simpleFormatter]
format=%(levelname)s: %(message)s
[formatter_consoleFormatter]
format=%(levelname)s: %(asctime)-15s - %(message)s
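The new luigi and cat loggers only take effect when this file is loaded through fileConfig; a minimal sketch of loading it (the path is illustrative):

    import logging
    import logging.config

    logging.config.fileConfig('logging.cfg', disable_existing_loggers=False)
    logging.getLogger('cat').info('INFO message with a timestamp')
    logging.getLogger('luigi-interface').info('luigi output is routed the same way')
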
25 changes: 17 additions & 8 deletions tools/gff3.py
@@ -7,6 +7,8 @@

reserved_keys = ['gene_biotype',
'transcript_biotype',
'gene_type',
'transcript_type',
'gene_name',
'gene_id',
'transcript_id',
@@ -29,17 +31,23 @@ def parse_attrs(attrs):
results = []
for tx_id, gene_id in tx_name_map.items():
d = attrs_dict[tx_id]
gene_biotype = d['gene_biotype']
tx_biotype = d['transcript_biotype']
gene_biotype = d.get('gene_biotype', d.get('gene_type', None))
if gene_biotype is None:
raise Exception("Did not find a gene biotype or gene type for {} (attrs={})".format(gene_id, d))
tx_biotype = d.get('transcript_biotype', d.get('transcript_type', None))
if tx_biotype is None:
raise Exception("Did not find a transcript biotype or type for {} (attrs={})".format(tx_id, d))
gene_name = d['gene_name']
gene_id = d['gene_id']
tx_id = d['transcript_id']
tx_name = d['transcript_name']
extra_tags = ';'.join(['{}={}'.format(x, y.replace(';', '%3B')) for x, y in d.items() if x not in reserved_keys])
try:
misc.parse_gff_attr_line(extra_tags)
except:
raise Exception('Error parsing extra tags in input GFF3')
extra_tags = ';'.join(['{}={}'.format(x, y.replace(';', '%3B').replace('=', '%3D'))
for x, y in d.items() if x not in reserved_keys])
if len(extra_tags) > 0:
try:
misc.parse_gff_attr_line(extra_tags)
except:
raise Exception(f'Error parsing extra tags in input GFF3 {extra_tags}')
if is_external_reference is True:
# hack to fix names
gene_id = f'exRef-{gene_id}'
@@ -53,5 +61,6 @@ def parse_attrs(attrs):

def convert_gff3_cmd(annotation_attrs, annotation):
cmd = ['gff3ToGenePred', '-rnaNameAttr=transcript_id', '-geneNameAttr=gene_id', '-honorStartStopCodons',
'-refseqHacks',
'-attrsOut={}'.format(annotation_attrs), annotation, '/dev/stdout']
return cmd
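The biotype lookup in parse_attrs now accepts both the Ensembl-style *_biotype keys and the GENCODE-style *_type keys. A standalone sketch of the fallback, using a hypothetical attribute dict:

    d = {'gene_type': 'protein_coding', 'transcript_type': 'protein_coding'}  # GENCODE-style attrs

    gene_biotype = d.get('gene_biotype', d.get('gene_type', None))
    if gene_biotype is None:
        raise Exception('Did not find a gene biotype or gene type')
    assert gene_biotype == 'protein_coding'
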
4 changes: 4 additions & 0 deletions tools/misc.py
@@ -102,13 +102,17 @@ def sort_gff(input_file, output_file):

def parse_gtf_attr_line(attr_line):
"""parse a GTF attributes line"""
if len(attr_line) == 0:
return {}
attr_line = [x.split(' ') for x in re.split('; +', attr_line.replace('"', ''))]
attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(';', '')
return dict(attr_line)


def parse_gff_attr_line(attr_line):
"""parse a GFF attributes line"""
if len(attr_line) == 0:
return {}
attr_line = [x.split('=') for x in re.split('; *', attr_line.replace('"', ''))]
attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(';', '')
return dict(attr_line)
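With the empty-string guards, records whose attribute column is empty now parse to an empty dict instead of raising. A quick usage sketch (assumes CAT's package layout):

    from tools.misc import parse_gff_attr_line, parse_gtf_attr_line

    assert parse_gff_attr_line('') == {}
    assert parse_gtf_attr_line('') == {}
    assert parse_gff_attr_line('ID=tx1;Parent=gene1') == {'ID': 'tx1', 'Parent': 'gene1'}
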
11 changes: 9 additions & 2 deletions tools/procOps.py
@@ -26,7 +26,10 @@ def cmdLists(cmd):
else:
return getDockerCommand('quay.io/ucsc_cgl/cat', cmd)
elif os.environ.get('CAT_BINARY_MODE') == 'singularity':
img = os.path.join(os.environ['SINGULARITY_PULLFOLDER'], 'cat.img')
if os.environ.get('SINGULARITY_IMAGE'):
img = os.environ['SINGULARITY_IMAGE']
else:
img = os.path.join(os.environ['SINGULARITY_PULLFOLDER'], 'cat.img')
assert os.path.exists(img)
if isinstance(cmd[0], list):
return list([get_singularity_command(img, c) for c in cmd])
@@ -107,7 +110,7 @@ def popen_catch(command, stdin=None):
def mrca_path(path1, path2):
"""
Gives the Most Recent Common Ancestor directory that contains both paths.
>>> mrca_path('/usr/lib/python2.7', '/usr/bin/python')
'/usr'
>>> mrca_path('/usr/', '/usr/')
@@ -165,6 +168,10 @@ def getDockerCommand(image, cmd):
cmd: list of arguments
"""
dockerPreamble = ['docker', 'run', '-i', '--rm', '-u', "%s:%s" % (os.getuid(), os.getgid())]
if "TMPDIR" in os.environ:
tmpdir = os.environ["TMPDIR"]
dockerPreamble.extend(["--env", "TMPDIR={}".format(tmpdir)])
dockerPreamble.extend(['-v', tmpdir + ':' + tmpdir])
work_dirs = []
for i, arg in enumerate(cmd):
if arg.startswith('-') and '=' in arg:
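Forwarding TMPDIR exports the variable into the container and bind-mounts the directory at the same path, so temporary files written by the wrapped tool land on the host's temp volume. A small sketch of the effect (the directory is illustrative):

    import os

    os.environ['TMPDIR'] = '/scratch/tmp'   # hypothetical host temp directory
    # getDockerCommand(...) now adds, alongside the usual flags:
    #   --env TMPDIR=/scratch/tmp   -v /scratch/tmp:/scratch/tmp
    # so the container sees the same TMPDIR path as the host.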
