diff --git a/Dockerfile b/Dockerfile index f9d93d47..d043be0e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ RUN git clone git://github.com/samtools/htslib.git RUN cd htslib && make install # bcftools -RUN git clone git://github.com/samtools/bcftools.git +RUN git clone git://github.com/samtools/bcftools.git RUN cd bcftools && make # samtools @@ -33,7 +33,7 @@ RUN cd augustus && make # HDF5 RUN wget -q http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.1/src/hdf5-1.10.1.tar.gz RUN tar xzf hdf5-1.10.1.tar.gz -RUN cd hdf5-1.10.1 && ./configure --enable-cxx --prefix=/usr +RUN cd hdf5-1.10.1 && ./configure --enable-cxx --prefix=/usr RUN cd hdf5-1.10.1 && make && make install # sonLib @@ -62,7 +62,7 @@ RUN tar xvjf sambamba_v0.6.7_linux.tar.bz2 FROM ubuntu:18.04 RUN apt-get update -RUN apt-get install -y wget bedtools bamtools samtools sqlite3 libgsl0-dev libcolamd2 software-properties-common libcurl4-openssl-dev +RUN apt-get install -y wget bedtools bamtools samtools sqlite3 libgsl0-dev libcolamd2 software-properties-common libcurl4-openssl-dev exonerate RUN add-apt-repository -y ppa:deadsnakes/ppa RUN apt-get install -y python3.7 python3-pip # Kent diff --git a/cat/__init__.py b/cat/__init__.py index b6b02b36..51e0b88d 100644 --- a/cat/__init__.py +++ b/cat/__init__.py @@ -398,6 +398,8 @@ def load_docker(self): 'Either install it or use a different option for --binary-mode.') os.environ['SINGULARITY_PULLFOLDER'] = os.path.abspath(self.work_dir) os.environ['SINGULARITY_CACHEDIR'] = os.path.abspath(self.work_dir) + if os.environ.get('SINGULARITY_IMAGE'): + return tools.fileOps.ensure_dir(self.work_dir) if not os.path.isfile(os.path.join(self.work_dir, 'cat.img')): subprocess.check_call(['singularity', 'pull', '--name', 'cat.img', diff --git a/cat/consensus.py b/cat/consensus.py index e5443880..39c3804b 100644 --- a/cat/consensus.py +++ b/cat/consensus.py @@ -648,9 +648,11 @@ def add_exref_ids(s): 'source_gene_common_name': s.CommonName} # bring in extra tags for exRef - if 'exRef' in denovo_tx_modes: - for key, val in tools.misc.parse_gff_attr_line(exref_annot.loc[aln_id].ExtraTags).items(): - denovo_tx_dict[aln_id][key] = val + if tools.nameConversions.aln_id_is_exref(aln_id): + tags = exref_annot.loc[aln_id].ExtraTags + if len(tags) > 0: + for key, val in tools.misc.parse_gff_attr_line(tags).items(): + denovo_tx_dict[aln_id][key] = val # record some metrics metrics['denovo'][tx_mode][s.TranscriptClass.replace('_', ' ').capitalize()] += 1 @@ -907,12 +909,15 @@ def convert_attrs(attrs, id_field): else: attrs['Name'] = attrs['gene_id'] # convert empty strings into nan - attrs = {x: y if y != '' else 'nan' for x, y in attrs.items()} - # don't include the support vectors in the string, they will be placed in their respective places - attrs_str = ['='.join([key, str(val).replace('=', '_')]) for key, val in sorted(attrs.items()) if 'support' not in key] - # explicitly escape any semicolons that may exist in the input strings - attrs_str = [x.replace(';', '%3B') for x in attrs_str] - return score, ';'.join(attrs_str) + attrs_str = [] + for key, val in attrs.items(): + val = str(val) + if len(val) == 0: + val = 'nan' + val = str(val).replace('=', '%3D').replace(';', '%3B') + key = key.replace('=', '%3D').replace(';', '%3B') + attrs_str.append(f"{key}={val}") + return score, ";".join(attrs_str) def find_feature_support(attrs, feature, i): """Extracts the boolean value from the comma delimited string""" @@ -929,7 +934,7 @@ def find_all_tx_modes(attrs_list): for attrs in attrs_list: tx_modes.update(attrs['transcript_modes'].split(',')) return ','.join(tx_modes) - + intervals = set() for tx in tx_objs: intervals.update(tx.exon_intervals) @@ -1025,4 +1030,4 @@ def write_consensus_fastas(consensus_gene_dict, consensus_fasta, consensus_prote for tx_obj, attrs in tx_list: tools.bio.write_fasta(cfa, attrs['transcript_id'], tx_obj.get_mrna(seq_dict)) if tx_obj.cds_size > 0: - tools.bio.write_fasta(cpfa, attrs['transcript_id'], tx_obj.get_protein_sequence(seq_dict)) + tools.bio.write_fasta(cpfa, attrs['transcript_id'], tx_obj.get_protein_sequence(seq_dict)) \ No newline at end of file diff --git a/cat/hints_db.py b/cat/hints_db.py index 8b73cfc0..3c049c94 100644 --- a/cat/hints_db.py +++ b/cat/hints_db.py @@ -307,7 +307,7 @@ def run_protein_aln(job, protein_subset, genome_fasta_file_id): # perform alignment tmp_exonerate = tools.fileOps.get_tmp_toil_file() cmd = ['exonerate', '--model', 'protein2genome', '--showvulgar', 'no', '--showalignment', 'no', - '--showquerygff', 'yes', genome_fasta, protein_fasta] + '--showquerygff', 'yes', protein_fasta, genome_fasta] tools.procOps.run_proc(cmd, stdout=tmp_exonerate) return job.fileStore.writeGlobalFile(tmp_exonerate) diff --git a/logging.cfg b/logging.cfg index 304ec597..9cb5d68f 100644 --- a/logging.cfg +++ b/logging.cfg @@ -1,21 +1,33 @@ [loggers] -keys=root +keys=root,luigi,cat [handlers] keys=consoleHandler [formatters] -keys=simpleFormatter +keys=consoleFormatter [logger_root] +level=WARNING +handlers=consoleHandler + +[logger_luigi] +level=INFO +handlers=consoleHandler +qualname=luigi-interface +propagate=0 + +[logger_cat] level=INFO handlers=consoleHandler +qualname=cat +propagate=0 [handler_consoleHandler] class=StreamHandler level=INFO -formatter=simpleFormatter +formatter=consoleFormatter args=(sys.stdout,) -[formatter_simpleFormatter] -format=%(levelname)s: %(message)s \ No newline at end of file +[formatter_consoleFormatter] +format=%(levelname)s: %(asctime)-15s - %(message)s \ No newline at end of file diff --git a/tools/gff3.py b/tools/gff3.py index a424af6f..64b0a6be 100644 --- a/tools/gff3.py +++ b/tools/gff3.py @@ -7,6 +7,8 @@ reserved_keys = ['gene_biotype', 'transcript_biotype', + 'gene_type', + 'transcript_type', 'gene_name', 'gene_id', 'transcript_id', @@ -29,17 +31,23 @@ def parse_attrs(attrs): results = [] for tx_id, gene_id in tx_name_map.items(): d = attrs_dict[tx_id] - gene_biotype = d['gene_biotype'] - tx_biotype = d['transcript_biotype'] + gene_biotype = d.get('gene_biotype', d.get('gene_type', None)) + if gene_biotype is None: + raise Exception("Did not find a gene biotype or gene type for {} (attrs={})".format(gene_id, d)) + tx_biotype = d.get('transcript_biotype', d.get('transcript_type', None)) + if tx_biotype is None: + raise Exception("Did not find a transcript biotype or type for {} (attrs={})".format(tx_id, d)) gene_name = d['gene_name'] gene_id = d['gene_id'] tx_id = d['transcript_id'] tx_name = d['transcript_name'] - extra_tags = ';'.join(['{}={}'.format(x, y.replace(';', '%3B')) for x, y in d.items() if x not in reserved_keys]) - try: - misc.parse_gff_attr_line(extra_tags) - except: - raise Exception('Error parsing extra tags in input GFF3') + extra_tags = ';'.join(['{}={}'.format(x, y.replace(';', '%3B').replace('=', '%3D')) + for x, y in d.items() if x not in reserved_keys]) + if len(extra_tags) > 0: + try: + misc.parse_gff_attr_line(extra_tags) + except: + raise Exception(f'Error parsing extra tags in input GFF3 {extra_tags}') if is_external_reference is True: # hack to fix names gene_id = f'exRef-{gene_id}' @@ -53,5 +61,6 @@ def parse_attrs(attrs): def convert_gff3_cmd(annotation_attrs, annotation): cmd = ['gff3ToGenePred', '-rnaNameAttr=transcript_id', '-geneNameAttr=gene_id', '-honorStartStopCodons', + '-refseqHacks', '-attrsOut={}'.format(annotation_attrs), annotation, '/dev/stdout'] - return cmd + return cmd \ No newline at end of file diff --git a/tools/misc.py b/tools/misc.py index 668a57df..1f3d5f73 100644 --- a/tools/misc.py +++ b/tools/misc.py @@ -102,6 +102,8 @@ def sort_gff(input_file, output_file): def parse_gtf_attr_line(attr_line): """parse a GTF attributes line""" + if len(attr_line) == 0: + return {} attr_line = [x.split(' ') for x in re.split('; +', attr_line.replace('"', ''))] attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(';', '') return dict(attr_line) @@ -109,6 +111,8 @@ def parse_gtf_attr_line(attr_line): def parse_gff_attr_line(attr_line): """parse a GFF attributes line""" + if len(attr_line) == 0: + return {} attr_line = [x.split('=') for x in re.split('; *', attr_line.replace('"', ''))] attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(';', '') return dict(attr_line) diff --git a/tools/procOps.py b/tools/procOps.py index 9d83b490..af75be52 100644 --- a/tools/procOps.py +++ b/tools/procOps.py @@ -26,7 +26,10 @@ def cmdLists(cmd): else: return getDockerCommand('quay.io/ucsc_cgl/cat', cmd) elif os.environ.get('CAT_BINARY_MODE') == 'singularity': - img = os.path.join(os.environ['SINGULARITY_PULLFOLDER'], 'cat.img') + if os.environ.get('SINGULARITY_IMAGE'): + img = os.environ['SINGULARITY_IMAGE'] + else: + img = os.path.join(os.environ['SINGULARITY_PULLFOLDER'], 'cat.img') assert os.path.exists(img) if isinstance(cmd[0], list): return list([get_singularity_command(img, c) for c in cmd]) @@ -107,7 +110,7 @@ def popen_catch(command, stdin=None): def mrca_path(path1, path2): """ Gives the Most Recent Common Ancestor directory that contains both paths. - + >>> mrca_path('/usr/lib/python2.7', '/usr/bin/python') '/usr' >>> mrca_path('/usr/', '/usr/') @@ -165,6 +168,10 @@ def getDockerCommand(image, cmd): cmd: list of arguments """ dockerPreamble = ['docker', 'run', '-i', '--rm', '-u', "%s:%s" % (os.getuid(), os.getgid())] + if "TMPDIR" in os.environ: + tmpdir = os.environ["TMPDIR"] + dockerPreamble.extend(["--env", "TMPDIR={}".format(tmpdir)]) + dockerPreamble.extend(['-v', tmpdir + ':' + tmpdir]) work_dirs = [] for i, arg in enumerate(cmd): if arg.startswith('-') and '=' in arg: