diff --git a/.gitignore b/.gitignore index d7bb263..69bba4c 100644 --- a/.gitignore +++ b/.gitignore @@ -173,3 +173,5 @@ data_tables/ .vscode/ *.tsv *.png +*.parq +*.ipynb diff --git a/README.md b/README.md index e40ef42..51eb71c 100644 --- a/README.md +++ b/README.md @@ -65,11 +65,11 @@ Probe filtering workflow: * [x] Add spacing option (length +- nt in optimization step) * [x] Remove gene of interest when using rna-seq data * [x] Save alignment and kmer scores to csv output + * [x] Add GTF file reading & saving as parquet/csv step + * [x] Add EnsembleID support in entrez query + * [x] Verify bowtie parameters run on endo and exo targets * [ ] Allow selection of isoform in entrez query - * [ ] Add EnsembleID support in entrez query * [ ] Select intron/exon based on blast from sequence - * [ ] Verify bowtie parameters run on endo and exo targets - * [ ] Add GTF file reading & saving as parquet/csv step * **Interface** * [x] Clean up output files @@ -77,7 +77,8 @@ Probe filtering workflow: * [x] Decide on way to handle temporary files (tempdir?) * [x] Find way to handle rerunning same gene with different parameters (unique name hash?) * [x] Find way to make CLI alter config (luigi.configuration.add_config_path) - * [ ] Use final output file as indicator if pipeline finished running + * [x] Use final output file as indicator if pipeline finished running + * [x] Proper boolean support :monkey: * **Testing** * [ ] Add unit tests for core components diff --git a/src/prepare_sequence.py b/src/prepare_sequence.py index 55419ff..72e6fe1 100644 --- a/src/prepare_sequence.py +++ b/src/prepare_sequence.py @@ -3,11 +3,12 @@ Select the right strand and select intronic/exonic regions. """ -import os import logging +import os +import subprocess -import luigi import Bio.SeqIO +import luigi from config import GeneralConfig from config import SequenceConfig @@ -24,26 +25,34 @@ def output(self): return luigi.LocalTarget(os.path.join(util.get_output_dir(), fname)) def run(self): - if not SequenceConfig().gene_name or not SequenceConfig().organism_name: + has_ensembl = SequenceConfig().ensemble_id + has_gene_and_organism = ( + SequenceConfig().gene_name and SequenceConfig().organism_name + ) + if not has_ensembl and not has_gene_and_organism: raise ValueError( "For downloading Entrez Gene Probes, " - " you need to specify the gene name and organism name." + " you need to specify the gene name and organism name or provide an Emsembl ID." ) - fasta = os.popen( - f"esearch\ - -db gene\ - -query '{SequenceConfig().gene_name} [GENE]\ - {SequenceConfig().organism_name} [ORGN]' |" - " elink\ - -db gene\ - -target nuccore\ - -name gene_nuccore_refseqrna |" - " efetch -format fasta" - ).read() - self.logger.debug( - f"Fetched Entrez Gene Probes for {SequenceConfig().gene_name} in {SequenceConfig().organism_name}." - ) + if has_ensembl: + query = SequenceConfig().ensemble_id + else: + query = f"{SequenceConfig().gene_name} [GENE] {SequenceConfig().organism_name} [ORGN]" + args_search = ["esearch", "-db", "gene", "-query", query] + args_link = [ + "elink", + "-db", + "gene", + "-target", + "nuccore", + "-name", + "gene_nuccore_refseqrna", + ] + args_fetch = ["efetch", "-format", "fasta"] + args_entrez = [*args_search, "|", *args_link, "|", *args_fetch] + self.logger.debug(f"Fetching from Entrez using query '{query}'.") + fasta = subprocess.check_output(args_entrez, stderr=subprocess.STDOUT).decode() if not fasta: raise LookupError( @@ -64,12 +73,16 @@ def output(self): ] def run(self): - os.system( - f"makeblastdb\ - -dbtype nucl\ - -in {GeneralConfig().reference_genome}\ - -out {util.get_genome_name()}" - ) + args_blast = [ + "makeblastdb", + "-dbtype", + "nucl", + "-in", + GeneralConfig().reference_genome, + "-out", + util.get_genome_name(), + ] + subprocess.check_call(args_blast) class PrepareSequence(luigi.Task):