diff --git a/README.md b/README.md index 51eb71c..95058c9 100644 --- a/README.md +++ b/README.md @@ -68,8 +68,8 @@ Probe filtering workflow: * [x] Add GTF file reading & saving as parquet/csv step * [x] Add EnsembleID support in entrez query * [x] Verify bowtie parameters run on endo and exo targets - * [ ] Allow selection of isoform in entrez query - * [ ] Select intron/exon based on blast from sequence + * [x] Select intron/exon based on blast from sequence - doesn't make sense given all different isoforms and variations + * [x] Allow selection of isoform in entrez query - not available / different transcript id's yield same sequence * **Interface** * [x] Clean up output files diff --git a/efishent/__init__.py b/efishent/__init__.py new file mode 100644 index 0000000..7a9c708 --- /dev/null +++ b/efishent/__init__.py @@ -0,0 +1,21 @@ +"""eFISHent""" + +__version__ = "0.0.1" + +import logging + +# Silence luigi-interface +logging.getLogger("luigi-interface").setLevel(level=logging.CRITICAL) + +from . import alignment +from . import basic_filtering +from . import cleanup +from . import cli +from . import config +from . import constants +from . import generate_probes +from . import kmers +from . import optimization +from . import prepare_sequence +from . import secondary_structure +from . import util diff --git a/src/__main__.py b/efishent/__main__.py similarity index 79% rename from src/__main__.py rename to efishent/__main__.py index 28c968a..31fc276 100644 --- a/src/__main__.py +++ b/efishent/__main__.py @@ -1,6 +1,6 @@ """Entrypoint module for the application.""" -from cli import main +from .cli import main if __name__ == "__main__": main() diff --git a/src/alignment.py b/efishent/alignment.py similarity index 97% rename from src/alignment.py rename to efishent/alignment.py index 9e0418a..e4f9d30 100644 --- a/src/alignment.py +++ b/efishent/alignment.py @@ -16,14 +16,14 @@ import numpy as np import pandas as pd -from config import GeneralConfig -from config import ProbeConfig -from config import SequenceConfig -from basic_filtering import BasicFiltering -from prepare_sequence import BuildBlastDatabase -from prepare_sequence import PrepareSequence -import constants -import util +from .config import GeneralConfig +from .config import ProbeConfig +from .config import SequenceConfig +from .basic_filtering import BasicFiltering +from .prepare_sequence import BuildBlastDatabase +from .prepare_sequence import PrepareSequence +from . import constants +from . import util class BuildBowtieIndex(luigi.Task): @@ -112,6 +112,17 @@ def output(self): ) return tasks + @staticmethod + def read_count_table() -> pd.DataFrame: + """Read and verify a RNAseq FPRKM count table.""" + df_counts = pd.read_csv(ProbeConfig().encode_count_table, sep="\t") + counts = df_counts[df_counts["gene_id"].str.contains("ENSG")].copy() + counts["clean_id"] = counts["gene_id"].apply(lambda x: x.split(".")[0]) + return counts[constants.COUNTS_COLUMNS] + + def read_gtf_file(self): + return pd.read_parquet(self.input()["gtf"].path)[constants.GTF_COLUMNS] + def align_probes(self, fname_fasta: str, fname_sam: str) -> None: """Align probes to the reference genome.""" # Convert fasta to fastq - bowtie doesn't return read names if not in fastq... @@ -162,17 +173,6 @@ def filter_unique_probes(self, fname_sam: str) -> pd.DataFrame: ) return df - @staticmethod - def read_count_table() -> pd.DataFrame: - """Read and verify a RNAseq FPRKM count table.""" - df_counts = pd.read_csv(ProbeConfig().encode_count_table, sep="\t") - counts = df_counts[df_counts["gene_id"].str.contains("ENSG")].copy() - counts["clean_id"] = counts["gene_id"].apply(lambda x: x.split(".")[0]) - return counts[constants.COUNTS_COLUMNS] - - def read_gtf_file(self): - return pd.read_parquet(self.input()["gtf"].path)[constants.GTF_COLUMNS] - def filter_gene_of_interest(self, df: pd.DataFrame) -> pd.DataFrame: """Filter FPKM table to exclude the gene of interest.""" # Filter using provided EnsembleID directly diff --git a/src/basic_filtering.py b/efishent/basic_filtering.py similarity index 94% rename from src/basic_filtering.py rename to efishent/basic_filtering.py index a165fc5..d9d9bbd 100644 --- a/src/basic_filtering.py +++ b/efishent/basic_filtering.py @@ -10,10 +10,10 @@ import Bio.SeqUtils.MeltingTemp import luigi -from config import GeneralConfig -from config import ProbeConfig -from generate_probes import GenerateAllProbes -import util +from .config import GeneralConfig +from .config import ProbeConfig +from .generate_probes import GenerateAllProbes +from . import util def get_melting_temp(sequence: str) -> float: diff --git a/src/cleanup.py b/efishent/cleanup.py similarity index 88% rename from src/cleanup.py rename to efishent/cleanup.py index 16ea48b..62d2db6 100644 --- a/src/cleanup.py +++ b/efishent/cleanup.py @@ -15,17 +15,17 @@ import pandas as pd import Bio.SeqIO -from alignment import AlignProbeCandidates -from basic_filtering import get_gc_content -from basic_filtering import get_melting_temp -from config import ProbeConfig -from config import RunConfig -from config import SequenceConfig -from kmers import BuildJellyfishIndex -from kmers import get_max_kmer_count -from optimization import OptimizeProbeCoverage -from secondary_structure import get_free_energy -import util +from .alignment import AlignProbeCandidates +from .basic_filtering import get_gc_content +from .basic_filtering import get_melting_temp +from .config import ProbeConfig +from .config import RunConfig +from .config import SequenceConfig +from .kmers import BuildJellyfishIndex +from .kmers import get_max_kmer_count +from .optimization import OptimizeProbeCoverage +from .secondary_structure import get_free_energy +from . import util class CleanUpOutput(luigi.Task): diff --git a/src/cli.py b/efishent/cli.py similarity index 91% rename from src/cli.py rename to efishent/cli.py index 52087a0..4110fed 100644 --- a/src/cli.py +++ b/efishent/cli.py @@ -2,18 +2,19 @@ import configparser import logging import os +import sys import tempfile import luigi -from alignment import BuildBowtieIndex -from cleanup import CleanUpOutput -from config import GeneralConfig -from config import ProbeConfig -from config import RunConfig -from config import SequenceConfig -from kmers import BuildJellyfishIndex -from util import UniCode +from .alignment import BuildBowtieIndex +from .cleanup import CleanUpOutput +from .config import GeneralConfig +from .config import ProbeConfig +from .config import RunConfig +from .config import SequenceConfig +from .kmers import BuildJellyfishIndex +from .util import UniCode CONFIG_CLASSES = [GeneralConfig, RunConfig, SequenceConfig, ProbeConfig] @@ -110,6 +111,12 @@ def _parse_args() -> argparse.Namespace: ) _add_groups(parser) _add_utilities(parser) + try: + if len(sys.argv) == 1: + parser.print_help() + parser.exit(0) + except Exception as e: + print(e) return parser.parse_args() @@ -143,6 +150,7 @@ def set_logging_level(verbose: bool, debug: bool) -> logging.Logger: custom_level = logging.WARNING logging.getLogger("luigi").setLevel(luigi_level) + logging.getLogger("luigi-interface").setLevel(luigi_level) luigi.interface.core.log_level = luigi_level logger = logging.getLogger("custom-logger") diff --git a/src/config.py b/efishent/config.py similarity index 63% rename from src/config.py rename to efishent/config.py index 1ae1cc3..3c02c5e 100644 --- a/src/config.py +++ b/efishent/config.py @@ -10,7 +10,6 @@ - number of threads - method of optimization - optimization time limit - - verbosity Probe specific options: - (sequence file) or (gene name & organism) @@ -30,19 +29,19 @@ class GeneralConfig(luigi.Config): reference_genome = luigi.Parameter( - description="Path to reference genome fasta file.", + description="Path to reference genome fasta/fa file.", significant=True, - default=None, + default="", ) reference_annotation = luigi.Parameter( - description="Path to reference genome annotation file.", default=None + description="Path to reference genome annotation (gene transfer format / gtf) file.", + default="", ) - threads = luigi.IntParameter(description="Number of threads to use.", default=42) - verbosity = luigi.IntParameter(description="Verbosity level.", default=1) + threads = luigi.IntParameter(description="Number of threads to launch.", default=42) output_dir = luigi.Parameter( description="Path to output directory. " "If not specified, will use the current directory.", - default=None, + default="", ) @@ -54,7 +53,11 @@ class RunConfig(luigi.Config): description="Save intermediate files?", default=False ) optimization_method = luigi.Parameter( - description="Optimization method to use [options: optimal, greedy].", + description="Optimization method to use. " + "Greedy will procedurally select the next longest probe. " + "Optimal will optimize probes for maximum gene coverage - " + "slow if many overlaps are present (typically with non-stringent parameter settings). " + "[options: optimal, greedy].", default="greedy", ) optimization_time_limit = luigi.IntParameter( @@ -66,22 +69,21 @@ class RunConfig(luigi.Config): class SequenceConfig(luigi.Config): sequence_file = luigi.Parameter( - description="Path to the gene's sequence file.", default="" + description="Path to the gene's sequence file. " + "Use this if the sequence can't be easily downloaded or if only certain regions should be targeted.", + default="", ) ensemble_id = luigi.Parameter( description="Ensembl ID of the gene of interest." "Can be used instead of gene and organism name to download the gene of interest." - "Used to filter out the gene of interest from FPKM based filtering.", + "Used to filter out the gene of interest from FPKM based filtering - " + "will do an automated blast-based filtering if not passed.", default="", ) gene_name = luigi.Parameter(description="Gene name.", default="") organism_name = luigi.Parameter( description="Latin name of the organism.", default="" ) - is_intronic = luigi.BoolParameter( - description="Is the probe intronic?", default=False - ) - is_exonic = luigi.BoolParameter(description="Is the probe exonic?", default=True) is_plus_strand = luigi.BoolParameter( description="Is the probe targeting the plus strand?", default=True ) @@ -91,19 +93,30 @@ class SequenceConfig(luigi.Config): class ProbeConfig(luigi.Config): - min_length = luigi.IntParameter(description="Minimum probe length.", default=21) - max_length = luigi.IntParameter(description="Maximum probe length.", default=25) + min_length = luigi.IntParameter( + description="Minimum probe length in nucleotides.", default=21 + ) + max_length = luigi.IntParameter( + description="Maximum probe length in nucleotides.", default=25 + ) spacing = luigi.IntParameter( description="Minimum distance in nucleotides between probes.", default=2 ) min_tm = luigi.FloatParameter( - description="Minimum melting temperature.", default=40.0 + description="Minimum melting temperature in degrees C. " + "Formamide and Na concentration will affect melting temperature!", + default=40.0, ) max_tm = luigi.FloatParameter( - description="Maximum melting temperature.", default=60.0 + description="Maximum melting temperature in degrees C (see minimum).", + default=60.0, + ) + min_gc = luigi.FloatParameter( + description="Minimum GC content in percentage.", default=20.0 + ) + max_gc = luigi.FloatParameter( + description="Maximum GC content in percentage.", default=80.0 ) - min_gc = luigi.FloatParameter(description="Minimum GC content.", default=20.0) - max_gc = luigi.FloatParameter(description="Maximum GC content.", default=80.0) formamide_concentration = luigi.FloatParameter( description="Formamide concentration as a percentage of the total buffer.", default=0.0, @@ -111,14 +124,16 @@ class ProbeConfig(luigi.Config): na_concentration = luigi.FloatParameter( description="Na concentration in mM.", default=390.0 ) - kmer_length = luigi.IntParameter( - description="Length of k-mer used to filter probe sequences.", default=15 - ) max_off_targets = luigi.IntParameter( - description="Maximum number of off-targets.", default=0 + description="Maximum number of off-targets binding anywhere BUT " + "the gene of interest in the genome.", + default=0, ) encode_count_table = luigi.Parameter( - description="Path to the ENCODE RNAseq count table.", default=None + description="Path to the ENCODE RNAseq count table. " + "Must contain the columns 'gene_id' and 'FPKM'. " + "'gene_id' must use ensembl ID's!", + default="", ) max_off_target_fpkm = luigi.FloatParameter( description=( @@ -128,9 +143,14 @@ class ProbeConfig(luigi.Config): ), default=10.0, ) + kmer_length = luigi.IntParameter( + description="Length of k-mer used to filter probe sequences.", default=15 + ) max_kmers = luigi.IntParameter( description="Highest count of sub-k-mers found in reference genome.", default=5 ) max_deltaG = luigi.FloatParameter( - description="Maximum deltaG in kcal/mol.", default=-10.0 + description="Maximum predicted deltaG in kcal/mol. " + "Note: deltaGs range from 0 (no secondary structures) to increasingly negative values!", + default=-10.0, ) diff --git a/src/constants.py b/efishent/constants.py similarity index 100% rename from src/constants.py rename to efishent/constants.py diff --git a/src/generate_probes.py b/efishent/generate_probes.py similarity index 92% rename from src/generate_probes.py rename to efishent/generate_probes.py index f13334a..32176a7 100644 --- a/src/generate_probes.py +++ b/efishent/generate_probes.py @@ -12,9 +12,9 @@ import Bio.SeqUtils.MeltingTemp import luigi -from config import ProbeConfig -from prepare_sequence import PrepareSequence -import util +from .config import ProbeConfig +from .prepare_sequence import PrepareSequence +from . import util class GenerateAllProbes(luigi.Task): @@ -48,4 +48,3 @@ def run(self): util.log_and_check_candidates(self.logger, "GenerateAllProbes", len(candidates)) Bio.SeqIO.write(candidates, self.output().path, "fasta") - diff --git a/src/kmers.py b/efishent/kmers.py similarity index 96% rename from src/kmers.py rename to efishent/kmers.py index 675bdab..f5b3763 100644 --- a/src/kmers.py +++ b/efishent/kmers.py @@ -12,10 +12,10 @@ import Bio.SeqRecord import luigi -from config import GeneralConfig -from config import ProbeConfig -from alignment import AlignProbeCandidates -import util +from .config import GeneralConfig +from .config import ProbeConfig +from .alignment import AlignProbeCandidates +from . import util def get_max_kmer_count(sequence: Bio.SeqRecord, jellyfish_path: str) -> int: diff --git a/src/luigi.cfg b/efishent/luigi.cfg similarity index 98% rename from src/luigi.cfg rename to efishent/luigi.cfg index 790c938..b817e48 100644 --- a/src/luigi.cfg +++ b/efishent/luigi.cfg @@ -2,7 +2,6 @@ reference_genome: ../dm6/dm6.fa reference_annotation: threads: 42 -verbosity: 1 output_dir: [RunConfig] diff --git a/src/optimization.py b/efishent/optimization.py similarity index 97% rename from src/optimization.py rename to efishent/optimization.py index 42a4590..f96e6c3 100644 --- a/src/optimization.py +++ b/efishent/optimization.py @@ -15,10 +15,10 @@ import pandas as pd import pyomo.environ as pe -from config import GeneralConfig, ProbeConfig -from config import RunConfig -from secondary_structure import SecondaryStructureFiltering -import util +from .config import GeneralConfig, ProbeConfig +from .config import RunConfig +from .secondary_structure import SecondaryStructureFiltering +from . import util class OptimizeProbeCoverage(luigi.Task): diff --git a/src/prepare_sequence.py b/efishent/prepare_sequence.py similarity index 85% rename from src/prepare_sequence.py rename to efishent/prepare_sequence.py index 72e6fe1..c0955dd 100644 --- a/src/prepare_sequence.py +++ b/efishent/prepare_sequence.py @@ -10,9 +10,9 @@ import Bio.SeqIO import luigi -from config import GeneralConfig -from config import SequenceConfig -import util +from .config import GeneralConfig +from .config import SequenceConfig +from . import util class DownloadEntrezGeneSequence(luigi.Task): @@ -66,6 +66,8 @@ def run(self): class BuildBlastDatabase(luigi.Task): + """Build local nucleotide blast database.""" + def output(self): return [ luigi.LocalTarget(f"{util.get_genome_name()}.{extension}") @@ -94,26 +96,14 @@ class PrepareSequence(luigi.Task): def requires(self): tasks = {} - if SequenceConfig().sequence_file == "None": + if not SequenceConfig().sequence_file: tasks["entrez"] = DownloadEntrezGeneSequence() - if SequenceConfig().is_exonic or SequenceConfig().is_intronic: - tasks["blast"] = BuildBlastDatabase() return tasks def output(self): fname = f"{util.get_gene_name()}_sequence.fasta" return luigi.LocalTarget(os.path.join(util.get_output_dir(), fname)) - def select_gene_region( - self, sequence: Bio.SeqRecord.SeqRecord - ) -> Bio.SeqRecord.SeqRecord: - """Select exon/intronic regions.""" - # TODO select exon/intron/both - # Create blast database - # Blast sequence against database - # Parse blast output - # Select regions based on genome annotation - def run(self): input_file = ( self.input()["entrez"].path @@ -138,6 +128,4 @@ def run(self): sequence = sequence.reverse_complement(id=True, description=True) self.logger.debug("Converted sequence to reverse complement.") - # sequence = self.select_gene_region(sequence) - Bio.SeqIO.write(sequence, self.output().path, format="fasta") diff --git a/src/secondary_structure.py b/efishent/secondary_structure.py similarity index 94% rename from src/secondary_structure.py rename to efishent/secondary_structure.py index 54e8718..61fbe9b 100644 --- a/src/secondary_structure.py +++ b/efishent/secondary_structure.py @@ -13,10 +13,10 @@ import Bio.SeqRecord import luigi -from config import GeneralConfig -from config import ProbeConfig -from kmers import KMerFiltering -import util +from .config import GeneralConfig +from .config import ProbeConfig +from .kmers import KMerFiltering +from . import util def get_free_energy(sequence: Bio.SeqRecord) -> float: diff --git a/src/util.py b/efishent/util.py similarity index 98% rename from src/util.py rename to efishent/util.py index 16f1200..43d3281 100644 --- a/src/util.py +++ b/efishent/util.py @@ -1,4 +1,3 @@ -import copy import hashlib import logging import os @@ -8,8 +7,8 @@ import luigi import pandas as pd -from config import GeneralConfig -from config import SequenceConfig +from .config import GeneralConfig +from .config import SequenceConfig FASTA_EXT = (".fasta", ".fna", ".ffn", ".faa", ".frn", ".fa") diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..0673b42 --- /dev/null +++ b/setup.py @@ -0,0 +1,61 @@ +"""Setup file for pypi package called efishent.""" + +# TODO write makefile for basic deployment using +# python setup.py sdist +# twine upload dist/latest-version.tar.gz + +import textwrap +from setuptools import find_packages +from setuptools import setup + +setup( + # Description + name="efishent", + version="0.0.1", + license="MIT", + description="RNA FISH oligos/probes design tool.", + long_description_content_type="text/plain", + long_description=textwrap.dedent("""TDB."""), + # Installation + python_requires=">3.9", + packages=find_packages(), + include_package_data=True, + zip_safe=False, + install_requires=[ + "matplotlib", + "numpy", + "pandas", + "luigi", + "gtfparse", + "biopython", + "tqdm", + "pyomo", + ], + entry_points={"console_scripts": ["efishent = efishent.cli:main"]}, + # Metadata + author="Bastian Eichenberger", + author_email="bastian@eichenbergers.ch", + url="https://github.com/bbquercus/efishent/", + project_urls={ + "Documentation": "https://github.com/BBQuercus/efishent/wiki", + "Changelog": "https://github.com/BBQuercus/efishent/releases", + "Issue Tracker": "https://github.com/bbquercus/efishent/issues", + }, + keywords=["biomedical", "bioinformatics", "RNA probe design"], + classifiers=[ + # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: MacOS", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX", + "Operating System :: Unix", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Topic :: Utilities", + ], +) diff --git a/src/__init__.py b/src/__init__.py deleted file mode 100644 index e69de29..0000000