Skip to content

Commit

Permalink
Add setup.py file; Rename src to efishent
Browse files Browse the repository at this point in the history
  • Loading branch information
BBQuercus committed Jun 23, 2022
1 parent 1621698 commit 3daee92
Show file tree
Hide file tree
Showing 18 changed files with 204 additions and 109 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ Probe filtering workflow:
* [x] Add GTF file reading & saving as parquet/csv step
* [x] Add Ensembl ID support in Entrez query
* [x] Verify bowtie parameters run on endo and exo targets
* [ ] Allow selection of isoform in entrez query
* [ ] Select intron/exon based on blast from sequence
* [x] Select intron/exon based on blast from sequence - doesn't make sense given all different isoforms and variations
* [x] Allow selection of isoform in Entrez query - not available / different transcript IDs yield the same sequence

* **Interface**
* [x] Clean up output files
Expand Down
21 changes: 21 additions & 0 deletions efishent/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""eFISHent"""

# Package version string.
__version__ = "0.0.1"

import logging

# Silence luigi-interface: raise that logger's threshold so that only
# CRITICAL records pass through it.
# NOTE(review): presumably placed before the submodule imports below on
# purpose, so the level is set before they are loaded - confirm.
logging.getLogger("luigi-interface").setLevel(level=logging.CRITICAL)

# Import the package's submodules via explicit relative imports so that each
# one is reachable as an attribute of the package.
from . import alignment
from . import basic_filtering
from . import cleanup
from . import cli
from . import config
from . import constants
from . import generate_probes
from . import kmers
from . import optimization
from . import prepare_sequence
from . import secondary_structure
from . import util
2 changes: 1 addition & 1 deletion src/__main__.py → efishent/__main__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Entrypoint module for the application."""

from cli import main
from .cli import main

if __name__ == "__main__":
main()
38 changes: 19 additions & 19 deletions src/alignment.py → efishent/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@
import numpy as np
import pandas as pd

from config import GeneralConfig
from config import ProbeConfig
from config import SequenceConfig
from basic_filtering import BasicFiltering
from prepare_sequence import BuildBlastDatabase
from prepare_sequence import PrepareSequence
import constants
import util
from .config import GeneralConfig
from .config import ProbeConfig
from .config import SequenceConfig
from .basic_filtering import BasicFiltering
from .prepare_sequence import BuildBlastDatabase
from .prepare_sequence import PrepareSequence
from . import constants
from . import util


class BuildBowtieIndex(luigi.Task):
Expand Down Expand Up @@ -112,6 +112,17 @@ def output(self):
)
return tasks

@staticmethod
def read_count_table() -> pd.DataFrame:
    """Read and verify an RNAseq FPKM count table.

    The table is read as tab-separated text from the path configured in
    ``ProbeConfig().encode_count_table``. Only rows whose ``gene_id``
    contains the literal text ``ENSG`` are kept, and a ``clean_id`` column
    is added holding the part of ``gene_id`` before its first ``.``.

    Returns:
        The filtered table restricted to ``constants.COUNTS_COLUMNS``.
    """
    df_counts = pd.read_csv(ProbeConfig().encode_count_table, sep="\t")
    # regex=False: "ENSG" is a plain substring, not a pattern.
    # na=False: rows with a missing gene_id are dropped instead of putting
    # NaN into the mask, which cannot be used for boolean indexing.
    is_ensembl = df_counts["gene_id"].str.contains("ENSG", regex=False, na=False)
    counts = df_counts[is_ensembl].copy()
    # Keep everything before the first "." in the gene id.
    counts["clean_id"] = counts["gene_id"].str.split(".").str[0]
    return counts[constants.COUNTS_COLUMNS]

def read_gtf_file(self):
    """Load the parquet table behind this task's ``gtf`` input.

    Returns:
        The table restricted to the columns in ``constants.GTF_COLUMNS``.
    """
    gtf_path = self.input()["gtf"].path
    annotation = pd.read_parquet(gtf_path)
    return annotation[constants.GTF_COLUMNS]

def align_probes(self, fname_fasta: str, fname_sam: str) -> None:
"""Align probes to the reference genome."""
# Convert fasta to fastq - bowtie doesn't return read names if not in fastq...
Expand Down Expand Up @@ -162,17 +173,6 @@ def filter_unique_probes(self, fname_sam: str) -> pd.DataFrame:
)
return df

@staticmethod
def read_count_table() -> pd.DataFrame:
    """Read and verify an RNAseq FPKM count table.

    The table is read as tab-separated text from the path configured in
    ``ProbeConfig().encode_count_table``. Only rows whose ``gene_id``
    contains the literal text ``ENSG`` are kept, and a ``clean_id`` column
    is added holding the part of ``gene_id`` before its first ``.``.

    Returns:
        The filtered table restricted to ``constants.COUNTS_COLUMNS``.
    """
    df_counts = pd.read_csv(ProbeConfig().encode_count_table, sep="\t")
    # regex=False: "ENSG" is a plain substring, not a pattern.
    # na=False: rows with a missing gene_id are dropped instead of putting
    # NaN into the mask, which cannot be used for boolean indexing.
    is_ensembl = df_counts["gene_id"].str.contains("ENSG", regex=False, na=False)
    counts = df_counts[is_ensembl].copy()
    # Keep everything before the first "." in the gene id.
    counts["clean_id"] = counts["gene_id"].str.split(".").str[0]
    return counts[constants.COUNTS_COLUMNS]

def read_gtf_file(self):
    """Load the parquet table behind this task's ``gtf`` input.

    Returns:
        The table restricted to the columns in ``constants.GTF_COLUMNS``.
    """
    gtf_path = self.input()["gtf"].path
    annotation = pd.read_parquet(gtf_path)
    return annotation[constants.GTF_COLUMNS]

def filter_gene_of_interest(self, df: pd.DataFrame) -> pd.DataFrame:
"""Filter FPKM table to exclude the gene of interest."""
# Filter using provided EnsembleID directly
Expand Down
8 changes: 4 additions & 4 deletions src/basic_filtering.py → efishent/basic_filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
import Bio.SeqUtils.MeltingTemp
import luigi

from config import GeneralConfig
from config import ProbeConfig
from generate_probes import GenerateAllProbes
import util
from .config import GeneralConfig
from .config import ProbeConfig
from .generate_probes import GenerateAllProbes
from . import util


def get_melting_temp(sequence: str) -> float:
Expand Down
22 changes: 11 additions & 11 deletions src/cleanup.py → efishent/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,17 @@
import pandas as pd
import Bio.SeqIO

from alignment import AlignProbeCandidates
from basic_filtering import get_gc_content
from basic_filtering import get_melting_temp
from config import ProbeConfig
from config import RunConfig
from config import SequenceConfig
from kmers import BuildJellyfishIndex
from kmers import get_max_kmer_count
from optimization import OptimizeProbeCoverage
from secondary_structure import get_free_energy
import util
from .alignment import AlignProbeCandidates
from .basic_filtering import get_gc_content
from .basic_filtering import get_melting_temp
from .config import ProbeConfig
from .config import RunConfig
from .config import SequenceConfig
from .kmers import BuildJellyfishIndex
from .kmers import get_max_kmer_count
from .optimization import OptimizeProbeCoverage
from .secondary_structure import get_free_energy
from . import util


class CleanUpOutput(luigi.Task):
Expand Down
24 changes: 16 additions & 8 deletions src/cli.py → efishent/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,19 @@
import configparser
import logging
import os
import sys
import tempfile

import luigi

from alignment import BuildBowtieIndex
from cleanup import CleanUpOutput
from config import GeneralConfig
from config import ProbeConfig
from config import RunConfig
from config import SequenceConfig
from kmers import BuildJellyfishIndex
from util import UniCode
from .alignment import BuildBowtieIndex
from .cleanup import CleanUpOutput
from .config import GeneralConfig
from .config import ProbeConfig
from .config import RunConfig
from .config import SequenceConfig
from .kmers import BuildJellyfishIndex
from .util import UniCode

CONFIG_CLASSES = [GeneralConfig, RunConfig, SequenceConfig, ProbeConfig]

Expand Down Expand Up @@ -110,6 +111,12 @@ def _parse_args() -> argparse.Namespace:
)
_add_groups(parser)
_add_utilities(parser)
try:
if len(sys.argv) == 1:
parser.print_help()
parser.exit(0)
except Exception as e:
print(e)
return parser.parse_args()


Expand Down Expand Up @@ -143,6 +150,7 @@ def set_logging_level(verbose: bool, debug: bool) -> logging.Logger:
custom_level = logging.WARNING

logging.getLogger("luigi").setLevel(luigi_level)
logging.getLogger("luigi-interface").setLevel(luigi_level)
luigi.interface.core.log_level = luigi_level

logger = logging.getLogger("custom-logger")
Expand Down
72 changes: 46 additions & 26 deletions src/config.py → efishent/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
- number of threads
- method of optimization
- optimization time limit
- verbosity
Probe specific options:
- (sequence file) or (gene name & organism)
Expand All @@ -30,19 +29,19 @@

class GeneralConfig(luigi.Config):
reference_genome = luigi.Parameter(
description="Path to reference genome fasta file.",
description="Path to reference genome fasta/fa file.",
significant=True,
default=None,
default="",
)
reference_annotation = luigi.Parameter(
description="Path to reference genome annotation file.", default=None
description="Path to reference genome annotation (gene transfer format / gtf) file.",
default="",
)
threads = luigi.IntParameter(description="Number of threads to use.", default=42)
verbosity = luigi.IntParameter(description="Verbosity level.", default=1)
threads = luigi.IntParameter(description="Number of threads to launch.", default=42)
output_dir = luigi.Parameter(
description="Path to output directory. "
"If not specified, will use the current directory.",
default=None,
default="",
)


Expand All @@ -54,7 +53,11 @@ class RunConfig(luigi.Config):
description="Save intermediate files?", default=False
)
optimization_method = luigi.Parameter(
description="Optimization method to use [options: optimal, greedy].",
description="Optimization method to use. "
"Greedy will procedurally select the next longest probe. "
"Optimal will optimize probes for maximum gene coverage - "
"slow if many overlaps are present (typically with non-stringent parameter settings). "
"[options: optimal, greedy].",
default="greedy",
)
optimization_time_limit = luigi.IntParameter(
Expand All @@ -66,22 +69,21 @@ class RunConfig(luigi.Config):

class SequenceConfig(luigi.Config):
sequence_file = luigi.Parameter(
description="Path to the gene's sequence file.", default=""
description="Path to the gene's sequence file. "
"Use this if the sequence can't be easily downloaded or if only certain regions should be targeted.",
default="",
)
ensemble_id = luigi.Parameter(
description="Ensembl ID of the gene of interest."
"Can be used instead of gene and organism name to download the gene of interest."
"Used to filter out the gene of interest from FPKM based filtering.",
"Used to filter out the gene of interest from FPKM based filtering - "
"will do an automated blast-based filtering if not passed.",
default="",
)
gene_name = luigi.Parameter(description="Gene name.", default="")
organism_name = luigi.Parameter(
description="Latin name of the organism.", default=""
)
is_intronic = luigi.BoolParameter(
description="Is the probe intronic?", default=False
)
is_exonic = luigi.BoolParameter(description="Is the probe exonic?", default=True)
is_plus_strand = luigi.BoolParameter(
description="Is the probe targeting the plus strand?", default=True
)
Expand All @@ -91,34 +93,47 @@ class SequenceConfig(luigi.Config):


class ProbeConfig(luigi.Config):
min_length = luigi.IntParameter(description="Minimum probe length.", default=21)
max_length = luigi.IntParameter(description="Maximum probe length.", default=25)
min_length = luigi.IntParameter(
description="Minimum probe length in nucleotides.", default=21
)
max_length = luigi.IntParameter(
description="Maximum probe length in nucleotides.", default=25
)
spacing = luigi.IntParameter(
description="Minimum distance in nucleotides between probes.", default=2
)
min_tm = luigi.FloatParameter(
description="Minimum melting temperature.", default=40.0
description="Minimum melting temperature in degrees C. "
"Formamide and Na concentration will affect melting temperature!",
default=40.0,
)
max_tm = luigi.FloatParameter(
description="Maximum melting temperature.", default=60.0
description="Maximum melting temperature in degrees C (see minimum).",
default=60.0,
)
min_gc = luigi.FloatParameter(
description="Minimum GC content in percentage.", default=20.0
)
max_gc = luigi.FloatParameter(
description="Maximum GC content in percentage.", default=80.0
)
min_gc = luigi.FloatParameter(description="Minimum GC content.", default=20.0)
max_gc = luigi.FloatParameter(description="Maximum GC content.", default=80.0)
formamide_concentration = luigi.FloatParameter(
description="Formamide concentration as a percentage of the total buffer.",
default=0.0,
)
na_concentration = luigi.FloatParameter(
description="Na concentration in mM.", default=390.0
)
kmer_length = luigi.IntParameter(
description="Length of k-mer used to filter probe sequences.", default=15
)
max_off_targets = luigi.IntParameter(
description="Maximum number of off-targets.", default=0
description="Maximum number of off-targets binding anywhere BUT "
"the gene of interest in the genome.",
default=0,
)
encode_count_table = luigi.Parameter(
description="Path to the ENCODE RNAseq count table.", default=None
description="Path to the ENCODE RNAseq count table. "
"Must contain the columns 'gene_id' and 'FPKM'. "
"'gene_id' must use ensembl ID's!",
default="",
)
max_off_target_fpkm = luigi.FloatParameter(
description=(
Expand All @@ -128,9 +143,14 @@ class ProbeConfig(luigi.Config):
),
default=10.0,
)
kmer_length = luigi.IntParameter(
description="Length of k-mer used to filter probe sequences.", default=15
)
max_kmers = luigi.IntParameter(
description="Highest count of sub-k-mers found in reference genome.", default=5
)
max_deltaG = luigi.FloatParameter(
description="Maximum deltaG in kcal/mol.", default=-10.0
description="Maximum predicted deltaG in kcal/mol. "
"Note: deltaGs range from 0 (no secondary structures) to increasingly negative values!",
default=-10.0,
)
File renamed without changes.
7 changes: 3 additions & 4 deletions src/generate_probes.py → efishent/generate_probes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
import Bio.SeqUtils.MeltingTemp
import luigi

from config import ProbeConfig
from prepare_sequence import PrepareSequence
import util
from .config import ProbeConfig
from .prepare_sequence import PrepareSequence
from . import util


class GenerateAllProbes(luigi.Task):
Expand Down Expand Up @@ -48,4 +48,3 @@ def run(self):

util.log_and_check_candidates(self.logger, "GenerateAllProbes", len(candidates))
Bio.SeqIO.write(candidates, self.output().path, "fasta")

8 changes: 4 additions & 4 deletions src/kmers.py → efishent/kmers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
import Bio.SeqRecord
import luigi

from config import GeneralConfig
from config import ProbeConfig
from alignment import AlignProbeCandidates
import util
from .config import GeneralConfig
from .config import ProbeConfig
from .alignment import AlignProbeCandidates
from . import util


def get_max_kmer_count(sequence: Bio.SeqRecord, jellyfish_path: str) -> int:
Expand Down
1 change: 0 additions & 1 deletion src/luigi.cfg → efishent/luigi.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
reference_genome: ../dm6/dm6.fa
reference_annotation:
threads: 42
verbosity: 1
output_dir:

[RunConfig]
Expand Down
8 changes: 4 additions & 4 deletions src/optimization.py → efishent/optimization.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
import pandas as pd
import pyomo.environ as pe

from config import GeneralConfig, ProbeConfig
from config import RunConfig
from secondary_structure import SecondaryStructureFiltering
import util
from .config import GeneralConfig, ProbeConfig
from .config import RunConfig
from .secondary_structure import SecondaryStructureFiltering
from . import util


class OptimizeProbeCoverage(luigi.Task):
Expand Down
Loading

0 comments on commit 3daee92

Please sign in to comment.