Merge pull request #240 from HadrienG/dev

New release
HadrienG · Feb 15, 2024 · d25d840 · d25d840
2 parents 1687d52 + 0b80d8e
commit d25d840
Show file tree

Hide file tree

Showing 44 changed files with 2,073 additions and 1,626 deletions.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,5 @@
+[flake8]
+max-line-length = 120
+ignore = E203, W503, C901
+exclude = .git,__pycache__,docs/source/conf.py,old,build,dist
+max-complexity = 10
diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
@@ -21,11 +21,17 @@ jobs:
                   python -m pip install --upgrade pip
                   pip install pipenv
                   pipenv install --dev
+            - name: Style check
+              run: |
+                  pipenv run black
+                  pipenv run isort
+                  pipenv run flake8
             - name: Test with pytest
               run: |
                   chmod -w data/read_only.fasta
                   pipenv run tests
             - name: Upload to Codecov
+              if: github.event.repository.fork == false
               uses: codecov/codecov-action@v1.0.2
               with:
                   token: ${{ secrets.CODECOV_TOKEN }}

diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,7 @@
 *.bam
 *.bam.bai
 *.bt2
+*.npz
 test_runs/
 
 # Byte-compiled / optimized / DLL files
@@ -11,6 +12,7 @@ __pycache__/
 
 # Distribution / packaging
 *.egg-info/
+.eggs/
 dist/
 build/
 

diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -1,4 +1,11 @@
+version: 2
+
+build:
+    os: ubuntu-22.04
+    tools: 
+        python: "3"
 python:
     install:
         - requirements: doc/requirements.txt
         - method: pip
+          path: .
diff --git a/Pipfile b/Pipfile
@@ -4,11 +4,9 @@ verify_ssl = true
 name = "pypi"
 
 [packages]
-future = "*"
 numpy = "*"
 scipy = "*"
 biopython = "*"
-joblib = "*"
 pysam = "*"
 requests = "*"
 urllib3 = ">=1.26.5"
@@ -20,7 +18,15 @@ pycodestyle = "*"
 pytest = "*"
 pytest-cov = "*"
 exceptiongroup = "*"
+black = "*"
+isort = "*"
+flake8 = "*"
+typing-extensions = "*"
+tomli = "*"
 
 [scripts]
 iss = "python -m iss"
 tests = "pytest --cov=iss ."
+black = "black --check ."
+isort = "isort --check ."
+flake8 = "flake8 ."
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/data/readcounts.txt b/data/readcounts.txt
@@ -0,0 +1,5 @@
+amplicon_A    2
+amplicon_T    2
+amplicon_GC   4
+amplicon_ATCG 1
+amplicon_TA   1
diff --git a/doc/index.rst b/doc/index.rst
@@ -16,7 +16,7 @@ InSilicoSeq supports substitution, insertion, deletion errors, and models gc bia
 Details
 -------
 
-* **Authors:** Hadrien Gourlé, Juliette Hayer, Oskar E. Karlsson and Erik Bongcam-Rudloff
+* **Authors:** Hadrien Gourlé, Juliette Hayer, Oskar E. Karlsson, Erik Bongcam-Rudloff, Stefan Lelieveld, Thijs Maas
 * **Contact:** `hadrien.gourle@slu.se <hadrien.gourle@slu.se>`_
 * **GitHub:** `HadrienG/InSilicoSeq <https://github.com/HadrienG/InSilicoSeq>`_
 * **License:** MIT

diff --git a/doc/iss/generate.rst b/doc/iss/generate.rst
@@ -3,10 +3,19 @@
 Generating reads
 ================
 
+InSilicoSeq can simulate amplicon reads, or reads from whole metagenome sequencing (the default).
+You can specify the type of reads you want to simulate with the ``--sequence_type`` option.
+
 InSilicoSeq comes with a set of pre-computed error models to allow the user to easily generate reads from the most popular Illumina instruments:
 
 - HiSeq
-- MiSeq
+- MiSeq (optionally, with various quality thresholds):
+    - MiSeq-20
+    - MiSeq-24
+    - MiSeq-28
+    - MiSeq-32
+    - MiSeq-36
+- NextSeq
 - NovaSeq
 
 Per example generate 1 million MiSeq reads from a set of input genomes:
@@ -49,6 +58,19 @@ You can also provide multiple input files:
     curl -O -J -L https://osf.io/37kg8/download  # download another example file
     iss generate --genomes SRS121011.fasta minigut.fasta --n_genomes 5 --model novaseq --output novaseq_reads
 
+Amplicons
+---------
+
+To generate amplicon reads, use the ``--sequence_type amplicon`` option:
+
+.. code-block:: bash
+
+    # no example data is provided here
+    iss generate --genomes my_amplicons.fasta ---readcount_file counts.txt -sequence_type amplicon --model nextseq --output reads
+
+where ``counts.txt`` is a tab-delimited file containing the number of reads to generate for each amplicon sequence present in ``my_amplicons.fasta``.
+Alternatively, you can use the ``--n_reads`` option to generate a fixed number of reads, together with an abundance distribution.
+
 Draft genomes
 -------------
 
@@ -230,6 +252,11 @@ coverage distribution. Can be uniform, halfnormal, exponential, lognormal or zer
 
 file containing coverage information (default: None).
 
+--readcount_file
+^^^^^^^^^^^^^^^^
+
+file containing read_count information (default: None).
+
 --n_reads
 ^^^^^^^^^
 
@@ -246,16 +273,21 @@ Can be 'kde' or 'basic'
 ^^^^^^^^
 
 Error model file. (default: None).
-Use HiSeq, NovaSeq or MiSeq for a pre-computed error model provided with the software, or a file generated with iss model.
-If you do not wish to use a model, use --mode basic.
-The name of the built-in models is case insensitive.
+Use HiSeq, NextSeq, NovaSeq, MiSeq or Miseq-[20,24,28,32] for a pre-computed error model provided with the software, or a file generated with iss model.
+If you do not wish to use a model, use --mode basic or --mode perfect.
+The name of the built-in models are case insensitive.
 
 --gc_bias
 ^^^^^^^^^
 
 If set, may fail to sequence reads with abnormal GC content.
 Does not guarantee --n_reads (default: False)
 
+--sequence_type
+^^^^^^^^^^^^^^^
+
+Type of sequencing. Can be metagenomics or amplicon (default: metagenomics).
+
 --cpus
 ^^^^^^
 
@@ -284,4 +316,9 @@ Output file path and prefix (Required)
 --compress
 ^^^^^^^^^^
 
-Compress the output in gzip format (default: False).
+Compress the output in gzip format (default: False).
+
+--store_mutations
+^^^^^^^^^^^^^^^^^
+
+Generates an additional VCF file with the mutations introduced in the reads (bool).
diff --git a/doc/iss/install.rst b/doc/iss/install.rst
@@ -18,7 +18,6 @@ To install InSilicoSeq, type the following in your terminal:
 It will install InSilicoSeq as well as the following dependencies:
 
 * biopython
-* joblib
 * numpy
 * pysam
 * scipy

diff --git a/doc/iss/model.rst b/doc/iss/model.rst
@@ -12,10 +12,12 @@ Available models
 +------------+-------------+
 | Model name | Read length |
 +============+=============+
-| MiSeq      | 300 bp      |
+| MiSeq*     | 300 bp      |
 +------------+-------------+
 | HiSeq      | 125 bp      |
 +------------+-------------+
+| NextSeq    | 300 bp      |
++------------+-------------+
 | NovaSeq    | 150 bp      |
 +------------+-------------+
 

diff --git a/doc/iss/qualities.png b/doc/iss/qualities.png
diff --git a/iss/__init__.py b/iss/__init__.py
@@ -1,4 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from iss import *

diff --git a/iss/__main__.py b/iss/__main__.py
@@ -2,4 +2,5 @@
 # -*- coding: utf-8 -*-
 
 from iss.app import main
+
 main()
diff --git a/iss/abundance.py b/iss/abundance.py
@@ -1,13 +1,40 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-from Bio import SeqIO
-from scipy import stats
-
+import logging
 import os
 import sys
-import logging
+
 import numpy as np
+from Bio import SeqIO
+from scipy import stats
+
+
+def parse_readcount_file(readcount_file):
+    logger = logging.getLogger(__name__)
+
+    readcount_dic = {}
+    try:
+        assert os.stat(readcount_file).st_size != 0
+        f = open(readcount_file, "r")
+    except (IOError, OSError) as e:
+        logger.error("Failed to open file:%s" % e)
+        sys.exit(1)
+    except AssertionError:
+        logger.error("File seems empty: %s" % readcount_file)
+        sys.exit(1)
+    else:
+        with f:
+            for line in f:
+                try:
+                    genome_id = line.split()[0]
+                    read_count = int(line.split()[1])
+                except IndexError as e:
+                    logger.error("Failed to read file: %s" % e)
+                    sys.exit(1)
+                else:
+                    readcount_dic[genome_id] = read_count
+    return readcount_dic
 
 
 def parse_abundance_file(abundance_file):
@@ -28,12 +55,12 @@ def parse_abundance_file(abundance_file):
     abundance_dic = {}
     try:
         assert os.stat(abundance_file).st_size != 0
-        f = open(abundance_file, 'r')
+        f = open(abundance_file, "r")
     except (IOError, OSError) as e:
-        logger.error('Failed to open file:%s' % e)
+        logger.error("Failed to open file:%s" % e)
         sys.exit(1)
-    except AssertionError as e:
-        logger.error('File seems empty: %s' % abundance_file)
+    except AssertionError:
+        logger.error("File seems empty: %s" % abundance_file)
         sys.exit(1)
     else:
         with f:
@@ -42,11 +69,11 @@ def parse_abundance_file(abundance_file):
                     genome_id = line.split()[0]
                     abundance = float(line.split()[1])
                 except IndexError as e:
-                    logger.error('Failed to read file: %s' % e)
+                    logger.error("Failed to read file: %s" % e)
                     sys.exit(1)
                 else:
                     abundance_dic[genome_id] = abundance
-    logger.debug('Loaded abundance/coverage file: %s' % abundance_file)
+    logger.debug("Loaded abundance/coverage file: %s" % abundance_file)
     return abundance_dic
 
 
@@ -179,16 +206,16 @@ def coverage_scaling(total_n_reads, abundance_dic, genome_file, read_length):
     Returns:
         dict: scaled coverage dictionary
     """
+    logger = logging.getLogger(__name__)
     total_reads = 0
-    f = open(genome_file, 'r')  # re-opens the file
+    f = open(genome_file, "r")  # re-opens the file
     with f:
-        fasta_file = SeqIO.parse(f, 'fasta')
+        fasta_file = SeqIO.parse(f, "fasta")
         for record in fasta_file:
             try:
                 species_coverage = abundance_dic[record.id]
             except KeyError as e:
-                logger.error(
-                    'Fasta record not found in abundance file: %s' % e)
+                logger.error("Fasta record not found in abundance file: %s" % e)
                 sys.exit(1)
 
             genome_size = len(record.seq)
@@ -210,18 +237,18 @@ def to_file(abundance_dic, output, mode="abundance"):
     """
     logger = logging.getLogger(__name__)
     if mode == "abundance":
-        output_abundance = output + '_abundance.txt'
+        output_abundance = output + "_abundance.txt"
     else:
-        output_abundance = output + '_coverage.txt'
+        output_abundance = output + "_coverage.txt"
     try:
-        f = open(output_abundance, 'w')
+        f = open(output_abundance, "w")
     except PermissionError as e:
-        logger.error('Failed to open output file: %s' % e)
+        logger.error("Failed to open output file: %s" % e)
         sys.exit(1)
     else:
         with f:
             for record, abundance in abundance_dic.items():
-                f.write('%s\t%s\n' % (record, abundance))
+                f.write("%s\t%s\n" % (record, abundance))
 
 
 def draft(genomes, draft, distribution, output, mode="abundance"):
@@ -240,13 +267,10 @@ def draft(genomes, draft, distribution, output, mode="abundance"):
     # first we get a list of contig names in draft genomes
     draft_records = []
     for d in draft:
-        draft_records.extend(
-            [record.name for record in SeqIO.parse(d, 'fasta')])
+        draft_records.extend([record.name for record in SeqIO.parse(d, "fasta")])
     genomes = list(set(genomes) - set(draft_records))
     abundance_dic = distribution(genomes + draft)
-    complete_genomes_abundance = {k: v for
-                                  k, v in abundance_dic.items()
-                                  if k not in draft}
+    complete_genomes_abundance = {k: v for k, v in abundance_dic.items() if k not in draft}
     to_file(abundance_dic, output)
     draft_dic = expand_draft_abundance(abundance_dic, draft, mode)
     full_abundance_dic = {**complete_genomes_abundance, **draft_dic}
@@ -273,12 +297,12 @@ def expand_draft_abundance(abundance_dic, draft, mode="abundance"):
     for key, abundance in abundance_dic.items():
         if key in draft:
             try:
-                f = open(key, 'r')
+                f = open(key, "r")
                 with f:
-                    fasta_file = SeqIO.parse(f, 'fasta')
+                    fasta_file = SeqIO.parse(f, "fasta")
                     draft_records = [record for record in fasta_file]
                     total_length = sum(len(record) for record in draft_records)
-            except Exception as e:
+            except Exception:
                 raise
             else:
                 total_length = sum(len(record) for record in draft_records)